aboutsummaryrefslogtreecommitdiff
path: root/fs/xfs/xfs_log_recover.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/xfs/xfs_log_recover.c')
-rw-r--r--fs/xfs/xfs_log_recover.c1635
1 files changed, 1194 insertions, 441 deletions
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 15ff5392fb6..981af0f6504 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -17,37 +17,49 @@
*/
#include "xfs.h"
#include "xfs_fs.h"
-#include "xfs_types.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
#include "xfs_bit.h"
-#include "xfs_log.h"
#include "xfs_inum.h"
-#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_mount.h"
-#include "xfs_error.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_dinode.h"
+#include "xfs_da_format.h"
#include "xfs_inode.h"
-#include "xfs_inode_item.h"
-#include "xfs_alloc.h"
-#include "xfs_ialloc.h"
+#include "xfs_trans.h"
+#include "xfs_log.h"
#include "xfs_log_priv.h"
-#include "xfs_buf_item.h"
#include "xfs_log_recover.h"
+#include "xfs_inode_item.h"
#include "xfs_extfree_item.h"
#include "xfs_trans_priv.h"
+#include "xfs_alloc.h"
+#include "xfs_ialloc.h"
#include "xfs_quota.h"
-#include "xfs_rw.h"
-#include "xfs_utils.h"
+#include "xfs_cksum.h"
#include "xfs_trace.h"
+#include "xfs_icache.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_dinode.h"
+#include "xfs_error.h"
+#include "xfs_dir2.h"
-STATIC int xlog_find_zeroed(xlog_t *, xfs_daddr_t *);
-STATIC int xlog_clear_stale_blocks(xlog_t *, xfs_lsn_t);
+#define BLK_AVG(blk1, blk2) ((blk1+blk2) >> 1)
+
+STATIC int
+xlog_find_zeroed(
+ struct xlog *,
+ xfs_daddr_t *);
+STATIC int
+xlog_clear_stale_blocks(
+ struct xlog *,
+ xfs_lsn_t);
#if defined(DEBUG)
-STATIC void xlog_recover_check_summary(xlog_t *);
+STATIC void
+xlog_recover_check_summary(
+ struct xlog *);
#else
#define xlog_recover_check_summary(log)
#endif
@@ -75,7 +87,7 @@ struct xfs_buf_cancel {
static inline int
xlog_buf_bbcount_valid(
- xlog_t *log,
+ struct xlog *log,
int bbcount)
{
return bbcount > 0 && bbcount <= log->l_logBBsize;
@@ -88,7 +100,7 @@ xlog_buf_bbcount_valid(
*/
STATIC xfs_buf_t *
xlog_get_bp(
- xlog_t *log,
+ struct xlog *log,
int nbblks)
{
struct xfs_buf *bp;
@@ -120,7 +132,7 @@ xlog_get_bp(
nbblks += log->l_sectBBsize;
nbblks = round_up(nbblks, log->l_sectBBsize);
- bp = xfs_buf_get_uncached(log->l_mp->m_logdev_targp, BBTOB(nbblks), 0);
+ bp = xfs_buf_get_uncached(log->l_mp->m_logdev_targp, nbblks, 0);
if (bp)
xfs_buf_unlock(bp);
return bp;
@@ -139,14 +151,14 @@ xlog_put_bp(
*/
STATIC xfs_caddr_t
xlog_align(
- xlog_t *log,
+ struct xlog *log,
xfs_daddr_t blk_no,
int nbblks,
- xfs_buf_t *bp)
+ struct xfs_buf *bp)
{
xfs_daddr_t offset = blk_no & ((xfs_daddr_t)log->l_sectBBsize - 1);
- ASSERT(BBTOB(offset + nbblks) <= XFS_BUF_SIZE(bp));
+ ASSERT(offset + nbblks <= bp->b_length);
return bp->b_addr + BBTOB(offset);
}
@@ -156,10 +168,10 @@ xlog_align(
*/
STATIC int
xlog_bread_noalign(
- xlog_t *log,
+ struct xlog *log,
xfs_daddr_t blk_no,
int nbblks,
- xfs_buf_t *bp)
+ struct xfs_buf *bp)
{
int error;
@@ -174,13 +186,17 @@ xlog_bread_noalign(
nbblks = round_up(nbblks, log->l_sectBBsize);
ASSERT(nbblks > 0);
- ASSERT(BBTOB(nbblks) <= XFS_BUF_SIZE(bp));
+ ASSERT(nbblks <= bp->b_length);
XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no);
XFS_BUF_READ(bp);
- XFS_BUF_SET_COUNT(bp, BBTOB(nbblks));
+ bp->b_io_length = nbblks;
+ bp->b_error = 0;
- xfsbdstrat(log->l_mp, bp);
+ if (XFS_FORCED_SHUTDOWN(log->l_mp))
+ return XFS_ERROR(EIO);
+
+ xfs_buf_iorequest(bp);
error = xfs_buf_iowait(bp);
if (error)
xfs_buf_ioerror_alert(bp, __func__);
@@ -189,10 +205,10 @@ xlog_bread_noalign(
STATIC int
xlog_bread(
- xlog_t *log,
+ struct xlog *log,
xfs_daddr_t blk_no,
int nbblks,
- xfs_buf_t *bp,
+ struct xfs_buf *bp,
xfs_caddr_t *offset)
{
int error;
@@ -211,14 +227,14 @@ xlog_bread(
*/
STATIC int
xlog_bread_offset(
- xlog_t *log,
+ struct xlog *log,
xfs_daddr_t blk_no, /* block to read from */
int nbblks, /* blocks to read */
- xfs_buf_t *bp,
+ struct xfs_buf *bp,
xfs_caddr_t offset)
{
xfs_caddr_t orig_offset = bp->b_addr;
- int orig_len = bp->b_buffer_length;
+ int orig_len = BBTOB(bp->b_length);
int error, error2;
error = xfs_buf_associate_memory(bp, offset, BBTOB(nbblks));
@@ -241,10 +257,10 @@ xlog_bread_offset(
*/
STATIC int
xlog_bwrite(
- xlog_t *log,
+ struct xlog *log,
xfs_daddr_t blk_no,
int nbblks,
- xfs_buf_t *bp)
+ struct xfs_buf *bp)
{
int error;
@@ -259,13 +275,14 @@ xlog_bwrite(
nbblks = round_up(nbblks, log->l_sectBBsize);
ASSERT(nbblks > 0);
- ASSERT(BBTOB(nbblks) <= XFS_BUF_SIZE(bp));
+ ASSERT(nbblks <= bp->b_length);
XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no);
XFS_BUF_ZEROFLAGS(bp);
xfs_buf_hold(bp);
xfs_buf_lock(bp);
- XFS_BUF_SET_COUNT(bp, BBTOB(nbblks));
+ bp->b_io_length = nbblks;
+ bp->b_error = 0;
error = xfs_bwrite(bp);
if (error)
@@ -283,9 +300,9 @@ xlog_header_check_dump(
xfs_mount_t *mp,
xlog_rec_header_t *head)
{
- xfs_debug(mp, "%s: SB : uuid = %pU, fmt = %d\n",
+ xfs_debug(mp, "%s: SB : uuid = %pU, fmt = %d",
__func__, &mp->m_sb.sb_uuid, XLOG_FMT);
- xfs_debug(mp, " log : uuid = %pU, fmt = %d\n",
+ xfs_debug(mp, " log : uuid = %pU, fmt = %d",
&head->h_fs_uuid, be32_to_cpu(head->h_fmt));
}
#else
@@ -377,8 +394,8 @@ xlog_recover_iodone(
*/
STATIC int
xlog_find_cycle_start(
- xlog_t *log,
- xfs_buf_t *bp,
+ struct xlog *log,
+ struct xfs_buf *bp,
xfs_daddr_t first_blk,
xfs_daddr_t *last_blk,
uint cycle)
@@ -420,7 +437,7 @@ xlog_find_cycle_start(
*/
STATIC int
xlog_find_verify_cycle(
- xlog_t *log,
+ struct xlog *log,
xfs_daddr_t start_blk,
int nbblks,
uint stop_on_cycle_no,
@@ -440,6 +457,8 @@ xlog_find_verify_cycle(
* a log sector, or we're out of luck.
*/
bufblks = 1 << ffs(nbblks);
+ while (bufblks > log->l_logBBsize)
+ bufblks >>= 1;
while (!(bp = xlog_get_bp(log, bufblks))) {
bufblks >>= 1;
if (bufblks < log->l_sectBBsize)
@@ -487,7 +506,7 @@ out:
*/
STATIC int
xlog_find_verify_log_record(
- xlog_t *log,
+ struct xlog *log,
xfs_daddr_t start_blk,
xfs_daddr_t *last_blk,
int extra_bblks)
@@ -584,7 +603,7 @@ out:
/*
* Head is defined to be the point of the log where the next log write
- * write could go. This means that incomplete LR writes at the end are
+ * could go. This means that incomplete LR writes at the end are
* eliminated when calculating the head. We aren't guaranteed that previous
* LR have complete transactions. We only know that a cycle number of
* current cycle number -1 won't be present in the log if we start writing
@@ -597,7 +616,7 @@ out:
*/
STATIC int
xlog_find_head(
- xlog_t *log,
+ struct xlog *log,
xfs_daddr_t *return_head_blk)
{
xfs_buf_t *bp;
@@ -868,7 +887,7 @@ validate_head:
*/
STATIC int
xlog_find_tail(
- xlog_t *log,
+ struct xlog *log,
xfs_daddr_t *head_blk,
xfs_daddr_t *tail_blk)
{
@@ -940,6 +959,7 @@ xlog_find_tail(
}
if (!found) {
xfs_warn(log->l_mp, "%s: couldn't find sync record", __func__);
+ xlog_put_bp(bp);
ASSERT(0);
return XFS_ERROR(EIO);
}
@@ -965,9 +985,9 @@ xlog_find_tail(
log->l_curr_cycle++;
atomic64_set(&log->l_tail_lsn, be64_to_cpu(rhead->h_tail_lsn));
atomic64_set(&log->l_last_sync_lsn, be64_to_cpu(rhead->h_lsn));
- xlog_assign_grant_head(&log->l_grant_reserve_head, log->l_curr_cycle,
+ xlog_assign_grant_head(&log->l_reserve_head.grant, log->l_curr_cycle,
BBTOB(log->l_curr_block));
- xlog_assign_grant_head(&log->l_grant_write_head, log->l_curr_cycle,
+ xlog_assign_grant_head(&log->l_write_head.grant, log->l_curr_cycle,
BBTOB(log->l_curr_block));
/*
@@ -1077,7 +1097,7 @@ done:
*/
STATIC int
xlog_find_zeroed(
- xlog_t *log,
+ struct xlog *log,
xfs_daddr_t *blk_no)
{
xfs_buf_t *bp;
@@ -1121,7 +1141,8 @@ xlog_find_zeroed(
*/
xfs_warn(log->l_mp,
"Log inconsistent or not a log (last==0, first!=1)");
- return XFS_ERROR(EINVAL);
+ error = XFS_ERROR(EINVAL);
+ goto bp_err;
}
/* we have a partially zeroed log */
@@ -1180,7 +1201,7 @@ bp_err:
*/
STATIC void
xlog_add_record(
- xlog_t *log,
+ struct xlog *log,
xfs_caddr_t buf,
int cycle,
int block,
@@ -1202,7 +1223,7 @@ xlog_add_record(
STATIC int
xlog_write_log_records(
- xlog_t *log,
+ struct xlog *log,
int cycle,
int start_block,
int blocks,
@@ -1225,6 +1246,8 @@ xlog_write_log_records(
* log sector, or we're out of luck.
*/
bufblks = 1 << ffs(blocks);
+ while (bufblks > log->l_logBBsize)
+ bufblks >>= 1;
while (!(bp = xlog_get_bp(log, bufblks))) {
bufblks >>= 1;
if (bufblks < sectbb)
@@ -1300,7 +1323,7 @@ xlog_write_log_records(
*/
STATIC int
xlog_clear_stale_blocks(
- xlog_t *log,
+ struct xlog *log,
xfs_lsn_t tail_lsn)
{
int tail_cycle, head_cycle;
@@ -1427,9 +1450,8 @@ xlog_recover_find_tid(
xlog_tid_t tid)
{
xlog_recover_t *trans;
- struct hlist_node *n;
- hlist_for_each_entry(trans, n, head, r_list) {
+ hlist_for_each_entry(trans, head, r_list) {
if (trans->r_log_tid == tid)
return trans;
}
@@ -1466,8 +1488,8 @@ xlog_recover_add_item(
STATIC int
xlog_recover_add_to_cont_trans(
- struct log *log,
- xlog_recover_t *trans,
+ struct xlog *log,
+ struct xlog_recover *trans,
xfs_caddr_t dp,
int len)
{
@@ -1512,8 +1534,8 @@ xlog_recover_add_to_cont_trans(
*/
STATIC int
xlog_recover_add_to_trans(
- struct log *log,
- xlog_recover_t *trans,
+ struct xlog *log,
+ struct xlog_recover *trans,
xfs_caddr_t dp,
int len)
{
@@ -1558,6 +1580,7 @@ xlog_recover_add_to_trans(
"bad number of regions (%d) in inode log format",
in_f->ilf_size);
ASSERT(0);
+ kmem_free(ptr);
return XFS_ERROR(EIO);
}
@@ -1576,32 +1599,89 @@ xlog_recover_add_to_trans(
}
/*
- * Sort the log items in the transaction. Cancelled buffers need
- * to be put first so they are processed before any items that might
- * modify the buffers. If they are cancelled, then the modifications
- * don't need to be replayed.
+ * Sort the log items in the transaction.
+ *
+ * The ordering constraints are defined by the inode allocation and unlink
+ * behaviour. The rules are:
+ *
+ * 1. Every item is only logged once in a given transaction. Hence it
+ * represents the last logged state of the item. Hence ordering is
+ * dependent on the order in which operations need to be performed so
+ * required initial conditions are always met.
+ *
+ * 2. Cancelled buffers are recorded in pass 1 in a separate table and
+ * there's nothing to replay from them so we can simply cull them
+ * from the transaction. However, we can't do that until after we've
+ * replayed all the other items because they may be dependent on the
+ * cancelled buffer and replaying the cancelled buffer can remove it
+ * from the cancelled buffer table. Hence they have to be done last.
+ *
+ * 3. Inode allocation buffers must be replayed before inode items that
+ * read the buffer and replay changes into it. For filesystems using the
+ * ICREATE transactions, this means XFS_LI_ICREATE objects need to get
+ * treated the same as inode allocation buffers as they create and
+ * initialise the buffers directly.
+ *
+ * 4. Inode unlink buffers must be replayed after inode items are replayed.
+ * This ensures that inodes are completely flushed to the inode buffer
+ * in a "free" state before we remove the unlinked inode list pointer.
+ *
+ * Hence the ordering needs to be inode allocation buffers first, inode items
+ * second, inode unlink buffers third and cancelled buffers last.
+ *
+ * But there's a problem with that - we can't tell an inode allocation buffer
+ * apart from a regular buffer, so we can't separate them. We can, however,
+ * tell an inode unlink buffer from the others, and so we can separate them out
+ * from all the other buffers and move them to last.
+ *
+ * Hence, 4 lists, in order from head to tail:
+ * - buffer_list for all buffers except cancelled/inode unlink buffers
+ * - inode_list for all non-buffer items
+ * - inode_buffer_list for inode unlink buffers
+ * - cancel_list for the cancelled buffers
+ *
+ * Note that we add objects to the tail of the lists so that first-to-last
+ * ordering is preserved within the lists. Adding objects to the head of the
+ * list means when we traverse from the head we walk them in last-to-first
+ * order. For cancelled buffers and inode unlink buffers this doesn't matter,
+ * but for all other items there may be specific ordering that we need to
+ * preserve.
*/
STATIC int
xlog_recover_reorder_trans(
- struct log *log,
- xlog_recover_t *trans,
+ struct xlog *log,
+ struct xlog_recover *trans,
int pass)
{
xlog_recover_item_t *item, *n;
+ int error = 0;
LIST_HEAD(sort_list);
+ LIST_HEAD(cancel_list);
+ LIST_HEAD(buffer_list);
+ LIST_HEAD(inode_buffer_list);
+ LIST_HEAD(inode_list);
list_splice_init(&trans->r_itemq, &sort_list);
list_for_each_entry_safe(item, n, &sort_list, ri_list) {
xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr;
switch (ITEM_TYPE(item)) {
+ case XFS_LI_ICREATE:
+ list_move_tail(&item->ri_list, &buffer_list);
+ break;
case XFS_LI_BUF:
- if (!(buf_f->blf_flags & XFS_BLF_CANCEL)) {
+ if (buf_f->blf_flags & XFS_BLF_CANCEL) {
trace_xfs_log_recover_item_reorder_head(log,
trans, item, pass);
- list_move(&item->ri_list, &trans->r_itemq);
+ list_move(&item->ri_list, &cancel_list);
+ break;
+ }
+ if (buf_f->blf_flags & XFS_BLF_INODE_BUF) {
+ list_move(&item->ri_list, &inode_buffer_list);
break;
}
+ list_move_tail(&item->ri_list, &buffer_list);
+ break;
case XFS_LI_INODE:
case XFS_LI_DQUOT:
case XFS_LI_QUOTAOFF:
@@ -1609,18 +1689,34 @@ xlog_recover_reorder_trans(
case XFS_LI_EFI:
trace_xfs_log_recover_item_reorder_tail(log,
trans, item, pass);
- list_move_tail(&item->ri_list, &trans->r_itemq);
+ list_move_tail(&item->ri_list, &inode_list);
break;
default:
xfs_warn(log->l_mp,
"%s: unrecognized type of log operation",
__func__);
ASSERT(0);
- return XFS_ERROR(EIO);
+ /*
+ * return the remaining items back to the transaction
+ * item list so they can be freed in caller.
+ */
+ if (!list_empty(&sort_list))
+ list_splice_init(&sort_list, &trans->r_itemq);
+ error = XFS_ERROR(EIO);
+ goto out;
}
}
+out:
ASSERT(list_empty(&sort_list));
- return 0;
+ if (!list_empty(&buffer_list))
+ list_splice(&buffer_list, &trans->r_itemq);
+ if (!list_empty(&inode_list))
+ list_splice_tail(&inode_list, &trans->r_itemq);
+ if (!list_empty(&inode_buffer_list))
+ list_splice_tail(&inode_buffer_list, &trans->r_itemq);
+ if (!list_empty(&cancel_list))
+ list_splice_tail(&cancel_list, &trans->r_itemq);
+ return error;
}
/*
@@ -1637,8 +1733,8 @@ xlog_recover_reorder_trans(
*/
STATIC int
xlog_recover_buffer_pass1(
- struct log *log,
- xlog_recover_item_t *item)
+ struct xlog *log,
+ struct xlog_recover_item *item)
{
xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr;
struct list_head *bucket;
@@ -1678,20 +1774,12 @@ xlog_recover_buffer_pass1(
/*
* Check to see whether the buffer being recovered has a corresponding
- * entry in the buffer cancel record table. If it does then return 1
- * so that it will be cancelled, otherwise return 0. If the buffer is
- * actually a buffer cancel item (XFS_BLF_CANCEL is set), then decrement
- * the refcount on the entry in the table and remove it from the table
- * if this is the last reference.
- *
- * We remove the cancel record from the table when we encounter its
- * last occurrence in the log so that if the same buffer is re-used
- * again after its last cancellation we actually replay the changes
- * made at that point.
+ * entry in the buffer cancel record table. If it does, return the cancel
+ * buffer structure to the caller.
*/
-STATIC int
-xlog_check_buffer_cancelled(
- struct log *log,
+STATIC struct xfs_buf_cancel *
+xlog_peek_buffer_cancelled(
+ struct xlog *log,
xfs_daddr_t blkno,
uint len,
ushort flags)
@@ -1699,22 +1787,16 @@ xlog_check_buffer_cancelled(
struct list_head *bucket;
struct xfs_buf_cancel *bcp;
- if (log->l_buf_cancel_table == NULL) {
- /*
- * There is nothing in the table built in pass one,
- * so this buffer must not be cancelled.
- */
+ if (!log->l_buf_cancel_table) {
+ /* empty table means no cancelled buffers in the log */
ASSERT(!(flags & XFS_BLF_CANCEL));
- return 0;
+ return NULL;
}
- /*
- * Search for an entry in the cancel table that matches our buffer.
- */
bucket = XLOG_BUF_CANCEL_BUCKET(log, blkno);
list_for_each_entry(bcp, bucket, bc_list) {
if (bcp->bc_blkno == blkno && bcp->bc_len == len)
- goto found;
+ return bcp;
}
/*
@@ -1722,9 +1804,32 @@ xlog_check_buffer_cancelled(
* that the buffer is NOT cancelled.
*/
ASSERT(!(flags & XFS_BLF_CANCEL));
- return 0;
+ return NULL;
+}
+
+/*
+ * If the buffer is being cancelled then return 1 so that it will be cancelled,
+ * otherwise return 0. If the buffer is actually a buffer cancel item
+ * (XFS_BLF_CANCEL is set), then decrement the refcount on the entry in the
+ * table and remove it from the table if this is the last reference.
+ *
+ * We remove the cancel record from the table when we encounter its last
+ * occurrence in the log so that if the same buffer is re-used again after its
+ * last cancellation we actually replay the changes made at that point.
+ */
+STATIC int
+xlog_check_buffer_cancelled(
+ struct xlog *log,
+ xfs_daddr_t blkno,
+ uint len,
+ ushort flags)
+{
+ struct xfs_buf_cancel *bcp;
+
+ bcp = xlog_peek_buffer_cancelled(log, blkno, len, flags);
+ if (!bcp)
+ return 0;
-found:
/*
* We've go a match, so return 1 so that the recovery of this buffer
* is cancelled. If this buffer is actually a buffer cancel log
@@ -1772,7 +1877,14 @@ xlog_recover_do_inode_buffer(
trace_xfs_log_recover_buf_inode_buf(mp->m_log, buf_f);
- inodes_per_buf = XFS_BUF_COUNT(bp) >> mp->m_sb.sb_inodelog;
+ /*
+ * Post recovery validation only works properly on CRC enabled
+ * filesystems.
+ */
+ if (xfs_sb_version_hascrc(&mp->m_sb))
+ bp->b_ops = &xfs_inode_buf_ops;
+
+ inodes_per_buf = BBTOB(bp->b_io_length) >> mp->m_sb.sb_inodelog;
for (i = 0; i < inodes_per_buf; i++) {
next_unlinked_offset = (i * mp->m_sb.sb_inodesize) +
offsetof(xfs_dinode_t, di_next_unlinked);
@@ -1814,7 +1926,8 @@ xlog_recover_do_inode_buffer(
ASSERT(item->ri_buf[item_index].i_addr != NULL);
ASSERT((item->ri_buf[item_index].i_len % XFS_BLF_CHUNK) == 0);
- ASSERT((reg_buf_offset + reg_buf_bytes) <= XFS_BUF_COUNT(bp));
+ ASSERT((reg_buf_offset + reg_buf_bytes) <=
+ BBTOB(bp->b_io_length));
/*
* The current logged region contains a copy of the
@@ -1836,12 +1949,361 @@ xlog_recover_do_inode_buffer(
buffer_nextp = (xfs_agino_t *)xfs_buf_offset(bp,
next_unlinked_offset);
*buffer_nextp = *logged_nextp;
+
+ /*
+ * If necessary, recalculate the CRC in the on-disk inode. We
+ * have to leave the inode in a consistent state for whoever
+ * reads it next....
+ */
+ xfs_dinode_calc_crc(mp, (struct xfs_dinode *)
+ xfs_buf_offset(bp, i * mp->m_sb.sb_inodesize));
+
}
return 0;
}
/*
+ * V5 filesystems know the age of the buffer on disk being recovered. We can
+ * have newer objects on disk than we are replaying, and so for these cases we
+ * don't want to replay the current change as that will make the buffer contents
+ * temporarily invalid on disk.
+ *
+ * The magic number might not match the buffer type we are going to recover
+ * (e.g. reallocated blocks), so we ignore the xfs_buf_log_format flags. Hence
+ * extract the LSN of the existing object in the buffer based on its current
+ * magic number. If we don't recognise the magic number in the buffer, then
+ * return a LSN of -1 so that the caller knows it was an unrecognised block and
+ * so can recover the buffer.
+ *
+ * Note: we cannot rely solely on magic number matches to determine that the
+ * buffer has a valid LSN - we also need to verify that it belongs to this
+ * filesystem, so we need to extract the object's UUID and compare it to that
+ * which we read from the superblock. If the UUIDs don't match, then we've got a
+ * stale metadata block from an old filesystem instance that we need to recover
+ * over the top of.
+ */
+static xfs_lsn_t
+xlog_recover_get_buf_lsn(
+ struct xfs_mount *mp,
+ struct xfs_buf *bp)
+{
+ __uint32_t magic32;
+ __uint16_t magic16;
+ __uint16_t magicda;
+ void *blk = bp->b_addr;
+ uuid_t *uuid;
+ xfs_lsn_t lsn = -1;
+
+ /* v4 filesystems always recover immediately */
+ if (!xfs_sb_version_hascrc(&mp->m_sb))
+ goto recover_immediately;
+
+ magic32 = be32_to_cpu(*(__be32 *)blk);
+ switch (magic32) {
+ case XFS_ABTB_CRC_MAGIC:
+ case XFS_ABTC_CRC_MAGIC:
+ case XFS_ABTB_MAGIC:
+ case XFS_ABTC_MAGIC:
+ case XFS_IBT_CRC_MAGIC:
+ case XFS_IBT_MAGIC: {
+ struct xfs_btree_block *btb = blk;
+
+ lsn = be64_to_cpu(btb->bb_u.s.bb_lsn);
+ uuid = &btb->bb_u.s.bb_uuid;
+ break;
+ }
+ case XFS_BMAP_CRC_MAGIC:
+ case XFS_BMAP_MAGIC: {
+ struct xfs_btree_block *btb = blk;
+
+ lsn = be64_to_cpu(btb->bb_u.l.bb_lsn);
+ uuid = &btb->bb_u.l.bb_uuid;
+ break;
+ }
+ case XFS_AGF_MAGIC:
+ lsn = be64_to_cpu(((struct xfs_agf *)blk)->agf_lsn);
+ uuid = &((struct xfs_agf *)blk)->agf_uuid;
+ break;
+ case XFS_AGFL_MAGIC:
+ lsn = be64_to_cpu(((struct xfs_agfl *)blk)->agfl_lsn);
+ uuid = &((struct xfs_agfl *)blk)->agfl_uuid;
+ break;
+ case XFS_AGI_MAGIC:
+ lsn = be64_to_cpu(((struct xfs_agi *)blk)->agi_lsn);
+ uuid = &((struct xfs_agi *)blk)->agi_uuid;
+ break;
+ case XFS_SYMLINK_MAGIC:
+ lsn = be64_to_cpu(((struct xfs_dsymlink_hdr *)blk)->sl_lsn);
+ uuid = &((struct xfs_dsymlink_hdr *)blk)->sl_uuid;
+ break;
+ case XFS_DIR3_BLOCK_MAGIC:
+ case XFS_DIR3_DATA_MAGIC:
+ case XFS_DIR3_FREE_MAGIC:
+ lsn = be64_to_cpu(((struct xfs_dir3_blk_hdr *)blk)->lsn);
+ uuid = &((struct xfs_dir3_blk_hdr *)blk)->uuid;
+ break;
+ case XFS_ATTR3_RMT_MAGIC:
+ lsn = be64_to_cpu(((struct xfs_attr3_rmt_hdr *)blk)->rm_lsn);
+ uuid = &((struct xfs_attr3_rmt_hdr *)blk)->rm_uuid;
+ break;
+ case XFS_SB_MAGIC:
+ lsn = be64_to_cpu(((struct xfs_dsb *)blk)->sb_lsn);
+ uuid = &((struct xfs_dsb *)blk)->sb_uuid;
+ break;
+ default:
+ break;
+ }
+
+ if (lsn != (xfs_lsn_t)-1) {
+ if (!uuid_equal(&mp->m_sb.sb_uuid, uuid))
+ goto recover_immediately;
+ return lsn;
+ }
+
+ magicda = be16_to_cpu(((struct xfs_da_blkinfo *)blk)->magic);
+ switch (magicda) {
+ case XFS_DIR3_LEAF1_MAGIC:
+ case XFS_DIR3_LEAFN_MAGIC:
+ case XFS_DA3_NODE_MAGIC:
+ lsn = be64_to_cpu(((struct xfs_da3_blkinfo *)blk)->lsn);
+ uuid = &((struct xfs_da3_blkinfo *)blk)->uuid;
+ break;
+ default:
+ break;
+ }
+
+ if (lsn != (xfs_lsn_t)-1) {
+ if (!uuid_equal(&mp->m_sb.sb_uuid, uuid))
+ goto recover_immediately;
+ return lsn;
+ }
+
+ /*
+ * We do individual object checks on dquot and inode buffers as they
+ * have their own individual LSN records. Also, we could have a stale
+ * buffer here, so we have to at least recognise these buffer types.
+ *
+ * A noted complexity here is inode unlinked list processing - it logs
+ * the inode directly in the buffer, but we don't know which inodes have
+ * been modified, and there is no global buffer LSN. Hence we need to
+ * recover all inode buffer types immediately. This problem will be
+ * fixed by logical logging of the unlinked list modifications.
+ */
+ magic16 = be16_to_cpu(*(__be16 *)blk);
+ switch (magic16) {
+ case XFS_DQUOT_MAGIC:
+ case XFS_DINODE_MAGIC:
+ goto recover_immediately;
+ default:
+ break;
+ }
+
+ /* unknown buffer contents, recover immediately */
+
+recover_immediately:
+ return (xfs_lsn_t)-1;
+
+}
+
+/*
+ * Validate the recovered buffer is of the correct type and attach the
+ * appropriate buffer operations to them for writeback. Magic numbers are in a
+ * few places:
+ * the first 16 bits of the buffer (inode buffer, dquot buffer),
+ * the first 32 bits of the buffer (most blocks),
+ * inside a struct xfs_da_blkinfo at the start of the buffer.
+ */
+static void
+xlog_recover_validate_buf_type(
+ struct xfs_mount *mp,
+ struct xfs_buf *bp,
+ xfs_buf_log_format_t *buf_f)
+{
+ struct xfs_da_blkinfo *info = bp->b_addr;
+ __uint32_t magic32;
+ __uint16_t magic16;
+ __uint16_t magicda;
+
+ magic32 = be32_to_cpu(*(__be32 *)bp->b_addr);
+ magic16 = be16_to_cpu(*(__be16*)bp->b_addr);
+ magicda = be16_to_cpu(info->magic);
+ switch (xfs_blft_from_flags(buf_f)) {
+ case XFS_BLFT_BTREE_BUF:
+ switch (magic32) {
+ case XFS_ABTB_CRC_MAGIC:
+ case XFS_ABTC_CRC_MAGIC:
+ case XFS_ABTB_MAGIC:
+ case XFS_ABTC_MAGIC:
+ bp->b_ops = &xfs_allocbt_buf_ops;
+ break;
+ case XFS_IBT_CRC_MAGIC:
+ case XFS_FIBT_CRC_MAGIC:
+ case XFS_IBT_MAGIC:
+ case XFS_FIBT_MAGIC:
+ bp->b_ops = &xfs_inobt_buf_ops;
+ break;
+ case XFS_BMAP_CRC_MAGIC:
+ case XFS_BMAP_MAGIC:
+ bp->b_ops = &xfs_bmbt_buf_ops;
+ break;
+ default:
+ xfs_warn(mp, "Bad btree block magic!");
+ ASSERT(0);
+ break;
+ }
+ break;
+ case XFS_BLFT_AGF_BUF:
+ if (magic32 != XFS_AGF_MAGIC) {
+ xfs_warn(mp, "Bad AGF block magic!");
+ ASSERT(0);
+ break;
+ }
+ bp->b_ops = &xfs_agf_buf_ops;
+ break;
+ case XFS_BLFT_AGFL_BUF:
+ if (!xfs_sb_version_hascrc(&mp->m_sb))
+ break;
+ if (magic32 != XFS_AGFL_MAGIC) {
+ xfs_warn(mp, "Bad AGFL block magic!");
+ ASSERT(0);
+ break;
+ }
+ bp->b_ops = &xfs_agfl_buf_ops;
+ break;
+ case XFS_BLFT_AGI_BUF:
+ if (magic32 != XFS_AGI_MAGIC) {
+ xfs_warn(mp, "Bad AGI block magic!");
+ ASSERT(0);
+ break;
+ }
+ bp->b_ops = &xfs_agi_buf_ops;
+ break;
+ case XFS_BLFT_UDQUOT_BUF:
+ case XFS_BLFT_PDQUOT_BUF:
+ case XFS_BLFT_GDQUOT_BUF:
+#ifdef CONFIG_XFS_QUOTA
+ if (magic16 != XFS_DQUOT_MAGIC) {
+ xfs_warn(mp, "Bad DQUOT block magic!");
+ ASSERT(0);
+ break;
+ }
+ bp->b_ops = &xfs_dquot_buf_ops;
+#else
+ xfs_alert(mp,
+ "Trying to recover dquots without QUOTA support built in!");
+ ASSERT(0);
+#endif
+ break;
+ case XFS_BLFT_DINO_BUF:
+ /*
+ * we get here with inode allocation buffers, not buffers that
+ * track unlinked list changes.
+ */
+ if (magic16 != XFS_DINODE_MAGIC) {
+ xfs_warn(mp, "Bad INODE block magic!");
+ ASSERT(0);
+ break;
+ }
+ bp->b_ops = &xfs_inode_buf_ops;
+ break;
+ case XFS_BLFT_SYMLINK_BUF:
+ if (magic32 != XFS_SYMLINK_MAGIC) {
+ xfs_warn(mp, "Bad symlink block magic!");
+ ASSERT(0);
+ break;
+ }
+ bp->b_ops = &xfs_symlink_buf_ops;
+ break;
+ case XFS_BLFT_DIR_BLOCK_BUF:
+ if (magic32 != XFS_DIR2_BLOCK_MAGIC &&
+ magic32 != XFS_DIR3_BLOCK_MAGIC) {
+ xfs_warn(mp, "Bad dir block magic!");
+ ASSERT(0);
+ break;
+ }
+ bp->b_ops = &xfs_dir3_block_buf_ops;
+ break;
+ case XFS_BLFT_DIR_DATA_BUF:
+ if (magic32 != XFS_DIR2_DATA_MAGIC &&
+ magic32 != XFS_DIR3_DATA_MAGIC) {
+ xfs_warn(mp, "Bad dir data magic!");
+ ASSERT(0);
+ break;
+ }
+ bp->b_ops = &xfs_dir3_data_buf_ops;
+ break;
+ case XFS_BLFT_DIR_FREE_BUF:
+ if (magic32 != XFS_DIR2_FREE_MAGIC &&
+ magic32 != XFS_DIR3_FREE_MAGIC) {
+ xfs_warn(mp, "Bad dir3 free magic!");
+ ASSERT(0);
+ break;
+ }
+ bp->b_ops = &xfs_dir3_free_buf_ops;
+ break;
+ case XFS_BLFT_DIR_LEAF1_BUF:
+ if (magicda != XFS_DIR2_LEAF1_MAGIC &&
+ magicda != XFS_DIR3_LEAF1_MAGIC) {
+ xfs_warn(mp, "Bad dir leaf1 magic!");
+ ASSERT(0);
+ break;
+ }
+ bp->b_ops = &xfs_dir3_leaf1_buf_ops;
+ break;
+ case XFS_BLFT_DIR_LEAFN_BUF:
+ if (magicda != XFS_DIR2_LEAFN_MAGIC &&
+ magicda != XFS_DIR3_LEAFN_MAGIC) {
+ xfs_warn(mp, "Bad dir leafn magic!");
+ ASSERT(0);
+ break;
+ }
+ bp->b_ops = &xfs_dir3_leafn_buf_ops;
+ break;
+ case XFS_BLFT_DA_NODE_BUF:
+ if (magicda != XFS_DA_NODE_MAGIC &&
+ magicda != XFS_DA3_NODE_MAGIC) {
+ xfs_warn(mp, "Bad da node magic!");
+ ASSERT(0);
+ break;
+ }
+ bp->b_ops = &xfs_da3_node_buf_ops;
+ break;
+ case XFS_BLFT_ATTR_LEAF_BUF:
+ if (magicda != XFS_ATTR_LEAF_MAGIC &&
+ magicda != XFS_ATTR3_LEAF_MAGIC) {
+ xfs_warn(mp, "Bad attr leaf magic!");
+ ASSERT(0);
+ break;
+ }
+ bp->b_ops = &xfs_attr3_leaf_buf_ops;
+ break;
+ case XFS_BLFT_ATTR_RMT_BUF:
+ if (!xfs_sb_version_hascrc(&mp->m_sb))
+ break;
+ if (magic32 != XFS_ATTR3_RMT_MAGIC) {
+ xfs_warn(mp, "Bad attr remote magic!");
+ ASSERT(0);
+ break;
+ }
+ bp->b_ops = &xfs_attr3_rmt_buf_ops;
+ break;
+ case XFS_BLFT_SB_BUF:
+ if (magic32 != XFS_SB_MAGIC) {
+ xfs_warn(mp, "Bad SB block magic!");
+ ASSERT(0);
+ break;
+ }
+ bp->b_ops = &xfs_sb_buf_ops;
+ break;
+ default:
+ xfs_warn(mp, "Unknown buffer type %d!",
+ xfs_blft_from_flags(buf_f));
+ break;
+ }
+}
+
+/*
* Perform a 'normal' buffer recovery. Each logged region of the
* buffer should be copied over the corresponding region in the
* given buffer. The bitmap in the buf log format structure indicates
@@ -1873,8 +2335,19 @@ xlog_recover_do_reg_buffer(
ASSERT(nbits > 0);
ASSERT(item->ri_buf[i].i_addr != NULL);
ASSERT(item->ri_buf[i].i_len % XFS_BLF_CHUNK == 0);
- ASSERT(XFS_BUF_COUNT(bp) >=
- ((uint)bit << XFS_BLF_SHIFT)+(nbits<<XFS_BLF_SHIFT));
+ ASSERT(BBTOB(bp->b_io_length) >=
+ ((uint)bit << XFS_BLF_SHIFT) + (nbits << XFS_BLF_SHIFT));
+
+ /*
+ * The dirty regions logged in the buffer, even though
+ * contiguous, may span multiple chunks. This is because the
+ * dirty region may span a physical page boundary in a buffer
+ * and hence be split into two separate vectors for writing into
+ * the log. Hence we need to trim nbits back to the length of
+ * the current region being copied out of the log.
+ */
+ if (item->ri_buf[i].i_len < (nbits << XFS_BLF_SHIFT))
+ nbits = item->ri_buf[i].i_len >> XFS_BLF_SHIFT;
/*
* Do a sanity check if this is a dquot buffer. Just checking
@@ -1895,7 +2368,7 @@ xlog_recover_do_reg_buffer(
item->ri_buf[i].i_len, __func__);
goto next;
}
- error = xfs_qm_dqcheck(mp, item->ri_buf[i].i_addr,
+ error = xfs_dqcheck(mp, item->ri_buf[i].i_addr,
-1, 0, XFS_QMOPT_DOWARN,
"dquot_buf_recover");
if (error)
@@ -1913,142 +2386,32 @@ xlog_recover_do_reg_buffer(
/* Shouldn't be any more regions */
ASSERT(i == item->ri_total);
-}
-
-/*
- * Do some primitive error checking on ondisk dquot data structures.
- */
-int
-xfs_qm_dqcheck(
- struct xfs_mount *mp,
- xfs_disk_dquot_t *ddq,
- xfs_dqid_t id,
- uint type, /* used only when IO_dorepair is true */
- uint flags,
- char *str)
-{
- xfs_dqblk_t *d = (xfs_dqblk_t *)ddq;
- int errs = 0;
-
- /*
- * We can encounter an uninitialized dquot buffer for 2 reasons:
- * 1. If we crash while deleting the quotainode(s), and those blks got
- * used for user data. This is because we take the path of regular
- * file deletion; however, the size field of quotainodes is never
- * updated, so all the tricks that we play in itruncate_finish
- * don't quite matter.
- *
- * 2. We don't play the quota buffers when there's a quotaoff logitem.
- * But the allocation will be replayed so we'll end up with an
- * uninitialized quota block.
- *
- * This is all fine; things are still consistent, and we haven't lost
- * any quota information. Just don't complain about bad dquot blks.
- */
- if (ddq->d_magic != cpu_to_be16(XFS_DQUOT_MAGIC)) {
- if (flags & XFS_QMOPT_DOWARN)
- xfs_alert(mp,
- "%s : XFS dquot ID 0x%x, magic 0x%x != 0x%x",
- str, id, be16_to_cpu(ddq->d_magic), XFS_DQUOT_MAGIC);
- errs++;
- }
- if (ddq->d_version != XFS_DQUOT_VERSION) {
- if (flags & XFS_QMOPT_DOWARN)
- xfs_alert(mp,
- "%s : XFS dquot ID 0x%x, version 0x%x != 0x%x",
- str, id, ddq->d_version, XFS_DQUOT_VERSION);
- errs++;
- }
-
- if (ddq->d_flags != XFS_DQ_USER &&
- ddq->d_flags != XFS_DQ_PROJ &&
- ddq->d_flags != XFS_DQ_GROUP) {
- if (flags & XFS_QMOPT_DOWARN)
- xfs_alert(mp,
- "%s : XFS dquot ID 0x%x, unknown flags 0x%x",
- str, id, ddq->d_flags);
- errs++;
- }
-
- if (id != -1 && id != be32_to_cpu(ddq->d_id)) {
- if (flags & XFS_QMOPT_DOWARN)
- xfs_alert(mp,
- "%s : ondisk-dquot 0x%p, ID mismatch: "
- "0x%x expected, found id 0x%x",
- str, ddq, id, be32_to_cpu(ddq->d_id));
- errs++;
- }
-
- if (!errs && ddq->d_id) {
- if (ddq->d_blk_softlimit &&
- be64_to_cpu(ddq->d_bcount) >=
- be64_to_cpu(ddq->d_blk_softlimit)) {
- if (!ddq->d_btimer) {
- if (flags & XFS_QMOPT_DOWARN)
- xfs_alert(mp,
- "%s : Dquot ID 0x%x (0x%p) BLK TIMER NOT STARTED",
- str, (int)be32_to_cpu(ddq->d_id), ddq);
- errs++;
- }
- }
- if (ddq->d_ino_softlimit &&
- be64_to_cpu(ddq->d_icount) >=
- be64_to_cpu(ddq->d_ino_softlimit)) {
- if (!ddq->d_itimer) {
- if (flags & XFS_QMOPT_DOWARN)
- xfs_alert(mp,
- "%s : Dquot ID 0x%x (0x%p) INODE TIMER NOT STARTED",
- str, (int)be32_to_cpu(ddq->d_id), ddq);
- errs++;
- }
- }
- if (ddq->d_rtb_softlimit &&
- be64_to_cpu(ddq->d_rtbcount) >=
- be64_to_cpu(ddq->d_rtb_softlimit)) {
- if (!ddq->d_rtbtimer) {
- if (flags & XFS_QMOPT_DOWARN)
- xfs_alert(mp,
- "%s : Dquot ID 0x%x (0x%p) RTBLK TIMER NOT STARTED",
- str, (int)be32_to_cpu(ddq->d_id), ddq);
- errs++;
- }
- }
- }
-
- if (!errs || !(flags & XFS_QMOPT_DQREPAIR))
- return errs;
-
- if (flags & XFS_QMOPT_DOWARN)
- xfs_notice(mp, "Re-initializing dquot ID 0x%x", id);
/*
- * Typically, a repair is only requested by quotacheck.
+ * We can only do post recovery validation on items on CRC enabled
+ * filesystems as we need to know when the buffer was written to be able
+ * to determine if we should have replayed the item. If we replay old
+ * metadata over a newer buffer, then it will enter a temporarily
+ * inconsistent state resulting in verification failures. Hence for now
+ * just avoid the verification stage for non-crc filesystems
*/
- ASSERT(id != -1);
- ASSERT(flags & XFS_QMOPT_DQREPAIR);
- memset(d, 0, sizeof(xfs_dqblk_t));
-
- d->dd_diskdq.d_magic = cpu_to_be16(XFS_DQUOT_MAGIC);
- d->dd_diskdq.d_version = XFS_DQUOT_VERSION;
- d->dd_diskdq.d_flags = type;
- d->dd_diskdq.d_id = cpu_to_be32(id);
-
- return errs;
+ if (xfs_sb_version_hascrc(&mp->m_sb))
+ xlog_recover_validate_buf_type(mp, bp, buf_f);
}
/*
* Perform a dquot buffer recovery.
- * Simple algorithm: if we have found a QUOTAOFF logitem of the same type
+ * Simple algorithm: if we have found a QUOTAOFF log item of the same type
* (ie. USR or GRP), then just toss this buffer away; don't recover it.
* Else, treat it as a regular buffer and do recovery.
*/
STATIC void
xlog_recover_do_dquot_buffer(
- xfs_mount_t *mp,
- xlog_t *log,
- xlog_recover_item_t *item,
- xfs_buf_t *bp,
- xfs_buf_log_format_t *buf_f)
+ struct xfs_mount *mp,
+ struct xlog *log,
+ struct xlog_recover_item *item,
+ struct xfs_buf *bp,
+ struct xfs_buf_log_format *buf_f)
{
uint type;
@@ -2097,19 +2460,22 @@ xlog_recover_do_dquot_buffer(
* over the log during recovery. During the first we build a table of
* those buffers which have been cancelled, and during the second we
* only replay those buffers which do not have corresponding cancel
- * records in the table. See xlog_recover_do_buffer_pass[1,2] above
+ * records in the table. See xlog_recover_buffer_pass[1,2] above
* for more details on the implementation of the table of cancel records.
*/
STATIC int
xlog_recover_buffer_pass2(
- xlog_t *log,
- xlog_recover_item_t *item)
+ struct xlog *log,
+ struct list_head *buffer_list,
+ struct xlog_recover_item *item,
+ xfs_lsn_t current_lsn)
{
xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr;
xfs_mount_t *mp = log->l_mp;
xfs_buf_t *bp;
int error;
uint buf_flags;
+ xfs_lsn_t lsn;
/*
* In this pass we only want to recover all the buffers which have
@@ -2123,21 +2489,28 @@ xlog_recover_buffer_pass2(
trace_xfs_log_recover_buf_recover(log, buf_f);
- buf_flags = XBF_LOCK;
- if (!(buf_f->blf_flags & XFS_BLF_INODE_BUF))
- buf_flags |= XBF_MAPPED;
+ buf_flags = 0;
+ if (buf_f->blf_flags & XFS_BLF_INODE_BUF)
+ buf_flags |= XBF_UNMAPPED;
bp = xfs_buf_read(mp->m_ddev_targp, buf_f->blf_blkno, buf_f->blf_len,
- buf_flags);
+ buf_flags, NULL);
if (!bp)
return XFS_ERROR(ENOMEM);
error = bp->b_error;
if (error) {
xfs_buf_ioerror_alert(bp, "xlog_recover_do..(read#1)");
- xfs_buf_relse(bp);
- return error;
+ goto out_release;
}
+ /*
+ * recover the buffer only if we get an LSN from it and it's less than
+ * the lsn of the transaction we are replaying.
+ */
+ lsn = xlog_recover_get_buf_lsn(mp, bp);
+ if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0)
+ goto out_release;
+
if (buf_f->blf_flags & XFS_BLF_INODE_BUF) {
error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f);
} else if (buf_f->blf_flags &
@@ -2147,7 +2520,7 @@ xlog_recover_buffer_pass2(
xlog_recover_do_reg_buffer(mp, item, bp, buf_f);
}
if (error)
- return XFS_ERROR(error);
+ goto out_release;
/*
* Perform delayed write on the buffer. Asynchronous writes will be
@@ -2155,35 +2528,114 @@ xlog_recover_buffer_pass2(
*
* Also make sure that only inode buffers with good sizes stay in
* the buffer cache. The kernel moves inodes in buffers of 1 block
- * or XFS_INODE_CLUSTER_SIZE bytes, whichever is bigger. The inode
+ * or mp->m_inode_cluster_size bytes, whichever is bigger. The inode
* buffers in the log can be a different size if the log was generated
* by an older kernel using unclustered inode buffers or a newer kernel
* running with a different inode cluster size. Regardless, if the
- * the inode buffer size isn't MAX(blocksize, XFS_INODE_CLUSTER_SIZE)
- * for *our* value of XFS_INODE_CLUSTER_SIZE, then we need to keep
+ * inode buffer size isn't MAX(blocksize, mp->m_inode_cluster_size)
+ * for *our* value of mp->m_inode_cluster_size, then we need to keep
* the buffer out of the buffer cache so that the buffer won't
* overlap with future reads of those inodes.
*/
if (XFS_DINODE_MAGIC ==
be16_to_cpu(*((__be16 *)xfs_buf_offset(bp, 0))) &&
- (XFS_BUF_COUNT(bp) != MAX(log->l_mp->m_sb.sb_blocksize,
- (__uint32_t)XFS_INODE_CLUSTER_SIZE(log->l_mp)))) {
+ (BBTOB(bp->b_io_length) != MAX(log->l_mp->m_sb.sb_blocksize,
+ (__uint32_t)log->l_mp->m_inode_cluster_size))) {
xfs_buf_stale(bp);
error = xfs_bwrite(bp);
} else {
ASSERT(bp->b_target->bt_mount == mp);
bp->b_iodone = xlog_recover_iodone;
- xfs_buf_delwri_queue(bp);
+ xfs_buf_delwri_queue(bp, buffer_list);
}
+out_release:
xfs_buf_relse(bp);
return error;
}
+/*
+ * Inode fork owner changes
+ *
+ * If we have been told that we have to reparent the inode fork, it's because an
+ * extent swap operation on a CRC enabled filesystem has been done and we are
+ * replaying it. We need to walk the BMBT of the appropriate fork and change the
+ * owners of it.
+ *
+ * The complexity here is that we don't have an inode context to work with, so
+ * after we've replayed the inode we need to instantiate one. This is where the
+ * fun begins.
+ *
+ * We are in the middle of log recovery, so we can't run transactions. That
+ * means we cannot use cache coherent inode instantiation via xfs_iget(), as
+ * that will result in the corresponding iput() running the inode through
+ * xfs_inactive(). If we've just replayed an inode core that changes the link
+ * count to zero (i.e. it's been unlinked), then xfs_inactive() will run
+ * transactions (bad!).
+ *
+ * So, to avoid this, we instantiate an inode directly from the inode core we've
+ * just recovered. We have the buffer still locked, and all we really need to
+ * instantiate is the inode core and the forks being modified. We can do this
+ * manually, then run the inode btree owner change, and then tear down the
+ * xfs_inode without having to run any transactions at all.
+ *
+ * Also, we don't have a transaction context available here but need to gather
+ * all the buffers we modify for writeback, so we pass the buffer_list instead
+ * for the operation to use.
+ */
+
+STATIC int
+xfs_recover_inode_owner_change(
+ struct xfs_mount *mp,
+ struct xfs_dinode *dip,
+ struct xfs_inode_log_format *in_f,
+ struct list_head *buffer_list)
+{
+ struct xfs_inode *ip;
+ int error;
+
+ ASSERT(in_f->ilf_fields & (XFS_ILOG_DOWNER|XFS_ILOG_AOWNER));
+
+ ip = xfs_inode_alloc(mp, in_f->ilf_ino);
+ if (!ip)
+ return ENOMEM;
+
+ /* instantiate the inode */
+ xfs_dinode_from_disk(&ip->i_d, dip);
+ ASSERT(ip->i_d.di_version >= 3);
+
+ error = xfs_iformat_fork(ip, dip);
+ if (error)
+ goto out_free_ip;
+
+
+ if (in_f->ilf_fields & XFS_ILOG_DOWNER) {
+ ASSERT(in_f->ilf_fields & XFS_ILOG_DBROOT);
+ error = xfs_bmbt_change_owner(NULL, ip, XFS_DATA_FORK,
+ ip->i_ino, buffer_list);
+ if (error)
+ goto out_free_ip;
+ }
+
+ if (in_f->ilf_fields & XFS_ILOG_AOWNER) {
+ ASSERT(in_f->ilf_fields & XFS_ILOG_ABROOT);
+ error = xfs_bmbt_change_owner(NULL, ip, XFS_ATTR_FORK,
+ ip->i_ino, buffer_list);
+ if (error)
+ goto out_free_ip;
+ }
+
+out_free_ip:
+ xfs_inode_free(ip);
+ return error;
+}
+
STATIC int
xlog_recover_inode_pass2(
- xlog_t *log,
- xlog_recover_item_t *item)
+ struct xlog *log,
+ struct list_head *buffer_list,
+ struct xlog_recover_item *item,
+ xfs_lsn_t current_lsn)
{
xfs_inode_log_format_t *in_f;
xfs_mount_t *mp = log->l_mp;
@@ -2196,6 +2648,7 @@ xlog_recover_inode_pass2(
int attr_index;
uint fields;
xfs_icdinode_t *dicp;
+ uint isize;
int need_free = 0;
if (item->ri_buf[0].i_len == sizeof(xfs_inode_log_format_t)) {
@@ -2220,8 +2673,8 @@ xlog_recover_inode_pass2(
}
trace_xfs_log_recover_inode_recover(log, in_f);
- bp = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len,
- XBF_LOCK);
+ bp = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len, 0,
+ &xfs_inode_buf_ops);
if (!bp) {
error = ENOMEM;
goto error;
@@ -2229,8 +2682,7 @@ xlog_recover_inode_pass2(
error = bp->b_error;
if (error) {
xfs_buf_ioerror_alert(bp, "xlog_recover_do..(read#2)");
- xfs_buf_relse(bp);
- goto error;
+ goto out_release;
}
ASSERT(in_f->ilf_fields & XFS_ILOG_CORE);
dip = (xfs_dinode_t *)xfs_buf_offset(bp, in_f->ilf_boffset);
@@ -2240,29 +2692,52 @@ xlog_recover_inode_pass2(
* like an inode!
*/
if (unlikely(dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC))) {
- xfs_buf_relse(bp);
xfs_alert(mp,
"%s: Bad inode magic number, dip = 0x%p, dino bp = 0x%p, ino = %Ld",
__func__, dip, bp, in_f->ilf_ino);
XFS_ERROR_REPORT("xlog_recover_inode_pass2(1)",
XFS_ERRLEVEL_LOW, mp);
error = EFSCORRUPTED;
- goto error;
+ goto out_release;
}
dicp = item->ri_buf[1].i_addr;
if (unlikely(dicp->di_magic != XFS_DINODE_MAGIC)) {
- xfs_buf_relse(bp);
xfs_alert(mp,
"%s: Bad inode log record, rec ptr 0x%p, ino %Ld",
__func__, item, in_f->ilf_ino);
XFS_ERROR_REPORT("xlog_recover_inode_pass2(2)",
XFS_ERRLEVEL_LOW, mp);
error = EFSCORRUPTED;
- goto error;
+ goto out_release;
+ }
+
+ /*
+ * If the inode has an LSN in it, recover the inode only if it's less
+ * than the lsn of the transaction we are replaying. Note: we still
+ * need to replay an owner change even though the inode is more recent
+ * than the transaction as there is no guarantee that all the btree
+ * blocks are more recent than this transaction, too.
+ */
+ if (dip->di_version >= 3) {
+ xfs_lsn_t lsn = be64_to_cpu(dip->di_lsn);
+
+ if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) {
+ trace_xfs_log_recover_inode_skip(log, in_f);
+ error = 0;
+ goto out_owner_change;
+ }
}
- /* Skip replay when the on disk inode is newer than the log one */
- if (dicp->di_flushiter < be16_to_cpu(dip->di_flushiter)) {
+ /*
+ * di_flushiter is only valid for v1/2 inodes. All changes for v3 inodes
+ * are transactional and if ordering is necessary we can determine that
+ * more accurately by the LSN field in the V3 inode core. Don't trust
+ * the inode versions as we might be changing them here - use the
+ * superblock flag to determine whether we need to look at di_flushiter
+ * to skip replay when the on disk inode is newer than the log one
+ */
+ if (!xfs_sb_version_hascrc(&mp->m_sb) &&
+ dicp->di_flushiter < be16_to_cpu(dip->di_flushiter)) {
/*
* Deal with the wrap case, DI_MAX_FLUSH is less
* than smaller numbers
@@ -2271,12 +2746,12 @@ xlog_recover_inode_pass2(
dicp->di_flushiter < (DI_MAX_FLUSH >> 1)) {
/* do nothing */
} else {
- xfs_buf_relse(bp);
trace_xfs_log_recover_inode_skip(log, in_f);
error = 0;
- goto error;
+ goto out_release;
}
}
+
/* Take the opportunity to reset the flush iteration count */
dicp->di_flushiter = 0;
@@ -2285,13 +2760,12 @@ xlog_recover_inode_pass2(
(dicp->di_format != XFS_DINODE_FMT_BTREE)) {
XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(3)",
XFS_ERRLEVEL_LOW, mp, dicp);
- xfs_buf_relse(bp);
xfs_alert(mp,
"%s: Bad regular inode log record, rec ptr 0x%p, "
"ino ptr = 0x%p, ino bp = 0x%p, ino %Ld",
__func__, item, dip, bp, in_f->ilf_ino);
error = EFSCORRUPTED;
- goto error;
+ goto out_release;
}
} else if (unlikely(S_ISDIR(dicp->di_mode))) {
if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) &&
@@ -2299,19 +2773,17 @@ xlog_recover_inode_pass2(
(dicp->di_format != XFS_DINODE_FMT_LOCAL)) {
XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(4)",
XFS_ERRLEVEL_LOW, mp, dicp);
- xfs_buf_relse(bp);
xfs_alert(mp,
"%s: Bad dir inode log record, rec ptr 0x%p, "
"ino ptr = 0x%p, ino bp = 0x%p, ino %Ld",
__func__, item, dip, bp, in_f->ilf_ino);
error = EFSCORRUPTED;
- goto error;
+ goto out_release;
}
}
if (unlikely(dicp->di_nextents + dicp->di_anextents > dicp->di_nblocks)){
XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(5)",
XFS_ERRLEVEL_LOW, mp, dicp);
- xfs_buf_relse(bp);
xfs_alert(mp,
"%s: Bad inode log record, rec ptr 0x%p, dino ptr 0x%p, "
"dino bp 0x%p, ino %Ld, total extents = %d, nblocks = %Ld",
@@ -2319,38 +2791,37 @@ xlog_recover_inode_pass2(
dicp->di_nextents + dicp->di_anextents,
dicp->di_nblocks);
error = EFSCORRUPTED;
- goto error;
+ goto out_release;
}
if (unlikely(dicp->di_forkoff > mp->m_sb.sb_inodesize)) {
XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(6)",
XFS_ERRLEVEL_LOW, mp, dicp);
- xfs_buf_relse(bp);
xfs_alert(mp,
"%s: Bad inode log record, rec ptr 0x%p, dino ptr 0x%p, "
"dino bp 0x%p, ino %Ld, forkoff 0x%x", __func__,
item, dip, bp, in_f->ilf_ino, dicp->di_forkoff);
error = EFSCORRUPTED;
- goto error;
+ goto out_release;
}
- if (unlikely(item->ri_buf[1].i_len > sizeof(struct xfs_icdinode))) {
+ isize = xfs_icdinode_size(dicp->di_version);
+ if (unlikely(item->ri_buf[1].i_len > isize)) {
XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(7)",
XFS_ERRLEVEL_LOW, mp, dicp);
- xfs_buf_relse(bp);
xfs_alert(mp,
"%s: Bad inode log record length %d, rec ptr 0x%p",
__func__, item->ri_buf[1].i_len, item);
error = EFSCORRUPTED;
- goto error;
+ goto out_release;
}
/* The core is in in-core format */
- xfs_dinode_to_disk(dip, item->ri_buf[1].i_addr);
+ xfs_dinode_to_disk(dip, dicp);
/* the rest is in on-disk format */
- if (item->ri_buf[1].i_len > sizeof(struct xfs_icdinode)) {
- memcpy((xfs_caddr_t) dip + sizeof(struct xfs_icdinode),
- item->ri_buf[1].i_addr + sizeof(struct xfs_icdinode),
- item->ri_buf[1].i_len - sizeof(struct xfs_icdinode));
+ if (item->ri_buf[1].i_len > isize) {
+ memcpy((char *)dip + isize,
+ item->ri_buf[1].i_addr + isize,
+ item->ri_buf[1].i_len - isize);
}
fields = in_f->ilf_fields;
@@ -2366,7 +2837,7 @@ xlog_recover_inode_pass2(
}
if (in_f->ilf_size == 2)
- goto write_inode_buffer;
+ goto out_owner_change;
len = item->ri_buf[2].i_len;
src = item->ri_buf[2].i_addr;
ASSERT(in_f->ilf_size <= 4);
@@ -2427,16 +2898,23 @@ xlog_recover_inode_pass2(
default:
xfs_warn(log->l_mp, "%s: Invalid flag", __func__);
ASSERT(0);
- xfs_buf_relse(bp);
error = EIO;
- goto error;
+ goto out_release;
}
}
-write_inode_buffer:
+out_owner_change:
+ if (in_f->ilf_fields & (XFS_ILOG_DOWNER|XFS_ILOG_AOWNER))
+ error = xfs_recover_inode_owner_change(mp, dip, in_f,
+ buffer_list);
+ /* re-generate the checksum. */
+ xfs_dinode_calc_crc(log->l_mp, dip);
+
ASSERT(bp->b_target->bt_mount == mp);
bp->b_iodone = xlog_recover_iodone;
- xfs_buf_delwri_queue(bp);
+ xfs_buf_delwri_queue(bp, buffer_list);
+
+out_release:
xfs_buf_relse(bp);
error:
if (need_free)
@@ -2445,14 +2923,14 @@ error:
}
/*
- * Recover QUOTAOFF records. We simply make a note of it in the xlog_t
+ * Recover QUOTAOFF records. We simply make a note of it in the xlog
* structure, so that we know not to do any dquot item or dquot buffer recovery,
* of that type.
*/
STATIC int
xlog_recover_quotaoff_pass1(
- xlog_t *log,
- xlog_recover_item_t *item)
+ struct xlog *log,
+ struct xlog_recover_item *item)
{
xfs_qoff_logformat_t *qoff_f = item->ri_buf[0].i_addr;
ASSERT(qoff_f);
@@ -2476,8 +2954,10 @@ xlog_recover_quotaoff_pass1(
*/
STATIC int
xlog_recover_dquot_pass2(
- xlog_t *log,
- xlog_recover_item_t *item)
+ struct xlog *log,
+ struct list_head *buffer_list,
+ struct xlog_recover_item *item,
+ xfs_lsn_t current_lsn)
{
xfs_mount_t *mp = log->l_mp;
xfs_buf_t *bp;
@@ -2524,20 +3004,18 @@ xlog_recover_dquot_pass2(
*/
dq_f = item->ri_buf[0].i_addr;
ASSERT(dq_f);
- error = xfs_qm_dqcheck(mp, recddq, dq_f->qlf_id, 0, XFS_QMOPT_DOWARN,
+ error = xfs_dqcheck(mp, recddq, dq_f->qlf_id, 0, XFS_QMOPT_DOWARN,
"xlog_recover_dquot_pass2 (log copy)");
if (error)
return XFS_ERROR(EIO);
ASSERT(dq_f->qlf_len == 1);
- error = xfs_read_buf(mp, mp->m_ddev_targp,
- dq_f->qlf_blkno,
- XFS_FSB_TO_BB(mp, dq_f->qlf_len),
- 0, &bp);
- if (error) {
- xfs_buf_ioerror_alert(bp, "xlog_recover_do..(read#3)");
+ error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dq_f->qlf_blkno,
+ XFS_FSB_TO_BB(mp, dq_f->qlf_len), 0, &bp,
+ NULL);
+ if (error)
return error;
- }
+
ASSERT(bp);
ddq = (xfs_disk_dquot_t *)xfs_buf_offset(bp, dq_f->qlf_boffset);
@@ -2546,22 +3024,40 @@ xlog_recover_dquot_pass2(
* was among a chunk of dquots created earlier, and we did some
* minimal initialization then.
*/
- error = xfs_qm_dqcheck(mp, ddq, dq_f->qlf_id, 0, XFS_QMOPT_DOWARN,
+ error = xfs_dqcheck(mp, ddq, dq_f->qlf_id, 0, XFS_QMOPT_DOWARN,
"xlog_recover_dquot_pass2");
if (error) {
xfs_buf_relse(bp);
return XFS_ERROR(EIO);
}
+ /*
+ * If the dquot has an LSN in it, recover the dquot only if it's less
+ * than the lsn of the transaction we are replaying.
+ */
+ if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ struct xfs_dqblk *dqb = (struct xfs_dqblk *)ddq;
+ xfs_lsn_t lsn = be64_to_cpu(dqb->dd_lsn);
+
+ if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) {
+ goto out_release;
+ }
+ }
+
memcpy(ddq, recddq, item->ri_buf[1].i_len);
+ if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ xfs_update_cksum((char *)ddq, sizeof(struct xfs_dqblk),
+ XFS_DQUOT_CRC_OFF);
+ }
ASSERT(dq_f->qlf_size == 2);
ASSERT(bp->b_target->bt_mount == mp);
bp->b_iodone = xlog_recover_iodone;
- xfs_buf_delwri_queue(bp);
- xfs_buf_relse(bp);
+ xfs_buf_delwri_queue(bp, buffer_list);
- return (0);
+out_release:
+ xfs_buf_relse(bp);
+ return 0;
}
/*
@@ -2573,9 +3069,9 @@ xlog_recover_dquot_pass2(
*/
STATIC int
xlog_recover_efi_pass2(
- xlog_t *log,
- xlog_recover_item_t *item,
- xfs_lsn_t lsn)
+ struct xlog *log,
+ struct xlog_recover_item *item,
+ xfs_lsn_t lsn)
{
int error;
xfs_mount_t *mp = log->l_mp;
@@ -2611,8 +3107,8 @@ xlog_recover_efi_pass2(
*/
STATIC int
xlog_recover_efd_pass2(
- xlog_t *log,
- xlog_recover_item_t *item)
+ struct xlog *log,
+ struct xlog_recover_item *item)
{
xfs_efd_log_format_t *efd_formatp;
xfs_efi_log_item_t *efip = NULL;
@@ -2642,7 +3138,8 @@ xlog_recover_efd_pass2(
* xfs_trans_ail_delete() drops the
* AIL lock.
*/
- xfs_trans_ail_delete(ailp, lip);
+ xfs_trans_ail_delete(ailp, lip,
+ SHUTDOWN_CORRUPT_INCORE);
xfs_efi_item_free(efip);
spin_lock(&ailp->xa_lock);
break;
@@ -2650,13 +3147,100 @@ xlog_recover_efd_pass2(
}
lip = xfs_trans_ail_cursor_next(ailp, &cur);
}
- xfs_trans_ail_cursor_done(ailp, &cur);
+ xfs_trans_ail_cursor_done(&cur);
spin_unlock(&ailp->xa_lock);
return 0;
}
/*
+ * This routine is called when an inode create format structure is found in a
+ * committed transaction in the log. Its purpose is to initialise the inodes
+ * being allocated on disk. This requires us to get inode cluster buffers that
+ * match the range to be initialised, stamped with inode templates and written
+ * by delayed write so that subsequent modifications will hit the cached buffer
+ * and only need writing out at the end of recovery.
+ */
+STATIC int
+xlog_recover_do_icreate_pass2(
+ struct xlog *log,
+ struct list_head *buffer_list,
+ xlog_recover_item_t *item)
+{
+ struct xfs_mount *mp = log->l_mp;
+ struct xfs_icreate_log *icl;
+ xfs_agnumber_t agno;
+ xfs_agblock_t agbno;
+ unsigned int count;
+ unsigned int isize;
+ xfs_agblock_t length;
+
+ icl = (struct xfs_icreate_log *)item->ri_buf[0].i_addr;
+ if (icl->icl_type != XFS_LI_ICREATE) {
+ xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad type");
+ return EINVAL;
+ }
+
+ if (icl->icl_size != 1) {
+ xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad icl size");
+ return EINVAL;
+ }
+
+ agno = be32_to_cpu(icl->icl_ag);
+ if (agno >= mp->m_sb.sb_agcount) {
+ xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad agno");
+ return EINVAL;
+ }
+ agbno = be32_to_cpu(icl->icl_agbno);
+ if (!agbno || agbno == NULLAGBLOCK || agbno >= mp->m_sb.sb_agblocks) {
+ xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad agbno");
+ return EINVAL;
+ }
+ isize = be32_to_cpu(icl->icl_isize);
+ if (isize != mp->m_sb.sb_inodesize) {
+ xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad isize");
+ return EINVAL;
+ }
+ count = be32_to_cpu(icl->icl_count);
+ if (!count) {
+ xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad count");
+ return EINVAL;
+ }
+ length = be32_to_cpu(icl->icl_length);
+ if (!length || length >= mp->m_sb.sb_agblocks) {
+ xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad length");
+ return EINVAL;
+ }
+
+ /* existing allocation is fixed value */
+ ASSERT(count == mp->m_ialloc_inos);
+ ASSERT(length == mp->m_ialloc_blks);
+ if (count != mp->m_ialloc_inos ||
+ length != mp->m_ialloc_blks) {
+ xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad count 2");
+ return EINVAL;
+ }
+
+ /*
+ * Inode buffers can be freed. Do not replay the inode initialisation as
+ * we could be overwriting something written after this inode buffer was
+ * cancelled.
+ *
+ * XXX: we need to iterate all buffers and only init those that are not
+ * cancelled. I think that a more fine grained factoring of
+ * xfs_ialloc_inode_init may be appropriate here to enable this to be
+ * done easily.
+ */
+ if (xlog_check_buffer_cancelled(log,
+ XFS_AGB_TO_DADDR(mp, agno, agbno), length, 0))
+ return 0;
+
+ xfs_ialloc_inode_init(mp, NULL, buffer_list, agno, agbno, length,
+ be32_to_cpu(icl->icl_gen));
+ return 0;
+}
+
+/*
* Free up any resources allocated by the transaction
*
* Remember that EFIs, EFDs, and IUNLINKs are handled later.
@@ -2681,11 +3265,111 @@ xlog_recover_free_trans(
kmem_free(trans);
}
+STATIC void
+xlog_recover_buffer_ra_pass2(
+ struct xlog *log,
+ struct xlog_recover_item *item)
+{
+ struct xfs_buf_log_format *buf_f = item->ri_buf[0].i_addr;
+ struct xfs_mount *mp = log->l_mp;
+
+ if (xlog_peek_buffer_cancelled(log, buf_f->blf_blkno,
+ buf_f->blf_len, buf_f->blf_flags)) {
+ return;
+ }
+
+ xfs_buf_readahead(mp->m_ddev_targp, buf_f->blf_blkno,
+ buf_f->blf_len, NULL);
+}
+
+STATIC void
+xlog_recover_inode_ra_pass2(
+ struct xlog *log,
+ struct xlog_recover_item *item)
+{
+ struct xfs_inode_log_format ilf_buf;
+ struct xfs_inode_log_format *ilfp;
+ struct xfs_mount *mp = log->l_mp;
+ int error;
+
+ if (item->ri_buf[0].i_len == sizeof(struct xfs_inode_log_format)) {
+ ilfp = item->ri_buf[0].i_addr;
+ } else {
+ ilfp = &ilf_buf;
+ memset(ilfp, 0, sizeof(*ilfp));
+ error = xfs_inode_item_format_convert(&item->ri_buf[0], ilfp);
+ if (error)
+ return;
+ }
+
+ if (xlog_peek_buffer_cancelled(log, ilfp->ilf_blkno, ilfp->ilf_len, 0))
+ return;
+
+ xfs_buf_readahead(mp->m_ddev_targp, ilfp->ilf_blkno,
+ ilfp->ilf_len, &xfs_inode_buf_ra_ops);
+}
+
+STATIC void
+xlog_recover_dquot_ra_pass2(
+ struct xlog *log,
+ struct xlog_recover_item *item)
+{
+ struct xfs_mount *mp = log->l_mp;
+ struct xfs_disk_dquot *recddq;
+ struct xfs_dq_logformat *dq_f;
+ uint type;
+
+
+ if (mp->m_qflags == 0)
+ return;
+
+ recddq = item->ri_buf[1].i_addr;
+ if (recddq == NULL)
+ return;
+ if (item->ri_buf[1].i_len < sizeof(struct xfs_disk_dquot))
+ return;
+
+ type = recddq->d_flags & (XFS_DQ_USER | XFS_DQ_PROJ | XFS_DQ_GROUP);
+ ASSERT(type);
+ if (log->l_quotaoffs_flag & type)
+ return;
+
+ dq_f = item->ri_buf[0].i_addr;
+ ASSERT(dq_f);
+ ASSERT(dq_f->qlf_len == 1);
+
+ xfs_buf_readahead(mp->m_ddev_targp, dq_f->qlf_blkno,
+ XFS_FSB_TO_BB(mp, dq_f->qlf_len), NULL);
+}
+
+STATIC void
+xlog_recover_ra_pass2(
+ struct xlog *log,
+ struct xlog_recover_item *item)
+{
+ switch (ITEM_TYPE(item)) {
+ case XFS_LI_BUF:
+ xlog_recover_buffer_ra_pass2(log, item);
+ break;
+ case XFS_LI_INODE:
+ xlog_recover_inode_ra_pass2(log, item);
+ break;
+ case XFS_LI_DQUOT:
+ xlog_recover_dquot_ra_pass2(log, item);
+ break;
+ case XFS_LI_EFI:
+ case XFS_LI_EFD:
+ case XFS_LI_QUOTAOFF:
+ default:
+ break;
+ }
+}
+
STATIC int
xlog_recover_commit_pass1(
- struct log *log,
- struct xlog_recover *trans,
- xlog_recover_item_t *item)
+ struct xlog *log,
+ struct xlog_recover *trans,
+ struct xlog_recover_item *item)
{
trace_xfs_log_recover_item_recover(log, trans, item, XLOG_RECOVER_PASS1);
@@ -2698,6 +3382,7 @@ xlog_recover_commit_pass1(
case XFS_LI_EFI:
case XFS_LI_EFD:
case XFS_LI_DQUOT:
+ case XFS_LI_ICREATE:
/* nothing to do in pass 1 */
return 0;
default:
@@ -2710,23 +3395,29 @@ xlog_recover_commit_pass1(
STATIC int
xlog_recover_commit_pass2(
- struct log *log,
- struct xlog_recover *trans,
- xlog_recover_item_t *item)
+ struct xlog *log,
+ struct xlog_recover *trans,
+ struct list_head *buffer_list,
+ struct xlog_recover_item *item)
{
trace_xfs_log_recover_item_recover(log, trans, item, XLOG_RECOVER_PASS2);
switch (ITEM_TYPE(item)) {
case XFS_LI_BUF:
- return xlog_recover_buffer_pass2(log, item);
+ return xlog_recover_buffer_pass2(log, buffer_list, item,
+ trans->r_lsn);
case XFS_LI_INODE:
- return xlog_recover_inode_pass2(log, item);
+ return xlog_recover_inode_pass2(log, buffer_list, item,
+ trans->r_lsn);
case XFS_LI_EFI:
return xlog_recover_efi_pass2(log, item, trans->r_lsn);
case XFS_LI_EFD:
return xlog_recover_efd_pass2(log, item);
case XFS_LI_DQUOT:
- return xlog_recover_dquot_pass2(log, item);
+ return xlog_recover_dquot_pass2(log, buffer_list, item,
+ trans->r_lsn);
+ case XFS_LI_ICREATE:
+ return xlog_recover_do_icreate_pass2(log, buffer_list, item);
case XFS_LI_QUOTAOFF:
/* nothing to do in pass2 */
return 0;
@@ -2738,6 +3429,26 @@ xlog_recover_commit_pass2(
}
}
+STATIC int
+xlog_recover_items_pass2(
+ struct xlog *log,
+ struct xlog_recover *trans,
+ struct list_head *buffer_list,
+ struct list_head *item_list)
+{
+ struct xlog_recover_item *item;
+ int error = 0;
+
+ list_for_each_entry(item, item_list, ri_list) {
+ error = xlog_recover_commit_pass2(log, trans,
+ buffer_list, item);
+ if (error)
+ return error;
+ }
+
+ return error;
+}
+
/*
* Perform the transaction.
*
@@ -2746,12 +3457,20 @@ xlog_recover_commit_pass2(
*/
STATIC int
xlog_recover_commit_trans(
- struct log *log,
+ struct xlog *log,
struct xlog_recover *trans,
int pass)
{
- int error = 0;
- xlog_recover_item_t *item;
+ int error = 0;
+ int error2;
+ int items_queued = 0;
+ struct xlog_recover_item *item;
+ struct xlog_recover_item *next;
+ LIST_HEAD (buffer_list);
+ LIST_HEAD (ra_list);
+ LIST_HEAD (done_list);
+
+ #define XLOG_RECOVER_COMMIT_QUEUE_MAX 100
hlist_del(&trans->r_list);
@@ -2759,23 +3478,51 @@ xlog_recover_commit_trans(
if (error)
return error;
- list_for_each_entry(item, &trans->r_itemq, ri_list) {
- if (pass == XLOG_RECOVER_PASS1)
+ list_for_each_entry_safe(item, next, &trans->r_itemq, ri_list) {
+ switch (pass) {
+ case XLOG_RECOVER_PASS1:
error = xlog_recover_commit_pass1(log, trans, item);
- else
- error = xlog_recover_commit_pass2(log, trans, item);
+ break;
+ case XLOG_RECOVER_PASS2:
+ xlog_recover_ra_pass2(log, item);
+ list_move_tail(&item->ri_list, &ra_list);
+ items_queued++;
+ if (items_queued >= XLOG_RECOVER_COMMIT_QUEUE_MAX) {
+ error = xlog_recover_items_pass2(log, trans,
+ &buffer_list, &ra_list);
+ list_splice_tail_init(&ra_list, &done_list);
+ items_queued = 0;
+ }
+
+ break;
+ default:
+ ASSERT(0);
+ }
+
if (error)
- return error;
+ goto out;
+ }
+
+out:
+ if (!list_empty(&ra_list)) {
+ if (!error)
+ error = xlog_recover_items_pass2(log, trans,
+ &buffer_list, &ra_list);
+ list_splice_tail_init(&ra_list, &done_list);
}
+ if (!list_empty(&done_list))
+ list_splice_init(&done_list, &trans->r_itemq);
+
xlog_recover_free_trans(trans);
- return 0;
+
+ error2 = xfs_buf_delwri_submit(&buffer_list);
+ return error ? error : error2;
}
STATIC int
xlog_recover_unmount_trans(
- struct log *log,
- xlog_recover_t *trans)
+ struct xlog *log)
{
/* Do nothing now */
xfs_warn(log->l_mp, "%s: Unmount LR", __func__);
@@ -2793,9 +3540,9 @@ xlog_recover_unmount_trans(
*/
STATIC int
xlog_recover_process_data(
- xlog_t *log,
+ struct xlog *log,
struct hlist_head rhash[],
- xlog_rec_header_t *rhead,
+ struct xlog_rec_header *rhead,
xfs_caddr_t dp,
int pass)
{
@@ -2849,7 +3596,7 @@ xlog_recover_process_data(
trans, pass);
break;
case XLOG_UNMOUNT_TRANS:
- error = xlog_recover_unmount_trans(log, trans);
+ error = xlog_recover_unmount_trans(log);
break;
case XLOG_WAS_CONT_TRANS:
error = xlog_recover_add_to_cont_trans(log,
@@ -2874,8 +3621,10 @@ xlog_recover_process_data(
error = XFS_ERROR(EIO);
break;
}
- if (error)
+ if (error) {
+ xlog_recover_free_trans(trans);
return error;
+ }
}
dp += be32_to_cpu(ohead->oh_len);
num_logops--;
@@ -2918,13 +3667,14 @@ xlog_recover_process_efi(
* This will pull the EFI from the AIL and
* free the memory associated with it.
*/
+ set_bit(XFS_EFI_RECOVERED, &efip->efi_flags);
xfs_efi_release(efip, efip->efi_format.efi_nextents);
return XFS_ERROR(EIO);
}
}
tp = xfs_trans_alloc(mp, 0);
- error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0, 0, 0);
+ error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0);
if (error)
goto abort_error;
efdp = xfs_trans_get_efd(tp, efip, efip->efi_format.efi_nextents);
@@ -2967,7 +3717,7 @@ abort_error:
*/
STATIC int
xlog_recover_process_efis(
- xlog_t *log)
+ struct xlog *log)
{
xfs_log_item_t *lip;
xfs_efi_log_item_t *efip;
@@ -3008,7 +3758,7 @@ xlog_recover_process_efis(
lip = xfs_trans_ail_cursor_next(ailp, &cur);
}
out:
- xfs_trans_ail_cursor_done(ailp, &cur);
+ xfs_trans_ail_cursor_done(&cur);
spin_unlock(&ailp->xa_lock);
return error;
}
@@ -3030,8 +3780,7 @@ xlog_recover_clear_agi_bucket(
int error;
tp = xfs_trans_alloc(mp, XFS_TRANS_CLEAR_AGI_BUCKET);
- error = xfs_trans_reserve(tp, 0, XFS_CLEAR_AGI_BUCKET_LOG_RES(mp),
- 0, 0, 0);
+ error = xfs_trans_reserve(tp, &M_RES(mp)->tr_clearagi, 0, 0);
if (error)
goto out_abort;
@@ -3079,7 +3828,7 @@ xlog_recover_process_one_iunlink(
/*
* Get the on disk inode to find the next inode in the bucket.
*/
- error = xfs_itobp(mp, NULL, ip, &dip, &ibp, XBF_LOCK);
+ error = xfs_imap_to_bp(mp, NULL, &ip->i_imap, &dip, &ibp, 0, 0);
if (error)
goto fail_iput;
@@ -3128,7 +3877,7 @@ xlog_recover_process_one_iunlink(
*/
STATIC void
xlog_recover_process_iunlinks(
- xlog_t *log)
+ struct xlog *log)
{
xfs_mount_t *mp;
xfs_agnumber_t agno;
@@ -3161,116 +3910,83 @@ xlog_recover_process_iunlinks(
*/
continue;
}
+ /*
+ * Unlock the buffer so that it can be acquired in the normal
+ * course of the transaction to truncate and free each inode.
+ * Because we are not racing with anyone else here for the AGI
+ * buffer, we don't even need to hold it locked to read the
+ * initial unlinked bucket entries out of the buffer. We keep
+ * buffer reference though, so that it stays pinned in memory
+ * while we need the buffer.
+ */
agi = XFS_BUF_TO_AGI(agibp);
+ xfs_buf_unlock(agibp);
for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++) {
agino = be32_to_cpu(agi->agi_unlinked[bucket]);
while (agino != NULLAGINO) {
- /*
- * Release the agi buffer so that it can
- * be acquired in the normal course of the
- * transaction to truncate and free the inode.
- */
- xfs_buf_relse(agibp);
-
agino = xlog_recover_process_one_iunlink(mp,
agno, agino, bucket);
-
- /*
- * Reacquire the agibuffer and continue around
- * the loop. This should never fail as we know
- * the buffer was good earlier on.
- */
- error = xfs_read_agi(mp, NULL, agno, &agibp);
- ASSERT(error == 0);
- agi = XFS_BUF_TO_AGI(agibp);
}
}
-
- /*
- * Release the buffer for the current agi so we can
- * go on to the next one.
- */
- xfs_buf_relse(agibp);
+ xfs_buf_rele(agibp);
}
mp->m_dmevmask = mp_dmevmask;
}
-
-#ifdef DEBUG
-STATIC void
-xlog_pack_data_checksum(
- xlog_t *log,
- xlog_in_core_t *iclog,
- int size)
-{
- int i;
- __be32 *up;
- uint chksum = 0;
-
- up = (__be32 *)iclog->ic_datap;
- /* divide length by 4 to get # words */
- for (i = 0; i < (size >> 2); i++) {
- chksum ^= be32_to_cpu(*up);
- up++;
- }
- iclog->ic_header.h_chksum = cpu_to_be32(chksum);
-}
-#else
-#define xlog_pack_data_checksum(log, iclog, size)
-#endif
-
/*
- * Stamp cycle number in every block
+ * Unpack the log buffer data and crc check it. If the check fails, issue a
+ * warning if and only if the CRC in the header is non-zero. This makes the
+ * check an advisory warning, and the zero CRC check will prevent failure
+ * warnings from being emitted when upgrading the kernel from one that does not
+ * add CRCs by default.
+ *
+ * When filesystems are CRC enabled, this CRC mismatch becomes a fatal log
+ * corruption failure
*/
-void
-xlog_pack_data(
- xlog_t *log,
- xlog_in_core_t *iclog,
- int roundoff)
+STATIC int
+xlog_unpack_data_crc(
+ struct xlog_rec_header *rhead,
+ xfs_caddr_t dp,
+ struct xlog *log)
{
- int i, j, k;
- int size = iclog->ic_offset + roundoff;
- __be32 cycle_lsn;
- xfs_caddr_t dp;
-
- xlog_pack_data_checksum(log, iclog, size);
-
- cycle_lsn = CYCLE_LSN_DISK(iclog->ic_header.h_lsn);
-
- dp = iclog->ic_datap;
- for (i = 0; i < BTOBB(size) &&
- i < (XLOG_HEADER_CYCLE_SIZE / BBSIZE); i++) {
- iclog->ic_header.h_cycle_data[i] = *(__be32 *)dp;
- *(__be32 *)dp = cycle_lsn;
- dp += BBSIZE;
- }
-
- if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
- xlog_in_core_2_t *xhdr = iclog->ic_data;
-
- for ( ; i < BTOBB(size); i++) {
- j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
- k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
- xhdr[j].hic_xheader.xh_cycle_data[k] = *(__be32 *)dp;
- *(__be32 *)dp = cycle_lsn;
- dp += BBSIZE;
+ __le32 crc;
+
+ crc = xlog_cksum(log, rhead, dp, be32_to_cpu(rhead->h_len));
+ if (crc != rhead->h_crc) {
+ if (rhead->h_crc || xfs_sb_version_hascrc(&log->l_mp->m_sb)) {
+ xfs_alert(log->l_mp,
+ "log record CRC mismatch: found 0x%x, expected 0x%x.",
+ le32_to_cpu(rhead->h_crc),
+ le32_to_cpu(crc));
+ xfs_hex_dump(dp, 32);
}
- for (i = 1; i < log->l_iclog_heads; i++) {
- xhdr[i].hic_xheader.xh_cycle = cycle_lsn;
- }
+ /*
+ * If we've detected a log record corruption, then we can't
+ * recover past this point. Abort recovery if we are enforcing
+ * CRC protection by punting an error back up the stack.
+ */
+ if (xfs_sb_version_hascrc(&log->l_mp->m_sb))
+ return EFSCORRUPTED;
}
+
+ return 0;
}
-STATIC void
+STATIC int
xlog_unpack_data(
- xlog_rec_header_t *rhead,
+ struct xlog_rec_header *rhead,
xfs_caddr_t dp,
- xlog_t *log)
+ struct xlog *log)
{
int i, j, k;
+ int error;
+
+ error = xlog_unpack_data_crc(rhead, dp, log);
+ if (error)
+ return error;
for (i = 0; i < BTOBB(be32_to_cpu(rhead->h_len)) &&
i < (XLOG_HEADER_CYCLE_SIZE / BBSIZE); i++) {
@@ -3287,12 +4003,14 @@ xlog_unpack_data(
dp += BBSIZE;
}
}
+
+ return 0;
}
STATIC int
xlog_valid_rec_header(
- xlog_t *log,
- xlog_rec_header_t *rhead,
+ struct xlog *log,
+ struct xlog_rec_header *rhead,
xfs_daddr_t blkno)
{
int hlen;
@@ -3335,7 +4053,7 @@ xlog_valid_rec_header(
*/
STATIC int
xlog_do_recovery_pass(
- xlog_t *log,
+ struct xlog *log,
xfs_daddr_t head_blk,
xfs_daddr_t tail_blk,
int pass)
@@ -3418,9 +4136,13 @@ xlog_do_recovery_pass(
if (error)
goto bread_err2;
- xlog_unpack_data(rhead, offset, log);
- if ((error = xlog_recover_process_data(log,
- rhash, rhead, offset, pass)))
+ error = xlog_unpack_data(rhead, offset, log);
+ if (error)
+ goto bread_err2;
+
+ error = xlog_recover_process_data(log,
+ rhash, rhead, offset, pass);
+ if (error)
goto bread_err2;
blk_no += bblks + hblks;
}
@@ -3525,14 +4247,19 @@ xlog_do_recovery_pass(
* - order is important.
*/
error = xlog_bread_offset(log, 0,
- bblks - split_bblks, hbp,
+ bblks - split_bblks, dbp,
offset + BBTOB(split_bblks));
if (error)
goto bread_err2;
}
- xlog_unpack_data(rhead, offset, log);
- if ((error = xlog_recover_process_data(log, rhash,
- rhead, offset, pass)))
+
+ error = xlog_unpack_data(rhead, offset, log);
+ if (error)
+ goto bread_err2;
+
+ error = xlog_recover_process_data(log, rhash,
+ rhead, offset, pass);
+ if (error)
goto bread_err2;
blk_no += bblks;
}
@@ -3557,9 +4284,13 @@ xlog_do_recovery_pass(
if (error)
goto bread_err2;
- xlog_unpack_data(rhead, offset, log);
- if ((error = xlog_recover_process_data(log, rhash,
- rhead, offset, pass)))
+ error = xlog_unpack_data(rhead, offset, log);
+ if (error)
+ goto bread_err2;
+
+ error = xlog_recover_process_data(log, rhash,
+ rhead, offset, pass);
+ if (error)
goto bread_err2;
blk_no += bblks + hblks;
}
@@ -3587,7 +4318,7 @@ xlog_do_recovery_pass(
*/
STATIC int
xlog_do_log_recovery(
- xlog_t *log,
+ struct xlog *log,
xfs_daddr_t head_blk,
xfs_daddr_t tail_blk)
{
@@ -3638,7 +4369,7 @@ xlog_do_log_recovery(
*/
STATIC int
xlog_do_recover(
- xlog_t *log,
+ struct xlog *log,
xfs_daddr_t head_blk,
xfs_daddr_t tail_blk)
{
@@ -3650,11 +4381,8 @@ xlog_do_recover(
* First replay the images in the log.
*/
error = xlog_do_log_recovery(log, head_blk, tail_blk);
- if (error) {
+ if (error)
return error;
- }
-
- xfs_flush_buftarg(log->l_mp->m_ddev_targp, 1);
/*
* If IO errors happened during recovery, bail out.
@@ -3676,15 +4404,21 @@ xlog_do_recover(
/*
* Now that we've finished replaying all buffer and inode
- * updates, re-read in the superblock.
+ * updates, re-read in the superblock and reverify it.
*/
bp = xfs_getsb(log->l_mp, 0);
XFS_BUF_UNDONE(bp);
ASSERT(!(XFS_BUF_ISWRITE(bp)));
- ASSERT(!(XFS_BUF_ISDELAYWRITE(bp)));
XFS_BUF_READ(bp);
XFS_BUF_UNASYNC(bp);
- xfsbdstrat(log->l_mp, bp);
+ bp->b_ops = &xfs_sb_buf_ops;
+
+ if (XFS_FORCED_SHUTDOWN(log->l_mp)) {
+ xfs_buf_relse(bp);
+ return XFS_ERROR(EIO);
+ }
+
+ xfs_buf_iorequest(bp);
error = xfs_buf_iowait(bp);
if (error) {
xfs_buf_ioerror_alert(bp, __func__);
@@ -3717,7 +4451,7 @@ xlog_do_recover(
*/
int
xlog_recover(
- xlog_t *log)
+ struct xlog *log)
{
xfs_daddr_t head_blk, tail_blk;
int error;
@@ -3742,6 +4476,25 @@ xlog_recover(
return error;
}
+ /*
+ * Version 5 superblock log feature mask validation. We know the
+ * log is dirty so check if there are any unknown log features
+ * in what we need to recover. If there are unknown features
+ * (e.g. unsupported transactions), then simply reject the
+ * attempt at recovery before touching anything.
+ */
+ if (XFS_SB_VERSION_NUM(&log->l_mp->m_sb) == XFS_SB_VERSION_5 &&
+ xfs_sb_has_incompat_log_feature(&log->l_mp->m_sb,
+ XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN)) {
+ xfs_warn(log->l_mp,
+"Superblock has unknown incompatible log features (0x%x) enabled.\n"
+"The log can not be fully and/or safely recovered by this kernel.\n"
+"Please recover the log on a kernel that supports the unknown features.",
+ (log->l_mp->m_sb.sb_features_log_incompat &
+ XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN));
+ return EINVAL;
+ }
+
xfs_notice(log->l_mp, "Starting recovery (logdev: %s)",
log->l_mp->m_logname ? log->l_mp->m_logname
: "internal");
@@ -3763,7 +4516,7 @@ xlog_recover(
*/
int
xlog_recover_finish(
- xlog_t *log)
+ struct xlog *log)
{
/*
* Now we're ready to do the transactions needed for the
@@ -3810,7 +4563,7 @@ xlog_recover_finish(
*/
void
xlog_recover_check_summary(
- xlog_t *log)
+ struct xlog *log)
{
xfs_mount_t *mp;
xfs_agf_t *agfp;