aboutsummaryrefslogtreecommitdiff
path: root/fs/xfs/xfs_log.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/xfs/xfs_log.c')
-rw-r--r--fs/xfs/xfs_log.c540
1 files changed, 412 insertions, 128 deletions
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 4dad756962d..292308dede6 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -17,23 +17,23 @@
*/
#include "xfs.h"
#include "xfs_fs.h"
-#include "xfs_types.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_mount.h"
#include "xfs_error.h"
+#include "xfs_trans.h"
+#include "xfs_trans_priv.h"
+#include "xfs_log.h"
#include "xfs_log_priv.h"
-#include "xfs_buf_item.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
#include "xfs_log_recover.h"
-#include "xfs_trans_priv.h"
-#include "xfs_dinode.h"
#include "xfs_inode.h"
#include "xfs_trace.h"
+#include "xfs_fsops.h"
+#include "xfs_cksum.h"
kmem_zone_t *xfs_log_ticket_zone;
@@ -118,7 +118,7 @@ xlog_verify_iclog(
struct xlog *log,
struct xlog_in_core *iclog,
int count,
- boolean_t syncing);
+ bool syncing);
STATIC void
xlog_verify_tail_lsn(
struct xlog *log,
@@ -255,7 +255,8 @@ xlog_grant_head_wait(
struct xlog *log,
struct xlog_grant_head *head,
struct xlog_ticket *tic,
- int need_bytes)
+ int need_bytes) __releases(&head->lock)
+ __acquires(&head->lock)
{
list_add_tail(&tic->t_queue, &head->waiters);
@@ -458,7 +459,8 @@ xfs_log_reserve(
tic->t_trans_type = t_type;
*ticp = tic;
- xlog_grant_push_ail(log, tic->t_unit_res * tic->t_cnt);
+ xlog_grant_push_ail(log, tic->t_cnt ? tic->t_unit_res * tic->t_cnt
+ : tic->t_unit_res);
trace_xfs_log_reserve(log, tic);
@@ -611,13 +613,16 @@ xfs_log_mount(
xfs_daddr_t blk_offset,
int num_bblks)
{
- int error;
+ int error = 0;
+ int min_logfsbs;
- if (!(mp->m_flags & XFS_MOUNT_NORECOVERY))
- xfs_notice(mp, "Mounting Filesystem");
- else {
+ if (!(mp->m_flags & XFS_MOUNT_NORECOVERY)) {
+ xfs_notice(mp, "Mounting V%d Filesystem",
+ XFS_SB_VERSION_NUM(&mp->m_sb));
+ } else {
xfs_notice(mp,
-"Mounting filesystem in no-recovery mode. Filesystem will be inconsistent.");
+"Mounting V%d filesystem in no-recovery mode. Filesystem will be inconsistent.",
+ XFS_SB_VERSION_NUM(&mp->m_sb));
ASSERT(mp->m_flags & XFS_MOUNT_RDONLY);
}
@@ -628,6 +633,50 @@ xfs_log_mount(
}
/*
+ * Validate the given log space and drop a critical message via syslog
+ * if the log size is so small that it would lead to unexpected
+ * situations in the transaction log space reservation stage.
+ *
+ * Note: we can't just reject the mount if the validation fails. This
+ * would mean that people would have to downgrade their kernel just to
+ * remedy the situation as there is no way to grow the log (short of
+ * black magic surgery with xfs_db).
+ *
+ * We can, however, reject mounts for CRC format filesystems, as the
+ * mkfs binary being used to make the filesystem should never create a
+ * filesystem with a log that is too small.
+ */
+ min_logfsbs = xfs_log_calc_minimum_size(mp);
+
+ if (mp->m_sb.sb_logblocks < min_logfsbs) {
+ xfs_warn(mp,
+ "Log size %d blocks too small, minimum size is %d blocks",
+ mp->m_sb.sb_logblocks, min_logfsbs);
+ error = EINVAL;
+ } else if (mp->m_sb.sb_logblocks > XFS_MAX_LOG_BLOCKS) {
+ xfs_warn(mp,
+ "Log size %d blocks too large, maximum size is %lld blocks",
+ mp->m_sb.sb_logblocks, XFS_MAX_LOG_BLOCKS);
+ error = EINVAL;
+ } else if (XFS_FSB_TO_B(mp, mp->m_sb.sb_logblocks) > XFS_MAX_LOG_BYTES) {
+ xfs_warn(mp,
+ "log size %lld bytes too large, maximum size is %lld bytes",
+ XFS_FSB_TO_B(mp, mp->m_sb.sb_logblocks),
+ XFS_MAX_LOG_BYTES);
+ error = EINVAL;
+ }
+ if (error) {
+ if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ xfs_crit(mp, "AAIEEE! Log failed size checks. Abort!");
+ ASSERT(0);
+ goto out_free_log;
+ }
+ xfs_crit(mp,
+"Log size out of supported range. Continuing onwards, but if log hangs are\n"
+"experienced then please report this message in the bug report.");
+ }
+
+ /*
* Initialize the AIL now we have a log.
*/
error = xfs_trans_ail_init(mp);
@@ -679,25 +728,29 @@ out:
}
/*
- * Finish the recovery of the file system. This is separate from
- * the xfs_log_mount() call, because it depends on the code in
- * xfs_mountfs() to read in the root and real-time bitmap inodes
- * between calling xfs_log_mount() and here.
+ * Finish the recovery of the file system. This is separate from the
+ * xfs_log_mount() call, because it depends on the code in xfs_mountfs() to read
+ * in the root and real-time bitmap inodes between calling xfs_log_mount() and
+ * here.
*
- * mp - ubiquitous xfs mount point structure
+ * If we finish recovery successfully, start the background log work. If we are
+ * not doing recovery, then we have a RO filesystem and we don't need to start
+ * it.
*/
int
xfs_log_mount_finish(xfs_mount_t *mp)
{
- int error;
+ int error = 0;
- if (!(mp->m_flags & XFS_MOUNT_NORECOVERY))
+ if (!(mp->m_flags & XFS_MOUNT_NORECOVERY)) {
error = xlog_recover_finish(mp->m_log);
- else {
- error = 0;
+ if (!error)
+ xfs_log_work_queue(mp);
+ } else {
ASSERT(mp->m_flags & XFS_MOUNT_RDONLY);
}
+
return error;
}
@@ -713,7 +766,7 @@ xfs_log_mount_finish(xfs_mount_t *mp)
* Unmount record used to have a string "Unmount filesystem--" in the
* data section where the "Un" was really a magic number (XLOG_UNMOUNT_TYPE).
* We just write the magic number now since that particular field isn't
- * currently architecture converted and "nUmount" is a bit foo.
+ * currently architecture converted and "Unmount" is a bit foo.
* As far as I know, there weren't any dependencies on the old behaviour.
*/
@@ -850,15 +903,49 @@ xfs_log_unmount_write(xfs_mount_t *mp)
} /* xfs_log_unmount_write */
/*
- * Deallocate log structures for unmount/relocation.
+ * Empty the log for unmount/freeze.
+ *
+ * To do this, we first need to shut down the background log work so it is not
+ * trying to cover the log as we clean up. We then need to unpin all objects in
+ * the log so we can then flush them out. Once they have completed their IO and
+ * run the callbacks removing themselves from the AIL, we can write the unmount
+ * record.
+ */
+void
+xfs_log_quiesce(
+ struct xfs_mount *mp)
+{
+ cancel_delayed_work_sync(&mp->m_log->l_work);
+ xfs_log_force(mp, XFS_LOG_SYNC);
+
+ /*
+ * The superblock buffer is uncached and while xfs_ail_push_all_sync()
+ * will push it, xfs_wait_buftarg() will not wait for it. Further,
+ * xfs_buf_iowait() cannot be used because it was pushed with the
+ * XBF_ASYNC flag set, so we need to use a lock/unlock pair to wait for
+ * the IO to complete.
+ */
+ xfs_ail_push_all_sync(mp->m_ail);
+ xfs_wait_buftarg(mp->m_ddev_targp);
+ xfs_buf_lock(mp->m_sb_bp);
+ xfs_buf_unlock(mp->m_sb_bp);
+
+ xfs_log_unmount_write(mp);
+}
+
+/*
+ * Shut down and release the AIL and Log.
*
- * We need to stop the aild from running before we destroy
- * and deallocate the log as the aild references the log.
+ * During unmount, we need to ensure we flush all the dirty metadata objects
+ * from the AIL so that the log is empty before we write the unmount record to
+ * the log. Once this is done, we can tear down the AIL and the log.
*/
void
-xfs_log_unmount(xfs_mount_t *mp)
+xfs_log_unmount(
+ struct xfs_mount *mp)
{
- cancel_delayed_work_sync(&mp->m_sync_work);
+ xfs_log_quiesce(mp);
+
xfs_trans_ail_destroy(mp);
xlog_dealloc_log(mp->m_log);
}
@@ -913,27 +1000,34 @@ xfs_log_space_wake(
}
/*
- * Determine if we have a transaction that has gone to disk
- * that needs to be covered. To begin the transition to the idle state
- * firstly the log needs to be idle (no AIL and nothing in the iclogs).
- * If we are then in a state where covering is needed, the caller is informed
- * that dummy transactions are required to move the log into the idle state.
+ * Determine if we have a transaction that has gone to disk that needs to be
+ * covered. To begin the transition to the idle state firstly the log needs to
+ * be idle. That means the CIL, the AIL and the iclogs need to be empty before
+ * we start attempting to cover the log.
*
- * Because this is called as part of the sync process, we should also indicate
- * that dummy transactions should be issued in anything but the covered or
- * idle states. This ensures that the log tail is accurately reflected in
- * the log at the end of the sync, hence if a crash occurrs avoids replay
- * of transactions where the metadata is already on disk.
+ * Only if we are then in a state where covering is needed, the caller is
+ * informed that dummy transactions are required to move the log into the idle
+ * state.
+ *
+ * If there are any items in the AIL or CIL, then we do not want to attempt to
+ * cover the log as we may be in a situation where there isn't log space
+ * available to run a dummy transaction and this can lead to deadlocks when the
+ * tail of the log is pinned by an item that is modified in the CIL. Hence
+ * there's no point in running a dummy transaction at this point because we
+ * can't start trying to idle the log until both the CIL and AIL are empty.
*/
int
xfs_log_need_covered(xfs_mount_t *mp)
{
- int needed = 0;
struct xlog *log = mp->m_log;
+ int needed = 0;
if (!xfs_fs_writable(mp))
return 0;
+ if (!xlog_cil_empty(log))
+ return 0;
+
spin_lock(&log->l_icloglock);
switch (log->l_covered_state) {
case XLOG_STATE_COVER_DONE:
@@ -942,14 +1036,17 @@ xfs_log_need_covered(xfs_mount_t *mp)
break;
case XLOG_STATE_COVER_NEED:
case XLOG_STATE_COVER_NEED2:
- if (!xfs_ail_min_lsn(log->l_ailp) &&
- xlog_iclogs_empty(log)) {
- if (log->l_covered_state == XLOG_STATE_COVER_NEED)
- log->l_covered_state = XLOG_STATE_COVER_DONE;
- else
- log->l_covered_state = XLOG_STATE_COVER_DONE2;
- }
- /* FALLTHRU */
+ if (xfs_ail_min_lsn(log->l_ailp))
+ break;
+ if (!xlog_iclogs_empty(log))
+ break;
+
+ needed = 1;
+ if (log->l_covered_state == XLOG_STATE_COVER_NEED)
+ log->l_covered_state = XLOG_STATE_COVER_DONE;
+ else
+ log->l_covered_state = XLOG_STATE_COVER_DONE2;
+ break;
default:
needed = 1;
break;
@@ -981,6 +1078,7 @@ xlog_assign_tail_lsn_locked(
tail_lsn = lip->li_lsn;
else
tail_lsn = atomic64_read(&log->l_last_sync_lsn);
+ trace_xfs_log_assign_tail_lsn(log, tail_lsn);
atomic64_set(&log->l_tail_lsn, tail_lsn);
return tail_lsn;
}
@@ -1067,7 +1165,7 @@ xlog_iodone(xfs_buf_t *bp)
/*
* Race to shutdown the filesystem if we see an error.
*/
- if (XFS_TEST_ERROR((xfs_buf_geterror(bp)), l->l_mp,
+ if (XFS_TEST_ERROR(bp->b_error, l->l_mp,
XFS_ERRTAG_IODONE_IOERR, XFS_RANDOM_IODONE_IOERR)) {
xfs_buf_ioerror_alert(bp, __func__);
xfs_buf_stale(bp);
@@ -1085,13 +1183,15 @@ xlog_iodone(xfs_buf_t *bp)
/* log I/O is always issued ASYNC */
ASSERT(XFS_BUF_ISASYNC(bp));
xlog_state_done_syncing(iclog, aborted);
+
/*
- * do not reference the buffer (bp) here as we could race
- * with it being freed after writing the unmount record to the
- * log.
+ * drop the buffer lock now that we are done. Nothing references
+ * the buffer after this, so an unmount waiting on this lock can now
+ * tear it down safely. As such, it is unsafe to reference the buffer
+ * (bp) after the unlock as we could race with it being freed.
*/
-
-} /* xlog_iodone */
+ xfs_buf_unlock(bp);
+}
/*
* Return size of each in-core log record buffer.
@@ -1161,6 +1261,40 @@ done:
} /* xlog_get_iclog_buffer_size */
+void
+xfs_log_work_queue(
+ struct xfs_mount *mp)
+{
+ queue_delayed_work(mp->m_log_workqueue, &mp->m_log->l_work,
+ msecs_to_jiffies(xfs_syncd_centisecs * 10));
+}
+
+/*
+ * Every sync period we need to unpin all items in the AIL and push them to
+ * disk. If there is nothing dirty, then we might need to cover the log to
+ * indicate that the filesystem is idle.
+ */
+void
+xfs_log_worker(
+ struct work_struct *work)
+{
+ struct xlog *log = container_of(to_delayed_work(work),
+ struct xlog, l_work);
+ struct xfs_mount *mp = log->l_mp;
+
+ /* dgc: errors ignored - not fatal and nowhere to report them */
+ if (xfs_log_need_covered(mp))
+ xfs_fs_log_dummy(mp);
+ else
+ xfs_log_force(mp, 0);
+
+ /* start pushing all the metadata that is currently dirty */
+ xfs_ail_push_all(mp->m_ail);
+
+ /* queue us up again */
+ xfs_log_work_queue(mp);
+}
+
/*
* This routine initializes some of the log structure for a given mount point.
* Its primary purpose is to fill in enough, so recovery can occur. However,
@@ -1195,6 +1329,7 @@ xlog_alloc_log(
log->l_logBBsize = num_bblks;
log->l_covered_state = XLOG_STATE_COVER_IDLE;
log->l_flags |= XLOG_ACTIVE_RECOVERY;
+ INIT_DELAYED_WORK(&log->l_work, xfs_log_worker);
log->l_prev_block = -1;
/* log->l_tail_lsn = 0x100000000LL; cycle = 1; current block = 0 */
@@ -1238,8 +1373,16 @@ xlog_alloc_log(
bp = xfs_buf_alloc(mp->m_logdev_targp, 0, BTOBB(log->l_iclog_size), 0);
if (!bp)
goto out_free_log;
- bp->b_iodone = xlog_iodone;
+
+ /*
+ * The iclogbuf buffer locks are held over IO but we are not going to do
+ * IO yet. Hence unlock the buffer so that the log IO path can grab it
+ * when appropriate.
+ */
ASSERT(xfs_buf_islocked(bp));
+ xfs_buf_unlock(bp);
+
+ bp->b_iodone = xlog_iodone;
log->l_xbuf = bp;
spin_lock_init(&log->l_icloglock);
@@ -1268,6 +1411,9 @@ xlog_alloc_log(
if (!bp)
goto out_free_iclog;
+ ASSERT(xfs_buf_islocked(bp));
+ xfs_buf_unlock(bp);
+
bp->b_iodone = xlog_iodone;
iclog->ic_bp = bp;
iclog->ic_data = bp->b_addr;
@@ -1292,7 +1438,6 @@ xlog_alloc_log(
iclog->ic_callback_tail = &(iclog->ic_callback);
iclog->ic_datap = (char *)iclog->ic_data + log->l_iclog_hsize;
- ASSERT(xfs_buf_islocked(iclog->ic_bp));
init_waitqueue_head(&iclog->ic_force_wait);
init_waitqueue_head(&iclog->ic_write_wait);
@@ -1417,12 +1562,96 @@ xlog_grant_push_ail(
}
/*
+ * Stamp cycle number in every block
+ */
+STATIC void
+xlog_pack_data(
+ struct xlog *log,
+ struct xlog_in_core *iclog,
+ int roundoff)
+{
+ int i, j, k;
+ int size = iclog->ic_offset + roundoff;
+ __be32 cycle_lsn;
+ xfs_caddr_t dp;
+
+ cycle_lsn = CYCLE_LSN_DISK(iclog->ic_header.h_lsn);
+
+ dp = iclog->ic_datap;
+ for (i = 0; i < BTOBB(size); i++) {
+ if (i >= (XLOG_HEADER_CYCLE_SIZE / BBSIZE))
+ break;
+ iclog->ic_header.h_cycle_data[i] = *(__be32 *)dp;
+ *(__be32 *)dp = cycle_lsn;
+ dp += BBSIZE;
+ }
+
+ if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
+ xlog_in_core_2_t *xhdr = iclog->ic_data;
+
+ for ( ; i < BTOBB(size); i++) {
+ j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
+ k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
+ xhdr[j].hic_xheader.xh_cycle_data[k] = *(__be32 *)dp;
+ *(__be32 *)dp = cycle_lsn;
+ dp += BBSIZE;
+ }
+
+ for (i = 1; i < log->l_iclog_heads; i++)
+ xhdr[i].hic_xheader.xh_cycle = cycle_lsn;
+ }
+}
+
+/*
+ * Calculate the checksum for a log buffer.
+ *
+ * This is a little more complicated than it should be because the various
+ * headers and the actual data are non-contiguous.
+ */
+__le32
+xlog_cksum(
+ struct xlog *log,
+ struct xlog_rec_header *rhead,
+ char *dp,
+ int size)
+{
+ __uint32_t crc;
+
+ /* first generate the crc for the record header ... */
+ crc = xfs_start_cksum((char *)rhead,
+ sizeof(struct xlog_rec_header),
+ offsetof(struct xlog_rec_header, h_crc));
+
+ /* ... then for additional cycle data for v2 logs ... */
+ if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
+ union xlog_in_core2 *xhdr = (union xlog_in_core2 *)rhead;
+ int i;
+
+ for (i = 1; i < log->l_iclog_heads; i++) {
+ crc = crc32c(crc, &xhdr[i].hic_xheader,
+ sizeof(struct xlog_rec_ext_header));
+ }
+ }
+
+ /* ... and finally for the payload */
+ crc = crc32c(crc, dp, size);
+
+ return xfs_end_cksum(crc);
+}
+
+/*
* The bdstrat callback function for log bufs. This gives us a central
* place to trap bufs in case we get hit by a log I/O error and need to
* shutdown. Actually, in practice, even when we didn't get a log error,
* we transition the iclogs to IOERROR state *after* flushing all existing
* iclogs to disk. This is because we don't want anymore new transactions to be
* started or completed afterwards.
+ *
+ * We lock the iclogbufs here so that we can serialise against IO completion
+ * during unmount. We might be processing a shutdown triggered during unmount,
+ * and that can occur asynchronously to the unmount thread, and hence we need to
+ * ensure that completes before tearing down the iclogbufs. Hence we need to
+ * hold the buffer lock across the log IO to achieve that.
*/
STATIC int
xlog_bdstrat(
@@ -1430,6 +1659,7 @@ xlog_bdstrat(
{
struct xlog_in_core *iclog = bp->b_fspriv;
+ xfs_buf_lock(bp);
if (iclog->ic_state & XLOG_STATE_IOERROR) {
xfs_buf_ioerror(bp, EIO);
xfs_buf_stale(bp);
@@ -1437,7 +1667,8 @@ xlog_bdstrat(
/*
* It would seem logical to return EIO here, but we rely on
* the log state machine to propagate I/O errors instead of
- * doing it here.
+ * doing it here. Similarly, IO completion will unlock the
+ * buffer, so we don't do it here.
*/
return 0;
}
@@ -1476,7 +1707,6 @@ xlog_sync(
struct xlog *log,
struct xlog_in_core *iclog)
{
- xfs_caddr_t dptr; /* pointer to byte sized element */
xfs_buf_t *bp;
int i;
uint count; /* byte count of bwrite */
@@ -1485,6 +1715,7 @@ xlog_sync(
int split = 0; /* split write into two regions */
int error;
int v2 = xfs_sb_version_haslogv2(&log->l_mp->m_sb);
+ int size;
XFS_STATS_INC(xs_log_writes);
ASSERT(atomic_read(&iclog->ic_refcnt) == 0);
@@ -1515,13 +1746,10 @@ xlog_sync(
xlog_pack_data(log, iclog, roundoff);
/* real byte length */
- if (v2) {
- iclog->ic_header.h_len =
- cpu_to_be32(iclog->ic_offset + roundoff);
- } else {
- iclog->ic_header.h_len =
- cpu_to_be32(iclog->ic_offset);
- }
+ size = iclog->ic_offset;
+ if (v2)
+ size += roundoff;
+ iclog->ic_header.h_len = cpu_to_be32(size);
bp = iclog->ic_bp;
XFS_BUF_SET_ADDR(bp, BLOCK_LSN(be64_to_cpu(iclog->ic_header.h_lsn)));
@@ -1530,12 +1758,36 @@ xlog_sync(
/* Do we need to split this write into 2 parts? */
if (XFS_BUF_ADDR(bp) + BTOBB(count) > log->l_logBBsize) {
+ char *dptr;
+
split = count - (BBTOB(log->l_logBBsize - XFS_BUF_ADDR(bp)));
count = BBTOB(log->l_logBBsize - XFS_BUF_ADDR(bp));
- iclog->ic_bwritecnt = 2; /* split into 2 writes */
+ iclog->ic_bwritecnt = 2;
+
+ /*
+ * Bump the cycle numbers at the start of each block in the
+ * part of the iclog that ends up in the buffer that gets
+ * written to the start of the log.
+ *
+ * Watch out for the header magic number case, though.
+ */
+ dptr = (char *)&iclog->ic_header + count;
+ for (i = 0; i < split; i += BBSIZE) {
+ __uint32_t cycle = be32_to_cpu(*(__be32 *)dptr);
+ if (++cycle == XLOG_HEADER_MAGIC_NUM)
+ cycle++;
+ *(__be32 *)dptr = cpu_to_be32(cycle);
+
+ dptr += BBSIZE;
+ }
} else {
iclog->ic_bwritecnt = 1;
}
+
+ /* calculate the checksum */
+ iclog->ic_header.h_crc = xlog_cksum(log, &iclog->ic_header,
+ iclog->ic_datap, size);
+
bp->b_io_length = BTOBB(count);
bp->b_fspriv = iclog;
XFS_BUF_ZEROFLAGS(bp);
@@ -1563,7 +1815,7 @@ xlog_sync(
ASSERT(XFS_BUF_ADDR(bp) <= log->l_logBBsize-1);
ASSERT(XFS_BUF_ADDR(bp) + BTOBB(count) <= log->l_logBBsize);
- xlog_verify_iclog(log, iclog, count, B_TRUE);
+ xlog_verify_iclog(log, iclog, count, true);
/* account for log which doesn't start at block #0 */
XFS_BUF_SET_ADDR(bp, XFS_BUF_ADDR(bp) + log->l_logBBstart);
@@ -1589,19 +1841,6 @@ xlog_sync(
bp->b_flags |= XBF_SYNCIO;
if (log->l_mp->m_flags & XFS_MOUNT_BARRIER)
bp->b_flags |= XBF_FUA;
- dptr = bp->b_addr;
- /*
- * Bump the cycle numbers at the start of each block
- * since this part of the buffer is at the start of
- * a new cycle. Watch out for the header magic number
- * case, though.
- */
- for (i = 0; i < split; i += BBSIZE) {
- be32_add_cpu((__be32 *)dptr, 1);
- if (be32_to_cpu(*(__be32 *)dptr) == XLOG_HEADER_MAGIC_NUM)
- be32_add_cpu((__be32 *)dptr, 1);
- dptr += BBSIZE;
- }
ASSERT(XFS_BUF_ADDR(bp) <= log->l_logBBsize-1);
ASSERT(XFS_BUF_ADDR(bp) + BTOBB(count) <= log->l_logBBsize);
@@ -1618,7 +1857,6 @@ xlog_sync(
return 0;
} /* xlog_sync */
-
/*
* Deallocate a log structure
*/
@@ -1632,14 +1870,28 @@ xlog_dealloc_log(
xlog_cil_destroy(log);
/*
- * always need to ensure that the extra buffer does not point to memory
- * owned by another log buffer before we free it.
+ * Cycle all the iclogbuf locks to make sure all log IO completion
+ * is done before we tear down these buffers.
*/
+ iclog = log->l_iclog;
+ for (i = 0; i < log->l_iclog_bufs; i++) {
+ xfs_buf_lock(iclog->ic_bp);
+ xfs_buf_unlock(iclog->ic_bp);
+ iclog = iclog->ic_next;
+ }
+
+ /*
+ * Always need to ensure that the extra buffer does not point to memory
+ * owned by another log buffer before we free it. Also, cycle the lock
+ * first to ensure we've completed IO on it.
+ */
+ xfs_buf_lock(log->l_xbuf);
+ xfs_buf_unlock(log->l_xbuf);
xfs_buf_set_empty(log->l_xbuf, BTOBB(log->l_iclog_size));
xfs_buf_free(log->l_xbuf);
iclog = log->l_iclog;
- for (i=0; i<log->l_iclog_bufs; i++) {
+ for (i = 0; i < log->l_iclog_bufs; i++) {
xfs_buf_free(iclog->ic_bp);
next_iclog = iclog->ic_next;
kmem_free(iclog);
@@ -1773,7 +2025,7 @@ xlog_print_tic_res(
for (i = 0; i < ticket->t_res_num; i++) {
uint r_type = ticket->t_res_arr[i].r_type;
- xfs_warn(mp, "region[%u]: %s - %u bytes\n", i,
+ xfs_warn(mp, "region[%u]: %s - %u bytes", i,
((r_type <= 0 || r_type > XLOG_REG_TYPE_MAX) ?
"bad-rtype" : res_type_str[r_type-1]),
ticket->t_res_arr[i].r_len);
@@ -1781,7 +2033,7 @@ xlog_print_tic_res(
xfs_alert_tag(mp, XFS_PTAG_LOGRES,
"xlog_write: reservation ran out. Need to up reservation");
- xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
+ xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR);
}
/*
@@ -1803,6 +2055,10 @@ xlog_write_calc_vec_length(
headers++;
for (lv = log_vector; lv; lv = lv->lv_next) {
+ /* we don't write ordered log vectors */
+ if (lv->lv_buf_len == XFS_LOG_VEC_ORDERED)
+ continue;
+
headers += lv->lv_niovecs;
for (i = 0; i < lv->lv_niovecs; i++) {
@@ -1880,7 +2136,7 @@ xlog_write_setup_ophdr(
* Set up the parameters of the region copy into the log. This has
* to handle region write split across multiple log buffers - this
* state is kept external to this function so that this code can
- * can be written in an obvious, self documenting manner.
+ * be written in an obvious, self documenting manner.
*/
static int
xlog_write_setup_copy(
@@ -2056,7 +2312,7 @@ xlog_write(
index = 0;
lv = log_vector;
vecp = lv->lv_iovecp;
- while (lv && index < lv->lv_niovecs) {
+ while (lv && (!lv->lv_niovecs || index < lv->lv_niovecs)) {
void *ptr;
int log_offset;
@@ -2076,13 +2332,22 @@ xlog_write(
* This loop writes out as many regions as can fit in the amount
* of space which was allocated by xlog_state_get_iclog_space().
*/
- while (lv && index < lv->lv_niovecs) {
- struct xfs_log_iovec *reg = &vecp[index];
+ while (lv && (!lv->lv_niovecs || index < lv->lv_niovecs)) {
+ struct xfs_log_iovec *reg;
struct xlog_op_header *ophdr;
int start_rec_copy;
int copy_len;
int copy_off;
+ bool ordered = false;
+ /* ordered log vectors have no regions to write */
+ if (lv->lv_buf_len == XFS_LOG_VEC_ORDERED) {
+ ASSERT(lv->lv_niovecs == 0);
+ ordered = true;
+ goto next_lv;
+ }
+
+ reg = &vecp[index];
ASSERT(reg->i_len % sizeof(__int32_t) == 0);
ASSERT((unsigned long)ptr % sizeof(__int32_t) == 0);
@@ -2142,12 +2407,13 @@ xlog_write(
break;
if (++index == lv->lv_niovecs) {
+next_lv:
lv = lv->lv_next;
index = 0;
if (lv)
vecp = lv->lv_iovecp;
}
- if (record_cnt == 0) {
+ if (record_cnt == 0 && ordered == false) {
if (!lv)
return 0;
break;
@@ -3217,24 +3483,17 @@ xfs_log_ticket_get(
}
/*
- * Allocate and initialise a new log ticket.
+ * Figure out the total log space unit (in bytes) that would be
+ * required for a log ticket.
*/
-struct xlog_ticket *
-xlog_ticket_alloc(
- struct xlog *log,
- int unit_bytes,
- int cnt,
- char client,
- bool permanent,
- xfs_km_flags_t alloc_flags)
+int
+xfs_log_calc_unit_res(
+ struct xfs_mount *mp,
+ int unit_bytes)
{
- struct xlog_ticket *tic;
- uint num_headers;
- int iclog_space;
-
- tic = kmem_zone_zalloc(xfs_log_ticket_zone, alloc_flags);
- if (!tic)
- return NULL;
+ struct xlog *log = mp->m_log;
+ int iclog_space;
+ uint num_headers;
/*
* Permanent reservations have up to 'cnt'-1 active log operations
@@ -3309,23 +3568,46 @@ xlog_ticket_alloc(
unit_bytes += log->l_iclog_hsize;
/* for roundoff padding for transaction data and one for commit record */
- if (xfs_sb_version_haslogv2(&log->l_mp->m_sb) &&
- log->l_mp->m_sb.sb_logsunit > 1) {
+ if (xfs_sb_version_haslogv2(&mp->m_sb) && mp->m_sb.sb_logsunit > 1) {
/* log su roundoff */
- unit_bytes += 2*log->l_mp->m_sb.sb_logsunit;
+ unit_bytes += 2 * mp->m_sb.sb_logsunit;
} else {
/* BB roundoff */
- unit_bytes += 2*BBSIZE;
+ unit_bytes += 2 * BBSIZE;
}
+ return unit_bytes;
+}
+
+/*
+ * Allocate and initialise a new log ticket.
+ */
+struct xlog_ticket *
+xlog_ticket_alloc(
+ struct xlog *log,
+ int unit_bytes,
+ int cnt,
+ char client,
+ bool permanent,
+ xfs_km_flags_t alloc_flags)
+{
+ struct xlog_ticket *tic;
+ int unit_res;
+
+ tic = kmem_zone_zalloc(xfs_log_ticket_zone, alloc_flags);
+ if (!tic)
+ return NULL;
+
+ unit_res = xfs_log_calc_unit_res(log->l_mp, unit_bytes);
+
atomic_set(&tic->t_ref, 1);
tic->t_task = current;
INIT_LIST_HEAD(&tic->t_queue);
- tic->t_unit_res = unit_bytes;
- tic->t_curr_res = unit_bytes;
+ tic->t_unit_res = unit_res;
+ tic->t_curr_res = unit_res;
tic->t_cnt = cnt;
tic->t_ocnt = cnt;
- tic->t_tid = random32();
+ tic->t_tid = prandom_u32();
tic->t_clientid = client;
tic->t_flags = XLOG_TIC_INITED;
tic->t_trans_type = 0;
@@ -3451,7 +3733,7 @@ xlog_verify_iclog(
struct xlog *log,
struct xlog_in_core *iclog,
int count,
- boolean_t syncing)
+ bool syncing)
{
xlog_op_header_t *ophead;
xlog_in_core_t *icptr;
@@ -3466,11 +3748,9 @@ xlog_verify_iclog(
/* check validity of iclog pointers */
spin_lock(&log->l_icloglock);
icptr = log->l_iclog;
- for (i=0; i < log->l_iclog_bufs; i++) {
- if (icptr == NULL)
- xfs_emerg(log->l_mp, "%s: invalid ptr", __func__);
- icptr = icptr->ic_next;
- }
+ for (i = 0; i < log->l_iclog_bufs; i++, icptr = icptr->ic_next)
+ ASSERT(icptr);
+
if (icptr != log->l_iclog)
xfs_emerg(log->l_mp, "%s: corrupt iclog ring", __func__);
spin_unlock(&log->l_icloglock);
@@ -3499,7 +3779,7 @@ xlog_verify_iclog(
/* clientid is only 1 byte */
field_offset = (__psint_t)
((xfs_caddr_t)&(ophead->oh_clientid) - base_ptr);
- if (syncing == B_FALSE || (field_offset & 0x1ff)) {
+ if (!syncing || (field_offset & 0x1ff)) {
clientid = ophead->oh_clientid;
} else {
idx = BTOBBT((xfs_caddr_t)&(ophead->oh_clientid) - iclog->ic_datap);
@@ -3522,7 +3802,7 @@ xlog_verify_iclog(
/* check length */
field_offset = (__psint_t)
((xfs_caddr_t)&(ophead->oh_len) - base_ptr);
- if (syncing == B_FALSE || (field_offset & 0x1ff)) {
+ if (!syncing || (field_offset & 0x1ff)) {
op_len = be32_to_cpu(ophead->oh_len);
} else {
idx = BTOBBT((__psint_t)&ophead->oh_len -
@@ -3672,11 +3952,14 @@ xfs_log_force_umount(
retval = xlog_state_ioerror(log);
spin_unlock(&log->l_icloglock);
}
+
/*
- * Wake up everybody waiting on xfs_log_force.
- * Callback all log item committed functions as if the
- * log writes were completed.
+ * Wake up everybody waiting on xfs_log_force. Wake the CIL push first
+ * as if the log writes were completed. The abort handling in the log
+ * item committed callback functions will do this again under lock to
+ * avoid races.
*/
+ wake_up_all(&log->l_cilp->xc_commit_wait);
xlog_state_do_callback(log, XFS_LI_ABORTED, NULL);
#ifdef XFSERRORDEBUG
@@ -3713,3 +3996,4 @@ xlog_iclogs_empty(
} while (iclog != log->l_iclog);
return 1;
}
+