diff options
Diffstat (limited to 'fs/xfs/xfs_log.c')
| -rw-r--r-- | fs/xfs/xfs_log.c | 774 |
1 files changed, 561 insertions, 213 deletions
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index d90d4a38860..292308dede6 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -17,23 +17,23 @@ */ #include "xfs.h" #include "xfs_fs.h" -#include "xfs_types.h" -#include "xfs_log.h" -#include "xfs_trans.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" #include "xfs_sb.h" #include "xfs_ag.h" #include "xfs_mount.h" #include "xfs_error.h" +#include "xfs_trans.h" +#include "xfs_trans_priv.h" +#include "xfs_log.h" #include "xfs_log_priv.h" -#include "xfs_buf_item.h" -#include "xfs_bmap_btree.h" -#include "xfs_alloc_btree.h" -#include "xfs_ialloc_btree.h" #include "xfs_log_recover.h" -#include "xfs_trans_priv.h" -#include "xfs_dinode.h" #include "xfs_inode.h" #include "xfs_trace.h" +#include "xfs_fsops.h" +#include "xfs_cksum.h" kmem_zone_t *xfs_log_ticket_zone; @@ -45,51 +45,85 @@ xlog_commit_record( struct xlog_in_core **iclog, xfs_lsn_t *commitlsnp); -STATIC xlog_t * xlog_alloc_log(xfs_mount_t *mp, - xfs_buftarg_t *log_target, - xfs_daddr_t blk_offset, - int num_bblks); +STATIC struct xlog * +xlog_alloc_log( + struct xfs_mount *mp, + struct xfs_buftarg *log_target, + xfs_daddr_t blk_offset, + int num_bblks); STATIC int xlog_space_left( struct xlog *log, atomic64_t *head); -STATIC int xlog_sync(xlog_t *log, xlog_in_core_t *iclog); -STATIC void xlog_dealloc_log(xlog_t *log); +STATIC int +xlog_sync( + struct xlog *log, + struct xlog_in_core *iclog); +STATIC void +xlog_dealloc_log( + struct xlog *log); /* local state machine functions */ STATIC void xlog_state_done_syncing(xlog_in_core_t *iclog, int); -STATIC void xlog_state_do_callback(xlog_t *log,int aborted, xlog_in_core_t *iclog); -STATIC int xlog_state_get_iclog_space(xlog_t *log, - int len, - xlog_in_core_t **iclog, - xlog_ticket_t *ticket, - int *continued_write, - int *logoffsetp); -STATIC int xlog_state_release_iclog(xlog_t *log, - xlog_in_core_t *iclog); -STATIC void xlog_state_switch_iclogs(xlog_t *log, - xlog_in_core_t *iclog, - int eventual_size); -STATIC void xlog_state_want_sync(xlog_t *log, xlog_in_core_t *iclog); +STATIC void +xlog_state_do_callback( + struct xlog *log, + int aborted, + struct xlog_in_core *iclog); +STATIC int +xlog_state_get_iclog_space( + struct xlog *log, + int len, + struct xlog_in_core **iclog, + struct xlog_ticket *ticket, + int *continued_write, + int *logoffsetp); +STATIC int +xlog_state_release_iclog( + struct xlog *log, + struct xlog_in_core *iclog); +STATIC void +xlog_state_switch_iclogs( + struct xlog *log, + struct xlog_in_core *iclog, + int eventual_size); +STATIC void +xlog_state_want_sync( + struct xlog *log, + struct xlog_in_core *iclog); STATIC void xlog_grant_push_ail( - struct xlog *log, - int need_bytes); -STATIC void xlog_regrant_reserve_log_space(xlog_t *log, - xlog_ticket_t *ticket); -STATIC void xlog_ungrant_log_space(xlog_t *log, - xlog_ticket_t *ticket); + struct xlog *log, + int need_bytes); +STATIC void +xlog_regrant_reserve_log_space( + struct xlog *log, + struct xlog_ticket *ticket); +STATIC void +xlog_ungrant_log_space( + struct xlog *log, + struct xlog_ticket *ticket); #if defined(DEBUG) -STATIC void xlog_verify_dest_ptr(xlog_t *log, char *ptr); +STATIC void +xlog_verify_dest_ptr( + struct xlog *log, + char *ptr); STATIC void xlog_verify_grant_tail( - struct xlog *log); -STATIC void xlog_verify_iclog(xlog_t *log, xlog_in_core_t *iclog, - int count, boolean_t syncing); -STATIC void xlog_verify_tail_lsn(xlog_t *log, xlog_in_core_t *iclog, - xfs_lsn_t tail_lsn); + struct xlog *log); +STATIC void +xlog_verify_iclog( + struct xlog *log, + struct xlog_in_core *iclog, + int count, + bool syncing); +STATIC void +xlog_verify_tail_lsn( + struct xlog *log, + struct xlog_in_core *iclog, + xfs_lsn_t tail_lsn); #else #define xlog_verify_dest_ptr(a,b) #define xlog_verify_grant_tail(a) @@ -97,7 +131,9 @@ STATIC void xlog_verify_tail_lsn(xlog_t *log, xlog_in_core_t *iclog, #define xlog_verify_tail_lsn(a,b,c) #endif -STATIC int xlog_iclogs_empty(xlog_t *log); +STATIC int +xlog_iclogs_empty( + struct xlog *log); static void xlog_grant_sub_space( @@ -219,7 +255,8 @@ xlog_grant_head_wait( struct xlog *log, struct xlog_grant_head *head, struct xlog_ticket *tic, - int need_bytes) + int need_bytes) __releases(&head->lock) + __acquires(&head->lock) { list_add_tail(&tic->t_queue, &head->waiters); @@ -422,7 +459,8 @@ xfs_log_reserve( tic->t_trans_type = t_type; *ticp = tic; - xlog_grant_push_ail(log, tic->t_unit_res * tic->t_cnt); + xlog_grant_push_ail(log, tic->t_cnt ? tic->t_unit_res * tic->t_cnt + : tic->t_unit_res); trace_xfs_log_reserve(log, tic); @@ -575,13 +613,16 @@ xfs_log_mount( xfs_daddr_t blk_offset, int num_bblks) { - int error; + int error = 0; + int min_logfsbs; - if (!(mp->m_flags & XFS_MOUNT_NORECOVERY)) - xfs_notice(mp, "Mounting Filesystem"); - else { + if (!(mp->m_flags & XFS_MOUNT_NORECOVERY)) { + xfs_notice(mp, "Mounting V%d Filesystem", + XFS_SB_VERSION_NUM(&mp->m_sb)); + } else { xfs_notice(mp, -"Mounting filesystem in no-recovery mode. Filesystem will be inconsistent."); +"Mounting V%d filesystem in no-recovery mode. Filesystem will be inconsistent.", + XFS_SB_VERSION_NUM(&mp->m_sb)); ASSERT(mp->m_flags & XFS_MOUNT_RDONLY); } @@ -592,6 +633,50 @@ xfs_log_mount( } /* + * Validate the given log space and drop a critical message via syslog + * if the log size is too small that would lead to some unexpected + * situations in transaction log space reservation stage. + * + * Note: we can't just reject the mount if the validation fails. This + * would mean that people would have to downgrade their kernel just to + * remedy the situation as there is no way to grow the log (short of + * black magic surgery with xfs_db). + * + * We can, however, reject mounts for CRC format filesystems, as the + * mkfs binary being used to make the filesystem should never create a + * filesystem with a log that is too small. + */ + min_logfsbs = xfs_log_calc_minimum_size(mp); + + if (mp->m_sb.sb_logblocks < min_logfsbs) { + xfs_warn(mp, + "Log size %d blocks too small, minimum size is %d blocks", + mp->m_sb.sb_logblocks, min_logfsbs); + error = EINVAL; + } else if (mp->m_sb.sb_logblocks > XFS_MAX_LOG_BLOCKS) { + xfs_warn(mp, + "Log size %d blocks too large, maximum size is %lld blocks", + mp->m_sb.sb_logblocks, XFS_MAX_LOG_BLOCKS); + error = EINVAL; + } else if (XFS_FSB_TO_B(mp, mp->m_sb.sb_logblocks) > XFS_MAX_LOG_BYTES) { + xfs_warn(mp, + "log size %lld bytes too large, maximum size is %lld bytes", + XFS_FSB_TO_B(mp, mp->m_sb.sb_logblocks), + XFS_MAX_LOG_BYTES); + error = EINVAL; + } + if (error) { + if (xfs_sb_version_hascrc(&mp->m_sb)) { + xfs_crit(mp, "AAIEEE! Log failed size checks. Abort!"); + ASSERT(0); + goto out_free_log; + } + xfs_crit(mp, +"Log size out of supported range. Continuing onwards, but if log hangs are\n" +"experienced then please report this message in the bug report."); + } + + /* * Initialize the AIL now we have a log. */ error = xfs_trans_ail_init(mp); @@ -643,25 +728,29 @@ out: } /* - * Finish the recovery of the file system. This is separate from - * the xfs_log_mount() call, because it depends on the code in - * xfs_mountfs() to read in the root and real-time bitmap inodes - * between calling xfs_log_mount() and here. + * Finish the recovery of the file system. This is separate from the + * xfs_log_mount() call, because it depends on the code in xfs_mountfs() to read + * in the root and real-time bitmap inodes between calling xfs_log_mount() and + * here. * - * mp - ubiquitous xfs mount point structure + * If we finish recovery successfully, start the background log work. If we are + * not doing recovery, then we have a RO filesystem and we don't need to start + * it. */ int xfs_log_mount_finish(xfs_mount_t *mp) { - int error; + int error = 0; - if (!(mp->m_flags & XFS_MOUNT_NORECOVERY)) + if (!(mp->m_flags & XFS_MOUNT_NORECOVERY)) { error = xlog_recover_finish(mp->m_log); - else { - error = 0; + if (!error) + xfs_log_work_queue(mp); + } else { ASSERT(mp->m_flags & XFS_MOUNT_RDONLY); } + return error; } @@ -677,14 +766,14 @@ xfs_log_mount_finish(xfs_mount_t *mp) * Unmount record used to have a string "Unmount filesystem--" in the * data section where the "Un" was really a magic number (XLOG_UNMOUNT_TYPE). * We just write the magic number now since that particular field isn't - * currently architecture converted and "nUmount" is a bit foo. + * currently architecture converted and "Unmount" is a bit foo. * As far as I know, there weren't any dependencies on the old behaviour. */ int xfs_log_unmount_write(xfs_mount_t *mp) { - xlog_t *log = mp->m_log; + struct xlog *log = mp->m_log; xlog_in_core_t *iclog; #ifdef DEBUG xlog_in_core_t *first_iclog; @@ -814,15 +903,49 @@ xfs_log_unmount_write(xfs_mount_t *mp) } /* xfs_log_unmount_write */ /* - * Deallocate log structures for unmount/relocation. + * Empty the log for unmount/freeze. * - * We need to stop the aild from running before we destroy - * and deallocate the log as the aild references the log. + * To do this, we first need to shut down the background log work so it is not + * trying to cover the log as we clean up. We then need to unpin all objects in + * the log so we can then flush them out. Once they have completed their IO and + * run the callbacks removing themselves from the AIL, we can write the unmount + * record. */ void -xfs_log_unmount(xfs_mount_t *mp) +xfs_log_quiesce( + struct xfs_mount *mp) { - cancel_delayed_work_sync(&mp->m_sync_work); + cancel_delayed_work_sync(&mp->m_log->l_work); + xfs_log_force(mp, XFS_LOG_SYNC); + + /* + * The superblock buffer is uncached and while xfs_ail_push_all_sync() + * will push it, xfs_wait_buftarg() will not wait for it. Further, + * xfs_buf_iowait() cannot be used because it was pushed with the + * XBF_ASYNC flag set, so we need to use a lock/unlock pair to wait for + * the IO to complete. + */ + xfs_ail_push_all_sync(mp->m_ail); + xfs_wait_buftarg(mp->m_ddev_targp); + xfs_buf_lock(mp->m_sb_bp); + xfs_buf_unlock(mp->m_sb_bp); + + xfs_log_unmount_write(mp); +} + +/* + * Shut down and release the AIL and Log. + * + * During unmount, we need to ensure we flush all the dirty metadata objects + * from the AIL so that the log is empty before we write the unmount record to + * the log. Once this is done, we can tear down the AIL and the log. + */ +void +xfs_log_unmount( + struct xfs_mount *mp) +{ + xfs_log_quiesce(mp); + xfs_trans_ail_destroy(mp); xlog_dealloc_log(mp->m_log); } @@ -877,27 +1000,34 @@ xfs_log_space_wake( } /* - * Determine if we have a transaction that has gone to disk - * that needs to be covered. To begin the transition to the idle state - * firstly the log needs to be idle (no AIL and nothing in the iclogs). - * If we are then in a state where covering is needed, the caller is informed - * that dummy transactions are required to move the log into the idle state. + * Determine if we have a transaction that has gone to disk that needs to be + * covered. To begin the transition to the idle state firstly the log needs to + * be idle. That means the CIL, the AIL and the iclogs needs to be empty before + * we start attempting to cover the log. * - * Because this is called as part of the sync process, we should also indicate - * that dummy transactions should be issued in anything but the covered or - * idle states. This ensures that the log tail is accurately reflected in - * the log at the end of the sync, hence if a crash occurrs avoids replay - * of transactions where the metadata is already on disk. + * Only if we are then in a state where covering is needed, the caller is + * informed that dummy transactions are required to move the log into the idle + * state. + * + * If there are any items in the AIl or CIL, then we do not want to attempt to + * cover the log as we may be in a situation where there isn't log space + * available to run a dummy transaction and this can lead to deadlocks when the + * tail of the log is pinned by an item that is modified in the CIL. Hence + * there's no point in running a dummy transaction at this point because we + * can't start trying to idle the log until both the CIL and AIL are empty. */ int xfs_log_need_covered(xfs_mount_t *mp) { + struct xlog *log = mp->m_log; int needed = 0; - xlog_t *log = mp->m_log; if (!xfs_fs_writable(mp)) return 0; + if (!xlog_cil_empty(log)) + return 0; + spin_lock(&log->l_icloglock); switch (log->l_covered_state) { case XLOG_STATE_COVER_DONE: @@ -906,14 +1036,17 @@ xfs_log_need_covered(xfs_mount_t *mp) break; case XLOG_STATE_COVER_NEED: case XLOG_STATE_COVER_NEED2: - if (!xfs_ail_min_lsn(log->l_ailp) && - xlog_iclogs_empty(log)) { - if (log->l_covered_state == XLOG_STATE_COVER_NEED) - log->l_covered_state = XLOG_STATE_COVER_DONE; - else - log->l_covered_state = XLOG_STATE_COVER_DONE2; - } - /* FALLTHRU */ + if (xfs_ail_min_lsn(log->l_ailp)) + break; + if (!xlog_iclogs_empty(log)) + break; + + needed = 1; + if (log->l_covered_state == XLOG_STATE_COVER_NEED) + log->l_covered_state = XLOG_STATE_COVER_DONE; + else + log->l_covered_state = XLOG_STATE_COVER_DONE2; + break; default: needed = 1; break; @@ -945,6 +1078,7 @@ xlog_assign_tail_lsn_locked( tail_lsn = lip->li_lsn; else tail_lsn = atomic64_read(&log->l_last_sync_lsn); + trace_xfs_log_assign_tail_lsn(log, tail_lsn); atomic64_set(&log->l_tail_lsn, tail_lsn); return tail_lsn; } @@ -1024,14 +1158,14 @@ xlog_space_left( void xlog_iodone(xfs_buf_t *bp) { - xlog_in_core_t *iclog = bp->b_fspriv; - xlog_t *l = iclog->ic_log; - int aborted = 0; + struct xlog_in_core *iclog = bp->b_fspriv; + struct xlog *l = iclog->ic_log; + int aborted = 0; /* * Race to shutdown the filesystem if we see an error. */ - if (XFS_TEST_ERROR((xfs_buf_geterror(bp)), l->l_mp, + if (XFS_TEST_ERROR(bp->b_error, l->l_mp, XFS_ERRTAG_IODONE_IOERR, XFS_RANDOM_IODONE_IOERR)) { xfs_buf_ioerror_alert(bp, __func__); xfs_buf_stale(bp); @@ -1049,13 +1183,15 @@ xlog_iodone(xfs_buf_t *bp) /* log I/O is always issued ASYNC */ ASSERT(XFS_BUF_ISASYNC(bp)); xlog_state_done_syncing(iclog, aborted); + /* - * do not reference the buffer (bp) here as we could race - * with it being freed after writing the unmount record to the - * log. + * drop the buffer lock now that we are done. Nothing references + * the buffer after this, so an unmount waiting on this lock can now + * tear it down safely. As such, it is unsafe to reference the buffer + * (bp) after the unlock as we could race with it being freed. */ - -} /* xlog_iodone */ + xfs_buf_unlock(bp); +} /* * Return size of each in-core log record buffer. @@ -1067,8 +1203,9 @@ xlog_iodone(xfs_buf_t *bp) */ STATIC void -xlog_get_iclog_buffer_size(xfs_mount_t *mp, - xlog_t *log) +xlog_get_iclog_buffer_size( + struct xfs_mount *mp, + struct xlog *log) { int size; int xhdrs; @@ -1124,18 +1261,53 @@ done: } /* xlog_get_iclog_buffer_size */ +void +xfs_log_work_queue( + struct xfs_mount *mp) +{ + queue_delayed_work(mp->m_log_workqueue, &mp->m_log->l_work, + msecs_to_jiffies(xfs_syncd_centisecs * 10)); +} + +/* + * Every sync period we need to unpin all items in the AIL and push them to + * disk. If there is nothing dirty, then we might need to cover the log to + * indicate that the filesystem is idle. + */ +void +xfs_log_worker( + struct work_struct *work) +{ + struct xlog *log = container_of(to_delayed_work(work), + struct xlog, l_work); + struct xfs_mount *mp = log->l_mp; + + /* dgc: errors ignored - not fatal and nowhere to report them */ + if (xfs_log_need_covered(mp)) + xfs_fs_log_dummy(mp); + else + xfs_log_force(mp, 0); + + /* start pushing all the metadata that is currently dirty */ + xfs_ail_push_all(mp->m_ail); + + /* queue us up again */ + xfs_log_work_queue(mp); +} + /* * This routine initializes some of the log structure for a given mount point. * Its primary purpose is to fill in enough, so recovery can occur. However, * some other stuff may be filled in too. */ -STATIC xlog_t * -xlog_alloc_log(xfs_mount_t *mp, - xfs_buftarg_t *log_target, - xfs_daddr_t blk_offset, - int num_bblks) +STATIC struct xlog * +xlog_alloc_log( + struct xfs_mount *mp, + struct xfs_buftarg *log_target, + xfs_daddr_t blk_offset, + int num_bblks) { - xlog_t *log; + struct xlog *log; xlog_rec_header_t *head; xlog_in_core_t **iclogp; xlog_in_core_t *iclog, *prev_iclog=NULL; @@ -1144,7 +1316,7 @@ xlog_alloc_log(xfs_mount_t *mp, int error = ENOMEM; uint log2_size = 0; - log = kmem_zalloc(sizeof(xlog_t), KM_MAYFAIL); + log = kmem_zalloc(sizeof(struct xlog), KM_MAYFAIL); if (!log) { xfs_warn(mp, "Log allocation failed: No memory!"); goto out; @@ -1157,6 +1329,7 @@ xlog_alloc_log(xfs_mount_t *mp, log->l_logBBsize = num_bblks; log->l_covered_state = XLOG_STATE_COVER_IDLE; log->l_flags |= XLOG_ACTIVE_RECOVERY; + INIT_DELAYED_WORK(&log->l_work, xfs_log_worker); log->l_prev_block = -1; /* log->l_tail_lsn = 0x100000000LL; cycle = 1; current block = 0 */ @@ -1200,8 +1373,16 @@ xlog_alloc_log(xfs_mount_t *mp, bp = xfs_buf_alloc(mp->m_logdev_targp, 0, BTOBB(log->l_iclog_size), 0); if (!bp) goto out_free_log; - bp->b_iodone = xlog_iodone; + + /* + * The iclogbuf buffer locks are held over IO but we are not going to do + * IO yet. Hence unlock the buffer so that the log IO path can grab it + * when appropriately. + */ ASSERT(xfs_buf_islocked(bp)); + xfs_buf_unlock(bp); + + bp->b_iodone = xlog_iodone; log->l_xbuf = bp; spin_lock_init(&log->l_icloglock); @@ -1230,6 +1411,9 @@ xlog_alloc_log(xfs_mount_t *mp, if (!bp) goto out_free_iclog; + ASSERT(xfs_buf_islocked(bp)); + xfs_buf_unlock(bp); + bp->b_iodone = xlog_iodone; iclog->ic_bp = bp; iclog->ic_data = bp->b_addr; @@ -1254,7 +1438,6 @@ xlog_alloc_log(xfs_mount_t *mp, iclog->ic_callback_tail = &(iclog->ic_callback); iclog->ic_datap = (char *)iclog->ic_data + log->l_iclog_hsize; - ASSERT(xfs_buf_islocked(iclog->ic_bp)); init_waitqueue_head(&iclog->ic_force_wait); init_waitqueue_head(&iclog->ic_write_wait); @@ -1379,12 +1562,96 @@ xlog_grant_push_ail( } /* + * Stamp cycle number in every block + */ +STATIC void +xlog_pack_data( + struct xlog *log, + struct xlog_in_core *iclog, + int roundoff) +{ + int i, j, k; + int size = iclog->ic_offset + roundoff; + __be32 cycle_lsn; + xfs_caddr_t dp; + + cycle_lsn = CYCLE_LSN_DISK(iclog->ic_header.h_lsn); + + dp = iclog->ic_datap; + for (i = 0; i < BTOBB(size); i++) { + if (i >= (XLOG_HEADER_CYCLE_SIZE / BBSIZE)) + break; + iclog->ic_header.h_cycle_data[i] = *(__be32 *)dp; + *(__be32 *)dp = cycle_lsn; + dp += BBSIZE; + } + + if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) { + xlog_in_core_2_t *xhdr = iclog->ic_data; + + for ( ; i < BTOBB(size); i++) { + j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE); + k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE); + xhdr[j].hic_xheader.xh_cycle_data[k] = *(__be32 *)dp; + *(__be32 *)dp = cycle_lsn; + dp += BBSIZE; + } + + for (i = 1; i < log->l_iclog_heads; i++) + xhdr[i].hic_xheader.xh_cycle = cycle_lsn; + } +} + +/* + * Calculate the checksum for a log buffer. + * + * This is a little more complicated than it should be because the various + * headers and the actual data are non-contiguous. + */ +__le32 +xlog_cksum( + struct xlog *log, + struct xlog_rec_header *rhead, + char *dp, + int size) +{ + __uint32_t crc; + + /* first generate the crc for the record header ... */ + crc = xfs_start_cksum((char *)rhead, + sizeof(struct xlog_rec_header), + offsetof(struct xlog_rec_header, h_crc)); + + /* ... then for additional cycle data for v2 logs ... */ + if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) { + union xlog_in_core2 *xhdr = (union xlog_in_core2 *)rhead; + int i; + + for (i = 1; i < log->l_iclog_heads; i++) { + crc = crc32c(crc, &xhdr[i].hic_xheader, + sizeof(struct xlog_rec_ext_header)); + } + } + + /* ... and finally for the payload */ + crc = crc32c(crc, dp, size); + + return xfs_end_cksum(crc); +} + +/* * The bdstrat callback function for log bufs. This gives us a central * place to trap bufs in case we get hit by a log I/O error and need to * shutdown. Actually, in practice, even when we didn't get a log error, * we transition the iclogs to IOERROR state *after* flushing all existing * iclogs to disk. This is because we don't want anymore new transactions to be * started or completed afterwards. + * + * We lock the iclogbufs here so that we can serialise against IO completion + * during unmount. We might be processing a shutdown triggered during unmount, + * and that can occur asynchronously to the unmount thread, and hence we need to + * ensure that completes before tearing down the iclogbufs. Hence we need to + * hold the buffer lock across the log IO to acheive that. */ STATIC int xlog_bdstrat( @@ -1392,6 +1659,7 @@ xlog_bdstrat( { struct xlog_in_core *iclog = bp->b_fspriv; + xfs_buf_lock(bp); if (iclog->ic_state & XLOG_STATE_IOERROR) { xfs_buf_ioerror(bp, EIO); xfs_buf_stale(bp); @@ -1399,7 +1667,8 @@ xlog_bdstrat( /* * It would seem logical to return EIO here, but we rely on * the log state machine to propagate I/O errors instead of - * doing it here. + * doing it here. Similarly, IO completion will unlock the + * buffer, so we don't do it here. */ return 0; } @@ -1434,10 +1703,10 @@ xlog_bdstrat( */ STATIC int -xlog_sync(xlog_t *log, - xlog_in_core_t *iclog) +xlog_sync( + struct xlog *log, + struct xlog_in_core *iclog) { - xfs_caddr_t dptr; /* pointer to byte sized element */ xfs_buf_t *bp; int i; uint count; /* byte count of bwrite */ @@ -1446,6 +1715,7 @@ xlog_sync(xlog_t *log, int split = 0; /* split write into two regions */ int error; int v2 = xfs_sb_version_haslogv2(&log->l_mp->m_sb); + int size; XFS_STATS_INC(xs_log_writes); ASSERT(atomic_read(&iclog->ic_refcnt) == 0); @@ -1476,13 +1746,10 @@ xlog_sync(xlog_t *log, xlog_pack_data(log, iclog, roundoff); /* real byte length */ - if (v2) { - iclog->ic_header.h_len = - cpu_to_be32(iclog->ic_offset + roundoff); - } else { - iclog->ic_header.h_len = - cpu_to_be32(iclog->ic_offset); - } + size = iclog->ic_offset; + if (v2) + size += roundoff; + iclog->ic_header.h_len = cpu_to_be32(size); bp = iclog->ic_bp; XFS_BUF_SET_ADDR(bp, BLOCK_LSN(be64_to_cpu(iclog->ic_header.h_lsn))); @@ -1491,12 +1758,36 @@ xlog_sync(xlog_t *log, /* Do we need to split this write into 2 parts? */ if (XFS_BUF_ADDR(bp) + BTOBB(count) > log->l_logBBsize) { + char *dptr; + split = count - (BBTOB(log->l_logBBsize - XFS_BUF_ADDR(bp))); count = BBTOB(log->l_logBBsize - XFS_BUF_ADDR(bp)); - iclog->ic_bwritecnt = 2; /* split into 2 writes */ + iclog->ic_bwritecnt = 2; + + /* + * Bump the cycle numbers at the start of each block in the + * part of the iclog that ends up in the buffer that gets + * written to the start of the log. + * + * Watch out for the header magic number case, though. + */ + dptr = (char *)&iclog->ic_header + count; + for (i = 0; i < split; i += BBSIZE) { + __uint32_t cycle = be32_to_cpu(*(__be32 *)dptr); + if (++cycle == XLOG_HEADER_MAGIC_NUM) + cycle++; + *(__be32 *)dptr = cpu_to_be32(cycle); + + dptr += BBSIZE; + } } else { iclog->ic_bwritecnt = 1; } + + /* calculcate the checksum */ + iclog->ic_header.h_crc = xlog_cksum(log, &iclog->ic_header, + iclog->ic_datap, size); + bp->b_io_length = BTOBB(count); bp->b_fspriv = iclog; XFS_BUF_ZEROFLAGS(bp); @@ -1524,7 +1815,7 @@ xlog_sync(xlog_t *log, ASSERT(XFS_BUF_ADDR(bp) <= log->l_logBBsize-1); ASSERT(XFS_BUF_ADDR(bp) + BTOBB(count) <= log->l_logBBsize); - xlog_verify_iclog(log, iclog, count, B_TRUE); + xlog_verify_iclog(log, iclog, count, true); /* account for log which doesn't start at block #0 */ XFS_BUF_SET_ADDR(bp, XFS_BUF_ADDR(bp) + log->l_logBBstart); @@ -1550,19 +1841,6 @@ xlog_sync(xlog_t *log, bp->b_flags |= XBF_SYNCIO; if (log->l_mp->m_flags & XFS_MOUNT_BARRIER) bp->b_flags |= XBF_FUA; - dptr = bp->b_addr; - /* - * Bump the cycle numbers at the start of each block - * since this part of the buffer is at the start of - * a new cycle. Watch out for the header magic number - * case, though. - */ - for (i = 0; i < split; i += BBSIZE) { - be32_add_cpu((__be32 *)dptr, 1); - if (be32_to_cpu(*(__be32 *)dptr) == XLOG_HEADER_MAGIC_NUM) - be32_add_cpu((__be32 *)dptr, 1); - dptr += BBSIZE; - } ASSERT(XFS_BUF_ADDR(bp) <= log->l_logBBsize-1); ASSERT(XFS_BUF_ADDR(bp) + BTOBB(count) <= log->l_logBBsize); @@ -1579,12 +1857,12 @@ xlog_sync(xlog_t *log, return 0; } /* xlog_sync */ - /* * Deallocate a log structure */ STATIC void -xlog_dealloc_log(xlog_t *log) +xlog_dealloc_log( + struct xlog *log) { xlog_in_core_t *iclog, *next_iclog; int i; @@ -1592,14 +1870,28 @@ xlog_dealloc_log(xlog_t *log) xlog_cil_destroy(log); /* - * always need to ensure that the extra buffer does not point to memory - * owned by another log buffer before we free it. + * Cycle all the iclogbuf locks to make sure all log IO completion + * is done before we tear down these buffers. + */ + iclog = log->l_iclog; + for (i = 0; i < log->l_iclog_bufs; i++) { + xfs_buf_lock(iclog->ic_bp); + xfs_buf_unlock(iclog->ic_bp); + iclog = iclog->ic_next; + } + + /* + * Always need to ensure that the extra buffer does not point to memory + * owned by another log buffer before we free it. Also, cycle the lock + * first to ensure we've completed IO on it. */ + xfs_buf_lock(log->l_xbuf); + xfs_buf_unlock(log->l_xbuf); xfs_buf_set_empty(log->l_xbuf, BTOBB(log->l_iclog_size)); xfs_buf_free(log->l_xbuf); iclog = log->l_iclog; - for (i=0; i<log->l_iclog_bufs; i++) { + for (i = 0; i < log->l_iclog_bufs; i++) { xfs_buf_free(iclog->ic_bp); next_iclog = iclog->ic_next; kmem_free(iclog); @@ -1616,10 +1908,11 @@ xlog_dealloc_log(xlog_t *log) */ /* ARGSUSED */ static inline void -xlog_state_finish_copy(xlog_t *log, - xlog_in_core_t *iclog, - int record_cnt, - int copy_bytes) +xlog_state_finish_copy( + struct xlog *log, + struct xlog_in_core *iclog, + int record_cnt, + int copy_bytes) { spin_lock(&log->l_icloglock); @@ -1732,7 +2025,7 @@ xlog_print_tic_res( for (i = 0; i < ticket->t_res_num; i++) { uint r_type = ticket->t_res_arr[i].r_type; - xfs_warn(mp, "region[%u]: %s - %u bytes\n", i, + xfs_warn(mp, "region[%u]: %s - %u bytes", i, ((r_type <= 0 || r_type > XLOG_REG_TYPE_MAX) ? "bad-rtype" : res_type_str[r_type-1]), ticket->t_res_arr[i].r_len); @@ -1740,7 +2033,7 @@ xlog_print_tic_res( xfs_alert_tag(mp, XFS_PTAG_LOGRES, "xlog_write: reservation ran out. Need to up reservation"); - xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); + xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR); } /* @@ -1762,6 +2055,10 @@ xlog_write_calc_vec_length( headers++; for (lv = log_vector; lv; lv = lv->lv_next) { + /* we don't write ordered log vectors */ + if (lv->lv_buf_len == XFS_LOG_VEC_ORDERED) + continue; + headers += lv->lv_niovecs; for (i = 0; i < lv->lv_niovecs; i++) { @@ -1839,7 +2136,7 @@ xlog_write_setup_ophdr( * Set up the parameters of the region copy into the log. This has * to handle region write split across multiple log buffers - this * state is kept external to this function so that this code can - * can be written in an obvious, self documenting manner. + * be written in an obvious, self documenting manner. */ static int xlog_write_setup_copy( @@ -2015,7 +2312,7 @@ xlog_write( index = 0; lv = log_vector; vecp = lv->lv_iovecp; - while (lv && index < lv->lv_niovecs) { + while (lv && (!lv->lv_niovecs || index < lv->lv_niovecs)) { void *ptr; int log_offset; @@ -2035,13 +2332,22 @@ xlog_write( * This loop writes out as many regions as can fit in the amount * of space which was allocated by xlog_state_get_iclog_space(). */ - while (lv && index < lv->lv_niovecs) { - struct xfs_log_iovec *reg = &vecp[index]; + while (lv && (!lv->lv_niovecs || index < lv->lv_niovecs)) { + struct xfs_log_iovec *reg; struct xlog_op_header *ophdr; int start_rec_copy; int copy_len; int copy_off; + bool ordered = false; + + /* ordered log vectors have no regions to write */ + if (lv->lv_buf_len == XFS_LOG_VEC_ORDERED) { + ASSERT(lv->lv_niovecs == 0); + ordered = true; + goto next_lv; + } + reg = &vecp[index]; ASSERT(reg->i_len % sizeof(__int32_t) == 0); ASSERT((unsigned long)ptr % sizeof(__int32_t) == 0); @@ -2101,12 +2407,13 @@ xlog_write( break; if (++index == lv->lv_niovecs) { +next_lv: lv = lv->lv_next; index = 0; if (lv) vecp = lv->lv_iovecp; } - if (record_cnt == 0) { + if (record_cnt == 0 && ordered == false) { if (!lv) return 0; break; @@ -2142,7 +2449,8 @@ xlog_write( * State Change: DIRTY -> ACTIVE */ STATIC void -xlog_state_clean_log(xlog_t *log) +xlog_state_clean_log( + struct xlog *log) { xlog_in_core_t *iclog; int changed = 0; @@ -2222,7 +2530,7 @@ xlog_state_clean_log(xlog_t *log) STATIC xfs_lsn_t xlog_get_lowest_lsn( - xlog_t *log) + struct xlog *log) { xlog_in_core_t *lsn_log; xfs_lsn_t lowest_lsn, lsn; @@ -2245,9 +2553,9 @@ xlog_get_lowest_lsn( STATIC void xlog_state_do_callback( - xlog_t *log, - int aborted, - xlog_in_core_t *ciclog) + struct xlog *log, + int aborted, + struct xlog_in_core *ciclog) { xlog_in_core_t *iclog; xlog_in_core_t *first_iclog; /* used to know when we've @@ -2345,14 +2653,27 @@ xlog_state_do_callback( /* - * update the last_sync_lsn before we drop the + * Completion of a iclog IO does not imply that + * a transaction has completed, as transactions + * can be large enough to span many iclogs. We + * cannot change the tail of the log half way + * through a transaction as this may be the only + * transaction in the log and moving th etail to + * point to the middle of it will prevent + * recovery from finding the start of the + * transaction. Hence we should only update the + * last_sync_lsn if this iclog contains + * transaction completion callbacks on it. + * + * We have to do this before we drop the * icloglock to ensure we are the only one that * can update it. */ ASSERT(XFS_LSN_CMP(atomic64_read(&log->l_last_sync_lsn), be64_to_cpu(iclog->ic_header.h_lsn)) <= 0); - atomic64_set(&log->l_last_sync_lsn, - be64_to_cpu(iclog->ic_header.h_lsn)); + if (iclog->ic_callback) + atomic64_set(&log->l_last_sync_lsn, + be64_to_cpu(iclog->ic_header.h_lsn)); } else ioerrors++; @@ -2467,7 +2788,7 @@ xlog_state_done_syncing( xlog_in_core_t *iclog, int aborted) { - xlog_t *log = iclog->ic_log; + struct xlog *log = iclog->ic_log; spin_lock(&log->l_icloglock); @@ -2521,12 +2842,13 @@ xlog_state_done_syncing( * is copied. */ STATIC int -xlog_state_get_iclog_space(xlog_t *log, - int len, - xlog_in_core_t **iclogp, - xlog_ticket_t *ticket, - int *continued_write, - int *logoffsetp) +xlog_state_get_iclog_space( + struct xlog *log, + int len, + struct xlog_in_core **iclogp, + struct xlog_ticket *ticket, + int *continued_write, + int *logoffsetp) { int log_offset; xlog_rec_header_t *head; @@ -2631,8 +2953,9 @@ restart: * move grant reservation head forward. */ STATIC void -xlog_regrant_reserve_log_space(xlog_t *log, - xlog_ticket_t *ticket) +xlog_regrant_reserve_log_space( + struct xlog *log, + struct xlog_ticket *ticket) { trace_xfs_log_regrant_reserve_enter(log, ticket); @@ -2677,8 +3000,9 @@ xlog_regrant_reserve_log_space(xlog_t *log, * in the current reservation field. */ STATIC void -xlog_ungrant_log_space(xlog_t *log, - xlog_ticket_t *ticket) +xlog_ungrant_log_space( + struct xlog *log, + struct xlog_ticket *ticket) { int bytes; @@ -2717,8 +3041,8 @@ xlog_ungrant_log_space(xlog_t *log, */ STATIC int xlog_state_release_iclog( - xlog_t *log, - xlog_in_core_t *iclog) + struct xlog *log, + struct xlog_in_core *iclog) { int sync = 0; /* do we sync? */ @@ -2768,9 +3092,10 @@ xlog_state_release_iclog( * that every data block. We have run out of space in this log record. */ STATIC void -xlog_state_switch_iclogs(xlog_t *log, - xlog_in_core_t *iclog, - int eventual_size) +xlog_state_switch_iclogs( + struct xlog *log, + struct xlog_in_core *iclog, + int eventual_size) { ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE); if (!eventual_size) @@ -3114,7 +3439,9 @@ xfs_log_force_lsn( * disk. */ STATIC void -xlog_state_want_sync(xlog_t *log, xlog_in_core_t *iclog) +xlog_state_want_sync( + struct xlog *log, + struct xlog_in_core *iclog) { assert_spin_locked(&log->l_icloglock); @@ -3156,24 +3483,17 @@ xfs_log_ticket_get( } /* - * Allocate and initialise a new log ticket. + * Figure out the total log space unit (in bytes) that would be + * required for a log ticket. */ -xlog_ticket_t * -xlog_ticket_alloc( - struct xlog *log, - int unit_bytes, - int cnt, - char client, - bool permanent, - xfs_km_flags_t alloc_flags) +int +xfs_log_calc_unit_res( + struct xfs_mount *mp, + int unit_bytes) { - struct xlog_ticket *tic; - uint num_headers; - int iclog_space; - - tic = kmem_zone_zalloc(xfs_log_ticket_zone, alloc_flags); - if (!tic) - return NULL; + struct xlog *log = mp->m_log; + int iclog_space; + uint num_headers; /* * Permanent reservations have up to 'cnt'-1 active log operations @@ -3248,23 +3568,46 @@ xlog_ticket_alloc( unit_bytes += log->l_iclog_hsize; /* for roundoff padding for transaction data and one for commit record */ - if (xfs_sb_version_haslogv2(&log->l_mp->m_sb) && - log->l_mp->m_sb.sb_logsunit > 1) { + if (xfs_sb_version_haslogv2(&mp->m_sb) && mp->m_sb.sb_logsunit > 1) { /* log su roundoff */ - unit_bytes += 2*log->l_mp->m_sb.sb_logsunit; + unit_bytes += 2 * mp->m_sb.sb_logsunit; } else { /* BB roundoff */ - unit_bytes += 2*BBSIZE; + unit_bytes += 2 * BBSIZE; } + return unit_bytes; +} + +/* + * Allocate and initialise a new log ticket. + */ +struct xlog_ticket * +xlog_ticket_alloc( + struct xlog *log, + int unit_bytes, + int cnt, + char client, + bool permanent, + xfs_km_flags_t alloc_flags) +{ + struct xlog_ticket *tic; + int unit_res; + + tic = kmem_zone_zalloc(xfs_log_ticket_zone, alloc_flags); + if (!tic) + return NULL; + + unit_res = xfs_log_calc_unit_res(log->l_mp, unit_bytes); + atomic_set(&tic->t_ref, 1); tic->t_task = current; INIT_LIST_HEAD(&tic->t_queue); - tic->t_unit_res = unit_bytes; - tic->t_curr_res = unit_bytes; + tic->t_unit_res = unit_res; + tic->t_curr_res = unit_res; tic->t_cnt = cnt; tic->t_ocnt = cnt; - tic->t_tid = random32(); + tic->t_tid = prandom_u32(); tic->t_clientid = client; tic->t_flags = XLOG_TIC_INITED; tic->t_trans_type = 0; @@ -3346,9 +3689,10 @@ xlog_verify_grant_tail( /* check if it will fit */ STATIC void -xlog_verify_tail_lsn(xlog_t *log, - xlog_in_core_t *iclog, - xfs_lsn_t tail_lsn) +xlog_verify_tail_lsn( + struct xlog *log, + struct xlog_in_core *iclog, + xfs_lsn_t tail_lsn) { int blocks; @@ -3385,10 +3729,11 @@ xlog_verify_tail_lsn(xlog_t *log, * the cycle numbers agree with the current cycle number. */ STATIC void -xlog_verify_iclog(xlog_t *log, - xlog_in_core_t *iclog, - int count, - boolean_t syncing) +xlog_verify_iclog( + struct xlog *log, + struct xlog_in_core *iclog, + int count, + bool syncing) { xlog_op_header_t *ophead; xlog_in_core_t *icptr; @@ -3403,11 +3748,9 @@ xlog_verify_iclog(xlog_t *log, /* check validity of iclog pointers */ spin_lock(&log->l_icloglock); icptr = log->l_iclog; - for (i=0; i < log->l_iclog_bufs; i++) { - if (icptr == NULL) - xfs_emerg(log->l_mp, "%s: invalid ptr", __func__); - icptr = icptr->ic_next; - } + for (i = 0; i < log->l_iclog_bufs; i++, icptr = icptr->ic_next) + ASSERT(icptr); + if (icptr != log->l_iclog) xfs_emerg(log->l_mp, "%s: corrupt iclog ring", __func__); spin_unlock(&log->l_icloglock); @@ -3436,7 +3779,7 @@ xlog_verify_iclog(xlog_t *log, /* clientid is only 1 byte */ field_offset = (__psint_t) ((xfs_caddr_t)&(ophead->oh_clientid) - base_ptr); - if (syncing == B_FALSE || (field_offset & 0x1ff)) { + if (!syncing || (field_offset & 0x1ff)) { clientid = ophead->oh_clientid; } else { idx = BTOBBT((xfs_caddr_t)&(ophead->oh_clientid) - iclog->ic_datap); @@ -3459,7 +3802,7 @@ xlog_verify_iclog(xlog_t *log, /* check length */ field_offset = (__psint_t) ((xfs_caddr_t)&(ophead->oh_len) - base_ptr); - if (syncing == B_FALSE || (field_offset & 0x1ff)) { + if (!syncing || (field_offset & 0x1ff)) { op_len = be32_to_cpu(ophead->oh_len); } else { idx = BTOBBT((__psint_t)&ophead->oh_len - @@ -3482,7 +3825,7 @@ xlog_verify_iclog(xlog_t *log, */ STATIC int xlog_state_ioerror( - xlog_t *log) + struct xlog *log) { xlog_in_core_t *iclog, *ic; @@ -3527,7 +3870,7 @@ xfs_log_force_umount( struct xfs_mount *mp, int logerror) { - xlog_t *log; + struct xlog *log; int retval; log = mp->m_log; @@ -3609,11 +3952,14 @@ xfs_log_force_umount( retval = xlog_state_ioerror(log); spin_unlock(&log->l_icloglock); } + /* - * Wake up everybody waiting on xfs_log_force. - * Callback all log item committed functions as if the - * log writes were completed. + * Wake up everybody waiting on xfs_log_force. Wake the CIL push first + * as if the log writes were completed. The abort handling in the log + * item committed callback functions will do this again under lock to + * avoid races. */ + wake_up_all(&log->l_cilp->xc_commit_wait); xlog_state_do_callback(log, XFS_LI_ABORTED, NULL); #ifdef XFSERRORDEBUG @@ -3634,7 +3980,8 @@ xfs_log_force_umount( } STATIC int -xlog_iclogs_empty(xlog_t *log) +xlog_iclogs_empty( + struct xlog *log) { xlog_in_core_t *iclog; @@ -3649,3 +3996,4 @@ xlog_iclogs_empty(xlog_t *log) } while (iclog != log->l_iclog); return 1; } + |
