diff options
Diffstat (limited to 'fs/xfs/xfs_buf_item.c')
| -rw-r--r-- | fs/xfs/xfs_buf_item.c | 904 |
1 files changed, 496 insertions, 408 deletions
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 6f8c21ce0d6..4654338b03f 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -17,18 +17,18 @@ */ #include "xfs.h" #include "xfs_fs.h" -#include "xfs_types.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" #include "xfs_bit.h" -#include "xfs_log.h" -#include "xfs_inum.h" -#include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" #include "xfs_mount.h" +#include "xfs_trans.h" #include "xfs_buf_item.h" #include "xfs_trans_priv.h" #include "xfs_error.h" #include "xfs_trace.h" +#include "xfs_log.h" kmem_zone_t *xfs_buf_item_zone; @@ -38,110 +38,15 @@ static inline struct xfs_buf_log_item *BUF_ITEM(struct xfs_log_item *lip) return container_of(lip, struct xfs_buf_log_item, bli_item); } +STATIC void xfs_buf_do_callbacks(struct xfs_buf *bp); -#ifdef XFS_TRANS_DEBUG -/* - * This function uses an alternate strategy for tracking the bytes - * that the user requests to be logged. This can then be used - * in conjunction with the bli_orig array in the buf log item to - * catch bugs in our callers' code. - * - * We also double check the bits set in xfs_buf_item_log using a - * simple algorithm to check that every byte is accounted for. - */ -STATIC void -xfs_buf_item_log_debug( - xfs_buf_log_item_t *bip, - uint first, - uint last) -{ - uint x; - uint byte; - uint nbytes; - uint chunk_num; - uint word_num; - uint bit_num; - uint bit_set; - uint *wordp; - - ASSERT(bip->bli_logged != NULL); - byte = first; - nbytes = last - first + 1; - bfset(bip->bli_logged, first, nbytes); - for (x = 0; x < nbytes; x++) { - chunk_num = byte >> XFS_BLF_SHIFT; - word_num = chunk_num >> BIT_TO_WORD_SHIFT; - bit_num = chunk_num & (NBWORD - 1); - wordp = &(bip->bli_format.blf_data_map[word_num]); - bit_set = *wordp & (1 << bit_num); - ASSERT(bit_set); - byte++; - } -} - -/* - * This function is called when we flush something into a buffer without - * logging it. This happens for things like inodes which are logged - * separately from the buffer. - */ -void -xfs_buf_item_flush_log_debug( - xfs_buf_t *bp, - uint first, - uint last) -{ - xfs_buf_log_item_t *bip; - uint nbytes; - - bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t*); - if ((bip == NULL) || (bip->bli_item.li_type != XFS_LI_BUF)) { - return; - } - - ASSERT(bip->bli_logged != NULL); - nbytes = last - first + 1; - bfset(bip->bli_logged, first, nbytes); -} - -/* - * This function is called to verify that our callers have logged - * all the bytes that they changed. - * - * It does this by comparing the original copy of the buffer stored in - * the buf log item's bli_orig array to the current copy of the buffer - * and ensuring that all bytes which mismatch are set in the bli_logged - * array of the buf log item. - */ -STATIC void -xfs_buf_item_log_check( - xfs_buf_log_item_t *bip) +static inline int +xfs_buf_log_format_size( + struct xfs_buf_log_format *blfp) { - char *orig; - char *buffer; - int x; - xfs_buf_t *bp; - - ASSERT(bip->bli_orig != NULL); - ASSERT(bip->bli_logged != NULL); - - bp = bip->bli_buf; - ASSERT(XFS_BUF_COUNT(bp) > 0); - ASSERT(XFS_BUF_PTR(bp) != NULL); - orig = bip->bli_orig; - buffer = XFS_BUF_PTR(bp); - for (x = 0; x < XFS_BUF_COUNT(bp); x++) { - if (orig[x] != buffer[x] && !btst(bip->bli_logged, x)) - cmn_err(CE_PANIC, - "xfs_buf_item_log_check bip %x buffer %x orig %x index %d", - bip, bp, orig, x); - } + return offsetof(struct xfs_buf_log_format, blf_data_map) + + (blfp->blf_map_size * sizeof(blfp->blf_data_map[0])); } -#else -#define xfs_buf_item_log_debug(x,y,z) -#define xfs_buf_item_log_check(x) -#endif - -STATIC void xfs_buf_do_callbacks(struct xfs_buf *bp); /* * This returns the number of log iovecs needed to log the @@ -153,34 +58,28 @@ STATIC void xfs_buf_do_callbacks(struct xfs_buf *bp); * * If the XFS_BLI_STALE flag has been set, then log nothing. */ -STATIC uint -xfs_buf_item_size( - struct xfs_log_item *lip) +STATIC void +xfs_buf_item_size_segment( + struct xfs_buf_log_item *bip, + struct xfs_buf_log_format *blfp, + int *nvecs, + int *nbytes) { - struct xfs_buf_log_item *bip = BUF_ITEM(lip); struct xfs_buf *bp = bip->bli_buf; - uint nvecs; int next_bit; int last_bit; - ASSERT(atomic_read(&bip->bli_refcount) > 0); - if (bip->bli_flags & XFS_BLI_STALE) { - /* - * The buffer is stale, so all we need to log - * is the buf log format structure with the - * cancel flag in it. - */ - trace_xfs_buf_item_size_stale(bip); - ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL); - return 1; - } + last_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size, 0); + if (last_bit == -1) + return; + + /* + * initial count for a dirty buffer is 2 vectors - the format structure + * and the first dirty region. + */ + *nvecs += 2; + *nbytes += xfs_buf_log_format_size(blfp) + XFS_BLF_CHUNK; - ASSERT(bip->bli_flags & XFS_BLI_LOGGED); - nvecs = 1; - last_bit = xfs_next_bit(bip->bli_format.blf_data_map, - bip->bli_format.blf_map_size, 0); - ASSERT(last_bit != -1); - nvecs++; while (last_bit != -1) { /* * This takes the bit number to start looking from and @@ -188,88 +87,165 @@ xfs_buf_item_size( * if there are no more bits set or the start bit is * beyond the end of the bitmap. */ - next_bit = xfs_next_bit(bip->bli_format.blf_data_map, - bip->bli_format.blf_map_size, - last_bit + 1); + next_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size, + last_bit + 1); /* * If we run out of bits, leave the loop, * else if we find a new set of bits bump the number of vecs, * else keep scanning the current set of bits. */ if (next_bit == -1) { - last_bit = -1; + break; } else if (next_bit != last_bit + 1) { last_bit = next_bit; - nvecs++; + (*nvecs)++; } else if (xfs_buf_offset(bp, next_bit * XFS_BLF_CHUNK) != (xfs_buf_offset(bp, last_bit * XFS_BLF_CHUNK) + XFS_BLF_CHUNK)) { last_bit = next_bit; - nvecs++; + (*nvecs)++; } else { last_bit++; } + *nbytes += XFS_BLF_CHUNK; } - - trace_xfs_buf_item_size(bip); - return nvecs; } /* - * This is called to fill in the vector of log iovecs for the - * given log buf item. It fills the first entry with a buf log - * format structure, and the rest point to contiguous chunks - * within the buffer. + * This returns the number of log iovecs needed to log the given buf log item. + * + * It calculates this as 1 iovec for the buf log format structure and 1 for each + * stretch of non-contiguous chunks to be logged. Contiguous chunks are logged + * in a single iovec. + * + * Discontiguous buffers need a format structure per region that that is being + * logged. This makes the changes in the buffer appear to log recovery as though + * they came from separate buffers, just like would occur if multiple buffers + * were used instead of a single discontiguous buffer. This enables + * discontiguous buffers to be in-memory constructs, completely transparent to + * what ends up on disk. + * + * If the XFS_BLI_STALE flag has been set, then log nothing but the buf log + * format structures. */ STATIC void -xfs_buf_item_format( +xfs_buf_item_size( struct xfs_log_item *lip, - struct xfs_log_iovec *vecp) + int *nvecs, + int *nbytes) { struct xfs_buf_log_item *bip = BUF_ITEM(lip); + int i; + + ASSERT(atomic_read(&bip->bli_refcount) > 0); + if (bip->bli_flags & XFS_BLI_STALE) { + /* + * The buffer is stale, so all we need to log + * is the buf log format structure with the + * cancel flag in it. + */ + trace_xfs_buf_item_size_stale(bip); + ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL); + *nvecs += bip->bli_format_count; + for (i = 0; i < bip->bli_format_count; i++) { + *nbytes += xfs_buf_log_format_size(&bip->bli_formats[i]); + } + return; + } + + ASSERT(bip->bli_flags & XFS_BLI_LOGGED); + + if (bip->bli_flags & XFS_BLI_ORDERED) { + /* + * The buffer has been logged just to order it. + * It is not being included in the transaction + * commit, so no vectors are used at all. + */ + trace_xfs_buf_item_size_ordered(bip); + *nvecs = XFS_LOG_VEC_ORDERED; + return; + } + + /* + * the vector count is based on the number of buffer vectors we have + * dirty bits in. This will only be greater than one when we have a + * compound buffer with more than one segment dirty. Hence for compound + * buffers we need to track which segment the dirty bits correspond to, + * and when we move from one segment to the next increment the vector + * count for the extra buf log format structure that will need to be + * written. + */ + for (i = 0; i < bip->bli_format_count; i++) { + xfs_buf_item_size_segment(bip, &bip->bli_formats[i], + nvecs, nbytes); + } + trace_xfs_buf_item_size(bip); +} + +static inline void +xfs_buf_item_copy_iovec( + struct xfs_log_vec *lv, + struct xfs_log_iovec **vecp, + struct xfs_buf *bp, + uint offset, + int first_bit, + uint nbits) +{ + offset += first_bit * XFS_BLF_CHUNK; + xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_BCHUNK, + xfs_buf_offset(bp, offset), + nbits * XFS_BLF_CHUNK); +} + +static inline bool +xfs_buf_item_straddle( + struct xfs_buf *bp, + uint offset, + int next_bit, + int last_bit) +{ + return xfs_buf_offset(bp, offset + (next_bit << XFS_BLF_SHIFT)) != + (xfs_buf_offset(bp, offset + (last_bit << XFS_BLF_SHIFT)) + + XFS_BLF_CHUNK); +} + +static void +xfs_buf_item_format_segment( + struct xfs_buf_log_item *bip, + struct xfs_log_vec *lv, + struct xfs_log_iovec **vecp, + uint offset, + struct xfs_buf_log_format *blfp) +{ struct xfs_buf *bp = bip->bli_buf; uint base_size; - uint nvecs; int first_bit; int last_bit; int next_bit; uint nbits; - uint buffer_offset; - ASSERT(atomic_read(&bip->bli_refcount) > 0); - ASSERT((bip->bli_flags & XFS_BLI_LOGGED) || - (bip->bli_flags & XFS_BLI_STALE)); + /* copy the flags across from the base format item */ + blfp->blf_flags = bip->__bli_format.blf_flags; /* - * The size of the base structure is the size of the - * declared structure plus the space for the extra words - * of the bitmap. We subtract one from the map size, because - * the first element of the bitmap is accounted for in the - * size of the base structure. + * Base size is the actual size of the ondisk structure - it reflects + * the actual size of the dirty bitmap rather than the size of the in + * memory structure. */ - base_size = - (uint)(sizeof(xfs_buf_log_format_t) + - ((bip->bli_format.blf_map_size - 1) * sizeof(uint))); - vecp->i_addr = &bip->bli_format; - vecp->i_len = base_size; - vecp->i_type = XLOG_REG_TYPE_BFORMAT; - vecp++; - nvecs = 1; + base_size = xfs_buf_log_format_size(blfp); - /* - * If it is an inode buffer, transfer the in-memory state to the - * format flags and clear the in-memory state. We do not transfer - * this state if the inode buffer allocation has not yet been committed - * to the log as setting the XFS_BLI_INODE_BUF flag will prevent - * correct replay of the inode allocation. - */ - if (bip->bli_flags & XFS_BLI_INODE_BUF) { - if (!((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) && - xfs_log_item_in_current_chkpt(lip))) - bip->bli_format.blf_flags |= XFS_BLF_INODE_BUF; - bip->bli_flags &= ~XFS_BLI_INODE_BUF; + first_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size, 0); + if (!(bip->bli_flags & XFS_BLI_STALE) && first_bit == -1) { + /* + * If the map is not be dirty in the transaction, mark + * the size as zero and do not advance the vector pointer. + */ + return; } + blfp = xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_BFORMAT, blfp, base_size); + blfp->blf_size = 1; + if (bip->bli_flags & XFS_BLI_STALE) { /* * The buffer is stale, so all we need to log @@ -277,17 +253,14 @@ xfs_buf_item_format( * cancel flag in it. */ trace_xfs_buf_item_format_stale(bip); - ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL); - bip->bli_format.blf_size = nvecs; + ASSERT(blfp->blf_flags & XFS_BLF_CANCEL); return; } + /* * Fill in an iovec for each set of contiguous chunks. */ - first_bit = xfs_next_bit(bip->bli_format.blf_data_map, - bip->bli_format.blf_map_size, 0); - ASSERT(first_bit != -1); last_bit = first_bit; nbits = 1; for (;;) { @@ -297,48 +270,25 @@ xfs_buf_item_format( * if there are no more bits set or the start bit is * beyond the end of the bitmap. */ - next_bit = xfs_next_bit(bip->bli_format.blf_data_map, - bip->bli_format.blf_map_size, - (uint)last_bit + 1); + next_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size, + (uint)last_bit + 1); /* - * If we run out of bits fill in the last iovec and get - * out of the loop. - * Else if we start a new set of bits then fill in the - * iovec for the series we were looking at and start - * counting the bits in the new one. - * Else we're still in the same set of bits so just - * keep counting and scanning. + * If we run out of bits fill in the last iovec and get out of + * the loop. Else if we start a new set of bits then fill in + * the iovec for the series we were looking at and start + * counting the bits in the new one. Else we're still in the + * same set of bits so just keep counting and scanning. */ if (next_bit == -1) { - buffer_offset = first_bit * XFS_BLF_CHUNK; - vecp->i_addr = xfs_buf_offset(bp, buffer_offset); - vecp->i_len = nbits * XFS_BLF_CHUNK; - vecp->i_type = XLOG_REG_TYPE_BCHUNK; - nvecs++; + xfs_buf_item_copy_iovec(lv, vecp, bp, offset, + first_bit, nbits); + blfp->blf_size++; break; - } else if (next_bit != last_bit + 1) { - buffer_offset = first_bit * XFS_BLF_CHUNK; - vecp->i_addr = xfs_buf_offset(bp, buffer_offset); - vecp->i_len = nbits * XFS_BLF_CHUNK; - vecp->i_type = XLOG_REG_TYPE_BCHUNK; - nvecs++; - vecp++; - first_bit = next_bit; - last_bit = next_bit; - nbits = 1; - } else if (xfs_buf_offset(bp, next_bit << XFS_BLF_SHIFT) != - (xfs_buf_offset(bp, last_bit << XFS_BLF_SHIFT) + - XFS_BLF_CHUNK)) { - buffer_offset = first_bit * XFS_BLF_CHUNK; - vecp->i_addr = xfs_buf_offset(bp, buffer_offset); - vecp->i_len = nbits * XFS_BLF_CHUNK; - vecp->i_type = XLOG_REG_TYPE_BCHUNK; -/* You would think we need to bump the nvecs here too, but we do not - * this number is used by recovery, and it gets confused by the boundary - * split here - * nvecs++; - */ - vecp++; + } else if (next_bit != last_bit + 1 || + xfs_buf_item_straddle(bp, offset, next_bit, last_bit)) { + xfs_buf_item_copy_iovec(lv, vecp, bp, offset, + first_bit, nbits); + blfp->blf_size++; first_bit = next_bit; last_bit = next_bit; nbits = 1; @@ -347,13 +297,71 @@ xfs_buf_item_format( nbits++; } } - bip->bli_format.blf_size = nvecs; +} + +/* + * This is called to fill in the vector of log iovecs for the + * given log buf item. It fills the first entry with a buf log + * format structure, and the rest point to contiguous chunks + * within the buffer. + */ +STATIC void +xfs_buf_item_format( + struct xfs_log_item *lip, + struct xfs_log_vec *lv) +{ + struct xfs_buf_log_item *bip = BUF_ITEM(lip); + struct xfs_buf *bp = bip->bli_buf; + struct xfs_log_iovec *vecp = NULL; + uint offset = 0; + int i; + + ASSERT(atomic_read(&bip->bli_refcount) > 0); + ASSERT((bip->bli_flags & XFS_BLI_LOGGED) || + (bip->bli_flags & XFS_BLI_STALE)); + + /* + * If it is an inode buffer, transfer the in-memory state to the + * format flags and clear the in-memory state. + * + * For buffer based inode allocation, we do not transfer + * this state if the inode buffer allocation has not yet been committed + * to the log as setting the XFS_BLI_INODE_BUF flag will prevent + * correct replay of the inode allocation. + * + * For icreate item based inode allocation, the buffers aren't written + * to the journal during allocation, and hence we should always tag the + * buffer as an inode buffer so that the correct unlinked list replay + * occurs during recovery. + */ + if (bip->bli_flags & XFS_BLI_INODE_BUF) { + if (xfs_sb_version_hascrc(&lip->li_mountp->m_sb) || + !((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) && + xfs_log_item_in_current_chkpt(lip))) + bip->__bli_format.blf_flags |= XFS_BLF_INODE_BUF; + bip->bli_flags &= ~XFS_BLI_INODE_BUF; + } + + if ((bip->bli_flags & (XFS_BLI_ORDERED|XFS_BLI_STALE)) == + XFS_BLI_ORDERED) { + /* + * The buffer has been logged just to order it. It is not being + * included in the transaction commit, so don't format it. + */ + trace_xfs_buf_item_format_ordered(bip); + return; + } + + for (i = 0; i < bip->bli_format_count; i++) { + xfs_buf_item_format_segment(bip, lv, &vecp, offset, + &bip->bli_formats[i]); + offset += bp->b_maps[i].bm_len; + } /* * Check to make sure everything is consistent. */ trace_xfs_buf_item_format(bip); - xfs_buf_item_log_check(bip); } /* @@ -371,9 +379,9 @@ xfs_buf_item_pin( { struct xfs_buf_log_item *bip = BUF_ITEM(lip); - ASSERT(XFS_BUF_ISBUSY(bip->bli_buf)); ASSERT(atomic_read(&bip->bli_refcount) > 0); ASSERT((bip->bli_flags & XFS_BLI_LOGGED) || + (bip->bli_flags & XFS_BLI_ORDERED) || (bip->bli_flags & XFS_BLI_STALE)); trace_xfs_buf_item_pin(bip); @@ -406,7 +414,7 @@ xfs_buf_item_unpin( int stale = bip->bli_flags & XFS_BLI_STALE; int freed; - ASSERT(XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *) == bip); + ASSERT(bp->b_fspriv == bip); ASSERT(atomic_read(&bip->bli_refcount) > 0); trace_xfs_buf_item_unpin(bip); @@ -418,10 +426,9 @@ xfs_buf_item_unpin( if (freed && stale) { ASSERT(bip->bli_flags & XFS_BLI_STALE); - ASSERT(XFS_BUF_VALUSEMA(bp) <= 0); - ASSERT(!(XFS_BUF_ISDELAYWRITE(bp))); + ASSERT(xfs_buf_islocked(bp)); ASSERT(XFS_BUF_ISSTALE(bp)); - ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL); + ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL); trace_xfs_buf_item_unpin_stale(bip); @@ -441,7 +448,7 @@ xfs_buf_item_unpin( * Since the transaction no longer refers to the buffer, * the buffer should no longer refer to the transaction. */ - XFS_BUF_SET_FSPRIVATE2(bp, NULL); + bp->b_transp = NULL; } /* @@ -452,46 +459,90 @@ xfs_buf_item_unpin( */ if (bip->bli_flags & XFS_BLI_STALE_INODE) { xfs_buf_do_callbacks(bp); - XFS_BUF_SET_FSPRIVATE(bp, NULL); - XFS_BUF_CLR_IODONE_FUNC(bp); + bp->b_fspriv = NULL; + bp->b_iodone = NULL; } else { spin_lock(&ailp->xa_lock); - xfs_trans_ail_delete(ailp, (xfs_log_item_t *)bip); + xfs_trans_ail_delete(ailp, lip, SHUTDOWN_LOG_IO_ERROR); xfs_buf_item_relse(bp); - ASSERT(XFS_BUF_FSPRIVATE(bp, void *) == NULL); + ASSERT(bp->b_fspriv == NULL); } xfs_buf_relse(bp); + } else if (freed && remove) { + /* + * There are currently two references to the buffer - the active + * LRU reference and the buf log item. What we are about to do + * here - simulate a failed IO completion - requires 3 + * references. + * + * The LRU reference is removed by the xfs_buf_stale() call. The + * buf item reference is removed by the xfs_buf_iodone() + * callback that is run by xfs_buf_do_callbacks() during ioend + * processing (via the bp->b_iodone callback), and then finally + * the ioend processing will drop the IO reference if the buffer + * is marked XBF_ASYNC. + * + * Hence we need to take an additional reference here so that IO + * completion processing doesn't free the buffer prematurely. + */ + xfs_buf_lock(bp); + xfs_buf_hold(bp); + bp->b_flags |= XBF_ASYNC; + xfs_buf_ioerror(bp, EIO); + XFS_BUF_UNDONE(bp); + xfs_buf_stale(bp); + xfs_buf_ioend(bp, 0); } } /* - * This is called to attempt to lock the buffer associated with this - * buf log item. Don't sleep on the buffer lock. If we can't get - * the lock right away, return 0. If we can get the lock, take a - * reference to the buffer. If this is a delayed write buffer that - * needs AIL help to be written back, invoke the pushbuf routine - * rather than the normal success path. + * Buffer IO error rate limiting. Limit it to no more than 10 messages per 30 + * seconds so as to not spam logs too much on repeated detection of the same + * buffer being bad.. */ + +DEFINE_RATELIMIT_STATE(xfs_buf_write_fail_rl_state, 30 * HZ, 10); + STATIC uint -xfs_buf_item_trylock( - struct xfs_log_item *lip) +xfs_buf_item_push( + struct xfs_log_item *lip, + struct list_head *buffer_list) { struct xfs_buf_log_item *bip = BUF_ITEM(lip); struct xfs_buf *bp = bip->bli_buf; + uint rval = XFS_ITEM_SUCCESS; - if (XFS_BUF_ISPINNED(bp)) + if (xfs_buf_ispinned(bp)) return XFS_ITEM_PINNED; - if (!XFS_BUF_CPSEMA(bp)) + if (!xfs_buf_trylock(bp)) { + /* + * If we have just raced with a buffer being pinned and it has + * been marked stale, we could end up stalling until someone else + * issues a log force to unpin the stale buffer. Check for the + * race condition here so xfsaild recognizes the buffer is pinned + * and queues a log force to move it along. + */ + if (xfs_buf_ispinned(bp)) + return XFS_ITEM_PINNED; return XFS_ITEM_LOCKED; - - /* take a reference to the buffer. */ - XFS_BUF_HOLD(bp); + } ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); - trace_xfs_buf_item_trylock(bip); - if (XFS_BUF_ISDELAYWRITE(bp)) - return XFS_ITEM_PUSHBUF; - return XFS_ITEM_SUCCESS; + + trace_xfs_buf_item_push(bip); + + /* has a previous flush failed due to IO errors? */ + if ((bp->b_flags & XBF_WRITE_FAIL) && + ___ratelimit(&xfs_buf_write_fail_rl_state, "XFS:")) { + xfs_warn(bp->b_target->bt_mount, +"Detected failing async write on buffer block 0x%llx. Retrying async write.\n", + (long long)bp->b_bn); + } + + if (!xfs_buf_delwri_queue(bp, buffer_list)) + rval = XFS_ITEM_FLUSHING; + xfs_buf_unlock(bp); + return rval; } /* @@ -519,11 +570,12 @@ xfs_buf_item_unlock( { struct xfs_buf_log_item *bip = BUF_ITEM(lip); struct xfs_buf *bp = bip->bli_buf; - int aborted; - uint hold; + bool clean; + bool aborted; + int flags; /* Clear the buffer's association with this transaction. */ - XFS_BUF_SET_FSPRIVATE2(bp, NULL); + bp->b_transp = NULL; /* * If this is a transaction abort, don't return early. Instead, allow @@ -531,25 +583,23 @@ xfs_buf_item_unlock( * (cancelled) buffers at unpin time, but we'll never go through the * pin/unpin cycle if we abort inside commit. */ - aborted = (lip->li_flags & XFS_LI_ABORTED) != 0; - + aborted = (lip->li_flags & XFS_LI_ABORTED) ? true : false; /* - * Before possibly freeing the buf item, determine if we should - * release the buffer at the end of this routine. + * Before possibly freeing the buf item, copy the per-transaction state + * so we can reference it safely later after clearing it from the + * buffer log item. */ - hold = bip->bli_flags & XFS_BLI_HOLD; - - /* Clear the per transaction state. */ - bip->bli_flags &= ~(XFS_BLI_LOGGED | XFS_BLI_HOLD); + flags = bip->bli_flags; + bip->bli_flags &= ~(XFS_BLI_LOGGED | XFS_BLI_HOLD | XFS_BLI_ORDERED); /* * If the buf item is marked stale, then don't do anything. We'll * unlock the buffer and free the buf item when the buffer is unpinned * for the last time. */ - if (bip->bli_flags & XFS_BLI_STALE) { + if (flags & XFS_BLI_STALE) { trace_xfs_buf_item_unlock_stale(bip); - ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL); + ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL); if (!aborted) { atomic_dec(&bip->bli_refcount); return; @@ -560,15 +610,49 @@ xfs_buf_item_unlock( /* * If the buf item isn't tracking any data, free it, otherwise drop the - * reference we hold to it. + * reference we hold to it. If we are aborting the transaction, this may + * be the only reference to the buf item, so we free it anyway + * regardless of whether it is dirty or not. A dirty abort implies a + * shutdown, anyway. + * + * Ordered buffers are dirty but may have no recorded changes, so ensure + * we only release clean items here. + */ + clean = (flags & XFS_BLI_DIRTY) ? false : true; + if (clean) { + int i; + for (i = 0; i < bip->bli_format_count; i++) { + if (!xfs_bitmap_empty(bip->bli_formats[i].blf_data_map, + bip->bli_formats[i].blf_map_size)) { + clean = false; + break; + } + } + } + + /* + * Clean buffers, by definition, cannot be in the AIL. However, aborted + * buffers may be dirty and hence in the AIL. Therefore if we are + * aborting a buffer and we've just taken the last refernce away, we + * have to check if it is in the AIL before freeing it. We need to free + * it in this case, because an aborted transaction has already shut the + * filesystem down and this is the last chance we will have to do so. */ - if (xfs_bitmap_empty(bip->bli_format.blf_data_map, - bip->bli_format.blf_map_size)) - xfs_buf_item_relse(bp); - else - atomic_dec(&bip->bli_refcount); + if (atomic_dec_and_test(&bip->bli_refcount)) { + if (clean) + xfs_buf_item_relse(bp); + else if (aborted) { + ASSERT(XFS_FORCED_SHUTDOWN(lip->li_mountp)); + if (lip->li_flags & XFS_LI_IN_AIL) { + spin_lock(&lip->li_ailp->xa_lock); + xfs_trans_ail_delete(lip->li_ailp, lip, + SHUTDOWN_LOG_IO_ERROR); + } + xfs_buf_item_relse(bp); + } + } - if (!hold) + if (!(flags & XFS_BLI_HOLD)) xfs_buf_relse(bp); } @@ -604,48 +688,6 @@ xfs_buf_item_committed( return lsn; } -/* - * The buffer is locked, but is not a delayed write buffer. This happens - * if we race with IO completion and hence we don't want to try to write it - * again. Just release the buffer. - */ -STATIC void -xfs_buf_item_push( - struct xfs_log_item *lip) -{ - struct xfs_buf_log_item *bip = BUF_ITEM(lip); - struct xfs_buf *bp = bip->bli_buf; - - ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); - ASSERT(!XFS_BUF_ISDELAYWRITE(bp)); - - trace_xfs_buf_item_push(bip); - - xfs_buf_relse(bp); -} - -/* - * The buffer is locked and is a delayed write buffer. Promote the buffer - * in the delayed write queue as the caller knows that they must invoke - * the xfsbufd to get this buffer written. We have to unlock the buffer - * to allow the xfsbufd to write it, too. - */ -STATIC void -xfs_buf_item_pushbuf( - struct xfs_log_item *lip) -{ - struct xfs_buf_log_item *bip = BUF_ITEM(lip); - struct xfs_buf *bp = bip->bli_buf; - - ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); - ASSERT(XFS_BUF_ISDELAYWRITE(bp)); - - trace_xfs_buf_item_pushbuf(bip); - - xfs_buf_delwri_promote(bp); - xfs_buf_relse(bp); -} - STATIC void xfs_buf_item_committing( struct xfs_log_item *lip, @@ -656,19 +698,46 @@ xfs_buf_item_committing( /* * This is the ops vector shared by all buf log items. */ -static struct xfs_item_ops xfs_buf_item_ops = { +static const struct xfs_item_ops xfs_buf_item_ops = { .iop_size = xfs_buf_item_size, .iop_format = xfs_buf_item_format, .iop_pin = xfs_buf_item_pin, .iop_unpin = xfs_buf_item_unpin, - .iop_trylock = xfs_buf_item_trylock, .iop_unlock = xfs_buf_item_unlock, .iop_committed = xfs_buf_item_committed, .iop_push = xfs_buf_item_push, - .iop_pushbuf = xfs_buf_item_pushbuf, .iop_committing = xfs_buf_item_committing }; +STATIC int +xfs_buf_item_get_format( + struct xfs_buf_log_item *bip, + int count) +{ + ASSERT(bip->bli_formats == NULL); + bip->bli_format_count = count; + + if (count == 1) { + bip->bli_formats = &bip->__bli_format; + return 0; + } + + bip->bli_formats = kmem_zalloc(count * sizeof(struct xfs_buf_log_format), + KM_SLEEP); + if (!bip->bli_formats) + return ENOMEM; + return 0; +} + +STATIC void +xfs_buf_item_free_format( + struct xfs_buf_log_item *bip) +{ + if (bip->bli_formats != &bip->__bli_format) { + kmem_free(bip->bli_formats); + bip->bli_formats = NULL; + } +} /* * Allocate a new buf log item to go with the given buffer. @@ -682,10 +751,12 @@ xfs_buf_item_init( xfs_buf_t *bp, xfs_mount_t *mp) { - xfs_log_item_t *lip; + xfs_log_item_t *lip = bp->b_fspriv; xfs_buf_log_item_t *bip; int chunks; int map_size; + int error; + int i; /* * Check to see if there is already a buf log item for @@ -694,55 +765,44 @@ xfs_buf_item_init( * nothing to do here so return. */ ASSERT(bp->b_target->bt_mount == mp); - if (XFS_BUF_FSPRIVATE(bp, void *) != NULL) { - lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *); - if (lip->li_type == XFS_LI_BUF) { - return; - } - } - - /* - * chunks is the number of XFS_BLF_CHUNK size pieces - * the buffer can be divided into. Make sure not to - * truncate any pieces. map_size is the size of the - * bitmap needed to describe the chunks of the buffer. - */ - chunks = (int)((XFS_BUF_COUNT(bp) + (XFS_BLF_CHUNK - 1)) >> XFS_BLF_SHIFT); - map_size = (int)((chunks + NBWORD) >> BIT_TO_WORD_SHIFT); + if (lip != NULL && lip->li_type == XFS_LI_BUF) + return; - bip = (xfs_buf_log_item_t*)kmem_zone_zalloc(xfs_buf_item_zone, - KM_SLEEP); + bip = kmem_zone_zalloc(xfs_buf_item_zone, KM_SLEEP); xfs_log_item_init(mp, &bip->bli_item, XFS_LI_BUF, &xfs_buf_item_ops); bip->bli_buf = bp; xfs_buf_hold(bp); - bip->bli_format.blf_type = XFS_LI_BUF; - bip->bli_format.blf_blkno = (__int64_t)XFS_BUF_ADDR(bp); - bip->bli_format.blf_len = (ushort)BTOBB(XFS_BUF_COUNT(bp)); - bip->bli_format.blf_map_size = map_size; -#ifdef XFS_TRANS_DEBUG /* - * Allocate the arrays for tracking what needs to be logged - * and what our callers request to be logged. bli_orig - * holds a copy of the original, clean buffer for comparison - * against, and bli_logged keeps a 1 bit flag per byte in - * the buffer to indicate which bytes the callers have asked - * to have logged. + * chunks is the number of XFS_BLF_CHUNK size pieces the buffer + * can be divided into. Make sure not to truncate any pieces. + * map_size is the size of the bitmap needed to describe the + * chunks of the buffer. + * + * Discontiguous buffer support follows the layout of the underlying + * buffer. This makes the implementation as simple as possible. */ - bip->bli_orig = (char *)kmem_alloc(XFS_BUF_COUNT(bp), KM_SLEEP); - memcpy(bip->bli_orig, XFS_BUF_PTR(bp), XFS_BUF_COUNT(bp)); - bip->bli_logged = (char *)kmem_zalloc(XFS_BUF_COUNT(bp) / NBBY, KM_SLEEP); -#endif + error = xfs_buf_item_get_format(bip, bp->b_map_count); + ASSERT(error == 0); + + for (i = 0; i < bip->bli_format_count; i++) { + chunks = DIV_ROUND_UP(BBTOB(bp->b_maps[i].bm_len), + XFS_BLF_CHUNK); + map_size = DIV_ROUND_UP(chunks, NBWORD); + + bip->bli_formats[i].blf_type = XFS_LI_BUF; + bip->bli_formats[i].blf_blkno = bp->b_maps[i].bm_bn; + bip->bli_formats[i].blf_len = bp->b_maps[i].bm_len; + bip->bli_formats[i].blf_map_size = map_size; + } /* * Put the buf item into the list of items attached to the * buffer at the front. */ - if (XFS_BUF_FSPRIVATE(bp, void *) != NULL) { - bip->bli_item.li_bio_list = - XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *); - } - XFS_BUF_SET_FSPRIVATE(bp, bip); + if (bp->b_fspriv) + bip->bli_item.li_bio_list = bp->b_fspriv; + bp->b_fspriv = bip; } @@ -750,11 +810,11 @@ xfs_buf_item_init( * Mark bytes first through last inclusive as dirty in the buf * item's bitmap. */ -void -xfs_buf_item_log( - xfs_buf_log_item_t *bip, +static void +xfs_buf_item_log_segment( uint first, - uint last) + uint last, + uint *map) { uint first_bit; uint last_bit; @@ -767,12 +827,6 @@ xfs_buf_item_log( uint mask; /* - * Mark the item as having some dirty data for - * quick reference in xfs_buf_item_dirty. - */ - bip->bli_flags |= XFS_BLI_DIRTY; - - /* * Convert byte offsets to bit numbers. */ first_bit = first >> XFS_BLF_SHIFT; @@ -788,7 +842,7 @@ xfs_buf_item_log( * to set a bit in. */ word_num = first_bit >> BIT_TO_WORD_SHIFT; - wordp = &(bip->bli_format.blf_data_map[word_num]); + wordp = &map[word_num]; /* * Calculate the starting bit in the first word. @@ -831,13 +885,50 @@ xfs_buf_item_log( mask = (1 << end_bit) - 1; *wordp |= mask; } +} + +/* + * Mark bytes first through last inclusive as dirty in the buf + * item's bitmap. + */ +void +xfs_buf_item_log( + xfs_buf_log_item_t *bip, + uint first, + uint last) +{ + int i; + uint start; + uint end; + struct xfs_buf *bp = bip->bli_buf; + + /* + * walk each buffer segment and mark them dirty appropriately. + */ + start = 0; + for (i = 0; i < bip->bli_format_count; i++) { + if (start > last) + break; + end = start + BBTOB(bp->b_maps[i].bm_len); + if (first > end) { + start += BBTOB(bp->b_maps[i].bm_len); + continue; + } + if (first < start) + first = start; + if (end > last) + end = last; - xfs_buf_item_log_debug(bip, first, last); + xfs_buf_item_log_segment(first, end, + &bip->bli_formats[i].blf_data_map[0]); + + start += bp->b_maps[i].bm_len; + } } /* - * Return 1 if the buffer has some data that has been logged (at any + * Return 1 if the buffer has been logged or ordered in a transaction (at any * point, not just the current transaction) and 0 if not. */ uint @@ -851,11 +942,7 @@ STATIC void xfs_buf_item_free( xfs_buf_log_item_t *bip) { -#ifdef XFS_TRANS_DEBUG - kmem_free(bip->bli_orig); - kmem_free(bip->bli_logged); -#endif /* XFS_TRANS_DEBUG */ - + xfs_buf_item_free_format(bip); kmem_zone_free(xfs_buf_item_zone, bip); } @@ -870,16 +957,15 @@ void xfs_buf_item_relse( xfs_buf_t *bp) { - xfs_buf_log_item_t *bip; + xfs_buf_log_item_t *bip = bp->b_fspriv; trace_xfs_buf_item_relse(bp, _RET_IP_); + ASSERT(!(bip->bli_item.li_flags & XFS_LI_IN_AIL)); + + bp->b_fspriv = bip->bli_item.li_bio_list; + if (bp->b_fspriv == NULL) + bp->b_iodone = NULL; - bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t*); - XFS_BUF_SET_FSPRIVATE(bp, bip->bli_item.li_bio_list); - if ((XFS_BUF_FSPRIVATE(bp, void *) == NULL) && - (XFS_BUF_IODONE_FUNC(bp) != NULL)) { - XFS_BUF_CLR_IODONE_FUNC(bp); - } xfs_buf_rele(bp); xfs_buf_item_free(bip); } @@ -902,21 +988,20 @@ xfs_buf_attach_iodone( { xfs_log_item_t *head_lip; - ASSERT(XFS_BUF_ISBUSY(bp)); - ASSERT(XFS_BUF_VALUSEMA(bp) <= 0); + ASSERT(xfs_buf_islocked(bp)); lip->li_cb = cb; - if (XFS_BUF_FSPRIVATE(bp, void *) != NULL) { - head_lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *); + head_lip = bp->b_fspriv; + if (head_lip) { lip->li_bio_list = head_lip->li_bio_list; head_lip->li_bio_list = lip; } else { - XFS_BUF_SET_FSPRIVATE(bp, lip); + bp->b_fspriv = lip; } - ASSERT((XFS_BUF_IODONE_FUNC(bp) == xfs_buf_iodone_callbacks) || - (XFS_BUF_IODONE_FUNC(bp) == NULL)); - XFS_BUF_SET_IODONE_FUNC(bp, xfs_buf_iodone_callbacks); + ASSERT(bp->b_iodone == NULL || + bp->b_iodone == xfs_buf_iodone_callbacks); + bp->b_iodone = xfs_buf_iodone_callbacks; } /* @@ -937,8 +1022,8 @@ xfs_buf_do_callbacks( { struct xfs_log_item *lip; - while ((lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *)) != NULL) { - XFS_BUF_SET_FSPRIVATE(bp, lip->li_bio_list); + while ((lip = bp->b_fspriv) != NULL) { + bp->b_fspriv = lip->li_bio_list; ASSERT(lip->li_cb != NULL); /* * Clear the next pointer so we don't have any @@ -967,7 +1052,7 @@ xfs_buf_iodone_callbacks( static ulong lasttime; static xfs_buftarg_t *lasttarg; - if (likely(!XFS_BUF_GETERROR(bp))) + if (likely(!bp->b_error)) goto do_callbacks; /* @@ -975,40 +1060,45 @@ xfs_buf_iodone_callbacks( * I/O errors, there's no point in giving this a retry. */ if (XFS_FORCED_SHUTDOWN(mp)) { - XFS_BUF_SUPER_STALE(bp); + xfs_buf_stale(bp); + XFS_BUF_DONE(bp); trace_xfs_buf_item_iodone(bp, _RET_IP_); goto do_callbacks; } - if (XFS_BUF_TARGET(bp) != lasttarg || + if (bp->b_target != lasttarg || time_after(jiffies, (lasttime + 5*HZ))) { lasttime = jiffies; - cmn_err(CE_ALERT, "Device %s, XFS metadata write error" - " block 0x%llx in %s", - XFS_BUFTARG_NAME(XFS_BUF_TARGET(bp)), - (__uint64_t)XFS_BUF_ADDR(bp), mp->m_fsname); + xfs_buf_ioerror_alert(bp, __func__); } - lasttarg = XFS_BUF_TARGET(bp); + lasttarg = bp->b_target; /* - * If the write was asynchronous then noone will be looking for the + * If the write was asynchronous then no one will be looking for the * error. Clear the error state and write the buffer out again. * - * During sync or umount we'll write all pending buffers again - * synchronous, which will catch these errors if they keep hanging - * around. + * XXX: This helps against transient write errors, but we need to find + * a way to shut the filesystem down if the writes keep failing. + * + * In practice we'll shut the filesystem down soon as non-transient + * erorrs tend to affect the whole device and a failing log write + * will make us give up. But we really ought to do better here. */ if (XFS_BUF_ISASYNC(bp)) { - XFS_BUF_ERROR(bp, 0); /* errno of 0 unsets the flag */ + ASSERT(bp->b_iodone != NULL); - if (!XFS_BUF_ISSTALE(bp)) { - XFS_BUF_DELAYWRITE(bp); - XFS_BUF_DONE(bp); - XFS_BUF_SET_START(bp); - } - ASSERT(XFS_BUF_IODONE_FUNC(bp)); trace_xfs_buf_item_iodone_async(bp, _RET_IP_); - xfs_buf_relse(bp); + + xfs_buf_ioerror(bp, 0); /* errno of 0 unsets the flag */ + + if (!(bp->b_flags & (XBF_STALE|XBF_WRITE_FAIL))) { + bp->b_flags |= XBF_WRITE | XBF_ASYNC | + XBF_DONE | XBF_WRITE_FAIL; + xfs_buf_iorequest(bp); + } else { + xfs_buf_relse(bp); + } + return; } @@ -1016,17 +1106,15 @@ xfs_buf_iodone_callbacks( * If the write of the buffer was synchronous, we want to make * sure to return the error to the caller of xfs_bwrite(). */ - XFS_BUF_STALE(bp); + xfs_buf_stale(bp); XFS_BUF_DONE(bp); - XFS_BUF_UNDELAYWRITE(bp); trace_xfs_buf_error_relse(bp, _RET_IP_); - xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); do_callbacks: xfs_buf_do_callbacks(bp); - XFS_BUF_SET_FSPRIVATE(bp, NULL); - XFS_BUF_CLR_IODONE_FUNC(bp); + bp->b_fspriv = NULL; + bp->b_iodone = NULL; xfs_buf_ioend(bp, 0); } @@ -1058,6 +1146,6 @@ xfs_buf_iodone( * Either way, AIL is useless if we're forcing a shutdown. */ spin_lock(&ailp->xa_lock); - xfs_trans_ail_delete(ailp, lip); + xfs_trans_ail_delete(ailp, lip, SHUTDOWN_CORRUPT_INCORE); xfs_buf_item_free(BUF_ITEM(lip)); } |
