Diffstat (limited to 'fs/xfs/xfs_iomap.c')
-rw-r--r--   fs/xfs/xfs_iomap.c   412
1 file changed, 296 insertions(+), 116 deletions(-)
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 246c7d57c6f..6d3ec2b6ee2 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -17,32 +17,28 @@
  */
 #include "xfs.h"
 #include "xfs_fs.h"
-#include "xfs_bit.h"
-#include "xfs_log.h"
-#include "xfs_inum.h"
-#include "xfs_trans.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_alloc.h"
-#include "xfs_quota.h"
 #include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_btree.h"
+#include "xfs_bmap_btree.h"
 #include "xfs_bmap.h"
-#include "xfs_rtalloc.h"
+#include "xfs_bmap_util.h"
 #include "xfs_error.h"
-#include "xfs_itable.h"
-#include "xfs_rw.h"
-#include "xfs_attr.h"
-#include "xfs_buf_item.h"
+#include "xfs_trans.h"
 #include "xfs_trans_space.h"
-#include "xfs_utils.h"
 #include "xfs_iomap.h"
 #include "xfs_trace.h"
+#include "xfs_icache.h"
+#include "xfs_quota.h"
+#include "xfs_dquot_item.h"
+#include "xfs_dquot.h"
+#include "xfs_dinode.h"
 
 
 #define XFS_WRITEIO_ALIGN(mp,off)       (((off) >> mp->m_writeio_log) \
@@ -108,7 +104,7 @@ xfs_alert_fsblock_zero(
         xfs_alert_tag(ip->i_mount, XFS_PTAG_FSBLOCK_ZERO,
                         "Access to block zero in inode %llu "
                         "start_block: %llx start_off: %llx "
-                        "blkcnt: %llx extent-state: %x\n",
+                        "blkcnt: %llx extent-state: %x",
                 (unsigned long long)ip->i_ino,
                 (unsigned long long)imap->br_startblock,
                 (unsigned long long)imap->br_startoff,
@@ -132,7 +128,6 @@ xfs_iomap_write_direct(
         xfs_fsblock_t   firstfsb;
         xfs_extlen_t    extsz, temp;
         int             nimaps;
-        int             bmapi_flag;
         int             quota_flag;
         int             rt;
         xfs_trans_t     *tp;
@@ -141,11 +136,7 @@ xfs_iomap_write_direct(
         int             committed;
         int             error;
 
-        /*
-         * Make sure that the dquots are there. This doesn't hold
-         * the ilock across a disk read.
-         */
-        error = xfs_qm_dqattach_locked(ip, 0);
+        error = xfs_qm_dqattach(ip, 0);
         if (error)
                 return XFS_ERROR(error);
 
@@ -157,7 +148,7 @@ xfs_iomap_write_direct(
         if ((offset + count) > XFS_ISIZE(ip)) {
                 error = xfs_iomap_eof_align_last_fsb(mp, ip, extsz, &last_fsb);
                 if (error)
-                        goto error_out;
+                        return XFS_ERROR(error);
         } else {
                 if (nmaps && (imap->br_startblock == HOLESTARTBLOCK))
                         last_fsb = MIN(last_fsb, (xfs_fileoff_t)
@@ -189,76 +180,68 @@ xfs_iomap_write_direct(
         /*
          * Allocate and setup the transaction
          */
-        xfs_iunlock(ip, XFS_ILOCK_EXCL);
         tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
-        error = xfs_trans_reserve(tp, resblks,
-                        XFS_WRITE_LOG_RES(mp), resrtextents,
-                        XFS_TRANS_PERM_LOG_RES,
-                        XFS_WRITE_LOG_COUNT);
+        error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write,
+                                  resblks, resrtextents);
         /*
          * Check for running out of space, note: need lock to return
          */
-        if (error)
+        if (error) {
                 xfs_trans_cancel(tp, 0);
+                return XFS_ERROR(error);
+        }
+
         xfs_ilock(ip, XFS_ILOCK_EXCL);
-        if (error)
-                goto error_out;
 
         error = xfs_trans_reserve_quota_nblks(tp, ip, qblocks, 0, quota_flag);
         if (error)
-                goto error1;
+                goto out_trans_cancel;
 
         xfs_trans_ijoin(tp, ip, 0);
 
-        bmapi_flag = 0;
-        if (offset < XFS_ISIZE(ip) || extsz)
-                bmapi_flag |= XFS_BMAPI_PREALLOC;
-
         /*
          * From this point onwards we overwrite the imap pointer that the
          * caller gave to us.
          */
         xfs_bmap_init(&free_list, &firstfsb);
         nimaps = 1;
-        error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb, bmapi_flag,
-                                &firstfsb, 0, imap, &nimaps, &free_list);
+        error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb,
+                                XFS_BMAPI_PREALLOC, &firstfsb, 0,
+                                imap, &nimaps, &free_list);
         if (error)
-                goto error0;
+                goto out_bmap_cancel;
 
         /*
          * Complete the transaction
          */
         error = xfs_bmap_finish(&tp, &free_list, &committed);
         if (error)
-                goto error0;
+                goto out_bmap_cancel;
         error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
         if (error)
-                goto error_out;
+                goto out_unlock;
 
         /*
          * Copy any maps to caller's array and return any error.
          */
         if (nimaps == 0) {
-                error = ENOSPC;
-                goto error_out;
+                error = XFS_ERROR(ENOSPC);
+                goto out_unlock;
         }
 
-        if (!(imap->br_startblock || XFS_IS_REALTIME_INODE(ip))) {
+        if (!(imap->br_startblock || XFS_IS_REALTIME_INODE(ip)))
                 error = xfs_alert_fsblock_zero(ip, imap);
-                goto error_out;
-        }
 
-        return 0;
+out_unlock:
+        xfs_iunlock(ip, XFS_ILOCK_EXCL);
+        return error;
 
-error0: /* Cancel bmap, unlock inode, unreserve quota blocks, cancel trans */
+out_bmap_cancel:
         xfs_bmap_cancel(&free_list);
-        xfs_trans_unreserve_quota_nblks(tp, ip, qblocks, 0, quota_flag);
-
-error1: /* Just cancel transaction */
+        xfs_trans_unreserve_quota_nblks(tp, ip, (long)qblocks, 0, quota_flag);
+out_trans_cancel:
         xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
-
-error_out:
-        return XFS_ERROR(error);
+        goto out_unlock;
 }
 
 /*
@@ -290,11 +273,20 @@ xfs_iomap_eof_want_preallocate(
                 return 0;
 
         /*
+         * If the file is smaller than the minimum prealloc and we are using
+         * dynamic preallocation, don't do any preallocation at all as it is
+         * likely this is the only write to the file that is going to be done.
+         */
+        if (!(mp->m_flags & XFS_MOUNT_DFLT_IOSIZE) &&
+            XFS_ISIZE(ip) < XFS_FSB_TO_B(mp, mp->m_writeio_blocks))
+                return 0;
+
+        /*
          * If there are any real blocks past eof, then don't
          * do any speculative allocation.
          */
         start_fsb = XFS_B_TO_FSBT(mp, ((xfs_ufsize_t)(offset + count - 1)));
-        count_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_MAXIOFFSET(mp));
+        count_fsb = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes);
         while (count_fsb > 0) {
                 imaps = nimaps;
                 firstblock = NULLFSBLOCK;
@@ -319,6 +311,121 @@ xfs_iomap_eof_want_preallocate(
 }
 
 /*
+ * Determine the initial size of the preallocation. We are beyond the current
+ * EOF here, but we need to take into account whether this is a sparse write or
+ * an extending write when determining the preallocation size. Hence we need to
+ * look up the extent that ends at the current write offset and use the result
+ * to determine the preallocation size.
+ *
+ * If the extent is a hole, then preallocation is essentially disabled.
+ * Otherwise we take the size of the preceeding data extent as the basis for the
+ * preallocation size. If the size of the extent is greater than half the
+ * maximum extent length, then use the current offset as the basis. This ensures
+ * that for large files the preallocation size always extends to MAXEXTLEN
+ * rather than falling short due to things like stripe unit/width alignment of
+ * real extents.
+ */
+STATIC xfs_fsblock_t
+xfs_iomap_eof_prealloc_initial_size(
+        struct xfs_mount        *mp,
+        struct xfs_inode        *ip,
+        xfs_off_t               offset,
+        xfs_bmbt_irec_t         *imap,
+        int                     nimaps)
+{
+        xfs_fileoff_t   start_fsb;
+        int             imaps = 1;
+        int             error;
+
+        ASSERT(nimaps >= imaps);
+
+        /* if we are using a specific prealloc size, return now */
+        if (mp->m_flags & XFS_MOUNT_DFLT_IOSIZE)
+                return 0;
+
+        /* If the file is small, then use the minimum prealloc */
+        if (XFS_ISIZE(ip) < XFS_FSB_TO_B(mp, mp->m_dalign))
+                return 0;
+
+        /*
+         * As we write multiple pages, the offset will always align to the
+         * start of a page and hence point to a hole at EOF. i.e. if the size is
+         * 4096 bytes, we only have one block at FSB 0, but XFS_B_TO_FSB(4096)
+         * will return FSB 1. Hence if there are blocks in the file, we want to
+         * point to the block prior to the EOF block and not the hole that maps
+         * directly at @offset.
+         */
+        start_fsb = XFS_B_TO_FSB(mp, offset);
+        if (start_fsb)
+                start_fsb--;
+        error = xfs_bmapi_read(ip, start_fsb, 1, imap, &imaps, XFS_BMAPI_ENTIRE);
+        if (error)
+                return 0;
+
+        ASSERT(imaps == 1);
+        if (imap[0].br_startblock == HOLESTARTBLOCK)
+                return 0;
+        if (imap[0].br_blockcount <= (MAXEXTLEN >> 1))
+                return imap[0].br_blockcount << 1;
+        return XFS_B_TO_FSB(mp, offset);
+}
+
+STATIC bool
+xfs_quota_need_throttle(
+        struct xfs_inode *ip,
+        int type,
+        xfs_fsblock_t alloc_blocks)
+{
+        struct xfs_dquot *dq = xfs_inode_dquot(ip, type);
+
+        if (!dq || !xfs_this_quota_on(ip->i_mount, type))
+                return false;
+
+        /* no hi watermark, no throttle */
+        if (!dq->q_prealloc_hi_wmark)
+                return false;
+
+        /* under the lo watermark, no throttle */
+        if (dq->q_res_bcount + alloc_blocks < dq->q_prealloc_lo_wmark)
+                return false;
+
+        return true;
+}
+
+STATIC void
+xfs_quota_calc_throttle(
+        struct xfs_inode *ip,
+        int type,
+        xfs_fsblock_t *qblocks,
+        int *qshift)
+{
+        int64_t freesp;
+        int shift = 0;
+        struct xfs_dquot *dq = xfs_inode_dquot(ip, type);
+
+        /* over hi wmark, squash the prealloc completely */
+        if (dq->q_res_bcount >= dq->q_prealloc_hi_wmark) {
+                *qblocks = 0;
+                return;
+        }
+
+        freesp = dq->q_prealloc_hi_wmark - dq->q_res_bcount;
+        if (freesp < dq->q_low_space[XFS_QLOWSP_5_PCNT]) {
+                shift = 2;
+                if (freesp < dq->q_low_space[XFS_QLOWSP_3_PCNT])
+                        shift += 2;
+                if (freesp < dq->q_low_space[XFS_QLOWSP_1_PCNT])
+                        shift += 2;
+        }
+
+        /* only overwrite the throttle values if we are more aggressive */
+        if ((freesp >> shift) < (*qblocks >> *qshift)) {
+                *qblocks = freesp;
+                *qshift = shift;
+        }
+}
+
+/*
  * If we don't have a user specified preallocation size, dynamically increase
  * the preallocation size as the size of the file grows. Cap the maximum size
  * at a single extent or less if the filesystem is near full. The closer the
@@ -327,43 +434,95 @@ xfs_iomap_eof_want_preallocate(
 STATIC xfs_fsblock_t
 xfs_iomap_prealloc_size(
         struct xfs_mount        *mp,
-        struct xfs_inode        *ip)
+        struct xfs_inode        *ip,
+        xfs_off_t               offset,
+        struct xfs_bmbt_irec    *imap,
+        int                     nimaps)
 {
         xfs_fsblock_t           alloc_blocks = 0;
+        int                     shift = 0;
+        int64_t                 freesp;
+        xfs_fsblock_t           qblocks;
+        int                     qshift = 0;
 
-        if (!(mp->m_flags & XFS_MOUNT_DFLT_IOSIZE)) {
-                int shift = 0;
-                int64_t freesp;
+        alloc_blocks = xfs_iomap_eof_prealloc_initial_size(mp, ip, offset,
+                                                           imap, nimaps);
+        if (!alloc_blocks)
+                goto check_writeio;
+        qblocks = alloc_blocks;
 
-                /*
-                 * rounddown_pow_of_two() returns an undefined result
-                 * if we pass in alloc_blocks = 0. Hence the "+ 1" to
-                 * ensure we always pass in a non-zero value.
-                 */
-                alloc_blocks = XFS_B_TO_FSB(mp, XFS_ISIZE(ip)) + 1;
-                alloc_blocks = XFS_FILEOFF_MIN(MAXEXTLEN,
-                                        rounddown_pow_of_two(alloc_blocks));
-
-                xfs_icsb_sync_counters(mp, XFS_ICSB_LAZY_COUNT);
-                freesp = mp->m_sb.sb_fdblocks;
-                if (freesp < mp->m_low_space[XFS_LOWSP_5_PCNT]) {
-                        shift = 2;
-                        if (freesp < mp->m_low_space[XFS_LOWSP_4_PCNT])
-                                shift++;
-                        if (freesp < mp->m_low_space[XFS_LOWSP_3_PCNT])
-                                shift++;
-                        if (freesp < mp->m_low_space[XFS_LOWSP_2_PCNT])
-                                shift++;
-                        if (freesp < mp->m_low_space[XFS_LOWSP_1_PCNT])
-                                shift++;
-                }
-                if (shift)
-                        alloc_blocks >>= shift;
+        /*
+         * MAXEXTLEN is not a power of two value but we round the prealloc down
+         * to the nearest power of two value after throttling. To prevent the
+         * round down from unconditionally reducing the maximum supported prealloc
+         * size, we round up first, apply appropriate throttling, round down and
+         * cap the value to MAXEXTLEN.
+         */
+        alloc_blocks = XFS_FILEOFF_MIN(roundup_pow_of_two(MAXEXTLEN),
+                                       alloc_blocks);
+
+        xfs_icsb_sync_counters(mp, XFS_ICSB_LAZY_COUNT);
+        freesp = mp->m_sb.sb_fdblocks;
+        if (freesp < mp->m_low_space[XFS_LOWSP_5_PCNT]) {
+                shift = 2;
+                if (freesp < mp->m_low_space[XFS_LOWSP_4_PCNT])
+                        shift++;
+                if (freesp < mp->m_low_space[XFS_LOWSP_3_PCNT])
+                        shift++;
+                if (freesp < mp->m_low_space[XFS_LOWSP_2_PCNT])
+                        shift++;
+                if (freesp < mp->m_low_space[XFS_LOWSP_1_PCNT])
+                        shift++;
         }
 
+        /*
+         * Check each quota to cap the prealloc size and provide a shift
+         * value to throttle with.
+         */
+        if (xfs_quota_need_throttle(ip, XFS_DQ_USER, alloc_blocks))
+                xfs_quota_calc_throttle(ip, XFS_DQ_USER, &qblocks, &qshift);
+        if (xfs_quota_need_throttle(ip, XFS_DQ_GROUP, alloc_blocks))
+                xfs_quota_calc_throttle(ip, XFS_DQ_GROUP, &qblocks, &qshift);
+        if (xfs_quota_need_throttle(ip, XFS_DQ_PROJ, alloc_blocks))
+                xfs_quota_calc_throttle(ip, XFS_DQ_PROJ, &qblocks, &qshift);
+
+        /*
+         * The final prealloc size is set to the minimum of free space available
+         * in each of the quotas and the overall filesystem.
+         *
+         * The shift throttle value is set to the maximum value as determined by
+         * the global low free space values and per-quota low free space values.
+         */
+        alloc_blocks = MIN(alloc_blocks, qblocks);
+        shift = MAX(shift, qshift);
+
+        if (shift)
+                alloc_blocks >>= shift;
+        /*
+         * rounddown_pow_of_two() returns an undefined result if we pass in
+         * alloc_blocks = 0.
+         */
+        if (alloc_blocks)
+                alloc_blocks = rounddown_pow_of_two(alloc_blocks);
+        if (alloc_blocks > MAXEXTLEN)
+                alloc_blocks = MAXEXTLEN;
+
+        /*
+         * If we are still trying to allocate more space than is
+         * available, squash the prealloc hard. This can happen if we
+         * have a large file on a small filesystem and the above
+         * lowspace thresholds are smaller than MAXEXTLEN.
+         */
+        while (alloc_blocks && alloc_blocks >= freesp)
+                alloc_blocks >>= 4;
+
+check_writeio:
         if (alloc_blocks < mp->m_writeio_blocks)
                 alloc_blocks = mp->m_writeio_blocks;
 
+        trace_xfs_iomap_prealloc_size(ip, alloc_blocks, shift,
+                                      mp->m_writeio_blocks);
+
         return alloc_blocks;
 }
 
@@ -382,7 +541,7 @@ xfs_iomap_write_delay(
         xfs_extlen_t    extsz;
         int             nimaps;
         xfs_bmbt_irec_t imap[XFS_WRITE_IMAPS];
-        int             prealloc, flushed = 0;
+        int             prealloc;
         int             error;
 
         ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
@@ -398,7 +557,6 @@ xfs_iomap_write_delay(
         extsz = xfs_get_extsz_hint(ip);
         offset_fsb = XFS_B_TO_FSBT(mp, offset);
 
-
         error = xfs_iomap_eof_want_preallocate(mp, ip, offset, count, imap,
                                 XFS_WRITE_IMAPS, &prealloc);
         if (error)
@@ -406,7 +564,10 @@
 
 retry:
         if (prealloc) {
-                xfs_fsblock_t   alloc_blocks = xfs_iomap_prealloc_size(mp, ip);
+                xfs_fsblock_t   alloc_blocks;
+
+                alloc_blocks = xfs_iomap_prealloc_size(mp, ip, offset, imap,
+                                                       XFS_WRITE_IMAPS);
 
                 aligned_offset = XFS_WRITEIO_ALIGN(mp, (offset + count - 1));
                 ioalign = XFS_B_TO_FSBT(mp, aligned_offset);
@@ -421,6 +582,15 @@ retry:
                         return error;
         }
 
+        /*
+         * Make sure preallocation does not create extents beyond the range we
+         * actually support in this filesystem.
+         */
+        if (last_fsb > XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes))
+                last_fsb = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes);
+
+        ASSERT(last_fsb > offset_fsb);
+
         nimaps = XFS_WRITE_IMAPS;
         error = xfs_bmapi_delay(ip, offset_fsb, last_fsb - offset_fsb,
                                 imap, &nimaps, XFS_BMAPI_ENTIRE);
@@ -434,31 +604,29 @@ retry:
         }
 
         /*
-         * If bmapi returned us nothing, we got either ENOSPC or EDQUOT. For
-         * ENOSPC, * flush all other inodes with delalloc blocks to free up
-         * some of the excess reserved metadata space. For both cases, retry
+         * If bmapi returned us nothing, we got either ENOSPC or EDQUOT. Retry
          * without EOF preallocation.
          */
         if (nimaps == 0) {
                 trace_xfs_delalloc_enospc(ip, offset, count);
-                if (flushed)
-                        return XFS_ERROR(error ? error : ENOSPC);
-
-                if (error == ENOSPC) {
-                        xfs_iunlock(ip, XFS_ILOCK_EXCL);
-                        xfs_flush_inodes(ip);
-                        xfs_ilock(ip, XFS_ILOCK_EXCL);
+                if (prealloc) {
+                        prealloc = 0;
+                        error = 0;
+                        goto retry;
                 }
-
-                flushed = 1;
-                error = 0;
-                prealloc = 0;
-                goto retry;
+                return XFS_ERROR(error ? error : ENOSPC);
         }
 
         if (!(imap[0].br_startblock || XFS_IS_REALTIME_INODE(ip)))
                 return xfs_alert_fsblock_zero(ip, &imap[0]);
 
+        /*
+         * Tag the inode as speculatively preallocated so we can reclaim this
+         * space on demand, if necessary.
+         */
+        if (prealloc)
+                xfs_inode_set_eofblocks_tag(ip);
+
         *ret_imap = imap[0];
         return 0;
 }
@@ -477,7 +645,6 @@ int
 xfs_iomap_write_allocate(
         xfs_inode_t     *ip,
         xfs_off_t       offset,
-        size_t          count,
         xfs_bmbt_irec_t *imap)
 {
         xfs_mount_t     *mp = ip->i_mount;
@@ -519,10 +686,8 @@ xfs_iomap_write_allocate(
                 tp = xfs_trans_alloc(mp, XFS_TRANS_STRAT_WRITE);
                 tp->t_flags |= XFS_TRANS_RESERVE;
                 nres = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK);
-                error = xfs_trans_reserve(tp, nres,
-                                XFS_WRITE_LOG_RES(mp),
-                                0, XFS_TRANS_PERM_LOG_RES,
-                                XFS_WRITE_LOG_COUNT);
+                error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write,
+                                          nres, 0);
                 if (error) {
                         xfs_trans_cancel(tp, 0);
                         return XFS_ERROR(error);
@@ -565,7 +730,7 @@ xfs_iomap_write_allocate(
                          */
                         nimaps = 1;
                         end_fsb = XFS_B_TO_FSB(mp, XFS_ISIZE(ip));
-                        error = xfs_bmap_last_offset(NULL, ip, &last_block,
+                        error = xfs_bmap_last_offset(ip, &last_block,
                                                         XFS_DATA_FORK);
                         if (error)
                                 goto trans_cancel;
@@ -584,7 +749,8 @@ xfs_iomap_write_allocate(
                          * pointer that the caller gave to us.
                          */
                         error = xfs_bmapi_write(tp, ip, map_start_fsb,
-                                                count_fsb, 0, &first_block, 1,
+                                                count_fsb, 0,
+                                                &first_block, 1,
                                                 imap, &nimaps, &free_list);
                         if (error)
                                 goto trans_cancel;
@@ -645,6 +811,7 @@ xfs_iomap_write_unwritten(
         xfs_trans_t     *tp;
         xfs_bmbt_irec_t imap;
         xfs_bmap_free_t free_list;
+        xfs_fsize_t     i_size;
         uint            resblks;
         int             committed;
         int             error;
@@ -679,13 +846,11 @@ xfs_iomap_write_unwritten(
          * the same inode that we complete here and might deadlock
          * on the iolock.
          */
-        xfs_wait_for_freeze(mp, SB_FREEZE_TRANS);
+        sb_start_intwrite(mp->m_super);
         tp = _xfs_trans_alloc(mp, XFS_TRANS_STRAT_WRITE, KM_NOFS);
-        tp->t_flags |= XFS_TRANS_RESERVE;
-        error = xfs_trans_reserve(tp, resblks,
-                        XFS_WRITE_LOG_RES(mp), 0,
-                        XFS_TRANS_PERM_LOG_RES,
-                        XFS_WRITE_LOG_COUNT);
+        tp->t_flags |= XFS_TRANS_RESERVE | XFS_TRANS_FREEZE_PROT;
+        error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write,
+                                  resblks, 0);
         if (error) {
                 xfs_trans_cancel(tp, 0);
                 return XFS_ERROR(error);
@@ -705,7 +870,22 @@ xfs_iomap_write_unwritten(
                 if (error)
                         goto error_on_bmapi_transaction;
 
-                error = xfs_bmap_finish(&(tp), &(free_list), &committed);
+                /*
+                 * Log the updated inode size as we go. We have to be careful
+                 * to only log it up to the actual write offset if it is
+                 * halfway into a block.
+                 */
+                i_size = XFS_FSB_TO_B(mp, offset_fsb + count_fsb);
+                if (i_size > offset + count)
+                        i_size = offset + count;
+
+                i_size = xfs_new_eof(ip, i_size);
+                if (i_size) {
+                        ip->i_d.di_size = i_size;
+                        xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+                }
+
+                error = xfs_bmap_finish(&tp, &free_list, &committed);
                 if (error)
                         goto error_on_bmapi_transaction;
 
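A note on the throttling arithmetic this diff introduces in xfs_iomap_prealloc_size(): the candidate preallocation is first rounded up to the power of two above MAXEXTLEN, then halved by a shift that grows as global or per-quota free space falls through the low-space thresholds, then rounded back down to a power of two, capped at MAXEXTLEN, and finally squashed by factors of 16 while it still exceeds the free block count. The standalone sketch below models that pipeline in plain C so the behaviour can be explored outside the kernel. It is illustrative only: struct dquot_model, quota_calc_throttle() and throttle_prealloc() are hypothetical stand-ins for the kernel's xfs_dquot, xfs_quota_calc_throttle() and the tail of xfs_iomap_prealloc_size(), and MAXEXTLEN is hard-coded to the XFS on-disk extent length limit of 2^21 - 1 blocks.

/*
 * Userspace model of the speculative preallocation throttling above.
 * Illustrative sketch, not kernel code.
 */
#include <stdint.h>
#include <stdio.h>

#define MAXEXTLEN ((1ULL << 21) - 1)    /* XFS max extent length, in blocks */
#define MIN(a, b) ((a) < (b) ? (a) : (b))
#define MAX(a, b) ((a) > (b) ? (a) : (b))

static uint64_t roundup_pow_of_two(uint64_t v)
{
        uint64_t p = 1;
        while (p < v)
                p <<= 1;
        return p;
}

static uint64_t rounddown_pow_of_two(uint64_t v)
{
        while (v & (v - 1))     /* clear low set bits until a power of two */
                v &= v - 1;
        return v;
}

struct dquot_model {
        int64_t res_bcount;     /* blocks currently reserved against quota */
        int64_t hi_wmark;       /* prealloc squashed entirely above this */
        int64_t low_space[3];   /* 5%, 3%, 1% thresholds of the headroom */
};

/* mirrors xfs_quota_calc_throttle(): keep the most aggressive throttle */
static void quota_calc_throttle(const struct dquot_model *dq,
                                uint64_t *qblocks, int *qshift)
{
        int64_t freesp;
        int shift = 0;

        if (dq->res_bcount >= dq->hi_wmark) {
                *qblocks = 0;   /* over the hi watermark: no prealloc at all */
                return;
        }

        freesp = dq->hi_wmark - dq->res_bcount;
        if (freesp < dq->low_space[0]) {
                shift = 2;
                if (freesp < dq->low_space[1])
                        shift += 2;
                if (freesp < dq->low_space[2])
                        shift += 2;
        }

        /* only overwrite the caller's values if we are more aggressive */
        if ((uint64_t)(freesp >> shift) < (*qblocks >> *qshift)) {
                *qblocks = (uint64_t)freesp;
                *qshift = shift;
        }
}

/* mirrors the tail of xfs_iomap_prealloc_size() */
static uint64_t throttle_prealloc(uint64_t alloc_blocks, int64_t fdblocks,
                                  uint64_t qblocks, int shift, int qshift)
{
        /* round up first so throttling starts from a full-sized prealloc */
        alloc_blocks = MIN(roundup_pow_of_two(MAXEXTLEN), alloc_blocks);

        alloc_blocks = MIN(alloc_blocks, qblocks);      /* tightest quota */
        shift = MAX(shift, qshift);                     /* harshest shift */

        if (shift)
                alloc_blocks >>= shift;
        if (alloc_blocks)       /* rounddown_pow_of_two(0) is undefined */
                alloc_blocks = rounddown_pow_of_two(alloc_blocks);
        if (alloc_blocks > MAXEXTLEN)
                alloc_blocks = MAXEXTLEN;

        /* large file on a small fs: keep dividing by 16 until it fits */
        while (alloc_blocks && alloc_blocks >= (uint64_t)fdblocks)
                alloc_blocks >>= 4;
        return alloc_blocks;
}

int main(void)
{
        struct dquot_model dq = {
                .res_bcount = 970000, .hi_wmark = 1000000,
                .low_space = { 50000, 30000, 10000 },   /* 5%, 3%, 1% */
        };
        uint64_t qblocks = 1ULL << 21;  /* start from the initial prealloc */
        int qshift = 0;

        quota_calc_throttle(&dq, &qblocks, &qshift);
        printf("final prealloc: %llu blocks\n",
               (unsigned long long)throttle_prealloc(1ULL << 21, 1 << 30,
                                                     qblocks, 0, qshift));
        return 0;
}

With the sample numbers above, only the 5% quota threshold trips (3% of the watermark remains free), so qshift is 2 and qblocks is capped at 30000: the initial 2^21-block prealloc shrinks to 30000 >> 2 = 7500 and rounds down to a final 4096 blocks.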