diff options
Diffstat (limited to 'fs/xfs/xfs_buf.c')
| -rw-r--r-- | fs/xfs/xfs_buf.c | 417 |
1 files changed, 228 insertions, 189 deletions
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 56d1614760c..7a34a1ae655 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -34,11 +34,13 @@ #include <linux/backing-dev.h> #include <linux/freezer.h> +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" #include "xfs_sb.h" -#include "xfs_log.h" #include "xfs_ag.h" #include "xfs_mount.h" #include "xfs_trace.h" +#include "xfs_log.h" static kmem_zone_t *xfs_buf_zone; @@ -80,54 +82,6 @@ xfs_buf_vmap_len( } /* - * xfs_buf_lru_add - add a buffer to the LRU. - * - * The LRU takes a new reference to the buffer so that it will only be freed - * once the shrinker takes the buffer off the LRU. - */ -STATIC void -xfs_buf_lru_add( - struct xfs_buf *bp) -{ - struct xfs_buftarg *btp = bp->b_target; - - spin_lock(&btp->bt_lru_lock); - if (list_empty(&bp->b_lru)) { - atomic_inc(&bp->b_hold); - list_add_tail(&bp->b_lru, &btp->bt_lru); - btp->bt_lru_nr++; - bp->b_lru_flags &= ~_XBF_LRU_DISPOSE; - } - spin_unlock(&btp->bt_lru_lock); -} - -/* - * xfs_buf_lru_del - remove a buffer from the LRU - * - * The unlocked check is safe here because it only occurs when there are not - * b_lru_ref counts left on the inode under the pag->pag_buf_lock. it is there - * to optimise the shrinker removing the buffer from the LRU and calling - * xfs_buf_free(). i.e. it removes an unnecessary round trip on the - * bt_lru_lock. - */ -STATIC void -xfs_buf_lru_del( - struct xfs_buf *bp) -{ - struct xfs_buftarg *btp = bp->b_target; - - if (list_empty(&bp->b_lru)) - return; - - spin_lock(&btp->bt_lru_lock); - if (!list_empty(&bp->b_lru)) { - list_del_init(&bp->b_lru); - btp->bt_lru_nr--; - } - spin_unlock(&btp->bt_lru_lock); -} - -/* * When we mark a buffer stale, we remove the buffer from the LRU and clear the * b_lru_ref count so that the buffer is freed immediately when the buffer * reference count falls to zero. If the buffer is already on the LRU, we need @@ -150,20 +104,14 @@ xfs_buf_stale( */ bp->b_flags &= ~_XBF_DELWRI_Q; - atomic_set(&(bp)->b_lru_ref, 0); - if (!list_empty(&bp->b_lru)) { - struct xfs_buftarg *btp = bp->b_target; + spin_lock(&bp->b_lock); + atomic_set(&bp->b_lru_ref, 0); + if (!(bp->b_state & XFS_BSTATE_DISPOSE) && + (list_lru_del(&bp->b_target->bt_lru, &bp->b_lru))) + atomic_dec(&bp->b_hold); - spin_lock(&btp->bt_lru_lock); - if (!list_empty(&bp->b_lru) && - !(bp->b_lru_flags & _XBF_LRU_DISPOSE)) { - list_del_init(&bp->b_lru); - btp->bt_lru_nr--; - atomic_dec(&bp->b_hold); - } - spin_unlock(&btp->bt_lru_lock); - } ASSERT(atomic_read(&bp->b_hold) >= 1); + spin_unlock(&bp->b_lock); } static int @@ -227,6 +175,7 @@ _xfs_buf_alloc( INIT_LIST_HEAD(&bp->b_list); RB_CLEAR_NODE(&bp->b_rbnode); sema_init(&bp->b_sema, 0); /* held, no waiters */ + spin_lock_init(&bp->b_lock); XB_SET_OWNER(bp); bp->b_target = target; bp->b_flags = flags; @@ -267,8 +216,7 @@ _xfs_buf_alloc( STATIC int _xfs_buf_get_pages( xfs_buf_t *bp, - int page_count, - xfs_buf_flags_t flags) + int page_count) { /* Make sure that we have a page list */ if (bp->b_pages == NULL) { @@ -303,7 +251,7 @@ _xfs_buf_free_pages( * Releases the specified buffer. * * The modification state of any associated pages is left unchanged. - * The buffer most not be on any hash - use xfs_buf_rele instead for + * The buffer must not be on any hash - use xfs_buf_rele instead for * hashed and refcounted buffers */ void @@ -381,7 +329,7 @@ use_alloc_page: end = (BBTOB(bp->b_maps[0].bm_bn + bp->b_length) + PAGE_SIZE - 1) >> PAGE_SHIFT; page_count = end - start; - error = _xfs_buf_get_pages(bp, page_count, flags); + error = _xfs_buf_get_pages(bp, page_count); if (unlikely(error)) return error; @@ -447,7 +395,17 @@ _xfs_buf_map_pages( bp->b_addr = NULL; } else { int retried = 0; + unsigned noio_flag; + /* + * vm_map_ram() will allocate auxillary structures (e.g. + * pagetables) with GFP_KERNEL, yet we are likely to be under + * GFP_NOFS context here. Hence we need to tell memory reclaim + * that we are in such a context via PF_MEMALLOC_NOIO to prevent + * memory reclaim re-entering the filesystem here and + * potentially deadlocking. + */ + noio_flag = memalloc_noio_save(); do { bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count, -1, PAGE_KERNEL); @@ -455,6 +413,7 @@ _xfs_buf_map_pages( break; vm_unmap_aliases(); } while (retried++ <= 1); + memalloc_noio_restore(noio_flag); if (!bp->b_addr) return -ENOMEM; @@ -487,6 +446,7 @@ _xfs_buf_find( struct rb_node *parent; xfs_buf_t *bp; xfs_daddr_t blkno = map[0].bm_bn; + xfs_daddr_t eofs; int numblks = 0; int i; @@ -495,8 +455,26 @@ _xfs_buf_find( numbytes = BBTOB(numblks); /* Check for IOs smaller than the sector size / not sector aligned */ - ASSERT(!(numbytes < (1 << btp->bt_sshift))); - ASSERT(!(BBTOB(blkno) & (xfs_off_t)btp->bt_smask)); + ASSERT(!(numbytes < btp->bt_meta_sectorsize)); + ASSERT(!(BBTOB(blkno) & (xfs_off_t)btp->bt_meta_sectormask)); + + /* + * Corrupted block numbers can get through to here, unfortunately, so we + * have to check that the buffer falls within the filesystem bounds. + */ + eofs = XFS_FSB_TO_BB(btp->bt_mount, btp->bt_mount->m_sb.sb_dblocks); + if (blkno >= eofs) { + /* + * XXX (dgc): we should really be returning EFSCORRUPTED here, + * but none of the higher level infrastructure supports + * returning a specific error on buffer lookup failures. + */ + xfs_alert(btp->bt_mount, + "%s: Block out of range: block 0x%llx, EOFS 0x%llx ", + __func__, blkno, eofs); + WARN_ON(1); + return NULL; + } /* get tree root */ pag = xfs_perag_get(btp->bt_mount, @@ -623,7 +601,7 @@ found: error = _xfs_buf_map_pages(bp, flags); if (unlikely(error)) { xfs_warn(target->bt_mount, - "%s: failed to map pages\n", __func__); + "%s: failed to map pagesn", __func__); xfs_buf_relse(bp); return NULL; } @@ -730,7 +708,11 @@ xfs_buf_read_uncached( bp->b_flags |= XBF_READ; bp->b_ops = ops; - xfsbdstrat(target->bt_mount, bp); + if (XFS_FORCED_SHUTDOWN(target->bt_mount)) { + xfs_buf_relse(bp); + return NULL; + } + xfs_buf_iorequest(bp); xfs_buf_iowait(bp); return bp; } @@ -795,7 +777,7 @@ xfs_buf_associate_memory( bp->b_pages = NULL; bp->b_addr = mem; - rval = _xfs_buf_get_pages(bp, page_count, 0); + rval = _xfs_buf_get_pages(bp, page_count); if (rval) return rval; @@ -828,7 +810,7 @@ xfs_buf_get_uncached( goto fail; page_count = PAGE_ALIGN(numblks << BBSHIFT) >> PAGE_SHIFT; - error = _xfs_buf_get_pages(bp, page_count, 0); + error = _xfs_buf_get_pages(bp, page_count); if (error) goto fail_free_buf; @@ -842,7 +824,7 @@ xfs_buf_get_uncached( error = _xfs_buf_map_pages(bp, 0); if (unlikely(error)) { xfs_warn(target->bt_mount, - "%s: failed to map pages\n", __func__); + "%s: failed to map pages", __func__); goto fail_free_mem; } @@ -897,12 +879,33 @@ xfs_buf_rele( ASSERT(atomic_read(&bp->b_hold) > 0); if (atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock)) { - if (!(bp->b_flags & XBF_STALE) && - atomic_read(&bp->b_lru_ref)) { - xfs_buf_lru_add(bp); + spin_lock(&bp->b_lock); + if (!(bp->b_flags & XBF_STALE) && atomic_read(&bp->b_lru_ref)) { + /* + * If the buffer is added to the LRU take a new + * reference to the buffer for the LRU and clear the + * (now stale) dispose list state flag + */ + if (list_lru_add(&bp->b_target->bt_lru, &bp->b_lru)) { + bp->b_state &= ~XFS_BSTATE_DISPOSE; + atomic_inc(&bp->b_hold); + } + spin_unlock(&bp->b_lock); spin_unlock(&pag->pag_buf_lock); } else { - xfs_buf_lru_del(bp); + /* + * most of the time buffers will already be removed from + * the LRU, so optimise that case by checking for the + * XFS_BSTATE_DISPOSE flag indicating the last list the + * buffer was on was the disposal list + */ + if (!(bp->b_state & XFS_BSTATE_DISPOSE)) { + list_lru_del(&bp->b_target->bt_lru, &bp->b_lru); + } else { + ASSERT(list_empty(&bp->b_lru)); + } + spin_unlock(&bp->b_lock); + ASSERT(!(bp->b_flags & _XBF_DELWRI_Q)); rb_erase(&bp->b_rbnode, &pag->pag_buf_tree); spin_unlock(&pag->pag_buf_lock); @@ -933,8 +936,6 @@ xfs_buf_trylock( locked = down_trylock(&bp->b_sema) == 0; if (locked) XB_SET_OWNER(bp); - else if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE)) - xfs_log_force(bp->b_target->bt_mount, 0); trace_xfs_buf_trylock(bp, _RET_IP_); return locked; @@ -1006,7 +1007,9 @@ xfs_buf_iodone_work( bool read = !!(bp->b_flags & XBF_READ); bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD); - if (read && bp->b_ops) + + /* only validate buffers that were read without errors */ + if (read && bp->b_ops && !bp->b_error && (bp->b_flags & XBF_DONE)) bp->b_ops->verify_read(bp); if (bp->b_iodone) @@ -1100,7 +1103,7 @@ xfs_bioerror( * This is meant for userdata errors; metadata bufs come with * iodone functions attached, so that we can track down errors. */ -STATIC int +int xfs_bioerror_relse( struct xfs_buf *bp) { @@ -1163,7 +1166,7 @@ xfs_bwrite( ASSERT(xfs_buf_islocked(bp)); bp->b_flags |= XBF_WRITE; - bp->b_flags &= ~(XBF_ASYNC | XBF_READ | _XBF_DELWRI_Q); + bp->b_flags &= ~(XBF_ASYNC | XBF_READ | _XBF_DELWRI_Q | XBF_WRITE_FAIL); xfs_bdstrat_cb(bp); @@ -1175,25 +1178,6 @@ xfs_bwrite( return error; } -/* - * Wrapper around bdstrat so that we can stop data from going to disk in case - * we are shutting down the filesystem. Typically user data goes thru this - * path; one of the exceptions is the superblock. - */ -void -xfsbdstrat( - struct xfs_mount *mp, - struct xfs_buf *bp) -{ - if (XFS_FORCED_SHUTDOWN(mp)) { - trace_xfs_bdstrat_shut(bp, _RET_IP_); - xfs_bioerror_relse(bp); - return; - } - - xfs_buf_iorequest(bp); -} - STATIC void _xfs_buf_ioend( xfs_buf_t *bp, @@ -1266,7 +1250,7 @@ next_chunk: bio = bio_alloc(GFP_NOIO, nr_pages); bio->bi_bdev = bp->b_target->bt_bdev; - bio->bi_sector = sector; + bio->bi_iter.bi_sector = sector; bio->bi_end_io = xfs_buf_bio_end_io; bio->bi_private = bp; @@ -1288,7 +1272,7 @@ next_chunk: total_nr_pages--; } - if (likely(bio->bi_size)) { + if (likely(bio->bi_iter.bi_size)) { if (xfs_buf_is_vmapped(bp)) { flush_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp)); @@ -1318,6 +1302,12 @@ _xfs_buf_ioapply( int size; int i; + /* + * Make sure we capture only current IO errors rather than stale errors + * left over from previous use of the buffer (e.g. failed readahead). + */ + bp->b_error = 0; + if (bp->b_flags & XBF_WRITE) { if (bp->b_flags & XBF_SYNCIO) rw = WRITE_SYNC; @@ -1381,21 +1371,29 @@ xfs_buf_iorequest( xfs_buf_wait_unpin(bp); xfs_buf_hold(bp); - /* Set the count to 1 initially, this will stop an I/O + /* + * Set the count to 1 initially, this will stop an I/O * completion callout which happens before we have started * all the I/O from calling xfs_buf_ioend too early. */ atomic_set(&bp->b_io_remaining, 1); _xfs_buf_ioapply(bp); - _xfs_buf_ioend(bp, 1); + /* + * If _xfs_buf_ioapply failed, we'll get back here with + * only the reference we took above. _xfs_buf_ioend will + * drop it to zero, so we'd better not queue it for later, + * or we'll free it before it's done. + */ + _xfs_buf_ioend(bp, bp->b_error ? 0 : 1); xfs_buf_rele(bp); } /* * Waits for I/O to complete on the buffer supplied. It returns immediately if - * no I/O is pending or there is already a pending error on the buffer. It - * returns the I/O error code, if any, or 0 if there was no error. + * no I/O is pending or there is already a pending error on the buffer, in which + * case nothing will ever complete. It returns the I/O error code, if any, or + * 0 if there was no error. */ int xfs_buf_iowait( @@ -1476,81 +1474,127 @@ xfs_buf_iomove( * returned. These buffers will have an elevated hold count, so wait on those * while freeing all the buffers only held by the LRU. */ +static enum lru_status +xfs_buftarg_wait_rele( + struct list_head *item, + spinlock_t *lru_lock, + void *arg) + +{ + struct xfs_buf *bp = container_of(item, struct xfs_buf, b_lru); + struct list_head *dispose = arg; + + if (atomic_read(&bp->b_hold) > 1) { + /* need to wait, so skip it this pass */ + trace_xfs_buf_wait_buftarg(bp, _RET_IP_); + return LRU_SKIP; + } + if (!spin_trylock(&bp->b_lock)) + return LRU_SKIP; + + /* + * clear the LRU reference count so the buffer doesn't get + * ignored in xfs_buf_rele(). + */ + atomic_set(&bp->b_lru_ref, 0); + bp->b_state |= XFS_BSTATE_DISPOSE; + list_move(item, dispose); + spin_unlock(&bp->b_lock); + return LRU_REMOVED; +} + void xfs_wait_buftarg( struct xfs_buftarg *btp) { - struct xfs_buf *bp; + LIST_HEAD(dispose); + int loop = 0; -restart: - spin_lock(&btp->bt_lru_lock); - while (!list_empty(&btp->bt_lru)) { - bp = list_first_entry(&btp->bt_lru, struct xfs_buf, b_lru); - if (atomic_read(&bp->b_hold) > 1) { - spin_unlock(&btp->bt_lru_lock); - delay(100); - goto restart; + /* loop until there is nothing left on the lru list. */ + while (list_lru_count(&btp->bt_lru)) { + list_lru_walk(&btp->bt_lru, xfs_buftarg_wait_rele, + &dispose, LONG_MAX); + + while (!list_empty(&dispose)) { + struct xfs_buf *bp; + bp = list_first_entry(&dispose, struct xfs_buf, b_lru); + list_del_init(&bp->b_lru); + if (bp->b_flags & XBF_WRITE_FAIL) { + xfs_alert(btp->bt_mount, +"Corruption Alert: Buffer at block 0x%llx had permanent write failures!\n" +"Please run xfs_repair to determine the extent of the problem.", + (long long)bp->b_bn); + } + xfs_buf_rele(bp); } - /* - * clear the LRU reference count so the buffer doesn't get - * ignored in xfs_buf_rele(). - */ - atomic_set(&bp->b_lru_ref, 0); - spin_unlock(&btp->bt_lru_lock); - xfs_buf_rele(bp); - spin_lock(&btp->bt_lru_lock); + if (loop++ != 0) + delay(100); } - spin_unlock(&btp->bt_lru_lock); } -int -xfs_buftarg_shrink( +static enum lru_status +xfs_buftarg_isolate( + struct list_head *item, + spinlock_t *lru_lock, + void *arg) +{ + struct xfs_buf *bp = container_of(item, struct xfs_buf, b_lru); + struct list_head *dispose = arg; + + /* + * we are inverting the lru lock/bp->b_lock here, so use a trylock. + * If we fail to get the lock, just skip it. + */ + if (!spin_trylock(&bp->b_lock)) + return LRU_SKIP; + /* + * Decrement the b_lru_ref count unless the value is already + * zero. If the value is already zero, we need to reclaim the + * buffer, otherwise it gets another trip through the LRU. + */ + if (!atomic_add_unless(&bp->b_lru_ref, -1, 0)) { + spin_unlock(&bp->b_lock); + return LRU_ROTATE; + } + + bp->b_state |= XFS_BSTATE_DISPOSE; + list_move(item, dispose); + spin_unlock(&bp->b_lock); + return LRU_REMOVED; +} + +static unsigned long +xfs_buftarg_shrink_scan( struct shrinker *shrink, struct shrink_control *sc) { struct xfs_buftarg *btp = container_of(shrink, struct xfs_buftarg, bt_shrinker); - struct xfs_buf *bp; - int nr_to_scan = sc->nr_to_scan; LIST_HEAD(dispose); + unsigned long freed; + unsigned long nr_to_scan = sc->nr_to_scan; - if (!nr_to_scan) - return btp->bt_lru_nr; - - spin_lock(&btp->bt_lru_lock); - while (!list_empty(&btp->bt_lru)) { - if (nr_to_scan-- <= 0) - break; - - bp = list_first_entry(&btp->bt_lru, struct xfs_buf, b_lru); - - /* - * Decrement the b_lru_ref count unless the value is already - * zero. If the value is already zero, we need to reclaim the - * buffer, otherwise it gets another trip through the LRU. - */ - if (!atomic_add_unless(&bp->b_lru_ref, -1, 0)) { - list_move_tail(&bp->b_lru, &btp->bt_lru); - continue; - } - - /* - * remove the buffer from the LRU now to avoid needing another - * lock round trip inside xfs_buf_rele(). - */ - list_move(&bp->b_lru, &dispose); - btp->bt_lru_nr--; - bp->b_lru_flags |= _XBF_LRU_DISPOSE; - } - spin_unlock(&btp->bt_lru_lock); + freed = list_lru_walk_node(&btp->bt_lru, sc->nid, xfs_buftarg_isolate, + &dispose, &nr_to_scan); while (!list_empty(&dispose)) { + struct xfs_buf *bp; bp = list_first_entry(&dispose, struct xfs_buf, b_lru); list_del_init(&bp->b_lru); xfs_buf_rele(bp); } - return btp->bt_lru_nr; + return freed; +} + +static unsigned long +xfs_buftarg_shrink_count( + struct shrinker *shrink, + struct shrink_control *sc) +{ + struct xfs_buftarg *btp = container_of(shrink, + struct xfs_buftarg, bt_shrinker); + return list_lru_count_node(&btp->bt_lru, sc->nid); } void @@ -1559,6 +1603,7 @@ xfs_free_buftarg( struct xfs_buftarg *btp) { unregister_shrinker(&btp->bt_shrinker); + list_lru_destroy(&btp->bt_lru); if (mp->m_flags & XFS_MOUNT_BARRIER) xfs_blkdev_issue_flush(btp); @@ -1566,16 +1611,14 @@ xfs_free_buftarg( kmem_free(btp); } -STATIC int -xfs_setsize_buftarg_flags( +int +xfs_setsize_buftarg( xfs_buftarg_t *btp, - unsigned int blocksize, - unsigned int sectorsize, - int verbose) + unsigned int sectorsize) { - btp->bt_bsize = blocksize; - btp->bt_sshift = ffs(sectorsize) - 1; - btp->bt_smask = sectorsize - 1; + /* Set up metadata sector size info */ + btp->bt_meta_sectorsize = sectorsize; + btp->bt_meta_sectormask = sectorsize - 1; if (set_blocksize(btp->bt_bdev, sectorsize)) { char name[BDEVNAME_SIZE]; @@ -1583,47 +1626,39 @@ xfs_setsize_buftarg_flags( bdevname(btp->bt_bdev, name); xfs_warn(btp->bt_mount, - "Cannot set_blocksize to %u on device %s\n", + "Cannot set_blocksize to %u on device %s", sectorsize, name); return EINVAL; } + /* Set up device logical sector size mask */ + btp->bt_logical_sectorsize = bdev_logical_block_size(btp->bt_bdev); + btp->bt_logical_sectormask = bdev_logical_block_size(btp->bt_bdev) - 1; + return 0; } /* - * When allocating the initial buffer target we have not yet - * read in the superblock, so don't know what sized sectors - * are being used is at this early stage. Play safe. + * When allocating the initial buffer target we have not yet + * read in the superblock, so don't know what sized sectors + * are being used at this early stage. Play safe. */ STATIC int xfs_setsize_buftarg_early( xfs_buftarg_t *btp, struct block_device *bdev) { - return xfs_setsize_buftarg_flags(btp, - PAGE_SIZE, bdev_logical_block_size(bdev), 0); -} - -int -xfs_setsize_buftarg( - xfs_buftarg_t *btp, - unsigned int blocksize, - unsigned int sectorsize) -{ - return xfs_setsize_buftarg_flags(btp, blocksize, sectorsize, 1); + return xfs_setsize_buftarg(btp, bdev_logical_block_size(bdev)); } xfs_buftarg_t * xfs_alloc_buftarg( struct xfs_mount *mp, - struct block_device *bdev, - int external, - const char *fsname) + struct block_device *bdev) { xfs_buftarg_t *btp; - btp = kmem_zalloc(sizeof(*btp), KM_SLEEP); + btp = kmem_zalloc(sizeof(*btp), KM_SLEEP | KM_NOFS); btp->bt_mount = mp; btp->bt_dev = bdev->bd_dev; @@ -1632,12 +1667,16 @@ xfs_alloc_buftarg( if (!btp->bt_bdi) goto error; - INIT_LIST_HEAD(&btp->bt_lru); - spin_lock_init(&btp->bt_lru_lock); if (xfs_setsize_buftarg_early(btp, bdev)) goto error; - btp->bt_shrinker.shrink = xfs_buftarg_shrink; + + if (list_lru_init(&btp->bt_lru)) + goto error; + + btp->bt_shrinker.count_objects = xfs_buftarg_shrink_count; + btp->bt_shrinker.scan_objects = xfs_buftarg_shrink_scan; btp->bt_shrinker.seeks = DEFAULT_SEEKS; + btp->bt_shrinker.flags = SHRINKER_NUMA_AWARE; register_shrinker(&btp->bt_shrinker); return btp; @@ -1759,7 +1798,7 @@ __xfs_buf_delwri_submit( blk_start_plug(&plug); list_for_each_entry_safe(bp, n, io_list, b_list) { - bp->b_flags &= ~(_XBF_DELWRI_Q | XBF_ASYNC); + bp->b_flags &= ~(_XBF_DELWRI_Q | XBF_ASYNC | XBF_WRITE_FAIL); bp->b_flags |= XBF_WRITE; if (!wait) { |
