diff options
Diffstat (limited to 'fs/xfs')
86 files changed, 3028 insertions, 3073 deletions
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index 56641fe52a2..b4769e40e8b 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -16,7 +16,7 @@ # Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA # -EXTRA_CFLAGS += -I$(src) -I$(src)/linux-2.6 -funsigned-char +EXTRA_CFLAGS += -I$(src) -I$(src)/linux-2.6 XFS_LINUX := linux-2.6 @@ -105,7 +105,6 @@ xfs-y += $(addprefix $(XFS_LINUX)/, \ xfs_globals.o \ xfs_ioctl.o \ xfs_iops.o \ - xfs_lrw.o \ xfs_super.o \ xfs_sync.o \ xfs_xattr.o) diff --git a/fs/xfs/linux-2.6/kmem.c b/fs/xfs/linux-2.6/kmem.c index 2d3f90afe5f..bc7405585de 100644 --- a/fs/xfs/linux-2.6/kmem.c +++ b/fs/xfs/linux-2.6/kmem.c @@ -16,7 +16,6 @@ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include <linux/mm.h> -#include <linux/vmalloc.h> #include <linux/highmem.h> #include <linux/swap.h> #include <linux/blkdev.h> @@ -24,8 +23,25 @@ #include "time.h" #include "kmem.h" -#define MAX_VMALLOCS 6 -#define MAX_SLAB_SIZE 0x20000 +/* + * Greedy allocation. May fail and may return vmalloced memory. + * + * Must be freed using kmem_free_large. + */ +void * +kmem_zalloc_greedy(size_t *size, size_t minsize, size_t maxsize) +{ + void *ptr; + size_t kmsize = maxsize; + + while (!(ptr = kmem_zalloc_large(kmsize))) { + if ((kmsize >>= 1) <= minsize) + kmsize = minsize; + } + if (ptr) + *size = kmsize; + return ptr; +} void * kmem_alloc(size_t size, unsigned int __nocast flags) @@ -34,19 +50,8 @@ kmem_alloc(size_t size, unsigned int __nocast flags) gfp_t lflags = kmem_flags_convert(flags); void *ptr; -#ifdef DEBUG - if (unlikely(!(flags & KM_LARGE) && (size > PAGE_SIZE))) { - printk(KERN_WARNING "Large %s attempt, size=%ld\n", - __func__, (long)size); - dump_stack(); - } -#endif - do { - if (size < MAX_SLAB_SIZE || retries > MAX_VMALLOCS) - ptr = kmalloc(size, lflags); - else - ptr = __vmalloc(size, lflags, PAGE_KERNEL); + ptr = kmalloc(size, lflags); if (ptr || (flags & (KM_MAYFAIL|KM_NOSLEEP))) return ptr; if (!(++retries % 100)) @@ -68,27 +73,6 @@ kmem_zalloc(size_t size, unsigned int __nocast flags) return ptr; } -void * -kmem_zalloc_greedy(size_t *size, size_t minsize, size_t maxsize, - unsigned int __nocast flags) -{ - void *ptr; - size_t kmsize = maxsize; - unsigned int kmflags = (flags & ~KM_SLEEP) | KM_NOSLEEP; - - while (!(ptr = kmem_zalloc(kmsize, kmflags))) { - if ((kmsize <= minsize) && (flags & KM_NOSLEEP)) - break; - if ((kmsize >>= 1) <= minsize) { - kmsize = minsize; - kmflags = flags; - } - } - if (ptr) - *size = kmsize; - return ptr; -} - void kmem_free(const void *ptr) { diff --git a/fs/xfs/linux-2.6/kmem.h b/fs/xfs/linux-2.6/kmem.h index 179cbd630f6..f7c8f7a9ea6 100644 --- a/fs/xfs/linux-2.6/kmem.h +++ b/fs/xfs/linux-2.6/kmem.h @@ -21,6 +21,7 @@ #include <linux/slab.h> #include <linux/sched.h> #include <linux/mm.h> +#include <linux/vmalloc.h> /* * General memory allocation interfaces @@ -30,7 +31,6 @@ #define KM_NOSLEEP 0x0002u #define KM_NOFS 0x0004u #define KM_MAYFAIL 0x0008u -#define KM_LARGE 0x0010u /* * We use a special process flag to avoid recursive callbacks into @@ -42,7 +42,7 @@ kmem_flags_convert(unsigned int __nocast flags) { gfp_t lflags; - BUG_ON(flags & ~(KM_SLEEP|KM_NOSLEEP|KM_NOFS|KM_MAYFAIL|KM_LARGE)); + BUG_ON(flags & ~(KM_SLEEP|KM_NOSLEEP|KM_NOFS|KM_MAYFAIL)); if (flags & KM_NOSLEEP) { lflags = GFP_ATOMIC | __GFP_NOWARN; @@ -56,10 +56,25 @@ kmem_flags_convert(unsigned int __nocast flags) extern void *kmem_alloc(size_t, unsigned int __nocast); extern void *kmem_zalloc(size_t, unsigned int __nocast); -extern void *kmem_zalloc_greedy(size_t *, size_t, size_t, unsigned int __nocast); extern void *kmem_realloc(const void *, size_t, size_t, unsigned int __nocast); extern void kmem_free(const void *); +static inline void *kmem_zalloc_large(size_t size) +{ + void *ptr; + + ptr = vmalloc(size); + if (ptr) + memset(ptr, 0, size); + return ptr; +} +static inline void kmem_free_large(void *ptr) +{ + vfree(ptr); +} + +extern void *kmem_zalloc_greedy(size_t *, size_t, size_t); + /* * Zone interfaces */ diff --git a/fs/xfs/linux-2.6/xfs_acl.c b/fs/xfs/linux-2.6/xfs_acl.c index 883ca5ab8af..bf85bbe4a9a 100644 --- a/fs/xfs/linux-2.6/xfs_acl.c +++ b/fs/xfs/linux-2.6/xfs_acl.c @@ -106,7 +106,7 @@ xfs_get_acl(struct inode *inode, int type) struct posix_acl *acl; struct xfs_acl *xfs_acl; int len = sizeof(struct xfs_acl); - char *ea_name; + unsigned char *ea_name; int error; acl = get_cached_acl(inode, type); @@ -133,7 +133,8 @@ xfs_get_acl(struct inode *inode, int type) if (!xfs_acl) return ERR_PTR(-ENOMEM); - error = -xfs_attr_get(ip, ea_name, (char *)xfs_acl, &len, ATTR_ROOT); + error = -xfs_attr_get(ip, ea_name, (unsigned char *)xfs_acl, + &len, ATTR_ROOT); if (error) { /* * If the attribute doesn't exist make sure we have a negative @@ -162,7 +163,7 @@ STATIC int xfs_set_acl(struct inode *inode, int type, struct posix_acl *acl) { struct xfs_inode *ip = XFS_I(inode); - char *ea_name; + unsigned char *ea_name; int error; if (S_ISLNK(inode->i_mode)) @@ -194,7 +195,7 @@ xfs_set_acl(struct inode *inode, int type, struct posix_acl *acl) (sizeof(struct xfs_acl_entry) * (XFS_ACL_MAX_ENTRIES - acl->a_count)); - error = -xfs_attr_set(ip, ea_name, (char *)xfs_acl, + error = -xfs_attr_set(ip, ea_name, (unsigned char *)xfs_acl, len, ATTR_ROOT); kfree(xfs_acl); @@ -262,7 +263,7 @@ xfs_set_mode(struct inode *inode, mode_t mode) } static int -xfs_acl_exists(struct inode *inode, char *name) +xfs_acl_exists(struct inode *inode, unsigned char *name) { int len = sizeof(struct xfs_acl); diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c index 66abe36c121..9083357f9e4 100644 --- a/fs/xfs/linux-2.6/xfs_aops.c +++ b/fs/xfs/linux-2.6/xfs_aops.c @@ -39,6 +39,7 @@ #include "xfs_iomap.h" #include "xfs_vnodeops.h" #include "xfs_trace.h" +#include "xfs_bmap.h" #include <linux/mpage.h> #include <linux/pagevec.h> #include <linux/writeback.h> @@ -163,14 +164,17 @@ xfs_ioend_new_eof( } /* - * Update on-disk file size now that data has been written to disk. - * The current in-memory file size is i_size. If a write is beyond - * eof i_new_size will be the intended file size until i_size is - * updated. If this write does not extend all the way to the valid - * file size then restrict this update to the end of the write. + * Update on-disk file size now that data has been written to disk. The + * current in-memory file size is i_size. If a write is beyond eof i_new_size + * will be the intended file size until i_size is updated. If this write does + * not extend all the way to the valid file size then restrict this update to + * the end of the write. + * + * This function does not block as blocking on the inode lock in IO completion + * can lead to IO completion order dependency deadlocks.. If it can't get the + * inode ilock it will return EAGAIN. Callers must handle this. */ - -STATIC void +STATIC int xfs_setfilesize( xfs_ioend_t *ioend) { @@ -181,16 +185,40 @@ xfs_setfilesize( ASSERT(ioend->io_type != IOMAP_READ); if (unlikely(ioend->io_error)) - return; + return 0; + + if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) + return EAGAIN; - xfs_ilock(ip, XFS_ILOCK_EXCL); isize = xfs_ioend_new_eof(ioend); if (isize) { ip->i_d.di_size = isize; - xfs_mark_inode_dirty_sync(ip); + xfs_mark_inode_dirty(ip); } xfs_iunlock(ip, XFS_ILOCK_EXCL); + return 0; +} + +/* + * Schedule IO completion handling on a xfsdatad if this was + * the final hold on this ioend. If we are asked to wait, + * flush the workqueue. + */ +STATIC void +xfs_finish_ioend( + xfs_ioend_t *ioend, + int wait) +{ + if (atomic_dec_and_test(&ioend->io_remaining)) { + struct workqueue_struct *wq; + + wq = (ioend->io_type == IOMAP_UNWRITTEN) ? + xfsconvertd_workqueue : xfsdatad_workqueue; + queue_work(wq, &ioend->io_work); + if (wait) + flush_workqueue(wq); + } } /* @@ -198,11 +226,11 @@ xfs_setfilesize( */ STATIC void xfs_end_io( - struct work_struct *work) + struct work_struct *work) { - xfs_ioend_t *ioend = - container_of(work, xfs_ioend_t, io_work); - struct xfs_inode *ip = XFS_I(ioend->io_inode); + xfs_ioend_t *ioend = container_of(work, xfs_ioend_t, io_work); + struct xfs_inode *ip = XFS_I(ioend->io_inode); + int error = 0; /* * For unwritten extents we need to issue transactions to convert a @@ -210,7 +238,6 @@ xfs_end_io( */ if (ioend->io_type == IOMAP_UNWRITTEN && likely(!ioend->io_error && !XFS_FORCED_SHUTDOWN(ip->i_mount))) { - int error; error = xfs_iomap_write_unwritten(ip, ioend->io_offset, ioend->io_size); @@ -222,30 +249,23 @@ xfs_end_io( * We might have to update the on-disk file size after extending * writes. */ - if (ioend->io_type != IOMAP_READ) - xfs_setfilesize(ioend); - xfs_destroy_ioend(ioend); -} - -/* - * Schedule IO completion handling on a xfsdatad if this was - * the final hold on this ioend. If we are asked to wait, - * flush the workqueue. - */ -STATIC void -xfs_finish_ioend( - xfs_ioend_t *ioend, - int wait) -{ - if (atomic_dec_and_test(&ioend->io_remaining)) { - struct workqueue_struct *wq; - - wq = (ioend->io_type == IOMAP_UNWRITTEN) ? - xfsconvertd_workqueue : xfsdatad_workqueue; - queue_work(wq, &ioend->io_work); - if (wait) - flush_workqueue(wq); + if (ioend->io_type != IOMAP_READ) { + error = xfs_setfilesize(ioend); + ASSERT(!error || error == EAGAIN); } + + /* + * If we didn't complete processing of the ioend, requeue it to the + * tail of the workqueue for another attempt later. Otherwise destroy + * it. + */ + if (error == EAGAIN) { + atomic_inc(&ioend->io_remaining); + xfs_finish_ioend(ioend, 0); + /* ensure we don't spin on blocked ioends */ + delay(1); + } else + xfs_destroy_ioend(ioend); } /* @@ -341,7 +361,7 @@ xfs_submit_ioend_bio( * but don't update the inode size until I/O completion. */ if (xfs_ioend_new_eof(ioend)) - xfs_mark_inode_dirty_sync(XFS_I(ioend->io_inode)); + xfs_mark_inode_dirty(XFS_I(ioend->io_inode)); submit_bio(wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC_PLUG : WRITE, bio); @@ -874,6 +894,118 @@ xfs_cluster_write( } } +STATIC void +xfs_vm_invalidatepage( + struct page *page, + unsigned long offset) +{ + trace_xfs_invalidatepage(page->mapping->host, page, offset); + block_invalidatepage(page, offset); +} + +/* + * If the page has delalloc buffers on it, we need to punch them out before we + * invalidate the page. If we don't, we leave a stale delalloc mapping on the + * inode that can trip a BUG() in xfs_get_blocks() later on if a direct IO read + * is done on that same region - the delalloc extent is returned when none is + * supposed to be there. + * + * We prevent this by truncating away the delalloc regions on the page before + * invalidating it. Because they are delalloc, we can do this without needing a + * transaction. Indeed - if we get ENOSPC errors, we have to be able to do this + * truncation without a transaction as there is no space left for block + * reservation (typically why we see a ENOSPC in writeback). + * + * This is not a performance critical path, so for now just do the punching a + * buffer head at a time. + */ +STATIC void +xfs_aops_discard_page( + struct page *page) +{ + struct inode *inode = page->mapping->host; + struct xfs_inode *ip = XFS_I(inode); + struct buffer_head *bh, *head; + loff_t offset = page_offset(page); + ssize_t len = 1 << inode->i_blkbits; + + if (!xfs_is_delayed_page(page, IOMAP_DELAY)) + goto out_invalidate; + + xfs_fs_cmn_err(CE_ALERT, ip->i_mount, + "page discard on page %p, inode 0x%llx, offset %llu.", + page, ip->i_ino, offset); + + xfs_ilock(ip, XFS_ILOCK_EXCL); + bh = head = page_buffers(page); + do { + int done; + xfs_fileoff_t offset_fsb; + xfs_bmbt_irec_t imap; + int nimaps = 1; + int error; + xfs_fsblock_t firstblock; + xfs_bmap_free_t flist; + + if (!buffer_delay(bh)) + goto next_buffer; + + offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset); + + /* + * Map the range first and check that it is a delalloc extent + * before trying to unmap the range. Otherwise we will be + * trying to remove a real extent (which requires a + * transaction) or a hole, which is probably a bad idea... + */ + error = xfs_bmapi(NULL, ip, offset_fsb, 1, + XFS_BMAPI_ENTIRE, NULL, 0, &imap, + &nimaps, NULL, NULL); + + if (error) { + /* something screwed, just bail */ + xfs_fs_cmn_err(CE_ALERT, ip->i_mount, + "page discard failed delalloc mapping lookup."); + break; + } + if (!nimaps) { + /* nothing there */ + goto next_buffer; + } + if (imap.br_startblock != DELAYSTARTBLOCK) { + /* been converted, ignore */ + goto next_buffer; + } + WARN_ON(imap.br_blockcount == 0); + + /* + * Note: while we initialise the firstblock/flist pair, they + * should never be used because blocks should never be + * allocated or freed for a delalloc extent and hence we need + * don't cancel or finish them after the xfs_bunmapi() |