diff options
63 files changed, 2652 insertions, 2170 deletions
diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index bdd82b2339d..9858f337529 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking @@ -189,7 +189,7 @@ prototypes: loff_t pos, unsigned len, unsigned copied, struct page *page, void *fsdata); sector_t (*bmap)(struct address_space *, sector_t); - int (*invalidatepage) (struct page *, unsigned long); + void (*invalidatepage) (struct page *, unsigned int, unsigned int); int (*releasepage) (struct page *, int); void (*freepage)(struct page *); int (*direct_IO)(int, struct kiocb *, const struct iovec *iov, @@ -310,8 +310,8 @@ filesystems and by the swapper. The latter will eventually go away. Please, keep it that way and don't breed new callers. ->invalidatepage() is called when the filesystem must attempt to drop -some or all of the buffers from the page when it is being truncated. It -returns zero on success. If ->invalidatepage is zero, the kernel uses +some or all of the buffers from the page when it is being truncated. It +returns zero on success. If ->invalidatepage is zero, the kernel uses block_invalidatepage() instead. ->releasepage() is called when the kernel is about to try to drop the diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt index 4a35f6614a6..e6bd1ffd821 100644 --- a/Documentation/filesystems/vfs.txt +++ b/Documentation/filesystems/vfs.txt @@ -549,7 +549,7 @@ struct address_space_operations ------------------------------- This describes how the VFS can manipulate mapping of a file to page cache in -your filesystem. As of kernel 2.6.22, the following members are defined: +your filesystem. The following members are defined: struct address_space_operations { int (*writepage)(struct page *page, struct writeback_control *wbc); @@ -566,7 +566,7 @@ struct address_space_operations { loff_t pos, unsigned len, unsigned copied, struct page *page, void *fsdata); sector_t (*bmap)(struct address_space *, sector_t); - int (*invalidatepage) (struct page *, unsigned long); + void (*invalidatepage) (struct page *, unsigned int, unsigned int); int (*releasepage) (struct page *, int); void (*freepage)(struct page *); ssize_t (*direct_IO)(int, struct kiocb *, const struct iovec *iov, @@ -685,14 +685,14 @@ struct address_space_operations { invalidatepage: If a page has PagePrivate set, then invalidatepage will be called when part or all of the page is to be removed from the address space. This generally corresponds to either a - truncation or a complete invalidation of the address space - (in the latter case 'offset' will always be 0). - Any private data associated with the page should be updated - to reflect this truncation. If offset is 0, then - the private data should be released, because the page - must be able to be completely discarded. This may be done by - calling the ->releasepage function, but in this case the - release MUST succeed. + truncation, punch hole or a complete invalidation of the address + space (in the latter case 'offset' will always be 0 and 'length' + will be PAGE_CACHE_SIZE). Any private data associated with the page + should be updated to reflect this truncation. If offset is 0 and + length is PAGE_CACHE_SIZE, then the private data should be released, + because the page must be able to be completely discarded. This may + be done by calling the ->releasepage function, but in this case the + release MUST succeed. releasepage: releasepage is called on PagePrivate pages to indicate that the page should be freed if possible. ->releasepage diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c index 055562c580b..9ff073f4090 100644 --- a/fs/9p/vfs_addr.c +++ b/fs/9p/vfs_addr.c @@ -148,13 +148,14 @@ static int v9fs_release_page(struct page *page, gfp_t gfp) * @offset: offset in the page */ -static void v9fs_invalidate_page(struct page *page, unsigned long offset) +static void v9fs_invalidate_page(struct page *page, unsigned int offset, + unsigned int length) { /* * If called with zero offset, we should release * the private state assocated with the page */ - if (offset == 0) + if (offset == 0 && length == PAGE_CACHE_SIZE) v9fs_fscache_invalidate_page(page); } diff --git a/fs/afs/file.c b/fs/afs/file.c index 8f6e9234d56..66d50fe2ee4 100644 --- a/fs/afs/file.c +++ b/fs/afs/file.c @@ -19,7 +19,8 @@ #include "internal.h" static int afs_readpage(struct file *file, struct page *page); -static void afs_invalidatepage(struct page *page, unsigned long offset); +static void afs_invalidatepage(struct page *page, unsigned int offset, + unsigned int length); static int afs_releasepage(struct page *page, gfp_t gfp_flags); static int afs_launder_page(struct page *page); @@ -310,16 +311,17 @@ static int afs_launder_page(struct page *page) * - release a page and clean up its private data if offset is 0 (indicating * the entire page) */ -static void afs_invalidatepage(struct page *page, unsigned long offset) +static void afs_invalidatepage(struct page *page, unsigned int offset, + unsigned int length) { struct afs_writeback *wb = (struct afs_writeback *) page_private(page); - _enter("{%lu},%lu", page->index, offset); + _enter("{%lu},%u,%u", page->index, offset, length); BUG_ON(!PageLocked(page)); /* we clean up only if the entire page is being invalidated */ - if (offset == 0) { + if (offset == 0 && length == PAGE_CACHE_SIZE) { #ifdef CONFIG_AFS_FSCACHE if (PageFsCache(page)) { struct afs_vnode *vnode = AFS_FS_I(page->mapping->host); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index b8b60b660c8..b0292b3ead5 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1013,7 +1013,8 @@ static int btree_releasepage(struct page *page, gfp_t gfp_flags) return try_release_extent_buffer(page); } -static void btree_invalidatepage(struct page *page, unsigned long offset) +static void btree_invalidatepage(struct page *page, unsigned int offset, + unsigned int length) { struct extent_io_tree *tree; tree = &BTRFS_I(page->mapping->host)->io_tree; diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index e7e7afb4a87..6bca9472f31 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2957,7 +2957,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, pg_offset = i_size & (PAGE_CACHE_SIZE - 1); if (page->index > end_index || (page->index == end_index && !pg_offset)) { - page->mapping->a_ops->invalidatepage(page, 0); + page->mapping->a_ops->invalidatepage(page, 0, PAGE_CACHE_SIZE); unlock_page(page); return 0; } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index a46b656d08d..4f9d16b70d3 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -7493,7 +7493,8 @@ static int btrfs_releasepage(struct page *page, gfp_t gfp_flags) return __btrfs_releasepage(page, gfp_flags & GFP_NOFS); } -static void btrfs_invalidatepage(struct page *page, unsigned long offset) +static void btrfs_invalidatepage(struct page *page, unsigned int offset, + unsigned int length) { struct inode *inode = page->mapping->host; struct extent_io_tree *tree; diff --git a/fs/buffer.c b/fs/buffer.c index d2a4d1bb2d5..f93392e2df1 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1454,7 +1454,8 @@ static void discard_buffer(struct buffer_head * bh) * block_invalidatepage - invalidate part or all of a buffer-backed page * * @page: the page which is affected - * @offset: the index of the truncation point + * @offset: start of the range to invalidate + * @length: length of the range to invalidate * * block_invalidatepage() is called when all or part of the page has become * invalidated by a truncate operation. @@ -1465,15 +1466,22 @@ static void discard_buffer(struct buffer_head * bh) * point. Because the caller is about to free (and possibly reuse) those * blocks on-disk. */ -void block_invalidatepage(struct page *page, unsigned long offset) +void block_invalidatepage(struct page *page, unsigned int offset, + unsigned int length) { struct buffer_head *head, *bh, *next; unsigned int curr_off = 0; + unsigned int stop = length + offset; BUG_ON(!PageLocked(page)); if (!page_has_buffers(page)) goto out; + /* + * Check for overflow + */ + BUG_ON(stop > PAGE_CACHE_SIZE || stop < length); + head = page_buffers(page); bh = head; do { @@ -1481,6 +1489,12 @@ void block_invalidatepage(struct page *page, unsigned long offset) next = bh->b_this_page; /* + * Are we still fully in range ? + */ + if (next_off > stop) + goto out; + + /* * is this block fully invalidated? */ if (offset <= curr_off) @@ -1501,6 +1515,7 @@ out: } EXPORT_SYMBOL(block_invalidatepage); + /* * We attach and possibly dirty the buffers atomically wrt * __set_page_dirty_buffers() via private_lock. try_to_free_buffers @@ -2841,7 +2856,7 @@ int block_write_full_page_endio(struct page *page, get_block_t *get_block, * they may have been added in ext3_writepage(). Make them * freeable here, so the page does not leak. */ - do_invalidatepage(page, 0); + do_invalidatepage(page, 0, PAGE_CACHE_SIZE); unlock_page(page); return 0; /* don't care */ } diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 3e68ac10104..38b5c1bc677 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -143,7 +143,8 @@ static int ceph_set_page_dirty(struct page *page) * dirty page counters appropriately. Only called if there is private * data on the page. */ -static void ceph_invalidatepage(struct page *page, unsigned long offset) +static void ceph_invalidatepage(struct page *page, unsigned int offset, + unsigned int length) { struct inode *inode; struct ceph_inode_info *ci; @@ -163,20 +164,20 @@ static void ceph_invalidatepage(struct page *page, unsigned long offset) if (!PageDirty(page)) pr_err("%p invalidatepage %p page not dirty\n", inode, page); - if (offset == 0) + if (offset == 0 && length == PAGE_CACHE_SIZE) ClearPageChecked(page); ci = ceph_inode(inode); - if (offset == 0) { - dout("%p invalidatepage %p idx %lu full dirty page %lu\n", - inode, page, page->index, offset); + if (offset == 0 && length == PAGE_CACHE_SIZE) { + dout("%p invalidatepage %p idx %lu full dirty page\n", + inode, page, page->index); ceph_put_wrbuffer_cap_refs(ci, 1, snapc); ceph_put_snap_context(snapc); page->private = 0; ClearPagePrivate(page); } else { - dout("%p invalidatepage %p idx %lu partial dirty page\n", - inode, page, page->index); + dout("%p invalidatepage %p idx %lu partial dirty page %u(%u)\n", + inode, page, page->index, offset, length); } } diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 48b29d24c9f..4d8ba8d491e 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -3546,11 +3546,12 @@ static int cifs_release_page(struct page *page, gfp_t gfp) return cifs_fscache_release_page(page, gfp); } -static void cifs_invalidate_page(struct page *page, unsigned long offset) +static void cifs_invalidate_page(struct page *page, unsigned int offset, + unsigned int length) { struct cifsInodeInfo *cifsi = CIFS_I(page->mapping->host); - if (offset == 0) + if (offset == 0 && length == PAGE_CACHE_SIZE) cifs_fscache_invalidate_page(page, &cifsi->vfs_inode); } diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c index d1f80abd882..2ec8eb1ab26 100644 --- a/fs/exofs/inode.c +++ b/fs/exofs/inode.c @@ -953,9 +953,11 @@ static int exofs_releasepage(struct page *page, gfp_t gfp) return 0; } -static void exofs_invalidatepage(struct page *page, unsigned long offset) +static void exofs_invalidatepage(struct page *page, unsigned int offset, + unsigned int length) { - EXOFS_DBGMSG("page 0x%lx offset 0x%lx\n", page->index, offset); + EXOFS_DBGMSG("page 0x%lx offset 0x%x length 0x%x\n", + page->index, offset, length); WARN_ON(1); } diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index 23c71282564..f67668f724b 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -1825,19 +1825,20 @@ ext3_readpages(struct file *file, struct address_space *mapping, return mpage_readpages(mapping, pages, nr_pages, ext3_get_block); } -static void ext3_invalidatepage(struct page *page, unsigned long offset) +static void ext3_invalidatepage(struct page *page, unsigned int offset, + unsigned int length) { journal_t *journal = EXT3_JOURNAL(page->mapping->host); - trace_ext3_invalidatepage(page, offset); + trace_ext3_invalidatepage(page, offset, length); /* * If it's a full truncate we just forget about the pending dirtying */ - if (offset == 0) + if (offset == 0 && length == PAGE_CACHE_SIZE) ClearPageChecked(page); - journal_invalidatepage(journal, page, offset); + journal_invalidatepage(journal, page, offset, length); } static int ext3_releasepage(struct page *page, gfp_t wait) diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c index 692de13e359..cea8ecf3e76 100644 --- a/fs/ext3/namei.c +++ b/fs/ext3/namei.c @@ -576,11 +576,8 @@ static int htree_dirblock_to_tree(struct file *dir_file, if (!ext3_check_dir_entry("htree_dirblock_to_tree", dir, de, bh, (block<<EXT3_BLOCK_SIZE_BITS(dir->i_sb)) +((char *)de - bh->b_data))) { - /* On error, skip the f_pos to the next block. */ - dir_file->f_pos = (dir_file->f_pos | - (dir->i_sb->s_blocksize - 1)) + 1; - brelse (bh); - return count; + /* silently ignore the rest of the block */ + break; } ext3fs_dirhash(de->name, de->name_len, hinfo); if ((hinfo->hash < start_hash) || diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index d0f13eada0e..58339393fa6 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -682,11 +682,15 @@ ext4_fsblk_t ext4_count_free_clusters(struct super_block *sb) static inline int test_root(ext4_group_t a, int b) { - int num = b; - - while (a > num) - num *= b; - return num == a; + while (1) { + if (a < b) + return 0; + if (a == b) + return 1; + if ((a % b) != 0) + return 0; + a = a / b; + } } static int ext4_group_sparse(ext4_group_t group) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 4af03ea84aa..b577e45425b 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -177,38 +177,28 @@ struct ext4_map_blocks { }; /* - * For delayed allocation tracking - */ -struct mpage_da_data { - struct inode *inode; - sector_t b_blocknr; /* start block number of extent */ - size_t b_size; /* size of extent */ - unsigned long b_state; /* state of the extent */ - unsigned long first_page, next_page; /* extent of pages */ - struct writeback_control *wbc; - int io_done; - int pages_written; - int retval; -}; - -/* * Flags for ext4_io_end->flags */ #define EXT4_IO_END_UNWRITTEN 0x0001 -#define EXT4_IO_END_ERROR 0x0002 -#define EXT4_IO_END_DIRECT 0x0004 +#define EXT4_IO_END_DIRECT 0x0002 /* - * For converting uninitialized extents on a work queue. + * For converting uninitialized extents on a work queue. 'handle' is used for + * buffered writeback. */ typedef struct ext4_io_end { struct list_head list; /* per-file finished IO list */ + handle_t *handle; /* handle reserved for extent + * conversion */ struct inode *inode; /* file being written to */ + struct bio *bio; /* Linked list of completed + * bios covering the extent */ unsigned int flag; /* unwritten or not */ loff_t offset; /* offset in the file */ ssize_t size; /* size of the extent */ struct kiocb *iocb; /* iocb struct for AIO */ int result; /* error value for AIO */ + atomic_t count; /* reference counter */ } ext4_io_end_t; struct ext4_io_submit { @@ -581,11 +571,6 @@ enum { #define EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER 0x0020 /* - * Flags used by ext4_discard_partial_page_buffers - */ -#define EXT4_DISCARD_PARTIAL_PG_ZERO_UNMAPPED 0x0001 - -/* * ioctl commands */ #define EXT4_IOC_GETFLAGS FS_IOC_GETFLAGS @@ -879,6 +864,7 @@ struct ext4_inode_info { rwlock_t i_es_lock; struct list_head i_es_lru; unsigned int i_es_lru_nr; /* protected by i_es_lock */ + unsigned long i_touch_when; /* jiffies of last accessing */ /* ialloc */ ext4_group_t i_last_alloc_group; @@ -903,12 +889,22 @@ struct ext4_inode_info { qsize_t i_reserved_quota; #endif - /* completed IOs that might need unwritten extents handling */ - struct list_head i_completed_io_list; + /* Lock protecting lists below */ spinlock_t i_completed_io_lock; + /* + * Completed IOs that need unwritten extents handling and have + * transaction reserved + */ + struct list_head i_rsv_conversion_list; + /* + * Completed IOs that need unwritten extents handling and don't have + * transaction reserved + */ + struct list_head i_unrsv_conversion_list; atomic_t i_ioend_count; /* Number of outstanding io_end structs */ atomic_t i_unwritten; /* Nr. of inflight conversions pending */ - struct work_struct i_unwritten_work; /* deferred extent conversion */ + struct work_struct i_rsv_conversion_work; + struct work_struct i_unrsv_conversion_work; spinlock_t i_block_reservation_lock; @@ -1245,7 +1241,6 @@ struct ext4_sb_info { unsigned int s_mb_stats; unsigned int s_mb_order2_reqs; unsigned int s_mb_group_prealloc; - unsigned int s_max_writeback_mb_bump; unsigned int s_max_dir_size_kb; /* where last allocation was done - for stream allocation */ unsigned long s_mb_last_group; @@ -1281,8 +1276,10 @@ struct ext4_sb_info { struct flex_groups *s_flex_groups; ext4_group_t s_flex_groups_allocated; - /* workqueue for dio unwritten */ - struct workqueue_struct *dio_unwritten_wq; + /* workqueue for unreserved extent convertions (dio) */ + struct workqueue_struct *unrsv_conversion_wq; + /* workqueue for reserved extent conversions (buffered io) */ + struct workqueue_struct *rsv_conversion_wq; /* timer for periodic error stats printing */ struct timer_list s_err_report; @@ -1307,6 +1304,7 @@ struct ext4_sb_info { /* Reclaim extents from extent status tree */ struct shrinker s_es_shrinker; struct list_head s_es_lru; + unsigned long s_es_last_sorted; struct percpu_counter s_extent_cache_cnt; spinlock_t s_es_lru_lock ____cacheline_aligned_in_smp; }; @@ -1342,6 +1340,9 @@ static inline void ext4_set_io_unwritten_flag(struct inode *inode, struct ext4_io_end *io_end) { if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) { + /* Writeback has to have coversion transaction reserved */ + WARN_ON(EXT4_SB(inode->i_sb)->s_journal && !io_end->handle && + !(io_end->flag & EXT4_IO_END_DIRECT)); io_end->flag |= EXT4_IO_END_UNWRITTEN; atomic_inc(&EXT4_I(inode)->i_unwritten); } @@ -1999,7 +2000,6 @@ static inline unsigned char get_dtype(struct super_block *sb, int filetype) /* fsync.c */ extern int ext4_sync_file(struct file *, loff_t, loff_t, int); -extern int ext4_flush_unwritten_io(struct inode *); /* hash.c */ extern int ext4fs_dirhash(const char *name, int len, struct @@ -2088,7 +2088,7 @@ extern int ext4_change_inode_journal_flag(struct inode *, int); extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *); extern int ext4_can_truncate(struct inode *inode); extern void ext4_truncate(struct inode *); -extern int ext4_punch_hole(struct file *file, loff_t offset, loff_t length); +extern int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length); extern int ext4_truncate_restart_trans(handle_t *, struct inode *, int nblocks); extern void ext4_set_inode_flags(struct inode *); extern void ext4_get_inode_flags(struct ext4_inode_info *); @@ -2096,9 +2096,12 @@ extern int ext4_alloc_da_blocks(struct inode *inode); extern void ext4_set_aops(struct inode *inode); extern int ext4_writepage_trans_blocks(struct inode *); extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks); -extern int ext4_discard_partial_page_buffers(handle_t *handle, - struct address_space *mapping, loff_t from, - loff_t length, int flags); +extern int ext4_block_truncate_page(handle_t *handle, + struct address_space *mapping, loff_t from); +extern int ext4_block_zero_page_range(handle_t *handle, + struct address_space *mapping, loff_t from, loff_t length); +extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode, + loff_t lstart, loff_t lend); extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); extern qsize_t *ext4_get_reserved_space(struct inode *inode); extern void ext4_da_update_reserve_space(struct inode *inode, @@ -2111,7 +2114,7 @@ extern ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, loff_t offset, unsigned long nr_segs); extern int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock); -extern int ext4_ind_trans_blocks(struct inode *inode, int nrblocks, int chunk); +extern int ext4_ind_trans_blocks(struct inode *inode, int nrblocks); |