diff options
Diffstat (limited to 'fs/ext3/inode.c')
| -rw-r--r-- | fs/ext3/inode.c | 510 | 
1 files changed, 292 insertions, 218 deletions
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index a9580617edd..2c6ccc49ba2 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -22,26 +22,18 @@   *  Assorted race fixes, rewrite of ext3_get_block() by Al Viro, 2000   */ -#include <linux/module.h> -#include <linux/fs.h> -#include <linux/time.h> -#include <linux/ext3_jbd.h> -#include <linux/jbd.h>  #include <linux/highuid.h> -#include <linux/pagemap.h>  #include <linux/quotaops.h> -#include <linux/string.h> -#include <linux/buffer_head.h>  #include <linux/writeback.h>  #include <linux/mpage.h> -#include <linux/uio.h> -#include <linux/bio.h> -#include <linux/fiemap.h>  #include <linux/namei.h> +#include <linux/aio.h> +#include "ext3.h"  #include "xattr.h"  #include "acl.h"  static int ext3_writepage_trans_blocks(struct inode *inode); +static int ext3_block_truncate_page(struct inode *inode, loff_t from);  /*   * Test whether an inode is a fast symlink. @@ -70,6 +62,7 @@ int ext3_forget(handle_t *handle, int is_metadata, struct inode *inode,  	might_sleep(); +	trace_ext3_forget(inode, is_metadata, blocknr);  	BUFFER_TRACE(bh, "enter");  	jbd_debug(4, "forgetting bh %p: is_metadata = %d, mode %o, " @@ -194,20 +187,52 @@ static int truncate_restart_transaction(handle_t *handle, struct inode *inode)   */  void ext3_evict_inode (struct inode *inode)  { +	struct ext3_inode_info *ei = EXT3_I(inode);  	struct ext3_block_alloc_info *rsv;  	handle_t *handle;  	int want_delete = 0; +	trace_ext3_evict_inode(inode);  	if (!inode->i_nlink && !is_bad_inode(inode)) {  		dquot_initialize(inode);  		want_delete = 1;  	} -	truncate_inode_pages(&inode->i_data, 0); +	/* +	 * When journalling data dirty buffers are tracked only in the journal. +	 * So although mm thinks everything is clean and ready for reaping the +	 * inode might still have some pages to write in the running +	 * transaction or waiting to be checkpointed. Thus calling +	 * journal_invalidatepage() (via truncate_inode_pages()) to discard +	 * these buffers can cause data loss. Also even if we did not discard +	 * these buffers, we would have no way to find them after the inode +	 * is reaped and thus user could see stale data if he tries to read +	 * them before the transaction is checkpointed. So be careful and +	 * force everything to disk here... We use ei->i_datasync_tid to +	 * store the newest transaction containing inode's data. +	 * +	 * Note that directories do not have this problem because they don't +	 * use page cache. +	 * +	 * The s_journal check handles the case when ext3_get_journal() fails +	 * and puts the journal inode. +	 */ +	if (inode->i_nlink && ext3_should_journal_data(inode) && +	    EXT3_SB(inode->i_sb)->s_journal && +	    (S_ISLNK(inode->i_mode) || S_ISREG(inode->i_mode)) && +	    inode->i_ino != EXT3_JOURNAL_INO) { +		tid_t commit_tid = atomic_read(&ei->i_datasync_tid); +		journal_t *journal = EXT3_SB(inode->i_sb)->s_journal; + +		log_start_commit(journal, commit_tid); +		log_wait_commit(journal, commit_tid); +		filemap_write_and_wait(&inode->i_data); +	} +	truncate_inode_pages_final(&inode->i_data);  	ext3_discard_reservation(inode); -	rsv = EXT3_I(inode)->i_block_alloc_info; -	EXT3_I(inode)->i_block_alloc_info = NULL; +	rsv = ei->i_block_alloc_info; +	ei->i_block_alloc_info = NULL;  	if (unlikely(rsv))  		kfree(rsv); @@ -231,15 +256,13 @@ void ext3_evict_inode (struct inode *inode)  	if (inode->i_blocks)  		ext3_truncate(inode);  	/* -	 * Kill off the orphan record which ext3_truncate created. -	 * AKPM: I think this can be inside the above `if'. -	 * Note that ext3_orphan_del() has to be able to cope with the -	 * deletion of a non-existent orphan - this is because we don't -	 * know if ext3_truncate() actually created an orphan record. -	 * (Well, we could do this if we need to, but heck - it works) +	 * Kill off the orphan record created when the inode lost the last +	 * link.  Note that ext3_orphan_del() has to be able to cope with the +	 * deletion of a non-existent orphan - ext3_truncate() could +	 * have removed the record.  	 */  	ext3_orphan_del(handle, inode); -	EXT3_I(inode)->i_dtime	= get_seconds(); +	ei->i_dtime = get_seconds();  	/*  	 * One subtle ordering requirement: if anything has gone wrong @@ -251,18 +274,18 @@ void ext3_evict_inode (struct inode *inode)  	if (ext3_mark_inode_dirty(handle, inode)) {  		/* If that failed, just dquot_drop() and be done with that */  		dquot_drop(inode); -		end_writeback(inode); +		clear_inode(inode);  	} else {  		ext3_xattr_delete_inode(handle, inode);  		dquot_free_inode(inode);  		dquot_drop(inode); -		end_writeback(inode); +		clear_inode(inode);  		ext3_free_inode(handle, inode);  	}  	ext3_journal_stop(handle);  	return;  no_delete: -	end_writeback(inode); +	clear_inode(inode);  	dquot_drop(inode);  } @@ -655,6 +678,10 @@ static int ext3_alloc_branch(handle_t *handle, struct inode *inode,  		 * parent to disk.  		 */  		bh = sb_getblk(inode->i_sb, new_blocks[n-1]); +		if (unlikely(!bh)) { +			err = -ENOMEM; +			goto failed; +		}  		branch[n].bh = bh;  		lock_buffer(bh);  		BUFFER_TRACE(bh, "call get_create_access"); @@ -696,7 +723,7 @@ failed:  		BUFFER_TRACE(branch[i].bh, "call journal_forget");  		ext3_journal_forget(handle, branch[i].bh);  	} -	for (i = 0; i <indirect_blks; i++) +	for (i = 0; i < indirect_blks; i++)  		ext3_free_blocks(handle, inode, new_blocks[i], 1);  	ext3_free_blocks(handle, inode, new_blocks[i], num); @@ -725,6 +752,7 @@ static int ext3_splice_branch(handle_t *handle, struct inode *inode,  	struct ext3_block_alloc_info *block_i;  	ext3_fsblk_t current_block;  	struct ext3_inode_info *ei = EXT3_I(inode); +	struct timespec now;  	block_i = ei->i_block_alloc_info;  	/* @@ -764,9 +792,11 @@ static int ext3_splice_branch(handle_t *handle, struct inode *inode,  	}  	/* We are done with atomic stuff, now do the rest of housekeeping */ - -	inode->i_ctime = CURRENT_TIME_SEC; -	ext3_mark_inode_dirty(handle, inode); +	now = CURRENT_TIME_SEC; +	if (!timespec_equal(&inode->i_ctime, &now) || !where->bh) { +		inode->i_ctime = now; +		ext3_mark_inode_dirty(handle, inode); +	}  	/* ext3_mark_inode_dirty already updated i_sync_tid */  	atomic_set(&ei->i_datasync_tid, handle->h_transaction->t_tid); @@ -842,6 +872,7 @@ int ext3_get_blocks_handle(handle_t *handle, struct inode *inode,  	ext3_fsblk_t first_block = 0; +	trace_ext3_get_blocks_enter(inode, iblock, maxblocks, create);  	J_ASSERT(handle != NULL || create == 0);  	depth = ext3_block_to_path(inode,iblock,offsets,&blocks_to_boundary); @@ -886,6 +917,9 @@ int ext3_get_blocks_handle(handle_t *handle, struct inode *inode,  	if (!create || err == -EIO)  		goto cleanup; +	/* +	 * Block out ext3_truncate while we alter the tree +	 */  	mutex_lock(&ei->truncate_mutex);  	/* @@ -934,9 +968,6 @@ int ext3_get_blocks_handle(handle_t *handle, struct inode *inode,  	 */  	count = ext3_blks_to_allocate(partial, indirect_blks,  					maxblocks, blocks_to_boundary); -	/* -	 * Block out ext3_truncate while we alter the tree -	 */  	err = ext3_alloc_branch(handle, inode, indirect_blks, &count, goal,  				offsets + (partial - chain), partial); @@ -970,6 +1001,9 @@ cleanup:  	}  	BUFFER_TRACE(bh_result, "returned");  out: +	trace_ext3_get_blocks_exit(inode, iblock, +				   depth ? le32_to_cpu(chain[depth-1].key) : 0, +				   count, err);  	return err;  } @@ -1043,16 +1077,15 @@ struct buffer_head *ext3_getblk(handle_t *handle, struct inode *inode,  	 * mapped. 0 in case of a HOLE.  	 */  	if (err > 0) { -		if (err > 1) -			WARN_ON(1); +		WARN_ON(err > 1);  		err = 0;  	}  	*errp = err;  	if (!err && buffer_mapped(&dummy)) {  		struct buffer_head *bh;  		bh = sb_getblk(inode->i_sb, dummy.b_blocknr); -		if (!bh) { -			*errp = -EIO; +		if (unlikely(!bh)) { +			*errp = -ENOMEM;  			goto err;  		}  		if (buffer_new(&dummy)) { @@ -1100,9 +1133,11 @@ struct buffer_head *ext3_bread(handle_t *handle, struct inode *inode,  	bh = ext3_getblk(handle, inode, block, create, err);  	if (!bh)  		return bh; -	if (buffer_uptodate(bh)) +	if (bh_uptodate_or_lock(bh))  		return bh; -	ll_rw_block(READ_META, 1, &bh); +	get_bh(bh); +	bh->b_end_io = end_buffer_read_sync; +	submit_bh(READ | REQ_META | REQ_PRIO, bh);  	wait_on_buffer(bh);  	if (buffer_uptodate(bh))  		return bh; @@ -1202,6 +1237,16 @@ static void ext3_truncate_failed_write(struct inode *inode)  	ext3_truncate(inode);  } +/* + * Truncate blocks that were not used by direct IO write. We have to zero out + * the last file block as well because direct IO might have written to it. + */ +static void ext3_truncate_failed_direct_write(struct inode *inode) +{ +	ext3_block_truncate_page(inode, inode->i_size); +	ext3_truncate(inode); +} +  static int ext3_write_begin(struct file *file, struct address_space *mapping,  				loff_t pos, unsigned len, unsigned flags,  				struct page **pagep, void **fsdata) @@ -1217,6 +1262,8 @@ static int ext3_write_begin(struct file *file, struct address_space *mapping,  	 * we allocate blocks but write fails for some reason */  	int needed_blocks = ext3_writepage_trans_blocks(inode) + 1; +	trace_ext3_write_begin(inode, pos, len, flags); +  	index = pos >> PAGE_CACHE_SHIFT;  	from = pos & (PAGE_CACHE_SIZE - 1);  	to = from + len; @@ -1332,6 +1379,7 @@ static int ext3_ordered_write_end(struct file *file,  	unsigned from, to;  	int ret = 0, ret2; +	trace_ext3_ordered_write_end(inode, pos, len, copied);  	copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);  	from = pos & (PAGE_CACHE_SIZE - 1); @@ -1367,6 +1415,7 @@ static int ext3_writeback_write_end(struct file *file,  	struct inode *inode = file->f_mapping->host;  	int ret; +	trace_ext3_writeback_write_end(inode, pos, len, copied);  	copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);  	update_file_sizes(inode, pos, copied);  	/* @@ -1391,10 +1440,12 @@ static int ext3_journalled_write_end(struct file *file,  {  	handle_t *handle = ext3_journal_current_handle();  	struct inode *inode = mapping->host; +	struct ext3_inode_info *ei = EXT3_I(inode);  	int ret = 0, ret2;  	int partial = 0;  	unsigned from, to; +	trace_ext3_journalled_write_end(inode, pos, len, copied);  	from = pos & (PAGE_CACHE_SIZE - 1);  	to = from + len; @@ -1419,8 +1470,9 @@ static int ext3_journalled_write_end(struct file *file,  	if (pos + len > inode->i_size && ext3_can_truncate(inode))  		ext3_orphan_add(handle, inode);  	ext3_set_inode_state(inode, EXT3_STATE_JDATA); -	if (inode->i_size > EXT3_I(inode)->i_disksize) { -		EXT3_I(inode)->i_disksize = inode->i_size; +	atomic_set(&ei->i_datasync_tid, handle->h_transaction->t_tid); +	if (inode->i_size > ei->i_disksize) { +		ei->i_disksize = inode->i_size;  		ret2 = ext3_mark_inode_dirty(handle, inode);  		if (!ret)  			ret = ret2; @@ -1507,56 +1559,17 @@ static int buffer_unmapped(handle_t *handle, struct buffer_head *bh)  }  /* - * Note that we always start a transaction even if we're not journalling - * data.  This is to preserve ordering: any hole instantiation within - * __block_write_full_page -> ext3_get_block() should be journalled - * along with the data so we don't crash and then get metadata which + * Note that whenever we need to map blocks we start a transaction even if + * we're not journalling data.  This is to preserve ordering: any hole + * instantiation within __block_write_full_page -> ext3_get_block() should be + * journalled along with the data so we don't crash and then get metadata which   * refers to old data.   *   * In all journalling modes block_write_full_page() will start the I/O.   * - * Problem: - * - *	ext3_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() -> - *		ext3_writepage() - * - * Similar for: - * - *	ext3_file_write() -> generic_file_write() -> __alloc_pages() -> ... - * - * Same applies to ext3_get_block().  We will deadlock on various things like - * lock_journal and i_truncate_mutex. - * - * Setting PF_MEMALLOC here doesn't work - too many internal memory - * allocations fail. - * - * 16May01: If we're reentered then journal_current_handle() will be - *	    non-zero. We simply *return*. - * - * 1 July 2001: @@@ FIXME: - *   In journalled data mode, a data buffer may be metadata against the - *   current transaction.  But the same file is part of a shared mapping - *   and someone does a writepage() on it. - * - *   We will move the buffer onto the async_data list, but *after* it has - *   been dirtied. So there's a small window where we have dirty data on - *   BJ_Metadata. - * - *   Note that this only applies to the last partial page in the file.  The - *   bit which block_write_full_page() uses prepare/commit for.  (That's - *   broken code anyway: it's wrong for msync()). - * - *   It's a rare case: affects the final partial page, for journalled data - *   where the file is subject to bith write() and writepage() in the same - *   transction.  To fix it we'll need a custom block_write_full_page(). - *   We'll probably need that anyway for journalling writepage() output. - *   * We don't honour synchronous mounts for writepage().  That would be   * disastrous.  Any write() or metadata operation will sync the fs for   * us. - * - * AKPM2: if all the page's buffers are mapped to disk and !data=journal, - * we don't need to open a transaction here.   */  static int ext3_ordered_writepage(struct page *page,  				struct writeback_control *wbc) @@ -1568,7 +1581,13 @@ static int ext3_ordered_writepage(struct page *page,  	int err;  	J_ASSERT(PageLocked(page)); -	WARN_ON_ONCE(IS_RDONLY(inode)); +	/* +	 * We don't want to warn for emergency remount. The condition is +	 * ordered to avoid dereferencing inode->i_sb in non-error case to +	 * avoid slow-downs. +	 */ +	WARN_ON_ONCE(IS_RDONLY(inode) && +		     !(EXT3_SB(inode->i_sb)->s_mount_state & EXT3_ERROR_FS));  	/*  	 * We give up here if we're reentered, because it might be for a @@ -1577,6 +1596,7 @@ static int ext3_ordered_writepage(struct page *page,  	if (ext3_journal_current_handle())  		goto out_fail; +	trace_ext3_ordered_writepage(page);  	if (!page_has_buffers(page)) {  		create_empty_buffers(page, inode->i_sb->s_blocksize,  				(1 << BH_Dirty)|(1 << BH_Uptodate)); @@ -1614,12 +1634,9 @@ static int ext3_ordered_writepage(struct page *page,  	 * block_write_full_page() succeeded.  Otherwise they are unmapped,  	 * and generally junk.  	 */ -	if (ret == 0) { -		err = walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE, +	if (ret == 0) +		ret = walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE,  					NULL, journal_dirty_data_fn); -		if (!ret) -			ret = err; -	}  	walk_page_buffers(handle, page_bufs, 0,  			PAGE_CACHE_SIZE, NULL, bput_one);  	err = ext3_journal_stop(handle); @@ -1642,11 +1659,18 @@ static int ext3_writeback_writepage(struct page *page,  	int err;  	J_ASSERT(PageLocked(page)); -	WARN_ON_ONCE(IS_RDONLY(inode)); +	/* +	 * We don't want to warn for emergency remount. The condition is +	 * ordered to avoid dereferencing inode->i_sb in non-error case to +	 * avoid slow-downs. +	 */ +	WARN_ON_ONCE(IS_RDONLY(inode) && +		     !(EXT3_SB(inode->i_sb)->s_mount_state & EXT3_ERROR_FS));  	if (ext3_journal_current_handle())  		goto out_fail; +	trace_ext3_writeback_writepage(page);  	if (page_has_buffers(page)) {  		if (!walk_page_buffers(NULL, page_buffers(page), 0,  				      PAGE_CACHE_SIZE, NULL, buffer_unmapped)) { @@ -1684,18 +1708,25 @@ static int ext3_journalled_writepage(struct page *page,  	int err;  	J_ASSERT(PageLocked(page)); -	WARN_ON_ONCE(IS_RDONLY(inode)); - -	if (ext3_journal_current_handle()) -		goto no_write; - -	handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode)); -	if (IS_ERR(handle)) { -		ret = PTR_ERR(handle); -		goto no_write; -	} +	/* +	 * We don't want to warn for emergency remount. The condition is +	 * ordered to avoid dereferencing inode->i_sb in non-error case to +	 * avoid slow-downs. +	 */ +	WARN_ON_ONCE(IS_RDONLY(inode) && +		     !(EXT3_SB(inode->i_sb)->s_mount_state & EXT3_ERROR_FS)); +	trace_ext3_journalled_writepage(page);  	if (!page_has_buffers(page) || PageChecked(page)) { +		if (ext3_journal_current_handle()) +			goto no_write; + +		handle = ext3_journal_start(inode, +					    ext3_writepage_trans_blocks(inode)); +		if (IS_ERR(handle)) { +			ret = PTR_ERR(handle); +			goto no_write; +		}  		/*  		 * It's mmapped pagecache.  Add buffers and journal it.  There  		 * doesn't seem much point in redirtying the page here. @@ -1715,18 +1746,21 @@ static int ext3_journalled_writepage(struct page *page,  		if (ret == 0)  			ret = err;  		ext3_set_inode_state(inode, EXT3_STATE_JDATA); +		atomic_set(&EXT3_I(inode)->i_datasync_tid, +			   handle->h_transaction->t_tid);  		unlock_page(page); +		err = ext3_journal_stop(handle); +		if (!ret) +			ret = err;  	} else {  		/* -		 * It may be a page full of checkpoint-mode buffers.  We don't -		 * really know unless we go poke around in the buffer_heads. -		 * But block_write_full_page will do the right thing. +		 * It is a page full of checkpoint-mode buffers. Go and write +		 * them. They should have been already mapped when they went +		 * to the journal so provide NULL get_block function to catch +		 * errors.  		 */ -		ret = block_write_full_page(page, ext3_get_block, wbc); +		ret = block_write_full_page(page, NULL, wbc);  	} -	err = ext3_journal_stop(handle); -	if (!ret) -		ret = err;  out:  	return ret; @@ -1739,6 +1773,7 @@ out_unlock:  static int ext3_readpage(struct file *file, struct page *page)  { +	trace_ext3_readpage(page);  	return mpage_readpage(page, ext3_get_block);  } @@ -1749,23 +1784,27 @@ ext3_readpages(struct file *file, struct address_space *mapping,  	return mpage_readpages(mapping, pages, nr_pages, ext3_get_block);  } -static void ext3_invalidatepage(struct page *page, unsigned long offset) +static void ext3_invalidatepage(struct page *page, unsigned int offset, +				unsigned int length)  {  	journal_t *journal = EXT3_JOURNAL(page->mapping->host); +	trace_ext3_invalidatepage(page, offset, length); +  	/*  	 * If it's a full truncate we just forget about the pending dirtying  	 */ -	if (offset == 0) +	if (offset == 0 && length == PAGE_CACHE_SIZE)  		ClearPageChecked(page); -	journal_invalidatepage(journal, page, offset); +	journal_invalidatepage(journal, page, offset, length);  }  static int ext3_releasepage(struct page *page, gfp_t wait)  {  	journal_t *journal = EXT3_JOURNAL(page->mapping->host); +	trace_ext3_releasepage(page);  	WARN_ON(PageChecked(page));  	if (!page_has_buffers(page))  		return 0; @@ -1782,8 +1821,7 @@ static int ext3_releasepage(struct page *page, gfp_t wait)   * VFS code falls back into buffered path in that case so we are safe.   */  static ssize_t ext3_direct_IO(int rw, struct kiocb *iocb, -			const struct iovec *iov, loff_t offset, -			unsigned long nr_segs) +			struct iov_iter *iter, loff_t offset)  {  	struct file *file = iocb->ki_filp;  	struct inode *inode = file->f_mapping->host; @@ -1791,9 +1829,11 @@ static ssize_t ext3_direct_IO(int rw, struct kiocb *iocb,  	handle_t *handle;  	ssize_t ret;  	int orphan = 0; -	size_t count = iov_length(iov, nr_segs); +	size_t count = iov_iter_count(iter);  	int retries = 0; +	trace_ext3_direct_IO_enter(inode, offset, count, rw); +  	if (rw == WRITE) {  		loff_t final_size = offset + count; @@ -1816,19 +1856,17 @@ static ssize_t ext3_direct_IO(int rw, struct kiocb *iocb,  	}  retry: -	ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, -				 offset, nr_segs, -				 ext3_get_block, NULL); +	ret = blockdev_direct_IO(rw, iocb, inode, iter, offset, ext3_get_block);  	/*  	 * In case of error extending write may have instantiated a few  	 * blocks outside i_size. Trim these off again.  	 */  	if (unlikely((rw & WRITE) && ret < 0)) {  		loff_t isize = i_size_read(inode); -		loff_t end = offset + iov_length(iov, nr_segs); +		loff_t end = offset + count;  		if (end > isize) -			vmtruncate(inode, isize); +			ext3_truncate_failed_direct_write(inode);  	}  	if (ret == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries))  		goto retry; @@ -1842,8 +1880,10 @@ retry:  			/* This is really bad luck. We've written the data  			 * but cannot extend i_size. Truncate allocated blocks  			 * and pretend the write failed... */ -			ext3_truncate(inode); +			ext3_truncate_failed_direct_write(inode);  			ret = PTR_ERR(handle); +			if (inode->i_nlink) +				ext3_orphan_del(NULL, inode);  			goto out;  		}  		if (inode->i_nlink) @@ -1868,6 +1908,7 @@ retry:  			ret = err;  	}  out: +	trace_ext3_direct_IO_exit(inode, offset, count, rw, ret);  	return ret;  } @@ -1894,7 +1935,6 @@ static const struct address_space_operations ext3_ordered_aops = {  	.readpage		= ext3_readpage,  	.readpages		= ext3_readpages,  	.writepage		= ext3_ordered_writepage, -	.sync_page		= block_sync_page,  	.write_begin		= ext3_write_begin,  	.write_end		= ext3_ordered_write_end,  	.bmap			= ext3_bmap, @@ -1903,6 +1943,7 @@ static const struct address_space_operations ext3_ordered_aops = {  	.direct_IO		= ext3_direct_IO,  	.migratepage		= buffer_migrate_page,  	.is_partially_uptodate  = block_is_partially_uptodate, +	.is_dirty_writeback	= buffer_check_dirty_writeback,  	.error_remove_page	= generic_error_remove_page,  }; @@ -1910,7 +1951,6 @@ static const struct address_space_operations ext3_writeback_aops = {  	.readpage		= ext3_readpage,  	.readpages		= ext3_readpages,  	.writepage		= ext3_writeback_writepage, -	.sync_page		= block_sync_page,  	.write_begin		= ext3_write_begin,  	.write_end		= ext3_writeback_write_end,  	.bmap			= ext3_bmap, @@ -1926,7 +1966,6 @@ static const struct address_space_operations ext3_journalled_aops = {  	.readpage		= ext3_readpage,  	.readpages		= ext3_readpages,  	.writepage		= ext3_journalled_writepage, -	.sync_page		= block_sync_page,  	.write_begin		= ext3_write_begin,  	.write_end		= ext3_journalled_write_end,  	.set_page_dirty		= ext3_journalled_set_page_dirty, @@ -1953,17 +1992,24 @@ void ext3_set_aops(struct inode *inode)   * This required during truncate. We need to physically zero the tail end   * of that block so it doesn't yield old data if the file is later grown.   */ -static int ext3_block_truncate_page(handle_t *handle, struct page *page, -		struct address_space *mapping, loff_t from) +static int ext3_block_truncate_page(struct inode *inode, loff_t from)  {  	ext3_fsblk_t index = from >> PAGE_CACHE_SHIFT; -	unsigned offset = from & (PAGE_CACHE_SIZE-1); +	unsigned offset = from & (PAGE_CACHE_SIZE - 1);  	unsigned blocksize, iblock, length, pos; -	struct inode *inode = mapping->host; +	struct page *page; +	handle_t *handle = NULL;  	struct buffer_head *bh;  	int err = 0; +	/* Truncated on block boundary - nothing to do */  	blocksize = inode->i_sb->s_blocksize; +	if ((from & (blocksize - 1)) == 0) +		return 0; + +	page = grab_cache_page(inode->i_mapping, index); +	if (!page) +		return -ENOMEM;  	length = blocksize - (offset & (blocksize - 1));  	iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); @@ -1999,20 +2045,30 @@ static int ext3_block_truncate_page(handle_t *handle, struct page *page,  	if (PageUptodate(page))  		set_buffer_uptodate(bh); -	if (!buffer_uptodate(bh)) { -		err = -EIO; -		ll_rw_block(READ, 1, &bh); -		wait_on_buffer(bh); +	if (!bh_uptodate_or_lock(bh)) { +		err = bh_submit_read(bh);  		/* Uhhuh. Read error. Complain and punt. */ -		if (!buffer_uptodate(bh)) +		if (err)  			goto unlock;  	} +	/* data=writeback mode doesn't need transaction to zero-out data */ +	if (!ext3_should_writeback_data(inode)) { +		/* We journal at most one block */ +		handle = ext3_journal_start(inode, 1); +		if (IS_ERR(handle)) { +			clear_highpage(page); +			flush_dcache_page(page); +			err = PTR_ERR(handle); +			goto unlock; +		} +	} +  	if (ext3_should_journal_data(inode)) {  		BUFFER_TRACE(bh, "get write access");  		err = ext3_journal_get_write_access(handle, bh);  		if (err) -			goto unlock; +			goto stop;  	}  	zero_user(page, offset, length); @@ -2026,6 +2082,9 @@ static int ext3_block_truncate_page(handle_t *handle, struct page *page,  			err = ext3_journal_dirty_data(handle, bh);  		mark_buffer_dirty(bh);  	} +stop: +	if (handle) +		ext3_journal_stop(handle);  unlock:  	unlock_page(page); @@ -2058,7 +2117,7 @@ static inline int all_zeroes(__le32 *p, __le32 *q)   *   *	When we do truncate() we may have to clean the ends of several   *	indirect blocks but leave the blocks themselves alive. Block is - *	partially truncated if some data below the new i_size is refered + *	partially truncated if some data below the new i_size is referred   *	from it (and it is on the path to the first completely truncated   *	data block, indeed).  We have to free the top of that path along   *	with everything to the right of the path. Since no allocation @@ -2145,13 +2204,15 @@ static void ext3_clear_blocks(handle_t *handle, struct inode *inode,  	if (try_to_extend_transaction(handle, inode)) {  		if (bh) {  			BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); -			ext3_journal_dirty_metadata(handle, bh); +			if (ext3_journal_dirty_metadata(handle, bh)) +				return;  		}  		ext3_mark_inode_dirty(handle, inode);  		truncate_restart_transaction(handle, inode);  		if (bh) {  			BUFFER_TRACE(bh, "retaking write access"); -			ext3_journal_get_write_access(handle, bh); +			if (ext3_journal_get_write_access(handle, bh)) +				return;  		}  	} @@ -2185,7 +2246,7 @@ static void ext3_clear_blocks(handle_t *handle, struct inode *inode,   * @first:	array of block numbers   * @last:	points immediately past the end of array   * - * We are freeing all blocks refered from that array (numbers are stored as + * We are freeing all blocks referred from that array (numbers are stored as   * little-endian 32-bit) and updating @inode->i_blocks appropriately.   *   * We accumulate contiguous runs of blocks to free.  Conveniently, if these @@ -2273,7 +2334,7 @@ static void ext3_free_data(handle_t *handle, struct inode *inode,   *	@last:	pointer immediately past the end of array   *	@depth:	depth of the branches to free   * - *	We are freeing all blocks refered from these branches (numbers are + *	We are freeing all blocks referred from these branches (numbers are   *	stored as little-endian 32-bit) and updating @inode->i_blocks   *	appropriately.   */ @@ -2392,8 +2453,6 @@ static void ext3_free_branches(handle_t *handle, struct inode *inode,  int ext3_can_truncate(struct inode *inode)  { -	if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) -		return 0;  	if (S_ISREG(inode->i_mode))  		return 1;  	if (S_ISDIR(inode->i_mode)) @@ -2410,7 +2469,7 @@ int ext3_can_truncate(struct inode *inode)   * transaction, and VFS/VM ensures that ext3_truncate() cannot run   * simultaneously on behalf of the same inode.   * - * As we work through the truncate and commmit bits of it to the journal there + * As we work through the truncate and commit bits of it to the journal there   * is one core, guiding principle: the file's tree must always be consistent on   * disk.  We must be able to restart the truncate after a crash.   * @@ -2437,7 +2496,6 @@ void ext3_truncate(struct inode *inode)  	struct ext3_inode_info *ei = EXT3_I(inode);  	__le32 *i_data = ei->i_data;  	int addr_per_block = EXT3_ADDR_PER_BLOCK(inode->i_sb); -	struct address_space *mapping = inode->i_mapping;  	int offsets[4];  	Indirect chain[4];  	Indirect *partial; @@ -2445,7 +2503,8 @@ void ext3_truncate(struct inode *inode)  	int n;  	long last_block;  	unsigned blocksize = inode->i_sb->s_blocksize; -	struct page *page; + +	trace_ext3_truncate_enter(inode);  	if (!ext3_can_truncate(inode))  		goto out_notrans; @@ -2453,37 +2512,12 @@ void ext3_truncate(struct inode *inode)  	if (inode->i_size == 0 && ext3_should_writeback_data(inode))  		ext3_set_inode_state(inode, EXT3_STATE_FLUSH_ON_CLOSE); -	/* -	 * We have to lock the EOF page here, because lock_page() nests -	 * outside journal_start(). -	 */ -	if ((inode->i_size & (blocksize - 1)) == 0) { -		/* Block boundary? Nothing to do */ -		page = NULL; -	} else { -		page = grab_cache_page(mapping, -				inode->i_size >> PAGE_CACHE_SHIFT); -		if (!page) -			goto out_notrans; -	} -  	handle = start_transaction(inode); -	if (IS_ERR(handle)) { -		if (page) { -			clear_highpage(page); -			flush_dcache_page(page); -			unlock_page(page); -			page_cache_release(page); -		} +	if (IS_ERR(handle))  		goto out_notrans; -	}  	last_block = (inode->i_size + blocksize-1)  					>> EXT3_BLOCK_SIZE_BITS(inode->i_sb); - -	if (page) -		ext3_block_truncate_page(handle, page, mapping, inode->i_size); -  	n = ext3_block_to_path(inode, last_block, offsets, NULL);  	if (n == 0)  		goto out_stop;	/* error */ @@ -2598,6 +2632,7 @@ out_stop:  		ext3_orphan_del(handle, inode);  	ext3_journal_stop(handle); +	trace_ext3_truncate_exit(inode);  	return;  out_notrans:  	/* @@ -2606,6 +2641,7 @@ out_notrans:  	 */  	if (inode->i_nlink)  		ext3_orphan_del(NULL, inode); +	trace_ext3_truncate_exit(inode);  }  static ext3_fsblk_t ext3_get_inode_block(struct super_block *sb, @@ -2659,12 +2695,12 @@ static int __ext3_get_inode_loc(struct inode *inode,  		return -EIO;  	bh = sb_getblk(inode->i_sb, block); -	if (!bh) { +	if (unlikely(!bh)) {  		ext3_error (inode->i_sb, "ext3_get_inode_loc",  				"unable to read inode block - "  				"inode=%lu, block="E3FSBLK,  				 inode->i_ino, block); -		return -EIO; +		return -ENOMEM;  	}  	if (!buffer_uptodate(bh)) {  		lock_buffer(bh); @@ -2713,7 +2749,7 @@ static int __ext3_get_inode_loc(struct inode *inode,  			bitmap_bh = sb_getblk(inode->i_sb,  					le32_to_cpu(desc->bg_inode_bitmap)); -			if (!bitmap_bh) +			if (unlikely(!bitmap_bh))  				goto make_io;  			/* @@ -2747,9 +2783,10 @@ make_io:  		 * has in-inode xattrs, or we don't have this inode in memory.  		 * Read the block from disk.  		 */ +		trace_ext3_load_inode(inode);  		get_bh(bh);  		bh->b_end_io = end_buffer_read_sync; -		submit_bh(READ_META, bh); +		submit_bh(READ | REQ_META | REQ_PRIO, bh);  		wait_on_buffer(bh);  		if (!buffer_uptodate(bh)) {  			ext3_error(inode->i_sb, "ext3_get_inode_loc", @@ -2819,6 +2856,8 @@ struct inode *ext3_iget(struct super_block *sb, unsigned long ino)  	transaction_t *transaction;  	long ret;  	int block; +	uid_t i_uid; +	gid_t i_gid;  	inode = iget_locked(sb, ino);  	if (!inode) @@ -2835,13 +2874,15 @@ struct inode *ext3_iget(struct super_block *sb, unsigned long ino)  	bh = iloc.bh;  	raw_inode = ext3_raw_inode(&iloc);  	inode->i_mode = le16_to_cpu(raw_inode->i_mode); -	inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low); -	inode->i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low); +	i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low); +	i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low);  	if(!(test_opt (inode->i_sb, NO_UID32))) { -		inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16; -		inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16; +		i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16; +		i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16;  	} -	inode->i_nlink = le16_to_cpu(raw_inode->i_links_count); +	i_uid_write(inode, i_uid); +	i_gid_write(inode, i_gid); +	set_nlink(inode, le16_to_cpu(raw_inode->i_links_count));  	inode->i_size = le32_to_cpu(raw_inode->i_size);  	inode->i_atime.tv_sec = (signed)le32_to_cpu(raw_inode->i_atime);  	inode->i_ctime.tv_sec = (signed)le32_to_cpu(raw_inode->i_ctime); @@ -2996,6 +3037,10 @@ static int ext3_do_update_inode(handle_t *handle,  	struct ext3_inode_info *ei = EXT3_I(inode);  	struct buffer_head *bh = iloc->bh;  	int err = 0, rc, block; +	int need_datasync = 0; +	__le32 disksize; +	uid_t i_uid; +	gid_t i_gid;  again:  	/* we can't allow multiple procs in here at once, its a bit racey */ @@ -3008,32 +3053,38 @@ again:  	ext3_get_inode_flags(ei);  	raw_inode->i_mode = cpu_to_le16(inode->i_mode); +	i_uid = i_uid_read(inode); +	i_gid = i_gid_read(inode);  	if(!(test_opt(inode->i_sb, NO_UID32))) { -		raw_inode->i_uid_low = cpu_to_le16(low_16_bits(inode->i_uid)); -		raw_inode->i_gid_low = cpu_to_le16(low_16_bits(inode->i_gid)); +		raw_inode->i_uid_low = cpu_to_le16(low_16_bits(i_uid)); +		raw_inode->i_gid_low = cpu_to_le16(low_16_bits(i_gid));  /*   * Fix up interoperability with old kernels. Otherwise, old inodes get   * re-used with the upper 16 bits of the uid/gid intact   */  		if(!ei->i_dtime) {  			raw_inode->i_uid_high = -				cpu_to_le16(high_16_bits(inode->i_uid)); +				cpu_to_le16(high_16_bits(i_uid));  			raw_inode->i_gid_high = -				cpu_to_le16(high_16_bits(inode->i_gid)); +				cpu_to_le16(high_16_bits(i_gid));  		} else {  			raw_inode->i_uid_high = 0;  			raw_inode->i_gid_high = 0;  		}  	} else {  		raw_inode->i_uid_low = -			cpu_to_le16(fs_high2lowuid(inode->i_uid)); +			cpu_to_le16(fs_high2lowuid(i_uid));  		raw_inode->i_gid_low = -			cpu_to_le16(fs_high2lowgid(inode->i_gid)); +			cpu_to_le16(fs_high2lowgid(i_gid));  		raw_inode->i_uid_high = 0;  		raw_inode->i_gid_high = 0;  	}  	raw_inode->i_links_count = cpu_to_le16(inode->i_nlink); -	raw_inode->i_size = cpu_to_le32(ei->i_disksize); +	disksize = cpu_to_le32(ei->i_disksize); +	if (disksize != raw_inode->i_size) { +		need_datasync = 1; +		raw_inode->i_size = disksize; +	}  	raw_inode->i_atime = cpu_to_le32(inode->i_atime.tv_sec);  	raw_inode->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec);  	raw_inode->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec); @@ -3049,8 +3100,11 @@ again:  	if (!S_ISREG(inode->i_mode)) {  		raw_inode->i_dir_acl = cpu_to_le32(ei->i_dir_acl);  	} else { -		raw_inode->i_size_high = -			cpu_to_le32(ei->i_disksize >> 32); +		disksize = cpu_to_le32(ei->i_disksize >> 32); +		if (disksize != raw_inode->i_size_high) { +			raw_inode->i_size_high = disksize; +			need_datasync = 1; +		}  		if (ei->i_disksize > 0x7fffffffULL) {  			struct super_block *sb = inode->i_sb;  			if (!EXT3_HAS_RO_COMPAT_FEATURE(sb, @@ -3103,6 +3157,8 @@ again:  	ext3_clear_inode_state(inode, EXT3_STATE_NEW);  	atomic_set(&ei->i_sync_tid, handle->h_transaction->t_tid); +	if (need_datasync) +		atomic_set(&ei->i_datasync_tid, handle->h_transaction->t_tid);  out_brelse:  	brelse (bh);  	ext3_std_error(inode->i_sb, err); @@ -3114,21 +3170,20 @@ out_brelse:   *   * We are called from a few places:   * - * - Within generic_file_write() for O_SYNC files. + * - Within generic_file_aio_write() -> generic_write_sync() for O_SYNC files.   *   Here, there will be no transaction running. We wait for any running - *   trasnaction to commit. + *   transaction to commit.   * - * - Within sys_sync(), kupdate and such. - *   We wait on commit, if tol to. + * - Within flush work (for sys_sync(), kupdate and such). + *   We wait on commit, if told to.   * - * - Within prune_icache() (PF_MEMALLOC == true) - *   Here we simply return.  We can't afford to block kswapd on the - *   journal commit. + * - Within iput_final() -> write_inode_now() + *   We wait on commit, if told to.   *   * In all cases it is actually safe for us to return without doing anything,   * because the inode has been copied into a raw inode buffer in - * ext3_mark_inode_dirty().  This is a correctness thing for O_SYNC and for - * knfsd. + * ext3_mark_inode_dirty().  This is a correctness thing for WB_SYNC_ALL + * writeback.   *   * Note that we are absolutely dependent upon all inode dirtiers doing the   * right thing: they *must* call mark_inode_dirty() after dirtying info in @@ -3140,13 +3195,13 @@ out_brelse:   *	stuff();   *	inode->i_size = expr;   * - * is in error because a kswapd-driven write_inode() could occur while - * `stuff()' is running, and the new i_size will be lost.  Plus the inode - * will no longer be on the superblock's dirty inode list. + * is in error because write_inode() could occur while `stuff()' is running, + * and the new i_size will be lost.  Plus the inode will no longer be on the + * superblock's dirty inode list.   */  int ext3_write_inode(struct inode *inode, struct writeback_control *wbc)  { -	if (current->flags & PF_MEMALLOC) +	if (WARN_ON_ONCE(current->flags & PF_MEMALLOC))  		return 0;  	if (ext3_journal_current_handle()) { @@ -3155,7 +3210,12 @@ int ext3_write_inode(struct inode *inode, struct writeback_control *wbc)  		return -EIO;  	} -	if (wbc->sync_mode != WB_SYNC_ALL) +	/* +	 * No need to force transaction in WB_SYNC_NONE mode. Also +	 * ext3_sync_fs() will force the commit after everything is +	 * written. +	 */ +	if (wbc->sync_mode != WB_SYNC_ALL || wbc->for_sync)  		return 0;  	return ext3_force_commit(inode->i_sb); @@ -3190,8 +3250,8 @@ int ext3_setattr(struct dentry *dentry, struct iattr *attr)  	if (is_quota_modification(inode, attr))  		dquot_initialize(inode); -	if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) || -		(ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) { +	if ((ia_valid & ATTR_UID && !uid_eq(attr->ia_uid, inode->i_uid)) || +	    (ia_valid & ATTR_GID && !gid_eq(attr->ia_gid, inode->i_gid))) {  		handle_t *handle;  		/* (user+group)*(old+new) structure, inode write (sb, @@ -3217,6 +3277,9 @@ int ext3_setattr(struct dentry *dentry, struct iattr *attr)  		ext3_journal_stop(handle);  	} +	if (attr->ia_valid & ATTR_SIZE) +		inode_dio_wait(inode); +  	if (S_ISREG(inode->i_mode) &&  	    attr->ia_valid & ATTR_SIZE && attr->ia_size < inode->i_size) {  		handle_t *handle; @@ -3228,25 +3291,43 @@ int ext3_setattr(struct dentry *dentry, struct iattr *attr)  		}  		error = ext3_orphan_add(handle, inode); +		if (error) { +			ext3_journal_stop(handle); +			goto err_out; +		}  		EXT3_I(inode)->i_disksize = attr->ia_size; -		rc = ext3_mark_inode_dirty(handle, inode); -		if (!error) -			error = rc; +		error = ext3_mark_inode_dirty(handle, inode);  		ext3_journal_stop(handle); +		if (error) { +			/* Some hard fs error must have happened. Bail out. */ +			ext3_orphan_del(NULL, inode); +			goto err_out; +		} +		rc = ext3_block_truncate_page(inode, attr->ia_size); +		if (rc) { +			/* Cleanup orphan list and exit */ +			handle = ext3_journal_start(inode, 3); +			if (IS_ERR(handle)) { +				ext3_orphan_del(NULL, inode); +				goto err_out; +			} +			ext3_orphan_del(handle, inode); +			ext3_journal_stop(handle); +			goto err_out; +		}  	}  	if ((attr->ia_valid & ATTR_SIZE) &&  	    attr->ia_size != i_size_read(inode)) { -		rc = vmtruncate(inode, attr->ia_size); -		if (rc) -			goto err_out; +		truncate_setsize(inode, attr->ia_size); +		ext3_truncate(inode);  	}  	setattr_copy(inode, attr);  	mark_inode_dirty(inode);  	if (ia_valid & ATTR_MODE) -		rc = ext3_acl_chmod(inode); +		rc = posix_acl_chmod(inode, inode->i_mode);  err_out:  	ext3_std_error(inode->i_sb, error); @@ -3292,7 +3373,7 @@ static int ext3_writepage_trans_blocks(struct inode *inode)  	if (ext3_should_journal_data(inode))  		ret = 3 * (bpp + indirects) + 2;  	else -		ret = 2 * (bpp + indirects) + 2; +		ret = 2 * (bpp + indirects) + indirects + 2;  #ifdef CONFIG_QUOTA  	/* We know that structure was already allocated during dquot_initialize so @@ -3358,14 +3439,6 @@ ext3_reserve_inode_write(handle_t *handle, struct inode *inode,   * inode out, but prune_icache isn't a user-visible syncing function.   * Whenever the user wants stuff synced (sys_sync, sys_msync, sys_fsync)   * we start and wait on commits. - * - * Is this efficient/effective?  Well, we're being nice to the system - * by cleaning up our inodes proactively so they can be reaped - * without I/O.  But we are potentially leaving up to five seconds' - * worth of inodes floating about which prune_icache wants us to - * write out.  One way to fix that would be to get prune_icache() - * to do a write_super() to free up some memory.  It has the desired - * effect.   */  int ext3_mark_inode_dirty(handle_t *handle, struct inode *inode)  { @@ -3373,6 +3446,7 @@ int ext3_mark_inode_dirty(handle_t *handle, struct inode *inode)  	int err;  	might_sleep(); +	trace_ext3_mark_inode_dirty(inode, _RET_IP_);  	err = ext3_reserve_inode_write(handle, inode, &iloc);  	if (!err)  		err = ext3_mark_iloc_dirty(handle, inode, &iloc); @@ -3393,7 +3467,7 @@ int ext3_mark_inode_dirty(handle_t *handle, struct inode *inode)   * so would cause a commit on atime updates, which we don't bother doing.   * We handle synchronous inodes at the highest possible level.   */ -void ext3_dirty_inode(struct inode *inode) +void ext3_dirty_inode(struct inode *inode, int flags)  {  	handle_t *current_handle = ext3_journal_current_handle();  	handle_t *handle;  | 
