Diffstat (limited to 'fs/ext4/inode.c')
-rw-r--r--	fs/ext4/inode.c	385
1 file changed, 215 insertions, 170 deletions
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 0d424d7ac02..8a064734e6e 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -38,6 +38,7 @@
 #include <linux/slab.h>
 #include <linux/ratelimit.h>
 #include <linux/aio.h>
+#include <linux/bitops.h>
 
 #include "ext4_jbd2.h"
 #include "xattr.h"
@@ -144,8 +145,11 @@ static int ext4_meta_trans_blocks(struct inode *inode, int lblocks,
  */
 static int ext4_inode_is_fast_symlink(struct inode *inode)
 {
-	int ea_blocks = EXT4_I(inode)->i_file_acl ?
-		(inode->i_sb->s_blocksize >> 9) : 0;
+        int ea_blocks = EXT4_I(inode)->i_file_acl ?
+		EXT4_CLUSTER_SIZE(inode->i_sb) >> 9 : 0;
+
+	if (ext4_has_inline_data(inode))
+		return 0;
 
 	return (S_ISLNK(inode->i_mode) && inode->i_blocks - ea_blocks == 0);
 }
@@ -214,7 +218,7 @@ void ext4_evict_inode(struct inode *inode)
 			jbd2_complete_transaction(journal, commit_tid);
 			filemap_write_and_wait(&inode->i_data);
 		}
-		truncate_inode_pages(&inode->i_data, 0);
+		truncate_inode_pages_final(&inode->i_data);
 
 		WARN_ON(atomic_read(&EXT4_I(inode)->i_ioend_count));
 		goto no_delete;
@@ -225,7 +229,7 @@ void ext4_evict_inode(struct inode *inode)
 
 	if (ext4_should_order_data(inode))
 		ext4_begin_ordered_truncate(inode, 0);
-	truncate_inode_pages(&inode->i_data, 0);
+	truncate_inode_pages_final(&inode->i_data);
 
 	WARN_ON(atomic_read(&EXT4_I(inode)->i_ioend_count));
 	if (is_bad_inode(inode))
@@ -442,7 +446,7 @@ static void ext4_map_blocks_es_recheck(handle_t *handle,
 	 * could be converted.
 	 */
 	if (!(flags & EXT4_GET_BLOCKS_NO_LOCK))
-		down_read((&EXT4_I(inode)->i_data_sem));
+		down_read(&EXT4_I(inode)->i_data_sem);
 	if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
 		retval = ext4_ext_map_blocks(handle, inode, map, flags &
 					     EXT4_GET_BLOCKS_KEEP_SIZE);
@@ -488,8 +492,8 @@ static void ext4_map_blocks_es_recheck(handle_t *handle,
  * Otherwise, call with ext4_ind_map_blocks() to handle indirect mapping
  * based files
  *
- * On success, it returns the number of blocks being mapped or allocate.
- * if create==0 and the blocks are pre-allocated and uninitialized block,
+ * On success, it returns the number of blocks being mapped or allocated.
+ * if create==0 and the blocks are pre-allocated and unwritten block,
  * the result buffer head is unmapped. If the create ==1, it will make sure
  * the buffer head is mapped.
 *
@@ -503,6 +507,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
 {
 	struct extent_status es;
 	int retval;
+	int ret = 0;
 
 #ifdef ES_AGGRESSIVE_TEST
 	struct ext4_map_blocks orig_map;
@@ -514,6 +519,16 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
 		  "logical block %lu\n", inode->i_ino, flags, map->m_len,
 		  (unsigned long) map->m_lblk);
 
+	/*
+	 * ext4_map_blocks returns an int, and m_len is an unsigned int
+	 */
+	if (unlikely(map->m_len > INT_MAX))
+		map->m_len = INT_MAX;
+
+	/* We can handle the block number less than EXT_MAX_BLOCKS */
+	if (unlikely(map->m_lblk >= EXT_MAX_BLOCKS))
+		return -EIO;
+
 	/* Lookup extent status tree firstly */
 	if (ext4_es_lookup_extent(inode, map->m_lblk, &es)) {
 		ext4_es_lru_add(inode);
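
The two checks added at the top of ext4_map_blocks() guard the API boundary: m_len is an unsigned int while the function returns an int, so oversized requests are clamped to INT_MAX, and logical blocks at or beyond EXT_MAX_BLOCKS are rejected with -EIO before any lookup work. A rough standalone C sketch of the same defensive pattern (the struct and the EXT_MAX_BLOCKS value here are stand-ins, not ext4's definitions):

	#include <limits.h>
	#include <stdio.h>

	#define EXT_MAX_BLOCKS 0xffffffffU	/* stand-in for the real limit */

	struct map_request { unsigned int m_lblk; unsigned int m_len; };

	static int map_blocks(struct map_request *map)
	{
		/* the return type is int, m_len is unsigned: clamp so it fits */
		if (map->m_len > INT_MAX)
			map->m_len = INT_MAX;
		/* reject logical blocks the on-disk format cannot address */
		if (map->m_lblk >= EXT_MAX_BLOCKS)
			return -5;	/* -EIO */
		return (int)map->m_len;	/* number of blocks "mapped" */
	}

	int main(void)
	{
		struct map_request r = { .m_lblk = 0, .m_len = 0x90000000U };
		printf("%d\n", map_blocks(&r));	/* clamped to INT_MAX */
		return 0;
	}
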
@@ -543,7 +558,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
 	 * file system block.
 	 */
 	if (!(flags & EXT4_GET_BLOCKS_NO_LOCK))
-		down_read((&EXT4_I(inode)->i_data_sem));
+		down_read(&EXT4_I(inode)->i_data_sem);
 	if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
 		retval = ext4_ext_map_blocks(handle, inode, map, flags &
 					     EXT4_GET_BLOCKS_KEEP_SIZE);
@@ -552,7 +567,6 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
 					     EXT4_GET_BLOCKS_KEEP_SIZE);
 	}
 	if (retval > 0) {
-		int ret;
 		unsigned int status;
 
 		if (unlikely(retval != map->m_len)) {
@@ -579,7 +593,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
 
 found:
 	if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
-		int ret = check_block_validity(inode, map);
+		ret = check_block_validity(inode, map);
 		if (ret != 0)
 			return ret;
 	}
@@ -596,7 +610,13 @@ found:
 	 * with buffer head unmapped.
 	 */
 	if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED)
-		return retval;
+		/*
+		 * If we need to convert extent to unwritten
+		 * we continue and do the actual work in
+		 * ext4_ext_map_blocks()
+		 */
+		if (!(flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN))
+			return retval;
 
 	/*
 	 * Here we clear m_flags because after allocating an new extent,
@@ -605,12 +625,12 @@ found:
 	map->m_flags &= ~EXT4_MAP_FLAGS;
 
 	/*
-	 * New blocks allocate and/or writing to uninitialized extent
+	 * New blocks allocate and/or writing to unwritten extent
 	 * will possibly result in updating i_data, so we take
 	 * the write lock of i_data_sem, and call get_blocks()
 	 * with create == 1 flag.
 	 */
-	down_write((&EXT4_I(inode)->i_data_sem));
+	down_write(&EXT4_I(inode)->i_data_sem);
 
 	/*
 	 * if the caller is from delayed allocation writeout path
@@ -652,7 +672,6 @@ found:
 		ext4_clear_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED);
 
 	if (retval > 0) {
-		int ret;
 		unsigned int status;
 
 		if (unlikely(retval != map->m_len)) {
@@ -687,7 +706,7 @@ found:
 has_zeroout:
 	up_write((&EXT4_I(inode)->i_data_sem));
 	if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
-		int ret = check_block_validity(inode, map);
+		ret = check_block_validity(inode, map);
 		if (ret != 0)
 			return ret;
 	}
@@ -906,6 +925,7 @@ int do_journal_get_write_access(handle_t *handle,
 	 */
 	if (dirty)
 		clear_buffer_dirty(bh);
+	BUFFER_TRACE(bh, "get write access");
 	ret = ext4_journal_get_write_access(handle, bh);
 	if (!ret && dirty)
 		ret = ext4_handle_dirty_metadata(handle, NULL, bh);
@@ -1206,7 +1226,6 @@ static int ext4_journalled_write_end(struct file *file,
  */
 static int ext4_da_reserve_metadata(struct inode *inode, ext4_lblk_t lblock)
 {
-	int retries = 0;
 	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
 	struct ext4_inode_info *ei = EXT4_I(inode);
 	unsigned int md_needed;
@@ -1218,7 +1237,6 @@ static int ext4_da_reserve_metadata(struct inode *inode, ext4_lblk_t lblock)
 	 * in order to allocate nrblocks
 	 * worse case is one extent per block
 	 */
-repeat:
 	spin_lock(&ei->i_block_reservation_lock);
 	/*
 	 * ext4_calc_metadata_amount() has side effects, which we have
@@ -1238,10 +1256,6 @@ repeat:
 		ei->i_da_metadata_calc_len = save_len;
 		ei->i_da_metadata_calc_last_lblock = save_last_lblock;
 		spin_unlock(&ei->i_block_reservation_lock);
-		if (ext4_should_retry_alloc(inode->i_sb, &retries)) {
-			cond_resched();
-			goto repeat;
-		}
 		return -ENOSPC;
 	}
 	ei->i_reserved_meta_blocks += md_needed;
@@ -1255,7 +1269,6 @@ repeat:
  */
 static int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock)
 {
-	int retries = 0;
 	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
 	struct ext4_inode_info *ei = EXT4_I(inode);
 	unsigned int md_needed;
@@ -1277,7 +1290,6 @@ static int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock)
 	 * in order to allocate nrblocks
 	 * worse case is one extent per block
 	 */
-repeat:
 	spin_lock(&ei->i_block_reservation_lock);
 	/*
 	 * ext4_calc_metadata_amount() has side effects, which we have
@@ -1297,10 +1309,6 @@ repeat:
 		ei->i_da_metadata_calc_len = save_len;
 		ei->i_da_metadata_calc_last_lblock = save_last_lblock;
 		spin_unlock(&ei->i_block_reservation_lock);
-		if (ext4_should_retry_alloc(inode->i_sb, &retries)) {
-			cond_resched();
-			goto repeat;
-		}
 		dquot_release_reservation_block(inode, EXT4_C2B(sbi, 1));
 		return -ENOSPC;
 	}
@@ -1536,7 +1544,7 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
 		ext4_es_lru_add(inode);
 		if (ext4_es_is_hole(&es)) {
 			retval = 0;
-			down_read((&EXT4_I(inode)->i_data_sem));
+			down_read(&EXT4_I(inode)->i_data_sem);
 			goto add_delayed;
 		}
@@ -1573,7 +1581,7 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
 	 * Try to see if we can get the block without requesting a new
 	 * file system block.
 	 */
-	down_read((&EXT4_I(inode)->i_data_sem));
+	down_read(&EXT4_I(inode)->i_data_sem);
 	if (ext4_has_inline_data(inode)) {
 		/*
 		 * We will soon create blocks for this page, and let
@@ -1765,6 +1773,7 @@ static int __ext4_journalled_writepage(struct page *page,
 	BUG_ON(!ext4_handle_valid(handle));
 
 	if (inline_data) {
+		BUFFER_TRACE(inode_bh, "get write access");
 		ret = ext4_journal_get_write_access(handle, inode_bh);
 
 		err = ext4_handle_dirty_metadata(handle, inode, inode_bh);
@@ -1784,7 +1793,7 @@ static int __ext4_journalled_writepage(struct page *page,
 		ret = err;
 
 	if (!ext4_has_inline_data(inode))
-		ext4_walk_page_buffers(handle, page_bufs, 0, len,
+		ext4_walk_page_buffers(NULL, page_bufs, 0, len,
 				       NULL, bput_one);
 	ext4_set_inode_state(inode, EXT4_STATE_JDATA);
 out:
@@ -1842,6 +1851,7 @@ static int ext4_writepage(struct page *page,
 	struct buffer_head *page_bufs = NULL;
 	struct inode *inode = page->mapping->host;
 	struct ext4_io_submit io_submit;
+	bool keep_towrite = false;
 
 	trace_ext4_writepage(page);
 	size = i_size_read(inode);
@@ -1872,6 +1882,7 @@ static int ext4_writepage(struct page *page,
 			unlock_page(page);
 			return 0;
 		}
+		keep_towrite = true;
 	}
 
 	if (PageChecked(page) && ext4_should_journal_data(inode))
@@ -1888,7 +1899,7 @@ static int ext4_writepage(struct page *page,
 		unlock_page(page);
 		return -ENOMEM;
 	}
-	ret = ext4_bio_write_page(&io_submit, page, len, wbc);
+	ret = ext4_bio_write_page(&io_submit, page, len, wbc, keep_towrite);
 	ext4_io_submit(&io_submit);
 	/* Drop io_end reference we got from init */
 	ext4_put_io_end_defer(io_submit.io_end);
@@ -1907,7 +1918,7 @@ static int mpage_submit_page(struct mpage_da_data *mpd, struct page *page)
 	else
 		len = PAGE_CACHE_SIZE;
 	clear_page_dirty_for_io(page);
-	err = ext4_bio_write_page(&mpd->io_submit, page, len, mpd->wbc);
+	err = ext4_bio_write_page(&mpd->io_submit, page, len, mpd->wbc, false);
 	if (!err)
 		mpd->wbc->nr_to_write--;
 	mpd->first_page++;
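
The ext4_writepage() hunks above thread a new keep_towrite argument into ext4_bio_write_page(): when a page cannot be fully written in this pass, its to-write tag must survive so a later sync pass revisits it, otherwise data could be silently skipped. A toy C model of that idea (plain booleans standing in for the page cache's radix-tree tags):

	#include <stdbool.h>
	#include <stdio.h>

	/* "towrite" models PAGECACHE_TAG_TOWRITE on a page. */
	struct page_state { bool dirty, towrite; };

	static void bio_write_page(struct page_state *p, bool keep_towrite)
	{
		p->dirty = false;		/* contents submitted */
		if (!keep_towrite)
			p->towrite = false;	/* fully written: drop the tag */
	}

	int main(void)
	{
		struct page_state p = { .dirty = true, .towrite = true };
		bio_write_page(&p, true);	/* partial write: keep the tag */
		printf("revisit on next pass: %s\n", p.towrite ? "yes" : "no");
		return 0;
	}
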
@@ -2028,7 +2039,7 @@ static int mpage_process_page_bufs(struct mpage_da_data *mpd,
  * Scan buffers corresponding to changed extent (we expect corresponding pages
  * to be already locked) and update buffer state according to new extent state.
  * We map delalloc buffers to their physical location, clear unwritten bits,
- * and mark buffers as uninit when we perform writes to uninitialized extents
+ * and mark buffers as uninit when we perform writes to unwritten extents
  * and do extent conversion after IO is finished. If the last page is not fully
  * mapped, we update @map to the next extent in the last page that needs
  * mapping. Otherwise we submit the page for IO.
@@ -2122,12 +2133,12 @@ static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd)
 	struct inode *inode = mpd->inode;
 	struct ext4_map_blocks *map = &mpd->map;
 	int get_blocks_flags;
-	int err;
+	int err, dioread_nolock;
 
 	trace_ext4_da_write_pages_extent(inode, map);
 	/*
 	 * Call ext4_map_blocks() to allocate any delayed allocation blocks, or
-	 * to convert an uninitialized extent to be initialized (in the case
+	 * to convert an unwritten extent to be initialized (in the case
 	 * where we have written into one or more preallocated blocks).  It is
 	 * possible that we're going to need more metadata blocks than
 	 * previously reserved. However we must not fail because we're in
@@ -2144,7 +2155,8 @@ static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd)
 	 */
 	get_blocks_flags = EXT4_GET_BLOCKS_CREATE |
 			   EXT4_GET_BLOCKS_METADATA_NOFAIL;
-	if (ext4_should_dioread_nolock(inode))
+	dioread_nolock = ext4_should_dioread_nolock(inode);
+	if (dioread_nolock)
 		get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT;
 	if (map->m_flags & (1 << BH_Delay))
 		get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE;
@@ -2152,7 +2164,7 @@ static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd)
 	err = ext4_map_blocks(handle, inode, map, get_blocks_flags);
 	if (err < 0)
 		return err;
-	if (map->m_flags & EXT4_MAP_UNINIT) {
+	if (dioread_nolock && (map->m_flags & EXT4_MAP_UNWRITTEN)) {
 		if (!mpd->io_submit.io_end->handle &&
 		    ext4_handle_valid(handle)) {
 			mpd->io_submit.io_end->handle = handle->h_rsv_handle;
@@ -2178,6 +2190,9 @@ static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd)
  *
  * @handle - handle for journal operations
  * @mpd - extent to map
+ * @give_up_on_write - we set this to true iff there is a fatal error and there
+ *                     is no hope of writing the data. The caller should discard
+ *                     dirty pages to avoid infinite loops.
  *
  * The function maps extent starting at mpd->lblk of length mpd->len. If it is
  * delayed, blocks are allocated, if it is unwritten, we may need to convert
@@ -2240,13 +2255,23 @@ static int mpage_map_and_submit_extent(handle_t *handle,
 			return err;
 	} while (map->m_len);
 
-	/* Update on-disk size after IO is submitted */
+	/*
+	 * Update on-disk size after IO is submitted.  Races with
+	 * truncate are avoided by checking i_size under i_data_sem.
+	 */
 	disksize = ((loff_t)mpd->first_page) << PAGE_CACHE_SHIFT;
 	if (disksize > EXT4_I(inode)->i_disksize) {
 		int err2;
-
-		ext4_wb_update_i_disksize(inode, disksize);
+		loff_t i_size;
+
+		down_write(&EXT4_I(inode)->i_data_sem);
+		i_size = i_size_read(inode);
+		if (disksize > i_size)
+			disksize = i_size;
+		if (disksize > EXT4_I(inode)->i_disksize)
+			EXT4_I(inode)->i_disksize = disksize;
 		err2 = ext4_mark_inode_dirty(handle, inode);
+		up_write(&EXT4_I(inode)->i_data_sem);
 		if (err2)
 			ext4_error(inode->i_sb,
 				   "Failed to mark inode %lu dirty",
@@ -2295,6 +2320,7 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd)
 	struct address_space *mapping = mpd->inode->i_mapping;
 	struct pagevec pvec;
 	unsigned int nr_pages;
+	long left = mpd->wbc->nr_to_write;
 	pgoff_t index = mpd->first_page;
 	pgoff_t end = mpd->last_page;
 	int tag;
@@ -2330,6 +2356,17 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd)
 			if (page->index > end)
 				goto out;
 
+			/*
+			 * Accumulated enough dirty pages? This doesn't apply
+			 * to WB_SYNC_ALL mode. For integrity sync we have to
+			 * keep going because someone may be concurrently
+			 * dirtying pages, and we might have synced a lot of
+			 * newly appeared dirty pages, but have not synced all
+			 * of the old dirty pages.
+			 */
+			if (mpd->wbc->sync_mode == WB_SYNC_NONE && left <= 0)
+				goto out;
+
 			/* If we can't merge this page, we are done. */
 			if (mpd->map.m_len > 0 && mpd->next_page != page->index)
 				goto out;
@@ -2364,19 +2401,7 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd)
 			if (err <= 0)
 				goto out;
 			err = 0;
-
-			/*
-			 * Accumulated enough dirty pages? This doesn't apply
-			 * to WB_SYNC_ALL mode. For integrity sync we have to
-			 * keep going because someone may be concurrently
-			 * dirtying pages, and we might have synced a lot of
-			 * newly appeared dirty pages, but have not synced all
-			 * of the old dirty pages.
-			 */
-			if (mpd->wbc->sync_mode == WB_SYNC_NONE &&
-			    mpd->next_page - mpd->first_page >=
-							mpd->wbc->nr_to_write)
-				goto out;
+			left--;
 		}
 		pagevec_release(&pvec);
 		cond_resched();
@@ -2420,16 +2445,15 @@ static int ext4_writepages(struct address_space *mapping,
 	 * because that could violate lock ordering on umount
 	 */
 	if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
-		return 0;
+		goto out_writepages;
 
 	if (ext4_should_journal_data(inode)) {
 		struct blk_plug plug;
-		int ret;
 
 		blk_start_plug(&plug);
 		ret = write_cache_pages(mapping, wbc, __writepage, mapping);
 		blk_finish_plug(&plug);
-		return ret;
+		goto out_writepages;
 	}
 
 	/*
@@ -2442,8 +2466,10 @@ static int ext4_writepages(struct address_space *mapping,
 	 * *never* be called, so if that ever happens, we would want
 	 * the stack trace.
 	 */
-	if (unlikely(sbi->s_mount_flags & EXT4_MF_FS_ABORTED))
-		return -EROFS;
+	if (unlikely(sbi->s_mount_flags & EXT4_MF_FS_ABORTED)) {
+		ret = -EROFS;
+		goto out_writepages;
+	}
 
 	if (ext4_should_dioread_nolock(inode)) {
 		/*
@@ -2563,7 +2589,7 @@ retry:
 			break;
 	}
 	blk_finish_plug(&plug);
-	if (!ret && !cycled) {
+	if (!ret && !cycled && wbc->nr_to_write > 0) {
 		cycled = 1;
 		mpd.last_page = writeback_index - 1;
 		mpd.first_page = 0;
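
Two writeback fixes sit above: the "accumulated enough dirty pages?" check moves before page processing and charges the budget per page actually scanned (left--) rather than by index arithmetic, which overestimated progress on files with sparse dirty ranges; and the wrap-around rescan (cycled) now only runs while some nr_to_write budget remains. A small C model of the per-page budget accounting (illustrative only, not ext4 code):

	#include <stdbool.h>
	#include <stdio.h>

	enum sync_mode { WB_SYNC_NONE, WB_SYNC_ALL };

	static int write_dirty_pages(const bool *dirty, int npages,
				     enum sync_mode mode, long nr_to_write)
	{
		long left = nr_to_write;
		int written = 0;

		for (int i = 0; i < npages; i++) {
			if (!dirty[i])
				continue;	/* index gaps cost nothing */
			if (mode == WB_SYNC_NONE && left <= 0)
				break;		/* integrity sync must keep going */
			written++;
			left--;
		}
		return written;
	}

	int main(void)
	{
		bool dirty[10] = { [0] = true, [9] = true };	/* sparse dirty range */
		/* index-based accounting would have stopped before page 9 */
		printf("%d\n", write_dirty_pages(dirty, 10, WB_SYNC_NONE, 2));
		return 0;
	}
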
@@ -3052,9 +3078,9 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
  * preallocated extents, and those write extend the file, no need to
  * fall back to buffered IO.
  *
- * For holes, we fallocate those blocks, mark them as uninitialized
+ * For holes, we fallocate those blocks, mark them as unwritten
  * If those blocks were preallocated, we mark sure they are split, but
- * still keep the range to write as uninitialized.
+ * still keep the range to write as unwritten.
  *
  * The unwritten extents will be converted to written when DIO is completed.
  * For async direct IO, since the IO may still pending when return, we
@@ -3067,13 +3093,12 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
  *
  */
 static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
-			      const struct iovec *iov, loff_t offset,
-			      unsigned long nr_segs)
+			      struct iov_iter *iter, loff_t offset)
 {
 	struct file *file = iocb->ki_filp;
 	struct inode *inode = file->f_mapping->host;
 	ssize_t ret;
-	size_t count = iov_length(iov, nr_segs);
+	size_t count = iov_iter_count(iter);
 	int overwrite = 0;
 	get_block_t *get_block_func = NULL;
 	int dio_flags = 0;
@@ -3082,7 +3107,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
 
 	/* Use the old path for reads and writes beyond i_size. */
 	if (rw != WRITE || final_size > inode->i_size)
-		return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs);
+		return ext4_ind_direct_IO(rw, iocb, iter, offset);
 
 	BUG_ON(iocb->private == NULL);
@@ -3106,12 +3131,12 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
 	 * We could direct write to holes and fallocate.
 	 *
 	 * Allocated blocks to fill the hole are marked as
-	 * uninitialized to prevent parallel buffered read to expose
+	 * unwritten to prevent parallel buffered read to expose
 	 * the stale data before DIO complete the data IO.
 	 *
 	 * As to previously fallocated extents, ext4 get_block will
 	 * just simply mark the buffer mapped but still keep the
-	 * extents uninitialized.
+	 * extents unwritten.
 	 *
 	 * For non AIO case, we will convert those unwritten extents
 	 * to written after return back from blockdev_direct_IO.
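
The direct IO entry points above are converted from an (iov, nr_segs) pair to a single struct iov_iter, with byte counts taken from iov_iter_count() instead of iov_length(). A simplified userspace model of that bundling (struct iov_iter_model is an invented name; the kernel's struct iov_iter carries much more state, such as a position and type):

	#include <stddef.h>
	#include <stdio.h>
	#include <sys/uio.h>

	/* Bundle the iovec array with its segment count so callees take
	 * one argument and query the total byte count from the iterator. */
	struct iov_iter_model {
		const struct iovec *iov;
		unsigned long nr_segs;
	};

	static size_t iter_count(const struct iov_iter_model *i)
	{
		size_t n = 0;
		for (unsigned long s = 0; s < i->nr_segs; s++)
			n += i->iov[s].iov_len;
		return n;
	}

	int main(void)
	{
		char a[16], b[32];
		struct iovec v[2] = { { a, sizeof a }, { b, sizeof b } };
		struct iov_iter_model it = { v, 2 };
		printf("%zu bytes\n", iter_count(&it));	/* 48 bytes */
		return 0;
	}
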
@@ -3149,8 +3174,8 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
 		dio_flags = DIO_LOCKING;
 	}
 	ret = __blockdev_direct_IO(rw, iocb, inode,
-				   inode->i_sb->s_bdev, iov,
-				   offset, nr_segs,
+				   inode->i_sb->s_bdev, iter,
+				   offset,
 				   get_block_func,
 				   ext4_end_io_dio,
 				   NULL,
@@ -3204,11 +3229,11 @@ retake_lock:
 }
 
 static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
-			      const struct iovec *iov, loff_t offset,
-			      unsigned long nr_segs)
+			      struct iov_iter *iter, loff_t offset)
 {
 	struct file *file = iocb->ki_filp;
 	struct inode *inode = file->f_mapping->host;
+	size_t count = iov_iter_count(iter);
 	ssize_t ret;
 
 	/*
@@ -3221,13 +3246,12 @@ static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
 	if (ext4_has_inline_data(inode))
 		return 0;
 
-	trace_ext4_direct_IO_enter(inode, offset, iov_length(iov, nr_segs), rw);
+	trace_ext4_direct_IO_enter(inode, offset, count, rw);
 	if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
-		ret = ext4_ext_direct_IO(rw, iocb, iov, offset, nr_segs);
+		ret = ext4_ext_direct_IO(rw, iocb, iter, offset);
 	else
-		ret = ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs);
-	trace_ext4_direct_IO_exit(inode, offset,
-				iov_length(iov, nr_segs), rw, ret);
+		ret = ext4_ind_direct_IO(rw, iocb, iter, offset);
+	trace_ext4_direct_IO_exit(inode, offset, count, rw, ret);
 	return ret;
 }
@@ -3320,33 +3344,13 @@ void ext4_set_aops(struct inode *inode)
 }
 
 /*
- * ext4_block_truncate_page() zeroes out a mapping from file offset `from'
- * up to the end of the block which corresponds to `from'.
- * This required during truncate. We need to physically zero the tail end
- * of that block so it doesn't yield old data if the file is later grown.
- */
-int ext4_block_truncate_page(handle_t *handle,
-		struct address_space *mapping, loff_t from)
-{
-	unsigned offset = from & (PAGE_CACHE_SIZE-1);
-	unsigned length;
-	unsigned blocksize;
-	struct inode *inode = mapping->host;
-
-	blocksize = inode->i_sb->s_blocksize;
-	length = blocksize - (offset & (blocksize - 1));
-
-	return ext4_block_zero_page_range(handle, mapping, from, length);
-}
-
-/*
  * ext4_block_zero_page_range() zeros out a mapping of length 'length'
  * starting from file offset 'from'.  The range to be zero'd must
  * be contained with in one block.  If the specified range exceeds
  * the end of the block it will be shortened to end of the block
  * that cooresponds to 'from'
  */
-int ext4_block_zero_page_range(handle_t *handle,
+static int ext4_block_zero_page_range(handle_t *handle,
 		struct address_space *mapping, loff_t from, loff_t length)
 {
 	ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT;
@@ -3436,6 +3440,26 @@ unlock:
 	return err;
 }
 
+/*
+ * ext4_block_truncate_page() zeroes out a mapping from file offset `from'
+ * up to the end of the block which corresponds to `from'.
+ * This required during truncate. We need to physically zero the tail end
+ * of that block so it doesn't yield old data if the file is later grown.
+ */
+static int ext4_block_truncate_page(handle_t *handle,
+		struct address_space *mapping, loff_t from)
+{
+	unsigned offset = from & (PAGE_CACHE_SIZE-1);
+	unsigned length;
+	unsigned blocksize;
+	struct inode *inode = mapping->host;
+
+	blocksize = inode->i_sb->s_blocksize;
+	length = blocksize - (offset & (blocksize - 1));
+
+	return ext4_block_zero_page_range(handle, mapping, from, length);
+}
+
 int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode,
 			     loff_t lstart, loff_t length)
 {
@@ -3509,12 +3533,7 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
 	if (!S_ISREG(inode->i_mode))
 		return -EOPNOTSUPP;
 
-	if (EXT4_SB(sb)->s_cluster_ratio > 1) {
-		/* TODO: Add support for bigalloc file systems */
-		return -EOPNOTSUPP;
-	}
-
-	trace_ext4_punch_hole(inode, offset, length);
+	trace_ext4_punch_hole(inode, offset, length, 0);
 
 	/*
 	 * Write out all dirty pages to avoid race conditions
@@ -3528,15 +3547,6 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
 	}
 
 	mutex_lock(&inode->i_mutex);
-	/* It's not possible punch hole on append only file */
-	if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) {
-		ret = -EPERM;
-		goto out_mutex;
-	}
-	if (IS_SWAPFILE(inode)) {
-		ret = -ETXTBSY;
-		goto out_mutex;
-	}
 
 	/* No need to punch hole beyond i_size */
 	if (offset >= inode->i_size)
@@ -3617,10 +3627,15 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
 		ret = ext4_free_hole_blocks(handle, inode, first_block,
 					    stop_block);
 
-	ext4_discard_preallocations(inode);
 	up_write(&EXT4_I(inode)->i_data_sem);
 	if (IS_SYNC(inode))
 		ext4_handle_sync(handle);
+
+	/* Now release the pages again to reduce race window */
+	if (last_block_offset > first_block_offset)
+		truncate_pagecache_range(inode, first_block_offset,
+					 last_block_offset);
+
 	inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
 	ext4_mark_inode_dirty(handle, inode);
 out_stop:
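
ext4_block_truncate_page(), moved below its helper and made static above, zeroes from `from` to the end of the containing block so stale bytes never reappear if the file later grows back over them. The length math, extracted as a runnable illustration (simplified to a power-of-two block size; the kernel version first masks the offset by the page size):

	#include <stdio.h>

	/* Bytes from `from` to the end of its block, for power-of-two
	 * block sizes: blocksize - (from mod blocksize). */
	static unsigned tail_len(unsigned long long from, unsigned blocksize)
	{
		unsigned offset = from & (blocksize - 1);	/* offset within block */
		return blocksize - offset;			/* bytes to zero */
	}

	int main(void)
	{
		/* truncating to 5000 bytes with 4096-byte blocks: zero the
		 * remaining 3192 bytes of the second block */
		printf("%u\n", tail_len(5000, 4096));
		return 0;
	}
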
@@ -3694,7 +3709,7 @@ void ext4_truncate(struct inode *inode)
 
 	/*
 	 * There is a possibility that we're either freeing the inode
-	 * or it completely new indode. In those cases we might not
+	 * or it's a completely new inode. In those cases we might not
 	 * have i_mutex locked because it's not necessary.
 	 */
 	if (!(inode->i_state & (I_NEW|I_FREEING)))
@@ -3934,18 +3949,20 @@ int ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc)
 void ext4_set_inode_flags(struct inode *inode)
 {
 	unsigned int flags = EXT4_I(inode)->i_flags;
+	unsigned int new_fl = 0;
 
-	inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC);
 	if (flags & EXT4_SYNC_FL)
-		inode->i_flags |= S_SYNC;
+		new_fl |= S_SYNC;
 	if (flags & EXT4_APPEND_FL)
-		inode->i_flags |= S_APPEND;
+		new_fl |= S_APPEND;
 	if (flags & EXT4_IMMUTABLE_FL)
-		inode->i_flags |= S_IMMUTABLE;
+		new_fl |= S_IMMUTABLE;
 	if (flags & EXT4_NOATIME_FL)
-		inode->i_flags |= S_NOATIME;
+		new_fl |= S_NOATIME;
 	if (flags & EXT4_DIRSYNC_FL)
-		inode->i_flags |= S_DIRSYNC;
+		new_fl |= S_DIRSYNC;
+	inode_set_flags(inode, new_fl,
+			S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC);
 }
 
 /* Propagate flags from i_flags to EXT4_I(inode)->i_flags */
@@ -4164,11 +4181,13 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
 	EXT4_INODE_GET_XTIME(i_atime, inode, raw_inode);
 	EXT4_EINODE_GET_XTIME(i_crtime, ei, raw_inode);
 
-	inode->i_version = le32_to_cpu(raw_inode->i_disk_version);
-	if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
-		if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi))
-			inode->i_version |=
-			(__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32;
+	if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) {
+		inode->i_version = le32_to_cpu(raw_inode->i_disk_version);
+		if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
+			if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi))
+				inode->i_version |=
+		    (__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32;
+		}
 	}
 
 	ret = 0;
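
ext4_set_inode_flags() above now accumulates the new bits in new_fl and applies them through inode_set_flags() under an explicit mask, instead of clearing i_flags and re-setting bits in place, which briefly exposed a window where all five flags read as clear to concurrent readers. Roughly, a masked compare-and-swap update, sketched here in userspace with C11 atomics (a model of the pattern, not the kernel helper itself):

	#include <stdatomic.h>
	#include <stdio.h>

	/* Atomically update only the bits in `mask` to `new_fl`, so readers
	 * never observe the masked bits momentarily cleared. */
	static void set_mask_flags(_Atomic unsigned *flags, unsigned new_fl,
				   unsigned mask)
	{
		unsigned old = atomic_load(flags);
		unsigned want;
		do {
			want = (old & ~mask) | (new_fl & mask);
		} while (!atomic_compare_exchange_weak(flags, &old, want));
	}

	int main(void)
	{
		_Atomic unsigned i_flags = 0x0f;	/* pre-existing bits */
		set_mask_flags(&i_flags, 0x30, 0xf0);	/* replace the high nibble */
		printf("0x%02x\n", atomic_load(&i_flags));	/* 0x3f */
		return 0;
	}
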
@@ -4291,12 +4310,15 @@ static int ext4_do_update_inode(handle_t *handle,
 	struct ext4_inode *raw_inode = ext4_raw_inode(iloc);
 	struct ext4_inode_info *ei = EXT4_I(inode);
 	struct buffer_head *bh = iloc->bh;
+	struct super_block *sb = inode->i_sb;
 	int err = 0, rc, block;
-	int need_datasync = 0;
+	int need_datasync = 0, set_large_file = 0;
 	uid_t i_uid;
 	gid_t i_gid;
 
-	/* For fields not not tracking in the in-memory inode,
+	spin_lock(&ei->i_raw_lock);
+
+	/* For fields not tracked in the in-memory inode,
 	 * initialise them to zero for new inodes. */
 	if (ext4_test_inode_state(inode, EXT4_STATE_NEW))
 		memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size);
@@ -4334,12 +4356,13 @@ static int ext4_do_update_inode(handle_t *handle,
 	EXT4_INODE_SET_XTIME(i_atime, inode, raw_inode);
 	EXT4_EINODE_SET_XTIME(i_crtime, ei, raw_inode);
 
-	if (ext4_inode_blocks_set(handle, raw_inode, ei))
+	if (ext4_inode_blocks_set(handle, raw_inode, ei)) {
+		spin_unlock(&ei->i_raw_lock);
 		goto out_brelse;
+	}
 	raw_inode->i_dtime = cpu_to_le32(ei->i_dtime);
 	raw_inode->i_flags = cpu_to_le32(ei->i_flags & 0xFFFFFFFF);
-	if (EXT4_SB(inode->i_sb)->s_es->s_creator_os !=
-	    cpu_to_le32(EXT4_OS_HURD))
+	if (likely(!test_opt2(inode->i_sb, HURD_COMPAT)))
 		raw_inode->i_file_acl_high =
 			cpu_to_le16(ei->i_file_acl >> 32);
 	raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl);
@@ -4348,24 +4371,11 @@ static int ext4_do_update_inode(handle_t *handle,
 		need_datasync = 1;
 	}
 	if (ei->i_disksize > 0x7fffffffULL) {
-		struct super_block *sb = inode->i_sb;
 		if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
 				EXT4_FEATURE_RO_COMPAT_LARGE_FILE) ||
 				EXT4_SB(sb)->s_es->s_rev_level ==
-				cpu_to_le32(EXT4_GOOD_OLD_REV)) {
-			/* If this is the first large file
-			 * created, add a flag to the superblock.
-			 */
-			err = ext4_journal_get_write_access(handle,
-					EXT4_SB(sb)->s_sbh);
-			if (err)
-				goto out_brelse;
-			ext4_update_dynamic_rev(sb);
-			EXT4_SET_RO_COMPAT_FEATURE(sb,
-					EXT4_FEATURE_RO_COMPAT_LARGE_FILE);
-			ext4_handle_sync(handle);
-			err = ext4_handle_dirty_super(handle, sb);
-		}
+		    cpu_to_le32(EXT4_GOOD_OLD_REV))
+			set_large_file = 1;
 	}
 	raw_inode->i_generation = cpu_to_le32(inode->i_generation);
 	if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {
@@ -4384,22 +4394,37 @@ static int ext4_do_update_inode(handle_t *handle,
 			raw_inode->i_block[block] = ei->i_data[block];
 	}
 
-	raw_inode->i_disk_version = cpu_to_le32(inode->i_version);
-	if (ei->i_extra_isize) {
-		if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi))
-			raw_inode->i_version_hi =
-			cpu_to_le32(inode->i_version >> 32);
-		raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize);
+	if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) {
+		raw_inode->i_disk_version = cpu_to_le32(inode->i_version);
+		if (ei->i_extra_isize) {
+			if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi))
+				raw_inode->i_version_hi =
+					cpu_to_le32(inode->i_version >> 32);
+			raw_inode->i_extra_isize =
+				cpu_to_le16(ei->i_extra_isize);
+		}
 	}
 
 	ext4_inode_csum_set(inode, raw_inode, ei);
 
+	spin_unlock(&ei->i_raw_lock);
+
 	BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
 	rc = ext4_handle_dirty_metadata(handle, NULL, bh);
 	if (!err)
 		err = rc;
 	ext4_clear_inode_state(inode, EXT4_STATE_NEW);
-
+	if (set_large_file) {
+		BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get write access");
+		err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh);
+		if (err)
+			goto out_brelse;
+		ext4_update_dynamic_rev(sb);
+		EXT4_SET_RO_COMPAT_FEATURE(sb,
+					   EXT4_FEATURE_RO_COMPAT_LARGE_FILE);
+		ext4_handle_sync(handle);
+		err = ext4_handle_dirty_super(handle, sb);
+	}
 	ext4_update_inode_fsync_trans(handle, inode, need_datasync);
 out_brelse:
 	brelse(bh);
@@ -4412,21 +4437,20 @@ out_brelse:
  *
  * We are called from a few places:
  *
- * - Within generic_file_write() for O_SYNC files.
+ * - Within generic_file_aio_write() -> generic_write_sync() for O_SYNC files.
  *   Here, there will be no transaction running. We wait for any running
  *   transaction to commit.
  *
- * - Within sys_sync(), kupdate and such.
- *   We wait on commit, if tol to.
+ * - Within flush work (sys_sync(), kupdate and such).
+ *   We wait on commit, if told to.
 *
- * - Within prune_icache() (PF_MEMALLOC == true)
- *   Here we simply return.  We can't afford to block kswapd on the
- *   journal commit.
+ * - Within iput_final() -> write_inode_now()
+ *   We wait on commit, if told to.
 *
 * In all cases it is actually safe for us to return without doing anything,
 * because the inode has been copied into a raw inode buffer in
- * ext4_mark_inode_dirty().  This is a correctness thing for O_SYNC and for
- * knfsd.
+ * ext4_mark_inode_dirty().  This is a correctness thing for WB_SYNC_ALL
+ * writeback.
 *
 * Note that we are absolutely dependent upon all inode dirtiers doing the
 * right thing: they *must* call mark_inode_dirty() after dirtying info in
@@ -4438,15 +4462,15 @@ out_brelse:
 *	stuff();
 *	inode->i_size = expr;
 *
- * is in error because a kswapd-driven write_inode() could occur while
- * `stuff()' is running, and the new i_size will be lost.  Plus the inode
- * will no longer be on the superblock's dirty inode list.
+ * is in error because write_inode() could occur while `stuff()' is running,
+ * and the new i_size will be lost.  Plus the inode will no longer be on the
+ * superblock's dirty inode list.
 */
 int ext4_write_inode(struct inode *inode, struct writeback_control *wbc)
 {
 	int err;
 
-	if (current->flags & PF_MEMALLOC)
+	if (WARN_ON_ONCE(current->flags & PF_MEMALLOC))
 		return 0;
 
 	if (EXT4_SB(inode->i_sb)->s_journal) {
@@ -4456,7 +4480,12 @@ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc)
 			return -EIO;
 		}
 
-		if (wbc->sync_mode != WB_SYNC_ALL)
+		/*
+		 * No need to force transaction in WB_SYNC_NONE mode. Also
+		 * ext4_sync_fs() will force the commit after everything is
+		 * written.
+		 */
+		if (wbc->sync_mode != WB_SYNC_ALL || wbc->for_sync)
 			return 0;
 
 		err = ext4_force_commit(inode->i_sb);
@@ -4466,7 +4495,11 @@ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc)
 		err = __ext4_get_inode_loc(inode, &iloc, 0);
 		if (err)
 			return err;
-		if (wbc->sync_mode == WB_SYNC_ALL)
+		/*
+		 * sync(2) will flush the whole buffer cache. No need to do
+		 * it here separately for each inode.
+		 */
+		if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync)
 			sync_dirty_buffer(iloc.bh);
 		if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) {
 			EXT4_ERROR_INODE_BLOCK(inode, iloc.bh->b_blocknr,
@@ -4594,6 +4627,10 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
 			if (attr->ia_size > sbi->s_bitmap_maxbytes)
 				return -EFBIG;
 		}
+
+		if (IS_I_VERSION(inode) && attr->ia_size != inode->i_size)
+			inode_inc_iversion(inode);
+
 		if (S_ISREG(inode->i_mode) &&
 		    (attr->ia_size < inode->i_size)) {
 			if (ext4_should_order_data(inode)) {
@@ -4671,7 +4708,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
 		ext4_orphan_del(NULL, inode);
 
 	if (!rc && (ia_valid & ATTR_MODE))
-		rc = ext4_acl_chmod(inode);
+		rc = posix_acl_chmod(inode, inode->i_mode);
 
 err_out:
 	ext4_std_error(inode->i_sb, error);
@@ -4690,6 +4727,15 @@ int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,
 	generic_fillattr(inode, stat);
 
 	/*
+	 * If there is inline data in the inode, the inode will normally not
+	 * have data blocks allocated (it may have an external xattr block).
+	 * Report at least one sector for such files, so tools like tar, rsync,
+	 * others doen't incorrectly think the file is completely sparse.
+	 */
+	if (unlikely(ext4_has_inline_data(inode)))
+		stat->blocks += (stat->size + 511) >> 9;
+
+	/*
 	 * We can't update i_blocks if the block allocation is delayed
 	 * otherwise in the case of system crash before the real block
 	 * allocation is done, we will have i_blocks inconsistent with
@@ -4700,9 +4746,8 @@ int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,
 	 * blocks for this file.
 	 */
 	delalloc_blocks = EXT4_C2B(EXT4_SB(inode->i_sb),
-				EXT4_I(inode)->i_reserved_data_blocks);
-
-	stat->blocks += delalloc_blocks << (inode->i_sb->s_blocksize_bits-9);
+				   EXT4_I(inode)->i_reserved_data_blocks);
+	stat->blocks += delalloc_blocks << (inode->i_sb->s_blocksize_bits - 9);
 
 	return 0;
 }
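
The final getattr hunk makes inline-data files report at least their rounded-up sector count, since such files usually have no allocated data blocks and would otherwise look completely sparse to tools like tar and rsync. The rounding, as a standalone illustration:

	#include <stdio.h>

	/* Round a byte size up to 512-byte sectors: ceil(size / 512). */
	static unsigned long long size_to_sectors(unsigned long long size)
	{
		return (size + 511) >> 9;
	}

	int main(void)
	{
		printf("%llu\n", size_to_sectors(60));		/* 1: tiny inline file */
		printf("%llu\n", size_to_sectors(1024));	/* 2 */
		return 0;
	}
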
