Diffstat (limited to 'fs/ext4/extents.c')
| -rw-r--r-- | fs/ext4/extents.c | 4207 |
1 file changed, 2908 insertions, 1299 deletions
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 0554c48cb1f..4da228a0e6d 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -29,7 +29,6 @@   *   - smart tree reduction   */ -#include <linux/module.h>  #include <linux/fs.h>  #include <linux/time.h>  #include <linux/jbd2.h> @@ -38,11 +37,81 @@  #include <linux/quotaops.h>  #include <linux/string.h>  #include <linux/slab.h> -#include <linux/falloc.h>  #include <asm/uaccess.h>  #include <linux/fiemap.h>  #include "ext4_jbd2.h"  #include "ext4_extents.h" +#include "xattr.h" + +#include <trace/events/ext4.h> + +/* + * used by extent splitting. + */ +#define EXT4_EXT_MAY_ZEROOUT	0x1  /* safe to zeroout if split fails \ +					due to ENOSPC */ +#define EXT4_EXT_MARK_UNWRIT1	0x2  /* mark first half unwritten */ +#define EXT4_EXT_MARK_UNWRIT2	0x4  /* mark second half unwritten */ + +#define EXT4_EXT_DATA_VALID1	0x8  /* first half contains valid data */ +#define EXT4_EXT_DATA_VALID2	0x10 /* second half contains valid data */ + +static __le32 ext4_extent_block_csum(struct inode *inode, +				     struct ext4_extent_header *eh) +{ +	struct ext4_inode_info *ei = EXT4_I(inode); +	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); +	__u32 csum; + +	csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)eh, +			   EXT4_EXTENT_TAIL_OFFSET(eh)); +	return cpu_to_le32(csum); +} + +static int ext4_extent_block_csum_verify(struct inode *inode, +					 struct ext4_extent_header *eh) +{ +	struct ext4_extent_tail *et; + +	if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, +		EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) +		return 1; + +	et = find_ext4_extent_tail(eh); +	if (et->et_checksum != ext4_extent_block_csum(inode, eh)) +		return 0; +	return 1; +} + +static void ext4_extent_block_csum_set(struct inode *inode, +				       struct ext4_extent_header *eh) +{ +	struct ext4_extent_tail *et; + +	if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, +		EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) +		return; + +	et = find_ext4_extent_tail(eh); +	et->et_checksum = ext4_extent_block_csum(inode, eh); +} + +static int ext4_split_extent(handle_t *handle, +				struct inode *inode, +				struct ext4_ext_path *path, +				struct ext4_map_blocks *map, +				int split_flag, +				int flags); + +static int ext4_split_extent_at(handle_t *handle, +			     struct inode *inode, +			     struct ext4_ext_path *path, +			     ext4_lblk_t split, +			     int split_flag, +			     int flags); + +static int ext4_find_delayed_extent(struct inode *inode, +				    struct extent_status *newes);  static int ext4_ext_truncate_extend_restart(handle_t *handle,  					    struct inode *inode, @@ -74,6 +143,7 @@ static int ext4_ext_get_access(handle_t *handle, struct inode *inode,  {  	if (path->p_bh) {  		/* path points to block */ +		BUFFER_TRACE(path->p_bh, "get_write_access");  		return ext4_journal_get_write_access(handle, path->p_bh);  	}  	/* path points to leaf/index in inode body */ @@ -87,13 +157,15 @@ static int ext4_ext_get_access(handle_t *handle, struct inode *inode,   *  - ENOMEM   *  - EIO   */ -static int ext4_ext_dirty(handle_t *handle, struct inode *inode, -				struct ext4_ext_path *path) +int __ext4_ext_dirty(const char *where, unsigned int line, handle_t *handle, +		     struct inode *inode, struct ext4_ext_path *path)  {  	int err;  	if (path->p_bh) { +		ext4_extent_block_csum_set(inode, ext_block_hdr(path->p_bh));  		/* path points to block */ -		err = ext4_handle_dirty_metadata(handle, inode, path->p_bh); +		err = __ext4_handle_dirty_metadata(where, line, handle, +						   inode, path->p_bh);  	} else 
{  		/* path points to leaf/index in inode body */  		err = ext4_mark_inode_dirty(handle, inode); @@ -105,23 +177,37 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,  			      struct ext4_ext_path *path,  			      ext4_lblk_t block)  { -	struct ext4_inode_info *ei = EXT4_I(inode); -	ext4_fsblk_t bg_start; -	ext4_fsblk_t last_block; -	ext4_grpblk_t colour; -	ext4_group_t block_group; -	int flex_size = ext4_flex_bg_size(EXT4_SB(inode->i_sb)); -	int depth; -  	if (path) { +		int depth = path->p_depth;  		struct ext4_extent *ex; -		depth = path->p_depth; -		/* try to predict block placement */ +		/* +		 * Try to predict block placement assuming that we are +		 * filling in a file which will eventually be +		 * non-sparse --- i.e., in the case of libbfd writing +		 * an ELF object sections out-of-order but in a way +		 * the eventually results in a contiguous object or +		 * executable file, or some database extending a table +		 * space file.  However, this is actually somewhat +		 * non-ideal if we are writing a sparse file such as +		 * qemu or KVM writing a raw image file that is going +		 * to stay fairly sparse, since it will end up +		 * fragmenting the file system's free space.  Maybe we +		 * should have some hueristics or some way to allow +		 * userspace to pass a hint to file system, +		 * especially if the latter case turns out to be +		 * common. +		 */  		ex = path[depth].p_ext; -		if (ex) -			return (ext4_ext_pblock(ex) + -				(block - le32_to_cpu(ex->ee_block))); +		if (ex) { +			ext4_fsblk_t ext_pblk = ext4_ext_pblock(ex); +			ext4_lblk_t ext_block = le32_to_cpu(ex->ee_block); + +			if (block > ext_block) +				return ext_pblk + (block - ext_block); +			else +				return ext_pblk - (ext_block - block); +		}  		/* it looks like index is empty;  		 * try to find starting block from index itself */ @@ -130,36 +216,7 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,  	}  	/* OK. use inode's group */ -	block_group = ei->i_block_group; -	if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) { -		/* -		 * If there are at least EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME -		 * block groups per flexgroup, reserve the first block -		 * group for directories and special files.  Regular -		 * files will start at the second block group.  This -		 * tends to speed up directory access and improves -		 * fsck times. -		 */ -		block_group &= ~(flex_size-1); -		if (S_ISREG(inode->i_mode)) -			block_group++; -	} -	bg_start = ext4_group_first_block_no(inode->i_sb, block_group); -	last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1; - -	/* -	 * If we are doing delayed allocation, we don't need take -	 * colour into account. 
-	 */ -	if (test_opt(inode->i_sb, DELALLOC)) -		return bg_start; - -	if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block) -		colour = (current->pid % 16) * -			(EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16); -	else -		colour = (current->pid % 16) * ((last_block - bg_start) / 16); -	return bg_start + colour + block; +	return ext4_inode_to_goal_block(inode);  }  /* @@ -168,12 +225,13 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,  static ext4_fsblk_t  ext4_ext_new_meta_block(handle_t *handle, struct inode *inode,  			struct ext4_ext_path *path, -			struct ext4_extent *ex, int *err) +			struct ext4_extent *ex, int *err, unsigned int flags)  {  	ext4_fsblk_t goal, newblock;  	goal = ext4_ext_find_goal(inode, path, le32_to_cpu(ex->ee_block)); -	newblock = ext4_new_meta_blocks(handle, inode, goal, NULL, err); +	newblock = ext4_new_meta_blocks(handle, inode, goal, flags, +					NULL, err);  	return newblock;  } @@ -183,12 +241,10 @@ static inline int ext4_ext_space_block(struct inode *inode, int check)  	size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))  			/ sizeof(struct ext4_extent); -	if (!check) {  #ifdef AGGRESSIVE_TEST -		if (size > 6) -			size = 6; +	if (!check && size > 6) +		size = 6;  #endif -	}  	return size;  } @@ -198,12 +254,10 @@ static inline int ext4_ext_space_block_idx(struct inode *inode, int check)  	size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))  			/ sizeof(struct ext4_extent_idx); -	if (!check) {  #ifdef AGGRESSIVE_TEST -		if (size > 5) -			size = 5; +	if (!check && size > 5) +		size = 5;  #endif -	}  	return size;  } @@ -214,12 +268,10 @@ static inline int ext4_ext_space_root(struct inode *inode, int check)  	size = sizeof(EXT4_I(inode)->i_data);  	size -= sizeof(struct ext4_extent_header);  	size /= sizeof(struct ext4_extent); -	if (!check) {  #ifdef AGGRESSIVE_TEST -		if (size > 3) -			size = 3; +	if (!check && size > 3) +		size = 3;  #endif -	}  	return size;  } @@ -230,12 +282,10 @@ static inline int ext4_ext_space_root_idx(struct inode *inode, int check)  	size = sizeof(EXT4_I(inode)->i_data);  	size -= sizeof(struct ext4_extent_header);  	size /= sizeof(struct ext4_extent_idx); -	if (!check) {  #ifdef AGGRESSIVE_TEST -		if (size > 4) -			size = 4; +	if (!check && size > 4) +		size = 4;  #endif -	}  	return size;  } @@ -244,10 +294,10 @@ static inline int ext4_ext_space_root_idx(struct inode *inode, int check)   * to allocate @blocks   * Worse case is one block per extent   */ -int ext4_ext_calc_metadata_amount(struct inode *inode, sector_t lblock) +int ext4_ext_calc_metadata_amount(struct inode *inode, ext4_lblk_t lblock)  {  	struct ext4_inode_info *ei = EXT4_I(inode); -	int idxs, num = 0; +	int idxs;  	idxs = ((inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))  		/ sizeof(struct ext4_extent_idx)); @@ -262,6 +312,8 @@ int ext4_ext_calc_metadata_amount(struct inode *inode, sector_t lblock)  	 */  	if (ei->i_da_metadata_calc_len &&  	    ei->i_da_metadata_calc_last_lblock+1 == lblock) { +		int num = 0; +  		if ((ei->i_da_metadata_calc_len % idxs) == 0)  			num++;  		if ((ei->i_da_metadata_calc_len % (idxs*idxs)) == 0) @@ -308,7 +360,11 @@ static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext)  {  	ext4_fsblk_t block = ext4_ext_pblock(ext);  	int len = ext4_ext_get_actual_len(ext); +	ext4_lblk_t lblock = le32_to_cpu(ext->ee_block); +	ext4_lblk_t last = lblock + len - 1; +	if (lblock > last) +		return 0;  	return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, len);  } 
@@ -324,8 +380,6 @@ static int ext4_valid_extent_entries(struct inode *inode,  				struct ext4_extent_header *eh,  				int depth)  { -	struct ext4_extent *ext; -	struct ext4_extent_idx *ext_idx;  	unsigned short entries;  	if (eh->eh_entries == 0)  		return 1; @@ -334,15 +388,30 @@ static int ext4_valid_extent_entries(struct inode *inode,  	if (depth == 0) {  		/* leaf entries */ -		ext = EXT_FIRST_EXTENT(eh); +		struct ext4_extent *ext = EXT_FIRST_EXTENT(eh); +		struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es; +		ext4_fsblk_t pblock = 0; +		ext4_lblk_t lblock = 0; +		ext4_lblk_t prev = 0; +		int len = 0;  		while (entries) {  			if (!ext4_valid_extent(inode, ext))  				return 0; + +			/* Check for overlapping extents */ +			lblock = le32_to_cpu(ext->ee_block); +			len = ext4_ext_get_actual_len(ext); +			if ((lblock <= prev) && prev) { +				pblock = ext4_ext_pblock(ext); +				es->s_last_error_block = cpu_to_le64(pblock); +				return 0; +			}  			ext++;  			entries--; +			prev = lblock + len - 1;  		}  	} else { -		ext_idx = EXT_FIRST_INDEX(eh); +		struct ext4_extent_idx *ext_idx = EXT_FIRST_INDEX(eh);  		while (entries) {  			if (!ext4_valid_extent_idx(inode, ext_idx))  				return 0; @@ -355,7 +424,7 @@ static int ext4_valid_extent_entries(struct inode *inode,  static int __ext4_ext_check(const char *function, unsigned int line,  			    struct inode *inode, struct ext4_extent_header *eh, -			    int depth) +			    int depth, ext4_fsblk_t pblk)  {  	const char *error_msg;  	int max = 0; @@ -385,25 +454,158 @@ static int __ext4_ext_check(const char *function, unsigned int line,  		error_msg = "invalid extent entries";  		goto corrupted;  	} +	/* Verify checksum on non-root extent tree nodes */ +	if (ext_depth(inode) != depth && +	    !ext4_extent_block_csum_verify(inode, eh)) { +		error_msg = "extent tree corrupted"; +		goto corrupted; +	}  	return 0;  corrupted:  	ext4_error_inode(inode, function, line, 0, -			"bad header/extent: %s - magic %x, " -			"entries %u, max %u(%u), depth %u(%u)", -			error_msg, le16_to_cpu(eh->eh_magic), -			le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max), -			max, le16_to_cpu(eh->eh_depth), depth); - +			 "pblk %llu bad header/extent: %s - magic %x, " +			 "entries %u, max %u(%u), depth %u(%u)", +			 (unsigned long long) pblk, error_msg, +			 le16_to_cpu(eh->eh_magic), +			 le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max), +			 max, le16_to_cpu(eh->eh_depth), depth);  	return -EIO;  } -#define ext4_ext_check(inode, eh, depth)	\ -	__ext4_ext_check(__func__, __LINE__, inode, eh, depth) +#define ext4_ext_check(inode, eh, depth, pblk)			\ +	__ext4_ext_check(__func__, __LINE__, (inode), (eh), (depth), (pblk))  int ext4_ext_check_inode(struct inode *inode)  { -	return ext4_ext_check(inode, ext_inode_hdr(inode), ext_depth(inode)); +	return ext4_ext_check(inode, ext_inode_hdr(inode), ext_depth(inode), 0); +} + +static struct buffer_head * +__read_extent_tree_block(const char *function, unsigned int line, +			 struct inode *inode, ext4_fsblk_t pblk, int depth, +			 int flags) +{ +	struct buffer_head		*bh; +	int				err; + +	bh = sb_getblk(inode->i_sb, pblk); +	if (unlikely(!bh)) +		return ERR_PTR(-ENOMEM); + +	if (!bh_uptodate_or_lock(bh)) { +		trace_ext4_ext_load_extent(inode, pblk, _RET_IP_); +		err = bh_submit_read(bh); +		if (err < 0) +			goto errout; +	} +	if (buffer_verified(bh) && !(flags & EXT4_EX_FORCE_CACHE)) +		return bh; +	err = __ext4_ext_check(function, line, inode, +			       ext_block_hdr(bh), depth, pblk); +	if (err) +		goto errout; +	
set_buffer_verified(bh); +	/* +	 * If this is a leaf block, cache all of its entries +	 */ +	if (!(flags & EXT4_EX_NOCACHE) && depth == 0) { +		struct ext4_extent_header *eh = ext_block_hdr(bh); +		struct ext4_extent *ex = EXT_FIRST_EXTENT(eh); +		ext4_lblk_t prev = 0; +		int i; + +		for (i = le16_to_cpu(eh->eh_entries); i > 0; i--, ex++) { +			unsigned int status = EXTENT_STATUS_WRITTEN; +			ext4_lblk_t lblk = le32_to_cpu(ex->ee_block); +			int len = ext4_ext_get_actual_len(ex); + +			if (prev && (prev != lblk)) +				ext4_es_cache_extent(inode, prev, +						     lblk - prev, ~0, +						     EXTENT_STATUS_HOLE); + +			if (ext4_ext_is_unwritten(ex)) +				status = EXTENT_STATUS_UNWRITTEN; +			ext4_es_cache_extent(inode, lblk, len, +					     ext4_ext_pblock(ex), status); +			prev = lblk + len; +		} +	} +	return bh; +errout: +	put_bh(bh); +	return ERR_PTR(err); + +} + +#define read_extent_tree_block(inode, pblk, depth, flags)		\ +	__read_extent_tree_block(__func__, __LINE__, (inode), (pblk),   \ +				 (depth), (flags)) + +/* + * This function is called to cache a file's extent information in the + * extent status tree + */ +int ext4_ext_precache(struct inode *inode) +{ +	struct ext4_inode_info *ei = EXT4_I(inode); +	struct ext4_ext_path *path = NULL; +	struct buffer_head *bh; +	int i = 0, depth, ret = 0; + +	if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) +		return 0;	/* not an extent-mapped inode */ + +	down_read(&ei->i_data_sem); +	depth = ext_depth(inode); + +	path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 1), +		       GFP_NOFS); +	if (path == NULL) { +		up_read(&ei->i_data_sem); +		return -ENOMEM; +	} + +	/* Don't cache anything if there are no external extent blocks */ +	if (depth == 0) +		goto out; +	path[0].p_hdr = ext_inode_hdr(inode); +	ret = ext4_ext_check(inode, path[0].p_hdr, depth, 0); +	if (ret) +		goto out; +	path[0].p_idx = EXT_FIRST_INDEX(path[0].p_hdr); +	while (i >= 0) { +		/* +		 * If this is a leaf block or we've reached the end of +		 * the index block, go up +		 */ +		if ((i == depth) || +		    path[i].p_idx > EXT_LAST_INDEX(path[i].p_hdr)) { +			brelse(path[i].p_bh); +			path[i].p_bh = NULL; +			i--; +			continue; +		} +		bh = read_extent_tree_block(inode, +					    ext4_idx_pblock(path[i].p_idx++), +					    depth - i - 1, +					    EXT4_EX_FORCE_CACHE); +		if (IS_ERR(bh)) { +			ret = PTR_ERR(bh); +			break; +		} +		i++; +		path[i].p_bh = bh; +		path[i].p_hdr = ext_block_hdr(bh); +		path[i].p_idx = EXT_FIRST_INDEX(path[i].p_hdr); +	} +	ext4_set_inode_state(inode, EXT4_STATE_EXT_PRECACHED); +out: +	up_read(&ei->i_data_sem); +	ext4_ext_drop_refs(path); +	kfree(path); +	return ret;  }  #ifdef EXT_DEBUG @@ -419,7 +621,7 @@ static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path)  		} else if (path->p_ext) {  			ext_debug("  %d:[%d]%d:%llu ",  				  le32_to_cpu(path->p_ext->ee_block), -				  ext4_ext_is_uninitialized(path->p_ext), +				  ext4_ext_is_unwritten(path->p_ext),  				  ext4_ext_get_actual_len(path->p_ext),  				  ext4_ext_pblock(path->p_ext));  		} else @@ -445,14 +647,48 @@ static void ext4_ext_show_leaf(struct inode *inode, struct ext4_ext_path *path)  	for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ex++) {  		ext_debug("%d:[%d]%d:%llu ", le32_to_cpu(ex->ee_block), -			  ext4_ext_is_uninitialized(ex), +			  ext4_ext_is_unwritten(ex),  			  ext4_ext_get_actual_len(ex), ext4_ext_pblock(ex));  	}  	ext_debug("\n");  } + +static void ext4_ext_show_move(struct inode *inode, struct ext4_ext_path *path, +			ext4_fsblk_t 
newblock, int level) +{ +	int depth = ext_depth(inode); +	struct ext4_extent *ex; + +	if (depth != level) { +		struct ext4_extent_idx *idx; +		idx = path[level].p_idx; +		while (idx <= EXT_MAX_INDEX(path[level].p_hdr)) { +			ext_debug("%d: move %d:%llu in new index %llu\n", level, +					le32_to_cpu(idx->ei_block), +					ext4_idx_pblock(idx), +					newblock); +			idx++; +		} + +		return; +	} + +	ex = path[depth].p_ext; +	while (ex <= EXT_MAX_EXTENT(path[depth].p_hdr)) { +		ext_debug("move %d:%llu:[%d]%d in new leaf %llu\n", +				le32_to_cpu(ex->ee_block), +				ext4_ext_pblock(ex), +				ext4_ext_is_unwritten(ex), +				ext4_ext_get_actual_len(ex), +				newblock); +		ex++; +	} +} +  #else  #define ext4_ext_show_path(inode, path)  #define ext4_ext_show_leaf(inode, path) +#define ext4_ext_show_move(inode, path, newblock, level)  #endif  void ext4_ext_drop_refs(struct ext4_ext_path *path) @@ -496,7 +732,7 @@ ext4_ext_binsearch_idx(struct inode *inode,  	}  	path->p_idx = l - 1; -	ext_debug("  -> %d->%lld ", le32_to_cpu(path->p_idx->ei_block), +	ext_debug("  -> %u->%lld ", le32_to_cpu(path->p_idx->ei_block),  		  ext4_idx_pblock(path->p_idx));  #ifdef CHECK_BINSEARCH @@ -567,7 +803,7 @@ ext4_ext_binsearch(struct inode *inode,  	ext_debug("  -> %d:%llu:[%d]%d ",  			le32_to_cpu(path->p_ext->ee_block),  			ext4_ext_pblock(path->p_ext), -			ext4_ext_is_uninitialized(path->p_ext), +			ext4_ext_is_unwritten(path->p_ext),  			ext4_ext_get_actual_len(path->p_ext));  #ifdef CHECK_BINSEARCH @@ -599,17 +835,17 @@ int ext4_ext_tree_init(handle_t *handle, struct inode *inode)  	eh->eh_magic = EXT4_EXT_MAGIC;  	eh->eh_max = cpu_to_le16(ext4_ext_space_root(inode, 0));  	ext4_mark_inode_dirty(handle, inode); -	ext4_ext_invalidate_cache(inode);  	return 0;  }  struct ext4_ext_path *  ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block, -					struct ext4_ext_path *path) +		     struct ext4_ext_path *path, int flags)  {  	struct ext4_extent_header *eh;  	struct buffer_head *bh;  	short int depth, i, ppos = 0, alloc = 0; +	int ret;  	eh = ext_inode_hdr(inode);  	depth = ext_depth(inode); @@ -628,8 +864,6 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,  	i = depth;  	/* walk through the tree */  	while (i) { -		int need_to_validate = 0; -  		ext_debug("depth %d: num %d, max %d\n",  			  ppos, le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max)); @@ -638,31 +872,24 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,  		path[ppos].p_depth = i;  		path[ppos].p_ext = NULL; -		bh = sb_getblk(inode->i_sb, path[ppos].p_block); -		if (unlikely(!bh)) +		bh = read_extent_tree_block(inode, path[ppos].p_block, --i, +					    flags); +		if (IS_ERR(bh)) { +			ret = PTR_ERR(bh);  			goto err; -		if (!bh_uptodate_or_lock(bh)) { -			if (bh_submit_read(bh) < 0) { -				put_bh(bh); -				goto err; -			} -			/* validate the extent entries */ -			need_to_validate = 1;  		} +  		eh = ext_block_hdr(bh);  		ppos++;  		if (unlikely(ppos > depth)) {  			put_bh(bh);  			EXT4_ERROR_INODE(inode,  					 "ppos %d > depth %d", ppos, depth); +			ret = -EIO;  			goto err;  		}  		path[ppos].p_bh = bh;  		path[ppos].p_hdr = eh; -		i--; - -		if (need_to_validate && ext4_ext_check(inode, eh, i)) -			goto err;  	}  	path[ppos].p_depth = i; @@ -683,7 +910,7 @@ err:  	ext4_ext_drop_refs(path);  	if (alloc)  		kfree(path); -	return ERR_PTR(-EIO); +	return ERR_PTR(ret);  }  /* @@ -708,42 +935,44 @@ static int ext4_ext_insert_index(handle_t *handle, struct inode *inode,  				 logical, le32_to_cpu(curp->p_idx->ei_block));  		
return -EIO;  	} -	len = EXT_MAX_INDEX(curp->p_hdr) - curp->p_idx; + +	if (unlikely(le16_to_cpu(curp->p_hdr->eh_entries) +			     >= le16_to_cpu(curp->p_hdr->eh_max))) { +		EXT4_ERROR_INODE(inode, +				 "eh_entries %d >= eh_max %d!", +				 le16_to_cpu(curp->p_hdr->eh_entries), +				 le16_to_cpu(curp->p_hdr->eh_max)); +		return -EIO; +	} +  	if (logical > le32_to_cpu(curp->p_idx->ei_block)) {  		/* insert after */ -		if (curp->p_idx != EXT_LAST_INDEX(curp->p_hdr)) { -			len = (len - 1) * sizeof(struct ext4_extent_idx); -			len = len < 0 ? 0 : len; -			ext_debug("insert new index %d after: %llu. " -					"move %d from 0x%p to 0x%p\n", -					logical, ptr, len, -					(curp->p_idx + 1), (curp->p_idx + 2)); -			memmove(curp->p_idx + 2, curp->p_idx + 1, len); -		} +		ext_debug("insert new index %d after: %llu\n", logical, ptr);  		ix = curp->p_idx + 1;  	} else {  		/* insert before */ -		len = len * sizeof(struct ext4_extent_idx); -		len = len < 0 ? 0 : len; -		ext_debug("insert new index %d before: %llu. " -				"move %d from 0x%p to 0x%p\n", -				logical, ptr, len, -				curp->p_idx, (curp->p_idx + 1)); -		memmove(curp->p_idx + 1, curp->p_idx, len); +		ext_debug("insert new index %d before: %llu\n", logical, ptr);  		ix = curp->p_idx;  	} +	len = EXT_LAST_INDEX(curp->p_hdr) - ix + 1; +	BUG_ON(len < 0); +	if (len > 0) { +		ext_debug("insert new index %d: " +				"move %d indices from 0x%p to 0x%p\n", +				logical, len, ix, ix + 1); +		memmove(ix + 1, ix, len * sizeof(struct ext4_extent_idx)); +	} + +	if (unlikely(ix > EXT_MAX_INDEX(curp->p_hdr))) { +		EXT4_ERROR_INODE(inode, "ix > EXT_MAX_INDEX!"); +		return -EIO; +	} +  	ix->ei_block = cpu_to_le32(logical);  	ext4_idx_store_pblock(ix, ptr);  	le16_add_cpu(&curp->p_hdr->eh_entries, 1); -	if (unlikely(le16_to_cpu(curp->p_hdr->eh_entries) -			     > le16_to_cpu(curp->p_hdr->eh_max))) { -		EXT4_ERROR_INODE(inode, -				 "logical %d == ei_block %d!", -				 logical, le32_to_cpu(curp->p_idx->ei_block)); -		return -EIO; -	}  	if (unlikely(ix > EXT_LAST_INDEX(curp->p_hdr))) {  		EXT4_ERROR_INODE(inode, "ix > EXT_LAST_INDEX!");  		return -EIO; @@ -766,14 +995,14 @@ static int ext4_ext_insert_index(handle_t *handle, struct inode *inode,   * - initializes subtree   */  static int ext4_ext_split(handle_t *handle, struct inode *inode, -				struct ext4_ext_path *path, -				struct ext4_extent *newext, int at) +			  unsigned int flags, +			  struct ext4_ext_path *path, +			  struct ext4_extent *newext, int at)  {  	struct buffer_head *bh = NULL;  	int depth = ext_depth(inode);  	struct ext4_extent_header *neh;  	struct ext4_extent_idx *fidx; -	struct ext4_extent *ex;  	int i = at, k, m, a;  	ext4_fsblk_t newblock, oldblock;  	__le32 border; @@ -821,7 +1050,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,  	ext_debug("allocate %d blocks for indexes/leaf\n", depth - at);  	for (a = 0; a < depth - at; a++) {  		newblock = ext4_ext_new_meta_block(handle, inode, path, -						   newext, &err); +						   newext, &err, flags);  		if (newblock == 0)  			goto cleanup;  		ablocks[a] = newblock; @@ -835,8 +1064,8 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,  		goto cleanup;  	}  	bh = sb_getblk(inode->i_sb, newblock); -	if (!bh) { -		err = -EIO; +	if (unlikely(!bh)) { +		err = -ENOMEM;  		goto cleanup;  	}  	lock_buffer(bh); @@ -850,7 +1079,6 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,  	neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0));  	neh->eh_magic = EXT4_EXT_MAGIC;  	neh->eh_depth = 0; -	ex = 
EXT_FIRST_EXTENT(neh);  	/* move remainder of path[depth] to the new leaf */  	if (unlikely(path[depth].p_hdr->eh_entries != @@ -862,28 +1090,16 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,  		goto cleanup;  	}  	/* start copy from next extent */ -	/* TODO: we could do it by single memmove */ -	m = 0; -	path[depth].p_ext++; -	while (path[depth].p_ext <= -			EXT_MAX_EXTENT(path[depth].p_hdr)) { -		ext_debug("move %d:%llu:[%d]%d in new leaf %llu\n", -				le32_to_cpu(path[depth].p_ext->ee_block), -				ext4_ext_pblock(path[depth].p_ext), -				ext4_ext_is_uninitialized(path[depth].p_ext), -				ext4_ext_get_actual_len(path[depth].p_ext), -				newblock); -		/*memmove(ex++, path[depth].p_ext++, -				sizeof(struct ext4_extent)); -		neh->eh_entries++;*/ -		path[depth].p_ext++; -		m++; -	} +	m = EXT_MAX_EXTENT(path[depth].p_hdr) - path[depth].p_ext++; +	ext4_ext_show_move(inode, path, newblock, depth);  	if (m) { -		memmove(ex, path[depth].p_ext-m, sizeof(struct ext4_extent)*m); +		struct ext4_extent *ex; +		ex = EXT_FIRST_EXTENT(neh); +		memmove(ex, path[depth].p_ext, sizeof(struct ext4_extent) * m);  		le16_add_cpu(&neh->eh_entries, m);  	} +	ext4_extent_block_csum_set(inode, neh);  	set_buffer_uptodate(bh);  	unlock_buffer(bh); @@ -921,8 +1137,8 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,  		oldblock = newblock;  		newblock = ablocks[--a];  		bh = sb_getblk(inode->i_sb, newblock); -		if (!bh) { -			err = -EIO; +		if (unlikely(!bh)) { +			err = -ENOMEM;  			goto cleanup;  		}  		lock_buffer(bh); @@ -942,12 +1158,8 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,  		ext_debug("int.index at %d (block %llu): %u -> %llu\n",  				i, newblock, le32_to_cpu(border), oldblock); -		/* copy indexes */ -		m = 0; -		path[i].p_idx++; -		ext_debug("cur 0x%p, last 0x%p\n", path[i].p_idx, -				EXT_MAX_INDEX(path[i].p_hdr)); +		/* move remainder of path[i] to the new index block */  		if (unlikely(EXT_MAX_INDEX(path[i].p_hdr) !=  					EXT_LAST_INDEX(path[i].p_hdr))) {  			EXT4_ERROR_INODE(inode, @@ -956,23 +1168,17 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,  			err = -EIO;  			goto cleanup;  		} -		while (path[i].p_idx <= EXT_MAX_INDEX(path[i].p_hdr)) { -			ext_debug("%d: move %d:%llu in new index %llu\n", i, -					le32_to_cpu(path[i].p_idx->ei_block), -					ext4_idx_pblock(path[i].p_idx), -					newblock); -			/*memmove(++fidx, path[i].p_idx++, -					sizeof(struct ext4_extent_idx)); -			neh->eh_entries++; -			BUG_ON(neh->eh_entries > neh->eh_max);*/ -			path[i].p_idx++; -			m++; -		} +		/* start copy indexes */ +		m = EXT_MAX_INDEX(path[i].p_hdr) - path[i].p_idx++; +		ext_debug("cur 0x%p, last 0x%p\n", path[i].p_idx, +				EXT_MAX_INDEX(path[i].p_hdr)); +		ext4_ext_show_move(inode, path, newblock, i);  		if (m) { -			memmove(++fidx, path[i].p_idx - m, +			memmove(++fidx, path[i].p_idx,  				sizeof(struct ext4_extent_idx) * m);  			le16_add_cpu(&neh->eh_entries, m);  		} +		ext4_extent_block_csum_set(inode, neh);  		set_buffer_uptodate(bh);  		unlock_buffer(bh); @@ -1012,7 +1218,7 @@ cleanup:  		for (i = 0; i < depth; i++) {  			if (!ablocks[i])  				continue; -			ext4_free_blocks(handle, inode, 0, ablocks[i], 1, +			ext4_free_blocks(handle, inode, NULL, ablocks[i], 1,  					 EXT4_FREE_BLOCKS_METADATA);  		}  	} @@ -1030,25 +1236,22 @@ cleanup:   *   just created block   */  static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode, -					struct ext4_ext_path *path, -					struct ext4_extent *newext) +				 unsigned 
int flags, +				 struct ext4_extent *newext)  { -	struct ext4_ext_path *curp = path;  	struct ext4_extent_header *neh;  	struct buffer_head *bh;  	ext4_fsblk_t newblock;  	int err = 0; -	newblock = ext4_ext_new_meta_block(handle, inode, path, newext, &err); +	newblock = ext4_ext_new_meta_block(handle, inode, NULL, +		newext, &err, flags);  	if (newblock == 0)  		return err;  	bh = sb_getblk(inode->i_sb, newblock); -	if (!bh) { -		err = -EIO; -		ext4_std_error(inode->i_sb, err); -		return err; -	} +	if (unlikely(!bh)) +		return -ENOMEM;  	lock_buffer(bh);  	err = ext4_journal_get_create_access(handle, bh); @@ -1058,7 +1261,8 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,  	}  	/* move top-level index/leaf into new block */ -	memmove(bh->b_data, curp->p_hdr, sizeof(EXT4_I(inode)->i_data)); +	memmove(bh->b_data, EXT4_I(inode)->i_data, +		sizeof(EXT4_I(inode)->i_data));  	/* set size of new block */  	neh = ext_block_hdr(bh); @@ -1069,6 +1273,7 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,  	else  		neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0));  	neh->eh_magic = EXT4_EXT_MAGIC; +	ext4_extent_block_csum_set(inode, neh);  	set_buffer_uptodate(bh);  	unlock_buffer(bh); @@ -1076,32 +1281,23 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,  	if (err)  		goto out; -	/* create index in new top-level index: num,max,pointer */ -	err = ext4_ext_get_access(handle, inode, curp); -	if (err) -		goto out; - -	curp->p_hdr->eh_magic = EXT4_EXT_MAGIC; -	curp->p_hdr->eh_max = cpu_to_le16(ext4_ext_space_root_idx(inode, 0)); -	curp->p_hdr->eh_entries = cpu_to_le16(1); -	curp->p_idx = EXT_FIRST_INDEX(curp->p_hdr); - -	if (path[0].p_hdr->eh_depth) -		curp->p_idx->ei_block = -			EXT_FIRST_INDEX(path[0].p_hdr)->ei_block; -	else -		curp->p_idx->ei_block = -			EXT_FIRST_EXTENT(path[0].p_hdr)->ee_block; -	ext4_idx_store_pblock(curp->p_idx, newblock); - +	/* Update top-level index: num,max,pointer */  	neh = ext_inode_hdr(inode); +	neh->eh_entries = cpu_to_le16(1); +	ext4_idx_store_pblock(EXT_FIRST_INDEX(neh), newblock); +	if (neh->eh_depth == 0) { +		/* Root extent block becomes index block */ +		neh->eh_max = cpu_to_le16(ext4_ext_space_root_idx(inode, 0)); +		EXT_FIRST_INDEX(neh)->ei_block = +			EXT_FIRST_EXTENT(neh)->ee_block; +	}  	ext_debug("new root: num %d(%d), lblock %d, ptr %llu\n",  		  le16_to_cpu(neh->eh_entries), le16_to_cpu(neh->eh_max),  		  le32_to_cpu(EXT_FIRST_INDEX(neh)->ei_block),  		  ext4_idx_pblock(EXT_FIRST_INDEX(neh))); -	neh->eh_depth = cpu_to_le16(path->p_depth + 1); -	err = ext4_ext_dirty(handle, inode, curp); +	le16_add_cpu(&neh->eh_depth, 1); +	ext4_mark_inode_dirty(handle, inode);  out:  	brelse(bh); @@ -1114,8 +1310,10 @@ out:   * if no free index is found, then it requests in-depth growing.   
*/  static int ext4_ext_create_new_leaf(handle_t *handle, struct inode *inode, -					struct ext4_ext_path *path, -					struct ext4_extent *newext) +				    unsigned int mb_flags, +				    unsigned int gb_flags, +				    struct ext4_ext_path *path, +				    struct ext4_extent *newext)  {  	struct ext4_ext_path *curp;  	int depth, i, err = 0; @@ -1135,7 +1333,7 @@ repeat:  	if (EXT_HAS_FREE_INDEX(curp)) {  		/* if we found index with free entry, then use that  		 * entry: create all needed subtree and add new leaf */ -		err = ext4_ext_split(handle, inode, path, newext, i); +		err = ext4_ext_split(handle, inode, mb_flags, path, newext, i);  		if (err)  			goto out; @@ -1143,12 +1341,12 @@ repeat:  		ext4_ext_drop_refs(path);  		path = ext4_ext_find_extent(inode,  				    (ext4_lblk_t)le32_to_cpu(newext->ee_block), -				    path); +				    path, gb_flags);  		if (IS_ERR(path))  			err = PTR_ERR(path);  	} else {  		/* tree is full, time to grow in depth */ -		err = ext4_ext_grow_indepth(handle, inode, path, newext); +		err = ext4_ext_grow_indepth(handle, inode, mb_flags, newext);  		if (err)  			goto out; @@ -1156,7 +1354,7 @@ repeat:  		ext4_ext_drop_refs(path);  		path = ext4_ext_find_extent(inode,  				   (ext4_lblk_t)le32_to_cpu(newext->ee_block), -				    path); +				    path, gb_flags);  		if (IS_ERR(path)) {  			err = PTR_ERR(path);  			goto out; @@ -1220,9 +1418,9 @@ static int ext4_ext_search_left(struct inode *inode,  			if (unlikely(ix != EXT_FIRST_INDEX(path[depth].p_hdr))) {  				EXT4_ERROR_INODE(inode,  				  "ix (%d) != EXT_FIRST_INDEX (%d) (depth %d)!", -				  ix != NULL ? ix->ei_block : 0, +				  ix != NULL ? le32_to_cpu(ix->ei_block) : 0,  				  EXT_FIRST_INDEX(path[depth].p_hdr) != NULL ? -				    EXT_FIRST_INDEX(path[depth].p_hdr)->ei_block : 0, +		le32_to_cpu(EXT_FIRST_INDEX(path[depth].p_hdr)->ei_block) : 0,  				  depth);  				return -EIO;  			} @@ -1245,13 +1443,14 @@ static int ext4_ext_search_left(struct inode *inode,  /*   * search the closest allocated block to the right for *logical   * and returns it at @logical + it's physical address at @phys - * if *logical is the smallest allocated block, the function + * if *logical is the largest allocated block, the function   * returns 0 at @phys   * return value contains 0 (success) or error code   */  static int ext4_ext_search_right(struct inode *inode,  				 struct ext4_ext_path *path, -				 ext4_lblk_t *logical, ext4_fsblk_t *phys) +				 ext4_lblk_t *logical, ext4_fsblk_t *phys, +				 struct ext4_extent **ret_ex)  {  	struct buffer_head *bh = NULL;  	struct ext4_extent_header *eh; @@ -1293,9 +1492,7 @@ static int ext4_ext_search_right(struct inode *inode,  				return -EIO;  			}  		} -		*logical = le32_to_cpu(ex->ee_block); -		*phys = ext4_ext_pblock(ex); -		return 0; +		goto found_extent;  	}  	if (unlikely(*logical < (le32_to_cpu(ex->ee_block) + ee_len))) { @@ -1308,9 +1505,7 @@ static int ext4_ext_search_right(struct inode *inode,  	if (ex != EXT_LAST_EXTENT(path[depth].p_hdr)) {  		/* next allocated block in this leaf */  		ex++; -		*logical = le32_to_cpu(ex->ee_block); -		*phys = ext4_ext_pblock(ex); -		return 0; +		goto found_extent;  	}  	/* go up and search for index to the right */ @@ -1330,38 +1525,34 @@ got_index:  	ix++;  	block = ext4_idx_pblock(ix);  	while (++depth < path->p_depth) { -		bh = sb_bread(inode->i_sb, block); -		if (bh == NULL) -			return -EIO; -		eh = ext_block_hdr(bh);  		/* subtract from p_depth to get proper eh_depth */ -		if (ext4_ext_check(inode, eh, path->p_depth - depth)) { -			
put_bh(bh); -			return -EIO; -		} +		bh = read_extent_tree_block(inode, block, +					    path->p_depth - depth, 0); +		if (IS_ERR(bh)) +			return PTR_ERR(bh); +		eh = ext_block_hdr(bh);  		ix = EXT_FIRST_INDEX(eh);  		block = ext4_idx_pblock(ix);  		put_bh(bh);  	} -	bh = sb_bread(inode->i_sb, block); -	if (bh == NULL) -		return -EIO; +	bh = read_extent_tree_block(inode, block, path->p_depth - depth, 0); +	if (IS_ERR(bh)) +		return PTR_ERR(bh);  	eh = ext_block_hdr(bh); -	if (ext4_ext_check(inode, eh, path->p_depth - depth)) { -		put_bh(bh); -		return -EIO; -	}  	ex = EXT_FIRST_EXTENT(eh); +found_extent:  	*logical = le32_to_cpu(ex->ee_block);  	*phys = ext4_ext_pblock(ex); -	put_bh(bh); +	*ret_ex = ex; +	if (bh) +		put_bh(bh);  	return 0;  }  /*   * ext4_ext_next_allocated_block: - * returns allocated block in subsequent extent or EXT_MAX_BLOCK. + * returns allocated block in subsequent extent or EXT_MAX_BLOCKS.   * NOTE: it considers block number from index entry as   * allocated block. Thus, index entries have to be consistent   * with leaves. @@ -1375,12 +1566,13 @@ ext4_ext_next_allocated_block(struct ext4_ext_path *path)  	depth = path->p_depth;  	if (depth == 0 && path->p_ext == NULL) -		return EXT_MAX_BLOCK; +		return EXT_MAX_BLOCKS;  	while (depth >= 0) {  		if (depth == path->p_depth) {  			/* leaf */ -			if (path[depth].p_ext != +			if (path[depth].p_ext && +				path[depth].p_ext !=  					EXT_LAST_EXTENT(path[depth].p_hdr))  			  return le32_to_cpu(path[depth].p_ext[1].ee_block);  		} else { @@ -1392,15 +1584,14 @@ ext4_ext_next_allocated_block(struct ext4_ext_path *path)  		depth--;  	} -	return EXT_MAX_BLOCK; +	return EXT_MAX_BLOCKS;  }  /*   * ext4_ext_next_leaf_block: - * returns first allocated block from next leaf or EXT_MAX_BLOCK + * returns first allocated block from next leaf or EXT_MAX_BLOCKS   */ -static ext4_lblk_t ext4_ext_next_leaf_block(struct inode *inode, -					struct ext4_ext_path *path) +static ext4_lblk_t ext4_ext_next_leaf_block(struct ext4_ext_path *path)  {  	int depth; @@ -1409,7 +1600,7 @@ static ext4_lblk_t ext4_ext_next_leaf_block(struct inode *inode,  	/* zero-tree has no leaf blocks at all */  	if (depth == 0) -		return EXT_MAX_BLOCK; +		return EXT_MAX_BLOCKS;  	/* go to index block */  	depth--; @@ -1422,7 +1613,7 @@ static ext4_lblk_t ext4_ext_next_leaf_block(struct inode *inode,  		depth--;  	} -	return EXT_MAX_BLOCK; +	return EXT_MAX_BLOCKS;  }  /* @@ -1492,20 +1683,17 @@ int  ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,  				struct ext4_extent *ex2)  { -	unsigned short ext1_ee_len, ext2_ee_len, max_len; +	unsigned short ext1_ee_len, ext2_ee_len;  	/* -	 * Make sure that either both extents are uninitialized, or -	 * both are _not_. +	 * Make sure that both extents are initialized. We don't merge +	 * unwritten extents so that we can be sure that end_io code has +	 * the extent that was written properly split out and conversion to +	 * initialized is trivial.  	 */ -	if (ext4_ext_is_uninitialized(ex1) ^ ext4_ext_is_uninitialized(ex2)) +	if (ext4_ext_is_unwritten(ex1) != ext4_ext_is_unwritten(ex2))  		return 0; -	if (ext4_ext_is_uninitialized(ex1)) -		max_len = EXT_UNINIT_MAX_LEN; -	else -		max_len = EXT_INIT_MAX_LEN; -  	ext1_ee_len = ext4_ext_get_actual_len(ex1);  	ext2_ee_len = ext4_ext_get_actual_len(ex2); @@ -1518,7 +1706,12 @@ ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,  	 * as an RO_COMPAT feature, refuse to merge to extents if  	 * this can result in the top bit of ee_len being set.  
	 */ -	if (ext1_ee_len + ext2_ee_len > max_len) +	if (ext1_ee_len + ext2_ee_len > EXT_INIT_MAX_LEN) +		return 0; +	if (ext4_ext_is_unwritten(ex1) && +	    (ext4_test_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN) || +	     atomic_read(&EXT4_I(inode)->i_unwritten) || +	     (ext1_ee_len + ext2_ee_len > EXT_UNWRITTEN_MAX_LEN)))  		return 0;  #ifdef AGGRESSIVE_TEST  	if (ext1_ee_len >= 4) @@ -1537,14 +1730,13 @@ ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,   * Returns 0 if the extents (ex and ex+1) were _not_ merged and returns   * 1 if they got merged.   */ -static int ext4_ext_try_to_merge(struct inode *inode, +static int ext4_ext_try_to_merge_right(struct inode *inode,  				 struct ext4_ext_path *path,  				 struct ext4_extent *ex)  {  	struct ext4_extent_header *eh;  	unsigned int depth, len; -	int merge_done = 0; -	int uninitialized = 0; +	int merge_done = 0, unwritten;  	depth = ext_depth(inode);  	BUG_ON(path[depth].p_hdr == NULL); @@ -1554,12 +1746,11 @@ static int ext4_ext_try_to_merge(struct inode *inode,  		if (!ext4_can_extents_be_merged(inode, ex, ex + 1))  			break;  		/* merge with next extent! */ -		if (ext4_ext_is_uninitialized(ex)) -			uninitialized = 1; +		unwritten = ext4_ext_is_unwritten(ex);  		ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)  				+ ext4_ext_get_actual_len(ex + 1)); -		if (uninitialized) -			ext4_ext_mark_uninitialized(ex); +		if (unwritten) +			ext4_ext_mark_unwritten(ex);  		if (ex + 1 < EXT_LAST_EXTENT(eh)) {  			len = (EXT_LAST_EXTENT(eh) - ex - 1) @@ -1577,6 +1768,76 @@ static int ext4_ext_try_to_merge(struct inode *inode,  }  /* + * This function does a very simple check to see if we can collapse + * an extent tree with a single extent tree leaf block into the inode. + */ +static void ext4_ext_try_to_merge_up(handle_t *handle, +				     struct inode *inode, +				     struct ext4_ext_path *path) +{ +	size_t s; +	unsigned max_root = ext4_ext_space_root(inode, 0); +	ext4_fsblk_t blk; + +	if ((path[0].p_depth != 1) || +	    (le16_to_cpu(path[0].p_hdr->eh_entries) != 1) || +	    (le16_to_cpu(path[1].p_hdr->eh_entries) > max_root)) +		return; + +	/* +	 * We need to modify the block allocation bitmap and the block +	 * group descriptor to release the extent tree block.  If we +	 * can't get the journal credits, give up. +	 */ +	if (ext4_journal_extend(handle, 2)) +		return; + +	/* +	 * Copy the extent data up to the inode +	 */ +	blk = ext4_idx_pblock(path[0].p_idx); +	s = le16_to_cpu(path[1].p_hdr->eh_entries) * +		sizeof(struct ext4_extent_idx); +	s += sizeof(struct ext4_extent_header); + +	memcpy(path[0].p_hdr, path[1].p_hdr, s); +	path[0].p_depth = 0; +	path[0].p_ext = EXT_FIRST_EXTENT(path[0].p_hdr) + +		(path[1].p_ext - EXT_FIRST_EXTENT(path[1].p_hdr)); +	path[0].p_hdr->eh_max = cpu_to_le16(max_root); + +	brelse(path[1].p_bh); +	ext4_free_blocks(handle, inode, NULL, blk, 1, +			 EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET | +			 EXT4_FREE_BLOCKS_RESERVE); +} + +/* + * This function tries to merge the @ex extent to neighbours in the tree. + * return 1 if merge left else 0. 
+ */ +static void ext4_ext_try_to_merge(handle_t *handle, +				  struct inode *inode, +				  struct ext4_ext_path *path, +				  struct ext4_extent *ex) { +	struct ext4_extent_header *eh; +	unsigned int depth; +	int merge_done = 0; + +	depth = ext_depth(inode); +	BUG_ON(path[depth].p_hdr == NULL); +	eh = path[depth].p_hdr; + +	if (ex > EXT_FIRST_EXTENT(eh)) +		merge_done = ext4_ext_try_to_merge_right(inode, path, ex - 1); + +	if (!merge_done) +		(void) ext4_ext_try_to_merge_right(inode, path, ex); + +	ext4_ext_try_to_merge_up(handle, inode, path); +} + +/*   * check if a portion of the "newext" extent overlaps with an   * existing extent.   * @@ -1584,7 +1845,8 @@ static int ext4_ext_try_to_merge(struct inode *inode,   * such that there will be no overlap, and then returns 1.   * If there is no overlap found, it returns 0.   */ -static unsigned int ext4_ext_check_overlap(struct inode *inode, +static unsigned int ext4_ext_check_overlap(struct ext4_sb_info *sbi, +					   struct inode *inode,  					   struct ext4_extent *newext,  					   struct ext4_ext_path *path)  { @@ -1597,7 +1859,7 @@ static unsigned int ext4_ext_check_overlap(struct inode *inode,  	depth = ext_depth(inode);  	if (!path[depth].p_ext)  		goto out; -	b2 = le32_to_cpu(path[depth].p_ext->ee_block); +	b2 = EXT4_LBLK_CMASK(sbi, le32_to_cpu(path[depth].p_ext->ee_block));  	/*  	 * get the next allocated block if the extent in the path @@ -1605,13 +1867,14 @@ static unsigned int ext4_ext_check_overlap(struct inode *inode,  	 */  	if (b2 < b1) {  		b2 = ext4_ext_next_allocated_block(path); -		if (b2 == EXT_MAX_BLOCK) +		if (b2 == EXT_MAX_BLOCKS)  			goto out; +		b2 = EXT4_LBLK_CMASK(sbi, b2);  	}  	/* check for wrap through zero on extent logical start block*/  	if (b1 + len1 < b1) { -		len1 = EXT_MAX_BLOCK - b1; +		len1 = EXT_MAX_BLOCKS - b1;  		newext->ee_len = cpu_to_le16(len1);  		ret = 1;  	} @@ -1633,7 +1896,7 @@ out:   */  int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,  				struct ext4_ext_path *path, -				struct ext4_extent *newext, int flag) +				struct ext4_extent *newext, int gb_flags)  {  	struct ext4_extent_header *eh;  	struct ext4_extent *ex, *fex; @@ -1641,7 +1904,7 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,  	struct ext4_ext_path *npath = NULL;  	int depth, len, err;  	ext4_lblk_t next; -	unsigned uninitialized = 0; +	int mb_flags = 0, unwritten;  	if (unlikely(ext4_ext_get_actual_len(newext) == 0)) {  		EXT4_ERROR_INODE(inode, "ext4_ext_get_actual_len(newext) == 0"); @@ -1649,42 +1912,88 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,  	}  	depth = ext_depth(inode);  	ex = path[depth].p_ext; +	eh = path[depth].p_hdr;  	if (unlikely(path[depth].p_hdr == NULL)) {  		EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);  		return -EIO;  	}  	/* try to insert block into found extent and return */ -	if (ex && !(flag & EXT4_GET_BLOCKS_PRE_IO) -		&& ext4_can_extents_be_merged(inode, ex, newext)) { -		ext_debug("append [%d]%d block to %d:[%d]%d (from %llu)\n", -			  ext4_ext_is_uninitialized(newext), -			  ext4_ext_get_actual_len(newext), -			  le32_to_cpu(ex->ee_block), -			  ext4_ext_is_uninitialized(ex), -			  ext4_ext_get_actual_len(ex), -			  ext4_ext_pblock(ex)); -		err = ext4_ext_get_access(handle, inode, path + depth); -		if (err) -			return err; +	if (ex && !(gb_flags & EXT4_GET_BLOCKS_PRE_IO)) {  		/* -		 * ext4_can_extents_be_merged should have checked that either -		 * both extents are uninitialized, or both aren't. 
Thus we -		 * need to check only one of them here. +		 * Try to see whether we should rather test the extent on +		 * right from ex, or from the left of ex. This is because +		 * ext4_ext_find_extent() can return either extent on the +		 * left, or on the right from the searched position. This +		 * will make merging more effective.  		 */ -		if (ext4_ext_is_uninitialized(ex)) -			uninitialized = 1; -		ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex) +		if (ex < EXT_LAST_EXTENT(eh) && +		    (le32_to_cpu(ex->ee_block) + +		    ext4_ext_get_actual_len(ex) < +		    le32_to_cpu(newext->ee_block))) { +			ex += 1; +			goto prepend; +		} else if ((ex > EXT_FIRST_EXTENT(eh)) && +			   (le32_to_cpu(newext->ee_block) + +			   ext4_ext_get_actual_len(newext) < +			   le32_to_cpu(ex->ee_block))) +			ex -= 1; + +		/* Try to append newex to the ex */ +		if (ext4_can_extents_be_merged(inode, ex, newext)) { +			ext_debug("append [%d]%d block to %u:[%d]%d" +				  "(from %llu)\n", +				  ext4_ext_is_unwritten(newext), +				  ext4_ext_get_actual_len(newext), +				  le32_to_cpu(ex->ee_block), +				  ext4_ext_is_unwritten(ex), +				  ext4_ext_get_actual_len(ex), +				  ext4_ext_pblock(ex)); +			err = ext4_ext_get_access(handle, inode, +						  path + depth); +			if (err) +				return err; +			unwritten = ext4_ext_is_unwritten(ex); +			ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)  					+ ext4_ext_get_actual_len(newext)); -		if (uninitialized) -			ext4_ext_mark_uninitialized(ex); -		eh = path[depth].p_hdr; -		nearex = ex; -		goto merge; +			if (unwritten) +				ext4_ext_mark_unwritten(ex); +			eh = path[depth].p_hdr; +			nearex = ex; +			goto merge; +		} + +prepend: +		/* Try to prepend newex to the ex */ +		if (ext4_can_extents_be_merged(inode, newext, ex)) { +			ext_debug("prepend %u[%d]%d block to %u:[%d]%d" +				  "(from %llu)\n", +				  le32_to_cpu(newext->ee_block), +				  ext4_ext_is_unwritten(newext), +				  ext4_ext_get_actual_len(newext), +				  le32_to_cpu(ex->ee_block), +				  ext4_ext_is_unwritten(ex), +				  ext4_ext_get_actual_len(ex), +				  ext4_ext_pblock(ex)); +			err = ext4_ext_get_access(handle, inode, +						  path + depth); +			if (err) +				return err; + +			unwritten = ext4_ext_is_unwritten(ex); +			ex->ee_block = newext->ee_block; +			ext4_ext_store_pblock(ex, ext4_ext_pblock(newext)); +			ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex) +					+ ext4_ext_get_actual_len(newext)); +			if (unwritten) +				ext4_ext_mark_unwritten(ex); +			eh = path[depth].p_hdr; +			nearex = ex; +			goto merge; +		}  	} -repeat:  	depth = ext_depth(inode);  	eh = path[depth].p_hdr;  	if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max)) @@ -1692,21 +2001,22 @@ repeat:  	/* probably next leaf has space for us? 
*/  	fex = EXT_LAST_EXTENT(eh); -	next = ext4_ext_next_leaf_block(inode, path); -	if (le32_to_cpu(newext->ee_block) > le32_to_cpu(fex->ee_block) -	    && next != EXT_MAX_BLOCK) { -		ext_debug("next leaf block - %d\n", next); +	next = EXT_MAX_BLOCKS; +	if (le32_to_cpu(newext->ee_block) > le32_to_cpu(fex->ee_block)) +		next = ext4_ext_next_leaf_block(path); +	if (next != EXT_MAX_BLOCKS) { +		ext_debug("next leaf block - %u\n", next);  		BUG_ON(npath != NULL); -		npath = ext4_ext_find_extent(inode, next, NULL); +		npath = ext4_ext_find_extent(inode, next, NULL, 0);  		if (IS_ERR(npath))  			return PTR_ERR(npath);  		BUG_ON(npath->p_depth != path->p_depth);  		eh = npath[depth].p_hdr;  		if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max)) { -			ext_debug("next leaf isnt full(%d)\n", +			ext_debug("next leaf isn't full(%d)\n",  				  le16_to_cpu(eh->eh_entries));  			path = npath; -			goto repeat; +			goto has_space;  		}  		ext_debug("next leaf has no free space(%d,%d)\n",  			  le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max)); @@ -1716,7 +2026,10 @@ repeat:  	 * There is no free space in the found leaf.  	 * We're gonna add a new leaf in the tree.  	 */ -	err = ext4_ext_create_new_leaf(handle, inode, path, newext); +	if (gb_flags & EXT4_GET_BLOCKS_METADATA_NOFAIL) +		mb_flags = EXT4_MB_USE_RESERVED; +	err = ext4_ext_create_new_leaf(handle, inode, mb_flags, gb_flags, +				       path, newext);  	if (err)  		goto cleanup;  	depth = ext_depth(inode); @@ -1731,94 +2044,103 @@ has_space:  	if (!nearex) {  		/* there is no extent in this leaf, create first one */ -		ext_debug("first extent in the leaf: %d:%llu:[%d]%d\n", +		ext_debug("first extent in the leaf: %u:%llu:[%d]%d\n",  				le32_to_cpu(newext->ee_block),  				ext4_ext_pblock(newext), -				ext4_ext_is_uninitialized(newext), +				ext4_ext_is_unwritten(newext),  				ext4_ext_get_actual_len(newext)); -		path[depth].p_ext = EXT_FIRST_EXTENT(eh); -	} else if (le32_to_cpu(newext->ee_block) +		nearex = EXT_FIRST_EXTENT(eh); +	} else { +		if (le32_to_cpu(newext->ee_block)  			   > le32_to_cpu(nearex->ee_block)) { -/*		BUG_ON(newext->ee_block == nearex->ee_block); */ -		if (nearex != EXT_LAST_EXTENT(eh)) { -			len = EXT_MAX_EXTENT(eh) - nearex; -			len = (len - 1) * sizeof(struct ext4_extent); -			len = len < 0 ? 
0 : len; -			ext_debug("insert %d:%llu:[%d]%d after: nearest 0x%p, " -					"move %d from 0x%p to 0x%p\n", +			/* Insert after */ +			ext_debug("insert %u:%llu:[%d]%d before: " +					"nearest %p\n",  					le32_to_cpu(newext->ee_block),  					ext4_ext_pblock(newext), -					ext4_ext_is_uninitialized(newext), +					ext4_ext_is_unwritten(newext),  					ext4_ext_get_actual_len(newext), -					nearex, len, nearex + 1, nearex + 2); -			memmove(nearex + 2, nearex + 1, len); +					nearex); +			nearex++; +		} else { +			/* Insert before */ +			BUG_ON(newext->ee_block == nearex->ee_block); +			ext_debug("insert %u:%llu:[%d]%d after: " +					"nearest %p\n", +					le32_to_cpu(newext->ee_block), +					ext4_ext_pblock(newext), +					ext4_ext_is_unwritten(newext), +					ext4_ext_get_actual_len(newext), +					nearex); +		} +		len = EXT_LAST_EXTENT(eh) - nearex + 1; +		if (len > 0) { +			ext_debug("insert %u:%llu:[%d]%d: " +					"move %d extents from 0x%p to 0x%p\n", +					le32_to_cpu(newext->ee_block), +					ext4_ext_pblock(newext), +					ext4_ext_is_unwritten(newext), +					ext4_ext_get_actual_len(newext), +					len, nearex, nearex + 1); +			memmove(nearex + 1, nearex, +				len * sizeof(struct ext4_extent));  		} -		path[depth].p_ext = nearex + 1; -	} else { -		BUG_ON(newext->ee_block == nearex->ee_block); -		len = (EXT_MAX_EXTENT(eh) - nearex) * sizeof(struct ext4_extent); -		len = len < 0 ? 0 : len; -		ext_debug("insert %d:%llu:[%d]%d before: nearest 0x%p, " -				"move %d from 0x%p to 0x%p\n", -				le32_to_cpu(newext->ee_block), -				ext4_ext_pblock(newext), -				ext4_ext_is_uninitialized(newext), -				ext4_ext_get_actual_len(newext), -				nearex, len, nearex + 1, nearex + 2); -		memmove(nearex + 1, nearex, len); -		path[depth].p_ext = nearex;  	}  	le16_add_cpu(&eh->eh_entries, 1); -	nearex = path[depth].p_ext; +	path[depth].p_ext = nearex;  	nearex->ee_block = newext->ee_block;  	ext4_ext_store_pblock(nearex, ext4_ext_pblock(newext));  	nearex->ee_len = newext->ee_len;  merge: -	/* try to merge extents to the right */ -	if (!(flag & EXT4_GET_BLOCKS_PRE_IO)) -		ext4_ext_try_to_merge(inode, path, nearex); +	/* try to merge extents */ +	if (!(gb_flags & EXT4_GET_BLOCKS_PRE_IO)) +		ext4_ext_try_to_merge(handle, inode, path, nearex); -	/* try to merge extents to the left */  	/* time to correct all indexes above */  	err = ext4_ext_correct_indexes(handle, inode, path);  	if (err)  		goto cleanup; -	err = ext4_ext_dirty(handle, inode, path + depth); +	err = ext4_ext_dirty(handle, inode, path + path->p_depth);  cleanup:  	if (npath) {  		ext4_ext_drop_refs(npath);  		kfree(npath);  	} -	ext4_ext_invalidate_cache(inode);  	return err;  } -static int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block, -			       ext4_lblk_t num, ext_prepare_callback func, -			       void *cbdata) +static int ext4_fill_fiemap_extents(struct inode *inode, +				    ext4_lblk_t block, ext4_lblk_t num, +				    struct fiemap_extent_info *fieinfo)  {  	struct ext4_ext_path *path = NULL; -	struct ext4_ext_cache cbex;  	struct ext4_extent *ex; -	ext4_lblk_t next, start = 0, end = 0; +	struct extent_status es; +	ext4_lblk_t next, next_del, start = 0, end = 0;  	ext4_lblk_t last = block + num; -	int depth, exists, err = 0; - -	BUG_ON(func == NULL); -	BUG_ON(inode == NULL); +	int exists, depth = 0, err = 0; +	unsigned int flags = 0; +	unsigned char blksize_bits = inode->i_sb->s_blocksize_bits; -	while (block < last && block != EXT_MAX_BLOCK) { +	while (block < last && block != EXT_MAX_BLOCKS) {  		num = last - block;  		/* find 
extent for this block */  		down_read(&EXT4_I(inode)->i_data_sem); -		path = ext4_ext_find_extent(inode, block, path); -		up_read(&EXT4_I(inode)->i_data_sem); + +		if (path && ext_depth(inode) != depth) { +			/* depth was changed. we have to realloc path */ +			kfree(path); +			path = NULL; +		} + +		path = ext4_ext_find_extent(inode, block, path, 0);  		if (IS_ERR(path)) { +			up_read(&EXT4_I(inode)->i_data_sem);  			err = PTR_ERR(path);  			path = NULL;  			break; @@ -1826,13 +2148,16 @@ static int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block,  		depth = ext_depth(inode);  		if (unlikely(path[depth].p_hdr == NULL)) { +			up_read(&EXT4_I(inode)->i_data_sem);  			EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);  			err = -EIO;  			break;  		}  		ex = path[depth].p_ext;  		next = ext4_ext_next_allocated_block(path); +		ext4_ext_drop_refs(path); +		flags = 0;  		exists = 0;  		if (!ex) {  			/* there is no extent yet, so try to allocate @@ -1869,42 +2194,75 @@ static int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block,  		BUG_ON(end <= start);  		if (!exists) { -			cbex.ec_block = start; -			cbex.ec_len = end - start; -			cbex.ec_start = 0; -			cbex.ec_type = EXT4_EXT_CACHE_GAP; +			es.es_lblk = start; +			es.es_len = end - start; +			es.es_pblk = 0;  		} else { -			cbex.ec_block = le32_to_cpu(ex->ee_block); -			cbex.ec_len = ext4_ext_get_actual_len(ex); -			cbex.ec_start = ext4_ext_pblock(ex); -			cbex.ec_type = EXT4_EXT_CACHE_EXTENT; +			es.es_lblk = le32_to_cpu(ex->ee_block); +			es.es_len = ext4_ext_get_actual_len(ex); +			es.es_pblk = ext4_ext_pblock(ex); +			if (ext4_ext_is_unwritten(ex)) +				flags |= FIEMAP_EXTENT_UNWRITTEN;  		} -		if (unlikely(cbex.ec_len == 0)) { -			EXT4_ERROR_INODE(inode, "cbex.ec_len == 0"); -			err = -EIO; -			break; +		/* +		 * Find delayed extent and update es accordingly. We call +		 * it even in !exists case to find out whether es is the +		 * last existing extent or not. +		 */ +		next_del = ext4_find_delayed_extent(inode, &es); +		if (!exists && next_del) { +			exists = 1; +			flags |= (FIEMAP_EXTENT_DELALLOC | +				  FIEMAP_EXTENT_UNKNOWN);  		} -		err = func(inode, path, &cbex, ex, cbdata); -		ext4_ext_drop_refs(path); +		up_read(&EXT4_I(inode)->i_data_sem); -		if (err < 0) +		if (unlikely(es.es_len == 0)) { +			EXT4_ERROR_INODE(inode, "es.es_len == 0"); +			err = -EIO;  			break; +		} -		if (err == EXT_REPEAT) -			continue; -		else if (err == EXT_BREAK) { -			err = 0; -			break; +		/* +		 * This is possible iff next == next_del == EXT_MAX_BLOCKS. +		 * we need to check next == EXT_MAX_BLOCKS because it is +		 * possible that an extent is with unwritten and delayed +		 * status due to when an extent is delayed allocated and +		 * is allocated by fallocate status tree will track both of +		 * them in a extent. +		 * +		 * So we could return a unwritten and delayed extent, and +		 * its block is equal to 'next'. +		 */ +		if (next == next_del && next == EXT_MAX_BLOCKS) { +			flags |= FIEMAP_EXTENT_LAST; +			if (unlikely(next_del != EXT_MAX_BLOCKS || +				     next != EXT_MAX_BLOCKS)) { +				EXT4_ERROR_INODE(inode, +						 "next extent == %u, next " +						 "delalloc extent = %u", +						 next, next_del); +				err = -EIO; +				break; +			}  		} -		if (ext_depth(inode) != depth) { -			/* depth was changed. 
we have to realloc path */ -			kfree(path); -			path = NULL; +		if (exists) { +			err = fiemap_fill_next_extent(fieinfo, +				(__u64)es.es_lblk << blksize_bits, +				(__u64)es.es_pblk << blksize_bits, +				(__u64)es.es_len << blksize_bits, +				flags); +			if (err < 0) +				break; +			if (err == 1) { +				err = 0; +				break; +			}  		} -		block = cbex.ec_block + cbex.ec_len; +		block = es.es_lblk + es.es_len;  	}  	if (path) { @@ -1915,21 +2273,6 @@ static int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block,  	return err;  } -static void -ext4_ext_put_in_cache(struct inode *inode, ext4_lblk_t block, -			__u32 len, ext4_fsblk_t start, int type) -{ -	struct ext4_ext_cache *cex; -	BUG_ON(len == 0); -	spin_lock(&EXT4_I(inode)->i_block_reservation_lock); -	cex = &EXT4_I(inode)->i_cached_extent; -	cex->ec_type = type; -	cex->ec_block = block; -	cex->ec_len = len; -	cex->ec_start = start; -	spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); -} -  /*   * ext4_ext_put_gap_in_cache:   * calculate boundaries of the gap that the requested block fits into @@ -1940,15 +2283,16 @@ ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path,  				ext4_lblk_t block)  {  	int depth = ext_depth(inode); -	unsigned long len; -	ext4_lblk_t lblock; +	unsigned long len = 0; +	ext4_lblk_t lblock = 0;  	struct ext4_extent *ex;  	ex = path[depth].p_ext;  	if (ex == NULL) { -		/* there is no extent yet, so gap is [0;-] */ -		lblock = 0; -		len = EXT_MAX_BLOCK; +		/* +		 * there is no extent yet, so gap is [0;-] and we +		 * don't cache it +		 */  		ext_debug("cache gap(whole file):");  	} else if (block < le32_to_cpu(ex->ee_block)) {  		lblock = block; @@ -1957,6 +2301,9 @@ ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path,  				block,  				le32_to_cpu(ex->ee_block),  				 ext4_ext_get_actual_len(ex)); +		if (!ext4_find_delalloc_range(inode, lblock, lblock + len - 1)) +			ext4_es_insert_extent(inode, lblock, len, ~0, +					      EXTENT_STATUS_HOLE);  	} else if (block >= le32_to_cpu(ex->ee_block)  			+ ext4_ext_get_actual_len(ex)) {  		ext4_lblk_t next; @@ -1970,62 +2317,29 @@ ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path,  				block);  		BUG_ON(next == lblock);  		len = next - lblock; +		if (!ext4_find_delalloc_range(inode, lblock, lblock + len - 1)) +			ext4_es_insert_extent(inode, lblock, len, ~0, +					      EXTENT_STATUS_HOLE);  	} else { -		lblock = len = 0;  		BUG();  	}  	ext_debug(" -> %u:%lu\n", lblock, len); -	ext4_ext_put_in_cache(inode, lblock, len, 0, EXT4_EXT_CACHE_GAP); -} - -static int -ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block, -			struct ext4_extent *ex) -{ -	struct ext4_ext_cache *cex; -	int ret = EXT4_EXT_CACHE_NO; - -	/* -	 * We borrow i_block_reservation_lock to protect i_cached_extent -	 */ -	spin_lock(&EXT4_I(inode)->i_block_reservation_lock); -	cex = &EXT4_I(inode)->i_cached_extent; - -	/* has cache valid data? 
*/ -	if (cex->ec_type == EXT4_EXT_CACHE_NO) -		goto errout; - -	BUG_ON(cex->ec_type != EXT4_EXT_CACHE_GAP && -			cex->ec_type != EXT4_EXT_CACHE_EXTENT); -	if (in_range(block, cex->ec_block, cex->ec_len)) { -		ex->ee_block = cpu_to_le32(cex->ec_block); -		ext4_ext_store_pblock(ex, cex->ec_start); -		ex->ee_len = cpu_to_le16(cex->ec_len); -		ext_debug("%u cached by %u:%u:%llu\n", -				block, -				cex->ec_block, cex->ec_len, cex->ec_start); -		ret = cex->ec_type; -	} -errout: -	spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); -	return ret;  }  /*   * ext4_ext_rm_idx:   * removes index from the index block. - * It's used in truncate case only, thus all requests are for - * last index in the block only.   */  static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode, -			struct ext4_ext_path *path) +			struct ext4_ext_path *path, int depth)  {  	int err;  	ext4_fsblk_t leaf;  	/* free index block */ -	path--; +	depth--; +	path = path + depth;  	leaf = ext4_idx_pblock(path->p_idx);  	if (unlikely(path->p_hdr->eh_entries == 0)) {  		EXT4_ERROR_INODE(inode, "path->p_hdr->eh_entries == 0"); @@ -2034,13 +2348,35 @@ static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode,  	err = ext4_ext_get_access(handle, inode, path);  	if (err)  		return err; + +	if (path->p_idx != EXT_LAST_INDEX(path->p_hdr)) { +		int len = EXT_LAST_INDEX(path->p_hdr) - path->p_idx; +		len *= sizeof(struct ext4_extent_idx); +		memmove(path->p_idx, path->p_idx + 1, len); +	} +  	le16_add_cpu(&path->p_hdr->eh_entries, -1);  	err = ext4_ext_dirty(handle, inode, path);  	if (err)  		return err;  	ext_debug("index is empty, remove it, free block %llu\n", leaf); -	ext4_free_blocks(handle, inode, 0, leaf, 1, +	trace_ext4_ext_rm_idx(inode, leaf); + +	ext4_free_blocks(handle, inode, NULL, leaf, 1,  			 EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); + +	while (--depth >= 0) { +		if (path->p_idx != EXT_FIRST_INDEX(path->p_hdr)) +			break; +		path--; +		err = ext4_ext_get_access(handle, inode, path); +		if (err) +			break; +		path->p_idx->ei_block = (path+1)->p_idx->ei_block; +		err = ext4_ext_dirty(handle, inode, path); +		if (err) +			break; +	}  	return err;  } @@ -2067,7 +2403,7 @@ int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int nrblocks,  			 *  need to account for leaf block credit  			 *  			 *  bitmaps and block group descriptor blocks -			 *  and other metadat blocks still need to be +			 *  and other metadata blocks still need to be  			 *  accounted.  			 */  			/* 1 bitmap, 1 block group descriptor */ @@ -2080,22 +2416,26 @@ int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int nrblocks,  }  /* - * How many index/leaf blocks need to change/allocate to modify nrblocks? + * How many index/leaf blocks need to change/allocate to add @extents extents?   * - * if nrblocks are fit in a single extent (chunk flag is 1), then - * in the worse case, each tree level index/leaf need to be changed - * if the tree split due to insert a new extent, then the old tree - * index/leaf need to be updated too + * If we add a single extent, then in the worse case, each tree level + * index/leaf need to be changed in case of the tree split.   * - * If the nrblocks are discontiguous, they could cause - * the whole tree split more than once, but this is really rare. + * If more extents are inserted, they could cause the whole tree split more + * than once, but this is really rare.   
*/ -int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, int chunk) +int ext4_ext_index_trans_blocks(struct inode *inode, int extents)  {  	int index; -	int depth = ext_depth(inode); +	int depth; + +	/* If we are converting the inline data, only one is needed here. */ +	if (ext4_has_inline_data(inode)) +		return 1; -	if (chunk) +	depth = ext_depth(inode); + +	if (extents <= 1)  		index = depth * 2;  	else  		index = depth * 3; @@ -2103,15 +2443,49 @@ int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)  	return index;  } +static inline int get_default_free_blocks_flags(struct inode *inode) +{ +	if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) +		return EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET; +	else if (ext4_should_journal_data(inode)) +		return EXT4_FREE_BLOCKS_FORGET; +	return 0; +} +  static int ext4_remove_blocks(handle_t *handle, struct inode *inode, -				struct ext4_extent *ex, -				ext4_lblk_t from, ext4_lblk_t to) +			      struct ext4_extent *ex, +			      long long *partial_cluster, +			      ext4_lblk_t from, ext4_lblk_t to)  { +	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);  	unsigned short ee_len =  ext4_ext_get_actual_len(ex); -	int flags = EXT4_FREE_BLOCKS_FORGET; +	ext4_fsblk_t pblk; +	int flags = get_default_free_blocks_flags(inode); + +	/* +	 * For bigalloc file systems, we never free a partial cluster +	 * at the beginning of the extent.  Instead, we make a note +	 * that we tried freeing the cluster, and check to see if we +	 * need to free it on a subsequent call to ext4_remove_blocks, +	 * or at the end of the ext4_truncate() operation. +	 */ +	flags |= EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER; + +	trace_ext4_remove_blocks(inode, ex, from, to, *partial_cluster); +	/* +	 * If we have a partial cluster, and it's different from the +	 * cluster of the last block, we need to explicitly free the +	 * partial cluster here. +	 */ +	pblk = ext4_ext_pblock(ex) + ee_len - 1; +	if ((*partial_cluster > 0) && +	    (EXT4_B2C(sbi, pblk) != *partial_cluster)) { +		ext4_free_blocks(handle, inode, NULL, +				 EXT4_C2B(sbi, *partial_cluster), +				 sbi->s_cluster_ratio, flags); +		*partial_cluster = 0; +	} -	if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) -		flags |= EXT4_FREE_BLOCKS_METADATA;  #ifdef EXTENTS_STATS  	{  		struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); @@ -2131,40 +2505,84 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,  	    && to == le32_to_cpu(ex->ee_block) + ee_len - 1) {  		/* tail removal */  		ext4_lblk_t num; -		ext4_fsblk_t start; +		unsigned int unaligned;  		num = le32_to_cpu(ex->ee_block) + ee_len - from; -		start = ext4_ext_pblock(ex) + ee_len - num; -		ext_debug("free last %u blocks starting %llu\n", num, start); -		ext4_free_blocks(handle, inode, 0, start, num, flags); -	} else if (from == le32_to_cpu(ex->ee_block) -		   && to <= le32_to_cpu(ex->ee_block) + ee_len - 1) { -		printk(KERN_INFO "strange request: removal %u-%u from %u:%u\n", -			from, to, le32_to_cpu(ex->ee_block), ee_len); -	} else { -		printk(KERN_INFO "strange request: removal(2) " -				"%u-%u from %u:%u\n", -				from, to, le32_to_cpu(ex->ee_block), ee_len); -	} +		pblk = ext4_ext_pblock(ex) + ee_len - num; +		/* +		 * Usually we want to free partial cluster at the end of the +		 * extent, except for the situation when the cluster is still +		 * used by any other extent (partial_cluster is negative). 
+		 */ +		if (*partial_cluster < 0 && +		    -(*partial_cluster) == EXT4_B2C(sbi, pblk + num - 1)) +			flags |= EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER; + +		ext_debug("free last %u blocks starting %llu partial %lld\n", +			  num, pblk, *partial_cluster); +		ext4_free_blocks(handle, inode, NULL, pblk, num, flags); +		/* +		 * If the block range to be freed didn't start at the +		 * beginning of a cluster, and we removed the entire +		 * extent and the cluster is not used by any other extent, +		 * save the partial cluster here, since we might need to +		 * delete if we determine that the truncate operation has +		 * removed all of the blocks in the cluster. +		 * +		 * On the other hand, if we did not manage to free the whole +		 * extent, we have to mark the cluster as used (store negative +		 * cluster number in partial_cluster). +		 */ +		unaligned = EXT4_PBLK_COFF(sbi, pblk); +		if (unaligned && (ee_len == num) && +		    (*partial_cluster != -((long long)EXT4_B2C(sbi, pblk)))) +			*partial_cluster = EXT4_B2C(sbi, pblk); +		else if (unaligned) +			*partial_cluster = -((long long)EXT4_B2C(sbi, pblk)); +		else if (*partial_cluster > 0) +			*partial_cluster = 0; +	} else +		ext4_error(sbi->s_sb, "strange request: removal(2) " +			   "%u-%u from %u:%u\n", +			   from, to, le32_to_cpu(ex->ee_block), ee_len);  	return 0;  } + +/* + * ext4_ext_rm_leaf() Removes the extents associated with the + * blocks appearing between "start" and "end", and splits the extents + * if "start" and "end" appear in the same extent + * + * @handle: The journal handle + * @inode:  The files inode + * @path:   The path to the leaf + * @partial_cluster: The cluster which we'll have to free if all extents + *                   has been released from it. It gets negative in case + *                   that the cluster is still used. + * @start:  The first block to remove + * @end:   The last block to remove + */  static int  ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, -		struct ext4_ext_path *path, ext4_lblk_t start) +		 struct ext4_ext_path *path, +		 long long *partial_cluster, +		 ext4_lblk_t start, ext4_lblk_t end)  { +	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);  	int err = 0, correct_index = 0;  	int depth = ext_depth(inode), credits;  	struct ext4_extent_header *eh; -	ext4_lblk_t a, b, block; +	ext4_lblk_t a, b;  	unsigned num;  	ext4_lblk_t ex_ee_block;  	unsigned short ex_ee_len; -	unsigned uninitialized = 0; +	unsigned unwritten = 0;  	struct ext4_extent *ex; +	ext4_fsblk_t pblk;  	/* the header must be checked already in ext4_ext_remove_space() */ -	ext_debug("truncate since %u in leaf\n", start); +	ext_debug("truncate since %u in leaf to %u\n", start, end);  	if (!path[depth].p_hdr)  		path[depth].p_hdr = ext_block_hdr(path[depth].p_bh);  	eh = path[depth].p_hdr; @@ -2173,51 +2591,85 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,  		return -EIO;  	}  	/* find where to start removing */ -	ex = EXT_LAST_EXTENT(eh); +	ex = path[depth].p_ext; +	if (!ex) +		ex = EXT_LAST_EXTENT(eh);  	ex_ee_block = le32_to_cpu(ex->ee_block);  	ex_ee_len = ext4_ext_get_actual_len(ex); +	/* +	 * If we're starting with an extent other than the last one in the +	 * node, we need to see if it shares a cluster with the extent to +	 * the right (towards the end of the file). If its leftmost cluster +	 * is this extent's rightmost cluster and it is not cluster aligned, +	 * we'll mark it as a partial that is not to be deallocated. 
+	 */ + +	if (ex != EXT_LAST_EXTENT(eh)) { +		ext4_fsblk_t current_pblk, right_pblk; +		long long current_cluster, right_cluster; + +		current_pblk = ext4_ext_pblock(ex) + ex_ee_len - 1; +		current_cluster = (long long)EXT4_B2C(sbi, current_pblk); +		right_pblk = ext4_ext_pblock(ex + 1); +		right_cluster = (long long)EXT4_B2C(sbi, right_pblk); +		if (current_cluster == right_cluster && +			EXT4_PBLK_COFF(sbi, right_pblk)) +			*partial_cluster = -right_cluster; +	} + +	trace_ext4_ext_rm_leaf(inode, start, ex, *partial_cluster); +  	while (ex >= EXT_FIRST_EXTENT(eh) &&  			ex_ee_block + ex_ee_len > start) { -		if (ext4_ext_is_uninitialized(ex)) -			uninitialized = 1; +		if (ext4_ext_is_unwritten(ex)) +			unwritten = 1;  		else -			uninitialized = 0; +			unwritten = 0;  		ext_debug("remove ext %u:[%d]%d\n", ex_ee_block, -			 uninitialized, ex_ee_len); +			  unwritten, ex_ee_len);  		path[depth].p_ext = ex;  		a = ex_ee_block > start ? ex_ee_block : start; -		b = ex_ee_block + ex_ee_len - 1 < EXT_MAX_BLOCK ? -			ex_ee_block + ex_ee_len - 1 : EXT_MAX_BLOCK; +		b = ex_ee_block+ex_ee_len - 1 < end ? +			ex_ee_block+ex_ee_len - 1 : end;  		ext_debug("  border %u:%u\n", a, b); -		if (a != ex_ee_block && b != ex_ee_block + ex_ee_len - 1) { -			block = 0; -			num = 0; -			BUG(); +		/* If this extent is beyond the end of the hole, skip it */ +		if (end < ex_ee_block) { +			/* +			 * We're going to skip this extent and move to another, +			 * so if this extent is not cluster aligned we have +			 * to mark the current cluster as used to avoid +			 * accidentally freeing it later on +			 */ +			pblk = ext4_ext_pblock(ex); +			if (EXT4_PBLK_COFF(sbi, pblk)) +				*partial_cluster = +					-((long long)EXT4_B2C(sbi, pblk)); +			ex--; +			ex_ee_block = le32_to_cpu(ex->ee_block); +			ex_ee_len = ext4_ext_get_actual_len(ex); +			continue; +		} else if (b != ex_ee_block + ex_ee_len - 1) { +			EXT4_ERROR_INODE(inode, +					 "can not handle truncate %u:%u " +					 "on extent %u:%u", +					 start, end, ex_ee_block, +					 ex_ee_block + ex_ee_len - 1); +			err = -EIO; +			goto out;  		} else if (a != ex_ee_block) {  			/* remove tail of the extent */ -			block = ex_ee_block; -			num = a - block; -		} else if (b != ex_ee_block + ex_ee_len - 1) { -			/* remove head of the extent */ -			block = a; -			num = b - a; -			/* there is no "make a hole" API yet */ -			BUG(); +			num = a - ex_ee_block;  		} else {  			/* remove whole extent: excellent! */ -			block = ex_ee_block;  			num = 0; -			BUG_ON(a != ex_ee_block); -			BUG_ON(b != ex_ee_block + ex_ee_len - 1);  		} -  		/*  		 * 3 for leaf, sb, and inode plus 2 (bmap and group  		 * descriptor) for each block group; assume two block @@ -2239,30 +2691,49 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,  		if (err)  			goto out; -		err = ext4_remove_blocks(handle, inode, ex, a, b); +		err = ext4_remove_blocks(handle, inode, ex, partial_cluster, +					 a, b);  		if (err)  			goto out; -		if (num == 0) { +		if (num == 0)  			/* this extent is removed; mark slot entirely unused */  			ext4_ext_store_pblock(ex, 0); -			le16_add_cpu(&eh->eh_entries, -1); -		} -		ex->ee_block = cpu_to_le32(block);  		ex->ee_len = cpu_to_le16(num);  		/* -		 * Do not mark uninitialized if all the blocks in the +		 * Do not mark unwritten if all the blocks in the  		 * extent have been removed.  		 
*/ -		if (uninitialized && num) -			ext4_ext_mark_uninitialized(ex); +		if (unwritten && num) +			ext4_ext_mark_unwritten(ex); +		/* +		 * If the extent was completely released, +		 * we need to remove it from the leaf +		 */ +		if (num == 0) { +			if (end != EXT_MAX_BLOCKS - 1) { +				/* +				 * For hole punching, we need to scoot all the +				 * extents up when an extent is removed so that +				 * we dont have blank extents in the middle +				 */ +				memmove(ex, ex+1, (EXT_LAST_EXTENT(eh) - ex) * +					sizeof(struct ext4_extent)); + +				/* Now get rid of the one at the end */ +				memset(EXT_LAST_EXTENT(eh), 0, +					sizeof(struct ext4_extent)); +			} +			le16_add_cpu(&eh->eh_entries, -1); +		} else if (*partial_cluster > 0) +			*partial_cluster = 0;  		err = ext4_ext_dirty(handle, inode, path + depth);  		if (err)  			goto out; -		ext_debug("new extent: %u:%u:%llu\n", block, num, +		ext_debug("new extent: %u:%u:%llu\n", ex_ee_block, num,  				ext4_ext_pblock(ex));  		ex--;  		ex_ee_block = le32_to_cpu(ex->ee_block); @@ -2272,10 +2743,30 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,  	if (correct_index && eh->eh_entries)  		err = ext4_ext_correct_indexes(handle, inode, path); +	/* +	 * If there's a partial cluster and at least one extent remains in +	 * the leaf, free the partial cluster if it isn't shared with the +	 * current extent.  If there's a partial cluster and no extents +	 * remain in the leaf, it can't be freed here.  It can only be +	 * freed when it's possible to determine if it's not shared with +	 * any other extent - when the next leaf is processed or when space +	 * removal is complete. +	 */ +	if (*partial_cluster > 0 && eh->eh_entries && +	    (EXT4_B2C(sbi, ext4_ext_pblock(ex) + ex_ee_len - 1) != +	     *partial_cluster)) { +		int flags = get_default_free_blocks_flags(inode); + +		ext4_free_blocks(handle, inode, NULL, +				 EXT4_C2B(sbi, *partial_cluster), +				 sbi->s_cluster_ratio, flags); +		*partial_cluster = 0; +	} +  	/* if this leaf is free, then we should  	 * remove it from index block above */  	if (err == 0 && eh->eh_entries == 0 && path[depth].p_bh != NULL) -		err = ext4_ext_rm_idx(handle, inode, path + depth); +		err = ext4_ext_rm_idx(handle, inode, path, depth);  out:  	return err; @@ -2302,46 +2793,122 @@ ext4_ext_more_to_rm(struct ext4_ext_path *path)  	return 1;  } -static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start) +int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start, +			  ext4_lblk_t end)  {  	struct super_block *sb = inode->i_sb;  	int depth = ext_depth(inode); -	struct ext4_ext_path *path; +	struct ext4_ext_path *path = NULL; +	long long partial_cluster = 0;  	handle_t *handle; -	int i, err; +	int i = 0, err = 0; -	ext_debug("truncate since %u\n", start); +	ext_debug("truncate since %u to %u\n", start, end);  	/* probably first extent we're gonna free will be last in block */ -	handle = ext4_journal_start(inode, depth + 1); +	handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, depth + 1);  	if (IS_ERR(handle))  		return PTR_ERR(handle);  again: -	ext4_ext_invalidate_cache(inode); +	trace_ext4_ext_remove_space(inode, start, end, depth);  	/* +	 * Check if we are removing extents inside the extent tree. If that +	 * is the case, we are going to punch a hole inside the extent tree +	 * so we have to check whether we need to split the extent covering +	 * the last block to remove so we can easily remove the part of it +	 * in ext4_ext_rm_leaf(). 
+	 */ +	if (end < EXT_MAX_BLOCKS - 1) { +		struct ext4_extent *ex; +		ext4_lblk_t ee_block; + +		/* find extent for this block */ +		path = ext4_ext_find_extent(inode, end, NULL, EXT4_EX_NOCACHE); +		if (IS_ERR(path)) { +			ext4_journal_stop(handle); +			return PTR_ERR(path); +		} +		depth = ext_depth(inode); +		/* Leaf not may not exist only if inode has no blocks at all */ +		ex = path[depth].p_ext; +		if (!ex) { +			if (depth) { +				EXT4_ERROR_INODE(inode, +						 "path[%d].p_hdr == NULL", +						 depth); +				err = -EIO; +			} +			goto out; +		} + +		ee_block = le32_to_cpu(ex->ee_block); + +		/* +		 * See if the last block is inside the extent, if so split +		 * the extent at 'end' block so we can easily remove the +		 * tail of the first part of the split extent in +		 * ext4_ext_rm_leaf(). +		 */ +		if (end >= ee_block && +		    end < ee_block + ext4_ext_get_actual_len(ex) - 1) { +			int split_flag = 0; + +			if (ext4_ext_is_unwritten(ex)) +				split_flag = EXT4_EXT_MARK_UNWRIT1 | +					     EXT4_EXT_MARK_UNWRIT2; + +			/* +			 * Split the extent in two so that 'end' is the last +			 * block in the first new extent. Also we should not +			 * fail removing space due to ENOSPC so try to use +			 * reserved block if that happens. +			 */ +			err = ext4_split_extent_at(handle, inode, path, +					end + 1, split_flag, +					EXT4_EX_NOCACHE | +					EXT4_GET_BLOCKS_PRE_IO | +					EXT4_GET_BLOCKS_METADATA_NOFAIL); + +			if (err < 0) +				goto out; +		} +	} +	/*  	 * We start scanning from right side, freeing all the blocks  	 * after i_size and walking into the tree depth-wise.  	 */  	depth = ext_depth(inode); -	path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 1), GFP_NOFS); -	if (path == NULL) { -		ext4_journal_stop(handle); -		return -ENOMEM; -	} -	path[0].p_depth = depth; -	path[0].p_hdr = ext_inode_hdr(inode); -	if (ext4_ext_check(inode, path[0].p_hdr, depth)) { -		err = -EIO; -		goto out; +	if (path) { +		int k = i = depth; +		while (--k > 0) +			path[k].p_block = +				le16_to_cpu(path[k].p_hdr->eh_entries)+1; +	} else { +		path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 1), +			       GFP_NOFS); +		if (path == NULL) { +			ext4_journal_stop(handle); +			return -ENOMEM; +		} +		path[0].p_depth = depth; +		path[0].p_hdr = ext_inode_hdr(inode); +		i = 0; + +		if (ext4_ext_check(inode, path[0].p_hdr, depth, 0)) { +			err = -EIO; +			goto out; +		}  	} -	i = err = 0; +	err = 0;  	while (i >= 0 && err == 0) {  		if (i == depth) {  			/* this is leaf block */ -			err = ext4_ext_rm_leaf(handle, inode, path, start); +			err = ext4_ext_rm_leaf(handle, inode, path, +					       &partial_cluster, start, +					       end);  			/* root level has p_bh == NULL, brelse() eats this */  			brelse(path[i].p_bh);  			path[i].p_bh = NULL; @@ -2376,21 +2943,21 @@ again:  			ext_debug("move to level %d (block %llu)\n",  				  i + 1, ext4_idx_pblock(path[i].p_idx));  			memset(path + i + 1, 0, sizeof(*path)); -			bh = sb_bread(sb, ext4_idx_pblock(path[i].p_idx)); -			if (!bh) { +			bh = read_extent_tree_block(inode, +				ext4_idx_pblock(path[i].p_idx), depth - i - 1, +				EXT4_EX_NOCACHE); +			if (IS_ERR(bh)) {  				/* should we reset i_size? */ -				err = -EIO; +				err = PTR_ERR(bh);  				break;  			} +			/* Yield here to deal with large extent trees. +			 * Should be a no-op if we did IO above. 
*/ +			cond_resched();  			if (WARN_ON(i + 1 > depth)) {  				err = -EIO;  				break;  			} -			if (ext4_ext_check(inode, ext_block_hdr(bh), -							depth - i - 1)) { -				err = -EIO; -				break; -			}  			path[i + 1].p_bh = bh;  			/* save actual number of indexes since this  @@ -2403,7 +2970,7 @@ again:  				/* index is empty, remove it;  				 * handle must be already prepared by the  				 * truncatei_leaf() */ -				err = ext4_ext_rm_idx(handle, inode, path + i); +				err = ext4_ext_rm_idx(handle, inode, path, i);  			}  			/* root level has p_bh == NULL, brelse() eats this */  			brelse(path[i].p_bh); @@ -2413,6 +2980,21 @@ again:  		}  	} +	trace_ext4_ext_remove_space_done(inode, start, end, depth, +			partial_cluster, path->p_hdr->eh_entries); + +	/* If we still have something in the partial cluster and we have removed +	 * even the first extent, then we should free the blocks in the partial +	 * cluster as well. */ +	if (partial_cluster > 0 && path->p_hdr->eh_entries == 0) { +		int flags = get_default_free_blocks_flags(inode); + +		ext4_free_blocks(handle, inode, NULL, +				 EXT4_C2B(EXT4_SB(sb), partial_cluster), +				 EXT4_SB(sb)->s_cluster_ratio, flags); +		partial_cluster = 0; +	} +  	/* TODO: flexible tree reduction should be here */  	if (path->p_hdr->eh_entries == 0) {  		/* @@ -2430,8 +3012,10 @@ again:  out:  	ext4_ext_drop_refs(path);  	kfree(path); -	if (err == -EAGAIN) +	if (err == -EAGAIN) { +		path = NULL;  		goto again; +	}  	ext4_journal_stop(handle);  	return err; @@ -2448,17 +3032,17 @@ void ext4_ext_init(struct super_block *sb)  	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) {  #if defined(AGGRESSIVE_TEST) || defined(CHECK_BINSEARCH) || defined(EXTENTS_STATS) -		printk(KERN_INFO "EXT4-fs: file extents enabled"); +		printk(KERN_INFO "EXT4-fs: file extents enabled"  #ifdef AGGRESSIVE_TEST -		printk(", aggressive tests"); +		       ", aggressive tests"  #endif  #ifdef CHECK_BINSEARCH -		printk(", check binsearch"); +		       ", check binsearch"  #endif  #ifdef EXTENTS_STATS -		printk(", stats"); +		       ", stats"  #endif -		printk("\n"); +		       "\n");  #endif  #ifdef EXTENTS_STATS  		spin_lock_init(&EXT4_SB(sb)->s_ext_stats_lock); @@ -2488,6 +3072,23 @@ void ext4_ext_release(struct super_block *sb)  #endif  } +static int ext4_zeroout_es(struct inode *inode, struct ext4_extent *ex) +{ +	ext4_lblk_t  ee_block; +	ext4_fsblk_t ee_pblock; +	unsigned int ee_len; + +	ee_block  = le32_to_cpu(ex->ee_block); +	ee_len    = ext4_ext_get_actual_len(ex); +	ee_pblock = ext4_ext_pblock(ex); + +	if (ee_len == 0) +		return 0; + +	return ext4_es_insert_extent(inode, ee_block, ee_len, ee_pblock, +				     EXTENT_STATUS_WRITTEN); +} +  /* FIXME!! we need to try to merge to left or right after zero-out  */  static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)  { @@ -2505,529 +3106,673 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)  	return ret;  } -#define EXT4_EXT_ZERO_LEN 7 +/* + * ext4_split_extent_at() splits an extent at a given block. + * + * @handle: the journal handle + * @inode: the file inode + * @path: the path to the extent + * @split: the logical block where the extent is split. + * @split_flag: indicates if the extent can be zeroed out if the split fails, and + *		 the states (initialized or unwritten) of the new extents. + * @flags: flags used to insert the new extent into the extent tree. + * + * + * Splits extent [a, b] into two extents [a, @split) and [@split, b], states + * of which are determined by split_flag.
+ * + * There are two cases: + *  a> the extent is split into two extents. + *  b> split is not needed, and we just mark the extent. + * + * return 0 on success. + */ +static int ext4_split_extent_at(handle_t *handle, +			     struct inode *inode, +			     struct ext4_ext_path *path, +			     ext4_lblk_t split, +			     int split_flag, +			     int flags) +{ +	ext4_fsblk_t newblock; +	ext4_lblk_t ee_block; +	struct ext4_extent *ex, newex, orig_ex, zero_ex; +	struct ext4_extent *ex2 = NULL; +	unsigned int ee_len, depth; +	int err = 0; + +	BUG_ON((split_flag & (EXT4_EXT_DATA_VALID1 | EXT4_EXT_DATA_VALID2)) == +	       (EXT4_EXT_DATA_VALID1 | EXT4_EXT_DATA_VALID2)); + +	ext_debug("ext4_split_extents_at: inode %lu, logical" +		"block %llu\n", inode->i_ino, (unsigned long long)split); + +	ext4_ext_show_leaf(inode, path); + +	depth = ext_depth(inode); +	ex = path[depth].p_ext; +	ee_block = le32_to_cpu(ex->ee_block); +	ee_len = ext4_ext_get_actual_len(ex); +	newblock = split - ee_block + ext4_ext_pblock(ex); + +	BUG_ON(split < ee_block || split >= (ee_block + ee_len)); +	BUG_ON(!ext4_ext_is_unwritten(ex) && +	       split_flag & (EXT4_EXT_MAY_ZEROOUT | +			     EXT4_EXT_MARK_UNWRIT1 | +			     EXT4_EXT_MARK_UNWRIT2)); + +	err = ext4_ext_get_access(handle, inode, path + depth); +	if (err) +		goto out; + +	if (split == ee_block) { +		/* +		 * case b: block @split is the block that the extent begins with, +		 * so we just change the state of the extent, and splitting +		 * is not needed. +		 */ +		if (split_flag & EXT4_EXT_MARK_UNWRIT2) +			ext4_ext_mark_unwritten(ex); +		else +			ext4_ext_mark_initialized(ex); + +		if (!(flags & EXT4_GET_BLOCKS_PRE_IO)) +			ext4_ext_try_to_merge(handle, inode, path, ex); + +		err = ext4_ext_dirty(handle, inode, path + path->p_depth); +		goto out; +	} + +	/* case a */ +	memcpy(&orig_ex, ex, sizeof(orig_ex)); +	ex->ee_len = cpu_to_le16(split - ee_block); +	if (split_flag & EXT4_EXT_MARK_UNWRIT1) +		ext4_ext_mark_unwritten(ex); + +	/* +	 * path may lead to new leaf, not to original leaf any more +	 * after ext4_ext_insert_extent() returns. +	 */ +	err = ext4_ext_dirty(handle, inode, path + depth); +	if (err) +		goto fix_extent_len; + +	ex2 = &newex; +	ex2->ee_block = cpu_to_le32(split); +	ex2->ee_len   = cpu_to_le16(ee_len - (split - ee_block)); +	ext4_ext_store_pblock(ex2, newblock); +	if (split_flag & EXT4_EXT_MARK_UNWRIT2) +		ext4_ext_mark_unwritten(ex2); + +	err = ext4_ext_insert_extent(handle, inode, path, &newex, flags); +	if (err == -ENOSPC && (EXT4_EXT_MAY_ZEROOUT & split_flag)) { +		if (split_flag & (EXT4_EXT_DATA_VALID1|EXT4_EXT_DATA_VALID2)) { +			if (split_flag & EXT4_EXT_DATA_VALID1) { +				err = ext4_ext_zeroout(inode, ex2); +				zero_ex.ee_block = ex2->ee_block; +				zero_ex.ee_len = cpu_to_le16( +						ext4_ext_get_actual_len(ex2)); +				ext4_ext_store_pblock(&zero_ex, +						      ext4_ext_pblock(ex2)); +			} else { +				err = ext4_ext_zeroout(inode, ex); +				zero_ex.ee_block = ex->ee_block; +				zero_ex.ee_len = cpu_to_le16( +						ext4_ext_get_actual_len(ex)); +				ext4_ext_store_pblock(&zero_ex, +						      ext4_ext_pblock(ex)); +			} +		} else { +			err = ext4_ext_zeroout(inode, &orig_ex); +			zero_ex.ee_block = orig_ex.ee_block; +			zero_ex.ee_len = cpu_to_le16( +						ext4_ext_get_actual_len(&orig_ex)); +			ext4_ext_store_pblock(&zero_ex, +					      ext4_ext_pblock(&orig_ex)); +		} + +		if (err) +			goto fix_extent_len; +		/* update the extent length and mark as initialized */ +		ex->ee_len = cpu_to_le16(ee_len); +		
ext4_ext_try_to_merge(handle, inode, path, ex); +		err = ext4_ext_dirty(handle, inode, path + path->p_depth); +		if (err) +			goto fix_extent_len; + +		/* update extent status tree */ +		err = ext4_zeroout_es(inode, &zero_ex); + +		goto out; +	} else if (err) +		goto fix_extent_len; + +out: +	ext4_ext_show_leaf(inode, path); +	return err; + +fix_extent_len: +	ex->ee_len = orig_ex.ee_len; +	ext4_ext_dirty(handle, inode, path + depth); +	return err; +} + +/* + * ext4_split_extents() splits an extent and mark extent which is covered + * by @map as split_flags indicates + * + * It may result in splitting the extent into multiple extents (up to three) + * There are three possibilities: + *   a> There is no split required + *   b> Splits in two extents: Split is happening at either end of the extent + *   c> Splits in three extents: Somone is splitting in middle of the extent + * + */ +static int ext4_split_extent(handle_t *handle, +			      struct inode *inode, +			      struct ext4_ext_path *path, +			      struct ext4_map_blocks *map, +			      int split_flag, +			      int flags) +{ +	ext4_lblk_t ee_block; +	struct ext4_extent *ex; +	unsigned int ee_len, depth; +	int err = 0; +	int unwritten; +	int split_flag1, flags1; +	int allocated = map->m_len; + +	depth = ext_depth(inode); +	ex = path[depth].p_ext; +	ee_block = le32_to_cpu(ex->ee_block); +	ee_len = ext4_ext_get_actual_len(ex); +	unwritten = ext4_ext_is_unwritten(ex); + +	if (map->m_lblk + map->m_len < ee_block + ee_len) { +		split_flag1 = split_flag & EXT4_EXT_MAY_ZEROOUT; +		flags1 = flags | EXT4_GET_BLOCKS_PRE_IO; +		if (unwritten) +			split_flag1 |= EXT4_EXT_MARK_UNWRIT1 | +				       EXT4_EXT_MARK_UNWRIT2; +		if (split_flag & EXT4_EXT_DATA_VALID2) +			split_flag1 |= EXT4_EXT_DATA_VALID1; +		err = ext4_split_extent_at(handle, inode, path, +				map->m_lblk + map->m_len, split_flag1, flags1); +		if (err) +			goto out; +	} else { +		allocated = ee_len - (map->m_lblk - ee_block); +	} +	/* +	 * Update path is required because previous ext4_split_extent_at() may +	 * result in split of original leaf or extent zeroout. +	 */ +	ext4_ext_drop_refs(path); +	path = ext4_ext_find_extent(inode, map->m_lblk, path, 0); +	if (IS_ERR(path)) +		return PTR_ERR(path); +	depth = ext_depth(inode); +	ex = path[depth].p_ext; +	if (!ex) { +		EXT4_ERROR_INODE(inode, "unexpected hole at %lu", +				 (unsigned long) map->m_lblk); +		return -EIO; +	} +	unwritten = ext4_ext_is_unwritten(ex); +	split_flag1 = 0; + +	if (map->m_lblk >= ee_block) { +		split_flag1 = split_flag & EXT4_EXT_DATA_VALID2; +		if (unwritten) { +			split_flag1 |= EXT4_EXT_MARK_UNWRIT1; +			split_flag1 |= split_flag & (EXT4_EXT_MAY_ZEROOUT | +						     EXT4_EXT_MARK_UNWRIT2); +		} +		err = ext4_split_extent_at(handle, inode, path, +				map->m_lblk, split_flag1, flags); +		if (err) +			goto out; +	} + +	ext4_ext_show_leaf(inode, path); +out: +	return err ? err : allocated; +} +  /*   * This function is called by ext4_ext_map_blocks() if someone tries to write - * to an uninitialized extent. It may result in splitting the uninitialized - * extent into multiple extents (upto three - one initialized and two - * uninitialized). + * to an unwritten extent. It may result in splitting the unwritten + * extent into multiple extents (up to three - one initialized and two + * unwritten).   
* There are three possibilities:   *   a> There is no split required: Entire extent should be initialized   *   b> Splits in two extents: Write is happening at either end of the extent   *   c> Splits in three extents: Somone is writing in middle of the extent + * + * Pre-conditions: + *  - The extent pointed to by 'path' is unwritten. + *  - The extent pointed to by 'path' contains a superset + *    of the logical span [map->m_lblk, map->m_lblk + map->m_len). + * + * Post-conditions on success: + *  - the returned value is the number of blocks beyond map->l_lblk + *    that are allocated and initialized. + *    It is guaranteed to be >= map->m_len.   */  static int ext4_ext_convert_to_initialized(handle_t *handle,  					   struct inode *inode,  					   struct ext4_map_blocks *map, -					   struct ext4_ext_path *path) +					   struct ext4_ext_path *path, +					   int flags)  { -	struct ext4_extent *ex, newex, orig_ex; -	struct ext4_extent *ex1 = NULL; -	struct ext4_extent *ex2 = NULL; -	struct ext4_extent *ex3 = NULL; +	struct ext4_sb_info *sbi;  	struct ext4_extent_header *eh; +	struct ext4_map_blocks split_map; +	struct ext4_extent zero_ex; +	struct ext4_extent *ex, *abut_ex;  	ext4_lblk_t ee_block, eof_block; -	unsigned int allocated, ee_len, depth; -	ext4_fsblk_t newblock; +	unsigned int ee_len, depth, map_len = map->m_len; +	int allocated = 0, max_zeroout = 0;  	int err = 0; -	int ret = 0; -	int may_zeroout; +	int split_flag = 0;  	ext_debug("ext4_ext_convert_to_initialized: inode %lu, logical"  		"block %llu, max_blocks %u\n", inode->i_ino, -		(unsigned long long)map->m_lblk, map->m_len); +		(unsigned long long)map->m_lblk, map_len); +	sbi = EXT4_SB(inode->i_sb);  	eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >>  		inode->i_sb->s_blocksize_bits; -	if (eof_block < map->m_lblk + map->m_len) -		eof_block = map->m_lblk + map->m_len; +	if (eof_block < map->m_lblk + map_len) +		eof_block = map->m_lblk + map_len;  	depth = ext_depth(inode);  	eh = path[depth].p_hdr;  	ex = path[depth].p_ext;  	ee_block = le32_to_cpu(ex->ee_block);  	ee_len = ext4_ext_get_actual_len(ex); -	allocated = ee_len - (map->m_lblk - ee_block); -	newblock = map->m_lblk - ee_block + ext4_ext_pblock(ex); +	zero_ex.ee_len = 0; -	ex2 = ex; -	orig_ex.ee_block = ex->ee_block; -	orig_ex.ee_len   = cpu_to_le16(ee_len); -	ext4_ext_store_pblock(&orig_ex, ext4_ext_pblock(ex)); +	trace_ext4_ext_convert_to_initialized_enter(inode, map, ex); -	/* -	 * It is safe to convert extent to initialized via explicit -	 * zeroout only if extent is fully insde i_size or new_size. 
-	 */ -	may_zeroout = ee_block + ee_len <= eof_block; +	/* Pre-conditions */ +	BUG_ON(!ext4_ext_is_unwritten(ex)); +	BUG_ON(!in_range(map->m_lblk, ee_block, ee_len)); -	err = ext4_ext_get_access(handle, inode, path + depth); -	if (err) -		goto out; -	/* If extent has less than 2*EXT4_EXT_ZERO_LEN zerout directly */ -	if (ee_len <= 2*EXT4_EXT_ZERO_LEN && may_zeroout) { -		err =  ext4_ext_zeroout(inode, &orig_ex); -		if (err) -			goto fix_extent_len; -		/* update the extent length and mark as initialized */ -		ex->ee_block = orig_ex.ee_block; -		ex->ee_len   = orig_ex.ee_len; -		ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex)); -		ext4_ext_dirty(handle, inode, path + depth); -		/* zeroed the full extent */ -		return allocated; -	} - -	/* ex1: ee_block to map->m_lblk - 1 : uninitialized */ -	if (map->m_lblk > ee_block) { -		ex1 = ex; -		ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block); -		ext4_ext_mark_uninitialized(ex1); -		ex2 = &newex; -	}  	/* -	 * for sanity, update the length of the ex2 extent before -	 * we insert ex3, if ex1 is NULL. This is to avoid temporary -	 * overlap of blocks. +	 * Attempt to transfer newly initialized blocks from the currently +	 * unwritten extent to its neighbor. This is much cheaper +	 * than an insertion followed by a merge as those involve costly +	 * memmove() calls. Transferring to the left is the common case in +	 * steady state for workloads doing fallocate(FALLOC_FL_KEEP_SIZE) +	 * followed by append writes. +	 * +	 * Limitations of the current logic: +	 *  - L1: we do not deal with writes covering the whole extent. +	 *    This would require removing the extent if the transfer +	 *    is possible. +	 *  - L2: we only attempt to merge with an extent stored in the +	 *    same extent tree node.  	 */ -	if (!ex1 && allocated > map->m_len) -		ex2->ee_len = cpu_to_le16(map->m_len); -	/* ex3: to ee_block + ee_len : uninitialised */ -	if (allocated > map->m_len) { -		unsigned int newdepth; -		/* If extent has less than EXT4_EXT_ZERO_LEN zerout directly */ -		if (allocated <= EXT4_EXT_ZERO_LEN && may_zeroout) { -			/* -			 * map->m_lblk == ee_block is handled by the zerouout -			 * at the beginning. -			 * Mark first half uninitialized. 
-			 * Mark second half initialized and zero out the -			 * initialized extent -			 */ -			ex->ee_block = orig_ex.ee_block; -			ex->ee_len   = cpu_to_le16(ee_len - allocated); -			ext4_ext_mark_uninitialized(ex); -			ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex)); -			ext4_ext_dirty(handle, inode, path + depth); - -			ex3 = &newex; -			ex3->ee_block = cpu_to_le32(map->m_lblk); -			ext4_ext_store_pblock(ex3, newblock); -			ex3->ee_len = cpu_to_le16(allocated); -			err = ext4_ext_insert_extent(handle, inode, path, -							ex3, 0); -			if (err == -ENOSPC) { -				err =  ext4_ext_zeroout(inode, &orig_ex); -				if (err) -					goto fix_extent_len; -				ex->ee_block = orig_ex.ee_block; -				ex->ee_len   = orig_ex.ee_len; -				ext4_ext_store_pblock(ex, -					ext4_ext_pblock(&orig_ex)); -				ext4_ext_dirty(handle, inode, path + depth); -				/* blocks available from map->m_lblk */ -				return allocated; - -			} else if (err) -				goto fix_extent_len; +	if ((map->m_lblk == ee_block) && +		/* See if we can merge left */ +		(map_len < ee_len) &&		/*L1*/ +		(ex > EXT_FIRST_EXTENT(eh))) {	/*L2*/ +		ext4_lblk_t prev_lblk; +		ext4_fsblk_t prev_pblk, ee_pblk; +		unsigned int prev_len; + +		abut_ex = ex - 1; +		prev_lblk = le32_to_cpu(abut_ex->ee_block); +		prev_len = ext4_ext_get_actual_len(abut_ex); +		prev_pblk = ext4_ext_pblock(abut_ex); +		ee_pblk = ext4_ext_pblock(ex); -			/* -			 * We need to zero out the second half because -			 * an fallocate request can update file size and -			 * converting the second half to initialized extent -			 * implies that we can leak some junk data to user -			 * space. -			 */ -			err =  ext4_ext_zeroout(inode, ex3); -			if (err) { -				/* -				 * We should actually mark the -				 * second half as uninit and return error -				 * Insert would have changed the extent -				 */ -				depth = ext_depth(inode); -				ext4_ext_drop_refs(path); -				path = ext4_ext_find_extent(inode, map->m_lblk, -							    path); -				if (IS_ERR(path)) { -					err = PTR_ERR(path); -					return err; -				} -				/* get the second half extent details */ -				ex = path[depth].p_ext; -				err = ext4_ext_get_access(handle, inode, -								path + depth); -				if (err) -					return err; -				ext4_ext_mark_uninitialized(ex); -				ext4_ext_dirty(handle, inode, path + depth); -				return err; -			} - -			/* zeroed the second half */ -			return allocated; -		} -		ex3 = &newex; -		ex3->ee_block = cpu_to_le32(map->m_lblk + map->m_len); -		ext4_ext_store_pblock(ex3, newblock + map->m_len); -		ex3->ee_len = cpu_to_le16(allocated - map->m_len); -		ext4_ext_mark_uninitialized(ex3); -		err = ext4_ext_insert_extent(handle, inode, path, ex3, 0); -		if (err == -ENOSPC && may_zeroout) { -			err =  ext4_ext_zeroout(inode, &orig_ex); -			if (err) -				goto fix_extent_len; -			/* update the extent length and mark as initialized */ -			ex->ee_block = orig_ex.ee_block; -			ex->ee_len   = orig_ex.ee_len; -			ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex)); -			ext4_ext_dirty(handle, inode, path + depth); -			/* zeroed the full extent */ -			/* blocks available from map->m_lblk */ -			return allocated; - -		} else if (err) -			goto fix_extent_len;  		/* -		 * The depth, and hence eh & ex might change -		 * as part of the insert above. 
+		 * A transfer of blocks from 'ex' to 'abut_ex' is allowed +		 * upon those conditions: +		 * - C1: abut_ex is initialized, +		 * - C2: abut_ex is logically abutting ex, +		 * - C3: abut_ex is physically abutting ex, +		 * - C4: abut_ex can receive the additional blocks without +		 *   overflowing the (initialized) length limit.  		 */ -		newdepth = ext_depth(inode); -		/* -		 * update the extent length after successful insert of the -		 * split extent -		 */ -		ee_len -= ext4_ext_get_actual_len(ex3); -		orig_ex.ee_len = cpu_to_le16(ee_len); -		may_zeroout = ee_block + ee_len <= eof_block; +		if ((!ext4_ext_is_unwritten(abut_ex)) &&		/*C1*/ +			((prev_lblk + prev_len) == ee_block) &&		/*C2*/ +			((prev_pblk + prev_len) == ee_pblk) &&		/*C3*/ +			(prev_len < (EXT_INIT_MAX_LEN - map_len))) {	/*C4*/ +			err = ext4_ext_get_access(handle, inode, path + depth); +			if (err) +				goto out; -		depth = newdepth; -		ext4_ext_drop_refs(path); -		path = ext4_ext_find_extent(inode, map->m_lblk, path); -		if (IS_ERR(path)) { -			err = PTR_ERR(path); -			goto out; -		} -		eh = path[depth].p_hdr; -		ex = path[depth].p_ext; -		if (ex2 != &newex) -			ex2 = ex; +			trace_ext4_ext_convert_to_initialized_fastpath(inode, +				map, ex, abut_ex); -		err = ext4_ext_get_access(handle, inode, path + depth); -		if (err) -			goto out; +			/* Shift the start of ex by 'map_len' blocks */ +			ex->ee_block = cpu_to_le32(ee_block + map_len); +			ext4_ext_store_pblock(ex, ee_pblk + map_len); +			ex->ee_len = cpu_to_le16(ee_len - map_len); +			ext4_ext_mark_unwritten(ex); /* Restore the flag */ -		allocated = map->m_len; +			/* Extend abut_ex by 'map_len' blocks */ +			abut_ex->ee_len = cpu_to_le16(prev_len + map_len); -		/* If extent has less than EXT4_EXT_ZERO_LEN and we are trying -		 * to insert a extent in the middle zerout directly -		 * otherwise give the extent a chance to merge to left -		 */ -		if (le16_to_cpu(orig_ex.ee_len) <= EXT4_EXT_ZERO_LEN && -			map->m_lblk != ee_block && may_zeroout) { -			err =  ext4_ext_zeroout(inode, &orig_ex); -			if (err) -				goto fix_extent_len; -			/* update the extent length and mark as initialized */ -			ex->ee_block = orig_ex.ee_block; -			ex->ee_len   = orig_ex.ee_len; -			ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex)); -			ext4_ext_dirty(handle, inode, path + depth); -			/* zero out the first half */ -			/* blocks available from map->m_lblk */ -			return allocated; +			/* Result: number of initialized blocks past m_lblk */ +			allocated = map_len;  		} -	} -	/* -	 * If there was a change of depth as part of the -	 * insertion of ex3 above, we need to update the length -	 * of the ex1 extent again here -	 */ -	if (ex1 && ex1 != ex) { -		ex1 = ex; -		ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block); -		ext4_ext_mark_uninitialized(ex1); -		ex2 = &newex; -	} -	/* ex2: map->m_lblk to map->m_lblk + maxblocks-1 : initialised */ -	ex2->ee_block = cpu_to_le32(map->m_lblk); -	ext4_ext_store_pblock(ex2, newblock); -	ex2->ee_len = cpu_to_le16(allocated); -	if (ex2 != ex) -		goto insert; -	/* -	 * New (initialized) extent starts from the first block -	 * in the current extent. i.e., ex2 == ex -	 * We have to see if it can be merged with the extent -	 * on the left. 
-	 */ -	if (ex2 > EXT_FIRST_EXTENT(eh)) { +	} else if (((map->m_lblk + map_len) == (ee_block + ee_len)) && +		   (map_len < ee_len) &&	/*L1*/ +		   ex < EXT_LAST_EXTENT(eh)) {	/*L2*/ +		/* See if we can merge right */ +		ext4_lblk_t next_lblk; +		ext4_fsblk_t next_pblk, ee_pblk; +		unsigned int next_len; + +		abut_ex = ex + 1; +		next_lblk = le32_to_cpu(abut_ex->ee_block); +		next_len = ext4_ext_get_actual_len(abut_ex); +		next_pblk = ext4_ext_pblock(abut_ex); +		ee_pblk = ext4_ext_pblock(ex); +  		/* -		 * To merge left, pass "ex2 - 1" to try_to_merge(), -		 * since it merges towards right _only_. +		 * A transfer of blocks from 'ex' to 'abut_ex' is allowed +		 * upon those conditions: +		 * - C1: abut_ex is initialized, +		 * - C2: abut_ex is logically abutting ex, +		 * - C3: abut_ex is physically abutting ex, +		 * - C4: abut_ex can receive the additional blocks without +		 *   overflowing the (initialized) length limit.  		 */ -		ret = ext4_ext_try_to_merge(inode, path, ex2 - 1); -		if (ret) { -			err = ext4_ext_correct_indexes(handle, inode, path); +		if ((!ext4_ext_is_unwritten(abut_ex)) &&		/*C1*/ +		    ((map->m_lblk + map_len) == next_lblk) &&		/*C2*/ +		    ((ee_pblk + ee_len) == next_pblk) &&		/*C3*/ +		    (next_len < (EXT_INIT_MAX_LEN - map_len))) {	/*C4*/ +			err = ext4_ext_get_access(handle, inode, path + depth);  			if (err)  				goto out; -			depth = ext_depth(inode); -			ex2--; + +			trace_ext4_ext_convert_to_initialized_fastpath(inode, +				map, ex, abut_ex); + +			/* Shift the start of abut_ex by 'map_len' blocks */ +			abut_ex->ee_block = cpu_to_le32(next_lblk - map_len); +			ext4_ext_store_pblock(abut_ex, next_pblk - map_len); +			ex->ee_len = cpu_to_le16(ee_len - map_len); +			ext4_ext_mark_unwritten(ex); /* Restore the flag */ + +			/* Extend abut_ex by 'map_len' blocks */ +			abut_ex->ee_len = cpu_to_le16(next_len + map_len); + +			/* Result: number of initialized blocks past m_lblk */ +			allocated = map_len;  		}  	} +	if (allocated) { +		/* Mark the block containing both extents as dirty */ +		ext4_ext_dirty(handle, inode, path + depth); + +		/* Update path to point to the right extent */ +		path[depth].p_ext = abut_ex; +		goto out; +	} else +		allocated = ee_len - (map->m_lblk - ee_block); + +	WARN_ON(map->m_lblk < ee_block);  	/* -	 * Try to Merge towards right. This might be required -	 * only when the whole extent is being written to. -	 * i.e. ex2 == ex and ex3 == NULL. +	 * It is safe to convert extent to initialized via explicit +	 * zeroout only if extent is fully inside i_size or new_size.  	 */ -	if (!ex3) { -		ret = ext4_ext_try_to_merge(inode, path, ex2); -		if (ret) { -			err = ext4_ext_correct_indexes(handle, inode, path); +	split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0; + +	if (EXT4_EXT_MAY_ZEROOUT & split_flag) +		max_zeroout = sbi->s_extent_max_zeroout_kb >> +			(inode->i_sb->s_blocksize_bits - 10); + +	/* If extent is less than s_max_zeroout_kb, zeroout directly */ +	if (max_zeroout && (ee_len <= max_zeroout)) { +		err = ext4_ext_zeroout(inode, ex); +		if (err) +			goto out; +		zero_ex.ee_block = ex->ee_block; +		zero_ex.ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)); +		ext4_ext_store_pblock(&zero_ex, ext4_ext_pblock(ex)); + +		err = ext4_ext_get_access(handle, inode, path + depth); +		if (err) +			goto out; +		ext4_ext_mark_initialized(ex); +		ext4_ext_try_to_merge(handle, inode, path, ex); +		err = ext4_ext_dirty(handle, inode, path + path->p_depth); +		goto out; +	} + +	/* +	 * four cases: +	 * 1. 
split the extent into three extents. +	 * 2. split the extent into two extents, zeroout the first half. +	 * 3. split the extent into two extents, zeroout the second half. +	 * 4. split the extent into two extents with out zeroout. +	 */ +	split_map.m_lblk = map->m_lblk; +	split_map.m_len = map->m_len; + +	if (max_zeroout && (allocated > map->m_len)) { +		if (allocated <= max_zeroout) { +			/* case 3 */ +			zero_ex.ee_block = +					 cpu_to_le32(map->m_lblk); +			zero_ex.ee_len = cpu_to_le16(allocated); +			ext4_ext_store_pblock(&zero_ex, +				ext4_ext_pblock(ex) + map->m_lblk - ee_block); +			err = ext4_ext_zeroout(inode, &zero_ex);  			if (err)  				goto out; +			split_map.m_lblk = map->m_lblk; +			split_map.m_len = allocated; +		} else if (map->m_lblk - ee_block + map->m_len < max_zeroout) { +			/* case 2 */ +			if (map->m_lblk != ee_block) { +				zero_ex.ee_block = ex->ee_block; +				zero_ex.ee_len = cpu_to_le16(map->m_lblk - +							ee_block); +				ext4_ext_store_pblock(&zero_ex, +						      ext4_ext_pblock(ex)); +				err = ext4_ext_zeroout(inode, &zero_ex); +				if (err) +					goto out; +			} + +			split_map.m_lblk = ee_block; +			split_map.m_len = map->m_lblk - ee_block + map->m_len; +			allocated = map->m_len;  		}  	} -	/* Mark modified extent as dirty */ -	err = ext4_ext_dirty(handle, inode, path + depth); -	goto out; -insert: -	err = ext4_ext_insert_extent(handle, inode, path, &newex, 0); -	if (err == -ENOSPC && may_zeroout) { -		err =  ext4_ext_zeroout(inode, &orig_ex); -		if (err) -			goto fix_extent_len; -		/* update the extent length and mark as initialized */ -		ex->ee_block = orig_ex.ee_block; -		ex->ee_len   = orig_ex.ee_len; -		ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex)); -		ext4_ext_dirty(handle, inode, path + depth); -		/* zero out the first half */ -		return allocated; -	} else if (err) -		goto fix_extent_len; + +	allocated = ext4_split_extent(handle, inode, path, +				      &split_map, split_flag, flags); +	if (allocated < 0) +		err = allocated; +  out: -	ext4_ext_show_leaf(inode, path); +	/* If we have gotten a failure, don't zero out status tree */ +	if (!err) +		err = ext4_zeroout_es(inode, &zero_ex);  	return err ? err : allocated; - -fix_extent_len: -	ex->ee_block = orig_ex.ee_block; -	ex->ee_len   = orig_ex.ee_len; -	ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex)); -	ext4_ext_mark_uninitialized(ex); -	ext4_ext_dirty(handle, inode, path + depth); -	return err;  }  /*   * This function is called by ext4_ext_map_blocks() from   * ext4_get_blocks_dio_write() when DIO to write - * to an uninitialized extent. + * to an unwritten extent.   * - * Writing to an uninitized extent may result in splitting the uninitialized - * extent into multiple /intialized unintialized extents (up to three) + * Writing to an unwritten extent may result in splitting the unwritten + * extent into multiple initialized/unwritten extents (up to three)   * There are three possibilities: - *   a> There is no split required: Entire extent should be uninitialized + *   a> There is no split required: Entire extent should be unwritten   *   b> Splits in two extents: Write is happening at either end of the extent   *   c> Splits in three extents: Somone is writing in middle of the extent   * + * This works the same way in the case of initialized -> unwritten conversion. + *   * One of more index blocks maybe needed if the extent tree grow after - * the unintialized extent split. 
To prevent ENOSPC occur at the IO - * complete, we need to split the uninitialized extent before DIO submit - * the IO. The uninitialized extent called at this time will be split - * into three uninitialized extent(at most). After IO complete, the part + * the unwritten extent split. To prevent ENOSPC occur at the IO + * complete, we need to split the unwritten extent before DIO submit + * the IO. The unwritten extent called at this time will be split + * into three unwritten extent(at most). After IO complete, the part   * being filled will be convert to initialized by the end_io callback function   * via ext4_convert_unwritten_extents().   * - * Returns the size of uninitialized extent to be written on success. + * Returns the size of unwritten extent to be written on success.   */ -static int ext4_split_unwritten_extents(handle_t *handle, +static int ext4_split_convert_extents(handle_t *handle,  					struct inode *inode,  					struct ext4_map_blocks *map,  					struct ext4_ext_path *path,  					int flags)  { -	struct ext4_extent *ex, newex, orig_ex; -	struct ext4_extent *ex1 = NULL; -	struct ext4_extent *ex2 = NULL; -	struct ext4_extent *ex3 = NULL; -	ext4_lblk_t ee_block, eof_block; -	unsigned int allocated, ee_len, depth; -	ext4_fsblk_t newblock; -	int err = 0; -	int may_zeroout; +	ext4_lblk_t eof_block; +	ext4_lblk_t ee_block; +	struct ext4_extent *ex; +	unsigned int ee_len; +	int split_flag = 0, depth; -	ext_debug("ext4_split_unwritten_extents: inode %lu, logical" -		"block %llu, max_blocks %u\n", inode->i_ino, -		(unsigned long long)map->m_lblk, map->m_len); +	ext_debug("%s: inode %lu, logical block %llu, max_blocks %u\n", +		  __func__, inode->i_ino, +		  (unsigned long long)map->m_lblk, map->m_len);  	eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >>  		inode->i_sb->s_blocksize_bits;  	if (eof_block < map->m_lblk + map->m_len)  		eof_block = map->m_lblk + map->m_len; - +	/* +	 * It is safe to convert extent to initialized via explicit +	 * zeroout only if extent is fully insde i_size or new_size. +	 */  	depth = ext_depth(inode);  	ex = path[depth].p_ext;  	ee_block = le32_to_cpu(ex->ee_block);  	ee_len = ext4_ext_get_actual_len(ex); -	allocated = ee_len - (map->m_lblk - ee_block); -	newblock = map->m_lblk - ee_block + ext4_ext_pblock(ex); -	ex2 = ex; -	orig_ex.ee_block = ex->ee_block; -	orig_ex.ee_len   = cpu_to_le16(ee_len); -	ext4_ext_store_pblock(&orig_ex, ext4_ext_pblock(ex)); +	/* Convert to unwritten */ +	if (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN) { +		split_flag |= EXT4_EXT_DATA_VALID1; +	/* Convert to initialized */ +	} else if (flags & EXT4_GET_BLOCKS_CONVERT) { +		split_flag |= ee_block + ee_len <= eof_block ? +			      EXT4_EXT_MAY_ZEROOUT : 0; +		split_flag |= (EXT4_EXT_MARK_UNWRIT2 | EXT4_EXT_DATA_VALID2); +	} +	flags |= EXT4_GET_BLOCKS_PRE_IO; +	return ext4_split_extent(handle, inode, path, map, split_flag, flags); +} -	/* -	 * It is safe to convert extent to initialized via explicit -	 * zeroout only if extent is fully insde i_size or new_size. -	 */ -	may_zeroout = ee_block + ee_len <= eof_block; +static int ext4_convert_initialized_extents(handle_t *handle, +					    struct inode *inode, +					    struct ext4_map_blocks *map, +					    struct ext4_ext_path *path) +{ +	struct ext4_extent *ex; +	ext4_lblk_t ee_block; +	unsigned int ee_len; +	int depth; +	int err = 0; -	/* - 	 * If the uninitialized extent begins at the same logical - 	 * block where the write begins, and the write completely - 	 * covers the extent, then we don't need to split it. 
- 	 */ -	if ((map->m_lblk == ee_block) && (allocated <= map->m_len)) -		return allocated; +	depth = ext_depth(inode); +	ex = path[depth].p_ext; +	ee_block = le32_to_cpu(ex->ee_block); +	ee_len = ext4_ext_get_actual_len(ex); -	err = ext4_ext_get_access(handle, inode, path + depth); -	if (err) -		goto out; -	/* ex1: ee_block to map->m_lblk - 1 : uninitialized */ -	if (map->m_lblk > ee_block) { -		ex1 = ex; -		ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block); -		ext4_ext_mark_uninitialized(ex1); -		ex2 = &newex; -	} -	/* -	 * for sanity, update the length of the ex2 extent before -	 * we insert ex3, if ex1 is NULL. This is to avoid temporary -	 * overlap of blocks. -	 */ -	if (!ex1 && allocated > map->m_len) -		ex2->ee_len = cpu_to_le16(map->m_len); -	/* ex3: to ee_block + ee_len : uninitialised */ -	if (allocated > map->m_len) { -		unsigned int newdepth; -		ex3 = &newex; -		ex3->ee_block = cpu_to_le32(map->m_lblk + map->m_len); -		ext4_ext_store_pblock(ex3, newblock + map->m_len); -		ex3->ee_len = cpu_to_le16(allocated - map->m_len); -		ext4_ext_mark_uninitialized(ex3); -		err = ext4_ext_insert_extent(handle, inode, path, ex3, flags); -		if (err == -ENOSPC && may_zeroout) { -			err =  ext4_ext_zeroout(inode, &orig_ex); -			if (err) -				goto fix_extent_len; -			/* update the extent length and mark as initialized */ -			ex->ee_block = orig_ex.ee_block; -			ex->ee_len   = orig_ex.ee_len; -			ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex)); -			ext4_ext_dirty(handle, inode, path + depth); -			/* zeroed the full extent */ -			/* blocks available from map->m_lblk */ -			return allocated; - -		} else if (err) -			goto fix_extent_len; -		/* -		 * The depth, and hence eh & ex might change -		 * as part of the insert above. -		 */ -		newdepth = ext_depth(inode); -		/* -		 * update the extent length after successful insert of the -		 * split extent -		 */ -		ee_len -= ext4_ext_get_actual_len(ex3); -		orig_ex.ee_len = cpu_to_le16(ee_len); -		may_zeroout = ee_block + ee_len <= eof_block; +	ext_debug("%s: inode %lu, logical" +		"block %llu, max_blocks %u\n", __func__, inode->i_ino, +		  (unsigned long long)ee_block, ee_len); -		depth = newdepth; +	if (ee_block != map->m_lblk || ee_len > map->m_len) { +		err = ext4_split_convert_extents(handle, inode, map, path, +				EXT4_GET_BLOCKS_CONVERT_UNWRITTEN); +		if (err < 0) +			goto out;  		ext4_ext_drop_refs(path); -		path = ext4_ext_find_extent(inode, map->m_lblk, path); +		path = ext4_ext_find_extent(inode, map->m_lblk, path, 0);  		if (IS_ERR(path)) {  			err = PTR_ERR(path);  			goto out;  		} +		depth = ext_depth(inode);  		ex = path[depth].p_ext; -		if (ex2 != &newex) -			ex2 = ex; - -		err = ext4_ext_get_access(handle, inode, path + depth); -		if (err) +		if (!ex) { +			EXT4_ERROR_INODE(inode, "unexpected hole at %lu", +					 (unsigned long) map->m_lblk); +			err = -EIO;  			goto out; - -		allocated = map->m_len; -	} -	/* -	 * If there was a change of depth as part of the -	 * insertion of ex3 above, we need to update the length -	 * of the ex1 extent again here -	 */ -	if (ex1 && ex1 != ex) { -		ex1 = ex; -		ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block); -		ext4_ext_mark_uninitialized(ex1); -		ex2 = &newex; +		}  	} -	/* -	 * ex2: map->m_lblk to map->m_lblk + map->m_len-1 : to be written -	 * using direct I/O, uninitialised still. 
+ +	err = ext4_ext_get_access(handle, inode, path + depth); +	if (err) +		goto out; +	/* first mark the extent as unwritten */ +	ext4_ext_mark_unwritten(ex); + +	/* note: ext4_ext_correct_indexes() isn't needed here because +	 * borders are not changed  	 */ -	ex2->ee_block = cpu_to_le32(map->m_lblk); -	ext4_ext_store_pblock(ex2, newblock); -	ex2->ee_len = cpu_to_le16(allocated); -	ext4_ext_mark_uninitialized(ex2); -	if (ex2 != ex) -		goto insert; +	ext4_ext_try_to_merge(handle, inode, path, ex); +  	/* Mark modified extent as dirty */ -	err = ext4_ext_dirty(handle, inode, path + depth); -	ext_debug("out here\n"); -	goto out; -insert: -	err = ext4_ext_insert_extent(handle, inode, path, &newex, flags); -	if (err == -ENOSPC && may_zeroout) { -		err =  ext4_ext_zeroout(inode, &orig_ex); -		if (err) -			goto fix_extent_len; -		/* update the extent length and mark as initialized */ -		ex->ee_block = orig_ex.ee_block; -		ex->ee_len   = orig_ex.ee_len; -		ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex)); -		ext4_ext_dirty(handle, inode, path + depth); -		/* zero out the first half */ -		return allocated; -	} else if (err) -		goto fix_extent_len; +	err = ext4_ext_dirty(handle, inode, path + path->p_depth);  out:  	ext4_ext_show_leaf(inode, path); -	return err ? err : allocated; - -fix_extent_len: -	ex->ee_block = orig_ex.ee_block; -	ex->ee_len   = orig_ex.ee_len; -	ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex)); -	ext4_ext_mark_uninitialized(ex); -	ext4_ext_dirty(handle, inode, path + depth);  	return err;  } + +  static int ext4_convert_unwritten_extents_endio(handle_t *handle, -					      struct inode *inode, -					      struct ext4_ext_path *path) +						struct inode *inode, +						struct ext4_map_blocks *map, +						struct ext4_ext_path *path)  {  	struct ext4_extent *ex; -	struct ext4_extent_header *eh; +	ext4_lblk_t ee_block; +	unsigned int ee_len;  	int depth;  	int err = 0; -	int ret = 0;  	depth = ext_depth(inode); -	eh = path[depth].p_hdr;  	ex = path[depth].p_ext; +	ee_block = le32_to_cpu(ex->ee_block); +	ee_len = ext4_ext_get_actual_len(ex); + +	ext_debug("ext4_convert_unwritten_extents_endio: inode %lu, logical" +		"block %llu, max_blocks %u\n", inode->i_ino, +		  (unsigned long long)ee_block, ee_len); + +	/* If extent is larger than requested it is a clear sign that we still +	 * have some extent state machine issues left. So extent_split is still +	 * required. +	 * TODO: Once all related issues will be fixed this situation should be +	 * illegal. +	 */ +	if (ee_block != map->m_lblk || ee_len > map->m_len) { +#ifdef EXT4_DEBUG +		ext4_warning("Inode (%ld) finished: extent logical block %llu," +			     " len %u; IO logical block %llu, len %u\n", +			     inode->i_ino, (unsigned long long)ee_block, ee_len, +			     (unsigned long long)map->m_lblk, map->m_len); +#endif +		err = ext4_split_convert_extents(handle, inode, map, path, +						 EXT4_GET_BLOCKS_CONVERT); +		if (err < 0) +			goto out; +		ext4_ext_drop_refs(path); +		path = ext4_ext_find_extent(inode, map->m_lblk, path, 0); +		if (IS_ERR(path)) { +			err = PTR_ERR(path); +			goto out; +		} +		depth = ext_depth(inode); +		ex = path[depth].p_ext; +	}  	err = ext4_ext_get_access(handle, inode, path + depth);  	if (err) @@ -3035,36 +3780,13 @@ static int ext4_convert_unwritten_extents_endio(handle_t *handle,  	/* first mark the extent as initialized */  	ext4_ext_mark_initialized(ex); -	/* -	 * We have to see if it can be merged with the extent -	 * on the left. 
-	 */ -	if (ex > EXT_FIRST_EXTENT(eh)) { -		/* -		 * To merge left, pass "ex - 1" to try_to_merge(), -		 * since it merges towards right _only_. -		 */ -		ret = ext4_ext_try_to_merge(inode, path, ex - 1); -		if (ret) { -			err = ext4_ext_correct_indexes(handle, inode, path); -			if (err) -				goto out; -			depth = ext_depth(inode); -			ex--; -		} -	} -	/* -	 * Try to Merge towards right. +	/* note: ext4_ext_correct_indexes() isn't needed here because +	 * borders are not changed  	 */ -	ret = ext4_ext_try_to_merge(inode, path, ex); -	if (ret) { -		err = ext4_ext_correct_indexes(handle, inode, path); -		if (err) -			goto out; -		depth = ext_depth(inode); -	} +	ext4_ext_try_to_merge(handle, inode, path, ex); +  	/* Mark modified extent as dirty */ -	err = ext4_ext_dirty(handle, inode, path + depth); +	err = ext4_ext_dirty(handle, inode, path + path->p_depth);  out:  	ext4_ext_show_leaf(inode, path);  	return err; @@ -3082,26 +3804,27 @@ static void unmap_underlying_metadata_blocks(struct block_device *bdev,   * Handle EOFBLOCKS_FL flag, clearing it if necessary   */  static int check_eofblocks_fl(handle_t *handle, struct inode *inode, -			      struct ext4_map_blocks *map, +			      ext4_lblk_t lblk,  			      struct ext4_ext_path *path,  			      unsigned int len)  {  	int i, depth;  	struct ext4_extent_header *eh; -	struct ext4_extent *ex, *last_ex; +	struct ext4_extent *last_ex;  	if (!ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS))  		return 0;  	depth = ext_depth(inode);  	eh = path[depth].p_hdr; -	ex = path[depth].p_ext; -	if (unlikely(!eh->eh_entries)) { -		EXT4_ERROR_INODE(inode, "eh->eh_entries == 0 and " -				 "EOFBLOCKS_FL set"); -		return -EIO; -	} +	/* +	 * We're going to remove EOFBLOCKS_FL entirely in future so we +	 * do not care for this case anymore. Simply remove the flag +	 * if there are no extents. +	 */ +	if (unlikely(!eh->eh_entries)) +		goto out;  	last_ex = EXT_LAST_EXTENT(eh);  	/*  	 * We should clear the EOFBLOCKS_FL flag if we are writing the @@ -3112,7 +3835,7 @@ static int check_eofblocks_fl(handle_t *handle, struct inode *inode,  	 * this turns out to be false, we can bail out from this  	 * function immediately.  	 */ -	if (map->m_lblk + len < le32_to_cpu(last_ex->ee_block) + +	if (lblk + len < le32_to_cpu(last_ex->ee_block) +  	    ext4_ext_get_actual_len(last_ex))  		return 0;  	/* @@ -3125,53 +3848,209 @@ static int check_eofblocks_fl(handle_t *handle, struct inode *inode,  	for (i = depth-1; i >= 0; i--)  		if (path[i].p_idx != EXT_LAST_INDEX(path[i].p_hdr))  			return 0; +out:  	ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS);  	return ext4_mark_inode_dirty(handle, inode);  } +/** + * ext4_find_delalloc_range: find delayed allocated block in the given range. + * + * Return 1 if there is a delalloc block in the range, otherwise 0. 
+ */ +int ext4_find_delalloc_range(struct inode *inode, +			     ext4_lblk_t lblk_start, +			     ext4_lblk_t lblk_end) +{ +	struct extent_status es; + +	ext4_es_find_delayed_extent_range(inode, lblk_start, lblk_end, &es); +	if (es.es_len == 0) +		return 0; /* there is no delay extent in this tree */ +	else if (es.es_lblk <= lblk_start && +		 lblk_start < es.es_lblk + es.es_len) +		return 1; +	else if (lblk_start <= es.es_lblk && es.es_lblk <= lblk_end) +		return 1; +	else +		return 0; +} + +int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk) +{ +	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); +	ext4_lblk_t lblk_start, lblk_end; +	lblk_start = EXT4_LBLK_CMASK(sbi, lblk); +	lblk_end = lblk_start + sbi->s_cluster_ratio - 1; + +	return ext4_find_delalloc_range(inode, lblk_start, lblk_end); +} + +/** + * Determines how many complete clusters (out of those specified by the 'map') + * are under delalloc and were reserved quota for. + * This function is called when we are writing out the blocks that were + * originally written with their allocation delayed, but then the space was + * allocated using fallocate() before the delayed allocation could be resolved. + * The cases to look for are: + * ('=' indicated delayed allocated blocks + *  '-' indicates non-delayed allocated blocks) + * (a) partial clusters towards beginning and/or end outside of allocated range + *     are not delalloc'ed. + *	Ex: + *	|----c---=|====c====|====c====|===-c----| + *	         |++++++ allocated ++++++| + *	==> 4 complete clusters in above example + * + * (b) partial cluster (outside of allocated range) towards either end is + *     marked for delayed allocation. In this case, we will exclude that + *     cluster. + *	Ex: + *	|----====c========|========c========| + *	     |++++++ allocated ++++++| + *	==> 1 complete clusters in above example + * + *	Ex: + *	|================c================| + *            |++++++ allocated ++++++| + *	==> 0 complete clusters in above example + * + * The ext4_da_update_reserve_space will be called only if we + * determine here that there were some "entire" clusters that span + * this 'allocated' range. + * In the non-bigalloc case, this function will just end up returning num_blks + * without ever calling ext4_find_delalloc_range. + */ +static unsigned int +get_reserved_cluster_alloc(struct inode *inode, ext4_lblk_t lblk_start, +			   unsigned int num_blks) +{ +	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); +	ext4_lblk_t alloc_cluster_start, alloc_cluster_end; +	ext4_lblk_t lblk_from, lblk_to, c_offset; +	unsigned int allocated_clusters = 0; + +	alloc_cluster_start = EXT4_B2C(sbi, lblk_start); +	alloc_cluster_end = EXT4_B2C(sbi, lblk_start + num_blks - 1); + +	/* max possible clusters for this allocation */ +	allocated_clusters = alloc_cluster_end - alloc_cluster_start + 1; + +	trace_ext4_get_reserved_cluster_alloc(inode, lblk_start, num_blks); + +	/* Check towards left side */ +	c_offset = EXT4_LBLK_COFF(sbi, lblk_start); +	if (c_offset) { +		lblk_from = EXT4_LBLK_CMASK(sbi, lblk_start); +		lblk_to = lblk_from + c_offset - 1; + +		if (ext4_find_delalloc_range(inode, lblk_from, lblk_to)) +			allocated_clusters--; +	} + +	/* Now check towards right. 
*/ +	c_offset = EXT4_LBLK_COFF(sbi, lblk_start + num_blks); +	if (allocated_clusters && c_offset) { +		lblk_from = lblk_start + num_blks; +		lblk_to = lblk_from + (sbi->s_cluster_ratio - c_offset) - 1; + +		if (ext4_find_delalloc_range(inode, lblk_from, lblk_to)) +			allocated_clusters--; +	} + +	return allocated_clusters; +} + +static int +ext4_ext_convert_initialized_extent(handle_t *handle, struct inode *inode, +			struct ext4_map_blocks *map, +			struct ext4_ext_path *path, int flags, +			unsigned int allocated, ext4_fsblk_t newblock) +{ +	int ret = 0; +	int err = 0; + +	/* +	 * Make sure that the extent is no bigger than we support with +	 * unwritten extent +	 */ +	if (map->m_len > EXT_UNWRITTEN_MAX_LEN) +		map->m_len = EXT_UNWRITTEN_MAX_LEN / 2; + +	ret = ext4_convert_initialized_extents(handle, inode, map, +						path); +	if (ret >= 0) { +		ext4_update_inode_fsync_trans(handle, inode, 1); +		err = check_eofblocks_fl(handle, inode, map->m_lblk, +					 path, map->m_len); +	} else +		err = ret; +	map->m_flags |= EXT4_MAP_UNWRITTEN; +	if (allocated > map->m_len) +		allocated = map->m_len; +	map->m_len = allocated; + +	return err ? err : allocated; +} +  static int -ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, +ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode,  			struct ext4_map_blocks *map,  			struct ext4_ext_path *path, int flags,  			unsigned int allocated, ext4_fsblk_t newblock)  {  	int ret = 0;  	int err = 0; -	ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio; +	ext4_io_end_t *io = ext4_inode_aio(inode); -	ext_debug("ext4_ext_handle_uninitialized_extents: inode %lu, logical" -		  "block %llu, max_blocks %u, flags %d, allocated %u", +	ext_debug("ext4_ext_handle_unwritten_extents: inode %lu, logical " +		  "block %llu, max_blocks %u, flags %x, allocated %u\n",  		  inode->i_ino, (unsigned long long)map->m_lblk, map->m_len,  		  flags, allocated);  	ext4_ext_show_leaf(inode, path); +	/* +	 * When writing into unwritten space, we should not fail to +	 * allocate metadata blocks for the new extent block if needed. 
+	 */ +	flags |= EXT4_GET_BLOCKS_METADATA_NOFAIL; + +	trace_ext4_ext_handle_unwritten_extents(inode, map, flags, +						    allocated, newblock); +  	/* get_block() before submit the IO, split the extent */ -	if ((flags & EXT4_GET_BLOCKS_PRE_IO)) { -		ret = ext4_split_unwritten_extents(handle, inode, map, -						   path, flags); +	if (flags & EXT4_GET_BLOCKS_PRE_IO) { +		ret = ext4_split_convert_extents(handle, inode, map, +					 path, flags | EXT4_GET_BLOCKS_CONVERT); +		if (ret <= 0) +			goto out;  		/*  		 * Flag the inode(non aio case) or end_io struct (aio case) -		 * that this IO needs to convertion to written when IO is +		 * that this IO needs to conversion to written when IO is  		 * completed  		 */  		if (io) -			io->flag = EXT4_IO_END_UNWRITTEN; +			ext4_set_io_unwritten_flag(inode, io);  		else  			ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN); -		if (ext4_should_dioread_nolock(inode)) -			map->m_flags |= EXT4_MAP_UNINIT; +		map->m_flags |= EXT4_MAP_UNWRITTEN;  		goto out;  	}  	/* IO end_io complete, convert the filled extent to written */ -	if ((flags & EXT4_GET_BLOCKS_CONVERT)) { -		ret = ext4_convert_unwritten_extents_endio(handle, inode, +	if (flags & EXT4_GET_BLOCKS_CONVERT) { +		ret = ext4_convert_unwritten_extents_endio(handle, inode, map,  							path);  		if (ret >= 0) {  			ext4_update_inode_fsync_trans(handle, inode, 1); -			err = check_eofblocks_fl(handle, inode, map, path, -						 map->m_len); +			err = check_eofblocks_fl(handle, inode, map->m_lblk, +						 path, map->m_len);  		} else  			err = ret; +		map->m_flags |= EXT4_MAP_MAPPED; +		map->m_pblk = newblock; +		if (allocated > map->m_len) +			allocated = map->m_len; +		map->m_len = allocated;  		goto out2;  	}  	/* buffered IO case */ @@ -3179,8 +4058,10 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,  	 * repeat fallocate creation request  	 * we already have an unwritten extent  	 */ -	if (flags & EXT4_GET_BLOCKS_UNINIT_EXT) +	if (flags & EXT4_GET_BLOCKS_UNWRIT_EXT) { +		map->m_flags |= EXT4_MAP_UNWRITTEN;  		goto map_out; +	}  	/* buffered READ or buffered write_begin() lookup */  	if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { @@ -3196,14 +4077,9 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,  	}  	/* buffered write, writepage time, convert*/ -	ret = ext4_ext_convert_to_initialized(handle, inode, map, path); -	if (ret >= 0) { +	ret = ext4_ext_convert_to_initialized(handle, inode, map, path, flags); +	if (ret >= 0)  		ext4_update_inode_fsync_trans(handle, inode, 1); -		err = check_eofblocks_fl(handle, inode, map, path, map->m_len); -		if (err < 0) -			goto out2; -	} -  out:  	if (ret <= 0) {  		err = ret; @@ -3224,6 +4100,7 @@ out:  					allocated - map->m_len);  		allocated = map->m_len;  	} +	map->m_len = allocated;  	/*  	 * If we have done fallocate with the offset that is already @@ -3232,11 +4109,24 @@ out:  	 * But fallocate would have already updated quota and block  	 * count for this offset. 
So cancel these reservation  	 */ -	if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) -		ext4_da_update_reserve_space(inode, allocated, 0); +	if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) { +		unsigned int reserved_clusters; +		reserved_clusters = get_reserved_cluster_alloc(inode, +				map->m_lblk, map->m_len); +		if (reserved_clusters) +			ext4_da_update_reserve_space(inode, +						     reserved_clusters, +						     0); +	}  map_out:  	map->m_flags |= EXT4_MAP_MAPPED; +	if ((flags & EXT4_GET_BLOCKS_KEEP_SIZE) == 0) { +		err = check_eofblocks_fl(handle, inode, map->m_lblk, path, +					 map->m_len); +		if (err < 0) +			goto out2; +	}  out1:  	if (allocated > map->m_len)  		allocated = map->m_len; @@ -3244,14 +4134,113 @@ out1:  	map->m_pblk = newblock;  	map->m_len = allocated;  out2: -	if (path) { -		ext4_ext_drop_refs(path); -		kfree(path); -	}  	return err ? err : allocated;  }  /* + * get_implied_cluster_alloc - check to see if the requested + * allocation (in the map structure) overlaps with a cluster already + * allocated in an extent. + *	@sb	The filesystem superblock structure + *	@map	The requested lblk->pblk mapping + *	@ex	The extent structure which might contain an implied + *			cluster allocation + * + * This function is called by ext4_ext_map_blocks() after we failed to + * find blocks that were already in the inode's extent tree.  Hence, + * we know that the beginning of the requested region cannot overlap + * the extent from the inode's extent tree.  There are three cases we + * want to catch.  The first is this case: + * + *		 |--- cluster # N--| + *    |--- extent ---|	|---- requested region ---| + *			|==========| + * + * The second case that we need to test for is this one: + * + *   |--------- cluster # N ----------------| + *	   |--- requested region --|   |------- extent ----| + *	   |=======================| + * + * The third case is when the requested region lies between two extents + * within the same cluster: + *          |------------- cluster # N-------------| + * |----- ex -----|                  |---- ex_right ----| + *                  |------ requested region ------| + *                  |================| + * + * In each of the above cases, we need to set the map->m_pblk and + * map->m_len so it corresponds to the return the extent labelled as + * "|====|" from cluster #N, since it is already in use for data in + * cluster EXT4_B2C(sbi, map->m_lblk).	We will then return 1 to + * signal to ext4_ext_map_blocks() that map->m_pblk should be treated + * as a new "allocated" block region.  Otherwise, we will return 0 and + * ext4_ext_map_blocks() will then allocate one or more new clusters + * by calling ext4_mb_new_blocks(). 
+ */ +static int get_implied_cluster_alloc(struct super_block *sb, +				     struct ext4_map_blocks *map, +				     struct ext4_extent *ex, +				     struct ext4_ext_path *path) +{ +	struct ext4_sb_info *sbi = EXT4_SB(sb); +	ext4_lblk_t c_offset = EXT4_LBLK_COFF(sbi, map->m_lblk); +	ext4_lblk_t ex_cluster_start, ex_cluster_end; +	ext4_lblk_t rr_cluster_start; +	ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block); +	ext4_fsblk_t ee_start = ext4_ext_pblock(ex); +	unsigned short ee_len = ext4_ext_get_actual_len(ex); + +	/* The extent passed in that we are trying to match */ +	ex_cluster_start = EXT4_B2C(sbi, ee_block); +	ex_cluster_end = EXT4_B2C(sbi, ee_block + ee_len - 1); + +	/* The requested region passed into ext4_map_blocks() */ +	rr_cluster_start = EXT4_B2C(sbi, map->m_lblk); + +	if ((rr_cluster_start == ex_cluster_end) || +	    (rr_cluster_start == ex_cluster_start)) { +		if (rr_cluster_start == ex_cluster_end) +			ee_start += ee_len - 1; +		map->m_pblk = EXT4_PBLK_CMASK(sbi, ee_start) + c_offset; +		map->m_len = min(map->m_len, +				 (unsigned) sbi->s_cluster_ratio - c_offset); +		/* +		 * Check for and handle this case: +		 * +		 *   |--------- cluster # N-------------| +		 *		       |------- extent ----| +		 *	   |--- requested region ---| +		 *	   |===========| +		 */ + +		if (map->m_lblk < ee_block) +			map->m_len = min(map->m_len, ee_block - map->m_lblk); + +		/* +		 * Check for the case where there is already another allocated +		 * block to the right of 'ex' but before the end of the cluster. +		 * +		 *          |------------- cluster # N-------------| +		 * |----- ex -----|                  |---- ex_right ----| +		 *                  |------ requested region ------| +		 *                  |================| +		 */ +		if (map->m_lblk > ee_block) { +			ext4_lblk_t next = ext4_ext_next_allocated_block(path); +			map->m_len = min(map->m_len, next - map->m_lblk); +		} + +		trace_ext4_get_implied_cluster_alloc_exit(sb, map, 1); +		return 1; +	} + +	trace_ext4_get_implied_cluster_alloc_exit(sb, map, 0); +	return 0; +} + + +/*   * Block allocation/map/preallocation routine for extents based files   *   * @@ -3273,45 +4262,23 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,  			struct ext4_map_blocks *map, int flags)  {  	struct ext4_ext_path *path = NULL; -	struct ext4_extent_header *eh; -	struct ext4_extent newex, *ex; -	ext4_fsblk_t newblock; -	int err = 0, depth, ret, cache_type; -	unsigned int allocated = 0; +	struct ext4_extent newex, *ex, *ex2; +	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); +	ext4_fsblk_t newblock = 0; +	int free_on_err = 0, err = 0, depth, ret; +	unsigned int allocated = 0, offset = 0; +	unsigned int allocated_clusters = 0;  	struct ext4_allocation_request ar; -	ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio; +	ext4_io_end_t *io = ext4_inode_aio(inode); +	ext4_lblk_t cluster_offset; +	int set_unwritten = 0;  	ext_debug("blocks %u/%u requested for inode %lu\n",  		  map->m_lblk, map->m_len, inode->i_ino); - -	/* check in cache */ -	cache_type = ext4_ext_in_cache(inode, map->m_lblk, &newex); -	if (cache_type) { -		if (cache_type == EXT4_EXT_CACHE_GAP) { -			if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { -				/* -				 * block isn't allocated yet and -				 * user doesn't want to allocate it -				 */ -				goto out2; -			} -			/* we should allocate requested block */ -		} else if (cache_type == EXT4_EXT_CACHE_EXTENT) { -			/* block is already allocated */ -			newblock = map->m_lblk -				   - le32_to_cpu(newex.ee_block) -				   + 
ext4_ext_pblock(&newex); -			/* number of remaining blocks in the extent */ -			allocated = ext4_ext_get_actual_len(&newex) - -				(map->m_lblk - le32_to_cpu(newex.ee_block)); -			goto out; -		} else { -			BUG(); -		} -	} +	trace_ext4_ext_map_blocks_enter(inode, map->m_lblk, map->m_len, flags);  	/* find extent for this block */ -	path = ext4_ext_find_extent(inode, map->m_lblk, NULL); +	path = ext4_ext_find_extent(inode, map->m_lblk, NULL, 0);  	if (IS_ERR(path)) {  		err = PTR_ERR(path);  		path = NULL; @@ -3333,7 +4300,6 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,  		err = -EIO;  		goto out2;  	} -	eh = path[depth].p_hdr;  	ex = path[depth].p_ext;  	if (ex) { @@ -3341,11 +4307,15 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,  		ext4_fsblk_t ee_start = ext4_ext_pblock(ex);  		unsigned short ee_len; +  		/* -		 * Uninitialized extents are treated as holes, except that +		 * unwritten extents are treated as holes, except that  		 * we split out initialized portions during a write.  		 */  		ee_len = ext4_ext_get_actual_len(ex); + +		trace_ext4_ext_show_extent(inode, ee_block, ee_start, ee_len); +  		/* if found extent covers block, simply return it */  		if (in_range(map->m_lblk, ee_block, ee_len)) {  			newblock = map->m_lblk - ee_block + ee_start; @@ -3354,20 +4324,34 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,  			ext_debug("%u fit into %u:%d -> %llu\n", map->m_lblk,  				  ee_block, ee_len, newblock); -			/* Do not put uninitialized extent in the cache */ -			if (!ext4_ext_is_uninitialized(ex)) { -				ext4_ext_put_in_cache(inode, ee_block, -							ee_len, ee_start, -							EXT4_EXT_CACHE_EXTENT); +			/* +			 * If the extent is initialized check whether the +			 * caller wants to convert it to unwritten. +			 */ +			if ((!ext4_ext_is_unwritten(ex)) && +			    (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN)) { +				allocated = ext4_ext_convert_initialized_extent( +						handle, inode, map, path, flags, +						allocated, newblock); +				goto out2; +			} else if (!ext4_ext_is_unwritten(ex))  				goto out; -			} -			ret = ext4_ext_handle_uninitialized_extents(handle, -					inode, map, path, flags, allocated, -					newblock); -			return ret; + +			ret = ext4_ext_handle_unwritten_extents( +				handle, inode, map, path, flags, +				allocated, newblock); +			if (ret < 0) +				err = ret; +			else +				allocated = ret; +			goto out2;  		}  	} +	if ((sbi->s_cluster_ratio > 1) && +	    ext4_find_delalloc_cluster(inode, map->m_lblk)) +		map->m_flags |= EXT4_MAP_FROM_CLUSTER; +  	/*  	 * requested block isn't allocated yet;  	 * we couldn't try to create block if create flag is zero @@ -3377,12 +4361,29 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,  		 * put just found gap into cache to speed up  		 * subsequent requests  		 */ -		ext4_ext_put_gap_in_cache(inode, path, map->m_lblk); +		if ((flags & EXT4_GET_BLOCKS_NO_PUT_HOLE) == 0) +			ext4_ext_put_gap_in_cache(inode, path, map->m_lblk);  		goto out2;  	} +  	/*  	 * Okay, we need to do block allocation.  	 */ +	map->m_flags &= ~EXT4_MAP_FROM_CLUSTER; +	newex.ee_block = cpu_to_le32(map->m_lblk); +	cluster_offset = EXT4_LBLK_COFF(sbi, map->m_lblk); + +	/* +	 * If we are doing bigalloc, check to see if the extent returned +	 * by ext4_ext_find_extent() implies a cluster we can use. 
+	 */ +	if (cluster_offset && ex && +	    get_implied_cluster_alloc(inode->i_sb, map, ex, path)) { +		ar.len = allocated = map->m_len; +		newblock = map->m_pblk; +		map->m_flags |= EXT4_MAP_FROM_CLUSTER; +		goto got_allocated_blocks; +	}  	/* find neighbour allocated blocks */  	ar.lleft = map->m_lblk; @@ -3390,27 +4391,37 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,  	if (err)  		goto out2;  	ar.lright = map->m_lblk; -	err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright); +	ex2 = NULL; +	err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright, &ex2);  	if (err)  		goto out2; +	/* Check if the extent after searching to the right implies a +	 * cluster we can use. */ +	if ((sbi->s_cluster_ratio > 1) && ex2 && +	    get_implied_cluster_alloc(inode->i_sb, map, ex2, path)) { +		ar.len = allocated = map->m_len; +		newblock = map->m_pblk; +		map->m_flags |= EXT4_MAP_FROM_CLUSTER; +		goto got_allocated_blocks; +	} +  	/*  	 * See if request is beyond maximum number of blocks we can have in  	 * a single extent. For an initialized extent this limit is -	 * EXT_INIT_MAX_LEN and for an uninitialized extent this limit is -	 * EXT_UNINIT_MAX_LEN. +	 * EXT_INIT_MAX_LEN and for an unwritten extent this limit is +	 * EXT_UNWRITTEN_MAX_LEN.  	 */  	if (map->m_len > EXT_INIT_MAX_LEN && -	    !(flags & EXT4_GET_BLOCKS_UNINIT_EXT)) +	    !(flags & EXT4_GET_BLOCKS_UNWRIT_EXT))  		map->m_len = EXT_INIT_MAX_LEN; -	else if (map->m_len > EXT_UNINIT_MAX_LEN && -		 (flags & EXT4_GET_BLOCKS_UNINIT_EXT)) -		map->m_len = EXT_UNINIT_MAX_LEN; +	else if (map->m_len > EXT_UNWRITTEN_MAX_LEN && +		 (flags & EXT4_GET_BLOCKS_UNWRIT_EXT)) +		map->m_len = EXT_UNWRITTEN_MAX_LEN;  	/* Check if we can really insert (m_lblk)::(m_lblk + m_len) extent */ -	newex.ee_block = cpu_to_le32(map->m_lblk);  	newex.ee_len = cpu_to_le16(map->m_len); -	err = ext4_ext_check_overlap(inode, &newex, path); +	err = ext4_ext_check_overlap(sbi, inode, &newex, path);  	if (err)  		allocated = ext4_ext_get_actual_len(&newex);  	else @@ -3420,54 +4431,80 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,  	ar.inode = inode;  	ar.goal = ext4_ext_find_goal(inode, path, map->m_lblk);  	ar.logical = map->m_lblk; -	ar.len = allocated; +	/* +	 * We calculate the offset from the beginning of the cluster +	 * for the logical block number, since when we allocate a +	 * physical cluster, the physical block should start at the +	 * same offset from the beginning of the cluster.  This is +	 * needed so that future calls to get_implied_cluster_alloc() +	 * work correctly. 
+	 */ +	offset = EXT4_LBLK_COFF(sbi, map->m_lblk); +	ar.len = EXT4_NUM_B2C(sbi, offset+allocated); +	ar.goal -= offset; +	ar.logical -= offset;  	if (S_ISREG(inode->i_mode))  		ar.flags = EXT4_MB_HINT_DATA;  	else  		/* disable in-core preallocation for non-regular files */  		ar.flags = 0; +	if (flags & EXT4_GET_BLOCKS_NO_NORMALIZE) +		ar.flags |= EXT4_MB_HINT_NOPREALLOC;  	newblock = ext4_mb_new_blocks(handle, &ar, &err);  	if (!newblock)  		goto out2;  	ext_debug("allocate new block: goal %llu, found %llu/%u\n",  		  ar.goal, newblock, allocated); +	free_on_err = 1; +	allocated_clusters = ar.len; +	ar.len = EXT4_C2B(sbi, ar.len) - offset; +	if (ar.len > allocated) +		ar.len = allocated; +got_allocated_blocks:  	/* try to insert new extent into found leaf and return */ -	ext4_ext_store_pblock(&newex, newblock); +	ext4_ext_store_pblock(&newex, newblock + offset);  	newex.ee_len = cpu_to_le16(ar.len); -	/* Mark uninitialized */ -	if (flags & EXT4_GET_BLOCKS_UNINIT_EXT){ -		ext4_ext_mark_uninitialized(&newex); +	/* Mark unwritten */ +	if (flags & EXT4_GET_BLOCKS_UNWRIT_EXT){ +		ext4_ext_mark_unwritten(&newex); +		map->m_flags |= EXT4_MAP_UNWRITTEN;  		/*  		 * io_end structure was created for every IO write to an -		 * uninitialized extent. To avoid unecessary conversion, +		 * unwritten extent. To avoid unnecessary conversion,  		 * here we flag the IO that really needs the conversion.  		 * For non asycn direct IO case, flag the inode state -		 * that we need to perform convertion when IO is done. +		 * that we need to perform conversion when IO is done.  		 */ -		if ((flags & EXT4_GET_BLOCKS_PRE_IO)) { -			if (io) -				io->flag = EXT4_IO_END_UNWRITTEN; -			else -				ext4_set_inode_state(inode, -						     EXT4_STATE_DIO_UNWRITTEN); -		} -		if (ext4_should_dioread_nolock(inode)) -			map->m_flags |= EXT4_MAP_UNINIT; +		if (flags & EXT4_GET_BLOCKS_PRE_IO) +			set_unwritten = 1;  	} -	err = check_eofblocks_fl(handle, inode, map, path, ar.len); -	if (err) -		goto out2; +	err = 0; +	if ((flags & EXT4_GET_BLOCKS_KEEP_SIZE) == 0) +		err = check_eofblocks_fl(handle, inode, map->m_lblk, +					 path, ar.len); +	if (!err) +		err = ext4_ext_insert_extent(handle, inode, path, +					     &newex, flags); -	err = ext4_ext_insert_extent(handle, inode, path, &newex, flags); -	if (err) { +	if (!err && set_unwritten) { +		if (io) +			ext4_set_io_unwritten_flag(inode, io); +		else +			ext4_set_inode_state(inode, +					     EXT4_STATE_DIO_UNWRITTEN); +	} + +	if (err && free_on_err) { +		int fb_flags = flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE ? +			EXT4_FREE_BLOCKS_NO_QUOT_UPDATE : 0;  		/* free data blocks we just allocated */  		/* not a good idea to call discard here directly,  		 * but otherwise we'd need to call it every free() */  		ext4_discard_preallocations(inode); -		ext4_free_blocks(handle, inode, 0, ext4_ext_pblock(&newex), -				 ext4_ext_get_actual_len(&newex), 0); +		ext4_free_blocks(handle, inode, NULL, newblock, +				 EXT4_C2B(sbi, allocated_clusters), fb_flags);  		goto out2;  	} @@ -3482,18 +4519,96 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,  	 * Update reserved blocks/metadata blocks after successful  	 * block allocation which had been deferred till now.  	 
*/ -	if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) -		ext4_da_update_reserve_space(inode, allocated, 1); +	if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) { +		unsigned int reserved_clusters; +		/* +		 * Check how many clusters we had reserved this allocated range +		 */ +		reserved_clusters = get_reserved_cluster_alloc(inode, +						map->m_lblk, allocated); +		if (map->m_flags & EXT4_MAP_FROM_CLUSTER) { +			if (reserved_clusters) { +				/* +				 * We have clusters reserved for this range. +				 * But since we are not doing actual allocation +				 * and are simply using blocks from previously +				 * allocated cluster, we should release the +				 * reservation and not claim quota. +				 */ +				ext4_da_update_reserve_space(inode, +						reserved_clusters, 0); +			} +		} else { +			BUG_ON(allocated_clusters < reserved_clusters); +			if (reserved_clusters < allocated_clusters) { +				struct ext4_inode_info *ei = EXT4_I(inode); +				int reservation = allocated_clusters - +						  reserved_clusters; +				/* +				 * It seems we claimed few clusters outside of +				 * the range of this allocation. We should give +				 * it back to the reservation pool. This can +				 * happen in the following case: +				 * +				 * * Suppose s_cluster_ratio is 4 (i.e., each +				 *   cluster has 4 blocks. Thus, the clusters +				 *   are [0-3],[4-7],[8-11]... +				 * * First comes delayed allocation write for +				 *   logical blocks 10 & 11. Since there were no +				 *   previous delayed allocated blocks in the +				 *   range [8-11], we would reserve 1 cluster +				 *   for this write. +				 * * Next comes write for logical blocks 3 to 8. +				 *   In this case, we will reserve 2 clusters +				 *   (for [0-3] and [4-7]; and not for [8-11] as +				 *   that range has a delayed allocated blocks. +				 *   Thus total reserved clusters now becomes 3. +				 * * Now, during the delayed allocation writeout +				 *   time, we will first write blocks [3-8] and +				 *   allocate 3 clusters for writing these +				 *   blocks. Also, we would claim all these +				 *   three clusters above. +				 * * Now when we come here to writeout the +				 *   blocks [10-11], we would expect to claim +				 *   the reservation of 1 cluster we had made +				 *   (and we would claim it since there are no +				 *   more delayed allocated blocks in the range +				 *   [8-11]. But our reserved cluster count had +				 *   already gone to 0. +				 * +				 *   Thus, at the step 4 above when we determine +				 *   that there are still some unwritten delayed +				 *   allocated blocks outside of our current +				 *   block range, we should increment the +				 *   reserved clusters count so that when the +				 *   remaining blocks finally gets written, we +				 *   could claim them. +				 */ +				dquot_reserve_block(inode, +						EXT4_C2B(sbi, reservation)); +				spin_lock(&ei->i_block_reservation_lock); +				ei->i_reserved_data_blocks += reservation; +				spin_unlock(&ei->i_block_reservation_lock); +			} +			/* +			 * We will claim quota for all newly allocated blocks. +			 * We're updating the reserved space *after* the +			 * correction above so we do not accidentally free +			 * all the metadata reservation because we might +			 * actually need it later on. +			 */ +			ext4_da_update_reserve_space(inode, allocated_clusters, +							1); +		} +	}  	/*  	 * Cache the extent and update transaction to commit on fdatasync only -	 * when it is _not_ an uninitialized extent. +	 * when it is _not_ an unwritten extent.  	 
*/ -	if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0) { -		ext4_ext_put_in_cache(inode, map->m_lblk, allocated, newblock, -						EXT4_EXT_CACHE_EXTENT); +	if ((flags & EXT4_GET_BLOCKS_UNWRIT_EXT) == 0)  		ext4_update_inode_fsync_trans(handle, inode, 1); -	} else +	else  		ext4_update_inode_fsync_trans(handle, inode, 0);  out:  	if (allocated > map->m_len) @@ -3507,37 +4622,20 @@ out2:  		ext4_ext_drop_refs(path);  		kfree(path);  	} + +	trace_ext4_ext_map_blocks_exit(inode, flags, map, +				       err ? err : allocated); +	ext4_es_lru_add(inode);  	return err ? err : allocated;  } -void ext4_ext_truncate(struct inode *inode) +void ext4_ext_truncate(handle_t *handle, struct inode *inode)  { -	struct address_space *mapping = inode->i_mapping;  	struct super_block *sb = inode->i_sb;  	ext4_lblk_t last_block; -	handle_t *handle;  	int err = 0;  	/* -	 * probably first extent we're gonna free will be last in block -	 */ -	err = ext4_writepage_trans_blocks(inode); -	handle = ext4_journal_start(inode, err); -	if (IS_ERR(handle)) -		return; - -	if (inode->i_size & (sb->s_blocksize - 1)) -		ext4_block_truncate_page(handle, mapping, inode->i_size); - -	if (ext4_orphan_add(handle, inode)) -		goto out_stop; - -	down_write(&EXT4_I(inode)->i_data_sem); -	ext4_ext_invalidate_cache(inode); - -	ext4_discard_preallocations(inode); - -	/*  	 * TODO: optimization is possible here.  	 * Probably we need not scan at all,  	 * because page truncation is enough. @@ -3549,78 +4647,258 @@ void ext4_ext_truncate(struct inode *inode)  	last_block = (inode->i_size + sb->s_blocksize - 1)  			>> EXT4_BLOCK_SIZE_BITS(sb); -	err = ext4_ext_remove_space(inode, last_block); +retry: +	err = ext4_es_remove_extent(inode, last_block, +				    EXT_MAX_BLOCKS - last_block); +	if (err == -ENOMEM) { +		cond_resched(); +		congestion_wait(BLK_RW_ASYNC, HZ/50); +		goto retry; +	} +	if (err) { +		ext4_std_error(inode->i_sb, err); +		return; +	} +	err = ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1); +	ext4_std_error(inode->i_sb, err); +} -	/* In a multi-transaction truncate, we only make the final -	 * transaction synchronous. +static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset, +				  ext4_lblk_t len, int flags, int mode) +{ +	struct inode *inode = file_inode(file); +	handle_t *handle; +	int ret = 0; +	int ret2 = 0; +	int retries = 0; +	struct ext4_map_blocks map; +	unsigned int credits; + +	map.m_lblk = offset; +	/* +	 * Don't normalize the request if it can fit in one extent so +	 * that it doesn't get unnecessarily split into multiple +	 * extents.  	 */ -	if (IS_SYNC(inode)) -		ext4_handle_sync(handle); +	if (len <= EXT_UNWRITTEN_MAX_LEN) +		flags |= EXT4_GET_BLOCKS_NO_NORMALIZE; -out_stop: -	up_write(&EXT4_I(inode)->i_data_sem);  	/* -	 * If this was a simple ftruncate() and the file will remain alive, -	 * then we need to clear up the orphan record which we created above. -	 * However, if this was a real unlink then we were called by -	 * ext4_delete_inode(), and we allow that function to clean up the -	 * orphan info for us. 
+	 * credits to insert 1 extent into extent tree  	 */ -	if (inode->i_nlink) -		ext4_orphan_del(handle, inode); +	credits = ext4_chunk_trans_blocks(inode, len); -	inode->i_mtime = inode->i_ctime = ext4_current_time(inode); -	ext4_mark_inode_dirty(handle, inode); -	ext4_journal_stop(handle); +retry: +	while (ret >= 0 && ret < len) { +		map.m_lblk = map.m_lblk + ret; +		map.m_len = len = len - ret; +		handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, +					    credits); +		if (IS_ERR(handle)) { +			ret = PTR_ERR(handle); +			break; +		} +		ret = ext4_map_blocks(handle, inode, &map, flags); +		if (ret <= 0) { +			ext4_debug("inode #%lu: block %u: len %u: " +				   "ext4_ext_map_blocks returned %d", +				   inode->i_ino, map.m_lblk, +				   map.m_len, ret); +			ext4_mark_inode_dirty(handle, inode); +			ret2 = ext4_journal_stop(handle); +			break; +		} +		ret2 = ext4_journal_stop(handle); +		if (ret2) +			break; +	} +	if (ret == -ENOSPC && +			ext4_should_retry_alloc(inode->i_sb, &retries)) { +		ret = 0; +		goto retry; +	} + +	return ret > 0 ? ret2 : ret;  } -static void ext4_falloc_update_inode(struct inode *inode, -				int mode, loff_t new_size, int update_ctime) +static long ext4_zero_range(struct file *file, loff_t offset, +			    loff_t len, int mode)  { -	struct timespec now; +	struct inode *inode = file_inode(file); +	handle_t *handle = NULL; +	unsigned int max_blocks; +	loff_t new_size = 0; +	int ret = 0; +	int flags; +	int partial; +	loff_t start, end; +	ext4_lblk_t lblk; +	struct address_space *mapping = inode->i_mapping; +	unsigned int blkbits = inode->i_blkbits; + +	trace_ext4_zero_range(inode, offset, len, mode); + +	if (!S_ISREG(inode->i_mode)) +		return -EINVAL; + +	/* Call ext4_force_commit to flush all data in case of data=journal. */ +	if (ext4_should_journal_data(inode)) { +		ret = ext4_force_commit(inode->i_sb); +		if (ret) +			return ret; +	} -	if (update_ctime) { -		now = current_fs_time(inode->i_sb); -		if (!timespec_equal(&inode->i_ctime, &now)) -			inode->i_ctime = now; +	/* +	 * Write out all dirty pages to avoid race conditions +	 * Then release them. +	 */ +	if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { +		ret = filemap_write_and_wait_range(mapping, offset, +						   offset + len - 1); +		if (ret) +			return ret;  	} +  	/* -	 * Update only when preallocation was requested beyond -	 * the file size. +	 * Round up offset. This is not fallocate, we neet to zero out +	 * blocks, so convert interior block aligned part of the range to +	 * unwritten and possibly manually zero out unaligned parts of the +	 * range.  	 
*/ -	if (!(mode & FALLOC_FL_KEEP_SIZE)) { +	start = round_up(offset, 1 << blkbits); +	end = round_down((offset + len), 1 << blkbits); + +	if (start < offset || end > offset + len) +		return -EINVAL; +	partial = (offset + len) & ((1 << blkbits) - 1); + +	lblk = start >> blkbits; +	max_blocks = (end >> blkbits); +	if (max_blocks < lblk) +		max_blocks = 0; +	else +		max_blocks -= lblk; + +	flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT | +		EXT4_GET_BLOCKS_CONVERT_UNWRITTEN; +	if (mode & FALLOC_FL_KEEP_SIZE) +		flags |= EXT4_GET_BLOCKS_KEEP_SIZE; + +	mutex_lock(&inode->i_mutex); + +	/* +	 * Indirect files do not support unwritten extnets +	 */ +	if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { +		ret = -EOPNOTSUPP; +		goto out_mutex; +	} + +	if (!(mode & FALLOC_FL_KEEP_SIZE) && +	     offset + len > i_size_read(inode)) { +		new_size = offset + len; +		ret = inode_newsize_ok(inode, new_size); +		if (ret) +			goto out_mutex; +		/* +		 * If we have a partial block after EOF we have to allocate +		 * the entire block. +		 */ +		if (partial) +			max_blocks += 1; +	} + +	if (max_blocks > 0) { + +		/* Now release the pages and zero block aligned part of pages*/ +		truncate_pagecache_range(inode, start, end - 1); + +		/* Wait all existing dio workers, newcomers will block on i_mutex */ +		ext4_inode_block_unlocked_dio(inode); +		inode_dio_wait(inode); + +		/* +		 * Remove entire range from the extent status tree. +		 */ +		ret = ext4_es_remove_extent(inode, lblk, max_blocks); +		if (ret) +			goto out_dio; + +		ret = ext4_alloc_file_blocks(file, lblk, max_blocks, flags, +					     mode); +		if (ret) +			goto out_dio; +	} + +	handle = ext4_journal_start(inode, EXT4_HT_MISC, 4); +	if (IS_ERR(handle)) { +		ret = PTR_ERR(handle); +		ext4_std_error(inode->i_sb, ret); +		goto out_dio; +	} + +	inode->i_mtime = inode->i_ctime = ext4_current_time(inode); + +	if (new_size) {  		if (new_size > i_size_read(inode))  			i_size_write(inode, new_size);  		if (new_size > EXT4_I(inode)->i_disksize)  			ext4_update_i_disksize(inode, new_size);  	} else {  		/* -		 * Mark that we allocate beyond EOF so the subsequent truncate -		 * can proceed even if the new size is the same as i_size. -		 */ -		if (new_size > i_size_read(inode)) +		* Mark that we allocate beyond EOF so the subsequent truncate +		* can proceed even if the new size is the same as i_size. +		*/ +		if ((offset + len) > i_size_read(inode))  			ext4_set_inode_flag(inode, EXT4_INODE_EOFBLOCKS);  	} +	ext4_mark_inode_dirty(handle, inode); + +	/* Zero out partial block at the edges of the range */ +	ret = ext4_zero_partial_blocks(handle, inode, offset, len); + +	if (file->f_flags & O_SYNC) +		ext4_handle_sync(handle); + +	ext4_journal_stop(handle); +out_dio: +	ext4_inode_resume_unlocked_dio(inode); +out_mutex: +	mutex_unlock(&inode->i_mutex); +	return ret;  }  /* - * preallocate space for a file. This implements ext4's fallocate inode + * preallocate space for a file. This implements ext4's fallocate file   * operation, which gets called from sys_fallocate system call.   * For block-mapped files, posix_fallocate should fall back to the method   * of writing zeroes to the required new blocks (the same behavior which is   * expected for file systems which do not support fallocate() system call).   
*/ -long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len) +long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)  { +	struct inode *inode = file_inode(file);  	handle_t *handle; -	loff_t new_size; +	loff_t new_size = 0;  	unsigned int max_blocks;  	int ret = 0; -	int ret2 = 0; -	int retries = 0; -	struct ext4_map_blocks map; -	unsigned int credits, blkbits = inode->i_blkbits; +	int flags; +	ext4_lblk_t lblk; +	struct timespec tv; +	unsigned int blkbits = inode->i_blkbits; + +	/* Return error if mode is not supported */ +	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | +		     FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE)) +		return -EOPNOTSUPP; + +	if (mode & FALLOC_FL_PUNCH_HOLE) +		return ext4_punch_hole(inode, offset, len); + +	ret = ext4_convert_inline_data(inode); +	if (ret) +		return ret;  	/*  	 * currently supporting (pre)allocate mode for extent-based @@ -3629,70 +4907,69 @@ long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len)  	if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))  		return -EOPNOTSUPP; -	/* preallocation to directories is currently not supported */ -	if (S_ISDIR(inode->i_mode)) -		return -ENODEV; +	if (mode & FALLOC_FL_COLLAPSE_RANGE) +		return ext4_collapse_range(inode, offset, len); -	map.m_lblk = offset >> blkbits; +	if (mode & FALLOC_FL_ZERO_RANGE) +		return ext4_zero_range(file, offset, len, mode); + +	trace_ext4_fallocate_enter(inode, offset, len, mode); +	lblk = offset >> blkbits;  	/*  	 * We can't just convert len to max_blocks because  	 * If blocksize = 4096 offset = 3072 and len = 2048  	 */  	max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits) -		- map.m_lblk; -	/* -	 * credits to insert 1 extent into extent tree -	 */ -	credits = ext4_chunk_trans_blocks(inode, max_blocks); +		- lblk; + +	flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT; +	if (mode & FALLOC_FL_KEEP_SIZE) +		flags |= EXT4_GET_BLOCKS_KEEP_SIZE; +  	mutex_lock(&inode->i_mutex); -	ret = inode_newsize_ok(inode, (len + offset)); -	if (ret) { -		mutex_unlock(&inode->i_mutex); -		return ret; -	} -retry: -	while (ret >= 0 && ret < max_blocks) { -		map.m_lblk = map.m_lblk + ret; -		map.m_len = max_blocks = max_blocks - ret; -		handle = ext4_journal_start(inode, credits); -		if (IS_ERR(handle)) { -			ret = PTR_ERR(handle); -			break; -		} -		ret = ext4_map_blocks(handle, inode, &map, -				      EXT4_GET_BLOCKS_CREATE_UNINIT_EXT); -		if (ret <= 0) { -#ifdef EXT4FS_DEBUG -			WARN_ON(ret <= 0); -			printk(KERN_ERR "%s: ext4_ext_map_blocks " -				    "returned error inode#%lu, block=%u, " -				    "max_blocks=%u", __func__, -				    inode->i_ino, map.m_lblk, max_blocks); -#endif -			ext4_mark_inode_dirty(handle, inode); -			ret2 = ext4_journal_stop(handle); -			break; -		} -		if ((map.m_lblk + ret) >= (EXT4_BLOCK_ALIGN(offset + len, -						blkbits) >> blkbits)) -			new_size = offset + len; -		else -			new_size = (map.m_lblk + ret) << blkbits; -		ext4_falloc_update_inode(inode, mode, new_size, -					 (map.m_flags & EXT4_MAP_NEW)); -		ext4_mark_inode_dirty(handle, inode); -		ret2 = ext4_journal_stop(handle); -		if (ret2) -			break; +	if (!(mode & FALLOC_FL_KEEP_SIZE) && +	     offset + len > i_size_read(inode)) { +		new_size = offset + len; +		ret = inode_newsize_ok(inode, new_size); +		if (ret) +			goto out;  	} -	if (ret == -ENOSPC && -			ext4_should_retry_alloc(inode->i_sb, &retries)) { -		ret = 0; -		goto retry; + +	ret = ext4_alloc_file_blocks(file, lblk, max_blocks, flags, mode); +	if (ret) +		goto 
out; + +	handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); +	if (IS_ERR(handle)) +		goto out; + +	tv = inode->i_ctime = ext4_current_time(inode); + +	if (new_size) { +		if (new_size > i_size_read(inode)) { +			i_size_write(inode, new_size); +			inode->i_mtime = tv; +		} +		if (new_size > EXT4_I(inode)->i_disksize) +			ext4_update_i_disksize(inode, new_size); +	} else { +		/* +		* Mark that we allocate beyond EOF so the subsequent truncate +		* can proceed even if the new size is the same as i_size. +		*/ +		if ((offset + len) > i_size_read(inode)) +			ext4_set_inode_flag(inode, EXT4_INODE_EOFBLOCKS);  	} +	ext4_mark_inode_dirty(handle, inode); +	if (file->f_flags & O_SYNC) +		ext4_handle_sync(handle); + +	ext4_journal_stop(handle); +out:  	mutex_unlock(&inode->i_mutex); -	return ret > 0 ? ret2 : ret; +	trace_ext4_fallocate_exit(inode, offset, max_blocks, ret); +	return ret;  }  /* @@ -3705,10 +4982,9 @@ retry:   * function, to convert the fallocated extents after IO is completed.   * Returns 0 on success.   */ -int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset, -				    ssize_t len) +int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode, +				   loff_t offset, ssize_t len)  { -	handle_t *handle;  	unsigned int max_blocks;  	int ret = 0;  	int ret2 = 0; @@ -3723,109 +4999,98 @@ int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,  	max_blocks = ((EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits) -  		      map.m_lblk);  	/* -	 * credits to insert 1 extent into extent tree +	 * This is somewhat ugly but the idea is clear: When transaction is +	 * reserved, everything goes into it. Otherwise we rather start several +	 * smaller transactions for conversion of each extent separately.  	 */ -	credits = ext4_chunk_trans_blocks(inode, max_blocks); +	if (handle) { +		handle = ext4_journal_start_reserved(handle, +						     EXT4_HT_EXT_CONVERT); +		if (IS_ERR(handle)) +			return PTR_ERR(handle); +		credits = 0; +	} else { +		/* +		 * credits to insert 1 extent into extent tree +		 */ +		credits = ext4_chunk_trans_blocks(inode, max_blocks); +	}  	while (ret >= 0 && ret < max_blocks) {  		map.m_lblk += ret;  		map.m_len = (max_blocks -= ret); -		handle = ext4_journal_start(inode, credits); -		if (IS_ERR(handle)) { -			ret = PTR_ERR(handle); -			break; +		if (credits) { +			handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, +						    credits); +			if (IS_ERR(handle)) { +				ret = PTR_ERR(handle); +				break; +			}  		}  		ret = ext4_map_blocks(handle, inode, &map,  				      EXT4_GET_BLOCKS_IO_CONVERT_EXT); -		if (ret <= 0) { -			WARN_ON(ret <= 0); -			printk(KERN_ERR "%s: ext4_ext_map_blocks " -				    "returned error inode#%lu, block=%u, " -				    "max_blocks=%u", __func__, -				    inode->i_ino, map.m_lblk, map.m_len); -		} +		if (ret <= 0) +			ext4_warning(inode->i_sb, +				     "inode #%lu: block %u: len %u: " +				     "ext4_ext_map_blocks returned %d", +				     inode->i_ino, map.m_lblk, +				     map.m_len, ret);  		ext4_mark_inode_dirty(handle, inode); -		ret2 = ext4_journal_stop(handle); -		if (ret <= 0 || ret2 ) +		if (credits) +			ret2 = ext4_journal_stop(handle); +		if (ret <= 0 || ret2)  			break;  	} +	if (!credits) +		ret2 = ext4_journal_stop(handle);  	return ret > 0 ? ret2 : ret;  } +  /* - * Callback function called for each extent to gather FIEMAP information. 
+ * If newes is not existing extent (newes->ec_pblk equals zero) find + * delayed extent at start of newes and update newes accordingly and + * return start of the next delayed extent. + * + * If newes is existing extent (newes->ec_pblk is not equal zero) + * return start of next delayed extent or EXT_MAX_BLOCKS if no delayed + * extent found. Leave newes unmodified.   */ -static int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path, -		       struct ext4_ext_cache *newex, struct ext4_extent *ex, -		       void *data) +static int ext4_find_delayed_extent(struct inode *inode, +				    struct extent_status *newes)  { -	struct fiemap_extent_info *fieinfo = data; -	unsigned char blksize_bits = inode->i_sb->s_blocksize_bits; -	__u64	logical; -	__u64	physical; -	__u64	length; -	__u32	flags = 0; -	int	error; - -	logical =  (__u64)newex->ec_block << blksize_bits; - -	if (newex->ec_type == EXT4_EXT_CACHE_GAP) { -		pgoff_t offset; -		struct page *page; -		struct buffer_head *bh = NULL; +	struct extent_status es; +	ext4_lblk_t block, next_del; -		offset = logical >> PAGE_SHIFT; -		page = find_get_page(inode->i_mapping, offset); -		if (!page || !page_has_buffers(page)) -			return EXT_CONTINUE; +	if (newes->es_pblk == 0) { +		ext4_es_find_delayed_extent_range(inode, newes->es_lblk, +				newes->es_lblk + newes->es_len - 1, &es); -		bh = page_buffers(page); - -		if (!bh) -			return EXT_CONTINUE; +		/* +		 * No extent in extent-tree contains block @newes->es_pblk, +		 * then the block may stay in 1)a hole or 2)delayed-extent. +		 */ +		if (es.es_len == 0) +			/* A hole found. */ +			return 0; -		if (buffer_delay(bh)) { -			flags |= FIEMAP_EXTENT_DELALLOC; -			page_cache_release(page); -		} else { -			page_cache_release(page); -			return EXT_CONTINUE; +		if (es.es_lblk > newes->es_lblk) { +			/* A hole found. */ +			newes->es_len = min(es.es_lblk - newes->es_lblk, +					    newes->es_len); +			return 0;  		} -	} - -	physical = (__u64)newex->ec_start << blksize_bits; -	length =   (__u64)newex->ec_len << blksize_bits; - -	if (ex && ext4_ext_is_uninitialized(ex)) -		flags |= FIEMAP_EXTENT_UNWRITTEN; - -	/* -	 * If this extent reaches EXT_MAX_BLOCK, it must be last. -	 * -	 * Or if ext4_ext_next_allocated_block is EXT_MAX_BLOCK, -	 * this also indicates no more allocated blocks. 
-	 * -	 * XXX this might miss a single-block extent at EXT_MAX_BLOCK -	 */ -	if (ext4_ext_next_allocated_block(path) == EXT_MAX_BLOCK || -	    newex->ec_block + newex->ec_len - 1 == EXT_MAX_BLOCK) { -		loff_t size = i_size_read(inode); -		loff_t bs = EXT4_BLOCK_SIZE(inode->i_sb); -		flags |= FIEMAP_EXTENT_LAST; -		if ((flags & FIEMAP_EXTENT_DELALLOC) && -		    logical+length > size) -			length = (size - logical + bs - 1) & ~(bs-1); +		newes->es_len = es.es_lblk + es.es_len - newes->es_lblk;  	} -	error = fiemap_fill_next_extent(fieinfo, logical, physical, -					length, flags); -	if (error < 0) -		return error; -	if (error == 1) -		return EXT_BREAK; +	block = newes->es_lblk + newes->es_len; +	ext4_es_find_delayed_extent_range(inode, block, EXT_MAX_BLOCKS, &es); +	if (es.es_len == 0) +		next_del = EXT_MAX_BLOCKS; +	else +		next_del = es.es_lblk; -	return EXT_CONTINUE; +	return next_del;  } -  /* fiemap flags we can handle specified here */  #define EXT4_FIEMAP_FLAGS	(FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR) @@ -3846,7 +5111,7 @@ static int ext4_xattr_fiemap(struct inode *inode,  		error = ext4_get_inode_loc(inode, &iloc);  		if (error)  			return error; -		physical = iloc.bh->b_blocknr << blockbits; +		physical = (__u64)iloc.bh->b_blocknr << blockbits;  		offset = EXT4_GOOD_OLD_INODE_SIZE +  				EXT4_I(inode)->i_extra_isize;  		physical += offset; @@ -3854,7 +5119,7 @@ static int ext4_xattr_fiemap(struct inode *inode,  		flags |= FIEMAP_EXTENT_DATA_INLINE;  		brelse(iloc.bh);  	} else { /* external block */ -		physical = EXT4_I(inode)->i_file_acl << blockbits; +		physical = (__u64)EXT4_I(inode)->i_file_acl << blockbits;  		length = inode->i_sb->s_blocksize;  	} @@ -3870,6 +5135,21 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,  	ext4_lblk_t start_blk;  	int error = 0; +	if (ext4_has_inline_data(inode)) { +		int has_inline = 1; + +		error = ext4_inline_data_fiemap(inode, fieinfo, &has_inline); + +		if (has_inline) +			return error; +	} + +	if (fieinfo->fi_flags & FIEMAP_FLAG_CACHE) { +		error = ext4_ext_precache(inode); +		if (error) +			return error; +	} +  	/* fallback to generic here if not in extents fmt */  	if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))  		return generic_block_fiemap(inode, fieinfo, start, len, @@ -3886,18 +5166,347 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,  		start_blk = start >> inode->i_sb->s_blocksize_bits;  		last_blk = (start + len - 1) >> inode->i_sb->s_blocksize_bits; -		if (last_blk >= EXT_MAX_BLOCK) -			last_blk = EXT_MAX_BLOCK-1; +		if (last_blk >= EXT_MAX_BLOCKS) +			last_blk = EXT_MAX_BLOCKS-1;  		len_blks = ((ext4_lblk_t) last_blk) - start_blk + 1;  		/* -		 * Walk the extent tree gathering extent information. -		 * ext4_ext_fiemap_cb will push extents back to user. +		 * Walk the extent tree gathering extent information +		 * and pushing extents back to the user.  		 */ -		error = ext4_ext_walk_space(inode, start_blk, len_blks, -					  ext4_ext_fiemap_cb, fieinfo); +		error = ext4_fill_fiemap_extents(inode, start_blk, +						 len_blks, fieinfo);  	} - +	ext4_es_lru_add(inode);  	return error;  } +/* + * ext4_access_path: + * Function to access the path buffer for marking it dirty. + * It also checks if there are sufficient credits left in the journal handle + * to update path. 
+ */ +static int +ext4_access_path(handle_t *handle, struct inode *inode, +		struct ext4_ext_path *path) +{ +	int credits, err; + +	if (!ext4_handle_valid(handle)) +		return 0; + +	/* +	 * Check if need to extend journal credits +	 * 3 for leaf, sb, and inode plus 2 (bmap and group +	 * descriptor) for each block group; assume two block +	 * groups +	 */ +	if (handle->h_buffer_credits < 7) { +		credits = ext4_writepage_trans_blocks(inode); +		err = ext4_ext_truncate_extend_restart(handle, inode, credits); +		/* EAGAIN is success */ +		if (err && err != -EAGAIN) +			return err; +	} + +	err = ext4_ext_get_access(handle, inode, path); +	return err; +} + +/* + * ext4_ext_shift_path_extents: + * Shift the extents of a path structure lying between path[depth].p_ext + * and EXT_LAST_EXTENT(path[depth].p_hdr) downwards, by subtracting shift + * from starting block for each extent. + */ +static int +ext4_ext_shift_path_extents(struct ext4_ext_path *path, ext4_lblk_t shift, +			    struct inode *inode, handle_t *handle, +			    ext4_lblk_t *start) +{ +	int depth, err = 0; +	struct ext4_extent *ex_start, *ex_last; +	bool update = 0; +	depth = path->p_depth; + +	while (depth >= 0) { +		if (depth == path->p_depth) { +			ex_start = path[depth].p_ext; +			if (!ex_start) +				return -EIO; + +			ex_last = EXT_LAST_EXTENT(path[depth].p_hdr); +			if (!ex_last) +				return -EIO; + +			err = ext4_access_path(handle, inode, path + depth); +			if (err) +				goto out; + +			if (ex_start == EXT_FIRST_EXTENT(path[depth].p_hdr)) +				update = 1; + +			*start = le32_to_cpu(ex_last->ee_block) + +				ext4_ext_get_actual_len(ex_last); + +			while (ex_start <= ex_last) { +				le32_add_cpu(&ex_start->ee_block, -shift); +				/* Try to merge to the left. */ +				if ((ex_start > +				     EXT_FIRST_EXTENT(path[depth].p_hdr)) && +				    ext4_ext_try_to_merge_right(inode, +							path, ex_start - 1)) +					ex_last--; +				else +					ex_start++; +			} +			err = ext4_ext_dirty(handle, inode, path + depth); +			if (err) +				goto out; + +			if (--depth < 0 || !update) +				break; +		} + +		/* Update index too */ +		err = ext4_access_path(handle, inode, path + depth); +		if (err) +			goto out; + +		le32_add_cpu(&path[depth].p_idx->ei_block, -shift); +		err = ext4_ext_dirty(handle, inode, path + depth); +		if (err) +			goto out; + +		/* we are done if current index is not a starting index */ +		if (path[depth].p_idx != EXT_FIRST_INDEX(path[depth].p_hdr)) +			break; + +		depth--; +	} + +out: +	return err; +} + +/* + * ext4_ext_shift_extents: + * All the extents which lies in the range from start to the last allocated + * block for the file are shifted downwards by shift blocks. + * On success, 0 is returned, error otherwise. 
+ */ +static int +ext4_ext_shift_extents(struct inode *inode, handle_t *handle, +		       ext4_lblk_t start, ext4_lblk_t shift) +{ +	struct ext4_ext_path *path; +	int ret = 0, depth; +	struct ext4_extent *extent; +	ext4_lblk_t stop_block, current_block; +	ext4_lblk_t ex_start, ex_end; + +	/* Let path point to the last extent */ +	path = ext4_ext_find_extent(inode, EXT_MAX_BLOCKS - 1, NULL, 0); +	if (IS_ERR(path)) +		return PTR_ERR(path); + +	depth = path->p_depth; +	extent = path[depth].p_ext; +	if (!extent) { +		ext4_ext_drop_refs(path); +		kfree(path); +		return ret; +	} + +	stop_block = le32_to_cpu(extent->ee_block) + +			ext4_ext_get_actual_len(extent); +	ext4_ext_drop_refs(path); +	kfree(path); + +	/* Nothing to shift, if hole is at the end of file */ +	if (start >= stop_block) +		return ret; + +	/* +	 * Don't start shifting extents until we make sure the hole is big +	 * enough to accomodate the shift. +	 */ +	path = ext4_ext_find_extent(inode, start - 1, NULL, 0); +	if (IS_ERR(path)) +		return PTR_ERR(path); +	depth = path->p_depth; +	extent =  path[depth].p_ext; +	if (extent) { +		ex_start = le32_to_cpu(extent->ee_block); +		ex_end = le32_to_cpu(extent->ee_block) + +			ext4_ext_get_actual_len(extent); +	} else { +		ex_start = 0; +		ex_end = 0; +	} +	ext4_ext_drop_refs(path); +	kfree(path); + +	if ((start == ex_start && shift > ex_start) || +	    (shift > start - ex_end)) +		return -EINVAL; + +	/* Its safe to start updating extents */ +	while (start < stop_block) { +		path = ext4_ext_find_extent(inode, start, NULL, 0); +		if (IS_ERR(path)) +			return PTR_ERR(path); +		depth = path->p_depth; +		extent = path[depth].p_ext; +		if (!extent) { +			EXT4_ERROR_INODE(inode, "unexpected hole at %lu", +					 (unsigned long) start); +			return -EIO; +		} + +		current_block = le32_to_cpu(extent->ee_block); +		if (start > current_block) { +			/* Hole, move to the next extent */ +			ret = mext_next_extent(inode, path, &extent); +			if (ret != 0) { +				ext4_ext_drop_refs(path); +				kfree(path); +				if (ret == 1) +					ret = 0; +				break; +			} +		} +		ret = ext4_ext_shift_path_extents(path, shift, inode, +				handle, &start); +		ext4_ext_drop_refs(path); +		kfree(path); +		if (ret) +			break; +	} + +	return ret; +} + +/* + * ext4_collapse_range: + * This implements the fallocate's collapse range functionality for ext4 + * Returns: 0 and non-zero on error. + */ +int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) +{ +	struct super_block *sb = inode->i_sb; +	ext4_lblk_t punch_start, punch_stop; +	handle_t *handle; +	unsigned int credits; +	loff_t new_size, ioffset; +	int ret; + +	/* Collapse range works only on fs block size aligned offsets. */ +	if (offset & (EXT4_BLOCK_SIZE(sb) - 1) || +	    len & (EXT4_BLOCK_SIZE(sb) - 1)) +		return -EINVAL; + +	if (!S_ISREG(inode->i_mode)) +		return -EINVAL; + +	if (EXT4_SB(inode->i_sb)->s_cluster_ratio > 1) +		return -EOPNOTSUPP; + +	trace_ext4_collapse_range(inode, offset, len); + +	punch_start = offset >> EXT4_BLOCK_SIZE_BITS(sb); +	punch_stop = (offset + len) >> EXT4_BLOCK_SIZE_BITS(sb); + +	/* Call ext4_force_commit to flush all data in case of data=journal. */ +	if (ext4_should_journal_data(inode)) { +		ret = ext4_force_commit(inode->i_sb); +		if (ret) +			return ret; +	} + +	/* +	 * Need to round down offset to be aligned with page size boundary +	 * for page size > block size. 
+	 */ +	ioffset = round_down(offset, PAGE_SIZE); + +	/* Write out all dirty pages */ +	ret = filemap_write_and_wait_range(inode->i_mapping, ioffset, +					   LLONG_MAX); +	if (ret) +		return ret; + +	/* Take mutex lock */ +	mutex_lock(&inode->i_mutex); + +	/* +	 * There is no need to overlap collapse range with EOF, in which case +	 * it is effectively a truncate operation +	 */ +	if (offset + len >= i_size_read(inode)) { +		ret = -EINVAL; +		goto out_mutex; +	} + +	/* Currently just for extent based files */ +	if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { +		ret = -EOPNOTSUPP; +		goto out_mutex; +	} + +	truncate_pagecache(inode, ioffset); + +	/* Wait for existing dio to complete */ +	ext4_inode_block_unlocked_dio(inode); +	inode_dio_wait(inode); + +	credits = ext4_writepage_trans_blocks(inode); +	handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits); +	if (IS_ERR(handle)) { +		ret = PTR_ERR(handle); +		goto out_dio; +	} + +	down_write(&EXT4_I(inode)->i_data_sem); +	ext4_discard_preallocations(inode); + +	ret = ext4_es_remove_extent(inode, punch_start, +				    EXT_MAX_BLOCKS - punch_start); +	if (ret) { +		up_write(&EXT4_I(inode)->i_data_sem); +		goto out_stop; +	} + +	ret = ext4_ext_remove_space(inode, punch_start, punch_stop - 1); +	if (ret) { +		up_write(&EXT4_I(inode)->i_data_sem); +		goto out_stop; +	} +	ext4_discard_preallocations(inode); + +	ret = ext4_ext_shift_extents(inode, handle, punch_stop, +				     punch_stop - punch_start); +	if (ret) { +		up_write(&EXT4_I(inode)->i_data_sem); +		goto out_stop; +	} + +	new_size = i_size_read(inode) - len; +	i_size_write(inode, new_size); +	EXT4_I(inode)->i_disksize = new_size; + +	up_write(&EXT4_I(inode)->i_data_sem); +	if (IS_SYNC(inode)) +		ext4_handle_sync(handle); +	inode->i_mtime = inode->i_ctime = ext4_current_time(inode); +	ext4_mark_inode_dirty(handle, inode); + +out_stop: +	ext4_journal_stop(handle); +out_dio: +	ext4_inode_resume_unlocked_dio(inode); +out_mutex: +	mutex_unlock(&inode->i_mutex); +	return ret; +}  | 
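The bigalloc cases documented above get_implied_cluster_alloc() reduce to cluster arithmetic: when the requested logical block shares a cluster with the first or last block of a neighbouring extent, the physical block is already implied by that cluster's physical base plus the block's offset inside the cluster. The sketch below shows only that derivation (the clamping of map->m_len against the cluster boundary and the next extent is omitted); B2C/COFF/CMASK and the sample numbers are illustrative stand-ins, not the EXT4_B2C/EXT4_LBLK_COFF/EXT4_PBLK_CMASK macros themselves.

/* Userspace sketch of the "implied cluster" check in get_implied_cluster_alloc().
 * Helper macros and numbers are illustrative, not the kernel definitions. */
#include <stdio.h>
#include <stdint.h>

#define CLUSTER_RATIO 4                 /* blocks per cluster (power of two) */
#define B2C(b)   ((b) / CLUSTER_RATIO)  /* block -> cluster number */
#define COFF(b)  ((b) % CLUSTER_RATIO)  /* offset of a block inside its cluster */
#define CMASK(b) ((b) - COFF(b))        /* first block of the cluster */

/*
 * Existing extent: logical [ee_block, ee_block + ee_len) -> physical ee_start,
 * and a requested logical block lblk known not to overlap it.  If lblk shares
 * a cluster with the extent's first or last block, the physical block is
 * already determined and no new cluster has to be allocated.
 */
static int implied_cluster_alloc(uint64_t ee_block, uint64_t ee_len,
				 uint64_t ee_start, uint64_t lblk,
				 uint64_t *pblk)
{
	uint64_t rr_cluster = B2C(lblk);

	if (rr_cluster == B2C(ee_block)) {
		*pblk = CMASK(ee_start) + COFF(lblk);
		return 1;
	}
	if (rr_cluster == B2C(ee_block + ee_len - 1)) {
		*pblk = CMASK(ee_start + ee_len - 1) + COFF(lblk);
		return 1;
	}
	return 0;			/* caller must allocate a new cluster */
}

int main(void)
{
	uint64_t pblk;

	/* extent: logical 8..9 -> physical 1000..1001; request logical 10 */
	if (implied_cluster_alloc(8, 2, 1000, 10, &pblk))
		printf("logical 10 -> physical %llu (cluster already in use)\n",
		       (unsigned long long)pblk);
	return 0;
}

The derivation only works because, as the allocation path notes, a physical cluster is allocated so that a block's offset inside it matches the logical offset; that is what the adjustment of ar.goal and ar.logical by 'offset' above guarantees.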
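get_reserved_cluster_alloc() decides how many of the clusters spanned by a freshly mapped range were actually reserved for it, leaving out an edge cluster whose out-of-range part is still delayed-allocated, since that cluster's reservation stays with the delalloc blocks. A userspace sketch of the same accounting follows; has_delalloc() is an assumed stand-in for ext4_find_delalloc_range(), and the cluster ratio of 4 matches the worked example in the comment above.

/* Sketch of the reserved-cluster accounting in get_reserved_cluster_alloc().
 * has_delalloc() is a demo stand-in for ext4_find_delalloc_range(). */
#include <stdio.h>
#include <stdint.h>

#define CLUSTER_RATIO 4                 /* blocks per cluster */
#define B2C(b)  ((b) / CLUSTER_RATIO)   /* block -> cluster number */
#define COFF(b) ((b) % CLUSTER_RATIO)   /* offset of a block inside its cluster */

/* Demo predicate: pretend logical blocks 0..2 are delayed-allocated. */
static int has_delalloc(uint64_t from, uint64_t to)
{
	(void)to;
	return from <= 2;
}

static unsigned int reserved_clusters(uint64_t lblk_start, unsigned int num_blks)
{
	unsigned int clusters;
	uint64_t off;

	/* clusters touched by [lblk_start, lblk_start + num_blks) */
	clusters = (unsigned int)(B2C(lblk_start + num_blks - 1) -
				  B2C(lblk_start) + 1);

	/* left edge: blocks of the first cluster that lie before the range */
	off = COFF(lblk_start);
	if (off && has_delalloc(lblk_start - off, lblk_start - 1))
		clusters--;	/* that cluster's reservation stays with delalloc */

	/* right edge: blocks of the last cluster that lie after the range */
	off = COFF(lblk_start + num_blks);
	if (clusters && off &&
	    has_delalloc(lblk_start + num_blks,
			 lblk_start + num_blks + (CLUSTER_RATIO - off) - 1))
		clusters--;

	return clusters;
}

int main(void)
{
	/* allocation of blocks 3..8 while 0..2 are delayed-allocated:
	 * clusters [0-3],[4-7],[8-11] are spanned, but [0-3] belongs to
	 * the delalloc blocks, so only 2 are counted for this range. */
	printf("%u reserved clusters attributed to this range\n",
	       reserved_clusters(3, 6));
	return 0;
}

With a cluster ratio of 1 both edge offsets are always zero and the function simply returns the block count, which is the non-bigalloc behaviour the comment above calls out.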
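The comment in ext4_fallocate() about not converting len directly to max_blocks is plain block arithmetic: the byte range can straddle a block boundary even when len is smaller than a block, so the count has to be taken between block-aligned ends. A small demo of that calculation follows; BLOCK_ALIGN here is an illustrative macro standing in for the rounding EXT4_BLOCK_ALIGN is used for, not its kernel definition.

/* Demo of the max_blocks arithmetic used in ext4_fallocate() above. */
#include <stdio.h>
#include <stdint.h>

#define BLKBITS 12	/* 4096-byte blocks */
#define BLOCK_ALIGN(x) (((x) + (1u << BLKBITS) - 1) & ~((uint64_t)(1u << BLKBITS) - 1))

int main(void)
{
	uint64_t offset = 3072, len = 2048;
	uint64_t lblk = offset >> BLKBITS;
	uint64_t max_blocks = (BLOCK_ALIGN(offset + len) >> BLKBITS) - lblk;

	/* bytes 3072..5119 touch blocks 0 and 1, so two blocks are needed
	 * even though len >> blkbits would give zero */
	printf("lblk %llu, max_blocks %llu, len>>blkbits %llu\n",
	       (unsigned long long)lblk, (unsigned long long)max_blocks,
	       (unsigned long long)(len >> BLKBITS));
	return 0;
}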
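Collapse range, after punching out the requested blocks, relies on ext4_ext_shift_extents() to slide every extent from punch_stop onwards down by the punched length, folding a shifted extent into its left neighbour when the two become contiguous. The sketch below does the same over a flat in-memory extent list; the array, the merge test and the sample extents are illustrations, not the on-disk extent tree walk.

/* Sketch of the logical-block shift performed for collapse range. */
#include <stdio.h>
#include <stdint.h>

struct ext { uint64_t lblk, len, pblk; };

static size_t shift_extents(struct ext *ex, size_t n,
			    uint64_t stop, uint64_t shift)
{
	size_t i, out = 0;

	for (i = 0; i < n; i++) {
		struct ext e = ex[i];

		if (e.lblk >= stop)
			e.lblk -= shift;	/* move the extent down the file */

		/* fold into the previous extent when logically and physically
		 * contiguous, roughly what ext4_ext_try_to_merge_right() does */
		if (out && ex[out - 1].lblk + ex[out - 1].len == e.lblk &&
		    ex[out - 1].pblk + ex[out - 1].len == e.pblk) {
			ex[out - 1].len += e.len;
			continue;
		}
		ex[out++] = e;
	}
	return out;
}

int main(void)
{
	/* file with a 4-block range collapsed at logical 4..7 */
	struct ext ex[] = { { 0, 4, 100 }, { 8, 4, 104 } };
	size_t n = shift_extents(ex, 2, /*stop=*/8, /*shift=*/4);

	for (size_t i = 0; i < n; i++)
		printf("lblk %llu len %llu pblk %llu\n",
		       (unsigned long long)ex[i].lblk,
		       (unsigned long long)ex[i].len,
		       (unsigned long long)ex[i].pblk);
	return 0;
}

The real code also refuses the operation up front when the gap before 'start' cannot accommodate the shift, which is the -EINVAL check in ext4_ext_shift_extents() above.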