Diffstat (limited to 'fs/ext4')
-rw-r--r--  fs/ext4/acl.c              |  223
-rw-r--r--  fs/ext4/acl.h              |    9
-rw-r--r--  fs/ext4/balloc.c           |   97
-rw-r--r--  fs/ext4/block_validity.c   |   33
-rw-r--r--  fs/ext4/dir.c              |   38
-rw-r--r--  fs/ext4/ext4.h             |  121
-rw-r--r--  fs/ext4/ext4_extents.h     |   22
-rw-r--r--  fs/ext4/ext4_jbd2.c        |   26
-rw-r--r--  fs/ext4/ext4_jbd2.h        |    4
-rw-r--r--  fs/ext4/extents.c          | 1130
-rw-r--r--  fs/ext4/extents_status.c   |   42
-rw-r--r--  fs/ext4/extents_status.h   |    9
-rw-r--r--  fs/ext4/file.c             |  180
-rw-r--r--  fs/ext4/ialloc.c           |   39
-rw-r--r--  fs/ext4/indirect.c         |   38
-rw-r--r--  fs/ext4/inline.c           |   44
-rw-r--r--  fs/ext4/inode.c            |  385
-rw-r--r--  fs/ext4/ioctl.c            |   37
-rw-r--r--  fs/ext4/mballoc.c          |   80
-rw-r--r--  fs/ext4/mballoc.h          |    4
-rw-r--r--  fs/ext4/migrate.c          |    2
-rw-r--r--  fs/ext4/mmp.c              |    6
-rw-r--r--  fs/ext4/move_extent.c      |   84
-rw-r--r--  fs/ext4/namei.c            |  623
-rw-r--r--  fs/ext4/page-io.c          |   50
-rw-r--r--  fs/ext4/resize.c           |   49
-rw-r--r--  fs/ext4/super.c            |  269
-rw-r--r--  fs/ext4/xattr.c            |  102
-rw-r--r--  fs/ext4/xattr.h            |    8
29 files changed, 2321 insertions(+), 1433 deletions(-)
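
A note on the pattern used throughout the ACL rework below: the new ext4_set_acl() wraps __ext4_set_acl() in a start-handle/retry loop that restarts the transaction when the journal returns -ENOSPC. The following is a minimal userspace sketch of that retry shape only; journal_start, journal_stop, set_acl_body and should_retry_alloc are hypothetical stand-ins for the ext4/jbd2 calls, not real kernel API.

	#include <stdio.h>
	#include <errno.h>

	/* Illustrative stubs; none of these are real ext4/jbd2 functions. */
	static int journal_start(void) { return 0; }     /* models ext4_journal_start() */
	static void journal_stop(void) { }                /* models ext4_journal_stop() */
	static int should_retry_alloc(int *retries)       /* models ext4_should_retry_alloc() */
	{
		return (*retries)++ < 3;                  /* give up after a few attempts */
	}
	static int set_acl_body(void)                     /* models __ext4_set_acl() */
	{
		static int calls;
		return ++calls < 2 ? -ENOSPC : 0;         /* fail once, then succeed */
	}

	int main(void)
	{
		int error, retries = 0;

	retry:
		error = journal_start();
		if (error)
			return 1;
		error = set_acl_body();
		journal_stop();
		/* Restart the whole handle: committing the journal may free space. */
		if (error == -ENOSPC && should_retry_alloc(&retries))
			goto retry;
		printf("done: error=%d after %d retries\n", error, retries);
		return error ? 1 : 0;
	}

In the real code the loop is bounded by ext4_should_retry_alloc(), which only permits a retry while committing the journal can still free blocks.
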
diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c index 39a54a0e9fe..d40c8dbbb0d 100644 --- a/fs/ext4/acl.c +++ b/fs/ext4/acl.c @@ -152,13 +152,6 @@ ext4_get_acl(struct inode *inode, int type)  	struct posix_acl *acl;  	int retval; -	if (!test_opt(inode->i_sb, POSIX_ACL)) -		return NULL; - -	acl = get_cached_acl(inode, type); -	if (acl != ACL_NOT_CACHED) -		return acl; -  	switch (type) {  	case ACL_TYPE_ACCESS:  		name_index = EXT4_XATTR_INDEX_POSIX_ACL_ACCESS; @@ -196,7 +189,7 @@ ext4_get_acl(struct inode *inode, int type)   * inode->i_mutex: down unless called from ext4_new_inode   */  static int -ext4_set_acl(handle_t *handle, struct inode *inode, int type, +__ext4_set_acl(handle_t *handle, struct inode *inode, int type,  	     struct posix_acl *acl)  {  	int name_index; @@ -204,9 +197,6 @@ ext4_set_acl(handle_t *handle, struct inode *inode, int type,  	size_t size = 0;  	int error; -	if (S_ISLNK(inode->i_mode)) -		return -EOPNOTSUPP; -  	switch (type) {  	case ACL_TYPE_ACCESS:  		name_index = EXT4_XATTR_INDEX_POSIX_ACL_ACCESS; @@ -248,208 +238,51 @@ ext4_set_acl(handle_t *handle, struct inode *inode, int type,  	return error;  } -/* - * Initialize the ACLs of a new inode. Called from ext4_new_inode. - * - * dir->i_mutex: down - * inode->i_mutex: up (access to inode is still exclusive) - */  int -ext4_init_acl(handle_t *handle, struct inode *inode, struct inode *dir) +ext4_set_acl(struct inode *inode, struct posix_acl *acl, int type)  { -	struct posix_acl *acl = NULL; -	int error = 0; - -	if (!S_ISLNK(inode->i_mode)) { -		if (test_opt(dir->i_sb, POSIX_ACL)) { -			acl = ext4_get_acl(dir, ACL_TYPE_DEFAULT); -			if (IS_ERR(acl)) -				return PTR_ERR(acl); -		} -		if (!acl) -			inode->i_mode &= ~current_umask(); -	} -	if (test_opt(inode->i_sb, POSIX_ACL) && acl) { -		if (S_ISDIR(inode->i_mode)) { -			error = ext4_set_acl(handle, inode, -					     ACL_TYPE_DEFAULT, acl); -			if (error) -				goto cleanup; -		} -		error = posix_acl_create(&acl, GFP_NOFS, &inode->i_mode); -		if (error < 0) -			return error; - -		if (error > 0) { -			/* This is an extended ACL */ -			error = ext4_set_acl(handle, inode, ACL_TYPE_ACCESS, acl); -		} -	} -cleanup: -	posix_acl_release(acl); -	return error; -} - -/* - * Does chmod for an inode that may have an Access Control List. The - * inode->i_mode field must be updated to the desired value by the caller - * before calling this function. - * Returns 0 on success, or a negative error number. - * - * We change the ACL rather than storing some ACL entries in the file - * mode permission bits (which would be more efficient), because that - * would break once additional permissions (like  ACL_APPEND, ACL_DELETE - * for directories) are added. There are no more bits available in the - * file mode. 
- * - * inode->i_mutex: down - */ -int -ext4_acl_chmod(struct inode *inode) -{ -	struct posix_acl *acl;  	handle_t *handle; -	int retries = 0; -	int error; - +	int error, retries = 0; -	if (S_ISLNK(inode->i_mode)) -		return -EOPNOTSUPP; -	if (!test_opt(inode->i_sb, POSIX_ACL)) -		return 0; -	acl = ext4_get_acl(inode, ACL_TYPE_ACCESS); -	if (IS_ERR(acl) || !acl) -		return PTR_ERR(acl); -	error = posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode); -	if (error) -		return error;  retry:  	handle = ext4_journal_start(inode, EXT4_HT_XATTR,  				    ext4_jbd2_credits_xattr(inode)); -	if (IS_ERR(handle)) { -		error = PTR_ERR(handle); -		ext4_std_error(inode->i_sb, error); -		goto out; -	} -	error = ext4_set_acl(handle, inode, ACL_TYPE_ACCESS, acl); +	if (IS_ERR(handle)) +		return PTR_ERR(handle); + +	error = __ext4_set_acl(handle, inode, type, acl);  	ext4_journal_stop(handle); -	if (error == -ENOSPC && -	    ext4_should_retry_alloc(inode->i_sb, &retries)) +	if (error == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))  		goto retry; -out: -	posix_acl_release(acl);  	return error;  }  /* - * Extended attribute handlers + * Initialize the ACLs of a new inode. Called from ext4_new_inode. + * + * dir->i_mutex: down + * inode->i_mutex: up (access to inode is still exclusive)   */ -static size_t -ext4_xattr_list_acl_access(struct dentry *dentry, char *list, size_t list_len, -			   const char *name, size_t name_len, int type) -{ -	const size_t size = sizeof(POSIX_ACL_XATTR_ACCESS); - -	if (!test_opt(dentry->d_sb, POSIX_ACL)) -		return 0; -	if (list && size <= list_len) -		memcpy(list, POSIX_ACL_XATTR_ACCESS, size); -	return size; -} - -static size_t -ext4_xattr_list_acl_default(struct dentry *dentry, char *list, size_t list_len, -			    const char *name, size_t name_len, int type) -{ -	const size_t size = sizeof(POSIX_ACL_XATTR_DEFAULT); - -	if (!test_opt(dentry->d_sb, POSIX_ACL)) -		return 0; -	if (list && size <= list_len) -		memcpy(list, POSIX_ACL_XATTR_DEFAULT, size); -	return size; -} - -static int -ext4_xattr_get_acl(struct dentry *dentry, const char *name, void *buffer, -		   size_t size, int type) +int +ext4_init_acl(handle_t *handle, struct inode *inode, struct inode *dir)  { -	struct posix_acl *acl; +	struct posix_acl *default_acl, *acl;  	int error; -	if (strcmp(name, "") != 0) -		return -EINVAL; -	if (!test_opt(dentry->d_sb, POSIX_ACL)) -		return -EOPNOTSUPP; - -	acl = ext4_get_acl(dentry->d_inode, type); -	if (IS_ERR(acl)) -		return PTR_ERR(acl); -	if (acl == NULL) -		return -ENODATA; -	error = posix_acl_to_xattr(&init_user_ns, acl, buffer, size); -	posix_acl_release(acl); - -	return error; -} - -static int -ext4_xattr_set_acl(struct dentry *dentry, const char *name, const void *value, -		   size_t size, int flags, int type) -{ -	struct inode *inode = dentry->d_inode; -	handle_t *handle; -	struct posix_acl *acl; -	int error, retries = 0; - -	if (strcmp(name, "") != 0) -		return -EINVAL; -	if (!test_opt(inode->i_sb, POSIX_ACL)) -		return -EOPNOTSUPP; -	if (!inode_owner_or_capable(inode)) -		return -EPERM; - -	if (value) { -		acl = posix_acl_from_xattr(&init_user_ns, value, size); -		if (IS_ERR(acl)) -			return PTR_ERR(acl); -		else if (acl) { -			error = posix_acl_valid(acl); -			if (error) -				goto release_and_out; -		} -	} else -		acl = NULL; +	error = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl); +	if (error) +		return error; -retry: -	handle = ext4_journal_start(inode, EXT4_HT_XATTR, -				    ext4_jbd2_credits_xattr(inode)); -	if (IS_ERR(handle)) { -		error = 
PTR_ERR(handle); -		goto release_and_out; +	if (default_acl) { +		error = __ext4_set_acl(handle, inode, ACL_TYPE_DEFAULT, +				       default_acl); +		posix_acl_release(default_acl); +	} +	if (acl) { +		if (!error) +			error = __ext4_set_acl(handle, inode, ACL_TYPE_ACCESS, +					       acl); +		posix_acl_release(acl);  	} -	error = ext4_set_acl(handle, inode, type, acl); -	ext4_journal_stop(handle); -	if (error == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) -		goto retry; - -release_and_out: -	posix_acl_release(acl);  	return error;  } - -const struct xattr_handler ext4_xattr_acl_access_handler = { -	.prefix	= POSIX_ACL_XATTR_ACCESS, -	.flags	= ACL_TYPE_ACCESS, -	.list	= ext4_xattr_list_acl_access, -	.get	= ext4_xattr_get_acl, -	.set	= ext4_xattr_set_acl, -}; - -const struct xattr_handler ext4_xattr_acl_default_handler = { -	.prefix	= POSIX_ACL_XATTR_DEFAULT, -	.flags	= ACL_TYPE_DEFAULT, -	.list	= ext4_xattr_list_acl_default, -	.get	= ext4_xattr_get_acl, -	.set	= ext4_xattr_set_acl, -}; diff --git a/fs/ext4/acl.h b/fs/ext4/acl.h index 18cb39ed7c7..da2c79577d7 100644 --- a/fs/ext4/acl.h +++ b/fs/ext4/acl.h @@ -55,18 +55,13 @@ static inline int ext4_acl_count(size_t size)  /* acl.c */  struct posix_acl *ext4_get_acl(struct inode *inode, int type); -extern int ext4_acl_chmod(struct inode *); +int ext4_set_acl(struct inode *inode, struct posix_acl *acl, int type);  extern int ext4_init_acl(handle_t *, struct inode *, struct inode *);  #else  /* CONFIG_EXT4_FS_POSIX_ACL */  #include <linux/sched.h>  #define ext4_get_acl NULL - -static inline int -ext4_acl_chmod(struct inode *inode) -{ -	return 0; -} +#define ext4_set_acl NULL  static inline int  ext4_init_acl(handle_t *handle, struct inode *inode, struct inode *dir) diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index dc5d572ebd6..fca382037dd 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -83,9 +83,9 @@ static inline int ext4_block_in_group(struct super_block *sb,  /* Return the number of clusters used for file system metadata; this   * represents the overhead needed by the file system.   
*/ -unsigned ext4_num_overhead_clusters(struct super_block *sb, -				    ext4_group_t block_group, -				    struct ext4_group_desc *gdp) +static unsigned ext4_num_overhead_clusters(struct super_block *sb, +					   ext4_group_t block_group, +					   struct ext4_group_desc *gdp)  {  	unsigned num_clusters;  	int block_cluster = -1, inode_cluster = -1, itbl_cluster = -1, i, c; @@ -176,9 +176,10 @@ static unsigned int num_clusters_in_group(struct super_block *sb,  }  /* Initializes an uninitialized block bitmap */ -void ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh, -			    ext4_group_t block_group, -			    struct ext4_group_desc *gdp) +static void ext4_init_block_bitmap(struct super_block *sb, +				   struct buffer_head *bh, +				   ext4_group_t block_group, +				   struct ext4_group_desc *gdp)  {  	unsigned int bit, bit_max;  	struct ext4_sb_info *sbi = EXT4_SB(sb); @@ -193,7 +194,16 @@ void ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,  	if (!ext4_group_desc_csum_verify(sb, block_group, gdp)) {  		ext4_error(sb, "Checksum bad for group %u", block_group);  		grp = ext4_get_group_info(sb, block_group); +		if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) +			percpu_counter_sub(&sbi->s_freeclusters_counter, +					   grp->bb_free);  		set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state); +		if (!EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) { +			int count; +			count = ext4_free_inodes_count(sb, gdp); +			percpu_counter_sub(&sbi->s_freeinodes_counter, +					   count); +		}  		set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state);  		return;  	} @@ -307,6 +317,7 @@ static ext4_fsblk_t ext4_valid_block_bitmap(struct super_block *sb,  					    ext4_group_t block_group,  					    struct buffer_head *bh)  { +	struct ext4_sb_info *sbi = EXT4_SB(sb);  	ext4_grpblk_t offset;  	ext4_grpblk_t next_zero_bit;  	ext4_fsblk_t blk; @@ -326,14 +337,14 @@ static ext4_fsblk_t ext4_valid_block_bitmap(struct super_block *sb,  	/* check whether block bitmap block number is set */  	blk = ext4_block_bitmap(sb, desc);  	offset = blk - group_first_block; -	if (!ext4_test_bit(offset, bh->b_data)) +	if (!ext4_test_bit(EXT4_B2C(sbi, offset), bh->b_data))  		/* bad block bitmap */  		return blk;  	/* check whether the inode bitmap block number is set */  	blk = ext4_inode_bitmap(sb, desc);  	offset = blk - group_first_block; -	if (!ext4_test_bit(offset, bh->b_data)) +	if (!ext4_test_bit(EXT4_B2C(sbi, offset), bh->b_data))  		/* bad block bitmap */  		return blk; @@ -341,21 +352,23 @@ static ext4_fsblk_t ext4_valid_block_bitmap(struct super_block *sb,  	blk = ext4_inode_table(sb, desc);  	offset = blk - group_first_block;  	next_zero_bit = ext4_find_next_zero_bit(bh->b_data, -				offset + EXT4_SB(sb)->s_itb_per_group, -				offset); -	if (next_zero_bit < offset + EXT4_SB(sb)->s_itb_per_group) +			EXT4_B2C(sbi, offset + EXT4_SB(sb)->s_itb_per_group), +			EXT4_B2C(sbi, offset)); +	if (next_zero_bit < +	    EXT4_B2C(sbi, offset + EXT4_SB(sb)->s_itb_per_group))  		/* bad bitmap for inode tables */  		return blk;  	return 0;  } -void ext4_validate_block_bitmap(struct super_block *sb, -			       struct ext4_group_desc *desc, -			       ext4_group_t block_group, -			       struct buffer_head *bh) +static void ext4_validate_block_bitmap(struct super_block *sb, +				       struct ext4_group_desc *desc, +				       ext4_group_t block_group, +				       struct buffer_head *bh)  {  	ext4_fsblk_t	blk;  	struct ext4_group_info *grp = ext4_get_group_info(sb, block_group); +	struct ext4_sb_info *sbi 
= EXT4_SB(sb);  	if (buffer_verified(bh))  		return; @@ -366,6 +379,9 @@ void ext4_validate_block_bitmap(struct super_block *sb,  		ext4_unlock_group(sb, block_group);  		ext4_error(sb, "bg %u: block %llu: invalid block bitmap",  			   block_group, blk); +		if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) +			percpu_counter_sub(&sbi->s_freeclusters_counter, +					   grp->bb_free);  		set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state);  		return;  	} @@ -373,6 +389,9 @@ void ext4_validate_block_bitmap(struct super_block *sb,  			desc, bh))) {  		ext4_unlock_group(sb, block_group);  		ext4_error(sb, "bg %u: bad block bitmap checksum", block_group); +		if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) +			percpu_counter_sub(&sbi->s_freeclusters_counter, +					   grp->bb_free);  		set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state);  		return;  	} @@ -640,6 +659,7 @@ ext4_fsblk_t ext4_count_free_clusters(struct super_block *sb)  	struct ext4_group_desc *gdp;  	ext4_group_t i;  	ext4_group_t ngroups = ext4_get_groups_count(sb); +	struct ext4_group_info *grp;  #ifdef EXT4FS_DEBUG  	struct ext4_super_block *es;  	ext4_fsblk_t bitmap_count; @@ -655,14 +675,18 @@ ext4_fsblk_t ext4_count_free_clusters(struct super_block *sb)  		gdp = ext4_get_group_desc(sb, i, NULL);  		if (!gdp)  			continue; -		desc_count += ext4_free_group_clusters(sb, gdp); +		grp = NULL; +		if (EXT4_SB(sb)->s_group_info) +			grp = ext4_get_group_info(sb, i); +		if (!grp || !EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) +			desc_count += ext4_free_group_clusters(sb, gdp);  		brelse(bitmap_bh);  		bitmap_bh = ext4_read_block_bitmap(sb, i);  		if (bitmap_bh == NULL)  			continue;  		x = ext4_count_free(bitmap_bh->b_data, -				    EXT4_BLOCKS_PER_GROUP(sb) / 8); +				    EXT4_CLUSTERS_PER_GROUP(sb) / 8);  		printk(KERN_DEBUG "group %u: stored = %d, counted = %u\n",  			i, ext4_free_group_clusters(sb, gdp), x);  		bitmap_count += x; @@ -679,7 +703,11 @@ ext4_fsblk_t ext4_count_free_clusters(struct super_block *sb)  		gdp = ext4_get_group_desc(sb, i, NULL);  		if (!gdp)  			continue; -		desc_count += ext4_free_group_clusters(sb, gdp); +		grp = NULL; +		if (EXT4_SB(sb)->s_group_info) +			grp = ext4_get_group_info(sb, i); +		if (!grp || !EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) +			desc_count += ext4_free_group_clusters(sb, gdp);  	}  	return desc_count; @@ -699,16 +727,6 @@ static inline int test_root(ext4_group_t a, int b)  	}  } -static int ext4_group_sparse(ext4_group_t group) -{ -	if (group <= 1) -		return 1; -	if (!(group & 1)) -		return 0; -	return (test_root(group, 7) || test_root(group, 5) || -		test_root(group, 3)); -} -  /**   *	ext4_bg_has_super - number of blocks used by the superblock in group   *	@sb: superblock for filesystem @@ -719,11 +737,26 @@ static int ext4_group_sparse(ext4_group_t group)   */  int ext4_bg_has_super(struct super_block *sb, ext4_group_t group)  { -	if (EXT4_HAS_RO_COMPAT_FEATURE(sb, -				EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER) && -			!ext4_group_sparse(group)) +	struct ext4_super_block *es = EXT4_SB(sb)->s_es; + +	if (group == 0) +		return 1; +	if (EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_SPARSE_SUPER2)) { +		if (group == le32_to_cpu(es->s_backup_bgs[0]) || +		    group == le32_to_cpu(es->s_backup_bgs[1])) +			return 1;  		return 0; -	return 1; +	} +	if ((group <= 1) || !EXT4_HAS_RO_COMPAT_FEATURE(sb, +					EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER)) +		return 1; +	if (!(group & 1)) +		return 0; +	if (test_root(group, 3) || (test_root(group, 5)) || +	    test_root(group, 7)) +		return 1; + +	return 0;  }  static 
unsigned long ext4_bg_num_gdb_meta(struct super_block *sb, diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c index 3f11656bd72..41eb9dcfac7 100644 --- a/fs/ext4/block_validity.c +++ b/fs/ext4/block_validity.c @@ -180,37 +180,12 @@ int ext4_setup_system_zone(struct super_block *sb)  /* Called when the filesystem is unmounted */  void ext4_release_system_zone(struct super_block *sb)  { -	struct rb_node	*n = EXT4_SB(sb)->system_blks.rb_node; -	struct rb_node	*parent; -	struct ext4_system_zone	*entry; +	struct ext4_system_zone	*entry, *n; -	while (n) { -		/* Do the node's children first */ -		if (n->rb_left) { -			n = n->rb_left; -			continue; -		} -		if (n->rb_right) { -			n = n->rb_right; -			continue; -		} -		/* -		 * The node has no children; free it, and then zero -		 * out parent's link to it.  Finally go to the -		 * beginning of the loop and try to free the parent -		 * node. -		 */ -		parent = rb_parent(n); -		entry = rb_entry(n, struct ext4_system_zone, node); +	rbtree_postorder_for_each_entry_safe(entry, n, +			&EXT4_SB(sb)->system_blks, node)  		kmem_cache_free(ext4_system_zone_cachep, entry); -		if (!parent) -			EXT4_SB(sb)->system_blks = RB_ROOT; -		else if (parent->rb_left == n) -			parent->rb_left = NULL; -		else if (parent->rb_right == n) -			parent->rb_right = NULL; -		n = parent; -	} +  	EXT4_SB(sb)->system_blks = RB_ROOT;  } diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index 680bb338891..ef1bed66c14 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -105,7 +105,7 @@ int __ext4_check_dir_entry(const char *function, unsigned int line,  static int ext4_readdir(struct file *file, struct dir_context *ctx)  {  	unsigned int offset; -	int i, stored; +	int i;  	struct ext4_dir_entry_2 *de;  	int err;  	struct inode *inode = file_inode(file); @@ -133,7 +133,6 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx)  			return ret;  	} -	stored = 0;  	offset = ctx->pos & (sb->s_blocksize - 1);  	while (ctx->pos < inode->i_size) { @@ -353,41 +352,16 @@ struct fname {   */  static void free_rb_tree_fname(struct rb_root *root)  { -	struct rb_node	*n = root->rb_node; -	struct rb_node	*parent; -	struct fname	*fname; - -	while (n) { -		/* Do the node's children first */ -		if (n->rb_left) { -			n = n->rb_left; -			continue; -		} -		if (n->rb_right) { -			n = n->rb_right; -			continue; -		} -		/* -		 * The node has no children; free it, and then zero -		 * out parent's link to it.  Finally go to the -		 * beginning of the loop and try to free the parent -		 * node. 
-		 */ -		parent = rb_parent(n); -		fname = rb_entry(n, struct fname, rb_hash); +	struct fname *fname, *next; + +	rbtree_postorder_for_each_entry_safe(fname, next, root, rb_hash)  		while (fname) {  			struct fname *old = fname;  			fname = fname->next;  			kfree(old);  		} -		if (!parent) -			*root = RB_ROOT; -		else if (parent->rb_left == n) -			parent->rb_left = NULL; -		else if (parent->rb_right == n) -			parent->rb_right = NULL; -		n = parent; -	} + +	*root = RB_ROOT;  } diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index af815ea9d7c..7cc5a0e2368 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -29,7 +29,9 @@  #include <linux/wait.h>  #include <linux/blockgroup_lock.h>  #include <linux/percpu_counter.h> +#include <linux/ratelimit.h>  #include <crypto/hash.h> +#include <linux/falloc.h>  #ifdef __KERNEL__  #include <linux/compat.h>  #endif @@ -156,7 +158,6 @@ struct ext4_allocation_request {  #define EXT4_MAP_MAPPED		(1 << BH_Mapped)  #define EXT4_MAP_UNWRITTEN	(1 << BH_Unwritten)  #define EXT4_MAP_BOUNDARY	(1 << BH_Boundary) -#define EXT4_MAP_UNINIT		(1 << BH_Uninit)  /* Sometimes (in the bigalloc case, from ext4_da_get_block_prep) the caller of   * ext4_map_blocks wants to know whether or not the underlying cluster has   * already been accounted for. EXT4_MAP_FROM_CLUSTER conveys to the caller that @@ -167,7 +168,7 @@ struct ext4_allocation_request {  #define EXT4_MAP_FROM_CLUSTER	(1 << BH_AllocFromCluster)  #define EXT4_MAP_FLAGS		(EXT4_MAP_NEW | EXT4_MAP_MAPPED |\  				 EXT4_MAP_UNWRITTEN | EXT4_MAP_BOUNDARY |\ -				 EXT4_MAP_UNINIT | EXT4_MAP_FROM_CLUSTER) +				 EXT4_MAP_FROM_CLUSTER)  struct ext4_map_blocks {  	ext4_fsblk_t m_pblk; @@ -182,7 +183,7 @@ struct ext4_map_blocks {  #define	EXT4_IO_END_UNWRITTEN	0x0001  /* - * For converting uninitialized extents on a work queue. 'handle' is used for + * For converting unwritten extents on a work queue. 'handle' is used for   * buffered writeback.   
*/  typedef struct ext4_io_end { @@ -267,6 +268,16 @@ struct ext4_io_submit {  /* Translate # of blks to # of clusters */  #define EXT4_NUM_B2C(sbi, blks)	(((blks) + (sbi)->s_cluster_ratio - 1) >> \  				 (sbi)->s_cluster_bits) +/* Mask out the low bits to get the starting block of the cluster */ +#define EXT4_PBLK_CMASK(s, pblk) ((pblk) &				\ +				  ~((ext4_fsblk_t) (s)->s_cluster_ratio - 1)) +#define EXT4_LBLK_CMASK(s, lblk) ((lblk) &				\ +				  ~((ext4_lblk_t) (s)->s_cluster_ratio - 1)) +/* Get the cluster offset */ +#define EXT4_PBLK_COFF(s, pblk) ((pblk) &				\ +				 ((ext4_fsblk_t) (s)->s_cluster_ratio - 1)) +#define EXT4_LBLK_COFF(s, lblk) ((lblk) &				\ +				 ((ext4_lblk_t) (s)->s_cluster_ratio - 1))  /*   * Structure of a blocks group descriptor @@ -525,26 +536,26 @@ enum {  /*   * Flags used by ext4_map_blocks()   */ -	/* Allocate any needed blocks and/or convert an unitialized +	/* Allocate any needed blocks and/or convert an unwritten  	   extent to be an initialized ext4 */  #define EXT4_GET_BLOCKS_CREATE			0x0001 -	/* Request the creation of an unitialized extent */ -#define EXT4_GET_BLOCKS_UNINIT_EXT		0x0002 -#define EXT4_GET_BLOCKS_CREATE_UNINIT_EXT	(EXT4_GET_BLOCKS_UNINIT_EXT|\ +	/* Request the creation of an unwritten extent */ +#define EXT4_GET_BLOCKS_UNWRIT_EXT		0x0002 +#define EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT	(EXT4_GET_BLOCKS_UNWRIT_EXT|\  						 EXT4_GET_BLOCKS_CREATE)  	/* Caller is from the delayed allocation writeout path  	 * finally doing the actual allocation of delayed blocks */  #define EXT4_GET_BLOCKS_DELALLOC_RESERVE	0x0004  	/* caller is from the direct IO path, request to creation of an -	unitialized extents if not allocated, split the uninitialized +	unwritten extents if not allocated, split the unwritten  	extent if blocks has been preallocated already*/  #define EXT4_GET_BLOCKS_PRE_IO			0x0008  #define EXT4_GET_BLOCKS_CONVERT			0x0010  #define EXT4_GET_BLOCKS_IO_CREATE_EXT		(EXT4_GET_BLOCKS_PRE_IO|\ -					 EXT4_GET_BLOCKS_CREATE_UNINIT_EXT) +					 EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT)  	/* Convert extent to initialized after IO complete */  #define EXT4_GET_BLOCKS_IO_CONVERT_EXT		(EXT4_GET_BLOCKS_CONVERT|\ -					 EXT4_GET_BLOCKS_CREATE_UNINIT_EXT) +					 EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT)  	/* Eventual metadata allocation (due to growing extent tree)  	 * should not fail, so try to use reserved blocks for that.*/  #define EXT4_GET_BLOCKS_METADATA_NOFAIL		0x0020 @@ -556,6 +567,8 @@ enum {  #define EXT4_GET_BLOCKS_NO_LOCK			0x0100  	/* Do not put hole in extent cache */  #define EXT4_GET_BLOCKS_NO_PUT_HOLE		0x0200 +	/* Convert written extents to unwritten */ +#define EXT4_GET_BLOCKS_CONVERT_UNWRITTEN	0x0400  /*   * The bit position of these flags must not overlap with any of the @@ -760,6 +773,8 @@ do {									       \  	if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime))		       \  		(einode)->xtime.tv_sec = 				       \  			(signed)le32_to_cpu((raw_inode)->xtime);	       \ +	else								       \ +		(einode)->xtime.tv_sec = 0;				       \  	if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime ## _extra))	       \  		ext4_decode_extra_time(&(einode)->xtime,		       \  				       raw_inode->xtime ## _extra);	       \ @@ -860,6 +875,8 @@ struct ext4_inode_info {  	struct inode vfs_inode;  	struct jbd2_inode *jinode; +	spinlock_t i_raw_lock;	/* protects updates to the raw inode */ +  	/*  	 * File creation time. Its function is same as that of  	 * struct timespec i_{a,c,m}time in the generic inode. 
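
A worked example of the cluster mask macros added in the ext4.h hunk above (EXT4_PBLK_CMASK/EXT4_LBLK_CMASK and the matching *_COFF offset macros). This is a standalone userspace sketch assuming a bigalloc filesystem with s_cluster_ratio = 16; struct sbi below is a stand-in for ext4_sb_info, and only the logical-block variants are shown.

	#include <stdio.h>
	#include <stdint.h>

	typedef uint32_t ext4_lblk_t;

	struct sbi { unsigned int s_cluster_ratio; };  /* stand-in for ext4_sb_info */

	/* Same arithmetic as EXT4_LBLK_CMASK / EXT4_LBLK_COFF in the hunk above. */
	#define LBLK_CMASK(s, lblk) ((lblk) & ~((ext4_lblk_t)((s)->s_cluster_ratio - 1)))
	#define LBLK_COFF(s, lblk)  ((lblk) &  ((ext4_lblk_t)((s)->s_cluster_ratio - 1)))

	int main(void)
	{
		struct sbi s = { .s_cluster_ratio = 16 };  /* 16 blocks per cluster */
		ext4_lblk_t lblk = 1000;

		/* 1000 = 62 * 16 + 8: the cluster starts at block 992, offset 8. */
		printf("cluster start = %u, offset in cluster = %u\n",
		       (unsigned)LBLK_CMASK(&s, lblk), (unsigned)LBLK_COFF(&s, lblk));
		return 0;
	}

The mask trick only works because s_cluster_ratio is always a power of two (it is derived as 1 << s_cluster_bits), so ratio - 1 is a contiguous low-bit mask.
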
@@ -985,6 +1002,8 @@ struct ext4_inode_info {  #define EXT4_MOUNT2_STD_GROUP_SIZE	0x00000002 /* We have standard group  						      size of blocksize * 8  						      blocks */ +#define EXT4_MOUNT2_HURD_COMPAT		0x00000004 /* Support HURD-castrated +						      file systems */  #define clear_opt(sb, opt)		EXT4_SB(sb)->s_mount_opt &= \  						~EXT4_MOUNT_##opt @@ -1141,7 +1160,8 @@ struct ext4_super_block {  	__le32	s_usr_quota_inum;	/* inode for tracking user quota */  	__le32	s_grp_quota_inum;	/* inode for tracking group quota */  	__le32	s_overhead_clusters;	/* overhead blocks/clusters in fs */ -	__le32	s_reserved[108];	/* Padding to the end of the block */ +	__le32	s_backup_bgs[2];	/* groups with sparse_super2 SBs */ +	__le32	s_reserved[106];	/* Padding to the end of the block */  	__le32	s_checksum;		/* crc32c(superblock) */  }; @@ -1313,7 +1333,13 @@ struct ext4_sb_info {  	struct list_head s_es_lru;  	unsigned long s_es_last_sorted;  	struct percpu_counter s_extent_cache_cnt; +	struct mb_cache *s_mb_cache;  	spinlock_t s_es_lru_lock ____cacheline_aligned_in_smp; + +	/* Ratelimit ext4 messages. */ +	struct ratelimit_state s_err_ratelimit_state; +	struct ratelimit_state s_warning_ratelimit_state; +	struct ratelimit_state s_msg_ratelimit_state;  };  static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb) @@ -1396,7 +1422,18 @@ static inline void ext4_clear_inode_##name(struct inode *inode, int bit) \  	clear_bit(bit + (offset), &EXT4_I(inode)->i_##field);		\  } +/* Add these declarations here only so that these functions can be + * found by name.  Otherwise, they are very hard to locate. */ +static inline int ext4_test_inode_flag(struct inode *inode, int bit); +static inline void ext4_set_inode_flag(struct inode *inode, int bit); +static inline void ext4_clear_inode_flag(struct inode *inode, int bit);  EXT4_INODE_BIT_FNS(flag, flags, 0) + +/* Add these declarations here only so that these functions can be + * found by name.  Otherwise, they are very hard to locate. 
*/ +static inline int ext4_test_inode_state(struct inode *inode, int bit); +static inline void ext4_set_inode_state(struct inode *inode, int bit); +static inline void ext4_clear_inode_state(struct inode *inode, int bit);  #if (BITS_PER_LONG < 64)  EXT4_INODE_BIT_FNS(state, state_flags, 0) @@ -1470,6 +1507,7 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)  #define EXT4_FEATURE_COMPAT_EXT_ATTR		0x0008  #define EXT4_FEATURE_COMPAT_RESIZE_INODE	0x0010  #define EXT4_FEATURE_COMPAT_DIR_INDEX		0x0020 +#define EXT4_FEATURE_COMPAT_SPARSE_SUPER2	0x0200  #define EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER	0x0001  #define EXT4_FEATURE_RO_COMPAT_LARGE_FILE	0x0002 @@ -1918,10 +1956,6 @@ extern void ext4_get_group_no_and_offset(struct super_block *sb,  extern ext4_group_t ext4_get_group_number(struct super_block *sb,  					  ext4_fsblk_t block); -extern void ext4_validate_block_bitmap(struct super_block *sb, -				       struct ext4_group_desc *desc, -				       ext4_group_t block_group, -				       struct buffer_head *bh);  extern unsigned int ext4_block_group(struct super_block *sb,  			ext4_fsblk_t blocknr);  extern ext4_grpblk_t ext4_block_group_offset(struct super_block *sb, @@ -1950,16 +1984,9 @@ extern int ext4_wait_block_bitmap(struct super_block *sb,  				  struct buffer_head *bh);  extern struct buffer_head *ext4_read_block_bitmap(struct super_block *sb,  						  ext4_group_t block_group); -extern void ext4_init_block_bitmap(struct super_block *sb, -				   struct buffer_head *bh, -				   ext4_group_t group, -				   struct ext4_group_desc *desc);  extern unsigned ext4_free_clusters_after_init(struct super_block *sb,  					      ext4_group_t block_group,  					      struct ext4_group_desc *gdp); -extern unsigned ext4_num_overhead_clusters(struct super_block *sb, -					   ext4_group_t block_group, -					   struct ext4_group_desc *gdp);  ext4_fsblk_t ext4_inode_to_goal_block(struct inode *);  /* dir.c */ @@ -2102,10 +2129,6 @@ extern int ext4_alloc_da_blocks(struct inode *inode);  extern void ext4_set_aops(struct inode *inode);  extern int ext4_writepage_trans_blocks(struct inode *);  extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks); -extern int ext4_block_truncate_page(handle_t *handle, -		struct address_space *mapping, loff_t from); -extern int ext4_block_zero_page_range(handle_t *handle, -		struct address_space *mapping, loff_t from, loff_t length);  extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode,  			     loff_t lstart, loff_t lend);  extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); @@ -2117,8 +2140,7 @@ extern void ext4_da_update_reserve_space(struct inode *inode,  extern int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,  				struct ext4_map_blocks *map, int flags);  extern ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb, -				const struct iovec *iov, loff_t offset, -				unsigned long nr_segs); +				struct iov_iter *iter, loff_t offset);  extern int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock);  extern int ext4_ind_trans_blocks(struct inode *inode, int nrblocks);  extern void ext4_ind_truncate(handle_t *, struct inode *inode); @@ -2165,8 +2187,6 @@ extern int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count);  /* super.c */  extern int ext4_calculate_overhead(struct super_block *sb); -extern int ext4_superblock_csum_verify(struct super_block *sb, -				       struct ext4_super_block *es);  extern void ext4_superblock_csum_set(struct super_block 
*sb);  extern void *ext4_kvmalloc(size_t size, gfp_t flags);  extern void *ext4_kvzalloc(size_t size, gfp_t flags); @@ -2433,23 +2453,6 @@ static inline void ext4_update_i_disksize(struct inode *inode, loff_t newsize)  	up_write(&EXT4_I(inode)->i_data_sem);  } -/* - * Update i_disksize after writeback has been started. Races with truncate - * are avoided by checking i_size under i_data_sem. - */ -static inline void ext4_wb_update_i_disksize(struct inode *inode, loff_t newsize) -{ -	loff_t i_size; - -	down_write(&EXT4_I(inode)->i_data_sem); -	i_size = i_size_read(inode); -	if (newsize > i_size) -		newsize = i_size; -	if (newsize > EXT4_I(inode)->i_disksize) -		EXT4_I(inode)->i_disksize = newsize; -	up_write(&EXT4_I(inode)->i_data_sem); -} -  struct ext4_group_info {  	unsigned long   bb_state;  	struct rb_root  bb_free_root; @@ -2555,19 +2558,11 @@ extern const struct file_operations ext4_dir_operations;  extern const struct inode_operations ext4_file_inode_operations;  extern const struct file_operations ext4_file_operations;  extern loff_t ext4_llseek(struct file *file, loff_t offset, int origin); -extern void ext4_unwritten_wait(struct inode *inode);  /* inline.c */  extern int ext4_has_inline_data(struct inode *inode); -extern int ext4_get_inline_size(struct inode *inode);  extern int ext4_get_max_inline_size(struct inode *inode);  extern int ext4_find_inline_data_nolock(struct inode *inode); -extern void ext4_write_inline_data(struct inode *inode, -				   struct ext4_iloc *iloc, -				   void *buffer, loff_t pos, -				   unsigned int len); -extern int ext4_prepare_inline_data(handle_t *handle, struct inode *inode, -				    unsigned int len);  extern int ext4_init_inline_data(handle_t *handle, struct inode *inode,  				 unsigned int len);  extern int ext4_destroy_inline_data(handle_t *handle, struct inode *inode); @@ -2728,17 +2723,18 @@ extern int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk);  extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,  			__u64 start, __u64 len);  extern int ext4_ext_precache(struct inode *inode); +extern int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len);  /* move_extent.c */  extern void ext4_double_down_write_data_sem(struct inode *first,  					    struct inode *second);  extern void ext4_double_up_write_data_sem(struct inode *orig_inode,  					  struct inode *donor_inode); -void ext4_inode_double_lock(struct inode *inode1, struct inode *inode2); -void ext4_inode_double_unlock(struct inode *inode1, struct inode *inode2);  extern int ext4_move_extents(struct file *o_filp, struct file *d_filp,  			     __u64 start_orig, __u64 start_donor,  			     __u64 len, __u64 *moved_len); +extern int mext_next_extent(struct inode *inode, struct ext4_ext_path *path, +			    struct ext4_extent **extent);  /* page-io.c */  extern int __init ext4_init_pageio(void); @@ -2754,23 +2750,20 @@ extern void ext4_io_submit(struct ext4_io_submit *io);  extern int ext4_bio_write_page(struct ext4_io_submit *io,  			       struct page *page,  			       int len, -			       struct writeback_control *wbc); +			       struct writeback_control *wbc, +			       bool keep_towrite);  /* mmp.c */  extern int ext4_multi_mount_protect(struct super_block *, ext4_fsblk_t); -extern void ext4_mmp_csum_set(struct super_block *sb, struct mmp_struct *mmp); -extern int ext4_mmp_csum_verify(struct super_block *sb, -				struct mmp_struct *mmp);  /*   * Note that these flags will never ever appear in a buffer_head's state flag.   
* See EXT4_MAP_... to see where this is used.   */  enum ext4_state_bits { -	BH_Uninit	/* blocks are allocated but uninitialized on disk */ -	 = BH_JBDPrivateStart, -	BH_AllocFromCluster,	/* allocated blocks were part of already +	BH_AllocFromCluster	/* allocated blocks were part of already  				 * allocated cluster. */ +	= BH_JBDPrivateStart  };  /* diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h index 5074fe23f19..a867f5ca999 100644 --- a/fs/ext4/ext4_extents.h +++ b/fs/ext4/ext4_extents.h @@ -137,21 +137,21 @@ struct ext4_ext_path {   * EXT_INIT_MAX_LEN is the maximum number of blocks we can have in an   * initialized extent. This is 2^15 and not (2^16 - 1), since we use the   * MSB of ee_len field in the extent datastructure to signify if this - * particular extent is an initialized extent or an uninitialized (i.e. + * particular extent is an initialized extent or an unwritten (i.e.   * preallocated). - * EXT_UNINIT_MAX_LEN is the maximum number of blocks we can have in an - * uninitialized extent. + * EXT_UNWRITTEN_MAX_LEN is the maximum number of blocks we can have in an + * unwritten extent.   * If ee_len is <= 0x8000, it is an initialized extent. Otherwise, it is an - * uninitialized one. In other words, if MSB of ee_len is set, it is an - * uninitialized extent with only one special scenario when ee_len = 0x8000. - * In this case we can not have an uninitialized extent of zero length and + * unwritten one. In other words, if MSB of ee_len is set, it is an + * unwritten extent with only one special scenario when ee_len = 0x8000. + * In this case we can not have an unwritten extent of zero length and   * thus we make it as a special case of initialized extent with 0x8000 length.   * This way we get better extent-to-group alignment for initialized extents.   * Hence, the maximum number of blocks we can have in an *initialized* - * extent is 2^15 (32768) and in an *uninitialized* extent is 2^15-1 (32767). + * extent is 2^15 (32768) and in an *unwritten* extent is 2^15-1 (32767).   */  #define EXT_INIT_MAX_LEN	(1UL << 15) -#define EXT_UNINIT_MAX_LEN	(EXT_INIT_MAX_LEN - 1) +#define EXT_UNWRITTEN_MAX_LEN	(EXT_INIT_MAX_LEN - 1)  #define EXT_FIRST_EXTENT(__hdr__) \ @@ -187,14 +187,14 @@ static inline unsigned short ext_depth(struct inode *inode)  	return le16_to_cpu(ext_inode_hdr(inode)->eh_depth);  } -static inline void ext4_ext_mark_uninitialized(struct ext4_extent *ext) +static inline void ext4_ext_mark_unwritten(struct ext4_extent *ext)  { -	/* We can not have an uninitialized extent of zero length! */ +	/* We can not have an unwritten extent of zero length! 
*/  	BUG_ON((le16_to_cpu(ext->ee_len) & ~EXT_INIT_MAX_LEN) == 0);  	ext->ee_len |= cpu_to_le16(EXT_INIT_MAX_LEN);  } -static inline int ext4_ext_is_uninitialized(struct ext4_extent *ext) +static inline int ext4_ext_is_unwritten(struct ext4_extent *ext)  {  	/* Extent with ee_len of 0x8000 is treated as an initialized extent */  	return (le16_to_cpu(ext->ee_len) > EXT_INIT_MAX_LEN); diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c index 17ac112ab10..0074e0d23d6 100644 --- a/fs/ext4/ext4_jbd2.c +++ b/fs/ext4/ext4_jbd2.c @@ -122,9 +122,10 @@ handle_t *__ext4_journal_start_reserved(handle_t *handle, unsigned int line,  	return handle;  } -void ext4_journal_abort_handle(const char *caller, unsigned int line, -			       const char *err_fn, struct buffer_head *bh, -			       handle_t *handle, int err) +static void ext4_journal_abort_handle(const char *caller, unsigned int line, +				      const char *err_fn, +				      struct buffer_head *bh, +				      handle_t *handle, int err)  {  	char nbuf[16];  	const char *errstr = ext4_decode_error(NULL, err, nbuf); @@ -259,6 +260,25 @@ int __ext4_handle_dirty_metadata(const char *where, unsigned int line,  		if (WARN_ON_ONCE(err)) {  			ext4_journal_abort_handle(where, line, __func__, bh,  						  handle, err); +			if (inode == NULL) { +				pr_err("EXT4: jbd2_journal_dirty_metadata " +				       "failed: handle type %u started at " +				       "line %u, credits %u/%u, errcode %d", +				       handle->h_type, +				       handle->h_line_no, +				       handle->h_requested_credits, +				       handle->h_buffer_credits, err); +				return err; +			} +			ext4_error_inode(inode, where, line, +					 bh->b_blocknr, +					 "journal_dirty_metadata failed: " +					 "handle type %u started at line %u, " +					 "credits %u/%u, errcode %d", +					 handle->h_type, +					 handle->h_line_no, +					 handle->h_requested_credits, +					 handle->h_buffer_credits, err);  		}  	} else {  		if (inode) diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h index 81cfefa9dc0..17c00ff202f 100644 --- a/fs/ext4/ext4_jbd2.h +++ b/fs/ext4/ext4_jbd2.h @@ -231,10 +231,6 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode);  /*   * Wrapper functions with which ext4 calls into JBD.   
*/ -void ext4_journal_abort_handle(const char *caller, unsigned int line, -			       const char *err_fn, -		struct buffer_head *bh, handle_t *handle, int err); -  int __ext4_journal_get_write_access(const char *where, unsigned int line,  				    handle_t *handle, struct buffer_head *bh); diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 54d52afcdb1..4da228a0e6d 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -37,7 +37,6 @@  #include <linux/quotaops.h>  #include <linux/string.h>  #include <linux/slab.h> -#include <linux/falloc.h>  #include <asm/uaccess.h>  #include <linux/fiemap.h>  #include "ext4_jbd2.h" @@ -51,8 +50,8 @@   */  #define EXT4_EXT_MAY_ZEROOUT	0x1  /* safe to zeroout if split fails \  					due to ENOSPC */ -#define EXT4_EXT_MARK_UNINIT1	0x2  /* mark first half uninitialized */ -#define EXT4_EXT_MARK_UNINIT2	0x4  /* mark second half uninitialized */ +#define EXT4_EXT_MARK_UNWRIT1	0x2  /* mark first half unwritten */ +#define EXT4_EXT_MARK_UNWRIT2	0x4  /* mark second half unwritten */  #define EXT4_EXT_DATA_VALID1	0x8  /* first half contains valid data */  #define EXT4_EXT_DATA_VALID2	0x10 /* second half contains valid data */ @@ -144,6 +143,7 @@ static int ext4_ext_get_access(handle_t *handle, struct inode *inode,  {  	if (path->p_bh) {  		/* path points to block */ +		BUFFER_TRACE(path->p_bh, "get_write_access");  		return ext4_journal_get_write_access(handle, path->p_bh);  	}  	/* path points to leaf/index in inode body */ @@ -360,8 +360,10 @@ static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext)  {  	ext4_fsblk_t block = ext4_ext_pblock(ext);  	int len = ext4_ext_get_actual_len(ext); +	ext4_lblk_t lblock = le32_to_cpu(ext->ee_block); +	ext4_lblk_t last = lblock + len - 1; -	if (len == 0) +	if (lblock > last)  		return 0;  	return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, len);  } @@ -387,11 +389,26 @@ static int ext4_valid_extent_entries(struct inode *inode,  	if (depth == 0) {  		/* leaf entries */  		struct ext4_extent *ext = EXT_FIRST_EXTENT(eh); +		struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es; +		ext4_fsblk_t pblock = 0; +		ext4_lblk_t lblock = 0; +		ext4_lblk_t prev = 0; +		int len = 0;  		while (entries) {  			if (!ext4_valid_extent(inode, ext))  				return 0; + +			/* Check for overlapping extents */ +			lblock = le32_to_cpu(ext->ee_block); +			len = ext4_ext_get_actual_len(ext); +			if ((lblock <= prev) && prev) { +				pblock = ext4_ext_pblock(ext); +				es->s_last_error_block = cpu_to_le64(pblock); +				return 0; +			}  			ext++;  			entries--; +			prev = lblock + len - 1;  		}  	} else {  		struct ext4_extent_idx *ext_idx = EXT_FIRST_INDEX(eh); @@ -508,7 +525,7 @@ __read_extent_tree_block(const char *function, unsigned int line,  						     lblk - prev, ~0,  						     EXTENT_STATUS_HOLE); -			if (ext4_ext_is_uninitialized(ex)) +			if (ext4_ext_is_unwritten(ex))  				status = EXTENT_STATUS_UNWRITTEN;  			ext4_es_cache_extent(inode, lblk, len,  					     ext4_ext_pblock(ex), status); @@ -604,7 +621,7 @@ static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path)  		} else if (path->p_ext) {  			ext_debug("  %d:[%d]%d:%llu ",  				  le32_to_cpu(path->p_ext->ee_block), -				  ext4_ext_is_uninitialized(path->p_ext), +				  ext4_ext_is_unwritten(path->p_ext),  				  ext4_ext_get_actual_len(path->p_ext),  				  ext4_ext_pblock(path->p_ext));  		} else @@ -630,7 +647,7 @@ static void ext4_ext_show_leaf(struct inode *inode, struct ext4_ext_path *path)  	for (i = 0; i < 
le16_to_cpu(eh->eh_entries); i++, ex++) {  		ext_debug("%d:[%d]%d:%llu ", le32_to_cpu(ex->ee_block), -			  ext4_ext_is_uninitialized(ex), +			  ext4_ext_is_unwritten(ex),  			  ext4_ext_get_actual_len(ex), ext4_ext_pblock(ex));  	}  	ext_debug("\n"); @@ -661,7 +678,7 @@ static void ext4_ext_show_move(struct inode *inode, struct ext4_ext_path *path,  		ext_debug("move %d:%llu:[%d]%d in new leaf %llu\n",  				le32_to_cpu(ex->ee_block),  				ext4_ext_pblock(ex), -				ext4_ext_is_uninitialized(ex), +				ext4_ext_is_unwritten(ex),  				ext4_ext_get_actual_len(ex),  				newblock);  		ex++; @@ -786,7 +803,7 @@ ext4_ext_binsearch(struct inode *inode,  	ext_debug("  -> %d:%llu:[%d]%d ",  			le32_to_cpu(path->p_ext->ee_block),  			ext4_ext_pblock(path->p_ext), -			ext4_ext_is_uninitialized(path->p_ext), +			ext4_ext_is_unwritten(path->p_ext),  			ext4_ext_get_actual_len(path->p_ext));  #ifdef CHECK_BINSEARCH @@ -1666,22 +1683,17 @@ int  ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,  				struct ext4_extent *ex2)  { -	unsigned short ext1_ee_len, ext2_ee_len, max_len; +	unsigned short ext1_ee_len, ext2_ee_len;  	/*  	 * Make sure that both extents are initialized. We don't merge -	 * uninitialized extents so that we can be sure that end_io code has +	 * unwritten extents so that we can be sure that end_io code has  	 * the extent that was written properly split out and conversion to  	 * initialized is trivial.  	 */ -	if (ext4_ext_is_uninitialized(ex1) || ext4_ext_is_uninitialized(ex2)) +	if (ext4_ext_is_unwritten(ex1) != ext4_ext_is_unwritten(ex2))  		return 0; -	if (ext4_ext_is_uninitialized(ex1)) -		max_len = EXT_UNINIT_MAX_LEN; -	else -		max_len = EXT_INIT_MAX_LEN; -  	ext1_ee_len = ext4_ext_get_actual_len(ex1);  	ext2_ee_len = ext4_ext_get_actual_len(ex2); @@ -1694,7 +1706,12 @@ ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,  	 * as an RO_COMPAT feature, refuse to merge to extents if  	 * this can result in the top bit of ee_len being set.  	 */ -	if (ext1_ee_len + ext2_ee_len > max_len) +	if (ext1_ee_len + ext2_ee_len > EXT_INIT_MAX_LEN) +		return 0; +	if (ext4_ext_is_unwritten(ex1) && +	    (ext4_test_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN) || +	     atomic_read(&EXT4_I(inode)->i_unwritten) || +	     (ext1_ee_len + ext2_ee_len > EXT_UNWRITTEN_MAX_LEN)))  		return 0;  #ifdef AGGRESSIVE_TEST  	if (ext1_ee_len >= 4) @@ -1719,8 +1736,7 @@ static int ext4_ext_try_to_merge_right(struct inode *inode,  {  	struct ext4_extent_header *eh;  	unsigned int depth, len; -	int merge_done = 0; -	int uninitialized = 0; +	int merge_done = 0, unwritten;  	depth = ext_depth(inode);  	BUG_ON(path[depth].p_hdr == NULL); @@ -1730,12 +1746,11 @@ static int ext4_ext_try_to_merge_right(struct inode *inode,  		if (!ext4_can_extents_be_merged(inode, ex, ex + 1))  			break;  		/* merge with next extent! 
*/ -		if (ext4_ext_is_uninitialized(ex)) -			uninitialized = 1; +		unwritten = ext4_ext_is_unwritten(ex);  		ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)  				+ ext4_ext_get_actual_len(ex + 1)); -		if (uninitialized) -			ext4_ext_mark_uninitialized(ex); +		if (unwritten) +			ext4_ext_mark_unwritten(ex);  		if (ex + 1 < EXT_LAST_EXTENT(eh)) {  			len = (EXT_LAST_EXTENT(eh) - ex - 1) @@ -1844,8 +1859,7 @@ static unsigned int ext4_ext_check_overlap(struct ext4_sb_info *sbi,  	depth = ext_depth(inode);  	if (!path[depth].p_ext)  		goto out; -	b2 = le32_to_cpu(path[depth].p_ext->ee_block); -	b2 &= ~(sbi->s_cluster_ratio - 1); +	b2 = EXT4_LBLK_CMASK(sbi, le32_to_cpu(path[depth].p_ext->ee_block));  	/*  	 * get the next allocated block if the extent in the path @@ -1855,7 +1869,7 @@ static unsigned int ext4_ext_check_overlap(struct ext4_sb_info *sbi,  		b2 = ext4_ext_next_allocated_block(path);  		if (b2 == EXT_MAX_BLOCKS)  			goto out; -		b2 &= ~(sbi->s_cluster_ratio - 1); +		b2 = EXT4_LBLK_CMASK(sbi, b2);  	}  	/* check for wrap through zero on extent logical start block*/ @@ -1890,8 +1904,7 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,  	struct ext4_ext_path *npath = NULL;  	int depth, len, err;  	ext4_lblk_t next; -	unsigned uninitialized = 0; -	int mb_flags = 0; +	int mb_flags = 0, unwritten;  	if (unlikely(ext4_ext_get_actual_len(newext) == 0)) {  		EXT4_ERROR_INODE(inode, "ext4_ext_get_actual_len(newext) == 0"); @@ -1931,29 +1944,21 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,  		if (ext4_can_extents_be_merged(inode, ex, newext)) {  			ext_debug("append [%d]%d block to %u:[%d]%d"  				  "(from %llu)\n", -				  ext4_ext_is_uninitialized(newext), +				  ext4_ext_is_unwritten(newext),  				  ext4_ext_get_actual_len(newext),  				  le32_to_cpu(ex->ee_block), -				  ext4_ext_is_uninitialized(ex), +				  ext4_ext_is_unwritten(ex),  				  ext4_ext_get_actual_len(ex),  				  ext4_ext_pblock(ex));  			err = ext4_ext_get_access(handle, inode,  						  path + depth);  			if (err)  				return err; - -			/* -			 * ext4_can_extents_be_merged should have checked -			 * that either both extents are uninitialized, or -			 * both aren't. Thus we need to check only one of -			 * them here. -			 */ -			if (ext4_ext_is_uninitialized(ex)) -				uninitialized = 1; +			unwritten = ext4_ext_is_unwritten(ex);  			ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)  					+ ext4_ext_get_actual_len(newext)); -			if (uninitialized) -				ext4_ext_mark_uninitialized(ex); +			if (unwritten) +				ext4_ext_mark_unwritten(ex);  			eh = path[depth].p_hdr;  			nearex = ex;  			goto merge; @@ -1965,10 +1970,10 @@ prepend:  			ext_debug("prepend %u[%d]%d block to %u:[%d]%d"  				  "(from %llu)\n",  				  le32_to_cpu(newext->ee_block), -				  ext4_ext_is_uninitialized(newext), +				  ext4_ext_is_unwritten(newext),  				  ext4_ext_get_actual_len(newext),  				  le32_to_cpu(ex->ee_block), -				  ext4_ext_is_uninitialized(ex), +				  ext4_ext_is_unwritten(ex),  				  ext4_ext_get_actual_len(ex),  				  ext4_ext_pblock(ex));  			err = ext4_ext_get_access(handle, inode, @@ -1976,20 +1981,13 @@ prepend:  			if (err)  				return err; -			/* -			 * ext4_can_extents_be_merged should have checked -			 * that either both extents are uninitialized, or -			 * both aren't. Thus we need to check only one of -			 * them here. 
-			 */ -			if (ext4_ext_is_uninitialized(ex)) -				uninitialized = 1; +			unwritten = ext4_ext_is_unwritten(ex);  			ex->ee_block = newext->ee_block;  			ext4_ext_store_pblock(ex, ext4_ext_pblock(newext));  			ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)  					+ ext4_ext_get_actual_len(newext)); -			if (uninitialized) -				ext4_ext_mark_uninitialized(ex); +			if (unwritten) +				ext4_ext_mark_unwritten(ex);  			eh = path[depth].p_hdr;  			nearex = ex;  			goto merge; @@ -2049,7 +2047,7 @@ has_space:  		ext_debug("first extent in the leaf: %u:%llu:[%d]%d\n",  				le32_to_cpu(newext->ee_block),  				ext4_ext_pblock(newext), -				ext4_ext_is_uninitialized(newext), +				ext4_ext_is_unwritten(newext),  				ext4_ext_get_actual_len(newext));  		nearex = EXT_FIRST_EXTENT(eh);  	} else { @@ -2060,7 +2058,7 @@ has_space:  					"nearest %p\n",  					le32_to_cpu(newext->ee_block),  					ext4_ext_pblock(newext), -					ext4_ext_is_uninitialized(newext), +					ext4_ext_is_unwritten(newext),  					ext4_ext_get_actual_len(newext),  					nearex);  			nearex++; @@ -2071,7 +2069,7 @@ has_space:  					"nearest %p\n",  					le32_to_cpu(newext->ee_block),  					ext4_ext_pblock(newext), -					ext4_ext_is_uninitialized(newext), +					ext4_ext_is_unwritten(newext),  					ext4_ext_get_actual_len(newext),  					nearex);  		} @@ -2081,7 +2079,7 @@ has_space:  					"move %d extents from 0x%p to 0x%p\n",  					le32_to_cpu(newext->ee_block),  					ext4_ext_pblock(newext), -					ext4_ext_is_uninitialized(newext), +					ext4_ext_is_unwritten(newext),  					ext4_ext_get_actual_len(newext),  					len, nearex, nearex + 1);  			memmove(nearex + 1, nearex, @@ -2203,7 +2201,7 @@ static int ext4_fill_fiemap_extents(struct inode *inode,  			es.es_lblk = le32_to_cpu(ex->ee_block);  			es.es_len = ext4_ext_get_actual_len(ex);  			es.es_pblk = ext4_ext_pblock(ex); -			if (ext4_ext_is_uninitialized(ex)) +			if (ext4_ext_is_unwritten(ex))  				flags |= FIEMAP_EXTENT_UNWRITTEN;  		} @@ -2535,7 +2533,7 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,  		 * extent, we have to mark the cluster as used (store negative  		 * cluster number in partial_cluster).  		 */ -		unaligned = pblk & (sbi->s_cluster_ratio - 1); +		unaligned = EXT4_PBLK_COFF(sbi, pblk);  		if (unaligned && (ee_len == num) &&  		    (*partial_cluster != -((long long)EXT4_B2C(sbi, pblk))))  			*partial_cluster = EXT4_B2C(sbi, pblk); @@ -2579,7 +2577,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,  	unsigned num;  	ext4_lblk_t ex_ee_block;  	unsigned short ex_ee_len; -	unsigned uninitialized = 0; +	unsigned unwritten = 0;  	struct ext4_extent *ex;  	ext4_fsblk_t pblk; @@ -2600,18 +2598,39 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,  	ex_ee_block = le32_to_cpu(ex->ee_block);  	ex_ee_len = ext4_ext_get_actual_len(ex); +	/* +	 * If we're starting with an extent other than the last one in the +	 * node, we need to see if it shares a cluster with the extent to +	 * the right (towards the end of the file). If its leftmost cluster +	 * is this extent's rightmost cluster and it is not cluster aligned, +	 * we'll mark it as a partial that is not to be deallocated. 
+	 */ + +	if (ex != EXT_LAST_EXTENT(eh)) { +		ext4_fsblk_t current_pblk, right_pblk; +		long long current_cluster, right_cluster; + +		current_pblk = ext4_ext_pblock(ex) + ex_ee_len - 1; +		current_cluster = (long long)EXT4_B2C(sbi, current_pblk); +		right_pblk = ext4_ext_pblock(ex + 1); +		right_cluster = (long long)EXT4_B2C(sbi, right_pblk); +		if (current_cluster == right_cluster && +			EXT4_PBLK_COFF(sbi, right_pblk)) +			*partial_cluster = -right_cluster; +	} +  	trace_ext4_ext_rm_leaf(inode, start, ex, *partial_cluster);  	while (ex >= EXT_FIRST_EXTENT(eh) &&  			ex_ee_block + ex_ee_len > start) { -		if (ext4_ext_is_uninitialized(ex)) -			uninitialized = 1; +		if (ext4_ext_is_unwritten(ex)) +			unwritten = 1;  		else -			uninitialized = 0; +			unwritten = 0;  		ext_debug("remove ext %u:[%d]%d\n", ex_ee_block, -			 uninitialized, ex_ee_len); +			  unwritten, ex_ee_len);  		path[depth].p_ext = ex;  		a = ex_ee_block > start ? ex_ee_block : start; @@ -2629,7 +2648,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,  			 * accidentally freeing it later on  			 */  			pblk = ext4_ext_pblock(ex); -			if (pblk & (sbi->s_cluster_ratio - 1)) +			if (EXT4_PBLK_COFF(sbi, pblk))  				*partial_cluster =  					-((long long)EXT4_B2C(sbi, pblk));  			ex--; @@ -2683,11 +2702,11 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,  		ex->ee_len = cpu_to_le16(num);  		/* -		 * Do not mark uninitialized if all the blocks in the +		 * Do not mark unwritten if all the blocks in the  		 * extent have been removed.  		 */ -		if (uninitialized && num) -			ext4_ext_mark_uninitialized(ex); +		if (unwritten && num) +			ext4_ext_mark_unwritten(ex);  		/*  		 * If the extent was completely released,  		 * we need to remove it from the leaf @@ -2725,10 +2744,15 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,  		err = ext4_ext_correct_indexes(handle, inode, path);  	/* -	 * Free the partial cluster only if the current extent does not -	 * reference it. Otherwise we might free used cluster. +	 * If there's a partial cluster and at least one extent remains in +	 * the leaf, free the partial cluster if it isn't shared with the +	 * current extent.  If there's a partial cluster and no extents +	 * remain in the leaf, it can't be freed here.  It can only be +	 * freed when it's possible to determine if it's not shared with +	 * any other extent - when the next leaf is processed or when space +	 * removal is complete.  	 */ -	if (*partial_cluster > 0 && +	if (*partial_cluster > 0 && eh->eh_entries &&  	    (EXT4_B2C(sbi, ext4_ext_pblock(ex) + ex_ee_len - 1) !=  	     *partial_cluster)) {  		int flags = get_default_free_blocks_flags(inode); @@ -2831,9 +2855,9 @@ again:  		    end < ee_block + ext4_ext_get_actual_len(ex) - 1) {  			int split_flag = 0; -			if (ext4_ext_is_uninitialized(ex)) -				split_flag = EXT4_EXT_MARK_UNINIT1 | -					     EXT4_EXT_MARK_UNINIT2; +			if (ext4_ext_is_unwritten(ex)) +				split_flag = EXT4_EXT_MARK_UNWRIT1 | +					     EXT4_EXT_MARK_UNWRIT2;  			/*  			 * Split the extent in two so that 'end' is the last @@ -3090,7 +3114,7 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)   * @path: the path to the extent   * @split: the logical block where the extent is splitted.   * @split_flags: indicates if the extent could be zeroout if split fails, and - *		 the states(init or uninit) of new extents. + *		 the states(init or unwritten) of new extents.   * @flags: flags used to insert new extent to extent tree.   
*   * @@ -3132,10 +3156,10 @@ static int ext4_split_extent_at(handle_t *handle,  	newblock = split - ee_block + ext4_ext_pblock(ex);  	BUG_ON(split < ee_block || split >= (ee_block + ee_len)); -	BUG_ON(!ext4_ext_is_uninitialized(ex) && +	BUG_ON(!ext4_ext_is_unwritten(ex) &&  	       split_flag & (EXT4_EXT_MAY_ZEROOUT | -			     EXT4_EXT_MARK_UNINIT1 | -			     EXT4_EXT_MARK_UNINIT2)); +			     EXT4_EXT_MARK_UNWRIT1 | +			     EXT4_EXT_MARK_UNWRIT2));  	err = ext4_ext_get_access(handle, inode, path + depth);  	if (err) @@ -3147,8 +3171,8 @@ static int ext4_split_extent_at(handle_t *handle,  		 * then we just change the state of the extent, and splitting  		 * is not needed.  		 */ -		if (split_flag & EXT4_EXT_MARK_UNINIT2) -			ext4_ext_mark_uninitialized(ex); +		if (split_flag & EXT4_EXT_MARK_UNWRIT2) +			ext4_ext_mark_unwritten(ex);  		else  			ext4_ext_mark_initialized(ex); @@ -3162,8 +3186,8 @@ static int ext4_split_extent_at(handle_t *handle,  	/* case a */  	memcpy(&orig_ex, ex, sizeof(orig_ex));  	ex->ee_len = cpu_to_le16(split - ee_block); -	if (split_flag & EXT4_EXT_MARK_UNINIT1) -		ext4_ext_mark_uninitialized(ex); +	if (split_flag & EXT4_EXT_MARK_UNWRIT1) +		ext4_ext_mark_unwritten(ex);  	/*  	 * path may lead to new leaf, not to original leaf any more @@ -3177,8 +3201,8 @@ static int ext4_split_extent_at(handle_t *handle,  	ex2->ee_block = cpu_to_le32(split);  	ex2->ee_len   = cpu_to_le16(ee_len - (split - ee_block));  	ext4_ext_store_pblock(ex2, newblock); -	if (split_flag & EXT4_EXT_MARK_UNINIT2) -		ext4_ext_mark_uninitialized(ex2); +	if (split_flag & EXT4_EXT_MARK_UNWRIT2) +		ext4_ext_mark_unwritten(ex2);  	err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);  	if (err == -ENOSPC && (EXT4_EXT_MAY_ZEROOUT & split_flag)) { @@ -3255,7 +3279,7 @@ static int ext4_split_extent(handle_t *handle,  	struct ext4_extent *ex;  	unsigned int ee_len, depth;  	int err = 0; -	int uninitialized; +	int unwritten;  	int split_flag1, flags1;  	int allocated = map->m_len; @@ -3263,14 +3287,14 @@ static int ext4_split_extent(handle_t *handle,  	ex = path[depth].p_ext;  	ee_block = le32_to_cpu(ex->ee_block);  	ee_len = ext4_ext_get_actual_len(ex); -	uninitialized = ext4_ext_is_uninitialized(ex); +	unwritten = ext4_ext_is_unwritten(ex);  	if (map->m_lblk + map->m_len < ee_block + ee_len) {  		split_flag1 = split_flag & EXT4_EXT_MAY_ZEROOUT;  		flags1 = flags | EXT4_GET_BLOCKS_PRE_IO; -		if (uninitialized) -			split_flag1 |= EXT4_EXT_MARK_UNINIT1 | -				       EXT4_EXT_MARK_UNINIT2; +		if (unwritten) +			split_flag1 |= EXT4_EXT_MARK_UNWRIT1 | +				       EXT4_EXT_MARK_UNWRIT2;  		if (split_flag & EXT4_EXT_DATA_VALID2)  			split_flag1 |= EXT4_EXT_DATA_VALID1;  		err = ext4_split_extent_at(handle, inode, path, @@ -3290,15 +3314,20 @@ static int ext4_split_extent(handle_t *handle,  		return PTR_ERR(path);  	depth = ext_depth(inode);  	ex = path[depth].p_ext; -	uninitialized = ext4_ext_is_uninitialized(ex); +	if (!ex) { +		EXT4_ERROR_INODE(inode, "unexpected hole at %lu", +				 (unsigned long) map->m_lblk); +		return -EIO; +	} +	unwritten = ext4_ext_is_unwritten(ex);  	split_flag1 = 0;  	if (map->m_lblk >= ee_block) {  		split_flag1 = split_flag & EXT4_EXT_DATA_VALID2; -		if (uninitialized) { -			split_flag1 |= EXT4_EXT_MARK_UNINIT1; +		if (unwritten) { +			split_flag1 |= EXT4_EXT_MARK_UNWRIT1;  			split_flag1 |= split_flag & (EXT4_EXT_MAY_ZEROOUT | -						     EXT4_EXT_MARK_UNINIT2); +						     EXT4_EXT_MARK_UNWRIT2);  		}  		err = ext4_split_extent_at(handle, inode, path,  				map->m_lblk, 
split_flag1, flags);
@@ -3313,16 +3342,16 @@ out:
 
 /*
  * This function is called by ext4_ext_map_blocks() if someone tries to write
- * to an uninitialized extent. It may result in splitting the uninitialized
+ * to an unwritten extent. It may result in splitting the unwritten
  * extent into multiple extents (up to three - one initialized and two
- * uninitialized).
+ * unwritten).
  * There are three possibilities:
  *   a> There is no split required: Entire extent should be initialized
  *   b> Splits in two extents: Write is happening at either end of the extent
  *   c> Splits in three extents: Someone is writing in the middle of the extent
  *
  * Pre-conditions:
- *  - The extent pointed to by 'path' is uninitialized.
+ *  - The extent pointed to by 'path' is unwritten.
  *  - The extent pointed to by 'path' contains a superset
  *    of the logical span [map->m_lblk, map->m_lblk + map->m_len).
  *
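/*
 * [Editor's note -- illustrative sketch, not part of this patch.]
 * The three split cases (a/b/c) listed above, modelled as a standalone
 * userspace C program.  All names here are invented for the example;
 * ee_block/ee_len describe the unwritten extent and m_lblk/m_len the
 * write, both in filesystem blocks.
 */
#include <stdio.h>

static const char *split_case(unsigned int ee_block, unsigned int ee_len,
			      unsigned int m_lblk, unsigned int m_len)
{
	unsigned int ee_end = ee_block + ee_len;
	unsigned int m_end = m_lblk + m_len;

	if (m_lblk == ee_block && m_end == ee_end)
		return "a: no split, initialize the whole extent";
	if (m_lblk == ee_block || m_end == ee_end)
		return "b: split in two, write at one end";
	return "c: split in three, write in the middle";
}

int main(void)
{
	/* extent [100, 150), write [120, 130) -> case c */
	printf("%s\n", split_case(100, 50, 120, 10));
	return 0;
}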
@@ -3368,12 +3397,12 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
 
 	trace_ext4_ext_convert_to_initialized_enter(inode, map, ex);
 
 	/* Pre-conditions */
-	BUG_ON(!ext4_ext_is_uninitialized(ex));
+	BUG_ON(!ext4_ext_is_unwritten(ex));
 	BUG_ON(!in_range(map->m_lblk, ee_block, ee_len));
 
 	/*
 	 * Attempt to transfer newly initialized blocks from the currently
-	 * uninitialized extent to its neighbor. This is much cheaper
+	 * unwritten extent to its neighbor. This is much cheaper
 	 * than an insertion followed by a merge as those involve costly
 	 * memmove() calls. Transferring to the left is the common case in
 	 * steady state for workloads doing fallocate(FALLOC_FL_KEEP_SIZE)
@@ -3409,7 +3438,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
 		 * - C4: abut_ex can receive the additional blocks without
 		 *   overflowing the (initialized) length limit.
 		 */
-		if ((!ext4_ext_is_uninitialized(abut_ex)) &&		/*C1*/
+		if ((!ext4_ext_is_unwritten(abut_ex)) &&		/*C1*/
			((prev_lblk + prev_len) == ee_block) &&		/*C2*/
			((prev_pblk + prev_len) == ee_pblk) &&		/*C3*/
			(prev_len < (EXT_INIT_MAX_LEN - map_len))) {	/*C4*/
@@ -3424,7 +3453,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
 			ex->ee_block = cpu_to_le32(ee_block + map_len);
 			ext4_ext_store_pblock(ex, ee_pblk + map_len);
 			ex->ee_len = cpu_to_le16(ee_len - map_len);
-			ext4_ext_mark_uninitialized(ex); /* Restore the flag */
+			ext4_ext_mark_unwritten(ex); /* Restore the flag */
 
 			/* Extend abut_ex by 'map_len' blocks */
 			abut_ex->ee_len = cpu_to_le16(prev_len + map_len);
@@ -3455,7 +3484,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
 		 * - C4: abut_ex can receive the additional blocks without
 		 *   overflowing the (initialized) length limit.
 		 */
-		if ((!ext4_ext_is_uninitialized(abut_ex)) &&		/*C1*/
+		if ((!ext4_ext_is_unwritten(abut_ex)) &&		/*C1*/
 		    ((map->m_lblk + map_len) == next_lblk) &&		/*C2*/
 		    ((ee_pblk + ee_len) == next_pblk) &&		/*C3*/
 		    (next_len < (EXT_INIT_MAX_LEN - map_len))) {	/*C4*/
@@ -3470,7 +3499,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
 			abut_ex->ee_block = cpu_to_le32(next_lblk - map_len);
 			ext4_ext_store_pblock(abut_ex, next_pblk - map_len);
 			ex->ee_len = cpu_to_le16(ee_len - map_len);
-			ext4_ext_mark_uninitialized(ex); /* Restore the flag */
+			ext4_ext_mark_unwritten(ex); /* Restore the flag */
 
 			/* Extend abut_ex by 'map_len' blocks */
 			abut_ex->ee_len = cpu_to_le16(next_len + map_len);
@@ -3492,7 +3521,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
 	WARN_ON(map->m_lblk < ee_block);
 	/*
 	 * It is safe to convert extent to initialized via explicit
-	 * zeroout only if extent is fully insde i_size or new_size.
+	 * zeroout only if extent is fully inside i_size or new_size.
 	 */
 	split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0;
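/*
 * [Editor's note -- illustrative sketch, not part of this patch.]
 * The EXT4_EXT_MAY_ZEROOUT decision just above: eof_block is i_size
 * rounded up to a whole block, and zeroing out (instead of splitting)
 * is only allowed when the extent ends at or before it.  Standalone
 * arithmetic with invented numbers.
 */
#include <stdio.h>

int main(void)
{
	unsigned int blkbits = 12;		/* 4096-byte blocks */
	unsigned long long blocksize = 1ULL << blkbits;
	unsigned long long i_size = 10000;	/* bytes */

	/* i_size rounded up to a block boundary, counted in blocks */
	unsigned int eof_block = (i_size + blocksize - 1) >> blkbits;

	unsigned int ee_block = 0, ee_len = 3;	/* extent [0, 3) */
	int may_zeroout = ee_block + ee_len <= eof_block;

	/* prints: eof_block 3, may_zeroout 1 */
	printf("eof_block %u, may_zeroout %d\n", eof_block, may_zeroout);
	return 0;
}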
@@ -3575,26 +3604,28 @@ out:
 
 /*
  * This function is called by ext4_ext_map_blocks() from
  * ext4_get_blocks_dio_write() when DIO is used to write
- * to an uninitialized extent.
+ * to an unwritten extent.
  *
- * Writing to an uninitialized extent may result in splitting the uninitialized
- * extent into multiple initialized/uninitialized extents (up to three)
+ * Writing to an unwritten extent may result in splitting the unwritten
+ * extent into multiple initialized/unwritten extents (up to three).
  * There are three possibilities:
- *   a> There is no split required: Entire extent should be uninitialized
+ *   a> There is no split required: Entire extent should be unwritten
 *   b> Splits in two extents: Write is happening at either end of the extent
 *   c> Splits in three extents: Someone is writing in the middle of the extent
 *
+ * This works the same way in the case of initialized -> unwritten conversion.
+ *
 * One or more index blocks may be needed if the extent tree grows after
- * the uninitialized extent split. To prevent ENOSPC occur at the IO
- * complete, we need to split the uninitialized extent before DIO submit
- * the IO. The uninitialized extent called at this time will be split
- * into three uninitialized extent(at most). After IO complete, the part
+ * the unwritten extent split. To prevent ENOSPC from occurring when the IO
+ * completes, we need to split the unwritten extent before DIO submits
+ * the IO. The unwritten extent found at this time will be split
+ * into (at most) three unwritten extents. After the IO completes, the part
 * being filled will be converted to initialized by the end_io callback function
 * via ext4_convert_unwritten_extents().
 *
- * Returns the size of uninitialized extent to be written on success.
+ * Returns the size of the unwritten extent to be written on success.
 */
-static int ext4_split_unwritten_extents(handle_t *handle,
+static int ext4_split_convert_extents(handle_t *handle,
 					struct inode *inode,
 					struct ext4_map_blocks *map,
 					struct ext4_ext_path *path,
@@ -3606,9 +3637,9 @@ static int ext4_split_unwritten_extents(handle_t *handle,
 	unsigned int ee_len;
 	int split_flag = 0, depth;
 
-	ext_debug("ext4_split_unwritten_extents: inode %lu, logical"
-		"block %llu, max_blocks %u\n", inode->i_ino,
-		(unsigned long long)map->m_lblk, map->m_len);
+	ext_debug("%s: inode %lu, logical block %llu, max_blocks %u\n",
+		  __func__, inode->i_ino,
+		  (unsigned long long)map->m_lblk, map->m_len);
 
 	eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >>
 		inode->i_sb->s_blocksize_bits;
@@ -3623,14 +3654,79 @@ static int ext4_split_unwritten_extents(handle_t *handle,
 	ee_block = le32_to_cpu(ex->ee_block);
 	ee_len = ext4_ext_get_actual_len(ex);
 
-	split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0;
-	split_flag |= EXT4_EXT_MARK_UNINIT2;
-	if (flags & EXT4_GET_BLOCKS_CONVERT)
-		split_flag |= EXT4_EXT_DATA_VALID2;
+	/* Convert to unwritten */
+	if (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN) {
+		split_flag |= EXT4_EXT_DATA_VALID1;
+	/* Convert to initialized */
+	} else if (flags & EXT4_GET_BLOCKS_CONVERT) {
+		split_flag |= ee_block + ee_len <= eof_block ?
+			      EXT4_EXT_MAY_ZEROOUT : 0;
+		split_flag |= (EXT4_EXT_MARK_UNWRIT2 | EXT4_EXT_DATA_VALID2);
+	}
 	flags |= EXT4_GET_BLOCKS_PRE_IO;
 	return ext4_split_extent(handle, inode, path, map, split_flag, flags);
 }
 
+static int ext4_convert_initialized_extents(handle_t *handle,
+					    struct inode *inode,
+					    struct ext4_map_blocks *map,
+					    struct ext4_ext_path *path)
+{
+	struct ext4_extent *ex;
+	ext4_lblk_t ee_block;
+	unsigned int ee_len;
+	int depth;
+	int err = 0;
+
+	depth = ext_depth(inode);
+	ex = path[depth].p_ext;
+	ee_block = le32_to_cpu(ex->ee_block);
+	ee_len = ext4_ext_get_actual_len(ex);
+
+	ext_debug("%s: inode %lu, logical block %llu, max_blocks %u\n",
+		  __func__, inode->i_ino,
+		  (unsigned long long)ee_block, ee_len);
+
+	if (ee_block != map->m_lblk || ee_len > map->m_len) {
+		err = ext4_split_convert_extents(handle, inode, map, path,
+				EXT4_GET_BLOCKS_CONVERT_UNWRITTEN);
+		if (err < 0)
+			goto out;
+		ext4_ext_drop_refs(path);
+		path = ext4_ext_find_extent(inode, map->m_lblk, path, 0);
+		if (IS_ERR(path)) {
+			err = PTR_ERR(path);
+			goto out;
+		}
+		depth = ext_depth(inode);
+		ex = path[depth].p_ext;
+		if (!ex) {
+			EXT4_ERROR_INODE(inode, "unexpected hole at %lu",
+					 (unsigned long) map->m_lblk);
+			err = -EIO;
+			goto out;
+		}
+	}
+
+	err = ext4_ext_get_access(handle, inode, path + depth);
+	if (err)
+		goto out;
+	/* first mark the extent as unwritten */
+	ext4_ext_mark_unwritten(ex);
+
+	/* note: ext4_ext_correct_indexes() isn't needed here because
+	 * borders are not changed
+	 */
+	ext4_ext_try_to_merge(handle, inode, path, ex);
+
+	/* Mark modified extent as dirty */
+	err = ext4_ext_dirty(handle, inode, path + path->p_depth);
out:
+	ext4_ext_show_leaf(inode, path);
+	return err;
+}
+
+
 static int ext4_convert_unwritten_extents_endio(handle_t *handle,
 						struct inode *inode,
 						struct ext4_map_blocks *map,
@@ -3664,8 +3760,8 @@ static int ext4_convert_unwritten_extents_endio(handle_t *handle,
 			     inode->i_ino, (unsigned long long)ee_block, ee_len,
 			     (unsigned long long)map->m_lblk, map->m_len);
 #endif
-		err = ext4_split_unwritten_extents(handle, inode, map, path,
-						
   EXT4_GET_BLOCKS_CONVERT); +		err = ext4_split_convert_extents(handle, inode, map, path, +						 EXT4_GET_BLOCKS_CONVERT);  		if (err < 0)  			goto out;  		ext4_ext_drop_refs(path); @@ -3784,7 +3880,7 @@ int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk)  {  	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);  	ext4_lblk_t lblk_start, lblk_end; -	lblk_start = lblk & (~(sbi->s_cluster_ratio - 1)); +	lblk_start = EXT4_LBLK_CMASK(sbi, lblk);  	lblk_end = lblk_start + sbi->s_cluster_ratio - 1;  	return ext4_find_delalloc_range(inode, lblk_start, lblk_end); @@ -3843,9 +3939,9 @@ get_reserved_cluster_alloc(struct inode *inode, ext4_lblk_t lblk_start,  	trace_ext4_get_reserved_cluster_alloc(inode, lblk_start, num_blks);  	/* Check towards left side */ -	c_offset = lblk_start & (sbi->s_cluster_ratio - 1); +	c_offset = EXT4_LBLK_COFF(sbi, lblk_start);  	if (c_offset) { -		lblk_from = lblk_start & (~(sbi->s_cluster_ratio - 1)); +		lblk_from = EXT4_LBLK_CMASK(sbi, lblk_start);  		lblk_to = lblk_from + c_offset - 1;  		if (ext4_find_delalloc_range(inode, lblk_from, lblk_to)) @@ -3853,7 +3949,7 @@ get_reserved_cluster_alloc(struct inode *inode, ext4_lblk_t lblk_start,  	}  	/* Now check towards right. */ -	c_offset = (lblk_start + num_blks) & (sbi->s_cluster_ratio - 1); +	c_offset = EXT4_LBLK_COFF(sbi, lblk_start + num_blks);  	if (allocated_clusters && c_offset) {  		lblk_from = lblk_start + num_blks;  		lblk_to = lblk_from + (sbi->s_cluster_ratio - c_offset) - 1; @@ -3866,7 +3962,39 @@ get_reserved_cluster_alloc(struct inode *inode, ext4_lblk_t lblk_start,  }  static int -ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, +ext4_ext_convert_initialized_extent(handle_t *handle, struct inode *inode, +			struct ext4_map_blocks *map, +			struct ext4_ext_path *path, int flags, +			unsigned int allocated, ext4_fsblk_t newblock) +{ +	int ret = 0; +	int err = 0; + +	/* +	 * Make sure that the extent is no bigger than we support with +	 * unwritten extent +	 */ +	if (map->m_len > EXT_UNWRITTEN_MAX_LEN) +		map->m_len = EXT_UNWRITTEN_MAX_LEN / 2; + +	ret = ext4_convert_initialized_extents(handle, inode, map, +						path); +	if (ret >= 0) { +		ext4_update_inode_fsync_trans(handle, inode, 1); +		err = check_eofblocks_fl(handle, inode, map->m_lblk, +					 path, map->m_len); +	} else +		err = ret; +	map->m_flags |= EXT4_MAP_UNWRITTEN; +	if (allocated > map->m_len) +		allocated = map->m_len; +	map->m_len = allocated; + +	return err ? err : allocated; +} + +static int +ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode,  			struct ext4_map_blocks *map,  			struct ext4_ext_path *path, int flags,  			unsigned int allocated, ext4_fsblk_t newblock) @@ -3875,25 +4003,25 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,  	int err = 0;  	ext4_io_end_t *io = ext4_inode_aio(inode); -	ext_debug("ext4_ext_handle_uninitialized_extents: inode %lu, logical " +	ext_debug("ext4_ext_handle_unwritten_extents: inode %lu, logical "  		  "block %llu, max_blocks %u, flags %x, allocated %u\n",  		  inode->i_ino, (unsigned long long)map->m_lblk, map->m_len,  		  flags, allocated);  	ext4_ext_show_leaf(inode, path);  	/* -	 * When writing into uninitialized space, we should not fail to +	 * When writing into unwritten space, we should not fail to  	 * allocate metadata blocks for the new extent block if needed.  	 
*/  	flags |= EXT4_GET_BLOCKS_METADATA_NOFAIL; -	trace_ext4_ext_handle_uninitialized_extents(inode, map, flags, +	trace_ext4_ext_handle_unwritten_extents(inode, map, flags,  						    allocated, newblock);  	/* get_block() before submit the IO, split the extent */ -	if ((flags & EXT4_GET_BLOCKS_PRE_IO)) { -		ret = ext4_split_unwritten_extents(handle, inode, map, -						   path, flags); +	if (flags & EXT4_GET_BLOCKS_PRE_IO) { +		ret = ext4_split_convert_extents(handle, inode, map, +					 path, flags | EXT4_GET_BLOCKS_CONVERT);  		if (ret <= 0)  			goto out;  		/* @@ -3906,12 +4034,10 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,  		else  			ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);  		map->m_flags |= EXT4_MAP_UNWRITTEN; -		if (ext4_should_dioread_nolock(inode)) -			map->m_flags |= EXT4_MAP_UNINIT;  		goto out;  	}  	/* IO end_io complete, convert the filled extent to written */ -	if ((flags & EXT4_GET_BLOCKS_CONVERT)) { +	if (flags & EXT4_GET_BLOCKS_CONVERT) {  		ret = ext4_convert_unwritten_extents_endio(handle, inode, map,  							path);  		if (ret >= 0) { @@ -3921,6 +4047,7 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,  		} else  			err = ret;  		map->m_flags |= EXT4_MAP_MAPPED; +		map->m_pblk = newblock;  		if (allocated > map->m_len)  			allocated = map->m_len;  		map->m_len = allocated; @@ -3931,7 +4058,7 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,  	 * repeat fallocate creation request  	 * we already have an unwritten extent  	 */ -	if (flags & EXT4_GET_BLOCKS_UNINIT_EXT) { +	if (flags & EXT4_GET_BLOCKS_UNWRIT_EXT) {  		map->m_flags |= EXT4_MAP_UNWRITTEN;  		goto map_out;  	} @@ -4007,10 +4134,6 @@ out1:  	map->m_pblk = newblock;  	map->m_len = allocated;  out2: -	if (path) { -		ext4_ext_drop_refs(path); -		kfree(path); -	}  	return err ? err : allocated;  } @@ -4061,7 +4184,7 @@ static int get_implied_cluster_alloc(struct super_block *sb,  				     struct ext4_ext_path *path)  {  	struct ext4_sb_info *sbi = EXT4_SB(sb); -	ext4_lblk_t c_offset = map->m_lblk & (sbi->s_cluster_ratio-1); +	ext4_lblk_t c_offset = EXT4_LBLK_COFF(sbi, map->m_lblk);  	ext4_lblk_t ex_cluster_start, ex_cluster_end;  	ext4_lblk_t rr_cluster_start;  	ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block); @@ -4079,8 +4202,7 @@ static int get_implied_cluster_alloc(struct super_block *sb,  	    (rr_cluster_start == ex_cluster_start)) {  		if (rr_cluster_start == ex_cluster_end)  			ee_start += ee_len - 1; -		map->m_pblk = (ee_start & ~(sbi->s_cluster_ratio - 1)) + -			c_offset; +		map->m_pblk = EXT4_PBLK_CMASK(sbi, ee_start) + c_offset;  		map->m_len = min(map->m_len,  				 (unsigned) sbi->s_cluster_ratio - c_offset);  		/* @@ -4143,7 +4265,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,  	struct ext4_extent newex, *ex, *ex2;  	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);  	ext4_fsblk_t newblock = 0; -	int free_on_err = 0, err = 0, depth; +	int free_on_err = 0, err = 0, depth, ret;  	unsigned int allocated = 0, offset = 0;  	unsigned int allocated_clusters = 0;  	struct ext4_allocation_request ar; @@ -4185,8 +4307,9 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,  		ext4_fsblk_t ee_start = ext4_ext_pblock(ex);  		unsigned short ee_len; +  		/* -		 * Uninitialized extents are treated as holes, except that +		 * unwritten extents are treated as holes, except that  		 * we split out initialized portions during a write.  		 
		 */
 		ee_len = ext4_ext_get_actual_len(ex);
@@ -4201,13 +4324,27 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
 			ext_debug("%u fit into %u:%d -> %llu\n", map->m_lblk,
 				  ee_block, ee_len, newblock);
 
-			if (!ext4_ext_is_uninitialized(ex))
+			/*
+			 * If the extent is initialized, check whether the
+			 * caller wants to convert it to unwritten.
+			 */
+			if ((!ext4_ext_is_unwritten(ex)) &&
+			    (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN)) {
+				allocated = ext4_ext_convert_initialized_extent(
+						handle, inode, map, path, flags,
+						allocated, newblock);
+				goto out2;
+			} else if (!ext4_ext_is_unwritten(ex))
 				goto out;
 
-			allocated = ext4_ext_handle_uninitialized_extents(
+			ret = ext4_ext_handle_unwritten_extents(
 				handle, inode, map, path, flags,
 				allocated, newblock);
-			goto out3;
+			if (ret < 0)
+				err = ret;
+			else
+				allocated = ret;
+			goto out2;
 		}
 	}
 
@@ -4234,7 +4371,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
 	 */
 	map->m_flags &= ~EXT4_MAP_FROM_CLUSTER;
 	newex.ee_block = cpu_to_le32(map->m_lblk);
-	cluster_offset = map->m_lblk & (sbi->s_cluster_ratio-1);
+	cluster_offset = EXT4_LBLK_COFF(sbi, map->m_lblk);
 
 	/*
 	 * If we are doing bigalloc, check to see if the extent returned
@@ -4272,15 +4409,15 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
 	/*
 	 * See if request is beyond maximum number of blocks we can have in
 	 * a single extent. For an initialized extent this limit is
-	 * EXT_INIT_MAX_LEN and for an uninitialized extent this limit is
-	 * EXT_UNINIT_MAX_LEN.
+	 * EXT_INIT_MAX_LEN and for an unwritten extent this limit is
+	 * EXT_UNWRITTEN_MAX_LEN.
 	 */
 	if (map->m_len > EXT_INIT_MAX_LEN &&
-	    !(flags & EXT4_GET_BLOCKS_UNINIT_EXT))
+	    !(flags & EXT4_GET_BLOCKS_UNWRIT_EXT))
 		map->m_len = EXT_INIT_MAX_LEN;
-	else if (map->m_len > EXT_UNINIT_MAX_LEN &&
-		 (flags & EXT4_GET_BLOCKS_UNINIT_EXT))
-		map->m_len = EXT_UNINIT_MAX_LEN;
+	else if (map->m_len > EXT_UNWRITTEN_MAX_LEN &&
+		 (flags & EXT4_GET_BLOCKS_UNWRIT_EXT))
+		map->m_len = EXT_UNWRITTEN_MAX_LEN;
 
 	/* Check if we can really insert (m_lblk)::(m_lblk + m_len) extent */
 	newex.ee_len = cpu_to_le16(map->m_len);
@@ -4302,7 +4439,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
 	 * needed so that future calls to get_implied_cluster_alloc()
 	 * work correctly.
 	 */
-	offset = map->m_lblk & (sbi->s_cluster_ratio - 1);
+	offset = EXT4_LBLK_COFF(sbi, map->m_lblk);
 	ar.len = EXT4_NUM_B2C(sbi, offset+allocated);
 	ar.goal -= offset;
 	ar.logical -= offset;
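/*
 * [Editor's note -- illustrative sketch, not part of this patch.]
 * The EXT4_LBLK_CMASK()/EXT4_LBLK_COFF() helpers used in the hunks
 * above replace open-coded masking with the bigalloc cluster ratio.
 * Assuming the ratio is a power of two (as in ext4), they behave like
 * this userspace model; the real macros are defined in ext4.h.
 */
#include <stdio.h>

typedef unsigned int ext4_lblk_t;

/* round lblk down to the start of its cluster */
static ext4_lblk_t lblk_cmask(unsigned int ratio, ext4_lblk_t lblk)
{
	return lblk & ~((ext4_lblk_t)ratio - 1);
}

/* offset of lblk within its cluster */
static ext4_lblk_t lblk_coff(unsigned int ratio, ext4_lblk_t lblk)
{
	return lblk & ((ext4_lblk_t)ratio - 1);
}

int main(void)
{
	unsigned int ratio = 16;	/* 16 blocks per cluster */

	/* block 37 sits at offset 5 inside the cluster starting at 32 */
	printf("cluster start %u, offset %u\n",
	       lblk_cmask(ratio, 37), lblk_coff(ratio, 37));
	return 0;
}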
@@ -4328,21 +4465,19 @@ got_allocated_blocks:
 	/* try to insert new extent into found leaf and return */
 	ext4_ext_store_pblock(&newex, newblock + offset);
 	newex.ee_len = cpu_to_le16(ar.len);
-	/* Mark uninitialized */
-	if (flags & EXT4_GET_BLOCKS_UNINIT_EXT){
-		ext4_ext_mark_uninitialized(&newex);
+	/* Mark unwritten */
+	if (flags & EXT4_GET_BLOCKS_UNWRIT_EXT){
+		ext4_ext_mark_unwritten(&newex);
 		map->m_flags |= EXT4_MAP_UNWRITTEN;
 		/*
 		 * io_end structure was created for every IO write to an
-		 * uninitialized extent. To avoid unnecessary conversion,
+		 * unwritten extent. To avoid unnecessary conversion,
 		 * here we flag the IO that really needs the conversion.
 		 * For the non-async direct IO case, flag the inode state
 		 * that we need to perform conversion when IO is done.
 		 */
-		if ((flags & EXT4_GET_BLOCKS_PRE_IO))
+		if (flags & EXT4_GET_BLOCKS_PRE_IO)
 			set_unwritten = 1;
-		if (ext4_should_dioread_nolock(inode))
-			map->m_flags |= EXT4_MAP_UNINIT;
 	}
 
 	err = 0;
@@ -4469,9 +4604,9 @@ got_allocated_blocks:
 	/*
 	 * Cache the extent and update transaction to commit on fdatasync only
-	 * when it is _not_ an uninitialized extent.
+	 * when it is _not_ an unwritten extent.
 	 */
-	if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0)
+	if ((flags & EXT4_GET_BLOCKS_UNWRIT_EXT) == 0)
 		ext4_update_inode_fsync_trans(handle, inode, 1);
 	else
 		ext4_update_inode_fsync_trans(handle, inode, 0);
@@ -4488,7 +4623,6 @@ out2:
 		kfree(path);
 	}
 
-out3:
 	trace_ext4_ext_map_blocks_exit(inode, flags, map,
 				       err ? err : allocated);
 	ext4_es_lru_add(inode);
@@ -4529,34 +4663,210 @@ retry:
 	ext4_std_error(inode->i_sb, err);
 }
 
-static void ext4_falloc_update_inode(struct inode *inode,
-				int mode, loff_t new_size, int update_ctime)
+static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset,
+				  ext4_lblk_t len, int flags, int mode)
+{
+	struct inode *inode = file_inode(file);
+	handle_t *handle;
+	int ret = 0;
+	int ret2 = 0;
+	int retries = 0;
+	struct ext4_map_blocks map;
+	unsigned int credits;
+
+	map.m_lblk = offset;
+	/*
+	 * Don't normalize the request if it can fit in one extent so
+	 * that it doesn't get unnecessarily split into multiple
+	 * extents.
+	 */
+	if (len <= EXT_UNWRITTEN_MAX_LEN)
+		flags |= EXT4_GET_BLOCKS_NO_NORMALIZE;
+
+	/*
+	 * credits to insert 1 extent into extent tree
+	 */
+	credits = ext4_chunk_trans_blocks(inode, len);
+
+retry:
+	while (ret >= 0 && ret < len) {
+		map.m_lblk = map.m_lblk + ret;
+		map.m_len = len = len - ret;
+		handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS,
+					    credits);
+		if (IS_ERR(handle)) {
+			ret = PTR_ERR(handle);
+			break;
+		}
+		ret = ext4_map_blocks(handle, inode, &map, flags);
+		if (ret <= 0) {
+			ext4_debug("inode #%lu: block %u: len %u: "
+				   "ext4_ext_map_blocks returned %d",
+				   inode->i_ino, map.m_lblk,
+				   map.m_len, ret);
+			ext4_mark_inode_dirty(handle, inode);
+			ret2 = ext4_journal_stop(handle);
+			break;
+		}
+		ret2 = ext4_journal_stop(handle);
+		if (ret2)
+			break;
+	}
+	if (ret == -ENOSPC &&
+			ext4_should_retry_alloc(inode->i_sb, &retries)) {
+		ret = 0;
+		goto retry;
+	}
+
+	return ret > 0 ? ret2 : ret;
+}
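/*
 * [Editor's note -- illustrative sketch, not part of this patch.]
 * ext4_zero_range(), added below, is the kernel side of the new
 * FALLOC_FL_ZERO_RANGE fallocate(2) mode.  A minimal userspace caller
 * might look like this; "testfile" is a placeholder, and the flag
 * requires a kernel with zero-range support.
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/falloc.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("testfile", O_RDWR);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* zero 1 MiB starting at offset 4096 without changing i_size */
	if (fallocate(fd, FALLOC_FL_ZERO_RANGE | FALLOC_FL_KEEP_SIZE,
		      4096, 1024 * 1024) < 0)
		perror("fallocate");
	close(fd);
	return 0;
}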
+static long ext4_zero_range(struct file *file, loff_t offset,
+			    loff_t len, int mode)
 {
-	struct timespec now;
+	struct inode *inode = file_inode(file);
+	handle_t *handle = NULL;
+	unsigned int max_blocks;
+	loff_t new_size = 0;
+	int ret = 0;
+	int flags;
+	int partial;
+	loff_t start, end;
+	ext4_lblk_t lblk;
+	struct address_space *mapping = inode->i_mapping;
+	unsigned int blkbits = inode->i_blkbits;
+
+	trace_ext4_zero_range(inode, offset, len, mode);
+
+	if (!S_ISREG(inode->i_mode))
+		return -EINVAL;
 
-	if (update_ctime) {
-		now = current_fs_time(inode->i_sb);
-		if (!timespec_equal(&inode->i_ctime, &now))
-			inode->i_ctime = now;
+	/* Call ext4_force_commit to flush all data in case of data=journal. */
+	if (ext4_should_journal_data(inode)) {
+		ret = ext4_force_commit(inode->i_sb);
+		if (ret)
+			return ret;
 	}
+
 	/*
-	 * Update only when preallocation was requested beyond
-	 * the file size.
+	 * Write out all dirty pages to avoid race conditions,
+	 * then release them.
	 */
-	if (!(mode & FALLOC_FL_KEEP_SIZE)) {
+	if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
+		ret = filemap_write_and_wait_range(mapping, offset,
+						   offset + len - 1);
+		if (ret)
+			return ret;
+	}
+
+	/*
+	 * Round up offset. This is not fallocate, we need to zero out
+	 * blocks, so convert the interior block-aligned part of the range to
+	 * unwritten and possibly manually zero out unaligned parts of the
+	 * range.
+	 */
+	start = round_up(offset, 1 << blkbits);
+	end = round_down((offset + len), 1 << blkbits);
+
+	if (start < offset || end > offset + len)
+		return -EINVAL;
+	partial = (offset + len) & ((1 << blkbits) - 1);
+
+	lblk = start >> blkbits;
+	max_blocks = (end >> blkbits);
+	if (max_blocks < lblk)
+		max_blocks = 0;
+	else
+		max_blocks -= lblk;
+
+	flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT |
+		EXT4_GET_BLOCKS_CONVERT_UNWRITTEN;
+	if (mode & FALLOC_FL_KEEP_SIZE)
+		flags |= EXT4_GET_BLOCKS_KEEP_SIZE;
+
+	mutex_lock(&inode->i_mutex);
+
+	/*
+	 * Indirect files do not support unwritten extents
+	 */
+	if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
+		ret = -EOPNOTSUPP;
+		goto out_mutex;
+	}
+
+	if (!(mode & FALLOC_FL_KEEP_SIZE) &&
+	     offset + len > i_size_read(inode)) {
+		new_size = offset + len;
+		ret = inode_newsize_ok(inode, new_size);
+		if (ret)
+			goto out_mutex;
+		/*
+		 * If we have a partial block after EOF we have to allocate
+		 * the entire block.
+		 */
+		if (partial)
+			max_blocks += 1;
+	}
+
+	if (max_blocks > 0) {
+
+		/* Now release the pages and zero block aligned part of pages */
+		truncate_pagecache_range(inode, start, end - 1);
+
+		/* Wait for all existing dio workers; newcomers will block on i_mutex */
+		ext4_inode_block_unlocked_dio(inode);
+		inode_dio_wait(inode);
+
+		/*
+		 * Remove entire range from the extent status tree.
+		 */
+		ret = ext4_es_remove_extent(inode, lblk, max_blocks);
+		if (ret)
+			goto out_dio;
+
+		ret = ext4_alloc_file_blocks(file, lblk, max_blocks, flags,
+					     mode);
+		if (ret)
+			goto out_dio;
+	}
+
+	handle = ext4_journal_start(inode, EXT4_HT_MISC, 4);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		ext4_std_error(inode->i_sb, ret);
+		goto out_dio;
+	}
+
+	inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
+
+	if (new_size) {
 		if (new_size > i_size_read(inode))
 			i_size_write(inode, new_size);
 		if (new_size > EXT4_I(inode)->i_disksize)
 			ext4_update_i_disksize(inode, new_size);
 	} else {
 		/*
-		 * Mark that we allocate beyond EOF so the subsequent truncate
-		 * can proceed even if the new size is the same as i_size.
-		 */
-		if (new_size > i_size_read(inode))
+		 * Mark that we allocate beyond EOF so the subsequent truncate
+		 * can proceed even if the new size is the same as i_size.
+		 */
+		if ((offset + len) > i_size_read(inode))
 			ext4_set_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
 	}
+	ext4_mark_inode_dirty(handle, inode);
+
+	/* Zero out partial block at the edges of the range */
+	ret = ext4_zero_partial_blocks(handle, inode, offset, len);
+
+	if (file->f_flags & O_SYNC)
+		ext4_handle_sync(handle);
+
+	ext4_journal_stop(handle);
+out_dio:
+	ext4_inode_resume_unlocked_dio(inode);
+out_mutex:
+	mutex_unlock(&inode->i_mutex);
+	return ret;
 }
 
 /*
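/*
 * [Editor's note -- illustrative sketch, not part of this patch.]
 * How ext4_zero_range() above carves up a request: the interior,
 * block-aligned part becomes unwritten extents and the unaligned edges
 * are zeroed by hand.  Standalone arithmetic with invented numbers;
 * the kernel additionally clamps max_blocks to zero when the aligned
 * region is empty.
 */
#include <stdio.h>

int main(void)
{
	unsigned int blkbits = 12;		/* 4096-byte blocks */
	unsigned long long blocksize = 1ULL << blkbits;
	unsigned long long offset = 3000, len = 10000;

	unsigned long long start = (offset + blocksize - 1) & ~(blocksize - 1);
	unsigned long long end = (offset + len) & ~(blocksize - 1);
	unsigned long long partial = (offset + len) & (blocksize - 1);
	unsigned long long lblk = start >> blkbits;
	unsigned long long max_blocks = (end >> blkbits) - lblk;

	/* prints: start 4096 end 12288 partial 712 lblk 1 max_blocks 2 */
	printf("start %llu end %llu partial %llu lblk %llu max_blocks %llu\n",
	       start, end, partial, lblk, max_blocks);
	return 0;
}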
@@ -4570,17 +4880,17 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
 {
 	struct inode *inode = file_inode(file);
 	handle_t *handle;
-	loff_t new_size;
+	loff_t new_size = 0;
 	unsigned int max_blocks;
 	int ret = 0;
-	int ret2 = 0;
-	int retries = 0;
 	int flags;
-	struct ext4_map_blocks map;
-	unsigned int credits, blkbits = inode->i_blkbits;
+	ext4_lblk_t lblk;
+	struct timespec tv;
+	unsigned int blkbits = inode->i_blkbits;
 
 	/* Return error if mode is not supported */
-	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
+	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
+		     FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE))
 		return -EOPNOTSUPP;
 
 	if (mode & FALLOC_FL_PUNCH_HOLE)
@@ -4597,83 +4907,69 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
 	if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
 		return -EOPNOTSUPP;
 
+	if (mode & FALLOC_FL_COLLAPSE_RANGE)
+		return ext4_collapse_range(inode, offset, len);
+
+	if (mode & FALLOC_FL_ZERO_RANGE)
+		return ext4_zero_range(file, offset, len, mode);
+
 	trace_ext4_fallocate_enter(inode, offset, len, mode);
-	map.m_lblk = offset >> blkbits;
+	lblk = offset >> blkbits;
 	/*
 	 * We can't just convert len to max_blocks because
	 * if blocksize = 4096, offset = 3072 and len = 2048
 	 */
 	max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits)
-		- map.m_lblk;
-	/*
-	 * credits to insert 1 extent into extent tree
-	 */
-	credits = ext4_chunk_trans_blocks(inode, max_blocks);
-	mutex_lock(&inode->i_mutex);
-	ret = inode_newsize_ok(inode, (len + offset));
-	if (ret) {
-		mutex_unlock(&inode->i_mutex);
-		trace_ext4_fallocate_exit(inode, offset, max_blocks, ret);
-		return ret;
-	}
-	flags = EXT4_GET_BLOCKS_CREATE_UNINIT_EXT;
+		- lblk;
+
+	flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT;
 	if (mode & FALLOC_FL_KEEP_SIZE)
 		flags |= EXT4_GET_BLOCKS_KEEP_SIZE;
-	/*
-	 * Don't normalize the request if it can fit in one extent so
-	 * that it doesn't get unnecessarily split into multiple
-	 * extents.
-	 */
-	if (len <= EXT_UNINIT_MAX_LEN << blkbits)
-		flags |= EXT4_GET_BLOCKS_NO_NORMALIZE;
-retry:
-	while (ret >= 0 && ret < max_blocks) {
-		map.m_lblk = map.m_lblk + ret;
-		map.m_len = max_blocks = max_blocks - ret;
-		handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS,
-					    credits);
-		if (IS_ERR(handle)) {
-			ret = PTR_ERR(handle);
-			break;
-		}
-		ret = ext4_map_blocks(handle, inode, &map, flags);
-		if (ret <= 0) {
-#ifdef EXT4FS_DEBUG
-			ext4_warning(inode->i_sb,
-				     "inode #%lu: block %u: len %u: "
-				     "ext4_ext_map_blocks returned %d",
-				     inode->i_ino, map.m_lblk,
-				     map.m_len, ret);
-#endif
-			ext4_mark_inode_dirty(handle, inode);
-			ret2 = ext4_journal_stop(handle);
-			break;
-		}
-		if ((map.m_lblk + ret) >= (EXT4_BLOCK_ALIGN(offset + len,
-						blkbits) >> blkbits))
-			new_size = offset + len;
-		else
-			new_size = ((loff_t) map.m_lblk + ret) << blkbits;
+	mutex_lock(&inode->i_mutex);
 
-		ext4_falloc_update_inode(inode, mode, new_size,
-					 (map.m_flags & EXT4_MAP_NEW));
-		ext4_mark_inode_dirty(handle, inode);
-		if ((file->f_flags & O_SYNC) && ret >= max_blocks)
-			ext4_handle_sync(handle);
-		ret2 = ext4_journal_stop(handle);
-		if (ret2)
-			break;
+	if (!(mode & FALLOC_FL_KEEP_SIZE) &&
+	     offset + len > i_size_read(inode)) {
+		new_size = offset + len;
+		ret = inode_newsize_ok(inode, new_size);
+		if (ret)
+			goto out;
 	}
-	if (ret == -ENOSPC &&
-			ext4_should_retry_alloc(inode->i_sb, &retries)) {
-		ret = 0;
-		goto retry;
+
+	ret = ext4_alloc_file_blocks(file, lblk, max_blocks, flags, mode);
+	if (ret)
+		goto out;
+
+	handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		goto out;
+	}
+
+	tv = inode->i_ctime = ext4_current_time(inode);
+
+	if (new_size) {
+		if (new_size > i_size_read(inode)) {
+			i_size_write(inode, new_size);
+			inode->i_mtime = tv;
+		}
+		if (new_size > EXT4_I(inode)->i_disksize)
+			ext4_update_i_disksize(inode, new_size);
+	} else {
+		/*
+		 * Mark that we allocate beyond EOF so the subsequent truncate
+		 * can proceed even if the new size is the same as i_size.
+		 */
+		if ((offset + len) > i_size_read(inode))
+			ext4_set_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
 	}
+	ext4_mark_inode_dirty(handle, inode);
+	if (file->f_flags & O_SYNC)
+		ext4_handle_sync(handle);
+
+	ext4_journal_stop(handle);
+out:
 	mutex_unlock(&inode->i_mutex);
-	trace_ext4_fallocate_exit(inode, offset, max_blocks,
-				ret > 0 ? ret2 : ret);
-	return ret > 0 ? ret2 : ret;
+	trace_ext4_fallocate_exit(inode, offset, max_blocks, ret);
+	return ret;
 }
 
 /*
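/*
 * [Editor's note -- illustrative sketch, not part of this patch.]
 * Why len cannot simply be shifted to get max_blocks, using the
 * numbers from the comment above: with blocksize 4096, offset 3072 and
 * len 2048 the request touches two blocks even though len >> blkbits
 * is zero.  Standalone arithmetic only.
 */
#include <stdio.h>

int main(void)
{
	unsigned int blkbits = 12;		/* 4096-byte blocks */
	unsigned long long blocksize = 1ULL << blkbits;
	unsigned long long offset = 3072, len = 2048;

	unsigned long long lblk = offset >> blkbits;
	/* round offset + len up to a block boundary, then count blocks */
	unsigned long long aligned_end =
		(offset + len + blocksize - 1) & ~(blocksize - 1);
	unsigned long long max_blocks = (aligned_end >> blkbits) - lblk;

	/* prints: lblk 0, max_blocks 2 (len >> blkbits would give 0) */
	printf("lblk %llu, max_blocks %llu\n", lblk, max_blocks);
	return 0;
}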
@@ -4884,3 +5180,333 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 	ext4_es_lru_add(inode);
 	return error;
 }
+
+/*
+ * ext4_access_path:
+ * Function to access the path buffer for marking it dirty.
+ * It also checks if there are sufficient credits left in the journal handle
+ * to update path.
+ */
+static int
+ext4_access_path(handle_t *handle, struct inode *inode,
+		struct ext4_ext_path *path)
+{
+	int credits, err;
+
+	if (!ext4_handle_valid(handle))
+		return 0;
+
+	/*
+	 * Check if we need to extend journal credits:
+	 * 3 for leaf, sb, and inode plus 2 (bmap and group
+	 * descriptor) for each block group; assume two block
+	 * groups
+	 */
+	if (handle->h_buffer_credits < 7) {
+		credits = ext4_writepage_trans_blocks(inode);
+		err = ext4_ext_truncate_extend_restart(handle, inode, credits);
+		/* EAGAIN is success */
+		if (err && err != -EAGAIN)
+			return err;
+	}
+
+	err = ext4_ext_get_access(handle, inode, path);
+	return err;
+}
+
+/*
+ * ext4_ext_shift_path_extents:
+ * Shift the extents of a path structure lying between path[depth].p_ext
+ * and EXT_LAST_EXTENT(path[depth].p_hdr) downwards by subtracting shift
+ * from the starting block of each extent.
+ */
+static int
+ext4_ext_shift_path_extents(struct ext4_ext_path *path, ext4_lblk_t shift,
+			    struct inode *inode, handle_t *handle,
+			    ext4_lblk_t *start)
+{
+	int depth, err = 0;
+	struct ext4_extent *ex_start, *ex_last;
+	bool update = 0;
+	depth = path->p_depth;
+
+	while (depth >= 0) {
+		if (depth == path->p_depth) {
+			ex_start = path[depth].p_ext;
+			if (!ex_start)
+				return -EIO;
+
+			ex_last = EXT_LAST_EXTENT(path[depth].p_hdr);
+			if (!ex_last)
+				return -EIO;
+
+			err = ext4_access_path(handle, inode, path + depth);
+			if (err)
+				goto out;
+
+			if (ex_start == EXT_FIRST_EXTENT(path[depth].p_hdr))
+				update = 1;
+
+			*start = le32_to_cpu(ex_last->ee_block) +
+				ext4_ext_get_actual_len(ex_last);
+
+			while (ex_start <= ex_last) {
+				le32_add_cpu(&ex_start->ee_block, -shift);
+				/* Try to merge to the left. */
+				if ((ex_start >
+				     EXT_FIRST_EXTENT(path[depth].p_hdr)) &&
+				    ext4_ext_try_to_merge_right(inode,
+							path, ex_start - 1))
+					ex_last--;
+				else
+					ex_start++;
+			}
+			err = ext4_ext_dirty(handle, inode, path + depth);
+			if (err)
+				goto out;
+
+			if (--depth < 0 || !update)
+				break;
+		}
+
+		/* Update index too */
+		err = ext4_access_path(handle, inode, path + depth);
+		if (err)
+			goto out;
+
+		le32_add_cpu(&path[depth].p_idx->ei_block, -shift);
+		err = ext4_ext_dirty(handle, inode, path + depth);
+		if (err)
+			goto out;
+
+		/* we are done if current index is not a starting index */
+		if (path[depth].p_idx != EXT_FIRST_INDEX(path[depth].p_hdr))
+			break;
+
+		depth--;
+	}
+
+out:
+	return err;
+}
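/*
 * [Editor's note -- illustrative sketch, not part of this patch.]
 * The core of the collapse-range shift implemented above and below:
 * every extent from 'start' onwards has 'shift' subtracted from its
 * logical start, and the shift is only legal if the hole left of
 * 'start' is at least 'shift' blocks wide (the -EINVAL case).  Toy
 * userspace model with a flat array instead of the extent tree and no
 * merging; it assumes 'start' falls inside a hole, as the caller
 * arranges.
 */
#include <stdio.h>

struct extent { unsigned int lblk, len; };

static int shift_extents(struct extent *ex, int nr,
			 unsigned int start, unsigned int shift)
{
	unsigned int hole_start = 0;	/* end of last extent before start */
	int i;

	for (i = 0; i < nr && ex[i].lblk + ex[i].len <= start; i++)
		hole_start = ex[i].lblk + ex[i].len;
	if (shift > start - hole_start)
		return -1;		/* hole too small */

	for (; i < nr; i++)
		ex[i].lblk -= shift;
	return 0;
}

int main(void)
{
	/* collapsing blocks [30, 50): everything at 50+ moves down by 20 */
	struct extent ex[] = { { 0, 30 }, { 50, 10 }, { 70, 5 } };
	int i;

	if (shift_extents(ex, 3, 50, 20) == 0)
		for (i = 0; i < 3; i++)
			printf("[%u, %u)\n", ex[i].lblk,
			       ex[i].lblk + ex[i].len);
	return 0;
}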
+
+/*
+ * ext4_ext_shift_extents:
+ * All the extents which lie in the range from start to the last allocated
+ * block for the file are shifted downwards by shift blocks.
+ * On success, 0 is returned, error otherwise.
+ */
+static int
+ext4_ext_shift_extents(struct inode *inode, handle_t *handle,
+		       ext4_lblk_t start, ext4_lblk_t shift)
+{
+	struct ext4_ext_path *path;
+	int ret = 0, depth;
+	struct ext4_extent *extent;
+	ext4_lblk_t stop_block, current_block;
+	ext4_lblk_t ex_start, ex_end;
+
+	/* Let path point to the last extent */
+	path = ext4_ext_find_extent(inode, EXT_MAX_BLOCKS - 1, NULL, 0);
+	if (IS_ERR(path))
+		return PTR_ERR(path);
+
+	depth = path->p_depth;
+	extent = path[depth].p_ext;
+	if (!extent) {
+		ext4_ext_drop_refs(path);
+		kfree(path);
+		return ret;
+	}
+
+	stop_block = le32_to_cpu(extent->ee_block) +
+			ext4_ext_get_actual_len(extent);
+	ext4_ext_drop_refs(path);
+	kfree(path);
+
+	/* Nothing to shift if the hole is at the end of the file */
+	if (start >= stop_block)
+		return ret;
+
+	/*
+	 * Don't start shifting extents until we make sure the hole is big
+	 * enough to accommodate the shift.
+	 */
+	path = ext4_ext_find_extent(inode, start - 1, NULL, 0);
+	if (IS_ERR(path))
+		return PTR_ERR(path);
+	depth = path->p_depth;
+	extent = path[depth].p_ext;
+	if (extent) {
+		ex_start = le32_to_cpu(extent->ee_block);
+		ex_end = le32_to_cpu(extent->ee_block) +
+			ext4_ext_get_actual_len(extent);
+	} else {
+		ex_start = 0;
+		ex_end = 0;
+	}
+	ext4_ext_drop_refs(path);
+	kfree(path);
+
+	if ((start == ex_start && shift > ex_start) ||
+	    (shift > start - ex_end))
+		return -EINVAL;
+
+	/* It's safe to start updating extents */
+	while (start < stop_block) {
+		path = ext4_ext_find_extent(inode, start, NULL, 0);
+		if (IS_ERR(path))
+			return PTR_ERR(path);
+		depth = path->p_depth;
+		extent = path[depth].p_ext;
+		if (!extent) {
+			EXT4_ERROR_INODE(inode, "unexpected hole at %lu",
+					 (unsigned long) start);
+			return -EIO;
+		}
+
+		current_block = le32_to_cpu(extent->ee_block);
+		if (start > current_block) {
+			/* Hole, move to the next extent */
+			ret = mext_next_extent(inode, path, &extent);
+			if (ret != 0) {
+				ext4_ext_drop_refs(path);
+				kfree(path);
+				if (ret == 1)
+					ret = 0;
+				break;
+			}
+		}
+		ret = ext4_ext_shift_path_extents(path, shift, inode,
+				handle, &start);
+		ext4_ext_drop_refs(path);
+		kfree(path);
+		if (ret)
+			break;
+	}
+
+	return ret;
+}
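/*
 * [Editor's note -- illustrative sketch, not part of this patch.]
 * ext4_collapse_range(), defined below, backs the new
 * FALLOC_FL_COLLAPSE_RANGE fallocate(2) mode.  Offset and length must
 * be multiples of the filesystem block size or the kernel returns
 * EINVAL, so this hypothetical userspace caller derives them from
 * st_blksize (which on ext4 normally equals the fs block size);
 * "testfile" is a placeholder and must be longer than offset + len.
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/falloc.h>
#include <stdio.h>
#include <sys/stat.h>
#include <unistd.h>

int main(void)
{
	struct stat st;
	int fd = open("testfile", O_RDWR);

	if (fd < 0 || fstat(fd, &st) < 0) {
		perror("open/fstat");
		return 1;
	}
	/* cut one block-aligned chunk out of the middle of the file;
	 * everything after it moves down and i_size shrinks by len */
	off_t off = st.st_blksize;
	off_t len = 4 * (off_t)st.st_blksize;

	if (fallocate(fd, FALLOC_FL_COLLAPSE_RANGE, off, len) < 0)
		perror("fallocate(COLLAPSE_RANGE)");
	close(fd);
	return 0;
}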
+
+/*
+ * ext4_collapse_range:
+ * This implements fallocate's collapse-range functionality for ext4.
+ * Returns: 0 on success, non-zero on error.
+ */
+int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
+{
+	struct super_block *sb = inode->i_sb;
+	ext4_lblk_t punch_start, punch_stop;
+	handle_t *handle;
+	unsigned int credits;
+	loff_t new_size, ioffset;
+	int ret;
+
+	/* Collapse range works only on fs block size aligned offsets. */
+	if (offset & (EXT4_BLOCK_SIZE(sb) - 1) ||
+	    len & (EXT4_BLOCK_SIZE(sb) - 1))
+		return -EINVAL;
+
+	if (!S_ISREG(inode->i_mode))
+		return -EINVAL;
+
+	if (EXT4_SB(inode->i_sb)->s_cluster_ratio > 1)
+		return -EOPNOTSUPP;
+
+	trace_ext4_collapse_range(inode, offset, len);
+
+	punch_start = offset >> EXT4_BLOCK_SIZE_BITS(sb);
+	punch_stop = (offset + len) >> EXT4_BLOCK_SIZE_BITS(sb);
+
+	/* Call ext4_force_commit to flush all data in case of data=journal. */
+	if (ext4_should_journal_data(inode)) {
+		ret = ext4_force_commit(inode->i_sb);
+		if (ret)
+			return ret;
+	}
+
+	/*
+	 * Need to round down offset to be aligned with the page size
+	 * boundary, for the case of page size > block size.
+	 */
+	ioffset = round_down(offset, PAGE_SIZE);
+
+	/* Write out all dirty pages */
+	ret = filemap_write_and_wait_range(inode->i_mapping, ioffset,
+					   LLONG_MAX);
+	if (ret)
+		return ret;
+
+	/* Take mutex lock */
+	mutex_lock(&inode->i_mutex);
+
+	/*
+	 * The collapse range must not reach or extend past EOF; collapsing
+	 * such a range would effectively be a truncate operation.
+	 */
+	if (offset + len >= i_size_read(inode)) {
+		ret = -EINVAL;
+		goto out_mutex;
+	}
+
+	/* Currently just for extent based files */
+	if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
+		ret = -EOPNOTSUPP;
+		goto out_mutex;
+	}
+
+	truncate_pagecache(inode, ioffset);
+
+	/* Wait for existing dio to complete */
+	ext4_inode_block_unlocked_dio(inode);
+	inode_dio_wait(inode);
+
+	credits = ext4_writepage_trans_blocks(inode);
+	handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		goto out_dio;
+	}
+
+	down_write(&EXT4_I(inode)->i_data_sem);
+	ext4_discard_preallocations(inode);
+
+	ret = ext4_es_remove_extent(inode, punch_start,
+				    EXT_MAX_BLOCKS - punch_start);
+	if (ret) {
+		up_write(&EXT4_I(inode)->i_data_sem);
+		goto out_stop;
+	}
+
+	ret = ext4_ext_remove_space(inode, punch_start, punch_stop - 1);
+	if (ret) {
+		up_write(&EXT4_I(inode)->i_data_sem);
+		goto out_stop;
+	}
+	ext4_discard_preallocations(inode);
+
+	ret = ext4_ext_shift_extents(inode, handle, punch_stop,
+				     punch_stop - punch_start);
+	if (ret) {
+		up_write(&EXT4_I(inode)->i_data_sem);
+		goto out_stop;
+	}
+
+	new_size = i_size_read(inode) - len;
+	i_size_write(inode, new_size);
+	EXT4_I(inode)->i_disksize = new_size;
+
+	up_write(&EXT4_I(inode)->i_data_sem);
+	if (IS_SYNC(inode))
+		ext4_handle_sync(handle);
+	inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
+	ext4_mark_inode_dirty(handle, inode);
+
+out_stop:
+	ext4_journal_stop(handle);
+out_dio:
+	ext4_inode_resume_unlocked_dio(inode);
+out_mutex:
+	mutex_unlock(&inode->i_mutex);
+	return ret;
+}
diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
index 3981ff78395..0b7e28e7eaa 100644
--- a/fs/ext4/extents_status.c
+++ b/fs/ext4/extents_status.c
@@ -184,7 +184,7 @@ static void ext4_es_print_tree(struct inode *inode)
 	while (node) {
 		struct extent_status *es;
 		es = rb_entry(node, struct extent_status, rb_node);
-		printk(KERN_DEBUG " [%u/%u) %llu %llx",
+		printk(KERN_DEBUG " [%u/%u) %llu %x",
 		       es->es_lblk, es->es_len,
 		       ext4_es_pblock(es), ext4_es_status(es));
 		node = rb_next(node);
@@ -344,8 +344,14 @@ static int ext4_es_can_be_merged(struct extent_status *es1,
 	if (ext4_es_status(es1) != ext4_es_status(es2))
 		return 0;
 
-	if (((__u64) es1->es_len) + es2->es_len > 0xFFFFFFFFULL)
+	if (((__u64) es1->es_len) + es2->es_len > EXT_MAX_BLOCKS) {
+		pr_warn("ES assertion failed when merging extents. "
+			"The sum of lengths of es1 (%d) and es2 (%d) "
+			"is bigger than allowed file size (%d)\n",
+			es1->es_len, es2->es_len, EXT_MAX_BLOCKS);
+		WARN_ON(1);
 		return 0;
+	}
 
 	if (((__u64) es1->es_lblk) + es1->es_len != es2->es_lblk)
 		return 0;
@@ -433,7 +439,7 @@ static void ext4_es_insert_extent_ext_check(struct inode *inode,
 		ee_start = ext4_ext_pblock(ex);
 		ee_len = ext4_ext_get_actual_len(ex);
 
-		ee_status = ext4_ext_is_uninitialized(ex) ? 1 : 0;
+		ee_status = ext4_ext_is_unwritten(ex) ? 1 : 0;
 		es_status = ext4_es_is_unwritten(es) ? 
1 : 0;  		/* @@ -445,8 +451,8 @@ static void ext4_es_insert_extent_ext_check(struct inode *inode,  				pr_warn("ES insert assertion failed for "  					"inode: %lu we can find an extent "  					"at block [%d/%d/%llu/%c], but we " -					"want to add an delayed/hole extent " -					"[%d/%d/%llu/%llx]\n", +					"want to add a delayed/hole extent " +					"[%d/%d/%llu/%x]\n",  					inode->i_ino, ee_block, ee_len,  					ee_start, ee_status ? 'u' : 'w',  					es->es_lblk, es->es_len, @@ -486,8 +492,8 @@ static void ext4_es_insert_extent_ext_check(struct inode *inode,  		if (!ext4_es_is_delayed(es) && !ext4_es_is_hole(es)) {  			pr_warn("ES insert assertion failed for inode: %lu "  				"can't find an extent at block %d but we want " -				"to add an written/unwritten extent " -				"[%d/%d/%llu/%llx]\n", inode->i_ino, +				"to add a written/unwritten extent " +				"[%d/%d/%llu/%x]\n", inode->i_ino,  				es->es_lblk, es->es_lblk, es->es_len,  				ext4_es_pblock(es), ext4_es_status(es));  		} @@ -524,7 +530,7 @@ static void ext4_es_insert_extent_ind_check(struct inode *inode,  			 */  			pr_warn("ES insert assertion failed for inode: %lu "  				"We can find blocks but we want to add a " -				"delayed/hole extent [%d/%d/%llu/%llx]\n", +				"delayed/hole extent [%d/%d/%llu/%x]\n",  				inode->i_ino, es->es_lblk, es->es_len,  				ext4_es_pblock(es), ext4_es_status(es));  			return; @@ -554,7 +560,7 @@ static void ext4_es_insert_extent_ind_check(struct inode *inode,  		if (ext4_es_is_written(es)) {  			pr_warn("ES insert assertion failed for inode: %lu "  				"We can't find the block but we want to add " -				"an written extent [%d/%d/%llu/%llx]\n", +				"a written extent [%d/%d/%llu/%x]\n",  				inode->i_ino, es->es_lblk, es->es_len,  				ext4_es_pblock(es), ext4_es_status(es));  			return; @@ -658,8 +664,7 @@ int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,  	newes.es_lblk = lblk;  	newes.es_len = len; -	ext4_es_store_pblock(&newes, pblk); -	ext4_es_store_status(&newes, status); +	ext4_es_store_pblock_status(&newes, pblk, status);  	trace_ext4_es_insert_extent(inode, &newes);  	ext4_es_insert_extent_check(inode, &newes); @@ -699,8 +704,7 @@ void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk,  	newes.es_lblk = lblk;  	newes.es_len = len; -	ext4_es_store_pblock(&newes, pblk); -	ext4_es_store_status(&newes, status); +	ext4_es_store_pblock_status(&newes, pblk, status);  	trace_ext4_es_cache_extent(inode, &newes);  	if (!len) @@ -812,13 +816,13 @@ retry:  			newes.es_lblk = end + 1;  			newes.es_len = len2; +			block = 0x7FDEADBEEFULL;  			if (ext4_es_is_written(&orig_es) || -			    ext4_es_is_unwritten(&orig_es)) { +			    ext4_es_is_unwritten(&orig_es))  				block = ext4_es_pblock(&orig_es) +  					orig_es.es_len - len2; -				ext4_es_store_pblock(&newes, block); -			} -			ext4_es_store_status(&newes, ext4_es_status(&orig_es)); +			ext4_es_store_pblock_status(&newes, block, +						    ext4_es_status(&orig_es));  			err = __es_insert_extent(inode, &newes);  			if (err) {  				es->es_lblk = orig_es.es_lblk; @@ -962,10 +966,10 @@ retry:  			continue;  		} -		if (ei->i_es_lru_nr == 0 || ei == locked_ei) +		if (ei->i_es_lru_nr == 0 || ei == locked_ei || +		    !write_trylock(&ei->i_es_lock))  			continue; -		write_lock(&ei->i_es_lock);  		shrunk = __es_try_to_reclaim_extents(ei, nr_to_scan);  		if (ei->i_es_lru_nr == 0)  			list_del_init(&ei->i_es_lru); diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h index 167f4ab8ecc..f1b62a41992 100644 --- a/fs/ext4/extents_status.h 
+++ b/fs/ext4/extents_status.h @@ -129,6 +129,15 @@ static inline void ext4_es_store_status(struct extent_status *es,  		       (es->es_pblk & ~ES_MASK));  } +static inline void ext4_es_store_pblock_status(struct extent_status *es, +					       ext4_fsblk_t pb, +					       unsigned int status) +{ +	es->es_pblk = (((ext4_fsblk_t) +			(status & EXTENT_STATUS_FLAGS) << ES_SHIFT) | +		       (pb & ~ES_MASK)); +} +  extern void ext4_es_register_shrinker(struct ext4_sb_info *sbi);  extern void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi);  extern void ext4_es_lru_add(struct inode *inode); diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 3da21945ff1..8695f70af1e 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -57,7 +57,7 @@ static int ext4_release_file(struct inode *inode, struct file *filp)  	return 0;  } -void ext4_unwritten_wait(struct inode *inode) +static void ext4_unwritten_wait(struct inode *inode)  {  	wait_queue_head_t *wq = ext4_ioend_wq(inode); @@ -74,132 +74,126 @@ void ext4_unwritten_wait(struct inode *inode)   * or one thread will zero the other's data, causing corruption.   */  static int -ext4_unaligned_aio(struct inode *inode, const struct iovec *iov, -		   unsigned long nr_segs, loff_t pos) +ext4_unaligned_aio(struct inode *inode, struct iov_iter *from, loff_t pos)  {  	struct super_block *sb = inode->i_sb;  	int blockmask = sb->s_blocksize - 1; -	size_t count = iov_length(iov, nr_segs); -	loff_t final_size = pos + count; -	if (pos >= inode->i_size) +	if (pos >= i_size_read(inode))  		return 0; -	if ((pos & blockmask) || (final_size & blockmask)) +	if ((pos | iov_iter_alignment(from)) & blockmask)  		return 1;  	return 0;  }  static ssize_t -ext4_file_dio_write(struct kiocb *iocb, const struct iovec *iov, -		    unsigned long nr_segs, loff_t pos) +ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)  {  	struct file *file = iocb->ki_filp; -	struct inode *inode = file->f_mapping->host; +	struct inode *inode = file_inode(iocb->ki_filp); +	struct mutex *aio_mutex = NULL;  	struct blk_plug plug; -	int unaligned_aio = 0; -	ssize_t ret; +	int o_direct = file->f_flags & O_DIRECT;  	int overwrite = 0; -	size_t length = iov_length(iov, nr_segs); - -	if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) && -	    !is_sync_kiocb(iocb)) -		unaligned_aio = ext4_unaligned_aio(inode, iov, nr_segs, pos); +	size_t length = iov_iter_count(from); +	ssize_t ret; +	loff_t pos = iocb->ki_pos; -	/* Unaligned direct AIO must be serialized; see comment above */ -	if (unaligned_aio) { -		mutex_lock(ext4_aio_mutex(inode)); +	/* +	 * Unaligned direct AIO must be serialized; see comment above +	 * In the case of O_APPEND, assume that we must always serialize +	 */ +	if (o_direct && +	    ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) && +	    !is_sync_kiocb(iocb) && +	    (file->f_flags & O_APPEND || +	     ext4_unaligned_aio(inode, from, pos))) { +		aio_mutex = ext4_aio_mutex(inode); +		mutex_lock(aio_mutex);  		ext4_unwritten_wait(inode);  	} -	BUG_ON(iocb->ki_pos != pos); -  	mutex_lock(&inode->i_mutex); -	blk_start_plug(&plug); - -	iocb->private = &overwrite; +	if (file->f_flags & O_APPEND) +		iocb->ki_pos = pos = i_size_read(inode); -	/* check whether we do a DIO overwrite or not */ -	if (ext4_should_dioread_nolock(inode) && !unaligned_aio && -	    !file->f_mapping->nrpages && pos + length <= i_size_read(inode)) { -		struct ext4_map_blocks map; -		unsigned int blkbits = inode->i_blkbits; -		int err, len; +	/* +	 * If we have encountered a bitmap-format file, the size limit +	 * 
is smaller than s_maxbytes, which is for extent-mapped files.
+	 */
+	if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
+		struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
 
-		map.m_lblk = pos >> blkbits;
-		map.m_len = (EXT4_BLOCK_ALIGN(pos + length, blkbits) >> blkbits)
-			- map.m_lblk;
-		len = map.m_len;
+		if ((pos > sbi->s_bitmap_maxbytes) ||
+		    (pos == sbi->s_bitmap_maxbytes && length > 0)) {
+			mutex_unlock(&inode->i_mutex);
+			ret = -EFBIG;
+			goto errout;
+		}
 
-		err = ext4_map_blocks(NULL, inode, &map, 0);
-		/*
-		 * 'err==len' means that all of blocks has been preallocated no
-		 * matter they are initialized or not.  For excluding
-		 * uninitialized extents, we need to check m_flags.  There are
-		 * two conditions that indicate for initialized extents.
-		 * 1) If we hit extent cache, EXT4_MAP_MAPPED flag is returned;
-		 * 2) If we do a real lookup, non-flags are returned.
-		 * So we should check these two conditions.
-		 */
-		if (err == len && (map.m_flags & EXT4_MAP_MAPPED))
-			overwrite = 1;
+		if (pos + length > sbi->s_bitmap_maxbytes)
+			iov_iter_truncate(from, sbi->s_bitmap_maxbytes - pos);
 	}
 
-	ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
-	mutex_unlock(&inode->i_mutex);
-
-	if (ret > 0) {
-		ssize_t err;
-
-		err = generic_write_sync(file, pos, ret);
-		if (err < 0 && ret > 0)
-			ret = err;
-	}
-	blk_finish_plug(&plug);
+	if (o_direct) {
+		blk_start_plug(&plug);
 
-	if (unaligned_aio)
-		mutex_unlock(ext4_aio_mutex(inode));
+		iocb->private = &overwrite;
 
-	return ret;
-}
+		/* check whether we do a DIO overwrite or not */
+		if (ext4_should_dioread_nolock(inode) && !aio_mutex &&
+		    !file->f_mapping->nrpages && pos + length <= i_size_read(inode)) {
+			struct ext4_map_blocks map;
+			unsigned int blkbits = inode->i_blkbits;
+			int err, len;
 
-static ssize_t
-ext4_file_write(struct kiocb *iocb, const struct iovec *iov,
-		unsigned long nr_segs, loff_t pos)
-{
-	struct inode *inode = file_inode(iocb->ki_filp);
-	ssize_t ret;
+			map.m_lblk = pos >> blkbits;
+			map.m_len = (EXT4_BLOCK_ALIGN(pos + length, blkbits) >> blkbits)
+				- map.m_lblk;
+			len = map.m_len;
 
-	/*
-	 * If we have encountered a bitmap-format file, the size limit
-	 * is smaller than s_maxbytes, which is for extent-mapped files.
-	 */
+			err = ext4_map_blocks(NULL, inode, &map, 0);
+			/*
+			 * 'err==len' means that all of the blocks have
+			 * been preallocated, whether or not they are
+			 * initialized.  To exclude
+			 * unwritten extents, we need to check
+			 * m_flags.  There are two conditions that
+			 * indicate initialized extents.  1) If we
+			 * hit the extent cache, the EXT4_MAP_MAPPED flag
+			 * is returned; 2) If we do a real lookup,
+			 * non-flags are returned.  So we should check
+			 * these two conditions.
+			 */ +			if (err == len && (map.m_flags & EXT4_MAP_MAPPED)) +				overwrite = 1; +		} +	} -	if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { -		struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); -		size_t length = iov_length(iov, nr_segs); +	ret = __generic_file_write_iter(iocb, from); +	mutex_unlock(&inode->i_mutex); -		if ((pos > sbi->s_bitmap_maxbytes || -		    (pos == sbi->s_bitmap_maxbytes && length > 0))) -			return -EFBIG; +	if (ret > 0) { +		ssize_t err; -		if (pos + length > sbi->s_bitmap_maxbytes) { -			nr_segs = iov_shorten((struct iovec *)iov, nr_segs, -					      sbi->s_bitmap_maxbytes - pos); -		} +		err = generic_write_sync(file, iocb->ki_pos - ret, ret); +		if (err < 0) +			ret = err;  	} +	if (o_direct) +		blk_finish_plug(&plug); -	if (unlikely(iocb->ki_filp->f_flags & O_DIRECT)) -		ret = ext4_file_dio_write(iocb, iov, nr_segs, pos); -	else -		ret = generic_file_aio_write(iocb, iov, nr_segs, pos); - +errout: +	if (aio_mutex) +		mutex_unlock(aio_mutex);  	return ret;  }  static const struct vm_operations_struct ext4_file_vm_ops = {  	.fault		= filemap_fault, +	.map_pages	= filemap_map_pages,  	.page_mkwrite   = ext4_page_mkwrite,  	.remap_pages	= generic_file_remap_pages,  }; @@ -243,6 +237,7 @@ static int ext4_file_open(struct inode * inode, struct file * filp)  			handle = ext4_journal_start_sb(sb, EXT4_HT_MISC, 1);  			if (IS_ERR(handle))  				return PTR_ERR(handle); +			BUFFER_TRACE(sbi->s_sbh, "get_write_access");  			err = ext4_journal_get_write_access(handle, sbi->s_sbh);  			if (err) {  				ext4_journal_stop(handle); @@ -592,10 +587,10 @@ loff_t ext4_llseek(struct file *file, loff_t offset, int whence)  const struct file_operations ext4_file_operations = {  	.llseek		= ext4_llseek, -	.read		= do_sync_read, -	.write		= do_sync_write, -	.aio_read	= generic_file_aio_read, -	.aio_write	= ext4_file_write, +	.read		= new_sync_read, +	.write		= new_sync_write, +	.read_iter	= generic_file_read_iter, +	.write_iter	= ext4_file_write_iter,  	.unlocked_ioctl = ext4_ioctl,  #ifdef CONFIG_COMPAT  	.compat_ioctl	= ext4_compat_ioctl, @@ -605,7 +600,7 @@ const struct file_operations ext4_file_operations = {  	.release	= ext4_release_file,  	.fsync		= ext4_sync_file,  	.splice_read	= generic_file_splice_read, -	.splice_write	= generic_file_splice_write, +	.splice_write	= iter_file_splice_write,  	.fallocate	= ext4_fallocate,  }; @@ -617,6 +612,7 @@ const struct inode_operations ext4_file_inode_operations = {  	.listxattr	= ext4_listxattr,  	.removexattr	= generic_removexattr,  	.get_acl	= ext4_get_acl, +	.set_acl	= ext4_set_acl,  	.fiemap		= ext4_fiemap,  }; diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 137193ff389..5b87fc36aab 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -71,6 +71,7 @@ static unsigned ext4_init_inode_bitmap(struct super_block *sb,  				       struct ext4_group_desc *gdp)  {  	struct ext4_group_info *grp; +	struct ext4_sb_info *sbi = EXT4_SB(sb);  	J_ASSERT_BH(bh, buffer_locked(bh));  	/* If checksum is bad mark all blocks and inodes use to prevent @@ -78,7 +79,16 @@ static unsigned ext4_init_inode_bitmap(struct super_block *sb,  	if (!ext4_group_desc_csum_verify(sb, block_group, gdp)) {  		ext4_error(sb, "Checksum bad for group %u", block_group);  		grp = ext4_get_group_info(sb, block_group); +		if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) +			percpu_counter_sub(&sbi->s_freeclusters_counter, +					   grp->bb_free);  		set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state); +		if (!EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) { +			int count; +			
count = ext4_free_inodes_count(sb, gdp); +			percpu_counter_sub(&sbi->s_freeinodes_counter, +					   count); +		}  		set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state);  		return 0;  	} @@ -116,6 +126,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)  	struct buffer_head *bh = NULL;  	ext4_fsblk_t bitmap_blk;  	struct ext4_group_info *grp; +	struct ext4_sb_info *sbi = EXT4_SB(sb);  	desc = ext4_get_group_desc(sb, block_group, NULL);  	if (!desc) @@ -185,6 +196,12 @@ verify:  		ext4_error(sb, "Corrupt inode bitmap - block_group = %u, "  			   "inode_bitmap = %llu", block_group, bitmap_blk);  		grp = ext4_get_group_info(sb, block_group); +		if (!EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) { +			int count; +			count = ext4_free_inodes_count(sb, desc); +			percpu_counter_sub(&sbi->s_freeinodes_counter, +					   count); +		}  		set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state);  		return NULL;  	} @@ -321,6 +338,12 @@ out:  			fatal = err;  	} else {  		ext4_error(sb, "bit already cleared for inode %lu", ino); +		if (gdp && !EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) { +			int count; +			count = ext4_free_inodes_count(sb, gdp); +			percpu_counter_sub(&sbi->s_freeinodes_counter, +					   count); +		}  		set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state);  	} @@ -432,7 +455,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,  			ext4fs_dirhash(qstr->name, qstr->len, &hinfo);  			grp = hinfo.hash;  		} else -			get_random_bytes(&grp, sizeof(grp)); +			grp = prandom_u32();  		parent_group = (unsigned)grp % ngroups;  		for (i = 0; i < ngroups; i++) {  			g = (parent_group + i) % ngroups; @@ -851,6 +874,13 @@ got:  		goto out;  	} +	BUFFER_TRACE(group_desc_bh, "get_write_access"); +	err = ext4_journal_get_write_access(handle, group_desc_bh); +	if (err) { +		ext4_std_error(sb, err); +		goto out; +	} +  	/* We may have to initialize the block bitmap if it isn't already */  	if (ext4_has_group_desc_csum(sb) &&  	    gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { @@ -887,13 +917,6 @@ got:  		}  	} -	BUFFER_TRACE(group_desc_bh, "get_write_access"); -	err = ext4_journal_get_write_access(handle, group_desc_bh); -	if (err) { -		ext4_std_error(sb, err); -		goto out; -	} -  	/* Update the relevant bg descriptor fields */  	if (ext4_has_group_desc_csum(sb)) {  		int free; diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index 594009f5f52..fd69da19482 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c @@ -389,7 +389,13 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode,  	return 0;  failed:  	for (; i >= 0; i--) { -		if (i != indirect_blks && branch[i].bh) +		/* +		 * We want to ext4_forget() only freshly allocated indirect +		 * blocks.  Buffer for new_blocks[i-1] is at branch[i].bh and +		 * buffer at branch[0].bh is indirect block / inode already +		 * existing before ext4_alloc_branch() was called. +		 */ +		if (i > 0 && i != indirect_blks && branch[i].bh)  			ext4_forget(handle, 1, inode, branch[i].bh,  				    branch[i].bh->b_blocknr);  		ext4_free_blocks(handle, inode, NULL, new_blocks[i], @@ -639,8 +645,7 @@ out:   * VFS code falls back into buffered path in that case so we are safe.   
*/  ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb, -			   const struct iovec *iov, loff_t offset, -			   unsigned long nr_segs) +			   struct iov_iter *iter, loff_t offset)  {  	struct file *file = iocb->ki_filp;  	struct inode *inode = file->f_mapping->host; @@ -648,7 +653,7 @@ ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,  	handle_t *handle;  	ssize_t ret;  	int orphan = 0; -	size_t count = iov_length(iov, nr_segs); +	size_t count = iov_iter_count(iter);  	int retries = 0;  	if (rw == WRITE) { @@ -687,18 +692,17 @@ retry:  			goto locked;  		}  		ret = __blockdev_direct_IO(rw, iocb, inode, -				 inode->i_sb->s_bdev, iov, -				 offset, nr_segs, +				 inode->i_sb->s_bdev, iter, offset,  				 ext4_get_block, NULL, NULL, 0);  		inode_dio_done(inode);  	} else {  locked: -		ret = blockdev_direct_IO(rw, iocb, inode, iov, -				 offset, nr_segs, ext4_get_block); +		ret = blockdev_direct_IO(rw, iocb, inode, iter, +				 offset, ext4_get_block);  		if (unlikely((rw & WRITE) && ret < 0)) {  			loff_t isize = i_size_read(inode); -			loff_t end = offset + iov_length(iov, nr_segs); +			loff_t end = offset + count;  			if (end > isize)  				ext4_truncate_failed_write(inode); @@ -1312,16 +1316,24 @@ static int free_hole_blocks(handle_t *handle, struct inode *inode,  		blk = *i_data;  		if (level > 0) {  			ext4_lblk_t first2; +			ext4_lblk_t count2; +  			bh = sb_bread(inode->i_sb, le32_to_cpu(blk));  			if (!bh) {  				EXT4_ERROR_INODE_BLOCK(inode, le32_to_cpu(blk),  						       "Read failure");  				return -EIO;  			} -			first2 = (first > offset) ? first - offset : 0; +			if (first > offset) { +				first2 = first - offset; +				count2 = count; +			} else { +				first2 = 0; +				count2 = count - (offset - first); +			}  			ret = free_hole_blocks(handle, inode, bh,  					       (__le32 *)bh->b_data, level - 1, -					       first2, count - offset, +					       first2, count2,  					       inode->i_sb->s_blocksize >> 2);  			if (ret) {  				brelse(bh); @@ -1331,8 +1343,8 @@ static int free_hole_blocks(handle_t *handle, struct inode *inode,  		if (level == 0 ||  		    (bh && all_zeroes((__le32 *)bh->b_data,  				      (__le32 *)bh->b_data + addr_per_block))) { -			ext4_free_data(handle, inode, parent_bh, &blk, &blk+1); -			*i_data = 0; +			ext4_free_data(handle, inode, parent_bh, +				       i_data, i_data + 1);  		}  		brelse(bh);  		bh = NULL; diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index d9ecbf1113a..645205d8ada 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -22,7 +22,7 @@  #define EXT4_INLINE_DOTDOT_OFFSET	2  #define EXT4_INLINE_DOTDOT_SIZE		4 -int ext4_get_inline_size(struct inode *inode) +static int ext4_get_inline_size(struct inode *inode)  {  	if (EXT4_I(inode)->i_inline_off)  		return EXT4_I(inode)->i_inline_size; @@ -211,8 +211,8 @@ out:   * value since it is already handled by ext4_xattr_ibody_inline_set.   * That saves us one memcpy.   
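 * (The first EXT4_MIN_INLINE_DATA_SIZE bytes of inline data live in
 * i_block; anything beyond that is stored in the in-inode xattr value,
 * which is why the xattr entry is looked up below.)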
*/ -void ext4_write_inline_data(struct inode *inode, struct ext4_iloc *iloc, -			    void *buffer, loff_t pos, unsigned int len) +static void ext4_write_inline_data(struct inode *inode, struct ext4_iloc *iloc, +				   void *buffer, loff_t pos, unsigned int len)  {  	struct ext4_xattr_entry *entry;  	struct ext4_xattr_ibody_header *header; @@ -264,6 +264,7 @@ static int ext4_create_inline_data(handle_t *handle,  	if (error)  		return error; +	BUFFER_TRACE(is.iloc.bh, "get_write_access");  	error = ext4_journal_get_write_access(handle, is.iloc.bh);  	if (error)  		goto out; @@ -347,6 +348,7 @@ static int ext4_update_inline_data(handle_t *handle, struct inode *inode,  	if (error == -ENODATA)  		goto out; +	BUFFER_TRACE(is.iloc.bh, "get_write_access");  	error = ext4_journal_get_write_access(handle, is.iloc.bh);  	if (error)  		goto out; @@ -373,8 +375,8 @@ out:  	return error;  } -int ext4_prepare_inline_data(handle_t *handle, struct inode *inode, -			     unsigned int len) +static int ext4_prepare_inline_data(handle_t *handle, struct inode *inode, +				    unsigned int len)  {  	int ret, size;  	struct ext4_inode_info *ei = EXT4_I(inode); @@ -424,6 +426,7 @@ static int ext4_destroy_inline_data_nolock(handle_t *handle,  	if (error)  		goto out; +	BUFFER_TRACE(is.iloc.bh, "get_write_access");  	error = ext4_journal_get_write_access(handle, is.iloc.bh);  	if (error)  		goto out; @@ -849,15 +852,16 @@ int ext4_da_write_inline_data_begin(struct address_space *mapping,  	handle_t *handle;  	struct page *page;  	struct ext4_iloc iloc; +	int retries;  	ret = ext4_get_inode_loc(inode, &iloc);  	if (ret)  		return ret; +retry_journal:  	handle = ext4_journal_start(inode, EXT4_HT_INODE, 1);  	if (IS_ERR(handle)) {  		ret = PTR_ERR(handle); -		handle = NULL;  		goto out;  	} @@ -867,7 +871,7 @@ int ext4_da_write_inline_data_begin(struct address_space *mapping,  	if (inline_size >= pos + len) {  		ret = ext4_prepare_inline_data(handle, inode, pos + len);  		if (ret && ret != -ENOSPC) -			goto out; +			goto out_journal;  	}  	if (ret == -ENOSPC) { @@ -875,6 +879,10 @@ int ext4_da_write_inline_data_begin(struct address_space *mapping,  							    inode,  							    flags,  							    fsdata); +		ext4_journal_stop(handle); +		if (ret == -ENOSPC && +		    ext4_should_retry_alloc(inode->i_sb, &retries)) +			goto retry_journal;  		goto out;  	} @@ -887,7 +895,7 @@ int ext4_da_write_inline_data_begin(struct address_space *mapping,  	page = grab_cache_page_write_begin(mapping, 0, flags);  	if (!page) {  		ret = -ENOMEM; -		goto out; +		goto out_journal;  	}  	down_read(&EXT4_I(inode)->xattr_sem); @@ -904,16 +912,15 @@ int ext4_da_write_inline_data_begin(struct address_space *mapping,  	up_read(&EXT4_I(inode)->xattr_sem);  	*pagep = page; -	handle = NULL;  	brelse(iloc.bh);  	return 1;  out_release_page:  	up_read(&EXT4_I(inode)->xattr_sem);  	unlock_page(page);  	page_cache_release(page); +out_journal: +	ext4_journal_stop(handle);  out: -	if (handle) -		ext4_journal_stop(handle);  	brelse(iloc.bh);  	return ret;  } @@ -994,17 +1001,16 @@ static int ext4_add_dirent_to_inline(handle_t *handle,  	struct inode	*dir = dentry->d_parent->d_inode;  	const char	*name = dentry->d_name.name;  	int		namelen = dentry->d_name.len; -	unsigned short	reclen;  	int		err;  	struct ext4_dir_entry_2 *de; -	reclen = EXT4_DIR_REC_LEN(namelen);  	err = ext4_find_dest_de(dir, inode, iloc->bh,  				inline_start, inline_size,  				name, namelen, &de);  	if (err)  		return err; +	BUFFER_TRACE(iloc->bh, "get_write_access");  	err = 
ext4_journal_get_write_access(handle, iloc->bh);  	if (err)  		return err; @@ -1442,6 +1448,7 @@ int ext4_read_inline_dir(struct file *file,  	if (ret < 0)  		goto out; +	ret = 0;  	sb = inode->i_sb;  	parent_ino = le32_to_cpu(((struct ext4_dir_entry_2 *)dir_buf)->inode);  	offset = ctx->pos; @@ -1666,6 +1673,7 @@ int ext4_delete_inline_entry(handle_t *handle,  				EXT4_MIN_INLINE_DATA_SIZE;  	} +	BUFFER_TRACE(bh, "get_write_access");  	err = ext4_journal_get_write_access(handle, bh);  	if (err)  		goto out; @@ -1838,7 +1846,6 @@ int ext4_try_to_evict_inline_data(handle_t *handle,  {  	int error;  	struct ext4_xattr_entry *entry; -	struct ext4_xattr_ibody_header *header;  	struct ext4_inode *raw_inode;  	struct ext4_iloc iloc; @@ -1847,7 +1854,6 @@ int ext4_try_to_evict_inline_data(handle_t *handle,  		return error;  	raw_inode = ext4_raw_inode(&iloc); -	header = IHDR(inode, raw_inode);  	entry = (struct ext4_xattr_entry *)((void *)raw_inode +  					    EXT4_I(inode)->i_inline_off);  	if (EXT4_XATTR_LEN(entry->e_name_len) + @@ -1925,9 +1931,11 @@ void ext4_inline_data_truncate(struct inode *inode, int *has_inline)  		}  		/* Clear the content within i_blocks. */ -		if (i_size < EXT4_MIN_INLINE_DATA_SIZE) -			memset(ext4_raw_inode(&is.iloc)->i_block + i_size, 0, -					EXT4_MIN_INLINE_DATA_SIZE - i_size); +		if (i_size < EXT4_MIN_INLINE_DATA_SIZE) { +			void *p = (void *) ext4_raw_inode(&is.iloc)->i_block; +			memset(p + i_size, 0, +			       EXT4_MIN_INLINE_DATA_SIZE - i_size); +		}  		EXT4_I(inode)->i_inline_size = i_size <  					EXT4_MIN_INLINE_DATA_SIZE ? diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 0d424d7ac02..8a064734e6e 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -38,6 +38,7 @@  #include <linux/slab.h>  #include <linux/ratelimit.h>  #include <linux/aio.h> +#include <linux/bitops.h>  #include "ext4_jbd2.h"  #include "xattr.h" @@ -144,8 +145,11 @@ static int ext4_meta_trans_blocks(struct inode *inode, int lblocks,   */  static int ext4_inode_is_fast_symlink(struct inode *inode)  { -	int ea_blocks = EXT4_I(inode)->i_file_acl ? -		(inode->i_sb->s_blocksize >> 9) : 0; +        int ea_blocks = EXT4_I(inode)->i_file_acl ? +		EXT4_CLUSTER_SIZE(inode->i_sb) >> 9 : 0; + +	if (ext4_has_inline_data(inode)) +		return 0;  	return (S_ISLNK(inode->i_mode) && inode->i_blocks - ea_blocks == 0);  } @@ -214,7 +218,7 @@ void ext4_evict_inode(struct inode *inode)  			jbd2_complete_transaction(journal, commit_tid);  			filemap_write_and_wait(&inode->i_data);  		} -		truncate_inode_pages(&inode->i_data, 0); +		truncate_inode_pages_final(&inode->i_data);  		WARN_ON(atomic_read(&EXT4_I(inode)->i_ioend_count));  		goto no_delete; @@ -225,7 +229,7 @@ void ext4_evict_inode(struct inode *inode)  	if (ext4_should_order_data(inode))  		ext4_begin_ordered_truncate(inode, 0); -	truncate_inode_pages(&inode->i_data, 0); +	truncate_inode_pages_final(&inode->i_data);  	WARN_ON(atomic_read(&EXT4_I(inode)->i_ioend_count));  	if (is_bad_inode(inode)) @@ -442,7 +446,7 @@ static void ext4_map_blocks_es_recheck(handle_t *handle,  	 * could be converted.  	 
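 * (i_data_sem is taken in read mode for the lookup below; allocation
 * paths that modify the extent tree take it in write mode instead.)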
*/  	if (!(flags & EXT4_GET_BLOCKS_NO_LOCK)) -		down_read((&EXT4_I(inode)->i_data_sem)); +		down_read(&EXT4_I(inode)->i_data_sem);  	if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {  		retval = ext4_ext_map_blocks(handle, inode, map, flags &  					     EXT4_GET_BLOCKS_KEEP_SIZE); @@ -488,8 +492,8 @@ static void ext4_map_blocks_es_recheck(handle_t *handle,   * Otherwise, call with ext4_ind_map_blocks() to handle indirect mapping   * based files   * - * On success, it returns the number of blocks being mapped or allocate. - * if create==0 and the blocks are pre-allocated and uninitialized block, + * On success, it returns the number of blocks being mapped or allocated. + * if create==0 and the blocks are pre-allocated and unwritten block,   * the result buffer head is unmapped. If the create ==1, it will make sure   * the buffer head is mapped.   * @@ -503,6 +507,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,  {  	struct extent_status es;  	int retval; +	int ret = 0;  #ifdef ES_AGGRESSIVE_TEST  	struct ext4_map_blocks orig_map; @@ -514,6 +519,16 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,  		  "logical block %lu\n", inode->i_ino, flags, map->m_len,  		  (unsigned long) map->m_lblk); +	/* +	 * ext4_map_blocks returns an int, and m_len is an unsigned int +	 */ +	if (unlikely(map->m_len > INT_MAX)) +		map->m_len = INT_MAX; + +	/* We can handle the block number less than EXT_MAX_BLOCKS */ +	if (unlikely(map->m_lblk >= EXT_MAX_BLOCKS)) +		return -EIO; +  	/* Lookup extent status tree firstly */  	if (ext4_es_lookup_extent(inode, map->m_lblk, &es)) {  		ext4_es_lru_add(inode); @@ -543,7 +558,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,  	 * file system block.  	 */  	if (!(flags & EXT4_GET_BLOCKS_NO_LOCK)) -		down_read((&EXT4_I(inode)->i_data_sem)); +		down_read(&EXT4_I(inode)->i_data_sem);  	if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {  		retval = ext4_ext_map_blocks(handle, inode, map, flags &  					     EXT4_GET_BLOCKS_KEEP_SIZE); @@ -552,7 +567,6 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,  					     EXT4_GET_BLOCKS_KEEP_SIZE);  	}  	if (retval > 0) { -		int ret;  		unsigned int status;  		if (unlikely(retval != map->m_len)) { @@ -579,7 +593,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,  found:  	if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { -		int ret = check_block_validity(inode, map); +		ret = check_block_validity(inode, map);  		if (ret != 0)  			return ret;  	} @@ -596,7 +610,13 @@ found:  	 * with buffer head unmapped.  	 */  	if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) -		return retval; +		/* +		 * If we need to convert extent to unwritten +		 * we continue and do the actual work in +		 * ext4_ext_map_blocks() +		 */ +		if (!(flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN)) +			return retval;  	/*  	 * Here we clear m_flags because after allocating an new extent, @@ -605,12 +625,12 @@ found:  	map->m_flags &= ~EXT4_MAP_FLAGS;  	/* -	 * New blocks allocate and/or writing to uninitialized extent +	 * New blocks allocate and/or writing to unwritten extent  	 * will possibly result in updating i_data, so we take  	 * the write lock of i_data_sem, and call get_blocks()  	 * with create == 1 flag.  	 
*/ -	down_write((&EXT4_I(inode)->i_data_sem)); +	down_write(&EXT4_I(inode)->i_data_sem);  	/*  	 * if the caller is from delayed allocation writeout path @@ -652,7 +672,6 @@ found:  		ext4_clear_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED);  	if (retval > 0) { -		int ret;  		unsigned int status;  		if (unlikely(retval != map->m_len)) { @@ -687,7 +706,7 @@ found:  has_zeroout:  	up_write((&EXT4_I(inode)->i_data_sem));  	if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { -		int ret = check_block_validity(inode, map); +		ret = check_block_validity(inode, map);  		if (ret != 0)  			return ret;  	} @@ -906,6 +925,7 @@ int do_journal_get_write_access(handle_t *handle,  	 */  	if (dirty)  		clear_buffer_dirty(bh); +	BUFFER_TRACE(bh, "get write access");  	ret = ext4_journal_get_write_access(handle, bh);  	if (!ret && dirty)  		ret = ext4_handle_dirty_metadata(handle, NULL, bh); @@ -1206,7 +1226,6 @@ static int ext4_journalled_write_end(struct file *file,   */  static int ext4_da_reserve_metadata(struct inode *inode, ext4_lblk_t lblock)  { -	int retries = 0;  	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);  	struct ext4_inode_info *ei = EXT4_I(inode);  	unsigned int md_needed; @@ -1218,7 +1237,6 @@ static int ext4_da_reserve_metadata(struct inode *inode, ext4_lblk_t lblock)  	 * in order to allocate nrblocks  	 * worse case is one extent per block  	 */ -repeat:  	spin_lock(&ei->i_block_reservation_lock);  	/*  	 * ext4_calc_metadata_amount() has side effects, which we have @@ -1238,10 +1256,6 @@ repeat:  		ei->i_da_metadata_calc_len = save_len;  		ei->i_da_metadata_calc_last_lblock = save_last_lblock;  		spin_unlock(&ei->i_block_reservation_lock); -		if (ext4_should_retry_alloc(inode->i_sb, &retries)) { -			cond_resched(); -			goto repeat; -		}  		return -ENOSPC;  	}  	ei->i_reserved_meta_blocks += md_needed; @@ -1255,7 +1269,6 @@ repeat:   */  static int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock)  { -	int retries = 0;  	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);  	struct ext4_inode_info *ei = EXT4_I(inode);  	unsigned int md_needed; @@ -1277,7 +1290,6 @@ static int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock)  	 * in order to allocate nrblocks  	 * worse case is one extent per block  	 */ -repeat:  	spin_lock(&ei->i_block_reservation_lock);  	/*  	 * ext4_calc_metadata_amount() has side effects, which we have @@ -1297,10 +1309,6 @@ repeat:  		ei->i_da_metadata_calc_len = save_len;  		ei->i_da_metadata_calc_last_lblock = save_last_lblock;  		spin_unlock(&ei->i_block_reservation_lock); -		if (ext4_should_retry_alloc(inode->i_sb, &retries)) { -			cond_resched(); -			goto repeat; -		}  		dquot_release_reservation_block(inode, EXT4_C2B(sbi, 1));  		return -ENOSPC;  	} @@ -1536,7 +1544,7 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,  		ext4_es_lru_add(inode);  		if (ext4_es_is_hole(&es)) {  			retval = 0; -			down_read((&EXT4_I(inode)->i_data_sem)); +			down_read(&EXT4_I(inode)->i_data_sem);  			goto add_delayed;  		} @@ -1573,7 +1581,7 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,  	 * Try to see if we can get the block without requesting a new  	 * file system block.  	 
*/ -	down_read((&EXT4_I(inode)->i_data_sem)); +	down_read(&EXT4_I(inode)->i_data_sem);  	if (ext4_has_inline_data(inode)) {  		/*  		 * We will soon create blocks for this page, and let @@ -1765,6 +1773,7 @@ static int __ext4_journalled_writepage(struct page *page,  	BUG_ON(!ext4_handle_valid(handle));  	if (inline_data) { +		BUFFER_TRACE(inode_bh, "get write access");  		ret = ext4_journal_get_write_access(handle, inode_bh);  		err = ext4_handle_dirty_metadata(handle, inode, inode_bh); @@ -1784,7 +1793,7 @@ static int __ext4_journalled_writepage(struct page *page,  		ret = err;  	if (!ext4_has_inline_data(inode)) -		ext4_walk_page_buffers(handle, page_bufs, 0, len, +		ext4_walk_page_buffers(NULL, page_bufs, 0, len,  				       NULL, bput_one);  	ext4_set_inode_state(inode, EXT4_STATE_JDATA);  out: @@ -1842,6 +1851,7 @@ static int ext4_writepage(struct page *page,  	struct buffer_head *page_bufs = NULL;  	struct inode *inode = page->mapping->host;  	struct ext4_io_submit io_submit; +	bool keep_towrite = false;  	trace_ext4_writepage(page);  	size = i_size_read(inode); @@ -1872,6 +1882,7 @@ static int ext4_writepage(struct page *page,  			unlock_page(page);  			return 0;  		} +		keep_towrite = true;  	}  	if (PageChecked(page) && ext4_should_journal_data(inode)) @@ -1888,7 +1899,7 @@ static int ext4_writepage(struct page *page,  		unlock_page(page);  		return -ENOMEM;  	} -	ret = ext4_bio_write_page(&io_submit, page, len, wbc); +	ret = ext4_bio_write_page(&io_submit, page, len, wbc, keep_towrite);  	ext4_io_submit(&io_submit);  	/* Drop io_end reference we got from init */  	ext4_put_io_end_defer(io_submit.io_end); @@ -1907,7 +1918,7 @@ static int mpage_submit_page(struct mpage_da_data *mpd, struct page *page)  	else  		len = PAGE_CACHE_SIZE;  	clear_page_dirty_for_io(page); -	err = ext4_bio_write_page(&mpd->io_submit, page, len, mpd->wbc); +	err = ext4_bio_write_page(&mpd->io_submit, page, len, mpd->wbc, false);  	if (!err)  		mpd->wbc->nr_to_write--;  	mpd->first_page++; @@ -2028,7 +2039,7 @@ static int mpage_process_page_bufs(struct mpage_da_data *mpd,   * Scan buffers corresponding to changed extent (we expect corresponding pages   * to be already locked) and update buffer state according to new extent state.   * We map delalloc buffers to their physical location, clear unwritten bits, - * and mark buffers as uninit when we perform writes to uninitialized extents + * and mark buffers as uninit when we perform writes to unwritten extents   * and do extent conversion after IO is finished. If the last page is not fully   * mapped, we update @map to the next extent in the last page that needs   * mapping. Otherwise we submit the page for IO. @@ -2122,12 +2133,12 @@ static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd)  	struct inode *inode = mpd->inode;  	struct ext4_map_blocks *map = &mpd->map;  	int get_blocks_flags; -	int err; +	int err, dioread_nolock;  	trace_ext4_da_write_pages_extent(inode, map);  	/*  	 * Call ext4_map_blocks() to allocate any delayed allocation blocks, or -	 * to convert an uninitialized extent to be initialized (in the case +	 * to convert an unwritten extent to be initialized (in the case  	 * where we have written into one or more preallocated blocks).  It is  	 * possible that we're going to need more metadata blocks than  	 * previously reserved. 
However we must not fail because we're in @@ -2144,7 +2155,8 @@ static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd)  	 */  	get_blocks_flags = EXT4_GET_BLOCKS_CREATE |  			   EXT4_GET_BLOCKS_METADATA_NOFAIL; -	if (ext4_should_dioread_nolock(inode)) +	dioread_nolock = ext4_should_dioread_nolock(inode); +	if (dioread_nolock)  		get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT;  	if (map->m_flags & (1 << BH_Delay))  		get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE; @@ -2152,7 +2164,7 @@ static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd)  	err = ext4_map_blocks(handle, inode, map, get_blocks_flags);  	if (err < 0)  		return err; -	if (map->m_flags & EXT4_MAP_UNINIT) { +	if (dioread_nolock && (map->m_flags & EXT4_MAP_UNWRITTEN)) {  		if (!mpd->io_submit.io_end->handle &&  		    ext4_handle_valid(handle)) {  			mpd->io_submit.io_end->handle = handle->h_rsv_handle; @@ -2178,6 +2190,9 @@ static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd)   *   * @handle - handle for journal operations   * @mpd - extent to map + * @give_up_on_write - we set this to true iff there is a fatal error and there + *                     is no hope of writing the data. The caller should discard + *                     dirty pages to avoid infinite loops.   *   * The function maps extent starting at mpd->lblk of length mpd->len. If it is   * delayed, blocks are allocated, if it is unwritten, we may need to convert @@ -2240,13 +2255,23 @@ static int mpage_map_and_submit_extent(handle_t *handle,  			return err;  	} while (map->m_len); -	/* Update on-disk size after IO is submitted */ +	/* +	 * Update on-disk size after IO is submitted.  Races with +	 * truncate are avoided by checking i_size under i_data_sem. +	 */  	disksize = ((loff_t)mpd->first_page) << PAGE_CACHE_SHIFT;  	if (disksize > EXT4_I(inode)->i_disksize) {  		int err2; - -		ext4_wb_update_i_disksize(inode, disksize); +		loff_t i_size; + +		down_write(&EXT4_I(inode)->i_data_sem); +		i_size = i_size_read(inode); +		if (disksize > i_size) +			disksize = i_size; +		if (disksize > EXT4_I(inode)->i_disksize) +			EXT4_I(inode)->i_disksize = disksize;  		err2 = ext4_mark_inode_dirty(handle, inode); +		up_write(&EXT4_I(inode)->i_data_sem);  		if (err2)  			ext4_error(inode->i_sb,  				   "Failed to mark inode %lu dirty", @@ -2295,6 +2320,7 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd)  	struct address_space *mapping = mpd->inode->i_mapping;  	struct pagevec pvec;  	unsigned int nr_pages; +	long left = mpd->wbc->nr_to_write;  	pgoff_t index = mpd->first_page;  	pgoff_t end = mpd->last_page;  	int tag; @@ -2330,6 +2356,17 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd)  			if (page->index > end)  				goto out; +			/* +			 * Accumulated enough dirty pages? This doesn't apply +			 * to WB_SYNC_ALL mode. For integrity sync we have to +			 * keep going because someone may be concurrently +			 * dirtying pages, and we might have synced a lot of +			 * newly appeared dirty pages, but have not synced all +			 * of the old dirty pages. +			 */ +			if (mpd->wbc->sync_mode == WB_SYNC_NONE && left <= 0) +				goto out; +  			/* If we can't merge this page, we are done. */  			if (mpd->map.m_len > 0 && mpd->next_page != page->index)  				goto out; @@ -2364,19 +2401,7 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd)  			if (err <= 0)  				goto out;  			err = 0; - -			/* -			 * Accumulated enough dirty pages? 
This doesn't apply -			 * to WB_SYNC_ALL mode. For integrity sync we have to -			 * keep going because someone may be concurrently -			 * dirtying pages, and we might have synced a lot of -			 * newly appeared dirty pages, but have not synced all -			 * of the old dirty pages. -			 */ -			if (mpd->wbc->sync_mode == WB_SYNC_NONE && -			    mpd->next_page - mpd->first_page >= -							mpd->wbc->nr_to_write) -				goto out; +			left--;  		}  		pagevec_release(&pvec);  		cond_resched(); @@ -2420,16 +2445,15 @@ static int ext4_writepages(struct address_space *mapping,  	 * because that could violate lock ordering on umount  	 */  	if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) -		return 0; +		goto out_writepages;  	if (ext4_should_journal_data(inode)) {  		struct blk_plug plug; -		int ret;  		blk_start_plug(&plug);  		ret = write_cache_pages(mapping, wbc, __writepage, mapping);  		blk_finish_plug(&plug); -		return ret; +		goto out_writepages;  	}  	/* @@ -2442,8 +2466,10 @@ static int ext4_writepages(struct address_space *mapping,  	 * *never* be called, so if that ever happens, we would want  	 * the stack trace.  	 */ -	if (unlikely(sbi->s_mount_flags & EXT4_MF_FS_ABORTED)) -		return -EROFS; +	if (unlikely(sbi->s_mount_flags & EXT4_MF_FS_ABORTED)) { +		ret = -EROFS; +		goto out_writepages; +	}  	if (ext4_should_dioread_nolock(inode)) {  		/* @@ -2563,7 +2589,7 @@ retry:  			break;  	}  	blk_finish_plug(&plug); -	if (!ret && !cycled) { +	if (!ret && !cycled && wbc->nr_to_write > 0) {  		cycled = 1;  		mpd.last_page = writeback_index - 1;  		mpd.first_page = 0; @@ -3052,9 +3078,9 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,   * preallocated extents, and those write extend the file, no need to   * fall back to buffered IO.   * - * For holes, we fallocate those blocks, mark them as uninitialized + * For holes, we fallocate those blocks, mark them as unwritten   * If those blocks were preallocated, we mark sure they are split, but - * still keep the range to write as uninitialized. + * still keep the range to write as unwritten.   *   * The unwritten extents will be converted to written when DIO is completed.   * For async direct IO, since the IO may still pending when return, we @@ -3067,13 +3093,12 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,   *   */  static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, -			      const struct iovec *iov, loff_t offset, -			      unsigned long nr_segs) +			      struct iov_iter *iter, loff_t offset)  {  	struct file *file = iocb->ki_filp;  	struct inode *inode = file->f_mapping->host;  	ssize_t ret; -	size_t count = iov_length(iov, nr_segs); +	size_t count = iov_iter_count(iter);  	int overwrite = 0;  	get_block_t *get_block_func = NULL;  	int dio_flags = 0; @@ -3082,7 +3107,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,  	/* Use the old path for reads and writes beyond i_size. */  	if (rw != WRITE || final_size > inode->i_size) -		return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs); +		return ext4_ind_direct_IO(rw, iocb, iter, offset);  	BUG_ON(iocb->private == NULL); @@ -3106,12 +3131,12 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,  	 * We could direct write to holes and fallocate.  	 *  	 * Allocated blocks to fill the hole are marked as -	 * uninitialized to prevent parallel buffered read to expose +	 * unwritten to prevent parallel buffered read to expose  	 * the stale data before DIO complete the data IO.  	 
*  	 * As to previously fallocated extents, ext4 get_block will  	 * just simply mark the buffer mapped but still keep the -	 * extents uninitialized. +	 * extents unwritten.  	 *  	 * For non AIO case, we will convert those unwritten extents  	 * to written after return back from blockdev_direct_IO. @@ -3149,8 +3174,8 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,  		dio_flags = DIO_LOCKING;  	}  	ret = __blockdev_direct_IO(rw, iocb, inode, -				   inode->i_sb->s_bdev, iov, -				   offset, nr_segs, +				   inode->i_sb->s_bdev, iter, +				   offset,  				   get_block_func,  				   ext4_end_io_dio,  				   NULL, @@ -3204,11 +3229,11 @@ retake_lock:  }  static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb, -			      const struct iovec *iov, loff_t offset, -			      unsigned long nr_segs) +			      struct iov_iter *iter, loff_t offset)  {  	struct file *file = iocb->ki_filp;  	struct inode *inode = file->f_mapping->host; +	size_t count = iov_iter_count(iter);  	ssize_t ret;  	/* @@ -3221,13 +3246,12 @@ static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,  	if (ext4_has_inline_data(inode))  		return 0; -	trace_ext4_direct_IO_enter(inode, offset, iov_length(iov, nr_segs), rw); +	trace_ext4_direct_IO_enter(inode, offset, count, rw);  	if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) -		ret = ext4_ext_direct_IO(rw, iocb, iov, offset, nr_segs); +		ret = ext4_ext_direct_IO(rw, iocb, iter, offset);  	else -		ret = ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs); -	trace_ext4_direct_IO_exit(inode, offset, -				iov_length(iov, nr_segs), rw, ret); +		ret = ext4_ind_direct_IO(rw, iocb, iter, offset); +	trace_ext4_direct_IO_exit(inode, offset, count, rw, ret);  	return ret;  } @@ -3320,33 +3344,13 @@ void ext4_set_aops(struct inode *inode)  }  /* - * ext4_block_truncate_page() zeroes out a mapping from file offset `from' - * up to the end of the block which corresponds to `from'. - * This required during truncate. We need to physically zero the tail end - * of that block so it doesn't yield old data if the file is later grown. - */ -int ext4_block_truncate_page(handle_t *handle, -		struct address_space *mapping, loff_t from) -{ -	unsigned offset = from & (PAGE_CACHE_SIZE-1); -	unsigned length; -	unsigned blocksize; -	struct inode *inode = mapping->host; - -	blocksize = inode->i_sb->s_blocksize; -	length = blocksize - (offset & (blocksize - 1)); - -	return ext4_block_zero_page_range(handle, mapping, from, length); -} - -/*   * ext4_block_zero_page_range() zeros out a mapping of length 'length'   * starting from file offset 'from'.  The range to be zero'd must   * be contained with in one block.  If the specified range exceeds   * the end of the block it will be shortened to end of the block   * that cooresponds to 'from'   */ -int ext4_block_zero_page_range(handle_t *handle, +static int ext4_block_zero_page_range(handle_t *handle,  		struct address_space *mapping, loff_t from, loff_t length)  {  	ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT; @@ -3436,6 +3440,26 @@ unlock:  	return err;  } +/* + * ext4_block_truncate_page() zeroes out a mapping from file offset `from' + * up to the end of the block which corresponds to `from'. + * This required during truncate. We need to physically zero the tail end + * of that block so it doesn't yield old data if the file is later grown. 
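+ * (Worked example: with a 4096-byte block size, from == 5000 gives an
+ * in-block offset of 5000 & 4095 == 904, so length == 4096 - 904 == 3192
+ * bytes are zeroed, i.e. from byte 5000 to the end of that block.)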
+ */ +static int ext4_block_truncate_page(handle_t *handle, +		struct address_space *mapping, loff_t from) +{ +	unsigned offset = from & (PAGE_CACHE_SIZE-1); +	unsigned length; +	unsigned blocksize; +	struct inode *inode = mapping->host; + +	blocksize = inode->i_sb->s_blocksize; +	length = blocksize - (offset & (blocksize - 1)); + +	return ext4_block_zero_page_range(handle, mapping, from, length); +} +  int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode,  			     loff_t lstart, loff_t length)  { @@ -3509,12 +3533,7 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)  	if (!S_ISREG(inode->i_mode))  		return -EOPNOTSUPP; -	if (EXT4_SB(sb)->s_cluster_ratio > 1) { -		/* TODO: Add support for bigalloc file systems */ -		return -EOPNOTSUPP; -	} - -	trace_ext4_punch_hole(inode, offset, length); +	trace_ext4_punch_hole(inode, offset, length, 0);  	/*  	 * Write out all dirty pages to avoid race conditions @@ -3528,15 +3547,6 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)  	}  	mutex_lock(&inode->i_mutex); -	/* It's not possible punch hole on append only file */ -	if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) { -		ret = -EPERM; -		goto out_mutex; -	} -	if (IS_SWAPFILE(inode)) { -		ret = -ETXTBSY; -		goto out_mutex; -	}  	/* No need to punch hole beyond i_size */  	if (offset >= inode->i_size) @@ -3617,10 +3627,15 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)  		ret = ext4_free_hole_blocks(handle, inode, first_block,  					    stop_block); -	ext4_discard_preallocations(inode);  	up_write(&EXT4_I(inode)->i_data_sem);  	if (IS_SYNC(inode))  		ext4_handle_sync(handle); + +	/* Now release the pages again to reduce race window */ +	if (last_block_offset > first_block_offset) +		truncate_pagecache_range(inode, first_block_offset, +					 last_block_offset); +  	inode->i_mtime = inode->i_ctime = ext4_current_time(inode);  	ext4_mark_inode_dirty(handle, inode);  out_stop: @@ -3694,7 +3709,7 @@ void ext4_truncate(struct inode *inode)  	/*  	 * There is a possibility that we're either freeing the inode -	 * or it completely new indode. In those cases we might not +	 * or it's a completely new inode. In those cases we might not  	 * have i_mutex locked because it's not necessary.  	 
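 * (An inode flagged I_NEW or I_FREEING is not reachable by other
 * threads, so skipping i_mutex there is safe.)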
*/  	if (!(inode->i_state & (I_NEW|I_FREEING))) @@ -3934,18 +3949,20 @@ int ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc)  void ext4_set_inode_flags(struct inode *inode)  {  	unsigned int flags = EXT4_I(inode)->i_flags; +	unsigned int new_fl = 0; -	inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC);  	if (flags & EXT4_SYNC_FL) -		inode->i_flags |= S_SYNC; +		new_fl |= S_SYNC;  	if (flags & EXT4_APPEND_FL) -		inode->i_flags |= S_APPEND; +		new_fl |= S_APPEND;  	if (flags & EXT4_IMMUTABLE_FL) -		inode->i_flags |= S_IMMUTABLE; +		new_fl |= S_IMMUTABLE;  	if (flags & EXT4_NOATIME_FL) -		inode->i_flags |= S_NOATIME; +		new_fl |= S_NOATIME;  	if (flags & EXT4_DIRSYNC_FL) -		inode->i_flags |= S_DIRSYNC; +		new_fl |= S_DIRSYNC; +	inode_set_flags(inode, new_fl, +			S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC);  }  /* Propagate flags from i_flags to EXT4_I(inode)->i_flags */ @@ -4164,11 +4181,13 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)  	EXT4_INODE_GET_XTIME(i_atime, inode, raw_inode);  	EXT4_EINODE_GET_XTIME(i_crtime, ei, raw_inode); -	inode->i_version = le32_to_cpu(raw_inode->i_disk_version); -	if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { -		if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi)) -			inode->i_version |= -			(__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32; +	if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) { +		inode->i_version = le32_to_cpu(raw_inode->i_disk_version); +		if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { +			if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi)) +				inode->i_version |= +		    (__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32; +		}  	}  	ret = 0; @@ -4291,12 +4310,15 @@ static int ext4_do_update_inode(handle_t *handle,  	struct ext4_inode *raw_inode = ext4_raw_inode(iloc);  	struct ext4_inode_info *ei = EXT4_I(inode);  	struct buffer_head *bh = iloc->bh; +	struct super_block *sb = inode->i_sb;  	int err = 0, rc, block; -	int need_datasync = 0; +	int need_datasync = 0, set_large_file = 0;  	uid_t i_uid;  	gid_t i_gid; -	/* For fields not not tracking in the in-memory inode, +	spin_lock(&ei->i_raw_lock); + +	/* For fields not tracked in the in-memory inode,  	 * initialise them to zero for new inodes. */  	if (ext4_test_inode_state(inode, EXT4_STATE_NEW))  		memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size); @@ -4334,12 +4356,13 @@ static int ext4_do_update_inode(handle_t *handle,  	EXT4_INODE_SET_XTIME(i_atime, inode, raw_inode);  	EXT4_EINODE_SET_XTIME(i_crtime, ei, raw_inode); -	if (ext4_inode_blocks_set(handle, raw_inode, ei)) +	if (ext4_inode_blocks_set(handle, raw_inode, ei)) { +		spin_unlock(&ei->i_raw_lock);  		goto out_brelse; +	}  	raw_inode->i_dtime = cpu_to_le32(ei->i_dtime);  	raw_inode->i_flags = cpu_to_le32(ei->i_flags & 0xFFFFFFFF); -	if (EXT4_SB(inode->i_sb)->s_es->s_creator_os != -	    cpu_to_le32(EXT4_OS_HURD)) +	if (likely(!test_opt2(inode->i_sb, HURD_COMPAT)))  		raw_inode->i_file_acl_high =  			cpu_to_le16(ei->i_file_acl >> 32);  	raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl); @@ -4348,24 +4371,11 @@ static int ext4_do_update_inode(handle_t *handle,  		need_datasync = 1;  	}  	if (ei->i_disksize > 0x7fffffffULL) { -		struct super_block *sb = inode->i_sb;  		if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,  				EXT4_FEATURE_RO_COMPAT_LARGE_FILE) ||  				EXT4_SB(sb)->s_es->s_rev_level == -				cpu_to_le32(EXT4_GOOD_OLD_REV)) { -			/* If this is the first large file -			 * created, add a flag to the superblock. 
-			 */ -			err = ext4_journal_get_write_access(handle, -					EXT4_SB(sb)->s_sbh); -			if (err) -				goto out_brelse; -			ext4_update_dynamic_rev(sb); -			EXT4_SET_RO_COMPAT_FEATURE(sb, -					EXT4_FEATURE_RO_COMPAT_LARGE_FILE); -			ext4_handle_sync(handle); -			err = ext4_handle_dirty_super(handle, sb); -		} +		    cpu_to_le32(EXT4_GOOD_OLD_REV)) +			set_large_file = 1;  	}  	raw_inode->i_generation = cpu_to_le32(inode->i_generation);  	if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) { @@ -4384,22 +4394,37 @@ static int ext4_do_update_inode(handle_t *handle,  			raw_inode->i_block[block] = ei->i_data[block];  	} -	raw_inode->i_disk_version = cpu_to_le32(inode->i_version); -	if (ei->i_extra_isize) { -		if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi)) -			raw_inode->i_version_hi = -			cpu_to_le32(inode->i_version >> 32); -		raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize); +	if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) { +		raw_inode->i_disk_version = cpu_to_le32(inode->i_version); +		if (ei->i_extra_isize) { +			if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi)) +				raw_inode->i_version_hi = +					cpu_to_le32(inode->i_version >> 32); +			raw_inode->i_extra_isize = +				cpu_to_le16(ei->i_extra_isize); +		}  	}  	ext4_inode_csum_set(inode, raw_inode, ei); +	spin_unlock(&ei->i_raw_lock); +  	BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");  	rc = ext4_handle_dirty_metadata(handle, NULL, bh);  	if (!err)  		err = rc;  	ext4_clear_inode_state(inode, EXT4_STATE_NEW); - +	if (set_large_file) { +		BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get write access"); +		err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh); +		if (err) +			goto out_brelse; +		ext4_update_dynamic_rev(sb); +		EXT4_SET_RO_COMPAT_FEATURE(sb, +					   EXT4_FEATURE_RO_COMPAT_LARGE_FILE); +		ext4_handle_sync(handle); +		err = ext4_handle_dirty_super(handle, sb); +	}  	ext4_update_inode_fsync_trans(handle, inode, need_datasync);  out_brelse:  	brelse(bh); @@ -4412,21 +4437,20 @@ out_brelse:   *   * We are called from a few places:   * - * - Within generic_file_write() for O_SYNC files. + * - Within generic_file_aio_write() -> generic_write_sync() for O_SYNC files.   *   Here, there will be no transaction running. We wait for any running   *   transaction to commit.   * - * - Within sys_sync(), kupdate and such. - *   We wait on commit, if tol to. + * - Within flush work (sys_sync(), kupdate and such). + *   We wait on commit, if told to.   * - * - Within prune_icache() (PF_MEMALLOC == true) - *   Here we simply return.  We can't afford to block kswapd on the - *   journal commit. + * - Within iput_final() -> write_inode_now() + *   We wait on commit, if told to.   *   * In all cases it is actually safe for us to return without doing anything,   * because the inode has been copied into a raw inode buffer in - * ext4_mark_inode_dirty().  This is a correctness thing for O_SYNC and for - * knfsd. + * ext4_mark_inode_dirty().  This is a correctness thing for WB_SYNC_ALL + * writeback.   *   * Note that we are absolutely dependent upon all inode dirtiers doing the   * right thing: they *must* call mark_inode_dirty() after dirtying info in @@ -4438,15 +4462,15 @@ out_brelse:   *	stuff();   *	inode->i_size = expr;   * - * is in error because a kswapd-driven write_inode() could occur while - * `stuff()' is running, and the new i_size will be lost.  Plus the inode - * will no longer be on the superblock's dirty inode list. 
+ * is in error because write_inode() could occur while `stuff()' is running, + * and the new i_size will be lost.  Plus the inode will no longer be on the + * superblock's dirty inode list.   */  int ext4_write_inode(struct inode *inode, struct writeback_control *wbc)  {  	int err; -	if (current->flags & PF_MEMALLOC) +	if (WARN_ON_ONCE(current->flags & PF_MEMALLOC))  		return 0;  	if (EXT4_SB(inode->i_sb)->s_journal) { @@ -4456,7 +4480,12 @@ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc)  			return -EIO;  		} -		if (wbc->sync_mode != WB_SYNC_ALL) +		/* +		 * No need to force transaction in WB_SYNC_NONE mode. Also +		 * ext4_sync_fs() will force the commit after everything is +		 * written. +		 */ +		if (wbc->sync_mode != WB_SYNC_ALL || wbc->for_sync)  			return 0;  		err = ext4_force_commit(inode->i_sb); @@ -4466,7 +4495,11 @@ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc)  		err = __ext4_get_inode_loc(inode, &iloc, 0);  		if (err)  			return err; -		if (wbc->sync_mode == WB_SYNC_ALL) +		/* +		 * sync(2) will flush the whole buffer cache. No need to do +		 * it here separately for each inode. +		 */ +		if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync)  			sync_dirty_buffer(iloc.bh);  		if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) {  			EXT4_ERROR_INODE_BLOCK(inode, iloc.bh->b_blocknr, @@ -4594,6 +4627,10 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)  			if (attr->ia_size > sbi->s_bitmap_maxbytes)  				return -EFBIG;  		} + +		if (IS_I_VERSION(inode) && attr->ia_size != inode->i_size) +			inode_inc_iversion(inode); +  		if (S_ISREG(inode->i_mode) &&  		    (attr->ia_size < inode->i_size)) {  			if (ext4_should_order_data(inode)) { @@ -4671,7 +4708,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)  		ext4_orphan_del(NULL, inode);  	if (!rc && (ia_valid & ATTR_MODE)) -		rc = ext4_acl_chmod(inode); +		rc = posix_acl_chmod(inode, inode->i_mode);  err_out:  	ext4_std_error(inode->i_sb, error); @@ -4690,6 +4727,15 @@ int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,  	generic_fillattr(inode, stat);  	/* +	 * If there is inline data in the inode, the inode will normally not +	 * have data blocks allocated (it may have an external xattr block). +	 * Report at least one sector for such files, so tools like tar, rsync, +	 * others doen't incorrectly think the file is completely sparse. +	 */ +	if (unlikely(ext4_has_inline_data(inode))) +		stat->blocks += (stat->size + 511) >> 9; + +	/*  	 * We can't update i_blocks if the block allocation is delayed  	 * otherwise in the case of system crash before the real block  	 * allocation is done, we will have i_blocks inconsistent with @@ -4700,9 +4746,8 @@ int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,  	 * blocks for this file.  	 
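 * (Below, EXT4_C2B() converts the reserved clusters to blocks, and the
 * shift by (s_blocksize_bits - 9) converts blocks to the 512-byte
 * sectors that stat->blocks is counted in.)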
*/  	delalloc_blocks = EXT4_C2B(EXT4_SB(inode->i_sb), -				EXT4_I(inode)->i_reserved_data_blocks); - -	stat->blocks += delalloc_blocks << (inode->i_sb->s_blocksize_bits-9); +				   EXT4_I(inode)->i_reserved_data_blocks); +	stat->blocks += delalloc_blocks << (inode->i_sb->s_blocksize_bits - 9);  	return 0;  } diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index a569d335f80..0f2252ec274 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -101,28 +101,18 @@ static long swap_inode_boot_loader(struct super_block *sb,  	handle_t *handle;  	int err;  	struct inode *inode_bl; -	struct ext4_inode_info *ei;  	struct ext4_inode_info *ei_bl; -	struct ext4_sb_info *sbi; +	struct ext4_sb_info *sbi = EXT4_SB(sb); -	if (inode->i_nlink != 1 || !S_ISREG(inode->i_mode)) { -		err = -EINVAL; -		goto swap_boot_out; -	} - -	if (!inode_owner_or_capable(inode) || !capable(CAP_SYS_ADMIN)) { -		err = -EPERM; -		goto swap_boot_out; -	} +	if (inode->i_nlink != 1 || !S_ISREG(inode->i_mode)) +		return -EINVAL; -	sbi = EXT4_SB(sb); -	ei = EXT4_I(inode); +	if (!inode_owner_or_capable(inode) || !capable(CAP_SYS_ADMIN)) +		return -EPERM;  	inode_bl = ext4_iget(sb, EXT4_BOOT_LOADER_INO); -	if (IS_ERR(inode_bl)) { -		err = PTR_ERR(inode_bl); -		goto swap_boot_out; -	} +	if (IS_ERR(inode_bl)) +		return PTR_ERR(inode_bl);  	ei_bl = EXT4_I(inode_bl);  	filemap_flush(inode->i_mapping); @@ -130,7 +120,7 @@ static long swap_inode_boot_loader(struct super_block *sb,  	/* Protect orig inodes against a truncate and make sure,  	 * that only 1 swap_inode_boot_loader is running. */ -	ext4_inode_double_lock(inode, inode_bl); +	lock_two_nondirectories(inode, inode_bl);  	truncate_inode_pages(&inode->i_data, 0);  	truncate_inode_pages(&inode_bl->i_data, 0); @@ -144,7 +134,7 @@ static long swap_inode_boot_loader(struct super_block *sb,  	handle = ext4_journal_start(inode_bl, EXT4_HT_MOVE_EXTENTS, 2);  	if (IS_ERR(handle)) {  		err = -EINVAL; -		goto swap_boot_out; +		goto journal_err_out;  	}  	/* Protect extent tree against block allocations via delalloc */ @@ -197,19 +187,14 @@ static long swap_inode_boot_loader(struct super_block *sb,  			ext4_mark_inode_dirty(handle, inode);  		}  	} -  	ext4_journal_stop(handle); -  	ext4_double_up_write_data_sem(inode, inode_bl); +journal_err_out:  	ext4_inode_resume_unlocked_dio(inode);  	ext4_inode_resume_unlocked_dio(inode_bl); - -	ext4_inode_double_unlock(inode, inode_bl); - +	unlock_two_nondirectories(inode, inode_bl);  	iput(inode_bl); - -swap_boot_out:  	return err;  } diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index a41e3ba8cfa..2dcb936be90 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -722,6 +722,7 @@ void ext4_mb_generate_buddy(struct super_block *sb,  				void *buddy, void *bitmap, ext4_group_t group)  {  	struct ext4_group_info *grp = ext4_get_group_info(sb, group); +	struct ext4_sb_info *sbi = EXT4_SB(sb);  	ext4_grpblk_t max = EXT4_CLUSTERS_PER_GROUP(sb);  	ext4_grpblk_t i = 0;  	ext4_grpblk_t first; @@ -751,14 +752,17 @@ void ext4_mb_generate_buddy(struct super_block *sb,  	if (free != grp->bb_free) {  		ext4_grp_locked_error(sb, group, 0, 0, -				      "%u clusters in bitmap, %u in gd; " -				      "block bitmap corrupt.", +				      "block bitmap and bg descriptor " +				      "inconsistent: %u vs %u free clusters",  				      free, grp->bb_free);  		/*  		 * If we intend to continue, we consider group descriptor  		 * corrupt and update bb_free using bitmap value  		 */  		grp->bb_free = free; +		if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) +			
percpu_counter_sub(&sbi->s_freeclusters_counter, +					   grp->bb_free);  		set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state);  	}  	mb_set_largest_free_order(sb, grp); @@ -989,7 +993,7 @@ static int ext4_mb_get_buddy_page_lock(struct super_block *sb,  	poff = block % blocks_per_page;  	page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);  	if (!page) -		return -EIO; +		return -ENOMEM;  	BUG_ON(page->mapping != inode->i_mapping);  	e4b->bd_bitmap_page = page;  	e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize); @@ -1003,7 +1007,7 @@ static int ext4_mb_get_buddy_page_lock(struct super_block *sb,  	pnum = block / blocks_per_page;  	page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);  	if (!page) -		return -EIO; +		return -ENOMEM;  	BUG_ON(page->mapping != inode->i_mapping);  	e4b->bd_buddy_page = page;  	return 0; @@ -1044,6 +1048,8 @@ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)  	 * allocating. If we are looking at the buddy cache we would  	 * have taken a reference using ext4_mb_load_buddy and that  	 * would have pinned buddy page to page cache. +	 * The call to ext4_mb_get_buddy_page_lock will mark the +	 * page accessed.  	 */  	ret = ext4_mb_get_buddy_page_lock(sb, group, &e4b);  	if (ret || !EXT4_MB_GRP_NEED_INIT(this_grp)) { @@ -1062,7 +1068,6 @@ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)  		ret = -EIO;  		goto err;  	} -	mark_page_accessed(page);  	if (e4b.bd_buddy_page == NULL) {  		/* @@ -1082,7 +1087,6 @@ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)  		ret = -EIO;  		goto err;  	} -	mark_page_accessed(page);  err:  	ext4_mb_put_buddy_page_lock(&e4b);  	return ret; @@ -1141,7 +1145,7 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,  	/* we could use find_or_create_page(), but it locks page  	 * what we'd like to avoid in fast path ... 
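	 * (find_get_page_flags() with FGP_ACCESSED, used below, marks the
	 * page accessed at lookup time, which is why the explicit
	 * mark_page_accessed() calls could be dropped.)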
*/ -	page = find_get_page(inode->i_mapping, pnum); +	page = find_get_page_flags(inode->i_mapping, pnum, FGP_ACCESSED);  	if (page == NULL || !PageUptodate(page)) {  		if (page)  			/* @@ -1168,19 +1172,24 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,  			unlock_page(page);  		}  	} -	if (page == NULL || !PageUptodate(page)) { +	if (page == NULL) { +		ret = -ENOMEM; +		goto err; +	} +	if (!PageUptodate(page)) {  		ret = -EIO;  		goto err;  	} + +	/* Pages marked accessed already */  	e4b->bd_bitmap_page = page;  	e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize); -	mark_page_accessed(page);  	block++;  	pnum = block / blocks_per_page;  	poff = block % blocks_per_page; -	page = find_get_page(inode->i_mapping, pnum); +	page = find_get_page_flags(inode->i_mapping, pnum, FGP_ACCESSED);  	if (page == NULL || !PageUptodate(page)) {  		if (page)  			page_cache_release(page); @@ -1197,13 +1206,18 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,  			unlock_page(page);  		}  	} -	if (page == NULL || !PageUptodate(page)) { +	if (page == NULL) { +		ret = -ENOMEM; +		goto err; +	} +	if (!PageUptodate(page)) {  		ret = -EIO;  		goto err;  	} + +	/* Pages marked accessed already */  	e4b->bd_buddy_page = page;  	e4b->bd_buddy = page_address(page) + (poff * sb->s_blocksize); -	mark_page_accessed(page);  	BUG_ON(e4b->bd_bitmap_page == NULL);  	BUG_ON(e4b->bd_buddy_page == NULL); @@ -1421,6 +1435,7 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,  		right_is_free = !mb_test_bit(last + 1, e4b->bd_bitmap);  	if (unlikely(block != -1)) { +		struct ext4_sb_info *sbi = EXT4_SB(sb);  		ext4_fsblk_t blocknr;  		blocknr = ext4_group_first_block_no(sb, e4b->bd_group); @@ -1431,6 +1446,9 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,  				      "freeing already freed block "  				      "(bit %u); block bitmap corrupt.",  				      block); +		if (!EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info)) +			percpu_counter_sub(&sbi->s_freeclusters_counter, +					   e4b->bd_info->bb_free);  		/* Mark the block group as corrupt. 
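		 * The group's free-cluster count was subtracted from
		 * s_freeclusters_counter just above, keeping the global
		 * counter consistent once this group is ignored.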
*/  		set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT,  			&e4b->bd_info->bb_state); @@ -1808,6 +1826,7 @@ int ext4_mb_find_by_goal(struct ext4_allocation_context *ac,  	ext4_lock_group(ac->ac_sb, group);  	max = mb_find_extent(e4b, ac->ac_g_ex.fe_start,  			     ac->ac_g_ex.fe_len, &ex); +	ex.fe_logical = 0xDEADFA11; /* debug value */  	if (max >= ac->ac_g_ex.fe_len && ac->ac_g_ex.fe_len == sbi->s_stripe) {  		ext4_fsblk_t start; @@ -1936,7 +1955,7 @@ void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,  			 */  			break;  		} - +		ex.fe_logical = 0xDEADC0DE; /* debug value */  		ext4_mb_measure_extent(ac, &ex, e4b);  		i += ex.fe_len; @@ -1977,6 +1996,7 @@ void ext4_mb_scan_aligned(struct ext4_allocation_context *ac,  			max = mb_find_extent(e4b, i, sbi->s_stripe, &ex);  			if (max >= sbi->s_stripe) {  				ac->ac_found++; +				ex.fe_logical = 0xDEADF00D; /* debug value */  				ac->ac_b_ex = ex;  				ext4_mb_use_best_found(ac, e4b);  				break; @@ -2607,7 +2627,7 @@ int ext4_mb_init(struct super_block *sb)  	sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group);  	if (sbi->s_locality_groups == NULL) {  		ret = -ENOMEM; -		goto out_free_groupinfo_slab; +		goto out;  	}  	for_each_possible_cpu(i) {  		struct ext4_locality_group *lg; @@ -2632,8 +2652,6 @@ int ext4_mb_init(struct super_block *sb)  out_free_locality_groups:  	free_percpu(sbi->s_locality_groups);  	sbi->s_locality_groups = NULL; -out_free_groupinfo_slab: -	ext4_groupinfo_destroy_slabs();  out:  	kfree(sbi->s_mb_offsets);  	sbi->s_mb_offsets = NULL; @@ -2866,6 +2884,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,  	if (!bitmap_bh)  		goto out_err; +	BUFFER_TRACE(bitmap_bh, "getting write access");  	err = ext4_journal_get_write_access(handle, bitmap_bh);  	if (err)  		goto out_err; @@ -2878,6 +2897,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,  	ext4_debug("using block group %u(%d)\n", ac->ac_b_ex.fe_group,  			ext4_free_group_clusters(sb, gdp)); +	BUFFER_TRACE(gdp_bh, "get_write_access");  	err = ext4_journal_get_write_access(handle, gdp_bh);  	if (err)  		goto out_err; @@ -3135,7 +3155,7 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,  	}  	BUG_ON(start + size <= ac->ac_o_ex.fe_logical &&  			start > ac->ac_o_ex.fe_logical); -	BUG_ON(size <= 0 || size > EXT4_CLUSTERS_PER_GROUP(ac->ac_sb)); +	BUG_ON(size <= 0 || size > EXT4_BLOCKS_PER_GROUP(ac->ac_sb));  	/* now prepare goal request */ @@ -3442,6 +3462,9 @@ static void ext4_mb_pa_callback(struct rcu_head *head)  {  	struct ext4_prealloc_space *pa;  	pa = container_of(head, struct ext4_prealloc_space, u.pa_rcu); + +	BUG_ON(atomic_read(&pa->pa_count)); +	BUG_ON(pa->pa_deleted == 0);  	kmem_cache_free(ext4_pspace_cachep, pa);  } @@ -3455,11 +3478,13 @@ static void ext4_mb_put_pa(struct ext4_allocation_context *ac,  	ext4_group_t grp;  	ext4_fsblk_t grp_blk; -	if (!atomic_dec_and_test(&pa->pa_count) || pa->pa_free != 0) -		return; -  	/* in this short window concurrent discard can set pa_deleted */  	spin_lock(&pa->pa_lock); +	if (!atomic_dec_and_test(&pa->pa_count) || pa->pa_free != 0) { +		spin_unlock(&pa->pa_lock); +		return; +	} +  	if (pa->pa_deleted == 1) {  		spin_unlock(&pa->pa_lock);  		return; @@ -4001,8 +4026,7 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac)  			(unsigned long)ac->ac_b_ex.fe_len,  			(unsigned long)ac->ac_b_ex.fe_logical,  			(int)ac->ac_criteria); -	ext4_msg(ac->ac_sb, KERN_ERR, "%lu scanned, %d found", -		 ac->ac_ex_scanned, ac->ac_found); +	
ext4_msg(ac->ac_sb, KERN_ERR, "%d found", ac->ac_found);  	ext4_msg(ac->ac_sb, KERN_ERR, "groups: ");  	ngroups = ext4_get_groups_count(sb);  	for (i = 0; i < ngroups; i++) { @@ -4121,7 +4145,7 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac,  	ext4_get_group_no_and_offset(sb, goal, &group, &block);  	/* set up allocation goals */ -	ac->ac_b_ex.fe_logical = ar->logical & ~(sbi->s_cluster_ratio - 1); +	ac->ac_b_ex.fe_logical = EXT4_LBLK_CMASK(sbi, ar->logical);  	ac->ac_status = AC_STATUS_CONTINUE;  	ac->ac_sb = sb;  	ac->ac_inode = ar->inode; @@ -4663,7 +4687,7 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,  	 * blocks at the beginning or the end unless we are explicitly  	 * requested to avoid doing so.  	 */ -	overflow = block & (sbi->s_cluster_ratio - 1); +	overflow = EXT4_PBLK_COFF(sbi, block);  	if (overflow) {  		if (flags & EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER) {  			overflow = sbi->s_cluster_ratio - overflow; @@ -4677,7 +4701,7 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,  			count += overflow;  		}  	} -	overflow = count & (sbi->s_cluster_ratio - 1); +	overflow = EXT4_LBLK_COFF(sbi, count);  	if (overflow) {  		if (flags & EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER) {  			if (count > overflow) @@ -4794,8 +4818,8 @@ do_more:  					 " group:%d block:%d count:%lu failed"  					 " with %d", block_group, bit, count,  					 err); -		} - +		} else +			EXT4_MB_GRP_CLEAR_TRIMMED(e4b.bd_info);  		ext4_lock_group(sb, block_group);  		mb_clear_bits(bitmap_bh->b_data, bit, count_clusters); @@ -5002,6 +5026,8 @@ error_return:   */  static int ext4_trim_extent(struct super_block *sb, int start, int count,  			     ext4_group_t group, struct ext4_buddy *e4b) +__releases(bitlock) +__acquires(bitlock)  {  	struct ext4_free_extent ex;  	int ret = 0; diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h index 08481ee84cd..d634e183b4d 100644 --- a/fs/ext4/mballoc.h +++ b/fs/ext4/mballoc.h @@ -48,7 +48,7 @@ extern ushort ext4_mballoc_debug;  		}							\  	} while (0)  #else -#define mb_debug(n, fmt, a...) +#define mb_debug(n, fmt, a...)		no_printk(fmt, ## a)  #endif  #define EXT4_MB_HISTORY_ALLOC		1	/* allocation */ @@ -175,8 +175,6 @@ struct ext4_allocation_context {  	/* copy of the best found extent taken before preallocation efforts */  	struct ext4_free_extent ac_f_ex; -	/* number of iterations done. we have to track to limit searching */ -	unsigned long ac_ex_scanned;  	__u16 ac_groups_scanned;  	__u16 ac_found;  	__u16 ac_tail; diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index 2ae73a80c19..ec092437d3e 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c @@ -505,7 +505,7 @@ int ext4_ext_migrate(struct inode *inode)  	 * with i_data_sem held to prevent racing with block  	 * allocation.  	 
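 * (The read side suffices here: setting the EXT4_STATE_EXT_MIGRATE
 * flag does not change the block maps; it only needs to be ordered
 * against allocators, which take i_data_sem for writing.)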
*/ -	down_read((&EXT4_I(inode)->i_data_sem)); +	down_read(&EXT4_I(inode)->i_data_sem);  	ext4_set_inode_state(inode, EXT4_STATE_EXT_MIGRATE);  	up_read((&EXT4_I(inode)->i_data_sem)); diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c index 214461e42a0..32bce844c2e 100644 --- a/fs/ext4/mmp.c +++ b/fs/ext4/mmp.c @@ -18,7 +18,7 @@ static __le32 ext4_mmp_csum(struct super_block *sb, struct mmp_struct *mmp)  	return cpu_to_le32(csum);  } -int ext4_mmp_csum_verify(struct super_block *sb, struct mmp_struct *mmp) +static int ext4_mmp_csum_verify(struct super_block *sb, struct mmp_struct *mmp)  {  	if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,  				       EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) @@ -27,7 +27,7 @@ int ext4_mmp_csum_verify(struct super_block *sb, struct mmp_struct *mmp)  	return mmp->mmp_checksum == ext4_mmp_csum(sb, mmp);  } -void ext4_mmp_csum_set(struct super_block *sb, struct mmp_struct *mmp) +static void ext4_mmp_csum_set(struct super_block *sb, struct mmp_struct *mmp)  {  	if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,  				       EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) @@ -259,7 +259,7 @@ static unsigned int mmp_new_seq(void)  	u32 new_seq;  	do { -		get_random_bytes(&new_seq, sizeof(u32)); +		new_seq = prandom_u32();  	} while (new_seq > EXT4_MMP_SEQ_MAX);  	return new_seq; diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index 7fa4d855dbd..2484c7ec6a7 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -57,8 +57,8 @@ get_ext_path(struct inode *inode, ext4_lblk_t lblock,  static void  copy_extent_status(struct ext4_extent *src, struct ext4_extent *dest)  { -	if (ext4_ext_is_uninitialized(src)) -		ext4_ext_mark_uninitialized(dest); +	if (ext4_ext_is_unwritten(src)) +		ext4_ext_mark_unwritten(dest);  	else  		dest->ee_len = cpu_to_le16(ext4_ext_get_actual_len(dest));  } @@ -76,7 +76,7 @@ copy_extent_status(struct ext4_extent *src, struct ext4_extent *dest)   * ext4_ext_path structure refers to the last extent, or a negative error   * value on failure.   */ -static int +int  mext_next_extent(struct inode *inode, struct ext4_ext_path *path,  		      struct ext4_extent **extent)  { @@ -391,6 +391,7 @@ mext_insert_extents(handle_t *handle, struct inode *orig_inode,  	if (depth) {  		/* Register to journal */ +		BUFFER_TRACE(orig_path->p_bh, "get_write_access");  		ret = ext4_journal_get_write_access(handle, orig_path->p_bh);  		if (ret)  			return ret; @@ -593,14 +594,14 @@ mext_calc_swap_extents(struct ext4_extent *tmp_dext,   * @inode:		inode in question   * @from:		block offset of inode   * @count:		block count to be checked - * @uninit:		extents expected to be uninitialized + * @unwritten:		extents expected to be unwritten   * @err:		pointer to save error value   *   * Return 1 if all extents in range has expected type, and zero otherwise.   
*/  static int  mext_check_coverage(struct inode *inode, ext4_lblk_t from, ext4_lblk_t count, -			  int uninit, int *err) +		    int unwritten, int *err)  {  	struct ext4_ext_path *path = NULL;  	struct ext4_extent *ext; @@ -611,7 +612,7 @@ mext_check_coverage(struct inode *inode, ext4_lblk_t from, ext4_lblk_t count,  		if (*err)  			goto out;  		ext = path[ext_depth(inode)].p_ext; -		if (uninit != ext4_ext_is_uninitialized(ext)) +		if (unwritten != ext4_ext_is_unwritten(ext))  			goto out;  		from += ext4_ext_get_actual_len(ext);  		ext4_ext_drop_refs(path); @@ -861,8 +862,7 @@ mext_page_mkuptodate(struct page *page, unsigned from, unsigned to)  			}  			if (!buffer_mapped(bh)) {  				zero_user(page, block_start, blocksize); -				if (!err) -					set_buffer_uptodate(bh); +				set_buffer_uptodate(bh);  				continue;  			}  		} @@ -895,7 +895,7 @@ out:   * @orig_page_offset:		page index on original file   * @data_offset_in_page:	block index where data swapping starts   * @block_len_in_page:		the number of blocks to be swapped - * @uninit:			orig extent is uninitialized or not + * @unwritten:			orig extent is unwritten or not   * @err:			pointer to save return value   *   * Save the data in original inode blocks and replace original inode extents @@ -906,7 +906,7 @@ out:  static int  move_extent_per_page(struct file *o_filp, struct inode *donor_inode,  		  pgoff_t orig_page_offset, int data_offset_in_page, -		  int block_len_in_page, int uninit, int *err) +		  int block_len_in_page, int unwritten, int *err)  {  	struct inode *orig_inode = file_inode(o_filp);  	struct page *pagep[2] = {NULL, NULL}; @@ -963,27 +963,27 @@ again:  	if (unlikely(*err < 0))  		goto stop_journal;  	/* -	 * If orig extent was uninitialized it can become initialized +	 * If orig extent was unwritten it can become initialized  	 * at any time after i_data_sem was dropped, in order to  	 * serialize with delalloc we have recheck extent while we  	 * hold page's lock, if it is still the case data copy is not  	 * necessary, just swap data blocks between orig and donor.  	 
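
The comment above describes a recheck-under-lock pattern: a fact observed under i_data_sem can go stale once the semaphore is dropped, so it must be re-verified after the locks are re-taken before acting on it. The same shape in a self-contained toy, with a pthread mutex standing in for the kernel locking and one boolean standing in for the extent state:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static bool range_unwritten = true; /* toy stand-in for extent state */

int main(void)
{
    bool unwritten;

    pthread_mutex_lock(&lock);
    unwritten = range_unwritten;    /* first observation */
    pthread_mutex_unlock(&lock);

    /* lock dropped: another thread may initialize the range here */

    pthread_mutex_lock(&lock);
    if (unwritten && !range_unwritten)
        puts("state changed: fall back to data copy");
    else if (unwritten)
        puts("still unwritten: swap blocks without copying");
    pthread_mutex_unlock(&lock);
    return 0;
}
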
*/ -	if (uninit) { +	if (unwritten) {  		ext4_double_down_write_data_sem(orig_inode, donor_inode);  		/* If any of extents in range became initialized we have to  		 * fallback to data copying */ -		uninit = mext_check_coverage(orig_inode, orig_blk_offset, -					     block_len_in_page, 1, err); +		unwritten = mext_check_coverage(orig_inode, orig_blk_offset, +						block_len_in_page, 1, err);  		if (*err)  			goto drop_data_sem; -		uninit &= mext_check_coverage(donor_inode, orig_blk_offset, -					      block_len_in_page, 1, err); +		unwritten &= mext_check_coverage(donor_inode, orig_blk_offset, +						 block_len_in_page, 1, err);  		if (*err)  			goto drop_data_sem; -		if (!uninit) { +		if (!unwritten) {  			ext4_double_up_write_data_sem(orig_inode, donor_inode);  			goto data_copy;  		} @@ -1203,42 +1203,6 @@ mext_check_arguments(struct inode *orig_inode,  }  /** - * ext4_inode_double_lock - Lock i_mutex on both @inode1 and @inode2 - * - * @inode1:	the inode structure - * @inode2:	the inode structure - * - * Lock two inodes' i_mutex - */ -void -ext4_inode_double_lock(struct inode *inode1, struct inode *inode2) -{ -	BUG_ON(inode1 == inode2); -	if (inode1 < inode2) { -		mutex_lock_nested(&inode1->i_mutex, I_MUTEX_PARENT); -		mutex_lock_nested(&inode2->i_mutex, I_MUTEX_CHILD); -	} else { -		mutex_lock_nested(&inode2->i_mutex, I_MUTEX_PARENT); -		mutex_lock_nested(&inode1->i_mutex, I_MUTEX_CHILD); -	} -} - -/** - * ext4_inode_double_unlock - Release i_mutex on both @inode1 and @inode2 - * - * @inode1:     the inode that is released first - * @inode2:     the inode that is released second - * - */ - -void -ext4_inode_double_unlock(struct inode *inode1, struct inode *inode2) -{ -	mutex_unlock(&inode1->i_mutex); -	mutex_unlock(&inode2->i_mutex); -} - -/**   * ext4_move_extents - Exchange the specified range of a file   *   * @o_filp:		file structure of the original file @@ -1296,7 +1260,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,  	int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits;  	int data_offset_in_page;  	int block_len_in_page; -	int uninit; +	int unwritten;  	if (orig_inode->i_sb != donor_inode->i_sb) {  		ext4_debug("ext4 move extent: The argument files " @@ -1327,7 +1291,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,  		return -EINVAL;  	}  	/* Protect orig and donor inodes against a truncate */ -	ext4_inode_double_lock(orig_inode, donor_inode); +	lock_two_nondirectories(orig_inode, donor_inode);  	/* Wait for all existing dio workers */  	ext4_inode_block_unlocked_dio(orig_inode); @@ -1428,8 +1392,8 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,  		    !last_extent)  			continue; -		/* Is original extent is uninitialized */ -		uninit = ext4_ext_is_uninitialized(ext_prev); +		/* Is original extent is unwritten */ +		unwritten = ext4_ext_is_unwritten(ext_prev);  		data_offset_in_page = seq_start % blocks_per_page; @@ -1469,8 +1433,8 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,  						o_filp, donor_inode,  						orig_page_offset,  						data_offset_in_page, -						block_len_in_page, uninit, -						&ret); +						block_len_in_page, +						unwritten, &ret);  			/* Count how many blocks we have exchanged */  			*moved_len += block_len_in_page; @@ -1535,7 +1499,7 @@ out:  	ext4_double_up_write_data_sem(orig_inode, donor_inode);  	ext4_inode_resume_unlocked_dio(orig_inode);  	ext4_inode_resume_unlocked_dio(donor_inode); -	ext4_inode_double_unlock(orig_inode, donor_inode); +	unlock_two_nondirectories(orig_inode, 
donor_inode);  	return ret;  } diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 1bec5a5c1e4..3520ab8a663 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -67,6 +67,7 @@ static struct buffer_head *ext4_append(handle_t *handle,  		return ERR_PTR(err);  	inode->i_size += inode->i_sb->s_blocksize;  	EXT4_I(inode)->i_disksize = inode->i_size; +	BUFFER_TRACE(bh, "get_write_access");  	err = ext4_journal_get_write_access(handle, bh);  	if (err) {  		brelse(bh); @@ -1425,9 +1426,8 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsi  			return ERR_PTR(-EIO);  		}  		if (unlikely(ino == dir->i_ino)) { -			EXT4_ERROR_INODE(dir, "'%.*s' linked to parent dir", -					 dentry->d_name.len, -					 dentry->d_name.name); +			EXT4_ERROR_INODE(dir, "'%pd' linked to parent dir", +					 dentry);  			return ERR_PTR(-EIO);  		}  		inode = ext4_iget(dir->i_sb, ino); @@ -1779,6 +1779,7 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,  	blocksize =  dir->i_sb->s_blocksize;  	dxtrace(printk(KERN_DEBUG "Creating index: inode %lu\n", dir->i_ino)); +	BUFFER_TRACE(bh, "get_write_access");  	retval = ext4_journal_get_write_access(handle, bh);  	if (retval) {  		ext4_std_error(dir->i_sb, retval); @@ -2319,7 +2320,7 @@ retry:  		d_tmpfile(dentry, inode);  		err = ext4_orphan_add(handle, inode);  		if (err) -			goto err_drop_inode; +			goto err_unlock_inode;  		mark_inode_dirty(inode);  		unlock_new_inode(inode);  	} @@ -2328,10 +2329,9 @@ retry:  	if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))  		goto retry;  	return err; -err_drop_inode: +err_unlock_inode:  	ext4_journal_stop(handle);  	unlock_new_inode(inode); -	iput(inode);  	return err;  } @@ -2512,8 +2512,7 @@ static int empty_dir(struct inode *inode)  		 ext4_rec_len_from_disk(de1->rec_len, sb->s_blocksize);  	de = ext4_next_entry(de1, sb->s_blocksize);  	while (offset < inode->i_size) { -		if (!bh || -		    (void *) de >= (void *) (bh->b_data+sb->s_blocksize)) { +		if ((void *) de >= (void *) (bh->b_data+sb->s_blocksize)) {  			unsigned int lblock;  			err = 0;  			brelse(bh); @@ -2541,26 +2540,37 @@ static int empty_dir(struct inode *inode)  	return 1;  } -/* ext4_orphan_add() links an unlinked or truncated inode into a list of +/* + * ext4_orphan_add() links an unlinked or truncated inode into a list of   * such inodes, starting at the superblock, in case we crash before the   * file is closed/deleted, or in case the inode truncate spans multiple   * transactions and the last transaction is not recovered after a crash.   *   * At filesystem recovery time, we walk this list deleting unlinked   * inodes and truncating linked inodes in ext4_orphan_cleanup(). + * + * Orphan list manipulation functions must be called under i_mutex unless + * we are just creating the inode or deleting it.   */  int ext4_orphan_add(handle_t *handle, struct inode *inode)  {  	struct super_block *sb = inode->i_sb; +	struct ext4_sb_info *sbi = EXT4_SB(sb);  	struct ext4_iloc iloc;  	int err = 0, rc; +	bool dirty = false; -	if (!EXT4_SB(sb)->s_journal) +	if (!sbi->s_journal)  		return 0; -	mutex_lock(&EXT4_SB(sb)->s_orphan_lock); +	WARN_ON_ONCE(!(inode->i_state & (I_NEW | I_FREEING)) && +		     !mutex_is_locked(&inode->i_mutex)); +	/* +	 * Exit early if inode already is on orphan list. This is a big speedup +	 * since we don't have to contend on the global s_orphan_lock. 
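
The fast path added here is sound because list_empty() on the per-inode entry only changes under locks the caller already holds, so a repeat ext4_orphan_add() can return without touching the global lock at all. A compilable miniature of that shape; list_head, list_add(), orphan_add() and s_orphan are hand-rolled stand-ins for the kernel primitives:

#include <pthread.h>
#include <stdio.h>

struct list_head { struct list_head *next, *prev; };

#define LIST_INIT(name) { &(name), &(name) }

static pthread_mutex_t s_orphan_lock = PTHREAD_MUTEX_INITIALIZER;
static struct list_head s_orphan = LIST_INIT(s_orphan);

static int list_empty(const struct list_head *h) { return h->next == h; }

static void list_add(struct list_head *n, struct list_head *h)
{
    n->next = h->next;
    n->prev = h;
    h->next->prev = n;
    h->next = n;
}

/* caller serializes per inode (i_mutex in the patch above) */
static void orphan_add(struct list_head *i_orphan)
{
    if (!list_empty(i_orphan))  /* fast path: skip the global lock */
        return;
    pthread_mutex_lock(&s_orphan_lock);
    list_add(i_orphan, &s_orphan);
    pthread_mutex_unlock(&s_orphan_lock);
}

int main(void)
{
    /* self-pointing, as list_del_init() leaves it */
    struct list_head i_orphan = LIST_INIT(i_orphan);

    orphan_add(&i_orphan);
    orphan_add(&i_orphan);  /* returns without locking */
    printf("queued: %d\n", !list_empty(&i_orphan));
    return 0;
}
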
+	 */  	if (!list_empty(&EXT4_I(inode)->i_orphan)) -		goto out_unlock; +		return 0;  	/*  	 * Orphan handling is only valid for files with data blocks @@ -2571,48 +2581,51 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode)  	J_ASSERT((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||  		  S_ISLNK(inode->i_mode)) || inode->i_nlink == 0); -	BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get_write_access"); -	err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh); +	BUFFER_TRACE(sbi->s_sbh, "get_write_access"); +	err = ext4_journal_get_write_access(handle, sbi->s_sbh);  	if (err) -		goto out_unlock; +		goto out;  	err = ext4_reserve_inode_write(handle, inode, &iloc);  	if (err) -		goto out_unlock; +		goto out; + +	mutex_lock(&sbi->s_orphan_lock);  	/*  	 * Due to previous errors inode may be already a part of on-disk  	 * orphan list. If so skip on-disk list modification.  	 */ -	if (NEXT_ORPHAN(inode) && NEXT_ORPHAN(inode) <= -		(le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count))) -			goto mem_insert; - -	/* Insert this inode at the head of the on-disk orphan list... */ -	NEXT_ORPHAN(inode) = le32_to_cpu(EXT4_SB(sb)->s_es->s_last_orphan); -	EXT4_SB(sb)->s_es->s_last_orphan = cpu_to_le32(inode->i_ino); -	err = ext4_handle_dirty_super(handle, sb); -	rc = ext4_mark_iloc_dirty(handle, inode, &iloc); -	if (!err) -		err = rc; - -	/* Only add to the head of the in-memory list if all the -	 * previous operations succeeded.  If the orphan_add is going to -	 * fail (possibly taking the journal offline), we can't risk -	 * leaving the inode on the orphan list: stray orphan-list -	 * entries can cause panics at unmount time. -	 * -	 * This is safe: on error we're going to ignore the orphan list -	 * anyway on the next recovery. */ -mem_insert: -	if (!err) -		list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan); - +	if (!NEXT_ORPHAN(inode) || NEXT_ORPHAN(inode) > +	    (le32_to_cpu(sbi->s_es->s_inodes_count))) { +		/* Insert this inode at the head of the on-disk orphan list */ +		NEXT_ORPHAN(inode) = le32_to_cpu(sbi->s_es->s_last_orphan); +		sbi->s_es->s_last_orphan = cpu_to_le32(inode->i_ino); +		dirty = true; +	} +	list_add(&EXT4_I(inode)->i_orphan, &sbi->s_orphan); +	mutex_unlock(&sbi->s_orphan_lock); + +	if (dirty) { +		err = ext4_handle_dirty_super(handle, sb); +		rc = ext4_mark_iloc_dirty(handle, inode, &iloc); +		if (!err) +			err = rc; +		if (err) { +			/* +			 * We have to remove inode from in-memory list if +			 * addition to on disk orphan list failed. Stray orphan +			 * list entries can cause panics at unmount time. 
+			 */ +			mutex_lock(&sbi->s_orphan_lock); +			list_del(&EXT4_I(inode)->i_orphan); +			mutex_unlock(&sbi->s_orphan_lock); +		} +	}  	jbd_debug(4, "superblock will point to %lu\n", inode->i_ino);  	jbd_debug(4, "orphan inode %lu will point to %d\n",  			inode->i_ino, NEXT_ORPHAN(inode)); -out_unlock: -	mutex_unlock(&EXT4_SB(sb)->s_orphan_lock); -	ext4_std_error(inode->i_sb, err); +out: +	ext4_std_error(sb, err);  	return err;  } @@ -2624,45 +2637,51 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode)  {  	struct list_head *prev;  	struct ext4_inode_info *ei = EXT4_I(inode); -	struct ext4_sb_info *sbi; +	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);  	__u32 ino_next;  	struct ext4_iloc iloc;  	int err = 0; -	if ((!EXT4_SB(inode->i_sb)->s_journal) && -	    !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) +	if (!sbi->s_journal && !(sbi->s_mount_state & EXT4_ORPHAN_FS))  		return 0; -	mutex_lock(&EXT4_SB(inode->i_sb)->s_orphan_lock); +	WARN_ON_ONCE(!(inode->i_state & (I_NEW | I_FREEING)) && +		     !mutex_is_locked(&inode->i_mutex)); +	/* Do this quick check before taking global s_orphan_lock. */  	if (list_empty(&ei->i_orphan)) -		goto out; +		return 0; -	ino_next = NEXT_ORPHAN(inode); -	prev = ei->i_orphan.prev; -	sbi = EXT4_SB(inode->i_sb); +	if (handle) { +		/* Grab inode buffer early before taking global s_orphan_lock */ +		err = ext4_reserve_inode_write(handle, inode, &iloc); +	} +	mutex_lock(&sbi->s_orphan_lock);  	jbd_debug(4, "remove inode %lu from orphan list\n", inode->i_ino); +	prev = ei->i_orphan.prev;  	list_del_init(&ei->i_orphan);  	/* If we're on an error path, we may not have a valid  	 * transaction handle with which to update the orphan list on  	 * disk, but we still need to remove the inode from the linked  	 * list in memory. 
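
For reference, the on-disk structure edited below is a singly linked chain: the superblock's s_last_orphan names the newest orphan and each orphan's NEXT_ORPHAN field names the next, so removal only has to redirect the predecessor, which is exactly why the code keeps "prev" from the in-memory list. A toy model with arrays in place of inodes; all names here are illustrative:

#include <stdio.h>

#define SB_HEAD 0   /* inode 0 does not exist; 0 terminates the chain */

static unsigned s_last_orphan;   /* superblock field */
static unsigned next_orphan[16]; /* per-inode NEXT_ORPHAN, indexed by ino */

static void orphan_del(unsigned prev_ino, unsigned ino)
{
    unsigned next = next_orphan[ino];

    if (prev_ino == SB_HEAD)
        s_last_orphan = next;       /* head removal: fix the superblock */
    else
        next_orphan[prev_ino] = next;
    next_orphan[ino] = 0;
}

int main(void)
{
    /* chain: sb -> 7 -> 5 -> 3 -> end */
    s_last_orphan = 7;
    next_orphan[7] = 5;
    next_orphan[5] = 3;

    orphan_del(7, 5);   /* unlink inode 5 */
    printf("sb -> %u -> %u\n", s_last_orphan, next_orphan[7]); /* 7 -> 3 */
    return 0;
}
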
*/ -	if (!handle) -		goto out; - -	err = ext4_reserve_inode_write(handle, inode, &iloc); -	if (err) +	if (!handle || err) { +		mutex_unlock(&sbi->s_orphan_lock);  		goto out_err; +	} +	ino_next = NEXT_ORPHAN(inode);  	if (prev == &sbi->s_orphan) {  		jbd_debug(4, "superblock will point to %u\n", ino_next);  		BUFFER_TRACE(sbi->s_sbh, "get_write_access");  		err = ext4_journal_get_write_access(handle, sbi->s_sbh); -		if (err) +		if (err) { +			mutex_unlock(&sbi->s_orphan_lock);  			goto out_brelse; +		}  		sbi->s_es->s_last_orphan = cpu_to_le32(ino_next); +		mutex_unlock(&sbi->s_orphan_lock);  		err = ext4_handle_dirty_super(handle, inode->i_sb);  	} else {  		struct ext4_iloc iloc2; @@ -2672,20 +2691,20 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode)  		jbd_debug(4, "orphan inode %lu will point to %u\n",  			  i_prev->i_ino, ino_next);  		err = ext4_reserve_inode_write(handle, i_prev, &iloc2); -		if (err) +		if (err) { +			mutex_unlock(&sbi->s_orphan_lock);  			goto out_brelse; +		}  		NEXT_ORPHAN(i_prev) = ino_next;  		err = ext4_mark_iloc_dirty(handle, i_prev, &iloc2); +		mutex_unlock(&sbi->s_orphan_lock);  	}  	if (err)  		goto out_brelse;  	NEXT_ORPHAN(inode) = 0;  	err = ext4_mark_iloc_dirty(handle, inode, &iloc); -  out_err:  	ext4_std_error(inode->i_sb, err); -out: -	mutex_unlock(&EXT4_SB(inode->i_sb)->s_orphan_lock);  	return err;  out_brelse: @@ -3002,6 +3021,154 @@ static struct buffer_head *ext4_get_first_dir_block(handle_t *handle,  	return ext4_get_first_inline_block(inode, parent_de, retval);  } +struct ext4_renament { +	struct inode *dir; +	struct dentry *dentry; +	struct inode *inode; +	bool is_dir; +	int dir_nlink_delta; + +	/* entry for "dentry" */ +	struct buffer_head *bh; +	struct ext4_dir_entry_2 *de; +	int inlined; + +	/* entry for ".." 
in inode if it's a directory */ +	struct buffer_head *dir_bh; +	struct ext4_dir_entry_2 *parent_de; +	int dir_inlined; +}; + +static int ext4_rename_dir_prepare(handle_t *handle, struct ext4_renament *ent) +{ +	int retval; + +	ent->dir_bh = ext4_get_first_dir_block(handle, ent->inode, +					      &retval, &ent->parent_de, +					      &ent->dir_inlined); +	if (!ent->dir_bh) +		return retval; +	if (le32_to_cpu(ent->parent_de->inode) != ent->dir->i_ino) +		return -EIO; +	BUFFER_TRACE(ent->dir_bh, "get_write_access"); +	return ext4_journal_get_write_access(handle, ent->dir_bh); +} + +static int ext4_rename_dir_finish(handle_t *handle, struct ext4_renament *ent, +				  unsigned dir_ino) +{ +	int retval; + +	ent->parent_de->inode = cpu_to_le32(dir_ino); +	BUFFER_TRACE(ent->dir_bh, "call ext4_handle_dirty_metadata"); +	if (!ent->dir_inlined) { +		if (is_dx(ent->inode)) { +			retval = ext4_handle_dirty_dx_node(handle, +							   ent->inode, +							   ent->dir_bh); +		} else { +			retval = ext4_handle_dirty_dirent_node(handle, +							       ent->inode, +							       ent->dir_bh); +		} +	} else { +		retval = ext4_mark_inode_dirty(handle, ent->inode); +	} +	if (retval) { +		ext4_std_error(ent->dir->i_sb, retval); +		return retval; +	} +	return 0; +} + +static int ext4_setent(handle_t *handle, struct ext4_renament *ent, +		       unsigned ino, unsigned file_type) +{ +	int retval; + +	BUFFER_TRACE(ent->bh, "get write access"); +	retval = ext4_journal_get_write_access(handle, ent->bh); +	if (retval) +		return retval; +	ent->de->inode = cpu_to_le32(ino); +	if (EXT4_HAS_INCOMPAT_FEATURE(ent->dir->i_sb, +				      EXT4_FEATURE_INCOMPAT_FILETYPE)) +		ent->de->file_type = file_type; +	ent->dir->i_version++; +	ent->dir->i_ctime = ent->dir->i_mtime = +		ext4_current_time(ent->dir); +	ext4_mark_inode_dirty(handle, ent->dir); +	BUFFER_TRACE(ent->bh, "call ext4_handle_dirty_metadata"); +	if (!ent->inlined) { +		retval = ext4_handle_dirty_dirent_node(handle, +						       ent->dir, ent->bh); +		if (unlikely(retval)) { +			ext4_std_error(ent->dir->i_sb, retval); +			return retval; +		} +	} +	brelse(ent->bh); +	ent->bh = NULL; + +	return 0; +} + +static int ext4_find_delete_entry(handle_t *handle, struct inode *dir, +				  const struct qstr *d_name) +{ +	int retval = -ENOENT; +	struct buffer_head *bh; +	struct ext4_dir_entry_2 *de; + +	bh = ext4_find_entry(dir, d_name, &de, NULL); +	if (bh) { +		retval = ext4_delete_entry(handle, dir, de, bh); +		brelse(bh); +	} +	return retval; +} + +static void ext4_rename_delete(handle_t *handle, struct ext4_renament *ent) +{ +	int retval; +	/* +	 * ent->de could have moved from under us during htree split, so make +	 * sure that we are deleting the right entry.  We might also be pointing +	 * to a stale entry in the unused part of ent->bh so just checking inum +	 * and the name isn't enough. 
+	 */ +	if (le32_to_cpu(ent->de->inode) != ent->inode->i_ino || +	    ent->de->name_len != ent->dentry->d_name.len || +	    strncmp(ent->de->name, ent->dentry->d_name.name, +		    ent->de->name_len)) { +		retval = ext4_find_delete_entry(handle, ent->dir, +						&ent->dentry->d_name); +	} else { +		retval = ext4_delete_entry(handle, ent->dir, ent->de, ent->bh); +		if (retval == -ENOENT) { +			retval = ext4_find_delete_entry(handle, ent->dir, +							&ent->dentry->d_name); +		} +	} + +	if (retval) { +		ext4_warning(ent->dir->i_sb, +				"Deleting old file (%lu), %d, error=%d", +				ent->dir->i_ino, ent->dir->i_nlink, retval); +	} +} + +static void ext4_update_dir_count(handle_t *handle, struct ext4_renament *ent) +{ +	if (ent->dir_nlink_delta) { +		if (ent->dir_nlink_delta == -1) +			ext4_dec_count(handle, ent->dir); +		else +			ext4_inc_count(handle, ent->dir); +		ext4_mark_inode_dirty(handle, ent->dir); +	} +} +  /*   * Anybody can rename anything with this: the permission checks are left to the   * higher-level routines. @@ -3014,198 +3181,267 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,  		       struct inode *new_dir, struct dentry *new_dentry)  {  	handle_t *handle = NULL; -	struct inode *old_inode, *new_inode; -	struct buffer_head *old_bh, *new_bh, *dir_bh; -	struct ext4_dir_entry_2 *old_de, *new_de; +	struct ext4_renament old = { +		.dir = old_dir, +		.dentry = old_dentry, +		.inode = old_dentry->d_inode, +	}; +	struct ext4_renament new = { +		.dir = new_dir, +		.dentry = new_dentry, +		.inode = new_dentry->d_inode, +	};  	int retval; -	int inlined = 0, new_inlined = 0; -	struct ext4_dir_entry_2 *parent_de; -	dquot_initialize(old_dir); -	dquot_initialize(new_dir); - -	old_bh = new_bh = dir_bh = NULL; +	dquot_initialize(old.dir); +	dquot_initialize(new.dir);  	/* Initialize quotas before so that eventual writes go  	 * in separate transaction */ -	if (new_dentry->d_inode) -		dquot_initialize(new_dentry->d_inode); +	if (new.inode) +		dquot_initialize(new.inode); -	old_bh = ext4_find_entry(old_dir, &old_dentry->d_name, &old_de, NULL); +	old.bh = ext4_find_entry(old.dir, &old.dentry->d_name, &old.de, NULL);  	/*  	 *  Check for inode number is _not_ due to possible IO errors.  	 *  We might rmdir the source, keep it as pwd of some process  	 *  and merrily kill the link to whatever was created under the  	 *  same name. 
Goodbye sticky bit ;-<  	 */ -	old_inode = old_dentry->d_inode;  	retval = -ENOENT; -	if (!old_bh || le32_to_cpu(old_de->inode) != old_inode->i_ino) +	if (!old.bh || le32_to_cpu(old.de->inode) != old.inode->i_ino)  		goto end_rename; -	new_inode = new_dentry->d_inode; -	new_bh = ext4_find_entry(new_dir, &new_dentry->d_name, -				 &new_de, &new_inlined); -	if (new_bh) { -		if (!new_inode) { -			brelse(new_bh); -			new_bh = NULL; +	new.bh = ext4_find_entry(new.dir, &new.dentry->d_name, +				 &new.de, &new.inlined); +	if (new.bh) { +		if (!new.inode) { +			brelse(new.bh); +			new.bh = NULL;  		}  	} -	if (new_inode && !test_opt(new_dir->i_sb, NO_AUTO_DA_ALLOC)) -		ext4_alloc_da_blocks(old_inode); +	if (new.inode && !test_opt(new.dir->i_sb, NO_AUTO_DA_ALLOC)) +		ext4_alloc_da_blocks(old.inode); -	handle = ext4_journal_start(old_dir, EXT4_HT_DIR, -		(2 * EXT4_DATA_TRANS_BLOCKS(old_dir->i_sb) + +	handle = ext4_journal_start(old.dir, EXT4_HT_DIR, +		(2 * EXT4_DATA_TRANS_BLOCKS(old.dir->i_sb) +  		 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 2));  	if (IS_ERR(handle))  		return PTR_ERR(handle); -	if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir)) +	if (IS_DIRSYNC(old.dir) || IS_DIRSYNC(new.dir))  		ext4_handle_sync(handle); -	if (S_ISDIR(old_inode->i_mode)) { -		if (new_inode) { +	if (S_ISDIR(old.inode->i_mode)) { +		if (new.inode) {  			retval = -ENOTEMPTY; -			if (!empty_dir(new_inode)) +			if (!empty_dir(new.inode)) +				goto end_rename; +		} else { +			retval = -EMLINK; +			if (new.dir != old.dir && EXT4_DIR_LINK_MAX(new.dir))  				goto end_rename;  		} -		retval = -EIO; -		dir_bh = ext4_get_first_dir_block(handle, old_inode, -						  &retval, &parent_de, -						  &inlined); -		if (!dir_bh) -			goto end_rename; -		if (le32_to_cpu(parent_de->inode) != old_dir->i_ino) -			goto end_rename; -		retval = -EMLINK; -		if (!new_inode && new_dir != old_dir && -		    EXT4_DIR_LINK_MAX(new_dir)) -			goto end_rename; -		BUFFER_TRACE(dir_bh, "get_write_access"); -		retval = ext4_journal_get_write_access(handle, dir_bh); +		retval = ext4_rename_dir_prepare(handle, &old);  		if (retval)  			goto end_rename;  	} -	if (!new_bh) { -		retval = ext4_add_entry(handle, new_dentry, old_inode); +	if (!new.bh) { +		retval = ext4_add_entry(handle, new.dentry, old.inode);  		if (retval)  			goto end_rename;  	} else { -		BUFFER_TRACE(new_bh, "get write access"); -		retval = ext4_journal_get_write_access(handle, new_bh); +		retval = ext4_setent(handle, &new, +				     old.inode->i_ino, old.de->file_type);  		if (retval)  			goto end_rename; -		new_de->inode = cpu_to_le32(old_inode->i_ino); -		if (EXT4_HAS_INCOMPAT_FEATURE(new_dir->i_sb, -					      EXT4_FEATURE_INCOMPAT_FILETYPE)) -			new_de->file_type = old_de->file_type; -		new_dir->i_version++; -		new_dir->i_ctime = new_dir->i_mtime = -					ext4_current_time(new_dir); -		ext4_mark_inode_dirty(handle, new_dir); -		BUFFER_TRACE(new_bh, "call ext4_handle_dirty_metadata"); -		if (!new_inlined) { -			retval = ext4_handle_dirty_dirent_node(handle, -							       new_dir, new_bh); -			if (unlikely(retval)) { -				ext4_std_error(new_dir->i_sb, retval); -				goto end_rename; -			} -		} -		brelse(new_bh); -		new_bh = NULL;  	}  	/*  	 * Like most other Unix systems, set the ctime for inodes on a  	 * rename.  	 
*/ -	old_inode->i_ctime = ext4_current_time(old_inode); -	ext4_mark_inode_dirty(handle, old_inode); +	old.inode->i_ctime = ext4_current_time(old.inode); +	ext4_mark_inode_dirty(handle, old.inode);  	/*  	 * ok, that's it  	 */ -	if (le32_to_cpu(old_de->inode) != old_inode->i_ino || -	    old_de->name_len != old_dentry->d_name.len || -	    strncmp(old_de->name, old_dentry->d_name.name, old_de->name_len) || -	    (retval = ext4_delete_entry(handle, old_dir, -					old_de, old_bh)) == -ENOENT) { -		/* old_de could have moved from under us during htree split, so -		 * make sure that we are deleting the right entry.  We might -		 * also be pointing to a stale entry in the unused part of -		 * old_bh so just checking inum and the name isn't enough. */ -		struct buffer_head *old_bh2; -		struct ext4_dir_entry_2 *old_de2; - -		old_bh2 = ext4_find_entry(old_dir, &old_dentry->d_name, -					  &old_de2, NULL); -		if (old_bh2) { -			retval = ext4_delete_entry(handle, old_dir, -						   old_de2, old_bh2); -			brelse(old_bh2); -		} +	ext4_rename_delete(handle, &old); + +	if (new.inode) { +		ext4_dec_count(handle, new.inode); +		new.inode->i_ctime = ext4_current_time(new.inode);  	} -	if (retval) { -		ext4_warning(old_dir->i_sb, -				"Deleting old file (%lu), %d, error=%d", -				old_dir->i_ino, old_dir->i_nlink, retval); -	} - -	if (new_inode) { -		ext4_dec_count(handle, new_inode); -		new_inode->i_ctime = ext4_current_time(new_inode); -	} -	old_dir->i_ctime = old_dir->i_mtime = ext4_current_time(old_dir); -	ext4_update_dx_flag(old_dir); -	if (dir_bh) { -		parent_de->inode = cpu_to_le32(new_dir->i_ino); -		BUFFER_TRACE(dir_bh, "call ext4_handle_dirty_metadata"); -		if (!inlined) { -			if (is_dx(old_inode)) { -				retval = ext4_handle_dirty_dx_node(handle, -								   old_inode, -								   dir_bh); -			} else { -				retval = ext4_handle_dirty_dirent_node(handle, -							old_inode, dir_bh); -			} -		} else { -			retval = ext4_mark_inode_dirty(handle, old_inode); -		} -		if (retval) { -			ext4_std_error(old_dir->i_sb, retval); +	old.dir->i_ctime = old.dir->i_mtime = ext4_current_time(old.dir); +	ext4_update_dx_flag(old.dir); +	if (old.dir_bh) { +		retval = ext4_rename_dir_finish(handle, &old, new.dir->i_ino); +		if (retval)  			goto end_rename; -		} -		ext4_dec_count(handle, old_dir); -		if (new_inode) { + +		ext4_dec_count(handle, old.dir); +		if (new.inode) {  			/* checked empty_dir above, can't have another parent,  			 * ext4_dec_count() won't work for many-linked dirs */ -			clear_nlink(new_inode); +			clear_nlink(new.inode);  		} else { -			ext4_inc_count(handle, new_dir); -			ext4_update_dx_flag(new_dir); -			ext4_mark_inode_dirty(handle, new_dir); +			ext4_inc_count(handle, new.dir); +			ext4_update_dx_flag(new.dir); +			ext4_mark_inode_dirty(handle, new.dir);  		}  	} -	ext4_mark_inode_dirty(handle, old_dir); -	if (new_inode) { -		ext4_mark_inode_dirty(handle, new_inode); -		if (!new_inode->i_nlink) -			ext4_orphan_add(handle, new_inode); +	ext4_mark_inode_dirty(handle, old.dir); +	if (new.inode) { +		ext4_mark_inode_dirty(handle, new.inode); +		if (!new.inode->i_nlink) +			ext4_orphan_add(handle, new.inode); +	} +	retval = 0; + +end_rename: +	brelse(old.dir_bh); +	brelse(old.bh); +	brelse(new.bh); +	if (handle) +		ext4_journal_stop(handle); +	return retval; +} + +static int ext4_cross_rename(struct inode *old_dir, struct dentry *old_dentry, +			     struct inode *new_dir, struct dentry *new_dentry) +{ +	handle_t *handle = NULL; +	struct ext4_renament old = { +		.dir = old_dir, +		.dentry = 
old_dentry, +		.inode = old_dentry->d_inode, +	}; +	struct ext4_renament new = { +		.dir = new_dir, +		.dentry = new_dentry, +		.inode = new_dentry->d_inode, +	}; +	u8 new_file_type; +	int retval; + +	dquot_initialize(old.dir); +	dquot_initialize(new.dir); + +	old.bh = ext4_find_entry(old.dir, &old.dentry->d_name, +				 &old.de, &old.inlined); +	/* +	 *  Check for inode number is _not_ due to possible IO errors. +	 *  We might rmdir the source, keep it as pwd of some process +	 *  and merrily kill the link to whatever was created under the +	 *  same name. Goodbye sticky bit ;-< +	 */ +	retval = -ENOENT; +	if (!old.bh || le32_to_cpu(old.de->inode) != old.inode->i_ino) +		goto end_rename; + +	new.bh = ext4_find_entry(new.dir, &new.dentry->d_name, +				 &new.de, &new.inlined); + +	/* RENAME_EXCHANGE case: old *and* new must both exist */ +	if (!new.bh || le32_to_cpu(new.de->inode) != new.inode->i_ino) +		goto end_rename; + +	handle = ext4_journal_start(old.dir, EXT4_HT_DIR, +		(2 * EXT4_DATA_TRANS_BLOCKS(old.dir->i_sb) + +		 2 * EXT4_INDEX_EXTRA_TRANS_BLOCKS + 2)); +	if (IS_ERR(handle)) +		return PTR_ERR(handle); + +	if (IS_DIRSYNC(old.dir) || IS_DIRSYNC(new.dir)) +		ext4_handle_sync(handle); + +	if (S_ISDIR(old.inode->i_mode)) { +		old.is_dir = true; +		retval = ext4_rename_dir_prepare(handle, &old); +		if (retval) +			goto end_rename; +	} +	if (S_ISDIR(new.inode->i_mode)) { +		new.is_dir = true; +		retval = ext4_rename_dir_prepare(handle, &new); +		if (retval) +			goto end_rename;  	} + +	/* +	 * Other than the special case of overwriting a directory, parents' +	 * nlink only needs to be modified if this is a cross directory rename. +	 */ +	if (old.dir != new.dir && old.is_dir != new.is_dir) { +		old.dir_nlink_delta = old.is_dir ? -1 : 1; +		new.dir_nlink_delta = -old.dir_nlink_delta; +		retval = -EMLINK; +		if ((old.dir_nlink_delta > 0 && EXT4_DIR_LINK_MAX(old.dir)) || +		    (new.dir_nlink_delta > 0 && EXT4_DIR_LINK_MAX(new.dir))) +			goto end_rename; +	} + +	new_file_type = new.de->file_type; +	retval = ext4_setent(handle, &new, old.inode->i_ino, old.de->file_type); +	if (retval) +		goto end_rename; + +	retval = ext4_setent(handle, &old, new.inode->i_ino, new_file_type); +	if (retval) +		goto end_rename; + +	/* +	 * Like most other Unix systems, set the ctime for inodes on a +	 * rename. +	 */ +	old.inode->i_ctime = ext4_current_time(old.inode); +	new.inode->i_ctime = ext4_current_time(new.inode); +	ext4_mark_inode_dirty(handle, old.inode); +	ext4_mark_inode_dirty(handle, new.inode); + +	if (old.dir_bh) { +		retval = ext4_rename_dir_finish(handle, &old, new.dir->i_ino); +		if (retval) +			goto end_rename; +	} +	if (new.dir_bh) { +		retval = ext4_rename_dir_finish(handle, &new, old.dir->i_ino); +		if (retval) +			goto end_rename; +	} +	ext4_update_dir_count(handle, &old); +	ext4_update_dir_count(handle, &new);  	retval = 0;  end_rename: -	brelse(dir_bh); -	brelse(old_bh); -	brelse(new_bh); +	brelse(old.dir_bh); +	brelse(new.dir_bh); +	brelse(old.bh); +	brelse(new.bh);  	if (handle)  		ext4_journal_stop(handle);  	return retval;  } +static int ext4_rename2(struct inode *old_dir, struct dentry *old_dentry, +			struct inode *new_dir, struct dentry *new_dentry, +			unsigned int flags) +{ +	if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE)) +		return -EINVAL; + +	if (flags & RENAME_EXCHANGE) { +		return ext4_cross_rename(old_dir, old_dentry, +					 new_dir, new_dentry); +	} +	/* +	 * Existence checking was done by the VFS, otherwise "RENAME_NOREPLACE" +	 * is equivalent to regular rename. 
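
From user space the new ->rename2 path is reached through the renameat2(2) system call; with RENAME_EXCHANGE the two names are swapped atomically. A sketch of the call using a raw syscall, since libc wrappers for it may be missing on systems contemporary with this patch; "a" and "b" are placeholder paths that must already exist:

#define _GNU_SOURCE
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/syscall.h>

#ifndef SYS_renameat2
#error "renameat2 is not known to these kernel headers"
#endif

#ifndef RENAME_EXCHANGE
#define RENAME_EXCHANGE (1 << 1)
#endif

int main(void)
{
    long ret = syscall(SYS_renameat2, AT_FDCWD, "a", AT_FDCWD, "b",
                       RENAME_EXCHANGE);

    if (ret < 0)
        perror("renameat2");
    return ret ? 1 : 0;
}
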
+	 */ +	return ext4_rename(old_dir, old_dentry, new_dir, new_dentry); +} +  /*   * directories can handle most operations...   */ @@ -3220,12 +3456,14 @@ const struct inode_operations ext4_dir_inode_operations = {  	.mknod		= ext4_mknod,  	.tmpfile	= ext4_tmpfile,  	.rename		= ext4_rename, +	.rename2	= ext4_rename2,  	.setattr	= ext4_setattr,  	.setxattr	= generic_setxattr,  	.getxattr	= generic_getxattr,  	.listxattr	= ext4_listxattr,  	.removexattr	= generic_removexattr,  	.get_acl	= ext4_get_acl, +	.set_acl	= ext4_set_acl,  	.fiemap         = ext4_fiemap,  }; @@ -3236,4 +3474,5 @@ const struct inode_operations ext4_special_inode_operations = {  	.listxattr	= ext4_listxattr,  	.removexattr	= generic_removexattr,  	.get_acl	= ext4_get_acl, +	.set_acl	= ext4_set_acl,  }; diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index d7d0c7b46ed..b24a2541a9b 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -65,9 +65,9 @@ static void ext4_finish_bio(struct bio *bio)  {  	int i;  	int error = !test_bit(BIO_UPTODATE, &bio->bi_flags); +	struct bio_vec *bvec; -	for (i = 0; i < bio->bi_vcnt; i++) { -		struct bio_vec *bvec = &bio->bi_io_vec[i]; +	bio_for_each_segment_all(bvec, bio, i) {  		struct page *page = bvec->bv_page;  		struct buffer_head *bh, *head;  		unsigned bio_start = bvec->bv_offset; @@ -197,14 +197,15 @@ static void dump_completed_IO(struct inode *inode, struct list_head *head)  static void ext4_add_complete_io(ext4_io_end_t *io_end)  {  	struct ext4_inode_info *ei = EXT4_I(io_end->inode); +	struct ext4_sb_info *sbi = EXT4_SB(io_end->inode->i_sb);  	struct workqueue_struct *wq;  	unsigned long flags;  	/* Only reserved conversions from writeback should enter here */  	WARN_ON(!(io_end->flag & EXT4_IO_END_UNWRITTEN)); -	WARN_ON(!io_end->handle); +	WARN_ON(!io_end->handle && sbi->s_journal);  	spin_lock_irqsave(&ei->i_completed_io_lock, flags); -	wq = EXT4_SB(io_end->inode->i_sb)->rsv_conversion_wq; +	wq = sbi->rsv_conversion_wq;  	if (list_empty(&ei->i_rsv_conversion_list))  		queue_work(wq, &ei->i_rsv_conversion_work);  	list_add_tail(&io_end->list, &ei->i_rsv_conversion_list); @@ -297,7 +298,7 @@ ext4_io_end_t *ext4_get_io_end(ext4_io_end_t *io_end)  static void ext4_end_bio(struct bio *bio, int error)  {  	ext4_io_end_t *io_end = bio->bi_private; -	sector_t bi_sector = bio->bi_sector; +	sector_t bi_sector = bio->bi_iter.bi_sector;  	BUG_ON(!io_end);  	bio->bi_end_io = NULL; @@ -307,13 +308,14 @@ static void ext4_end_bio(struct bio *bio, int error)  	if (error) {  		struct inode *inode = io_end->inode; -		ext4_warning(inode->i_sb, "I/O error writing to inode %lu " +		ext4_warning(inode->i_sb, "I/O error %d writing to inode %lu "  			     "(offset %llu size %ld starting block %llu)", -			     inode->i_ino, +			     error, inode->i_ino,  			     (unsigned long long) io_end->offset,  			     (long) io_end->size,  			     (unsigned long long)  			     bi_sector >> (inode->i_blkbits - 9)); +		mapping_set_error(inode->i_mapping, error);  	}  	if (io_end->flag & EXT4_IO_END_UNWRITTEN) { @@ -365,7 +367,7 @@ static int io_submit_init_bio(struct ext4_io_submit *io,  	bio = bio_alloc(GFP_NOIO, min(nvecs, BIO_MAX_PAGES));  	if (!bio)  		return -ENOMEM; -	bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9); +	bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);  	bio->bi_bdev = bh->b_bdev;  	bio->bi_end_io = ext4_end_bio;  	bio->bi_private = ext4_get_io_end(io->io_end); @@ -399,7 +401,8 @@ submit_and_retry:  int ext4_bio_write_page(struct ext4_io_submit *io,  			struct page *page,  			
int len, -			struct writeback_control *wbc) +			struct writeback_control *wbc, +			bool keep_towrite)  {  	struct inode *inode = page->mapping->host;  	unsigned block_start, blocksize; @@ -412,10 +415,24 @@ int ext4_bio_write_page(struct ext4_io_submit *io,  	BUG_ON(!PageLocked(page));  	BUG_ON(PageWriteback(page)); -	set_page_writeback(page); +	if (keep_towrite) +		set_page_writeback_keepwrite(page); +	else +		set_page_writeback(page);  	ClearPageError(page);  	/* +	 * Comments copied from block_write_full_page: +	 * +	 * The page straddles i_size.  It must be zeroed out on each and every +	 * writepage invocation because it may be mmapped.  "A file is mapped +	 * in multiples of the page size.  For a file that is not a multiple of +	 * the page size, the remaining memory is zeroed when mapped, and +	 * writes to that region are not written out to the file." +	 */ +	if (len < PAGE_CACHE_SIZE) +		zero_user_segment(page, len, PAGE_CACHE_SIZE); +	/*  	 * In the first loop we prepare and mark buffers to submit. We have to  	 * mark all buffers in the page before submitting so that  	 * end_page_writeback() cannot be called from ext4_bio_end_io() when IO @@ -426,19 +443,6 @@ int ext4_bio_write_page(struct ext4_io_submit *io,  	do {  		block_start = bh_offset(bh);  		if (block_start >= len) { -			/* -			 * Comments copied from block_write_full_page_endio: -			 * -			 * The page straddles i_size.  It must be zeroed out on -			 * each and every writepage invocation because it may -			 * be mmapped.  "A file is mapped in multiples of the -			 * page size.  For a file that is not a multiple of -			 * the  page size, the remaining memory is zeroed when -			 * mapped, and writes to that region are not written -			 * out to the file." -			 */ -			zero_user_segment(page, block_start, -					  block_start + blocksize);  			clear_buffer_dirty(bh);  			set_buffer_uptodate(bh);  			continue; diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index c5adbb318a9..bb0e80f03e2 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -42,7 +42,7 @@ int ext4_resize_begin(struct super_block *sb)  void ext4_resize_end(struct super_block *sb)  {  	clear_bit_unlock(EXT4_RESIZING, &EXT4_SB(sb)->s_resize_flags); -	smp_mb__after_clear_bit(); +	smp_mb__after_atomic();  }  static ext4_group_t ext4_meta_bg_first_group(struct super_block *sb, @@ -243,6 +243,7 @@ static int ext4_alloc_group_tables(struct super_block *sb,  	ext4_group_t group;  	ext4_group_t last_group;  	unsigned overhead; +	__u16 uninit_mask = (flexbg_size > 1) ? 
~EXT4_BG_BLOCK_UNINIT : ~0;  	BUG_ON(flex_gd->count == 0 || group_data == NULL); @@ -266,7 +267,7 @@ next_group:  	src_group++;  	for (; src_group <= last_group; src_group++) {  		overhead = ext4_group_overhead_blocks(sb, src_group); -		if (overhead != 0) +		if (overhead == 0)  			last_blk += group_data[src_group - group].blocks_count;  		else  			break; @@ -280,8 +281,7 @@ next_group:  		group = ext4_get_group_number(sb, start_blk - 1);  		group -= group_data[0].group;  		group_data[group].free_blocks_count--; -		if (flexbg_size > 1) -			flex_gd->bg_flags[group] &= ~EXT4_BG_BLOCK_UNINIT; +		flex_gd->bg_flags[group] &= uninit_mask;  	}  	/* Allocate inode bitmaps */ @@ -292,22 +292,30 @@ next_group:  		group = ext4_get_group_number(sb, start_blk - 1);  		group -= group_data[0].group;  		group_data[group].free_blocks_count--; -		if (flexbg_size > 1) -			flex_gd->bg_flags[group] &= ~EXT4_BG_BLOCK_UNINIT; +		flex_gd->bg_flags[group] &= uninit_mask;  	}  	/* Allocate inode tables */  	for (; it_index < flex_gd->count; it_index++) { -		if (start_blk + EXT4_SB(sb)->s_itb_per_group > last_blk) +		unsigned int itb = EXT4_SB(sb)->s_itb_per_group; +		ext4_fsblk_t next_group_start; + +		if (start_blk + itb > last_blk)  			goto next_group;  		group_data[it_index].inode_table = start_blk; -		group = ext4_get_group_number(sb, start_blk - 1); +		group = ext4_get_group_number(sb, start_blk); +		next_group_start = ext4_group_first_block_no(sb, group + 1);  		group -= group_data[0].group; -		group_data[group].free_blocks_count -= -					EXT4_SB(sb)->s_itb_per_group; -		if (flexbg_size > 1) -			flex_gd->bg_flags[group] &= ~EXT4_BG_BLOCK_UNINIT; +		if (start_blk + itb > next_group_start) { +			flex_gd->bg_flags[group + 1] &= uninit_mask; +			overhead = start_blk + itb - next_group_start; +			group_data[group + 1].free_blocks_count -= overhead; +			itb -= overhead; +		} + +		group_data[group].free_blocks_count -= itb; +		flex_gd->bg_flags[group] &= uninit_mask;  		start_blk += EXT4_SB(sb)->s_itb_per_group;  	} @@ -340,6 +348,7 @@ static struct buffer_head *bclean(handle_t *handle, struct super_block *sb,  	bh = sb_getblk(sb, blk);  	if (unlikely(!bh))  		return ERR_PTR(-ENOMEM); +	BUFFER_TRACE(bh, "get_write_access");  	if ((err = ext4_journal_get_write_access(handle, bh))) {  		brelse(bh);  		bh = ERR_PTR(err); @@ -401,7 +410,7 @@ static int set_flexbg_block_bitmap(struct super_block *sb, handle_t *handle,  		start = ext4_group_first_block_no(sb, group);  		group -= flex_gd->groups[0].group; -		count2 = sb->s_blocksize * 8 - (block - start); +		count2 = EXT4_BLOCKS_PER_GROUP(sb) - (block - start);  		if (count2 > count)  			count2 = count; @@ -418,6 +427,7 @@ static int set_flexbg_block_bitmap(struct super_block *sb, handle_t *handle,  		if (unlikely(!bh))  			return -ENOMEM; +		BUFFER_TRACE(bh, "get_write_access");  		err = ext4_journal_get_write_access(handle, bh);  		if (err)  			return err; @@ -510,6 +520,7 @@ static int setup_new_flex_group_blocks(struct super_block *sb,  				goto out;  			} +			BUFFER_TRACE(gdb, "get_write_access");  			err = ext4_journal_get_write_access(handle, gdb);  			if (err) {  				brelse(gdb); @@ -620,7 +631,7 @@ handle_ib:  			if (err)  				goto out;  			count = group_table_count[j]; -			start = group_data[i].block_bitmap; +			start = (&group_data[i].block_bitmap)[j];  			block = start;  		} @@ -782,14 +793,17 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,  		goto exit_dind;  	} +	BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get_write_access");  	err = 
ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh);  	if (unlikely(err))  		goto exit_dind; +	BUFFER_TRACE(gdb_bh, "get_write_access");  	err = ext4_journal_get_write_access(handle, gdb_bh);  	if (unlikely(err))  		goto exit_dind; +	BUFFER_TRACE(dind, "get_write_access");  	err = ext4_journal_get_write_access(handle, dind);  	if (unlikely(err))  		ext4_std_error(sb, err); @@ -894,6 +908,7 @@ static int add_new_gdb_meta_bg(struct super_block *sb,  	EXT4_SB(sb)->s_group_desc = n_group_desc;  	EXT4_SB(sb)->s_gdb_count++;  	ext4_kvfree(o_group_desc); +	BUFFER_TRACE(gdb_bh, "get_write_access");  	err = ext4_journal_get_write_access(handle, gdb_bh);  	if (unlikely(err))  		brelse(gdb_bh); @@ -969,6 +984,7 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode,  	}  	for (i = 0; i < reserved_gdb; i++) { +		BUFFER_TRACE(primary[i], "get_write_access");  		if ((err = ext4_journal_get_write_access(handle, primary[i])))  			goto exit_bh;  	} @@ -1076,6 +1092,7 @@ static void update_backups(struct super_block *sb, int blk_off, char *data,  		ext4_debug("update metadata backup %llu(+%llu)\n",  			   backup_block, backup_block -  			   ext4_group_first_block_no(sb, group)); +		BUFFER_TRACE(bh, "get_write_access");  		if ((err = ext4_journal_get_write_access(handle, bh)))  			break;  		lock_buffer(bh); @@ -1155,6 +1172,7 @@ static int ext4_add_new_descs(handle_t *handle, struct super_block *sb,  		 */  		if (gdb_off) {  			gdb_bh = sbi->s_group_desc[gdb_num]; +			BUFFER_TRACE(gdb_bh, "get_write_access");  			err = ext4_journal_get_write_access(handle, gdb_bh);  			if (!err && reserved_gdb && ext4_bg_num_gdb(sb, group)) @@ -1425,6 +1443,7 @@ static int ext4_flex_group_add(struct super_block *sb,  		goto exit;  	} +	BUFFER_TRACE(sbi->s_sbh, "get_write_access");  	err = ext4_journal_get_write_access(handle, sbi->s_sbh);  	if (err)  		goto exit_journal; @@ -1637,6 +1656,7 @@ static int ext4_group_extend_no_check(struct super_block *sb,  		return err;  	} +	BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get_write_access");  	err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh);  	if (err) {  		ext4_warning(sb, "error %d on journal write access", err); @@ -1796,6 +1816,7 @@ static int ext4_convert_meta_bg(struct super_block *sb, struct inode *inode)  	if (IS_ERR(handle))  		return PTR_ERR(handle); +	BUFFER_TRACE(sbi->s_sbh, "get_write_access");  	err = ext4_journal_get_write_access(handle, sbi->s_sbh);  	if (err)  		goto errout; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 2c2e6cbc6be..6df7bc611db 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -59,6 +59,7 @@ static struct kset *ext4_kset;  static struct ext4_lazy_init *ext4_li_info;  static struct mutex ext4_li_mtx;  static struct ext4_features *ext4_feat; +static int ext4_mballoc_ready;  static int ext4_load_journal(struct super_block *, struct ext4_super_block *,  			     unsigned long journal_devnum); @@ -137,8 +138,8 @@ static __le32 ext4_superblock_csum(struct super_block *sb,  	return cpu_to_le32(csum);  } -int ext4_superblock_csum_verify(struct super_block *sb, -				struct ext4_super_block *es) +static int ext4_superblock_csum_verify(struct super_block *sb, +				       struct ext4_super_block *es)  {  	if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,  				       EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) @@ -411,20 +412,26 @@ static void ext4_handle_error(struct super_block *sb)  			sb->s_id);  } +#define ext4_error_ratelimit(sb)					\ +		___ratelimit(&(EXT4_SB(sb)->s_err_ratelimit_state),	\ +			     "EXT4-fs error") +  void 
__ext4_error(struct super_block *sb, const char *function,  		  unsigned int line, const char *fmt, ...)  {  	struct va_format vaf;  	va_list args; -	va_start(args, fmt); -	vaf.fmt = fmt; -	vaf.va = &args; -	printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: comm %s: %pV\n", -	       sb->s_id, function, line, current->comm, &vaf); -	va_end(args); +	if (ext4_error_ratelimit(sb)) { +		va_start(args, fmt); +		vaf.fmt = fmt; +		vaf.va = &args; +		printk(KERN_CRIT +		       "EXT4-fs error (device %s): %s:%d: comm %s: %pV\n", +		       sb->s_id, function, line, current->comm, &vaf); +		va_end(args); +	}  	save_error_info(sb, function, line); -  	ext4_handle_error(sb);  } @@ -438,22 +445,23 @@ void __ext4_error_inode(struct inode *inode, const char *function,  	es->s_last_error_ino = cpu_to_le32(inode->i_ino);  	es->s_last_error_block = cpu_to_le64(block); +	if (ext4_error_ratelimit(inode->i_sb)) { +		va_start(args, fmt); +		vaf.fmt = fmt; +		vaf.va = &args; +		if (block) +			printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: " +			       "inode #%lu: block %llu: comm %s: %pV\n", +			       inode->i_sb->s_id, function, line, inode->i_ino, +			       block, current->comm, &vaf); +		else +			printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: " +			       "inode #%lu: comm %s: %pV\n", +			       inode->i_sb->s_id, function, line, inode->i_ino, +			       current->comm, &vaf); +		va_end(args); +	}  	save_error_info(inode->i_sb, function, line); -	va_start(args, fmt); -	vaf.fmt = fmt; -	vaf.va = &args; -	if (block) -		printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: " -		       "inode #%lu: block %llu: comm %s: %pV\n", -		       inode->i_sb->s_id, function, line, inode->i_ino, -		       block, current->comm, &vaf); -	else -		printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: " -		       "inode #%lu: comm %s: %pV\n", -		       inode->i_sb->s_id, function, line, inode->i_ino, -		       current->comm, &vaf); -	va_end(args); -  	ext4_handle_error(inode->i_sb);  } @@ -469,27 +477,28 @@ void __ext4_error_file(struct file *file, const char *function,  	es = EXT4_SB(inode->i_sb)->s_es;  	es->s_last_error_ino = cpu_to_le32(inode->i_ino); +	if (ext4_error_ratelimit(inode->i_sb)) { +		path = d_path(&(file->f_path), pathname, sizeof(pathname)); +		if (IS_ERR(path)) +			path = "(unknown)"; +		va_start(args, fmt); +		vaf.fmt = fmt; +		vaf.va = &args; +		if (block) +			printk(KERN_CRIT +			       "EXT4-fs error (device %s): %s:%d: inode #%lu: " +			       "block %llu: comm %s: path %s: %pV\n", +			       inode->i_sb->s_id, function, line, inode->i_ino, +			       block, current->comm, path, &vaf); +		else +			printk(KERN_CRIT +			       "EXT4-fs error (device %s): %s:%d: inode #%lu: " +			       "comm %s: path %s: %pV\n", +			       inode->i_sb->s_id, function, line, inode->i_ino, +			       current->comm, path, &vaf); +		va_end(args); +	}  	save_error_info(inode->i_sb, function, line); -	path = d_path(&(file->f_path), pathname, sizeof(pathname)); -	if (IS_ERR(path)) -		path = "(unknown)"; -	va_start(args, fmt); -	vaf.fmt = fmt; -	vaf.va = &args; -	if (block) -		printk(KERN_CRIT -		       "EXT4-fs error (device %s): %s:%d: inode #%lu: " -		       "block %llu: comm %s: path %s: %pV\n", -		       inode->i_sb->s_id, function, line, inode->i_ino, -		       block, current->comm, path, &vaf); -	else -		printk(KERN_CRIT -		       "EXT4-fs error (device %s): %s:%d: inode #%lu: " -		       "comm %s: path %s: %pV\n", -		       inode->i_sb->s_id, function, line, inode->i_ino, -		       current->comm, path, &vaf); -	
va_end(args); -  	ext4_handle_error(inode->i_sb);  } @@ -543,11 +552,13 @@ void __ext4_std_error(struct super_block *sb, const char *function,  	    (sb->s_flags & MS_RDONLY))  		return; -	errstr = ext4_decode_error(sb, errno, nbuf); -	printk(KERN_CRIT "EXT4-fs error (device %s) in %s:%d: %s\n", -	       sb->s_id, function, line, errstr); -	save_error_info(sb, function, line); +	if (ext4_error_ratelimit(sb)) { +		errstr = ext4_decode_error(sb, errno, nbuf); +		printk(KERN_CRIT "EXT4-fs error (device %s) in %s:%d: %s\n", +		       sb->s_id, function, line, errstr); +	} +	save_error_info(sb, function, line);  	ext4_handle_error(sb);  } @@ -597,6 +608,9 @@ void __ext4_msg(struct super_block *sb,  	struct va_format vaf;  	va_list args; +	if (!___ratelimit(&(EXT4_SB(sb)->s_msg_ratelimit_state), "EXT4-fs")) +		return; +  	va_start(args, fmt);  	vaf.fmt = fmt;  	vaf.va = &args; @@ -610,6 +624,10 @@ void __ext4_warning(struct super_block *sb, const char *function,  	struct va_format vaf;  	va_list args; +	if (!___ratelimit(&(EXT4_SB(sb)->s_warning_ratelimit_state), +			  "EXT4-fs warning")) +		return; +  	va_start(args, fmt);  	vaf.fmt = fmt;  	vaf.va = &args; @@ -633,18 +651,20 @@ __acquires(bitlock)  	es->s_last_error_block = cpu_to_le64(block);  	__save_error_info(sb, function, line); -	va_start(args, fmt); - -	vaf.fmt = fmt; -	vaf.va = &args; -	printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: group %u, ", -	       sb->s_id, function, line, grp); -	if (ino) -		printk(KERN_CONT "inode %lu: ", ino); -	if (block) -		printk(KERN_CONT "block %llu:", (unsigned long long) block); -	printk(KERN_CONT "%pV\n", &vaf); -	va_end(args); +	if (ext4_error_ratelimit(sb)) { +		va_start(args, fmt); +		vaf.fmt = fmt; +		vaf.va = &args; +		printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: group %u, ", +		       sb->s_id, function, line, grp); +		if (ino) +			printk(KERN_CONT "inode %lu: ", ino); +		if (block) +			printk(KERN_CONT "block %llu:", +			       (unsigned long long) block); +		printk(KERN_CONT "%pV\n", &vaf); +		va_end(args); +	}  	if (test_opt(sb, ERRORS_CONT)) {  		ext4_commit_super(sb, 0); @@ -773,7 +793,7 @@ static void ext4_put_super(struct super_block *sb)  	}  	ext4_es_unregister_shrinker(sbi); -	del_timer(&sbi->s_err_report); +	del_timer_sync(&sbi->s_err_report);  	ext4_release_system_zone(sb);  	ext4_mb_release(sb);  	ext4_ext_release(sb); @@ -826,6 +846,10 @@ static void ext4_put_super(struct super_block *sb)  		invalidate_bdev(sbi->journal_bdev);  		ext4_blkdev_remove(sbi);  	} +	if (sbi->s_mb_cache) { +		ext4_xattr_destroy_cache(sbi->s_mb_cache); +		sbi->s_mb_cache = NULL; +	}  	if (sbi->s_mmp_tsk)  		kthread_stop(sbi->s_mmp_tsk);  	sb->s_fs_info = NULL; @@ -855,6 +879,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)  		return NULL;  	ei->vfs_inode.i_version = 1; +	spin_lock_init(&ei->i_raw_lock);  	INIT_LIST_HEAD(&ei->i_prealloc_list);  	spin_lock_init(&ei->i_prealloc_lock);  	ext4_es_init_tree(&ei->i_es_tree); @@ -921,7 +946,7 @@ static void init_once(void *foo)  	inode_init_once(&ei->vfs_inode);  } -static int init_inodecache(void) +static int __init init_inodecache(void)  {  	ext4_inode_cachep = kmem_cache_create("ext4_inode_cache",  					     sizeof(struct ext4_inode_info), @@ -1500,8 +1525,6 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token,  			arg = JBD2_DEFAULT_MAX_COMMIT_AGE;  		sbi->s_commit_interval = HZ * arg;  	} else if (token == Opt_max_batch_time) { -		if (arg == 0) -			arg = EXT4_DEF_MAX_BATCH_TIME;  		sbi->s_max_batch_time = 
arg;  	} else if (token == Opt_min_batch_time) {  		sbi->s_min_batch_time = arg; @@ -1879,7 +1902,7 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,  	if (!(sbi->s_mount_state & EXT4_VALID_FS))  		ext4_msg(sb, KERN_WARNING, "warning: mounting unchecked fs, "  			 "running e2fsck is recommended"); -	else if ((sbi->s_mount_state & EXT4_ERROR_FS)) +	else if (sbi->s_mount_state & EXT4_ERROR_FS)  		ext4_msg(sb, KERN_WARNING,  			 "warning: mounting fs with errors, "  			 "running e2fsck is recommended"); @@ -2380,6 +2403,16 @@ static ext4_fsblk_t descriptor_loc(struct super_block *sb,  	if (ext4_bg_has_super(sb, bg))  		has_super = 1; +	/* +	 * If we have a meta_bg fs with 1k blocks, group 0's GDT is at +	 * block 2, not 1.  If s_first_data_block == 0 (bigalloc is enabled +	 * on modern mke2fs or blksize > 1k on older mke2fs) then we must +	 * compensate. +	 */ +	if (sb->s_blocksize == 1024 && nr == 0 && +	    le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block) == 0) +		has_super++; +  	return (has_super + ext4_group_first_block_no(sb, bg));  } @@ -2606,6 +2639,12 @@ EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc);  EXT4_DEPRECATED_ATTR(max_writeback_mb_bump, 128);  EXT4_RW_ATTR_SBI_UI(extent_max_zeroout_kb, s_extent_max_zeroout_kb);  EXT4_ATTR(trigger_fs_error, 0200, NULL, trigger_test_error); +EXT4_RW_ATTR_SBI_UI(err_ratelimit_interval_ms, s_err_ratelimit_state.interval); +EXT4_RW_ATTR_SBI_UI(err_ratelimit_burst, s_err_ratelimit_state.burst); +EXT4_RW_ATTR_SBI_UI(warning_ratelimit_interval_ms, s_warning_ratelimit_state.interval); +EXT4_RW_ATTR_SBI_UI(warning_ratelimit_burst, s_warning_ratelimit_state.burst); +EXT4_RW_ATTR_SBI_UI(msg_ratelimit_interval_ms, s_msg_ratelimit_state.interval); +EXT4_RW_ATTR_SBI_UI(msg_ratelimit_burst, s_msg_ratelimit_state.burst);  static struct attribute *ext4_attrs[] = {  	ATTR_LIST(delayed_allocation_blocks), @@ -2623,6 +2662,12 @@ static struct attribute *ext4_attrs[] = {  	ATTR_LIST(max_writeback_mb_bump),  	ATTR_LIST(extent_max_zeroout_kb),  	ATTR_LIST(trigger_fs_error), +	ATTR_LIST(err_ratelimit_interval_ms), +	ATTR_LIST(err_ratelimit_burst), +	ATTR_LIST(warning_ratelimit_interval_ms), +	ATTR_LIST(warning_ratelimit_burst), +	ATTR_LIST(msg_ratelimit_interval_ms), +	ATTR_LIST(msg_ratelimit_burst),  	NULL,  }; @@ -2762,10 +2807,11 @@ static void print_daily_error_info(unsigned long arg)  	es = sbi->s_es;  	if (es->s_error_count) -		ext4_msg(sb, KERN_NOTICE, "error count: %u", +		/* fsck newer than v1.41.13 is needed to clean this condition. 
*/ +		ext4_msg(sb, KERN_NOTICE, "error count since last fsck: %u",  			 le32_to_cpu(es->s_error_count));  	if (es->s_first_error_time) { -		printk(KERN_NOTICE "EXT4-fs (%s): initial error at %u: %.*s:%d", +		printk(KERN_NOTICE "EXT4-fs (%s): initial error at time %u: %.*s:%d",  		       sb->s_id, le32_to_cpu(es->s_first_error_time),  		       (int) sizeof(es->s_first_error_func),  		       es->s_first_error_func, @@ -2779,7 +2825,7 @@ static void print_daily_error_info(unsigned long arg)  		printk("\n");  	}  	if (es->s_last_error_time) { -		printk(KERN_NOTICE "EXT4-fs (%s): last error at %u: %.*s:%d", +		printk(KERN_NOTICE "EXT4-fs (%s): last error at time %u: %.*s:%d",  		       sb->s_id, le32_to_cpu(es->s_last_error_time),  		       (int) sizeof(es->s_last_error_func),  		       es->s_last_error_func, @@ -3037,7 +3083,6 @@ static struct ext4_li_request *ext4_li_request_new(struct super_block *sb,  {  	struct ext4_sb_info *sbi = EXT4_SB(sb);  	struct ext4_li_request *elr; -	unsigned long rnd;  	elr = kzalloc(sizeof(*elr), GFP_KERNEL);  	if (!elr) @@ -3052,10 +3097,8 @@ static struct ext4_li_request *ext4_li_request_new(struct super_block *sb,  	 * spread the inode table initialization requests  	 * better.  	 */ -	get_random_bytes(&rnd, sizeof(rnd)); -	elr->lr_next_sched = jiffies + (unsigned long)rnd % -			     (EXT4_DEF_LI_MAX_START_DELAY * HZ); - +	elr->lr_next_sched = jiffies + (prandom_u32() % +				(EXT4_DEF_LI_MAX_START_DELAY * HZ));  	return elr;  } @@ -3288,19 +3331,28 @@ int ext4_calculate_overhead(struct super_block *sb)  } -static ext4_fsblk_t ext4_calculate_resv_clusters(struct ext4_sb_info *sbi) +static ext4_fsblk_t ext4_calculate_resv_clusters(struct super_block *sb)  {  	ext4_fsblk_t resv_clusters;  	/* +	 * There's no need to reserve anything when we aren't using extents. +	 * The space estimates are exact, there are no unwritten extents, +	 * hole punching doesn't need new metadata... This is needed especially +	 * to keep ext2/3 backward compatibility. +	 */ +	if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) +		return 0; +	/*  	 * By default we reserve 2% or 4096 clusters, whichever is smaller.  	 * This should cover the situations where we can not afford to run  	 * out of space like for example punch hole, or converting -	 * uninitialized extents in delalloc path. In most cases such +	 * unwritten extents in delalloc path. In most cases such  	 * allocation would require 1, or 2 blocks, higher numbers are  	 * very rare.  	 
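
The policy in the comment above, worked through once: reserve 2% of the cluster count, capped at 4096 clusters. The figures in this sketch are illustrative only:

#include <stdio.h>

int main(void)
{
    unsigned long long blocks = 26214400;   /* 100 GiB of 4k blocks */
    unsigned int cluster_bits = 0;          /* no bigalloc: 1 block per cluster */

    unsigned long long resv = (blocks >> cluster_bits) / 50; /* 2% */
    if (resv > 4096)
        resv = 4096;

    /* 2% would be 524288 clusters, so the 4096 cap wins here */
    printf("reserved clusters: %llu\n", resv);
    return 0;
}
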
*/ -	resv_clusters = ext4_blocks_count(sbi->s_es) >> sbi->s_cluster_bits; +	resv_clusters = ext4_blocks_count(EXT4_SB(sb)->s_es) >> +			EXT4_SB(sb)->s_cluster_bits;  	do_div(resv_clusters, 50);  	resv_clusters = min_t(ext4_fsblk_t, resv_clusters, 4096); @@ -3538,6 +3590,16 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)  		       "feature flags set on rev 0 fs, "  		       "running e2fsck is recommended"); +	if (es->s_creator_os == cpu_to_le32(EXT4_OS_HURD)) { +		set_opt2(sb, HURD_COMPAT); +		if (EXT4_HAS_INCOMPAT_FEATURE(sb, +					      EXT4_FEATURE_INCOMPAT_64BIT)) { +			ext4_msg(sb, KERN_ERR, +				 "The Hurd can't support 64-bit file systems"); +			goto failed_mount; +		} +	} +  	if (IS_EXT2_SB(sb)) {  		if (ext2_feature_set_ok(sb))  			ext4_msg(sb, KERN_INFO, "mounting ext2 file system " @@ -3658,16 +3720,22 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)  	for (i = 0; i < 4; i++)  		sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]);  	sbi->s_def_hash_version = es->s_def_hash_version; -	i = le32_to_cpu(es->s_flags); -	if (i & EXT2_FLAGS_UNSIGNED_HASH) -		sbi->s_hash_unsigned = 3; -	else if ((i & EXT2_FLAGS_SIGNED_HASH) == 0) { +	if (EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_DIR_INDEX)) { +		i = le32_to_cpu(es->s_flags); +		if (i & EXT2_FLAGS_UNSIGNED_HASH) +			sbi->s_hash_unsigned = 3; +		else if ((i & EXT2_FLAGS_SIGNED_HASH) == 0) {  #ifdef __CHAR_UNSIGNED__ -		es->s_flags |= cpu_to_le32(EXT2_FLAGS_UNSIGNED_HASH); -		sbi->s_hash_unsigned = 3; +			if (!(sb->s_flags & MS_RDONLY)) +				es->s_flags |= +					cpu_to_le32(EXT2_FLAGS_UNSIGNED_HASH); +			sbi->s_hash_unsigned = 3;  #else -		es->s_flags |= cpu_to_le32(EXT2_FLAGS_SIGNED_HASH); +			if (!(sb->s_flags & MS_RDONLY)) +				es->s_flags |= +					cpu_to_le32(EXT2_FLAGS_SIGNED_HASH);  #endif +		}  	}  	/* Handle clustersize */ @@ -3967,6 +4035,14 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)  	percpu_counter_set(&sbi->s_dirtyclusters_counter, 0);  no_journal: +	if (ext4_mballoc_ready) { +		sbi->s_mb_cache = ext4_xattr_create_cache(sb->s_id); +		if (!sbi->s_mb_cache) { +			ext4_msg(sb, KERN_ERR, "Failed to create an mb_cache"); +			goto failed_mount_wq; +		} +	} +  	/*  	 * Get the # of file system overhead blocks from the  	 * superblock if present. @@ -4043,10 +4119,10 @@ no_journal:  			 "available");  	} -	err = ext4_reserve_clusters(sbi, ext4_calculate_resv_clusters(sbi)); +	err = ext4_reserve_clusters(sbi, ext4_calculate_resv_clusters(sb));  	if (err) {  		ext4_msg(sb, KERN_ERR, "failed to reserve %llu clusters for " -			 "reserved pool", ext4_calculate_resv_clusters(sbi)); +			 "reserved pool", ext4_calculate_resv_clusters(sb));  		goto failed_mount4a;  	} @@ -4118,6 +4194,11 @@ no_journal:  	if (es->s_error_count)  		mod_timer(&sbi->s_err_report, jiffies + 300*HZ); /* 5 minutes */ +	/* Enable message ratelimiting. Default is 10 messages per 5 secs. 
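
A simplified userspace model of the ratelimit_state machinery configured below: at most "burst" events pass per "interval" and the rest are dropped. The real include/linux/ratelimit.h also counts and reports what it suppressed; struct ratelimit and ratelimit_ok() here are illustrative stand-ins:

#include <stdio.h>
#include <time.h>

struct ratelimit {
    time_t begin;
    int interval;   /* seconds */
    int burst;
    int printed;
};

static int ratelimit_ok(struct ratelimit *rs)
{
    time_t now = time(NULL);

    if (now - rs->begin >= rs->interval) {  /* start a new window */
        rs->begin = now;
        rs->printed = 0;
    }
    return rs->printed++ < rs->burst;
}

int main(void)
{
    struct ratelimit rs = { .interval = 5, .burst = 10 }; /* as in the patch */

    for (int i = 0; i < 20; i++)
        if (ratelimit_ok(&rs))
            printf("message %d\n", i);  /* only the first 10 appear */
    return 0;
}
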
*/ +	ratelimit_state_init(&sbi->s_err_ratelimit_state, 5 * HZ, 10); +	ratelimit_state_init(&sbi->s_warning_ratelimit_state, 5 * HZ, 10); +	ratelimit_state_init(&sbi->s_msg_ratelimit_state, 5 * HZ, 10); +  	kfree(orig_data);  	return 0; @@ -4151,7 +4232,7 @@ failed_mount_wq:  	}  failed_mount3:  	ext4_es_unregister_shrinker(sbi); -	del_timer(&sbi->s_err_report); +	del_timer_sync(&sbi->s_err_report);  	if (sbi->s_flex_groups)  		ext4_kvfree(sbi->s_flex_groups);  	percpu_counter_destroy(&sbi->s_freeclusters_counter); @@ -4787,6 +4868,9 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)  		}  		if (*flags & MS_RDONLY) { +			err = sync_filesystem(sb); +			if (err < 0) +				goto restore_opts;  			err = dquot_suspend(sb, -1);  			if (err < 0)  				goto restore_opts; @@ -5293,6 +5377,7 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,  	bh = ext4_bread(handle, inode, blk, 1, &err);  	if (!bh)  		goto out; +	BUFFER_TRACE(bh, "get write access");  	err = ext4_journal_get_write_access(handle, bh);  	if (err) {  		brelse(bh); @@ -5468,11 +5553,9 @@ static int __init ext4_init_fs(void)  	err = ext4_init_mballoc();  	if (err) -		goto out3; - -	err = ext4_init_xattr(); -	if (err)  		goto out2; +	else +		ext4_mballoc_ready = 1;  	err = init_inodecache();  	if (err)  		goto out1; @@ -5488,10 +5571,9 @@ out:  	unregister_as_ext3();  	destroy_inodecache();  out1: -	ext4_exit_xattr(); -out2: +	ext4_mballoc_ready = 0;  	ext4_exit_mballoc(); -out3: +out2:  	ext4_exit_feat_adverts();  out4:  	if (ext4_proc_root) @@ -5514,7 +5596,6 @@ static void __exit ext4_exit_fs(void)  	unregister_as_ext3();  	unregister_filesystem(&ext4_fs_type);  	destroy_inodecache(); -	ext4_exit_xattr();  	ext4_exit_mballoc();  	ext4_exit_feat_adverts();  	remove_proc_entry("fs/ext4", NULL); diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index c081e34f717..e7387337060 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -81,7 +81,7 @@  # define ea_bdebug(bh, fmt, ...)	
no_printk(fmt, ##__VA_ARGS__)  #endif -static void ext4_xattr_cache_insert(struct buffer_head *); +static void ext4_xattr_cache_insert(struct mb_cache *, struct buffer_head *);  static struct buffer_head *ext4_xattr_cache_find(struct inode *,  						 struct ext4_xattr_header *,  						 struct mb_cache_entry **); @@ -90,13 +90,11 @@ static void ext4_xattr_rehash(struct ext4_xattr_header *,  static int ext4_xattr_list(struct dentry *dentry, char *buffer,  			   size_t buffer_size); -static struct mb_cache *ext4_xattr_cache; -  static const struct xattr_handler *ext4_xattr_handler_map[] = {  	[EXT4_XATTR_INDEX_USER]		     = &ext4_xattr_user_handler,  #ifdef CONFIG_EXT4_FS_POSIX_ACL -	[EXT4_XATTR_INDEX_POSIX_ACL_ACCESS]  = &ext4_xattr_acl_access_handler, -	[EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT] = &ext4_xattr_acl_default_handler, +	[EXT4_XATTR_INDEX_POSIX_ACL_ACCESS]  = &posix_acl_access_xattr_handler, +	[EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT] = &posix_acl_default_xattr_handler,  #endif  	[EXT4_XATTR_INDEX_TRUSTED]	     = &ext4_xattr_trusted_handler,  #ifdef CONFIG_EXT4_FS_SECURITY @@ -108,8 +106,8 @@ const struct xattr_handler *ext4_xattr_handlers[] = {  	&ext4_xattr_user_handler,  	&ext4_xattr_trusted_handler,  #ifdef CONFIG_EXT4_FS_POSIX_ACL -	&ext4_xattr_acl_access_handler, -	&ext4_xattr_acl_default_handler, +	&posix_acl_access_xattr_handler, +	&posix_acl_default_xattr_handler,  #endif  #ifdef CONFIG_EXT4_FS_SECURITY  	&ext4_xattr_security_handler, @@ -117,6 +115,9 @@ const struct xattr_handler *ext4_xattr_handlers[] = {  	NULL  }; +#define EXT4_GET_MB_CACHE(inode)	(((struct ext4_sb_info *) \ +				inode->i_sb->s_fs_info)->s_mb_cache) +  static __le32 ext4_xattr_block_csum(struct inode *inode,  				    sector_t block_nr,  				    struct ext4_xattr_header *hdr) @@ -265,6 +266,7 @@ ext4_xattr_block_get(struct inode *inode, int name_index, const char *name,  	struct ext4_xattr_entry *entry;  	size_t size;  	int error; +	struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode);  	ea_idebug(inode, "name=%d.%s, buffer=%p, buffer_size=%ld",  		  name_index, name, buffer, (long)buffer_size); @@ -286,7 +288,7 @@ bad_block:  		error = -EIO;  		goto cleanup;  	} -	ext4_xattr_cache_insert(bh); +	ext4_xattr_cache_insert(ext4_mb_cache, bh);  	entry = BFIRST(bh);  	error = ext4_xattr_find_entry(&entry, name_index, name, bh->b_size, 1);  	if (error == -EIO) @@ -367,6 +369,9 @@ ext4_xattr_get(struct inode *inode, int name_index, const char *name,  {  	int error; +	if (strlen(name) > 255) +		return -ERANGE; +  	down_read(&EXT4_I(inode)->xattr_sem);  	error = ext4_xattr_ibody_get(inode, name_index, name, buffer,  				     buffer_size); @@ -409,6 +414,7 @@ ext4_xattr_block_list(struct dentry *dentry, char *buffer, size_t buffer_size)  	struct inode *inode = dentry->d_inode;  	struct buffer_head *bh = NULL;  	int error; +	struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode);  	ea_idebug(inode, "buffer=%p, buffer_size=%ld",  		  buffer, (long)buffer_size); @@ -430,7 +436,7 @@ ext4_xattr_block_list(struct dentry *dentry, char *buffer, size_t buffer_size)  		error = -EIO;  		goto cleanup;  	} -	ext4_xattr_cache_insert(bh); +	ext4_xattr_cache_insert(ext4_mb_cache, bh);  	error = ext4_xattr_list_entries(dentry, BFIRST(bh), buffer, buffer_size);  cleanup: @@ -510,6 +516,7 @@ static void ext4_xattr_update_super_block(handle_t *handle,  	if (EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_EXT_ATTR))  		return; +	BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get_write_access");  	if (ext4_journal_get_write_access(handle, 
EXT4_SB(sb)->s_sbh) == 0) {  		EXT4_SET_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_EXT_ATTR);  		ext4_handle_dirty_super(handle, sb); @@ -517,8 +524,8 @@ static void ext4_xattr_update_super_block(handle_t *handle,  }  /* - * Release the xattr block BH: If the reference count is > 1, decrement - * it; otherwise free the block. + * Release the xattr block BH: If the reference count is > 1, decrement it; + * otherwise free the block.   */  static void  ext4_xattr_release_block(handle_t *handle, struct inode *inode, @@ -526,8 +533,10 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode,  {  	struct mb_cache_entry *ce = NULL;  	int error = 0; +	struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode); -	ce = mb_cache_entry_get(ext4_xattr_cache, bh->b_bdev, bh->b_blocknr); +	ce = mb_cache_entry_get(ext4_mb_cache, bh->b_bdev, bh->b_blocknr); +	BUFFER_TRACE(bh, "get_write_access");  	error = ext4_journal_get_write_access(handle, bh);  	if (error)  		goto out; @@ -538,16 +547,31 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode,  		if (ce)  			mb_cache_entry_free(ce);  		get_bh(bh); +		unlock_buffer(bh);  		ext4_free_blocks(handle, inode, bh, 0, 1,  				 EXT4_FREE_BLOCKS_METADATA |  				 EXT4_FREE_BLOCKS_FORGET); -		unlock_buffer(bh);  	} else {  		le32_add_cpu(&BHDR(bh)->h_refcount, -1);  		if (ce)  			mb_cache_entry_release(ce); +		/* +		 * Beware of this ugliness: Releasing of xattr block references +		 * from different inodes can race and so we have to protect +		 * from a race where someone else frees the block (and releases +		 * its journal_head) before we are done dirtying the buffer. In +		 * nojournal mode this race is harmless and we actually cannot +		 * call ext4_handle_dirty_xattr_block() with locked buffer as +		 * that function can call sync_dirty_buffer() so for that case +		 * we handle the dirtying after unlocking the buffer. 
+		 */ +		if (ext4_handle_valid(handle)) +			error = ext4_handle_dirty_xattr_block(handle, inode, +							      bh);  		unlock_buffer(bh); -		error = ext4_handle_dirty_xattr_block(handle, inode, bh); +		if (!ext4_handle_valid(handle)) +			error = ext4_handle_dirty_xattr_block(handle, inode, +							      bh);  		if (IS_SYNC(inode))  			ext4_handle_sync(handle);  		dquot_free_block(inode, EXT4_C2B(EXT4_SB(inode->i_sb), 1)); @@ -567,12 +591,13 @@ static size_t ext4_xattr_free_space(struct ext4_xattr_entry *last,  				    size_t *min_offs, void *base, int *total)  {  	for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) { -		*total += EXT4_XATTR_LEN(last->e_name_len);  		if (!last->e_value_block && last->e_value_size) {  			size_t offs = le16_to_cpu(last->e_value_offs);  			if (offs < *min_offs)  				*min_offs = offs;  		} +		if (total) +			*total += EXT4_XATTR_LEN(last->e_name_len);  	}  	return (*min_offs - ((void *)last - base) - sizeof(__u32));  } @@ -745,14 +770,16 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,  	struct ext4_xattr_search *s = &bs->s;  	struct mb_cache_entry *ce = NULL;  	int error = 0; +	struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode);  #define header(x) ((struct ext4_xattr_header *)(x))  	if (i->value && i->value_len > sb->s_blocksize)  		return -ENOSPC;  	if (s->base) { -		ce = mb_cache_entry_get(ext4_xattr_cache, bs->bh->b_bdev, +		ce = mb_cache_entry_get(ext4_mb_cache, bs->bh->b_bdev,  					bs->bh->b_blocknr); +		BUFFER_TRACE(bs->bh, "get_write_access");  		error = ext4_journal_get_write_access(handle, bs->bh);  		if (error)  			goto cleanup; @@ -769,7 +796,8 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,  				if (!IS_LAST_ENTRY(s->first))  					ext4_xattr_rehash(header(s->base),  							  s->here); -				ext4_xattr_cache_insert(bs->bh); +				ext4_xattr_cache_insert(ext4_mb_cache, +					bs->bh);  			}  			unlock_buffer(bs->bh);  			if (error == -EIO) @@ -837,6 +865,7 @@ inserted:  						EXT4_C2B(EXT4_SB(sb), 1));  				if (error)  					goto cleanup; +				BUFFER_TRACE(new_bh, "get_write_access");  				error = ext4_journal_get_write_access(handle,  								      new_bh);  				if (error) @@ -874,7 +903,7 @@ inserted:  			 * take i_data_sem because we will test  			 * i_delalloc_reserved_flag in ext4_mb_new_blocks  			 */ -			down_read((&EXT4_I(inode)->i_data_sem)); +			down_read(&EXT4_I(inode)->i_data_sem);  			block = ext4_new_meta_blocks(handle, inode, goal, 0,  						     NULL, &error);  			up_read((&EXT4_I(inode)->i_data_sem)); @@ -905,7 +934,7 @@ getblk_failed:  			memcpy(new_bh->b_data, s->base, new_bh->b_size);  			set_buffer_uptodate(new_bh);  			unlock_buffer(new_bh); -			ext4_xattr_cache_insert(new_bh); +			ext4_xattr_cache_insert(ext4_mb_cache, new_bh);  			error = ext4_handle_dirty_xattr_block(handle,  							      inode, new_bh);  			if (error) @@ -1228,7 +1257,7 @@ int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,  	struct ext4_xattr_block_find *bs = NULL;  	char *buffer = NULL, *b_entry_name = NULL;  	size_t min_offs, free; -	int total_ino, total_blk; +	int total_ino;  	void *base, *start, *end;  	int extra_isize = 0, error = 0, tried_min_extra_isize = 0;  	int s_min_extra_isize = le16_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_min_extra_isize); @@ -1286,8 +1315,7 @@ retry:  		first = BFIRST(bh);  		end = bh->b_data + bh->b_size;  		min_offs = end - base; -		free = ext4_xattr_free_space(first, &min_offs, base, -					     &total_blk); +		free = ext4_xattr_free_space(first, &min_offs, base, 
NULL);  		if (free < new_extra_isize) {  			if (!tried_min_extra_isize && s_min_extra_isize) {  				tried_min_extra_isize++; @@ -1350,6 +1378,9 @@ retry:  				    s_min_extra_isize) {  					tried_min_extra_isize++;  					new_extra_isize = s_min_extra_isize; +					kfree(is); is = NULL; +					kfree(bs); bs = NULL; +					brelse(bh);  					goto retry;  				}  				error = -1; @@ -1492,13 +1523,13 @@ ext4_xattr_put_super(struct super_block *sb)   * Returns 0, or a negative error number on failure.   */  static void -ext4_xattr_cache_insert(struct buffer_head *bh) +ext4_xattr_cache_insert(struct mb_cache *ext4_mb_cache, struct buffer_head *bh)  {  	__u32 hash = le32_to_cpu(BHDR(bh)->h_hash);  	struct mb_cache_entry *ce;  	int error; -	ce = mb_cache_entry_alloc(ext4_xattr_cache, GFP_NOFS); +	ce = mb_cache_entry_alloc(ext4_mb_cache, GFP_NOFS);  	if (!ce) {  		ea_bdebug(bh, "out of memory");  		return; @@ -1570,12 +1601,13 @@ ext4_xattr_cache_find(struct inode *inode, struct ext4_xattr_header *header,  {  	__u32 hash = le32_to_cpu(header->h_hash);  	struct mb_cache_entry *ce; +	struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode);  	if (!header->h_hash)  		return NULL;  /* never share */  	ea_idebug(inode, "looking for cached blocks [%x]", (int)hash);  again: -	ce = mb_cache_entry_find_first(ext4_xattr_cache, inode->i_sb->s_bdev, +	ce = mb_cache_entry_find_first(ext4_mb_cache, inode->i_sb->s_bdev,  				       hash);  	while (ce) {  		struct buffer_head *bh; @@ -1673,19 +1705,17 @@ static void ext4_xattr_rehash(struct ext4_xattr_header *header,  #undef BLOCK_HASH_SHIFT -int __init -ext4_init_xattr(void) +#define	HASH_BUCKET_BITS	10 + +struct mb_cache * +ext4_xattr_create_cache(char *name)  { -	ext4_xattr_cache = mb_cache_create("ext4_xattr", 6); -	if (!ext4_xattr_cache) -		return -ENOMEM; -	return 0; +	return mb_cache_create(name, HASH_BUCKET_BITS);  } -void -ext4_exit_xattr(void) +void ext4_xattr_destroy_cache(struct mb_cache *cache)  { -	if (ext4_xattr_cache) -		mb_cache_destroy(ext4_xattr_cache); -	ext4_xattr_cache = NULL; +	if (cache) +		mb_cache_destroy(cache);  } + diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h index c767dbdd7fc..29bedf5589f 100644 --- a/fs/ext4/xattr.h +++ b/fs/ext4/xattr.h @@ -96,8 +96,6 @@ struct ext4_xattr_ibody_find {  extern const struct xattr_handler ext4_xattr_user_handler;  extern const struct xattr_handler ext4_xattr_trusted_handler; -extern const struct xattr_handler ext4_xattr_acl_access_handler; -extern const struct xattr_handler ext4_xattr_acl_default_handler;  extern const struct xattr_handler ext4_xattr_security_handler;  extern ssize_t ext4_listxattr(struct dentry *, char *, size_t); @@ -112,9 +110,6 @@ extern void ext4_xattr_put_super(struct super_block *);  extern int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,  			    struct ext4_inode *raw_inode, handle_t *handle); -extern int __init ext4_init_xattr(void); -extern void ext4_exit_xattr(void); -  extern const struct xattr_handler *ext4_xattr_handlers[];  extern int ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i, @@ -126,6 +121,9 @@ extern int ext4_xattr_ibody_inline_set(handle_t *handle, struct inode *inode,  				       struct ext4_xattr_info *i,  				       struct ext4_xattr_ibody_find *is); +extern struct mb_cache *ext4_xattr_create_cache(char *name); +extern void ext4_xattr_destroy_cache(struct mb_cache *); +  #ifdef CONFIG_EXT4_FS_SECURITY  extern int ext4_init_security(handle_t *handle, struct inode *inode,  			      struct inode *dir, const struct qstr 
*qstr);
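
A note on the reserved-cluster heuristic in the ext4_calculate_resv_clusters() hunk above: it reserves 2% of all clusters (the do_div by 50), capped at 4096, and now skips the reservation entirely on non-extent filesystems, where space estimates are exact. Below is a minimal userspace sketch of the same arithmetic, assuming plain integer types; the helper name resv_clusters_sketch() is hypothetical and not part of the patch:

#include <stdint.h>
#include <stdio.h>

/* Hypothetical stand-alone version of the heuristic: reserve
 * min(total_clusters / 50, 4096) clusters, or 0 when the filesystem
 * has no extents (ext2/3 compatibility, exact space estimates). */
static uint64_t resv_clusters_sketch(uint64_t blocks_count,
				     unsigned int cluster_bits,
				     int has_extents)
{
	uint64_t resv;

	if (!has_extents)
		return 0;
	resv = (blocks_count >> cluster_bits) / 50;	/* 2% of clusters */
	return resv < 4096 ? resv : 4096;
}

int main(void)
{
	/* 4k blocks, no bigalloc (cluster_bits == 0): a 268435456-block
	 * (1 TiB) filesystem hits the 4096-cluster cap. */
	printf("%llu\n",
	       (unsigned long long)resv_clusters_sketch(268435456ULL, 0, 1));
	return 0;
}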

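Similarly, the lazy-init change in ext4_li_request_new() replaces get_random_bytes() with prandom_u32() but keeps the same behaviour: the first scheduled run is spread uniformly over the next EXT4_DEF_LI_MAX_START_DELAY seconds' worth of jiffies. The following userspace sketch mirrors that computation; SKETCH_HZ, SKETCH_LI_MAX_START_DELAY, and rand() stand in for the kernel's HZ, EXT4_DEF_LI_MAX_START_DELAY, and prandom_u32(), and their values are assumptions, not taken from the patch:

#include <stdio.h>
#include <stdlib.h>

#define SKETCH_HZ			100	/* assumed tick rate */
#define SKETCH_LI_MAX_START_DELAY	5	/* assumed delay, seconds */

/* Mirror of elr->lr_next_sched = jiffies + prandom_u32() %
 * (EXT4_DEF_LI_MAX_START_DELAY * HZ): pick a random offset within the
 * delay window and add it to the current tick count, so concurrent
 * mounts do not all start inode table initialization at once. */
static unsigned long next_sched_sketch(unsigned long now)
{
	return now + (unsigned long)rand() %
		     (SKETCH_LI_MAX_START_DELAY * SKETCH_HZ);
}

int main(void)
{
	srand(42);	/* deterministic seed, for the example only */
	printf("first run at tick %lu\n", next_sched_sketch(0));
	return 0;
}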