Diffstat (limited to 'fs/ext4')
-rw-r--r--  fs/ext4/acl.c             |  223
-rw-r--r--  fs/ext4/acl.h             |    9
-rw-r--r--  fs/ext4/balloc.c          |   97
-rw-r--r--  fs/ext4/block_validity.c  |   33
-rw-r--r--  fs/ext4/dir.c             |   38
-rw-r--r--  fs/ext4/ext4.h            |  121
-rw-r--r--  fs/ext4/ext4_extents.h    |   22
-rw-r--r--  fs/ext4/ext4_jbd2.c       |   26
-rw-r--r--  fs/ext4/ext4_jbd2.h       |    4
-rw-r--r--  fs/ext4/extents.c         | 1130
-rw-r--r--  fs/ext4/extents_status.c  |   42
-rw-r--r--  fs/ext4/extents_status.h  |    9
-rw-r--r--  fs/ext4/file.c            |  180
-rw-r--r--  fs/ext4/ialloc.c          |   39
-rw-r--r--  fs/ext4/indirect.c        |   38
-rw-r--r--  fs/ext4/inline.c          |   44
-rw-r--r--  fs/ext4/inode.c           |  385
-rw-r--r--  fs/ext4/ioctl.c           |   37
-rw-r--r--  fs/ext4/mballoc.c         |   80
-rw-r--r--  fs/ext4/mballoc.h         |    4
-rw-r--r--  fs/ext4/migrate.c         |    2
-rw-r--r--  fs/ext4/mmp.c             |    6
-rw-r--r--  fs/ext4/move_extent.c     |   84
-rw-r--r--  fs/ext4/namei.c           |  623
-rw-r--r--  fs/ext4/page-io.c         |   50
-rw-r--r--  fs/ext4/resize.c          |   49
-rw-r--r--  fs/ext4/super.c           |  269
-rw-r--r--  fs/ext4/xattr.c           |  102
-rw-r--r--  fs/ext4/xattr.h           |    8
29 files changed, 2321 insertions(+), 1433 deletions(-)
diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c
index 39a54a0e9fe..d40c8dbbb0d 100644
--- a/fs/ext4/acl.c
+++ b/fs/ext4/acl.c
@@ -152,13 +152,6 @@ ext4_get_acl(struct inode *inode, int type)
struct posix_acl *acl;
int retval;
- if (!test_opt(inode->i_sb, POSIX_ACL))
- return NULL;
-
- acl = get_cached_acl(inode, type);
- if (acl != ACL_NOT_CACHED)
- return acl;
-
switch (type) {
case ACL_TYPE_ACCESS:
name_index = EXT4_XATTR_INDEX_POSIX_ACL_ACCESS;
@@ -196,7 +189,7 @@ ext4_get_acl(struct inode *inode, int type)
* inode->i_mutex: down unless called from ext4_new_inode
*/
static int
-ext4_set_acl(handle_t *handle, struct inode *inode, int type,
+__ext4_set_acl(handle_t *handle, struct inode *inode, int type,
struct posix_acl *acl)
{
int name_index;
@@ -204,9 +197,6 @@ ext4_set_acl(handle_t *handle, struct inode *inode, int type,
size_t size = 0;
int error;
- if (S_ISLNK(inode->i_mode))
- return -EOPNOTSUPP;
-
switch (type) {
case ACL_TYPE_ACCESS:
name_index = EXT4_XATTR_INDEX_POSIX_ACL_ACCESS;
@@ -248,208 +238,51 @@ ext4_set_acl(handle_t *handle, struct inode *inode, int type,
return error;
}
-/*
- * Initialize the ACLs of a new inode. Called from ext4_new_inode.
- *
- * dir->i_mutex: down
- * inode->i_mutex: up (access to inode is still exclusive)
- */
int
-ext4_init_acl(handle_t *handle, struct inode *inode, struct inode *dir)
+ext4_set_acl(struct inode *inode, struct posix_acl *acl, int type)
{
- struct posix_acl *acl = NULL;
- int error = 0;
-
- if (!S_ISLNK(inode->i_mode)) {
- if (test_opt(dir->i_sb, POSIX_ACL)) {
- acl = ext4_get_acl(dir, ACL_TYPE_DEFAULT);
- if (IS_ERR(acl))
- return PTR_ERR(acl);
- }
- if (!acl)
- inode->i_mode &= ~current_umask();
- }
- if (test_opt(inode->i_sb, POSIX_ACL) && acl) {
- if (S_ISDIR(inode->i_mode)) {
- error = ext4_set_acl(handle, inode,
- ACL_TYPE_DEFAULT, acl);
- if (error)
- goto cleanup;
- }
- error = posix_acl_create(&acl, GFP_NOFS, &inode->i_mode);
- if (error < 0)
- return error;
-
- if (error > 0) {
- /* This is an extended ACL */
- error = ext4_set_acl(handle, inode, ACL_TYPE_ACCESS, acl);
- }
- }
-cleanup:
- posix_acl_release(acl);
- return error;
-}
-
-/*
- * Does chmod for an inode that may have an Access Control List. The
- * inode->i_mode field must be updated to the desired value by the caller
- * before calling this function.
- * Returns 0 on success, or a negative error number.
- *
- * We change the ACL rather than storing some ACL entries in the file
- * mode permission bits (which would be more efficient), because that
- * would break once additional permissions (like ACL_APPEND, ACL_DELETE
- * for directories) are added. There are no more bits available in the
- * file mode.
- *
- * inode->i_mutex: down
- */
-int
-ext4_acl_chmod(struct inode *inode)
-{
- struct posix_acl *acl;
handle_t *handle;
- int retries = 0;
- int error;
-
+ int error, retries = 0;
- if (S_ISLNK(inode->i_mode))
- return -EOPNOTSUPP;
- if (!test_opt(inode->i_sb, POSIX_ACL))
- return 0;
- acl = ext4_get_acl(inode, ACL_TYPE_ACCESS);
- if (IS_ERR(acl) || !acl)
- return PTR_ERR(acl);
- error = posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode);
- if (error)
- return error;
retry:
handle = ext4_journal_start(inode, EXT4_HT_XATTR,
ext4_jbd2_credits_xattr(inode));
- if (IS_ERR(handle)) {
- error = PTR_ERR(handle);
- ext4_std_error(inode->i_sb, error);
- goto out;
- }
- error = ext4_set_acl(handle, inode, ACL_TYPE_ACCESS, acl);
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
+
+ error = __ext4_set_acl(handle, inode, type, acl);
ext4_journal_stop(handle);
- if (error == -ENOSPC &&
- ext4_should_retry_alloc(inode->i_sb, &retries))
+ if (error == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
goto retry;
-out:
- posix_acl_release(acl);
return error;
}
/*
- * Extended attribute handlers
+ * Initialize the ACLs of a new inode. Called from ext4_new_inode.
+ *
+ * dir->i_mutex: down
+ * inode->i_mutex: up (access to inode is still exclusive)
*/
-static size_t
-ext4_xattr_list_acl_access(struct dentry *dentry, char *list, size_t list_len,
- const char *name, size_t name_len, int type)
-{
- const size_t size = sizeof(POSIX_ACL_XATTR_ACCESS);
-
- if (!test_opt(dentry->d_sb, POSIX_ACL))
- return 0;
- if (list && size <= list_len)
- memcpy(list, POSIX_ACL_XATTR_ACCESS, size);
- return size;
-}
-
-static size_t
-ext4_xattr_list_acl_default(struct dentry *dentry, char *list, size_t list_len,
- const char *name, size_t name_len, int type)
-{
- const size_t size = sizeof(POSIX_ACL_XATTR_DEFAULT);
-
- if (!test_opt(dentry->d_sb, POSIX_ACL))
- return 0;
- if (list && size <= list_len)
- memcpy(list, POSIX_ACL_XATTR_DEFAULT, size);
- return size;
-}
-
-static int
-ext4_xattr_get_acl(struct dentry *dentry, const char *name, void *buffer,
- size_t size, int type)
+int
+ext4_init_acl(handle_t *handle, struct inode *inode, struct inode *dir)
{
- struct posix_acl *acl;
+ struct posix_acl *default_acl, *acl;
int error;
- if (strcmp(name, "") != 0)
- return -EINVAL;
- if (!test_opt(dentry->d_sb, POSIX_ACL))
- return -EOPNOTSUPP;
-
- acl = ext4_get_acl(dentry->d_inode, type);
- if (IS_ERR(acl))
- return PTR_ERR(acl);
- if (acl == NULL)
- return -ENODATA;
- error = posix_acl_to_xattr(&init_user_ns, acl, buffer, size);
- posix_acl_release(acl);
-
- return error;
-}
-
-static int
-ext4_xattr_set_acl(struct dentry *dentry, const char *name, const void *value,
- size_t size, int flags, int type)
-{
- struct inode *inode = dentry->d_inode;
- handle_t *handle;
- struct posix_acl *acl;
- int error, retries = 0;
-
- if (strcmp(name, "") != 0)
- return -EINVAL;
- if (!test_opt(inode->i_sb, POSIX_ACL))
- return -EOPNOTSUPP;
- if (!inode_owner_or_capable(inode))
- return -EPERM;
-
- if (value) {
- acl = posix_acl_from_xattr(&init_user_ns, value, size);
- if (IS_ERR(acl))
- return PTR_ERR(acl);
- else if (acl) {
- error = posix_acl_valid(acl);
- if (error)
- goto release_and_out;
- }
- } else
- acl = NULL;
+ error = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl);
+ if (error)
+ return error;
-retry:
- handle = ext4_journal_start(inode, EXT4_HT_XATTR,
- ext4_jbd2_credits_xattr(inode));
- if (IS_ERR(handle)) {
- error = PTR_ERR(handle);
- goto release_and_out;
+ if (default_acl) {
+ error = __ext4_set_acl(handle, inode, ACL_TYPE_DEFAULT,
+ default_acl);
+ posix_acl_release(default_acl);
+ }
+ if (acl) {
+ if (!error)
+ error = __ext4_set_acl(handle, inode, ACL_TYPE_ACCESS,
+ acl);
+ posix_acl_release(acl);
}
- error = ext4_set_acl(handle, inode, type, acl);
- ext4_journal_stop(handle);
- if (error == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
- goto retry;
-
-release_and_out:
- posix_acl_release(acl);
return error;
}
-
-const struct xattr_handler ext4_xattr_acl_access_handler = {
- .prefix = POSIX_ACL_XATTR_ACCESS,
- .flags = ACL_TYPE_ACCESS,
- .list = ext4_xattr_list_acl_access,
- .get = ext4_xattr_get_acl,
- .set = ext4_xattr_set_acl,
-};
-
-const struct xattr_handler ext4_xattr_acl_default_handler = {
- .prefix = POSIX_ACL_XATTR_DEFAULT,
- .flags = ACL_TYPE_DEFAULT,
- .list = ext4_xattr_list_acl_default,
- .get = ext4_xattr_get_acl,
- .set = ext4_xattr_set_acl,
-};
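With ext4's private ACL xattr handlers removed, the generic POSIX ACL code in the VFS drives ext4_get_acl()/ext4_set_acl() directly. A minimal sketch of the wiring (the actual hook-up lives in the inode_operations tables in file.c and namei.c, which are outside this excerpt):

	/* sketch: generic ->get_acl/->set_acl hooks replace the
	 * per-filesystem xattr handlers deleted above */
	const struct inode_operations ext4_file_inode_operations = {
		/* ... */
		.get_acl	= ext4_get_acl,
		.set_acl	= ext4_set_acl,	/* starts its own journal handle */
	};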
diff --git a/fs/ext4/acl.h b/fs/ext4/acl.h
index 18cb39ed7c7..da2c79577d7 100644
--- a/fs/ext4/acl.h
+++ b/fs/ext4/acl.h
@@ -55,18 +55,13 @@ static inline int ext4_acl_count(size_t size)
/* acl.c */
struct posix_acl *ext4_get_acl(struct inode *inode, int type);
-extern int ext4_acl_chmod(struct inode *);
+int ext4_set_acl(struct inode *inode, struct posix_acl *acl, int type);
extern int ext4_init_acl(handle_t *, struct inode *, struct inode *);
#else /* CONFIG_EXT4_FS_POSIX_ACL */
#include <linux/sched.h>
#define ext4_get_acl NULL
-
-static inline int
-ext4_acl_chmod(struct inode *inode)
-{
- return 0;
-}
+#define ext4_set_acl NULL
static inline int
ext4_init_acl(handle_t *handle, struct inode *inode, struct inode *dir)
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index dc5d572ebd6..fca382037dd 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -83,9 +83,9 @@ static inline int ext4_block_in_group(struct super_block *sb,
/* Return the number of clusters used for file system metadata; this
* represents the overhead needed by the file system.
*/
-unsigned ext4_num_overhead_clusters(struct super_block *sb,
- ext4_group_t block_group,
- struct ext4_group_desc *gdp)
+static unsigned ext4_num_overhead_clusters(struct super_block *sb,
+ ext4_group_t block_group,
+ struct ext4_group_desc *gdp)
{
unsigned num_clusters;
int block_cluster = -1, inode_cluster = -1, itbl_cluster = -1, i, c;
@@ -176,9 +176,10 @@ static unsigned int num_clusters_in_group(struct super_block *sb,
}
/* Initializes an uninitialized block bitmap */
-void ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
- ext4_group_t block_group,
- struct ext4_group_desc *gdp)
+static void ext4_init_block_bitmap(struct super_block *sb,
+ struct buffer_head *bh,
+ ext4_group_t block_group,
+ struct ext4_group_desc *gdp)
{
unsigned int bit, bit_max;
struct ext4_sb_info *sbi = EXT4_SB(sb);
@@ -193,7 +194,16 @@ void ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
if (!ext4_group_desc_csum_verify(sb, block_group, gdp)) {
ext4_error(sb, "Checksum bad for group %u", block_group);
grp = ext4_get_group_info(sb, block_group);
+ if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp))
+ percpu_counter_sub(&sbi->s_freeclusters_counter,
+ grp->bb_free);
set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state);
+ if (!EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) {
+ int count;
+ count = ext4_free_inodes_count(sb, gdp);
+ percpu_counter_sub(&sbi->s_freeinodes_counter,
+ count);
+ }
set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state);
return;
}
@@ -307,6 +317,7 @@ static ext4_fsblk_t ext4_valid_block_bitmap(struct super_block *sb,
ext4_group_t block_group,
struct buffer_head *bh)
{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
ext4_grpblk_t offset;
ext4_grpblk_t next_zero_bit;
ext4_fsblk_t blk;
@@ -326,14 +337,14 @@ static ext4_fsblk_t ext4_valid_block_bitmap(struct super_block *sb,
/* check whether block bitmap block number is set */
blk = ext4_block_bitmap(sb, desc);
offset = blk - group_first_block;
- if (!ext4_test_bit(offset, bh->b_data))
+ if (!ext4_test_bit(EXT4_B2C(sbi, offset), bh->b_data))
/* bad block bitmap */
return blk;
/* check whether the inode bitmap block number is set */
blk = ext4_inode_bitmap(sb, desc);
offset = blk - group_first_block;
- if (!ext4_test_bit(offset, bh->b_data))
+ if (!ext4_test_bit(EXT4_B2C(sbi, offset), bh->b_data))
/* bad block bitmap */
return blk;
@@ -341,21 +352,23 @@ static ext4_fsblk_t ext4_valid_block_bitmap(struct super_block *sb,
blk = ext4_inode_table(sb, desc);
offset = blk - group_first_block;
next_zero_bit = ext4_find_next_zero_bit(bh->b_data,
- offset + EXT4_SB(sb)->s_itb_per_group,
- offset);
- if (next_zero_bit < offset + EXT4_SB(sb)->s_itb_per_group)
+ EXT4_B2C(sbi, offset + EXT4_SB(sb)->s_itb_per_group),
+ EXT4_B2C(sbi, offset));
+ if (next_zero_bit <
+ EXT4_B2C(sbi, offset + EXT4_SB(sb)->s_itb_per_group))
/* bad bitmap for inode tables */
return blk;
return 0;
}
-void ext4_validate_block_bitmap(struct super_block *sb,
- struct ext4_group_desc *desc,
- ext4_group_t block_group,
- struct buffer_head *bh)
+static void ext4_validate_block_bitmap(struct super_block *sb,
+ struct ext4_group_desc *desc,
+ ext4_group_t block_group,
+ struct buffer_head *bh)
{
ext4_fsblk_t blk;
struct ext4_group_info *grp = ext4_get_group_info(sb, block_group);
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
if (buffer_verified(bh))
return;
@@ -366,6 +379,9 @@ void ext4_validate_block_bitmap(struct super_block *sb,
ext4_unlock_group(sb, block_group);
ext4_error(sb, "bg %u: block %llu: invalid block bitmap",
block_group, blk);
+ if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp))
+ percpu_counter_sub(&sbi->s_freeclusters_counter,
+ grp->bb_free);
set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state);
return;
}
@@ -373,6 +389,9 @@ void ext4_validate_block_bitmap(struct super_block *sb,
desc, bh))) {
ext4_unlock_group(sb, block_group);
ext4_error(sb, "bg %u: bad block bitmap checksum", block_group);
+ if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp))
+ percpu_counter_sub(&sbi->s_freeclusters_counter,
+ grp->bb_free);
set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state);
return;
}
@@ -640,6 +659,7 @@ ext4_fsblk_t ext4_count_free_clusters(struct super_block *sb)
struct ext4_group_desc *gdp;
ext4_group_t i;
ext4_group_t ngroups = ext4_get_groups_count(sb);
+ struct ext4_group_info *grp;
#ifdef EXT4FS_DEBUG
struct ext4_super_block *es;
ext4_fsblk_t bitmap_count;
@@ -655,14 +675,18 @@ ext4_fsblk_t ext4_count_free_clusters(struct super_block *sb)
gdp = ext4_get_group_desc(sb, i, NULL);
if (!gdp)
continue;
- desc_count += ext4_free_group_clusters(sb, gdp);
+ grp = NULL;
+ if (EXT4_SB(sb)->s_group_info)
+ grp = ext4_get_group_info(sb, i);
+ if (!grp || !EXT4_MB_GRP_BBITMAP_CORRUPT(grp))
+ desc_count += ext4_free_group_clusters(sb, gdp);
brelse(bitmap_bh);
bitmap_bh = ext4_read_block_bitmap(sb, i);
if (bitmap_bh == NULL)
continue;
x = ext4_count_free(bitmap_bh->b_data,
- EXT4_BLOCKS_PER_GROUP(sb) / 8);
+ EXT4_CLUSTERS_PER_GROUP(sb) / 8);
printk(KERN_DEBUG "group %u: stored = %d, counted = %u\n",
i, ext4_free_group_clusters(sb, gdp), x);
bitmap_count += x;
@@ -679,7 +703,11 @@ ext4_fsblk_t ext4_count_free_clusters(struct super_block *sb)
gdp = ext4_get_group_desc(sb, i, NULL);
if (!gdp)
continue;
- desc_count += ext4_free_group_clusters(sb, gdp);
+ grp = NULL;
+ if (EXT4_SB(sb)->s_group_info)
+ grp = ext4_get_group_info(sb, i);
+ if (!grp || !EXT4_MB_GRP_BBITMAP_CORRUPT(grp))
+ desc_count += ext4_free_group_clusters(sb, gdp);
}
return desc_count;
@@ -699,16 +727,6 @@ static inline int test_root(ext4_group_t a, int b)
}
}
-static int ext4_group_sparse(ext4_group_t group)
-{
- if (group <= 1)
- return 1;
- if (!(group & 1))
- return 0;
- return (test_root(group, 7) || test_root(group, 5) ||
- test_root(group, 3));
-}
-
/**
* ext4_bg_has_super - number of blocks used by the superblock in group
* @sb: superblock for filesystem
@@ -719,11 +737,26 @@ static int ext4_group_sparse(ext4_group_t group)
*/
int ext4_bg_has_super(struct super_block *sb, ext4_group_t group)
{
- if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
- EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER) &&
- !ext4_group_sparse(group))
+ struct ext4_super_block *es = EXT4_SB(sb)->s_es;
+
+ if (group == 0)
+ return 1;
+ if (EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_SPARSE_SUPER2)) {
+ if (group == le32_to_cpu(es->s_backup_bgs[0]) ||
+ group == le32_to_cpu(es->s_backup_bgs[1]))
+ return 1;
return 0;
- return 1;
+ }
+ if ((group <= 1) || !EXT4_HAS_RO_COMPAT_FEATURE(sb,
+ EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER))
+ return 1;
+ if (!(group & 1))
+ return 0;
+ if (test_root(group, 3) || (test_root(group, 5)) ||
+ test_root(group, 7))
+ return 1;
+
+ return 0;
}
static unsigned long ext4_bg_num_gdb_meta(struct super_block *sb,
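The rewrite of ext4_bg_has_super() folds the old ext4_group_sparse() helper inline and adds the sparse_super2 case, where only the two groups recorded in s_backup_bgs carry superblock backups. For the classic sparse_super layout the rule reduces to: group 0, group 1, and any odd group that is a pure power of 3, 5 or 7. A small userspace sketch of that rule, for reference (not kernel code):

	static int is_power_of(unsigned int g, unsigned int b)
	{
		while (g > 1 && g % b == 0)
			g /= b;
		return g == 1;
	}

	static int bg_has_super(unsigned int group)
	{
		if (group <= 1)
			return 1;
		if (!(group & 1))
			return 0;
		return is_power_of(group, 3) || is_power_of(group, 5) ||
		       is_power_of(group, 7);
	}
	/* backups land in groups 0, 1, 3, 5, 7, 9, 25, 27, 49, 81, ... */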
diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c
index 3f11656bd72..41eb9dcfac7 100644
--- a/fs/ext4/block_validity.c
+++ b/fs/ext4/block_validity.c
@@ -180,37 +180,12 @@ int ext4_setup_system_zone(struct super_block *sb)
/* Called when the filesystem is unmounted */
void ext4_release_system_zone(struct super_block *sb)
{
- struct rb_node *n = EXT4_SB(sb)->system_blks.rb_node;
- struct rb_node *parent;
- struct ext4_system_zone *entry;
+ struct ext4_system_zone *entry, *n;
- while (n) {
- /* Do the node's children first */
- if (n->rb_left) {
- n = n->rb_left;
- continue;
- }
- if (n->rb_right) {
- n = n->rb_right;
- continue;
- }
- /*
- * The node has no children; free it, and then zero
- * out parent's link to it. Finally go to the
- * beginning of the loop and try to free the parent
- * node.
- */
- parent = rb_parent(n);
- entry = rb_entry(n, struct ext4_system_zone, node);
+ rbtree_postorder_for_each_entry_safe(entry, n,
+ &EXT4_SB(sb)->system_blks, node)
kmem_cache_free(ext4_system_zone_cachep, entry);
- if (!parent)
- EXT4_SB(sb)->system_blks = RB_ROOT;
- else if (parent->rb_left == n)
- parent->rb_left = NULL;
- else if (parent->rb_right == n)
- parent->rb_right = NULL;
- n = parent;
- }
+
EXT4_SB(sb)->system_blks = RB_ROOT;
}
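This loop, like the one in dir.c below, replaces an open-coded post-order walk with rbtree_postorder_for_each_entry_safe() from <linux/rbtree.h>, which visits every node after its children so entries can be freed during the walk. The general teardown idiom, sketched with a hypothetical my_node type:

	struct my_node {
		struct rb_node node;
		/* payload */
	};

	static void destroy_tree(struct rb_root *root)
	{
		struct my_node *pos, *n;

		/* 'n' caches the post-order successor before 'pos' is freed */
		rbtree_postorder_for_each_entry_safe(pos, n, root, node)
			kfree(pos);
		*root = RB_ROOT;	/* leave callers an empty, valid root */
	}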
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index 680bb338891..ef1bed66c14 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -105,7 +105,7 @@ int __ext4_check_dir_entry(const char *function, unsigned int line,
static int ext4_readdir(struct file *file, struct dir_context *ctx)
{
unsigned int offset;
- int i, stored;
+ int i;
struct ext4_dir_entry_2 *de;
int err;
struct inode *inode = file_inode(file);
@@ -133,7 +133,6 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx)
return ret;
}
- stored = 0;
offset = ctx->pos & (sb->s_blocksize - 1);
while (ctx->pos < inode->i_size) {
@@ -353,41 +352,16 @@ struct fname {
*/
static void free_rb_tree_fname(struct rb_root *root)
{
- struct rb_node *n = root->rb_node;
- struct rb_node *parent;
- struct fname *fname;
-
- while (n) {
- /* Do the node's children first */
- if (n->rb_left) {
- n = n->rb_left;
- continue;
- }
- if (n->rb_right) {
- n = n->rb_right;
- continue;
- }
- /*
- * The node has no children; free it, and then zero
- * out parent's link to it. Finally go to the
- * beginning of the loop and try to free the parent
- * node.
- */
- parent = rb_parent(n);
- fname = rb_entry(n, struct fname, rb_hash);
+ struct fname *fname, *next;
+
+ rbtree_postorder_for_each_entry_safe(fname, next, root, rb_hash)
while (fname) {
struct fname *old = fname;
fname = fname->next;
kfree(old);
}
- if (!parent)
- *root = RB_ROOT;
- else if (parent->rb_left == n)
- parent->rb_left = NULL;
- else if (parent->rb_right == n)
- parent->rb_right = NULL;
- n = parent;
- }
+
+ *root = RB_ROOT;
}
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index af815ea9d7c..7cc5a0e2368 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -29,7 +29,9 @@
#include <linux/wait.h>
#include <linux/blockgroup_lock.h>
#include <linux/percpu_counter.h>
+#include <linux/ratelimit.h>
#include <crypto/hash.h>
+#include <linux/falloc.h>
#ifdef __KERNEL__
#include <linux/compat.h>
#endif
@@ -156,7 +158,6 @@ struct ext4_allocation_request {
#define EXT4_MAP_MAPPED (1 << BH_Mapped)
#define EXT4_MAP_UNWRITTEN (1 << BH_Unwritten)
#define EXT4_MAP_BOUNDARY (1 << BH_Boundary)
-#define EXT4_MAP_UNINIT (1 << BH_Uninit)
/* Sometimes (in the bigalloc case, from ext4_da_get_block_prep) the caller of
* ext4_map_blocks wants to know whether or not the underlying cluster has
* already been accounted for. EXT4_MAP_FROM_CLUSTER conveys to the caller that
@@ -167,7 +168,7 @@ struct ext4_allocation_request {
#define EXT4_MAP_FROM_CLUSTER (1 << BH_AllocFromCluster)
#define EXT4_MAP_FLAGS (EXT4_MAP_NEW | EXT4_MAP_MAPPED |\
EXT4_MAP_UNWRITTEN | EXT4_MAP_BOUNDARY |\
- EXT4_MAP_UNINIT | EXT4_MAP_FROM_CLUSTER)
+ EXT4_MAP_FROM_CLUSTER)
struct ext4_map_blocks {
ext4_fsblk_t m_pblk;
@@ -182,7 +183,7 @@ struct ext4_map_blocks {
#define EXT4_IO_END_UNWRITTEN 0x0001
/*
- * For converting uninitialized extents on a work queue. 'handle' is used for
+ * For converting unwritten extents on a work queue. 'handle' is used for
* buffered writeback.
*/
typedef struct ext4_io_end {
@@ -267,6 +268,16 @@ struct ext4_io_submit {
/* Translate # of blks to # of clusters */
#define EXT4_NUM_B2C(sbi, blks) (((blks) + (sbi)->s_cluster_ratio - 1) >> \
(sbi)->s_cluster_bits)
+/* Mask out the low bits to get the starting block of the cluster */
+#define EXT4_PBLK_CMASK(s, pblk) ((pblk) & \
+ ~((ext4_fsblk_t) (s)->s_cluster_ratio - 1))
+#define EXT4_LBLK_CMASK(s, lblk) ((lblk) & \
+ ~((ext4_lblk_t) (s)->s_cluster_ratio - 1))
+/* Get the cluster offset */
+#define EXT4_PBLK_COFF(s, pblk) ((pblk) & \
+ ((ext4_fsblk_t) (s)->s_cluster_ratio - 1))
+#define EXT4_LBLK_COFF(s, lblk) ((lblk) & \
+ ((ext4_lblk_t) (s)->s_cluster_ratio - 1))
/*
* Structure of a blocks group descriptor
@@ -525,26 +536,26 @@ enum {
/*
* Flags used by ext4_map_blocks()
*/
- /* Allocate any needed blocks and/or convert an unitialized
+ /* Allocate any needed blocks and/or convert an unwritten
extent to be an initialized ext4 */
#define EXT4_GET_BLOCKS_CREATE 0x0001
- /* Request the creation of an unitialized extent */
-#define EXT4_GET_BLOCKS_UNINIT_EXT 0x0002
-#define EXT4_GET_BLOCKS_CREATE_UNINIT_EXT (EXT4_GET_BLOCKS_UNINIT_EXT|\
+ /* Request the creation of an unwritten extent */
+#define EXT4_GET_BLOCKS_UNWRIT_EXT 0x0002
+#define EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT (EXT4_GET_BLOCKS_UNWRIT_EXT|\
EXT4_GET_BLOCKS_CREATE)
/* Caller is from the delayed allocation writeout path
* finally doing the actual allocation of delayed blocks */
#define EXT4_GET_BLOCKS_DELALLOC_RESERVE 0x0004
/* caller is from the direct IO path, request to creation of an
- unitialized extents if not allocated, split the uninitialized
+ unwritten extents if not allocated, split the unwritten
extent if blocks has been preallocated already*/
#define EXT4_GET_BLOCKS_PRE_IO 0x0008
#define EXT4_GET_BLOCKS_CONVERT 0x0010
#define EXT4_GET_BLOCKS_IO_CREATE_EXT (EXT4_GET_BLOCKS_PRE_IO|\
- EXT4_GET_BLOCKS_CREATE_UNINIT_EXT)
+ EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT)
/* Convert extent to initialized after IO complete */
#define EXT4_GET_BLOCKS_IO_CONVERT_EXT (EXT4_GET_BLOCKS_CONVERT|\
- EXT4_GET_BLOCKS_CREATE_UNINIT_EXT)
+ EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT)
/* Eventual metadata allocation (due to growing extent tree)
* should not fail, so try to use reserved blocks for that.*/
#define EXT4_GET_BLOCKS_METADATA_NOFAIL 0x0020
@@ -556,6 +567,8 @@ enum {
#define EXT4_GET_BLOCKS_NO_LOCK 0x0100
/* Do not put hole in extent cache */
#define EXT4_GET_BLOCKS_NO_PUT_HOLE 0x0200
+ /* Convert written extents to unwritten */
+#define EXT4_GET_BLOCKS_CONVERT_UNWRITTEN 0x0400
/*
* The bit position of these flags must not overlap with any of the
@@ -760,6 +773,8 @@ do { \
if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime)) \
(einode)->xtime.tv_sec = \
(signed)le32_to_cpu((raw_inode)->xtime); \
+ else \
+ (einode)->xtime.tv_sec = 0; \
if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime ## _extra)) \
ext4_decode_extra_time(&(einode)->xtime, \
raw_inode->xtime ## _extra); \
@@ -860,6 +875,8 @@ struct ext4_inode_info {
struct inode vfs_inode;
struct jbd2_inode *jinode;
+ spinlock_t i_raw_lock; /* protects updates to the raw inode */
+
/*
* File creation time. Its function is same as that of
* struct timespec i_{a,c,m}time in the generic inode.
@@ -985,6 +1002,8 @@ struct ext4_inode_info {
#define EXT4_MOUNT2_STD_GROUP_SIZE 0x00000002 /* We have standard group
size of blocksize * 8
blocks */
+#define EXT4_MOUNT2_HURD_COMPAT 0x00000004 /* Support HURD-castrated
+ file systems */
#define clear_opt(sb, opt) EXT4_SB(sb)->s_mount_opt &= \
~EXT4_MOUNT_##opt
@@ -1141,7 +1160,8 @@ struct ext4_super_block {
__le32 s_usr_quota_inum; /* inode for tracking user quota */
__le32 s_grp_quota_inum; /* inode for tracking group quota */
__le32 s_overhead_clusters; /* overhead blocks/clusters in fs */
- __le32 s_reserved[108]; /* Padding to the end of the block */
+ __le32 s_backup_bgs[2]; /* groups with sparse_super2 SBs */
+ __le32 s_reserved[106]; /* Padding to the end of the block */
__le32 s_checksum; /* crc32c(superblock) */
};
@@ -1313,7 +1333,13 @@ struct ext4_sb_info {
struct list_head s_es_lru;
unsigned long s_es_last_sorted;
struct percpu_counter s_extent_cache_cnt;
+ struct mb_cache *s_mb_cache;
spinlock_t s_es_lru_lock ____cacheline_aligned_in_smp;
+
+ /* Ratelimit ext4 messages. */
+ struct ratelimit_state s_err_ratelimit_state;
+ struct ratelimit_state s_warning_ratelimit_state;
+ struct ratelimit_state s_msg_ratelimit_state;
};
static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb)
@@ -1396,7 +1422,18 @@ static inline void ext4_clear_inode_##name(struct inode *inode, int bit) \
clear_bit(bit + (offset), &EXT4_I(inode)->i_##field); \
}
+/* Add these declarations here only so that these functions can be
+ * found by name. Otherwise, they are very hard to locate. */
+static inline int ext4_test_inode_flag(struct inode *inode, int bit);
+static inline void ext4_set_inode_flag(struct inode *inode, int bit);
+static inline void ext4_clear_inode_flag(struct inode *inode, int bit);
EXT4_INODE_BIT_FNS(flag, flags, 0)
+
+/* Add these declarations here only so that these functions can be
+ * found by name. Otherwise, they are very hard to locate. */
+static inline int ext4_test_inode_state(struct inode *inode, int bit);
+static inline void ext4_set_inode_state(struct inode *inode, int bit);
+static inline void ext4_clear_inode_state(struct inode *inode, int bit);
#if (BITS_PER_LONG < 64)
EXT4_INODE_BIT_FNS(state, state_flags, 0)
@@ -1470,6 +1507,7 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
#define EXT4_FEATURE_COMPAT_EXT_ATTR 0x0008
#define EXT4_FEATURE_COMPAT_RESIZE_INODE 0x0010
#define EXT4_FEATURE_COMPAT_DIR_INDEX 0x0020
+#define EXT4_FEATURE_COMPAT_SPARSE_SUPER2 0x0200
#define EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER 0x0001
#define EXT4_FEATURE_RO_COMPAT_LARGE_FILE 0x0002
@@ -1918,10 +1956,6 @@ extern void ext4_get_group_no_and_offset(struct super_block *sb,
extern ext4_group_t ext4_get_group_number(struct super_block *sb,
ext4_fsblk_t block);
-extern void ext4_validate_block_bitmap(struct super_block *sb,
- struct ext4_group_desc *desc,
- ext4_group_t block_group,
- struct buffer_head *bh);
extern unsigned int ext4_block_group(struct super_block *sb,
ext4_fsblk_t blocknr);
extern ext4_grpblk_t ext4_block_group_offset(struct super_block *sb,
@@ -1950,16 +1984,9 @@ extern int ext4_wait_block_bitmap(struct super_block *sb,
struct buffer_head *bh);
extern struct buffer_head *ext4_read_block_bitmap(struct super_block *sb,
ext4_group_t block_group);
-extern void ext4_init_block_bitmap(struct super_block *sb,
- struct buffer_head *bh,
- ext4_group_t group,
- struct ext4_group_desc *desc);
extern unsigned ext4_free_clusters_after_init(struct super_block *sb,
ext4_group_t block_group,
struct ext4_group_desc *gdp);
-extern unsigned ext4_num_overhead_clusters(struct super_block *sb,
- ext4_group_t block_group,
- struct ext4_group_desc *gdp);
ext4_fsblk_t ext4_inode_to_goal_block(struct inode *);
/* dir.c */
@@ -2102,10 +2129,6 @@ extern int ext4_alloc_da_blocks(struct inode *inode);
extern void ext4_set_aops(struct inode *inode);
extern int ext4_writepage_trans_blocks(struct inode *);
extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks);
-extern int ext4_block_truncate_page(handle_t *handle,
- struct address_space *mapping, loff_t from);
-extern int ext4_block_zero_page_range(handle_t *handle,
- struct address_space *mapping, loff_t from, loff_t length);
extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode,
loff_t lstart, loff_t lend);
extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
@@ -2117,8 +2140,7 @@ extern void ext4_da_update_reserve_space(struct inode *inode,
extern int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
struct ext4_map_blocks *map, int flags);
extern ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,
- const struct iovec *iov, loff_t offset,
- unsigned long nr_segs);
+ struct iov_iter *iter, loff_t offset);
extern int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock);
extern int ext4_ind_trans_blocks(struct inode *inode, int nrblocks);
extern void ext4_ind_truncate(handle_t *, struct inode *inode);
@@ -2165,8 +2187,6 @@ extern int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count);
/* super.c */
extern int ext4_calculate_overhead(struct super_block *sb);
-extern int ext4_superblock_csum_verify(struct super_block *sb,
- struct ext4_super_block *es);
extern void ext4_superblock_csum_set(struct super_block *sb);
extern void *ext4_kvmalloc(size_t size, gfp_t flags);
extern void *ext4_kvzalloc(size_t size, gfp_t flags);
@@ -2433,23 +2453,6 @@ static inline void ext4_update_i_disksize(struct inode *inode, loff_t newsize)
up_write(&EXT4_I(inode)->i_data_sem);
}
-/*
- * Update i_disksize after writeback has been started. Races with truncate
- * are avoided by checking i_size under i_data_sem.
- */
-static inline void ext4_wb_update_i_disksize(struct inode *inode, loff_t newsize)
-{
- loff_t i_size;
-
- down_write(&EXT4_I(inode)->i_data_sem);
- i_size = i_size_read(inode);
- if (newsize > i_size)
- newsize = i_size;
- if (newsize > EXT4_I(inode)->i_disksize)
- EXT4_I(inode)->i_disksize = newsize;
- up_write(&EXT4_I(inode)->i_data_sem);
-}
-
struct ext4_group_info {
unsigned long bb_state;
struct rb_root bb_free_root;
@@ -2555,19 +2558,11 @@ extern const struct file_operations ext4_dir_operations;
extern const struct inode_operations ext4_file_inode_operations;
extern const struct file_operations ext4_file_operations;
extern loff_t ext4_llseek(struct file *file, loff_t offset, int origin);
-extern void ext4_unwritten_wait(struct inode *inode);
/* inline.c */
extern int ext4_has_inline_data(struct inode *inode);
-extern int ext4_get_inline_size(struct inode *inode);
extern int ext4_get_max_inline_size(struct inode *inode);
extern int ext4_find_inline_data_nolock(struct inode *inode);
-extern void ext4_write_inline_data(struct inode *inode,
- struct ext4_iloc *iloc,
- void *buffer, loff_t pos,
- unsigned int len);
-extern int ext4_prepare_inline_data(handle_t *handle, struct inode *inode,
- unsigned int len);
extern int ext4_init_inline_data(handle_t *handle, struct inode *inode,
unsigned int len);
extern int ext4_destroy_inline_data(handle_t *handle, struct inode *inode);
@@ -2728,17 +2723,18 @@ extern int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk);
extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
__u64 start, __u64 len);
extern int ext4_ext_precache(struct inode *inode);
+extern int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len);
/* move_extent.c */
extern void ext4_double_down_write_data_sem(struct inode *first,
struct inode *second);
extern void ext4_double_up_write_data_sem(struct inode *orig_inode,
struct inode *donor_inode);
-void ext4_inode_double_lock(struct inode *inode1, struct inode *inode2);
-void ext4_inode_double_unlock(struct inode *inode1, struct inode *inode2);
extern int ext4_move_extents(struct file *o_filp, struct file *d_filp,
__u64 start_orig, __u64 start_donor,
__u64 len, __u64 *moved_len);
+extern int mext_next_extent(struct inode *inode, struct ext4_ext_path *path,
+ struct ext4_extent **extent);
/* page-io.c */
extern int __init ext4_init_pageio(void);
@@ -2754,23 +2750,20 @@ extern void ext4_io_submit(struct ext4_io_submit *io);
extern int ext4_bio_write_page(struct ext4_io_submit *io,
struct page *page,
int len,
- struct writeback_control *wbc);
+ struct writeback_control *wbc,
+ bool keep_towrite);
/* mmp.c */
extern int ext4_multi_mount_protect(struct super_block *, ext4_fsblk_t);
-extern void ext4_mmp_csum_set(struct super_block *sb, struct mmp_struct *mmp);
-extern int ext4_mmp_csum_verify(struct super_block *sb,
- struct mmp_struct *mmp);
/*
* Note that these flags will never ever appear in a buffer_head's state flag.
* See EXT4_MAP_... to see where this is used.
*/
enum ext4_state_bits {
- BH_Uninit /* blocks are allocated but uninitialized on disk */
- = BH_JBDPrivateStart,
- BH_AllocFromCluster, /* allocated blocks were part of already
+ BH_AllocFromCluster /* allocated blocks were part of already
* allocated cluster. */
+ = BH_JBDPrivateStart
};
/*
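The new EXT4_*_CMASK/COFF macros in this header are plain power-of-two arithmetic on the bigalloc cluster ratio. A worked example, assuming s_cluster_ratio = 16 (so s_cluster_bits = 4):

	EXT4_PBLK_CMASK(sbi, 1000)	/* 1000 & ~15 = 992, first block of the cluster */
	EXT4_PBLK_COFF(sbi, 1000)	/* 1000 &  15 =   8, offset within the cluster  */
	EXT4_B2C(sbi, 1000)		/* 1000 >> 4  =  62, the cluster number         */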
diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
index 5074fe23f19..a867f5ca999 100644
--- a/fs/ext4/ext4_extents.h
+++ b/fs/ext4/ext4_extents.h
@@ -137,21 +137,21 @@ struct ext4_ext_path {
* EXT_INIT_MAX_LEN is the maximum number of blocks we can have in an
* initialized extent. This is 2^15 and not (2^16 - 1), since we use the
* MSB of ee_len field in the extent datastructure to signify if this
- * particular extent is an initialized extent or an uninitialized (i.e.
+ * particular extent is an initialized extent or an unwritten (i.e.
* preallocated).
- * EXT_UNINIT_MAX_LEN is the maximum number of blocks we can have in an
- * uninitialized extent.
+ * EXT_UNWRITTEN_MAX_LEN is the maximum number of blocks we can have in an
+ * unwritten extent.
* If ee_len is <= 0x8000, it is an initialized extent. Otherwise, it is an
- * uninitialized one. In other words, if MSB of ee_len is set, it is an
- * uninitialized extent with only one special scenario when ee_len = 0x8000.
- * In this case we can not have an uninitialized extent of zero length and
+ * unwritten one. In other words, if MSB of ee_len is set, it is an
+ * unwritten extent with only one special scenario when ee_len = 0x8000.
+ * In this case we can not have an unwritten extent of zero length and
* thus we make it as a special case of initialized extent with 0x8000 length.
* This way we get better extent-to-group alignment for initialized extents.
* Hence, the maximum number of blocks we can have in an *initialized*
- * extent is 2^15 (32768) and in an *uninitialized* extent is 2^15-1 (32767).
+ * extent is 2^15 (32768) and in an *unwritten* extent is 2^15-1 (32767).
*/
#define EXT_INIT_MAX_LEN (1UL << 15)
-#define EXT_UNINIT_MAX_LEN (EXT_INIT_MAX_LEN - 1)
+#define EXT_UNWRITTEN_MAX_LEN (EXT_INIT_MAX_LEN - 1)
#define EXT_FIRST_EXTENT(__hdr__) \
@@ -187,14 +187,14 @@ static inline unsigned short ext_depth(struct inode *inode)
return le16_to_cpu(ext_inode_hdr(inode)->eh_depth);
}
-static inline void ext4_ext_mark_uninitialized(struct ext4_extent *ext)
+static inline void ext4_ext_mark_unwritten(struct ext4_extent *ext)
{
- /* We can not have an uninitialized extent of zero length! */
+ /* We can not have an unwritten extent of zero length! */
BUG_ON((le16_to_cpu(ext->ee_len) & ~EXT_INIT_MAX_LEN) == 0);
ext->ee_len |= cpu_to_le16(EXT_INIT_MAX_LEN);
}
-static inline int ext4_ext_is_uninitialized(struct ext4_extent *ext)
+static inline int ext4_ext_is_unwritten(struct ext4_extent *ext)
{
/* Extent with ee_len of 0x8000 is treated as an initialized extent */
return (le16_to_cpu(ext->ee_len) > EXT_INIT_MAX_LEN);
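The uninitialized -> unwritten renaming in this header changes nothing on disk: bit 15 of ee_len still flags an unwritten extent, which is why an unwritten extent tops out one block short of an initialized one. A few concrete ee_len values, decoded as ext4_ext_get_actual_len() would:

	ee_len = 0x0001  ->  initialized,     1 block
	ee_len = 0x8000  ->  initialized, 32768 blocks (the special case above)
	ee_len = 0x8001  ->  unwritten,       1 block
	ee_len = 0xFFFF  ->  unwritten,   32767 blocks (EXT_UNWRITTEN_MAX_LEN)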
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index 17ac112ab10..0074e0d23d6 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -122,9 +122,10 @@ handle_t *__ext4_journal_start_reserved(handle_t *handle, unsigned int line,
return handle;
}
-void ext4_journal_abort_handle(const char *caller, unsigned int line,
- const char *err_fn, struct buffer_head *bh,
- handle_t *handle, int err)
+static void ext4_journal_abort_handle(const char *caller, unsigned int line,
+ const char *err_fn,
+ struct buffer_head *bh,
+ handle_t *handle, int err)
{
char nbuf[16];
const char *errstr = ext4_decode_error(NULL, err, nbuf);
@@ -259,6 +260,25 @@ int __ext4_handle_dirty_metadata(const char *where, unsigned int line,
if (WARN_ON_ONCE(err)) {
ext4_journal_abort_handle(where, line, __func__, bh,
handle, err);
+ if (inode == NULL) {
+ pr_err("EXT4: jbd2_journal_dirty_metadata "
+ "failed: handle type %u started at "
+ "line %u, credits %u/%u, errcode %d",
+ handle->h_type,
+ handle->h_line_no,
+ handle->h_requested_credits,
+ handle->h_buffer_credits, err);
+ return err;
+ }
+ ext4_error_inode(inode, where, line,
+ bh->b_blocknr,
+ "journal_dirty_metadata failed: "
+ "handle type %u started at line %u, "
+ "credits %u/%u, errcode %d",
+ handle->h_type,
+ handle->h_line_no,
+ handle->h_requested_credits,
+ handle->h_buffer_credits, err);
}
} else {
if (inode)
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index 81cfefa9dc0..17c00ff202f 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -231,10 +231,6 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode);
/*
* Wrapper functions with which ext4 calls into JBD.
*/
-void ext4_journal_abort_handle(const char *caller, unsigned int line,
- const char *err_fn,
- struct buffer_head *bh, handle_t *handle, int err);
-
int __ext4_journal_get_write_access(const char *where, unsigned int line,
handle_t *handle, struct buffer_head *bh);
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 54d52afcdb1..4da228a0e6d 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -37,7 +37,6 @@
#include <linux/quotaops.h>
#include <linux/string.h>
#include <linux/slab.h>
-#include <linux/falloc.h>
#include <asm/uaccess.h>
#include <linux/fiemap.h>
#include "ext4_jbd2.h"
@@ -51,8 +50,8 @@
*/
#define EXT4_EXT_MAY_ZEROOUT 0x1 /* safe to zeroout if split fails \
due to ENOSPC */
-#define EXT4_EXT_MARK_UNINIT1 0x2 /* mark first half uninitialized */
-#define EXT4_EXT_MARK_UNINIT2 0x4 /* mark second half uninitialized */
+#define EXT4_EXT_MARK_UNWRIT1 0x2 /* mark first half unwritten */
+#define EXT4_EXT_MARK_UNWRIT2 0x4 /* mark second half unwritten */
#define EXT4_EXT_DATA_VALID1 0x8 /* first half contains valid data */
#define EXT4_EXT_DATA_VALID2 0x10 /* second half contains valid data */
@@ -144,6 +143,7 @@ static int ext4_ext_get_access(handle_t *handle, struct inode *inode,
{
if (path->p_bh) {
/* path points to block */
+ BUFFER_TRACE(path->p_bh, "get_write_access");
return ext4_journal_get_write_access(handle, path->p_bh);
}
/* path points to leaf/index in inode body */
@@ -360,8 +360,10 @@ static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext)
{
ext4_fsblk_t block = ext4_ext_pblock(ext);
int len = ext4_ext_get_actual_len(ext);
+ ext4_lblk_t lblock = le32_to_cpu(ext->ee_block);
+ ext4_lblk_t last = lblock + len - 1;
- if (len == 0)
+ if (lblock > last)
return 0;
return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, len);
}
@@ -387,11 +389,26 @@ static int ext4_valid_extent_entries(struct inode *inode,
if (depth == 0) {
/* leaf entries */
struct ext4_extent *ext = EXT_FIRST_EXTENT(eh);
+ struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
+ ext4_fsblk_t pblock = 0;
+ ext4_lblk_t lblock = 0;
+ ext4_lblk_t prev = 0;
+ int len = 0;
while (entries) {
if (!ext4_valid_extent(inode, ext))
return 0;
+
+ /* Check for overlapping extents */
+ lblock = le32_to_cpu(ext->ee_block);
+ len = ext4_ext_get_actual_len(ext);
+ if ((lblock <= prev) && prev) {
+ pblock = ext4_ext_pblock(ext);
+ es->s_last_error_block = cpu_to_le64(pblock);
+ return 0;
+ }
ext++;
entries--;
+ prev = lblock + len - 1;
}
} else {
struct ext4_extent_idx *ext_idx = EXT_FIRST_INDEX(eh);
@@ -508,7 +525,7 @@ __read_extent_tree_block(const char *function, unsigned int line,
lblk - prev, ~0,
EXTENT_STATUS_HOLE);
- if (ext4_ext_is_uninitialized(ex))
+ if (ext4_ext_is_unwritten(ex))
status = EXTENT_STATUS_UNWRITTEN;
ext4_es_cache_extent(inode, lblk, len,
ext4_ext_pblock(ex), status);
@@ -604,7 +621,7 @@ static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path)
} else if (path->p_ext) {
ext_debug(" %d:[%d]%d:%llu ",
le32_to_cpu(path->p_ext->ee_block),
- ext4_ext_is_uninitialized(path->p_ext),
+ ext4_ext_is_unwritten(path->p_ext),
ext4_ext_get_actual_len(path->p_ext),
ext4_ext_pblock(path->p_ext));
} else
@@ -630,7 +647,7 @@ static void ext4_ext_show_leaf(struct inode *inode, struct ext4_ext_path *path)
for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ex++) {
ext_debug("%d:[%d]%d:%llu ", le32_to_cpu(ex->ee_block),
- ext4_ext_is_uninitialized(ex),
+ ext4_ext_is_unwritten(ex),
ext4_ext_get_actual_len(ex), ext4_ext_pblock(ex));
}
ext_debug("\n");
@@ -661,7 +678,7 @@ static void ext4_ext_show_move(struct inode *inode, struct ext4_ext_path *path,
ext_debug("move %d:%llu:[%d]%d in new leaf %llu\n",
le32_to_cpu(ex->ee_block),
ext4_ext_pblock(ex),
- ext4_ext_is_uninitialized(ex),
+ ext4_ext_is_unwritten(ex),
ext4_ext_get_actual_len(ex),
newblock);
ex++;
@@ -786,7 +803,7 @@ ext4_ext_binsearch(struct inode *inode,
ext_debug(" -> %d:%llu:[%d]%d ",
le32_to_cpu(path->p_ext->ee_block),
ext4_ext_pblock(path->p_ext),
- ext4_ext_is_uninitialized(path->p_ext),
+ ext4_ext_is_unwritten(path->p_ext),
ext4_ext_get_actual_len(path->p_ext));
#ifdef CHECK_BINSEARCH
@@ -1666,22 +1683,17 @@ int
ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
struct ext4_extent *ex2)
{
- unsigned short ext1_ee_len, ext2_ee_len, max_len;
+ unsigned short ext1_ee_len, ext2_ee_len;
/*
* Make sure that both extents are initialized. We don't merge
- * uninitialized extents so that we can be sure that end_io code has
+ * unwritten extents so that we can be sure that end_io code has
* the extent that was written properly split out and conversion to
* initialized is trivial.
*/
- if (ext4_ext_is_uninitialized(ex1) || ext4_ext_is_uninitialized(ex2))
+ if (ext4_ext_is_unwritten(ex1) != ext4_ext_is_unwritten(ex2))
return 0;
- if (ext4_ext_is_uninitialized(ex1))
- max_len = EXT_UNINIT_MAX_LEN;
- else
- max_len = EXT_INIT_MAX_LEN;
-
ext1_ee_len = ext4_ext_get_actual_len(ex1);
ext2_ee_len = ext4_ext_get_actual_len(ex2);
@@ -1694,7 +1706,12 @@ ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
* as an RO_COMPAT feature, refuse to merge to extents if
* this can result in the top bit of ee_len being set.
*/
- if (ext1_ee_len + ext2_ee_len > max_len)
+ if (ext1_ee_len + ext2_ee_len > EXT_INIT_MAX_LEN)
+ return 0;
+ if (ext4_ext_is_unwritten(ex1) &&
+ (ext4_test_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN) ||
+ atomic_read(&EXT4_I(inode)->i_unwritten) ||
+ (ext1_ee_len + ext2_ee_len > EXT_UNWRITTEN_MAX_LEN)))
return 0;
#ifdef AGGRESSIVE_TEST
if (ext1_ee_len >= 4)
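The reworked merge test is easiest to check with numbers. Two initialized extents may merge as long as the combined length does not exceed EXT_INIT_MAX_LEN; two unwritten extents must stay strictly below it, since bit 15 is the unwritten flag itself (and unwritten merges are also refused while conversions are in flight). Ignoring the AGGRESSIVE_TEST clamp, for example:

	/* initialized + initialized */
	20000 + 12768 = 32768	/* == EXT_INIT_MAX_LEN, merge allowed     */
	20000 + 12769 = 32769	/* would spill into bit 15, merge refused */

	/* unwritten + unwritten */
	20000 + 12767 = 32767	/* == EXT_UNWRITTEN_MAX_LEN, merge allowed */
	20000 + 12768 = 32768	/* merge refused                           */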
@@ -1719,8 +1736,7 @@ static int ext4_ext_try_to_merge_right(struct inode *inode,
{
struct ext4_extent_header *eh;
unsigned int depth, len;
- int merge_done = 0;
- int uninitialized = 0;
+ int merge_done = 0, unwritten;
depth = ext_depth(inode);
BUG_ON(path[depth].p_hdr == NULL);
@@ -1730,12 +1746,11 @@ static int ext4_ext_try_to_merge_right(struct inode *inode,
if (!ext4_can_extents_be_merged(inode, ex, ex + 1))
break;
/* merge with next extent! */
- if (ext4_ext_is_uninitialized(ex))
- uninitialized = 1;
+ unwritten = ext4_ext_is_unwritten(ex);
ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
+ ext4_ext_get_actual_len(ex + 1));
- if (uninitialized)
- ext4_ext_mark_uninitialized(ex);
+ if (unwritten)
+ ext4_ext_mark_unwritten(ex);
if (ex + 1 < EXT_LAST_EXTENT(eh)) {
len = (EXT_LAST_EXTENT(eh) - ex - 1)
@@ -1844,8 +1859,7 @@ static unsigned int ext4_ext_check_overlap(struct ext4_sb_info *sbi,
depth = ext_depth(inode);
if (!path[depth].p_ext)
goto out;
- b2 = le32_to_cpu(path[depth].p_ext->ee_block);
- b2 &= ~(sbi->s_cluster_ratio - 1);
+ b2 = EXT4_LBLK_CMASK(sbi, le32_to_cpu(path[depth].p_ext->ee_block));
/*
* get the next allocated block if the extent in the path
@@ -1855,7 +1869,7 @@ static unsigned int ext4_ext_check_overlap(struct ext4_sb_info *sbi,
b2 = ext4_ext_next_allocated_block(path);
if (b2 == EXT_MAX_BLOCKS)
goto out;
- b2 &= ~(sbi->s_cluster_ratio - 1);
+ b2 = EXT4_LBLK_CMASK(sbi, b2);
}
/* check for wrap through zero on extent logical start block*/
@@ -1890,8 +1904,7 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
struct ext4_ext_path *npath = NULL;
int depth, len, err;
ext4_lblk_t next;
- unsigned uninitialized = 0;
- int mb_flags = 0;
+ int mb_flags = 0, unwritten;
if (unlikely(ext4_ext_get_actual_len(newext) == 0)) {
EXT4_ERROR_INODE(inode, "ext4_ext_get_actual_len(newext) == 0");
@@ -1931,29 +1944,21 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
if (ext4_can_extents_be_merged(inode, ex, newext)) {
ext_debug("append [%d]%d block to %u:[%d]%d"
"(from %llu)\n",
- ext4_ext_is_uninitialized(newext),
+ ext4_ext_is_unwritten(newext),
ext4_ext_get_actual_len(newext),
le32_to_cpu(ex->ee_block),
- ext4_ext_is_uninitialized(ex),
+ ext4_ext_is_unwritten(ex),
ext4_ext_get_actual_len(ex),
ext4_ext_pblock(ex));
err = ext4_ext_get_access(handle, inode,
path + depth);
if (err)
return err;
-
- /*
- * ext4_can_extents_be_merged should have checked
- * that either both extents are uninitialized, or
- * both aren't. Thus we need to check only one of
- * them here.
- */
- if (ext4_ext_is_uninitialized(ex))
- uninitialized = 1;
+ unwritten = ext4_ext_is_unwritten(ex);
ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
+ ext4_ext_get_actual_len(newext));
- if (uninitialized)
- ext4_ext_mark_uninitialized(ex);
+ if (unwritten)
+ ext4_ext_mark_unwritten(ex);
eh = path[depth].p_hdr;
nearex = ex;
goto merge;
@@ -1965,10 +1970,10 @@ prepend:
ext_debug("prepend %u[%d]%d block to %u:[%d]%d"
"(from %llu)\n",
le32_to_cpu(newext->ee_block),
- ext4_ext_is_uninitialized(newext),
+ ext4_ext_is_unwritten(newext),
ext4_ext_get_actual_len(newext),
le32_to_cpu(ex->ee_block),
- ext4_ext_is_uninitialized(ex),
+ ext4_ext_is_unwritten(ex),
ext4_ext_get_actual_len(ex),
ext4_ext_pblock(ex));
err = ext4_ext_get_access(handle, inode,
@@ -1976,20 +1981,13 @@ prepend:
if (err)
return err;
- /*
- * ext4_can_extents_be_merged should have checked
- * that either both extents are uninitialized, or
- * both aren't. Thus we need to check only one of
- * them here.
- */
- if (ext4_ext_is_uninitialized(ex))
- uninitialized = 1;
+ unwritten = ext4_ext_is_unwritten(ex);
ex->ee_block = newext->ee_block;
ext4_ext_store_pblock(ex, ext4_ext_pblock(newext));
ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
+ ext4_ext_get_actual_len(newext));
- if (uninitialized)
- ext4_ext_mark_uninitialized(ex);
+ if (unwritten)
+ ext4_ext_mark_unwritten(ex);
eh = path[depth].p_hdr;
nearex = ex;
goto merge;
@@ -2049,7 +2047,7 @@ has_space:
ext_debug("first extent in the leaf: %u:%llu:[%d]%d\n",
le32_to_cpu(newext->ee_block),
ext4_ext_pblock(newext),
- ext4_ext_is_uninitialized(newext),
+ ext4_ext_is_unwritten(newext),
ext4_ext_get_actual_len(newext));
nearex = EXT_FIRST_EXTENT(eh);
} else {
@@ -2060,7 +2058,7 @@ has_space:
"nearest %p\n",
le32_to_cpu(newext->ee_block),
ext4_ext_pblock(newext),
- ext4_ext_is_uninitialized(newext),
+ ext4_ext_is_unwritten(newext),
ext4_ext_get_actual_len(newext),
nearex);
nearex++;
@@ -2071,7 +2069,7 @@ has_space:
"nearest %p\n",
le32_to_cpu(newext->ee_block),
ext4_ext_pblock(newext),
- ext4_ext_is_uninitialized(newext),
+ ext4_ext_is_unwritten(newext),
ext4_ext_get_actual_len(newext),
nearex);
}
@@ -2081,7 +2079,7 @@ has_space:
"move %d extents from 0x%p to 0x%p\n",
le32_to_cpu(newext->ee_block),
ext4_ext_pblock(newext),
- ext4_ext_is_uninitialized(newext),
+ ext4_ext_is_unwritten(newext),
ext4_ext_get_actual_len(newext),
len, nearex, nearex + 1);
memmove(nearex + 1, nearex,
@@ -2203,7 +2201,7 @@ static int ext4_fill_fiemap_extents(struct inode *inode,
es.es_lblk = le32_to_cpu(ex->ee_block);
es.es_len = ext4_ext_get_actual_len(ex);
es.es_pblk = ext4_ext_pblock(ex);
- if (ext4_ext_is_uninitialized(ex))
+ if (ext4_ext_is_unwritten(ex))
flags |= FIEMAP_EXTENT_UNWRITTEN;
}
@@ -2535,7 +2533,7 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
* extent, we have to mark the cluster as used (store negative
* cluster number in partial_cluster).
*/
- unaligned = pblk & (sbi->s_cluster_ratio - 1);
+ unaligned = EXT4_PBLK_COFF(sbi, pblk);
if (unaligned && (ee_len == num) &&
(*partial_cluster != -((long long)EXT4_B2C(sbi, pblk))))
*partial_cluster = EXT4_B2C(sbi, pblk);
@@ -2579,7 +2577,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
unsigned num;
ext4_lblk_t ex_ee_block;
unsigned short ex_ee_len;
- unsigned uninitialized = 0;
+ unsigned unwritten = 0;
struct ext4_extent *ex;
ext4_fsblk_t pblk;
@@ -2600,18 +2598,39 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
ex_ee_block = le32_to_cpu(ex->ee_block);
ex_ee_len = ext4_ext_get_actual_len(ex);
+ /*
+ * If we're starting with an extent other than the last one in the
+ * node, we need to see if it shares a cluster with the extent to
+ * the right (towards the end of the file). If its leftmost cluster
+ * is this extent's rightmost cluster and it is not cluster aligned,
+ * we'll mark it as a partial that is not to be deallocated.
+ */
+
+ if (ex != EXT_LAST_EXTENT(eh)) {
+ ext4_fsblk_t current_pblk, right_pblk;
+ long long current_cluster, right_cluster;
+
+ current_pblk = ext4_ext_pblock(ex) + ex_ee_len - 1;
+ current_cluster = (long long)EXT4_B2C(sbi, current_pblk);
+ right_pblk = ext4_ext_pblock(ex + 1);
+ right_cluster = (long long)EXT4_B2C(sbi, right_pblk);
+ if (current_cluster == right_cluster &&
+ EXT4_PBLK_COFF(sbi, right_pblk))
+ *partial_cluster = -right_cluster;
+ }
+
trace_ext4_ext_rm_leaf(inode, start, ex, *partial_cluster);
while (ex >= EXT_FIRST_EXTENT(eh) &&
ex_ee_block + ex_ee_len > start) {
- if (ext4_ext_is_uninitialized(ex))
- uninitialized = 1;
+ if (ext4_ext_is_unwritten(ex))
+ unwritten = 1;
else
- uninitialized = 0;
+ unwritten = 0;
ext_debug("remove ext %u:[%d]%d\n", ex_ee_block,
- uninitialized, ex_ee_len);
+ unwritten, ex_ee_len);
path[depth].p_ext = ex;
a = ex_ee_block > start ? ex_ee_block : start;
@@ -2629,7 +2648,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
* accidentally freeing it later on
*/
pblk = ext4_ext_pblock(ex);
- if (pblk & (sbi->s_cluster_ratio - 1))
+ if (EXT4_PBLK_COFF(sbi, pblk))
*partial_cluster =
-((long long)EXT4_B2C(sbi, pblk));
ex--;
@@ -2683,11 +2702,11 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
ex->ee_len = cpu_to_le16(num);
/*
- * Do not mark uninitialized if all the blocks in the
+ * Do not mark unwritten if all the blocks in the
* extent have been removed.
*/
- if (uninitialized && num)
- ext4_ext_mark_uninitialized(ex);
+ if (unwritten && num)
+ ext4_ext_mark_unwritten(ex);
/*
* If the extent was completely released,
* we need to remove it from the leaf
@@ -2725,10 +2744,15 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
err = ext4_ext_correct_indexes(handle, inode, path);
/*
- * Free the partial cluster only if the current extent does not
- * reference it. Otherwise we might free used cluster.
+ * If there's a partial cluster and at least one extent remains in
+ * the leaf, free the partial cluster if it isn't shared with the
+ * current extent. If there's a partial cluster and no extents
+ * remain in the leaf, it can't be freed here. It can only be
+ * freed when it's possible to determine if it's not shared with
+ * any other extent - when the next leaf is processed or when space
+ * removal is complete.
*/
- if (*partial_cluster > 0 &&
+ if (*partial_cluster > 0 && eh->eh_entries &&
(EXT4_B2C(sbi, ext4_ext_pblock(ex) + ex_ee_len - 1) !=
*partial_cluster)) {
int flags = get_default_free_blocks_flags(inode);
@@ -2831,9 +2855,9 @@ again:
end < ee_block + ext4_ext_get_actual_len(ex) - 1) {
int split_flag = 0;
- if (ext4_ext_is_uninitialized(ex))
- split_flag = EXT4_EXT_MARK_UNINIT1 |
- EXT4_EXT_MARK_UNINIT2;
+ if (ext4_ext_is_unwritten(ex))
+ split_flag = EXT4_EXT_MARK_UNWRIT1 |
+ EXT4_EXT_MARK_UNWRIT2;
/*
* Split the extent in two so that 'end' is the last
@@ -3090,7 +3114,7 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
* @path: the path to the extent
* @split: the logical block where the extent is splitted.
* @split_flags: indicates if the extent could be zeroout if split fails, and
- * the states(init or uninit) of new extents.
+ * the states(init or unwritten) of new extents.
* @flags: flags used to insert new extent to extent tree.
*
*
@@ -3132,10 +3156,10 @@ static int ext4_split_extent_at(handle_t *handle,
newblock = split - ee_block + ext4_ext_pblock(ex);
BUG_ON(split < ee_block || split >= (ee_block + ee_len));
- BUG_ON(!ext4_ext_is_uninitialized(ex) &&
+ BUG_ON(!ext4_ext_is_unwritten(ex) &&
split_flag & (EXT4_EXT_MAY_ZEROOUT |
- EXT4_EXT_MARK_UNINIT1 |
- EXT4_EXT_MARK_UNINIT2));
+ EXT4_EXT_MARK_UNWRIT1 |
+ EXT4_EXT_MARK_UNWRIT2));
err = ext4_ext_get_access(handle, inode, path + depth);
if (err)
@@ -3147,8 +3171,8 @@ static int ext4_split_extent_at(handle_t *handle,
* then we just change the state of the extent, and splitting
* is not needed.
*/
- if (split_flag & EXT4_EXT_MARK_UNINIT2)
- ext4_ext_mark_uninitialized(ex);
+ if (split_flag & EXT4_EXT_MARK_UNWRIT2)
+ ext4_ext_mark_unwritten(ex);
else
ext4_ext_mark_initialized(ex);
@@ -3162,8 +3186,8 @@ static int ext4_split_extent_at(handle_t *handle,
/* case a */
memcpy(&orig_ex, ex, sizeof(orig_ex));
ex->ee_len = cpu_to_le16(split - ee_block);
- if (split_flag & EXT4_EXT_MARK_UNINIT1)
- ext4_ext_mark_uninitialized(ex);
+ if (split_flag & EXT4_EXT_MARK_UNWRIT1)
+ ext4_ext_mark_unwritten(ex);
/*
* path may lead to new leaf, not to original leaf any more
@@ -3177,8 +3201,8 @@ static int ext4_split_extent_at(handle_t *handle,
ex2->ee_block = cpu_to_le32(split);
ex2->ee_len = cpu_to_le16(ee_len - (split - ee_block));
ext4_ext_store_pblock(ex2, newblock);
- if (split_flag & EXT4_EXT_MARK_UNINIT2)
- ext4_ext_mark_uninitialized(ex2);
+ if (split_flag & EXT4_EXT_MARK_UNWRIT2)
+ ext4_ext_mark_unwritten(ex2);
err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
if (err == -ENOSPC && (EXT4_EXT_MAY_ZEROOUT & split_flag)) {
@@ -3255,7 +3279,7 @@ static int ext4_split_extent(handle_t *handle,
struct ext4_extent *ex;
unsigned int ee_len, depth;
int err = 0;
- int uninitialized;
+ int unwritten;
int split_flag1, flags1;
int allocated = map->m_len;
@@ -3263,14 +3287,14 @@ static int ext4_split_extent(handle_t *handle,
ex = path[depth].p_ext;
ee_block = le32_to_cpu(ex->ee_block);
ee_len = ext4_ext_get_actual_len(ex);
- uninitialized = ext4_ext_is_uninitialized(ex);
+ unwritten = ext4_ext_is_unwritten(ex);
if (map->m_lblk + map->m_len < ee_block + ee_len) {
split_flag1 = split_flag & EXT4_EXT_MAY_ZEROOUT;
flags1 = flags | EXT4_GET_BLOCKS_PRE_IO;
- if (uninitialized)
- split_flag1 |= EXT4_EXT_MARK_UNINIT1 |
- EXT4_EXT_MARK_UNINIT2;
+ if (unwritten)
+ split_flag1 |= EXT4_EXT_MARK_UNWRIT1 |
+ EXT4_EXT_MARK_UNWRIT2;
if (split_flag & EXT4_EXT_DATA_VALID2)
split_flag1 |= EXT4_EXT_DATA_VALID1;
err = ext4_split_extent_at(handle, inode, path,
@@ -3290,15 +3314,20 @@ static int ext4_split_extent(handle_t *handle,
return PTR_ERR(path);
depth = ext_depth(inode);
ex = path[depth].p_ext;
- uninitialized = ext4_ext_is_uninitialized(ex);
+ if (!ex) {
+ EXT4_ERROR_INODE(inode, "unexpected hole at %lu",
+ (unsigned long) map->m_lblk);
+ return -EIO;
+ }
+ unwritten = ext4_ext_is_unwritten(ex);
split_flag1 = 0;
if (map->m_lblk >= ee_block) {
split_flag1 = split_flag & EXT4_EXT_DATA_VALID2;
- if (uninitialized) {
- split_flag1 |= EXT4_EXT_MARK_UNINIT1;
+ if (unwritten) {
+ split_flag1 |= EXT4_EXT_MARK_UNWRIT1;
split_flag1 |= split_flag & (EXT4_EXT_MAY_ZEROOUT |
- EXT4_EXT_MARK_UNINIT2);
+ EXT4_EXT_MARK_UNWRIT2);
}
err = ext4_split_extent_at(handle, inode, path,
map->m_lblk, split_flag1, flags);
@@ -3313,16 +3342,16 @@ out:
/*
* This function is called by ext4_ext_map_blocks() if someone tries to write
- * to an uninitialized extent. It may result in splitting the uninitialized
+ * to an unwritten extent. It may result in splitting the unwritten
* extent into multiple extents (up to three - one initialized and two
- * uninitialized).
+ * unwritten).
* There are three possibilities:
* a> There is no split required: Entire extent should be initialized
* b> Splits in two extents: Write is happening at either end of the extent
 * c> Splits in three extents: Someone is writing in the middle of the extent
*
* Pre-conditions:
- * - The extent pointed to by 'path' is uninitialized.
+ * - The extent pointed to by 'path' is unwritten.
* - The extent pointed to by 'path' contains a superset
* of the logical span [map->m_lblk, map->m_lblk + map->m_len).
*
@@ -3368,12 +3397,12 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
trace_ext4_ext_convert_to_initialized_enter(inode, map, ex);
/* Pre-conditions */
- BUG_ON(!ext4_ext_is_uninitialized(ex));
+ BUG_ON(!ext4_ext_is_unwritten(ex));
BUG_ON(!in_range(map->m_lblk, ee_block, ee_len));
/*
* Attempt to transfer newly initialized blocks from the currently
- * uninitialized extent to its neighbor. This is much cheaper
+ * unwritten extent to its neighbor. This is much cheaper
* than an insertion followed by a merge as those involve costly
* memmove() calls. Transferring to the left is the common case in
* steady state for workloads doing fallocate(FALLOC_FL_KEEP_SIZE)
@@ -3409,7 +3438,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
* - C4: abut_ex can receive the additional blocks without
* overflowing the (initialized) length limit.
*/
- if ((!ext4_ext_is_uninitialized(abut_ex)) && /*C1*/
+ if ((!ext4_ext_is_unwritten(abut_ex)) && /*C1*/
((prev_lblk + prev_len) == ee_block) && /*C2*/
((prev_pblk + prev_len) == ee_pblk) && /*C3*/
(prev_len < (EXT_INIT_MAX_LEN - map_len))) { /*C4*/
@@ -3424,7 +3453,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
ex->ee_block = cpu_to_le32(ee_block + map_len);
ext4_ext_store_pblock(ex, ee_pblk + map_len);
ex->ee_len = cpu_to_le16(ee_len - map_len);
- ext4_ext_mark_uninitialized(ex); /* Restore the flag */
+ ext4_ext_mark_unwritten(ex); /* Restore the flag */
/* Extend abut_ex by 'map_len' blocks */
abut_ex->ee_len = cpu_to_le16(prev_len + map_len);
@@ -3455,7 +3484,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
* - C4: abut_ex can receive the additional blocks without
* overflowing the (initialized) length limit.
*/
- if ((!ext4_ext_is_uninitialized(abut_ex)) && /*C1*/
+ if ((!ext4_ext_is_unwritten(abut_ex)) && /*C1*/
((map->m_lblk + map_len) == next_lblk) && /*C2*/
((ee_pblk + ee_len) == next_pblk) && /*C3*/
(next_len < (EXT_INIT_MAX_LEN - map_len))) { /*C4*/
@@ -3470,7 +3499,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
abut_ex->ee_block = cpu_to_le32(next_lblk - map_len);
ext4_ext_store_pblock(abut_ex, next_pblk - map_len);
ex->ee_len = cpu_to_le16(ee_len - map_len);
- ext4_ext_mark_uninitialized(ex); /* Restore the flag */
+ ext4_ext_mark_unwritten(ex); /* Restore the flag */
/* Extend abut_ex by 'map_len' blocks */
abut_ex->ee_len = cpu_to_le16(next_len + map_len);
@@ -3492,7 +3521,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
WARN_ON(map->m_lblk < ee_block);
/*
* It is safe to convert extent to initialized via explicit
- * zeroout only if extent is fully insde i_size or new_size.
+ * zeroout only if extent is fully inside i_size or new_size.
*/
split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0;
@@ -3575,26 +3604,28 @@ out:
/*
* This function is called by ext4_ext_map_blocks() from
* ext4_get_blocks_dio_write() when DIO to write
- * to an uninitialized extent.
+ * to an unwritten extent.
*
- * Writing to an uninitialized extent may result in splitting the uninitialized
- * extent into multiple initialized/uninitialized extents (up to three)
+ * Writing to an unwritten extent may result in splitting the unwritten
+ * extent into multiple initialized/unwritten extents (up to three)
* There are three possibilities:
- * a> There is no split required: Entire extent should be uninitialized
+ * a> There is no split required: Entire extent should be unwritten
* b> Splits in two extents: Write is happening at either end of the extent
 * c> Splits in three extents: Someone is writing in the middle of the extent
*
+ * This works the same way in the case of initialized -> unwritten conversion.
+ *
 * One or more index blocks may be needed if the extent tree grows after
- * the uninitialized extent split. To prevent ENOSPC occur at the IO
- * complete, we need to split the uninitialized extent before DIO submit
- * the IO. The uninitialized extent called at this time will be split
- * into three uninitialized extent(at most). After IO complete, the part
+ * the unwritten extent split. To prevent ENOSPC from occurring when
+ * the IO completes, we need to split the unwritten extent before
+ * submitting the IO. The unwritten extent will be split into at most
+ * three unwritten extents. After the IO completes, the part
 * being filled will be converted to initialized by the end_io callback function
* via ext4_convert_unwritten_extents().
*
- * Returns the size of uninitialized extent to be written on success.
+ * Returns the size of unwritten extent to be written on success.
*/
-static int ext4_split_unwritten_extents(handle_t *handle,
+static int ext4_split_convert_extents(handle_t *handle,
struct inode *inode,
struct ext4_map_blocks *map,
struct ext4_ext_path *path,
@@ -3606,9 +3637,9 @@ static int ext4_split_unwritten_extents(handle_t *handle,
unsigned int ee_len;
int split_flag = 0, depth;
- ext_debug("ext4_split_unwritten_extents: inode %lu, logical"
- "block %llu, max_blocks %u\n", inode->i_ino,
- (unsigned long long)map->m_lblk, map->m_len);
+ ext_debug("%s: inode %lu, logical block %llu, max_blocks %u\n",
+ __func__, inode->i_ino,
+ (unsigned long long)map->m_lblk, map->m_len);
eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >>
inode->i_sb->s_blocksize_bits;
@@ -3623,14 +3654,79 @@ static int ext4_split_unwritten_extents(handle_t *handle,
ee_block = le32_to_cpu(ex->ee_block);
ee_len = ext4_ext_get_actual_len(ex);
- split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0;
- split_flag |= EXT4_EXT_MARK_UNINIT2;
- if (flags & EXT4_GET_BLOCKS_CONVERT)
- split_flag |= EXT4_EXT_DATA_VALID2;
+ /* Convert to unwritten */
+ if (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN) {
+ split_flag |= EXT4_EXT_DATA_VALID1;
+ /* Convert to initialized */
+ } else if (flags & EXT4_GET_BLOCKS_CONVERT) {
+ split_flag |= ee_block + ee_len <= eof_block ?
+ EXT4_EXT_MAY_ZEROOUT : 0;
+ split_flag |= (EXT4_EXT_MARK_UNWRIT2 | EXT4_EXT_DATA_VALID2);
+ }
flags |= EXT4_GET_BLOCKS_PRE_IO;
return ext4_split_extent(handle, inode, path, map, split_flag, flags);
}
+static int ext4_convert_initialized_extents(handle_t *handle,
+ struct inode *inode,
+ struct ext4_map_blocks *map,
+ struct ext4_ext_path *path)
+{
+ struct ext4_extent *ex;
+ ext4_lblk_t ee_block;
+ unsigned int ee_len;
+ int depth;
+ int err = 0;
+
+ depth = ext_depth(inode);
+ ex = path[depth].p_ext;
+ ee_block = le32_to_cpu(ex->ee_block);
+ ee_len = ext4_ext_get_actual_len(ex);
+
+ ext_debug("%s: inode %lu, logical"
+ "block %llu, max_blocks %u\n", __func__, inode->i_ino,
+ (unsigned long long)ee_block, ee_len);
+
+ if (ee_block != map->m_lblk || ee_len > map->m_len) {
+ err = ext4_split_convert_extents(handle, inode, map, path,
+ EXT4_GET_BLOCKS_CONVERT_UNWRITTEN);
+ if (err < 0)
+ goto out;
+ ext4_ext_drop_refs(path);
+ path = ext4_ext_find_extent(inode, map->m_lblk, path, 0);
+ if (IS_ERR(path)) {
+ err = PTR_ERR(path);
+ goto out;
+ }
+ depth = ext_depth(inode);
+ ex = path[depth].p_ext;
+ if (!ex) {
+ EXT4_ERROR_INODE(inode, "unexpected hole at %lu",
+ (unsigned long) map->m_lblk);
+ err = -EIO;
+ goto out;
+ }
+ }
+
+ err = ext4_ext_get_access(handle, inode, path + depth);
+ if (err)
+ goto out;
+ /* first mark the extent as unwritten */
+ ext4_ext_mark_unwritten(ex);
+
+ /* note: ext4_ext_correct_indexes() isn't needed here because
+ * borders are not changed
+ */
+ ext4_ext_try_to_merge(handle, inode, path, ex);
+
+ /* Mark modified extent as dirty */
+ err = ext4_ext_dirty(handle, inode, path + path->p_depth);
+out:
+ ext4_ext_show_leaf(inode, path);
+ return err;
+}
+
+
static int ext4_convert_unwritten_extents_endio(handle_t *handle,
struct inode *inode,
struct ext4_map_blocks *map,
@@ -3664,8 +3760,8 @@ static int ext4_convert_unwritten_extents_endio(handle_t *handle,
inode->i_ino, (unsigned long long)ee_block, ee_len,
(unsigned long long)map->m_lblk, map->m_len);
#endif
- err = ext4_split_unwritten_extents(handle, inode, map, path,
- EXT4_GET_BLOCKS_CONVERT);
+ err = ext4_split_convert_extents(handle, inode, map, path,
+ EXT4_GET_BLOCKS_CONVERT);
if (err < 0)
goto out;
ext4_ext_drop_refs(path);
@@ -3784,7 +3880,7 @@ int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk)
{
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
ext4_lblk_t lblk_start, lblk_end;
- lblk_start = lblk & (~(sbi->s_cluster_ratio - 1));
+ lblk_start = EXT4_LBLK_CMASK(sbi, lblk);
lblk_end = lblk_start + sbi->s_cluster_ratio - 1;
return ext4_find_delalloc_range(inode, lblk_start, lblk_end);
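EXT4_LBLK_CMASK() and EXT4_LBLK_COFF() replace the open-coded cluster masking above with named helpers that also apply the proper type casts. A minimal standalone sketch of what they compute, assuming the cluster ratio is a power of two (helper names below are illustrative, not the kernel's):

#include <assert.h>

typedef unsigned int ext4_lblk_t;

/* Offset of lblk within its cluster (EXT4_LBLK_COFF analogue). */
static ext4_lblk_t lblk_coff(ext4_lblk_t lblk, unsigned int ratio)
{
	return lblk & (ratio - 1);
}

/* First block of the cluster containing lblk (EXT4_LBLK_CMASK analogue). */
static ext4_lblk_t lblk_cmask(ext4_lblk_t lblk, unsigned int ratio)
{
	return lblk & ~((ext4_lblk_t)(ratio - 1));
}

int main(void)
{
	/* With a cluster ratio of 16, block 37 sits 5 blocks into cluster 32. */
	assert(lblk_coff(37, 16) == 5);
	assert(lblk_cmask(37, 16) == 32);
	return 0;
}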
@@ -3843,9 +3939,9 @@ get_reserved_cluster_alloc(struct inode *inode, ext4_lblk_t lblk_start,
trace_ext4_get_reserved_cluster_alloc(inode, lblk_start, num_blks);
/* Check towards left side */
- c_offset = lblk_start & (sbi->s_cluster_ratio - 1);
+ c_offset = EXT4_LBLK_COFF(sbi, lblk_start);
if (c_offset) {
- lblk_from = lblk_start & (~(sbi->s_cluster_ratio - 1));
+ lblk_from = EXT4_LBLK_CMASK(sbi, lblk_start);
lblk_to = lblk_from + c_offset - 1;
if (ext4_find_delalloc_range(inode, lblk_from, lblk_to))
@@ -3853,7 +3949,7 @@ get_reserved_cluster_alloc(struct inode *inode, ext4_lblk_t lblk_start,
}
/* Now check towards right. */
- c_offset = (lblk_start + num_blks) & (sbi->s_cluster_ratio - 1);
+ c_offset = EXT4_LBLK_COFF(sbi, lblk_start + num_blks);
if (allocated_clusters && c_offset) {
lblk_from = lblk_start + num_blks;
lblk_to = lblk_from + (sbi->s_cluster_ratio - c_offset) - 1;
@@ -3866,7 +3962,39 @@ get_reserved_cluster_alloc(struct inode *inode, ext4_lblk_t lblk_start,
}
static int
-ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
+ext4_ext_convert_initialized_extent(handle_t *handle, struct inode *inode,
+ struct ext4_map_blocks *map,
+ struct ext4_ext_path *path, int flags,
+ unsigned int allocated, ext4_fsblk_t newblock)
+{
+ int ret = 0;
+ int err = 0;
+
+ /*
+ * Make sure that the extent is no bigger than we support with
+ * unwritten extents
+ */
+ if (map->m_len > EXT_UNWRITTEN_MAX_LEN)
+ map->m_len = EXT_UNWRITTEN_MAX_LEN / 2;
+
+ ret = ext4_convert_initialized_extents(handle, inode, map,
+ path);
+ if (ret >= 0) {
+ ext4_update_inode_fsync_trans(handle, inode, 1);
+ err = check_eofblocks_fl(handle, inode, map->m_lblk,
+ path, map->m_len);
+ } else
+ err = ret;
+ map->m_flags |= EXT4_MAP_UNWRITTEN;
+ if (allocated > map->m_len)
+ allocated = map->m_len;
+ map->m_len = allocated;
+
+ return err ? err : allocated;
+}
+
+static int
+ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode,
struct ext4_map_blocks *map,
struct ext4_ext_path *path, int flags,
unsigned int allocated, ext4_fsblk_t newblock)
@@ -3875,25 +4003,25 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
int err = 0;
ext4_io_end_t *io = ext4_inode_aio(inode);
- ext_debug("ext4_ext_handle_uninitialized_extents: inode %lu, logical "
+ ext_debug("ext4_ext_handle_unwritten_extents: inode %lu, logical "
"block %llu, max_blocks %u, flags %x, allocated %u\n",
inode->i_ino, (unsigned long long)map->m_lblk, map->m_len,
flags, allocated);
ext4_ext_show_leaf(inode, path);
/*
- * When writing into uninitialized space, we should not fail to
+ * When writing into unwritten space, we should not fail to
* allocate metadata blocks for the new extent block if needed.
*/
flags |= EXT4_GET_BLOCKS_METADATA_NOFAIL;
- trace_ext4_ext_handle_uninitialized_extents(inode, map, flags,
+ trace_ext4_ext_handle_unwritten_extents(inode, map, flags,
allocated, newblock);
/* get_block() before submit the IO, split the extent */
- if ((flags & EXT4_GET_BLOCKS_PRE_IO)) {
- ret = ext4_split_unwritten_extents(handle, inode, map,
- path, flags);
+ if (flags & EXT4_GET_BLOCKS_PRE_IO) {
+ ret = ext4_split_convert_extents(handle, inode, map,
+ path, flags | EXT4_GET_BLOCKS_CONVERT);
if (ret <= 0)
goto out;
/*
@@ -3906,12 +4034,10 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
else
ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
map->m_flags |= EXT4_MAP_UNWRITTEN;
- if (ext4_should_dioread_nolock(inode))
- map->m_flags |= EXT4_MAP_UNINIT;
goto out;
}
/* IO end_io complete, convert the filled extent to written */
- if ((flags & EXT4_GET_BLOCKS_CONVERT)) {
+ if (flags & EXT4_GET_BLOCKS_CONVERT) {
ret = ext4_convert_unwritten_extents_endio(handle, inode, map,
path);
if (ret >= 0) {
@@ -3921,6 +4047,7 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
} else
err = ret;
map->m_flags |= EXT4_MAP_MAPPED;
+ map->m_pblk = newblock;
if (allocated > map->m_len)
allocated = map->m_len;
map->m_len = allocated;
@@ -3931,7 +4058,7 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
* repeat fallocate creation request
* we already have an unwritten extent
*/
- if (flags & EXT4_GET_BLOCKS_UNINIT_EXT) {
+ if (flags & EXT4_GET_BLOCKS_UNWRIT_EXT) {
map->m_flags |= EXT4_MAP_UNWRITTEN;
goto map_out;
}
@@ -4007,10 +4134,6 @@ out1:
map->m_pblk = newblock;
map->m_len = allocated;
out2:
- if (path) {
- ext4_ext_drop_refs(path);
- kfree(path);
- }
return err ? err : allocated;
}
@@ -4061,7 +4184,7 @@ static int get_implied_cluster_alloc(struct super_block *sb,
struct ext4_ext_path *path)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
- ext4_lblk_t c_offset = map->m_lblk & (sbi->s_cluster_ratio-1);
+ ext4_lblk_t c_offset = EXT4_LBLK_COFF(sbi, map->m_lblk);
ext4_lblk_t ex_cluster_start, ex_cluster_end;
ext4_lblk_t rr_cluster_start;
ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block);
@@ -4079,8 +4202,7 @@ static int get_implied_cluster_alloc(struct super_block *sb,
(rr_cluster_start == ex_cluster_start)) {
if (rr_cluster_start == ex_cluster_end)
ee_start += ee_len - 1;
- map->m_pblk = (ee_start & ~(sbi->s_cluster_ratio - 1)) +
- c_offset;
+ map->m_pblk = EXT4_PBLK_CMASK(sbi, ee_start) + c_offset;
map->m_len = min(map->m_len,
(unsigned) sbi->s_cluster_ratio - c_offset);
/*
@@ -4143,7 +4265,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
struct ext4_extent newex, *ex, *ex2;
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
ext4_fsblk_t newblock = 0;
- int free_on_err = 0, err = 0, depth;
+ int free_on_err = 0, err = 0, depth, ret;
unsigned int allocated = 0, offset = 0;
unsigned int allocated_clusters = 0;
struct ext4_allocation_request ar;
@@ -4185,8 +4307,9 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
ext4_fsblk_t ee_start = ext4_ext_pblock(ex);
unsigned short ee_len;
+
/*
- * Uninitialized extents are treated as holes, except that
+ * Unwritten extents are treated as holes, except that
* we split out initialized portions during a write.
*/
ee_len = ext4_ext_get_actual_len(ex);
@@ -4201,13 +4324,27 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
ext_debug("%u fit into %u:%d -> %llu\n", map->m_lblk,
ee_block, ee_len, newblock);
- if (!ext4_ext_is_uninitialized(ex))
+ /*
+ * If the extent is initialized check whether the
+ * caller wants to convert it to unwritten.
+ */
+ if ((!ext4_ext_is_unwritten(ex)) &&
+ (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN)) {
+ allocated = ext4_ext_convert_initialized_extent(
+ handle, inode, map, path, flags,
+ allocated, newblock);
+ goto out2;
+ } else if (!ext4_ext_is_unwritten(ex))
goto out;
- allocated = ext4_ext_handle_uninitialized_extents(
+ ret = ext4_ext_handle_unwritten_extents(
handle, inode, map, path, flags,
allocated, newblock);
- goto out3;
+ if (ret < 0)
+ err = ret;
+ else
+ allocated = ret;
+ goto out2;
}
}
@@ -4234,7 +4371,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
*/
map->m_flags &= ~EXT4_MAP_FROM_CLUSTER;
newex.ee_block = cpu_to_le32(map->m_lblk);
- cluster_offset = map->m_lblk & (sbi->s_cluster_ratio-1);
+ cluster_offset = EXT4_LBLK_COFF(sbi, map->m_lblk);
/*
* If we are doing bigalloc, check to see if the extent returned
@@ -4272,15 +4409,15 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
/*
* See if request is beyond maximum number of blocks we can have in
* a single extent. For an initialized extent this limit is
- * EXT_INIT_MAX_LEN and for an uninitialized extent this limit is
- * EXT_UNINIT_MAX_LEN.
+ * EXT_INIT_MAX_LEN and for an unwritten extent this limit is
+ * EXT_UNWRITTEN_MAX_LEN.
*/
if (map->m_len > EXT_INIT_MAX_LEN &&
- !(flags & EXT4_GET_BLOCKS_UNINIT_EXT))
+ !(flags & EXT4_GET_BLOCKS_UNWRIT_EXT))
map->m_len = EXT_INIT_MAX_LEN;
- else if (map->m_len > EXT_UNINIT_MAX_LEN &&
- (flags & EXT4_GET_BLOCKS_UNINIT_EXT))
- map->m_len = EXT_UNINIT_MAX_LEN;
+ else if (map->m_len > EXT_UNWRITTEN_MAX_LEN &&
+ (flags & EXT4_GET_BLOCKS_UNWRIT_EXT))
+ map->m_len = EXT_UNWRITTEN_MAX_LEN;
/* Check if we can really insert (m_lblk)::(m_lblk + m_len) extent */
newex.ee_len = cpu_to_le16(map->m_len);
@@ -4302,7 +4439,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
* needed so that future calls to get_implied_cluster_alloc()
* work correctly.
*/
- offset = map->m_lblk & (sbi->s_cluster_ratio - 1);
+ offset = EXT4_LBLK_COFF(sbi, map->m_lblk);
ar.len = EXT4_NUM_B2C(sbi, offset+allocated);
ar.goal -= offset;
ar.logical -= offset;
@@ -4328,21 +4465,19 @@ got_allocated_blocks:
/* try to insert new extent into found leaf and return */
ext4_ext_store_pblock(&newex, newblock + offset);
newex.ee_len = cpu_to_le16(ar.len);
- /* Mark uninitialized */
- if (flags & EXT4_GET_BLOCKS_UNINIT_EXT){
- ext4_ext_mark_uninitialized(&newex);
+ /* Mark unwritten */
+ if (flags & EXT4_GET_BLOCKS_UNWRIT_EXT){
+ ext4_ext_mark_unwritten(&newex);
map->m_flags |= EXT4_MAP_UNWRITTEN;
/*
* io_end structure was created for every IO write to an
- * uninitialized extent. To avoid unnecessary conversion,
+ * unwritten extent. To avoid unnecessary conversion,
* here we flag the IO that really needs the conversion.
* For non asycn direct IO case, flag the inode state
* that we need to perform conversion when IO is done.
*/
- if ((flags & EXT4_GET_BLOCKS_PRE_IO))
+ if (flags & EXT4_GET_BLOCKS_PRE_IO)
set_unwritten = 1;
- if (ext4_should_dioread_nolock(inode))
- map->m_flags |= EXT4_MAP_UNINIT;
}
err = 0;
@@ -4469,9 +4604,9 @@ got_allocated_blocks:
/*
* Cache the extent and update transaction to commit on fdatasync only
- * when it is _not_ an uninitialized extent.
+ * when it is _not_ an unwritten extent.
*/
- if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0)
+ if ((flags & EXT4_GET_BLOCKS_UNWRIT_EXT) == 0)
ext4_update_inode_fsync_trans(handle, inode, 1);
else
ext4_update_inode_fsync_trans(handle, inode, 0);
@@ -4488,7 +4623,6 @@ out2:
kfree(path);
}
-out3:
trace_ext4_ext_map_blocks_exit(inode, flags, map,
err ? err : allocated);
ext4_es_lru_add(inode);
@@ -4529,34 +4663,210 @@ retry:
ext4_std_error(inode->i_sb, err);
}
-static void ext4_falloc_update_inode(struct inode *inode,
- int mode, loff_t new_size, int update_ctime)
+static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset,
+ ext4_lblk_t len, int flags, int mode)
+{
+ struct inode *inode = file_inode(file);
+ handle_t *handle;
+ int ret = 0;
+ int ret2 = 0;
+ int retries = 0;
+ struct ext4_map_blocks map;
+ unsigned int credits;
+
+ map.m_lblk = offset;
+ /*
+ * Don't normalize the request if it can fit in one extent so
+ * that it doesn't get unnecessarily split into multiple
+ * extents.
+ */
+ if (len <= EXT_UNWRITTEN_MAX_LEN)
+ flags |= EXT4_GET_BLOCKS_NO_NORMALIZE;
+
+ /*
+ * credits to insert 1 extent into extent tree
+ */
+ credits = ext4_chunk_trans_blocks(inode, len);
+
+retry:
+ while (ret >= 0 && ret < len) {
+ map.m_lblk = map.m_lblk + ret;
+ map.m_len = len = len - ret;
+ handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS,
+ credits);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ break;
+ }
+ ret = ext4_map_blocks(handle, inode, &map, flags);
+ if (ret <= 0) {
+ ext4_debug("inode #%lu: block %u: len %u: "
+ "ext4_ext_map_blocks returned %d",
+ inode->i_ino, map.m_lblk,
+ map.m_len, ret);
+ ext4_mark_inode_dirty(handle, inode);
+ ret2 = ext4_journal_stop(handle);
+ break;
+ }
+ ret2 = ext4_journal_stop(handle);
+ if (ret2)
+ break;
+ }
+ if (ret == -ENOSPC &&
+ ext4_should_retry_alloc(inode->i_sb, &retries)) {
+ ret = 0;
+ goto retry;
+ }
+
+ return ret > 0 ? ret2 : ret;
+}
+
+static long ext4_zero_range(struct file *file, loff_t offset,
+ loff_t len, int mode)
{
- struct timespec now;
+ struct inode *inode = file_inode(file);
+ handle_t *handle = NULL;
+ unsigned int max_blocks;
+ loff_t new_size = 0;
+ int ret = 0;
+ int flags;
+ int partial;
+ loff_t start, end;
+ ext4_lblk_t lblk;
+ struct address_space *mapping = inode->i_mapping;
+ unsigned int blkbits = inode->i_blkbits;
+
+ trace_ext4_zero_range(inode, offset, len, mode);
+
+ if (!S_ISREG(inode->i_mode))
+ return -EINVAL;
- if (update_ctime) {
- now = current_fs_time(inode->i_sb);
- if (!timespec_equal(&inode->i_ctime, &now))
- inode->i_ctime = now;
+ /* Call ext4_force_commit to flush all data in case of data=journal. */
+ if (ext4_should_journal_data(inode)) {
+ ret = ext4_force_commit(inode->i_sb);
+ if (ret)
+ return ret;
}
+
/*
- * Update only when preallocation was requested beyond
- * the file size.
+ * Write out all dirty pages to avoid race conditions
+ * Then release them.
*/
- if (!(mode & FALLOC_FL_KEEP_SIZE)) {
+ if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
+ ret = filemap_write_and_wait_range(mapping, offset,
+ offset + len - 1);
+ if (ret)
+ return ret;
+ }
+
+ /*
+ * Round up offset. This is not fallocate, we need to zero out
+ * blocks, so convert the interior block-aligned part of the range
+ * to unwritten and possibly zero out the unaligned parts of the
+ * range manually.
+ */
+ start = round_up(offset, 1 << blkbits);
+ end = round_down((offset + len), 1 << blkbits);
+
+ if (start < offset || end > offset + len)
+ return -EINVAL;
+ partial = (offset + len) & ((1 << blkbits) - 1);
+
+ lblk = start >> blkbits;
+ max_blocks = (end >> blkbits);
+ if (max_blocks < lblk)
+ max_blocks = 0;
+ else
+ max_blocks -= lblk;
+
+ flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT |
+ EXT4_GET_BLOCKS_CONVERT_UNWRITTEN;
+ if (mode & FALLOC_FL_KEEP_SIZE)
+ flags |= EXT4_GET_BLOCKS_KEEP_SIZE;
+
+ mutex_lock(&inode->i_mutex);
+
+ /*
+ * Indirect files do not support unwritten extents
+ */
+ if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
+ ret = -EOPNOTSUPP;
+ goto out_mutex;
+ }
+
+ if (!(mode & FALLOC_FL_KEEP_SIZE) &&
+ offset + len > i_size_read(inode)) {
+ new_size = offset + len;
+ ret = inode_newsize_ok(inode, new_size);
+ if (ret)
+ goto out_mutex;
+ /*
+ * If we have a partial block after EOF we have to allocate
+ * the entire block.
+ */
+ if (partial)
+ max_blocks += 1;
+ }
+
+ if (max_blocks > 0) {
+
+ /* Now release the pages and zero the block-aligned parts of the pages */
+ truncate_pagecache_range(inode, start, end - 1);
+
+ /* Wait for all existing dio workers; newcomers will block on i_mutex */
+ ext4_inode_block_unlocked_dio(inode);
+ inode_dio_wait(inode);
+
+ /*
+ * Remove entire range from the extent status tree.
+ */
+ ret = ext4_es_remove_extent(inode, lblk, max_blocks);
+ if (ret)
+ goto out_dio;
+
+ ret = ext4_alloc_file_blocks(file, lblk, max_blocks, flags,
+ mode);
+ if (ret)
+ goto out_dio;
+ }
+
+ handle = ext4_journal_start(inode, EXT4_HT_MISC, 4);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ ext4_std_error(inode->i_sb, ret);
+ goto out_dio;
+ }
+
+ inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
+
+ if (new_size) {
if (new_size > i_size_read(inode))
i_size_write(inode, new_size);
if (new_size > EXT4_I(inode)->i_disksize)
ext4_update_i_disksize(inode, new_size);
} else {
/*
- * Mark that we allocate beyond EOF so the subsequent truncate
- * can proceed even if the new size is the same as i_size.
- */
- if (new_size > i_size_read(inode))
+ * Mark that we allocate beyond EOF so the subsequent truncate
+ * can proceed even if the new size is the same as i_size.
+ */
+ if ((offset + len) > i_size_read(inode))
ext4_set_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
}
+ ext4_mark_inode_dirty(handle, inode);
+
+ /* Zero out partial block at the edges of the range */
+ ret = ext4_zero_partial_blocks(handle, inode, offset, len);
+
+ if (file->f_flags & O_SYNC)
+ ext4_handle_sync(handle);
+
+ ext4_journal_stop(handle);
+out_dio:
+ ext4_inode_resume_unlocked_dio(inode);
+out_mutex:
+ mutex_unlock(&inode->i_mutex);
+ return ret;
}
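For context, userspace reaches ext4_zero_range() through fallocate(2) with FALLOC_FL_ZERO_RANGE. A minimal sketch, assuming a kernel and headers new enough to define the flag; the file name is a placeholder:

#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/falloc.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("testfile", O_RDWR);

	if (fd < 0)
		return 1;
	/* Zero 1 MiB at offset 4096; KEEP_SIZE leaves i_size untouched. */
	if (fallocate(fd, FALLOC_FL_ZERO_RANGE | FALLOC_FL_KEEP_SIZE,
		      4096, 1024 * 1024) < 0)
		perror("fallocate");
	close(fd);
	return 0;
}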
/*
@@ -4570,17 +4880,17 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
{
struct inode *inode = file_inode(file);
handle_t *handle;
- loff_t new_size;
+ loff_t new_size = 0;
unsigned int max_blocks;
int ret = 0;
- int ret2 = 0;
- int retries = 0;
int flags;
- struct ext4_map_blocks map;
- unsigned int credits, blkbits = inode->i_blkbits;
+ ext4_lblk_t lblk;
+ struct timespec tv;
+ unsigned int blkbits = inode->i_blkbits;
/* Return error if mode is not supported */
- if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
+ if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
+ FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE))
return -EOPNOTSUPP;
if (mode & FALLOC_FL_PUNCH_HOLE)
@@ -4597,83 +4907,69 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
return -EOPNOTSUPP;
+ if (mode & FALLOC_FL_COLLAPSE_RANGE)
+ return ext4_collapse_range(inode, offset, len);
+
+ if (mode & FALLOC_FL_ZERO_RANGE)
+ return ext4_zero_range(file, offset, len, mode);
+
trace_ext4_fallocate_enter(inode, offset, len, mode);
- map.m_lblk = offset >> blkbits;
+ lblk = offset >> blkbits;
/*
* We can't just convert len to max_blocks because
* If blocksize = 4096 offset = 3072 and len = 2048
*/
max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits)
- - map.m_lblk;
- /*
- * credits to insert 1 extent into extent tree
- */
- credits = ext4_chunk_trans_blocks(inode, max_blocks);
- mutex_lock(&inode->i_mutex);
- ret = inode_newsize_ok(inode, (len + offset));
- if (ret) {
- mutex_unlock(&inode->i_mutex);
- trace_ext4_fallocate_exit(inode, offset, max_blocks, ret);
- return ret;
- }
- flags = EXT4_GET_BLOCKS_CREATE_UNINIT_EXT;
+ - lblk;
+
+ flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT;
if (mode & FALLOC_FL_KEEP_SIZE)
flags |= EXT4_GET_BLOCKS_KEEP_SIZE;
- /*
- * Don't normalize the request if it can fit in one extent so
- * that it doesn't get unnecessarily split into multiple
- * extents.
- */
- if (len <= EXT_UNINIT_MAX_LEN << blkbits)
- flags |= EXT4_GET_BLOCKS_NO_NORMALIZE;
-retry:
- while (ret >= 0 && ret < max_blocks) {
- map.m_lblk = map.m_lblk + ret;
- map.m_len = max_blocks = max_blocks - ret;
- handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS,
- credits);
- if (IS_ERR(handle)) {
- ret = PTR_ERR(handle);
- break;
- }
- ret = ext4_map_blocks(handle, inode, &map, flags);
- if (ret <= 0) {
-#ifdef EXT4FS_DEBUG
- ext4_warning(inode->i_sb,
- "inode #%lu: block %u: len %u: "
- "ext4_ext_map_blocks returned %d",
- inode->i_ino, map.m_lblk,
- map.m_len, ret);
-#endif
- ext4_mark_inode_dirty(handle, inode);
- ret2 = ext4_journal_stop(handle);
- break;
- }
- if ((map.m_lblk + ret) >= (EXT4_BLOCK_ALIGN(offset + len,
- blkbits) >> blkbits))
- new_size = offset + len;
- else
- new_size = ((loff_t) map.m_lblk + ret) << blkbits;
+ mutex_lock(&inode->i_mutex);
- ext4_falloc_update_inode(inode, mode, new_size,
- (map.m_flags & EXT4_MAP_NEW));
- ext4_mark_inode_dirty(handle, inode);
- if ((file->f_flags & O_SYNC) && ret >= max_blocks)
- ext4_handle_sync(handle);
- ret2 = ext4_journal_stop(handle);
- if (ret2)
- break;
+ if (!(mode & FALLOC_FL_KEEP_SIZE) &&
+ offset + len > i_size_read(inode)) {
+ new_size = offset + len;
+ ret = inode_newsize_ok(inode, new_size);
+ if (ret)
+ goto out;
}
- if (ret == -ENOSPC &&
- ext4_should_retry_alloc(inode->i_sb, &retries)) {
- ret = 0;
- goto retry;
+
+ ret = ext4_alloc_file_blocks(file, lblk, max_blocks, flags, mode);
+ if (ret)
+ goto out;
+
+ handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ goto out;
+ }
+
+ tv = inode->i_ctime = ext4_current_time(inode);
+
+ if (new_size) {
+ if (new_size > i_size_read(inode)) {
+ i_size_write(inode, new_size);
+ inode->i_mtime = tv;
+ }
+ if (new_size > EXT4_I(inode)->i_disksize)
+ ext4_update_i_disksize(inode, new_size);
+ } else {
+ /*
+ * Mark that we allocate beyond EOF so the subsequent truncate
+ * can proceed even if the new size is the same as i_size.
+ */
+ if ((offset + len) > i_size_read(inode))
+ ext4_set_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
}
+ ext4_mark_inode_dirty(handle, inode);
+ if (file->f_flags & O_SYNC)
+ ext4_handle_sync(handle);
+
+ ext4_journal_stop(handle);
+out:
mutex_unlock(&inode->i_mutex);
- trace_ext4_fallocate_exit(inode, offset, max_blocks,
- ret > 0 ? ret2 : ret);
- return ret > 0 ? ret2 : ret;
+ trace_ext4_fallocate_exit(inode, offset, max_blocks, ret);
+ return ret;
}
/*
@@ -4884,3 +5180,333 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
ext4_es_lru_add(inode);
return error;
}
+
+/*
+ * ext4_access_path:
+ * Function to access the path buffer for marking it dirty.
+ * It also checks if there are sufficient credits left in the journal handle
+ * to update path.
+ */
+static int
+ext4_access_path(handle_t *handle, struct inode *inode,
+ struct ext4_ext_path *path)
+{
+ int credits, err;
+
+ if (!ext4_handle_valid(handle))
+ return 0;
+
+ /*
+ * Check if we need to extend journal credits: 3 for the leaf,
+ * sb, and inode, plus 2 (block bitmap and group descriptor)
+ * for each of an assumed two block groups, i.e. 3 + 2 * 2 = 7,
+ * hence the threshold below
+ */
+ if (handle->h_buffer_credits < 7) {
+ credits = ext4_writepage_trans_blocks(inode);
+ err = ext4_ext_truncate_extend_restart(handle, inode, credits);
+ /* EAGAIN is success */
+ if (err && err != -EAGAIN)
+ return err;
+ }
+
+ err = ext4_ext_get_access(handle, inode, path);
+ return err;
+}
+
+/*
+ * ext4_ext_shift_path_extents:
+ * Shift the extents of a path structure lying between path[depth].p_ext
+ * and EXT_LAST_EXTENT(path[depth].p_hdr) downwards, by subtracting shift
+ * from starting block for each extent.
+ */
+static int
+ext4_ext_shift_path_extents(struct ext4_ext_path *path, ext4_lblk_t shift,
+ struct inode *inode, handle_t *handle,
+ ext4_lblk_t *start)
+{
+ int depth, err = 0;
+ struct ext4_extent *ex_start, *ex_last;
+ bool update = false;
+ depth = path->p_depth;
+
+ while (depth >= 0) {
+ if (depth == path->p_depth) {
+ ex_start = path[depth].p_ext;
+ if (!ex_start)
+ return -EIO;
+
+ ex_last = EXT_LAST_EXTENT(path[depth].p_hdr);
+ if (!ex_last)
+ return -EIO;
+
+ err = ext4_access_path(handle, inode, path + depth);
+ if (err)
+ goto out;
+
+ if (ex_start == EXT_FIRST_EXTENT(path[depth].p_hdr))
+ update = 1;
+
+ *start = le32_to_cpu(ex_last->ee_block) +
+ ext4_ext_get_actual_len(ex_last);
+
+ while (ex_start <= ex_last) {
+ le32_add_cpu(&ex_start->ee_block, -shift);
+ /* Try to merge to the left. */
+ if ((ex_start >
+ EXT_FIRST_EXTENT(path[depth].p_hdr)) &&
+ ext4_ext_try_to_merge_right(inode,
+ path, ex_start - 1))
+ ex_last--;
+ else
+ ex_start++;
+ }
+ err = ext4_ext_dirty(handle, inode, path + depth);
+ if (err)
+ goto out;
+
+ if (--depth < 0 || !update)
+ break;
+ }
+
+ /* Update index too */
+ err = ext4_access_path(handle, inode, path + depth);
+ if (err)
+ goto out;
+
+ le32_add_cpu(&path[depth].p_idx->ei_block, -shift);
+ err = ext4_ext_dirty(handle, inode, path + depth);
+ if (err)
+ goto out;
+
+ /* we are done if current index is not a starting index */
+ if (path[depth].p_idx != EXT_FIRST_INDEX(path[depth].p_hdr))
+ break;
+
+ depth--;
+ }
+
+out:
+ return err;
+}
+
+/*
+ * ext4_ext_shift_extents:
+ * All the extents which lie in the range from start to the last allocated
+ * block for the file are shifted downwards by shift blocks.
+ * On success, 0 is returned, error otherwise.
+ */
+static int
+ext4_ext_shift_extents(struct inode *inode, handle_t *handle,
+ ext4_lblk_t start, ext4_lblk_t shift)
+{
+ struct ext4_ext_path *path;
+ int ret = 0, depth;
+ struct ext4_extent *extent;
+ ext4_lblk_t stop_block, current_block;
+ ext4_lblk_t ex_start, ex_end;
+
+ /* Let path point to the last extent */
+ path = ext4_ext_find_extent(inode, EXT_MAX_BLOCKS - 1, NULL, 0);
+ if (IS_ERR(path))
+ return PTR_ERR(path);
+
+ depth = path->p_depth;
+ extent = path[depth].p_ext;
+ if (!extent) {
+ ext4_ext_drop_refs(path);
+ kfree(path);
+ return ret;
+ }
+
+ stop_block = le32_to_cpu(extent->ee_block) +
+ ext4_ext_get_actual_len(extent);
+ ext4_ext_drop_refs(path);
+ kfree(path);
+
+ /* Nothing to shift if the hole is at the end of the file */
+ if (start >= stop_block)
+ return ret;
+
+ /*
+ * Don't start shifting extents until we make sure the hole is big
+ * enough to accommodate the shift.
+ */
+ path = ext4_ext_find_extent(inode, start - 1, NULL, 0);
+ if (IS_ERR(path))
+ return PTR_ERR(path);
+ depth = path->p_depth;
+ extent = path[depth].p_ext;
+ if (extent) {
+ ex_start = le32_to_cpu(extent->ee_block);
+ ex_end = le32_to_cpu(extent->ee_block) +
+ ext4_ext_get_actual_len(extent);
+ } else {
+ ex_start = 0;
+ ex_end = 0;
+ }
+ ext4_ext_drop_refs(path);
+ kfree(path);
+
+ if ((start == ex_start && shift > ex_start) ||
+ (shift > start - ex_end))
+ return -EINVAL;
+
+ /* It's safe to start updating extents */
+ while (start < stop_block) {
+ path = ext4_ext_find_extent(inode, start, NULL, 0);
+ if (IS_ERR(path))
+ return PTR_ERR(path);
+ depth = path->p_depth;
+ extent = path[depth].p_ext;
+ if (!extent) {
+ EXT4_ERROR_INODE(inode, "unexpected hole at %lu",
+ (unsigned long) start);
+ return -EIO;
+ }
+
+ current_block = le32_to_cpu(extent->ee_block);
+ if (start > current_block) {
+ /* Hole, move to the next extent */
+ ret = mext_next_extent(inode, path, &extent);
+ if (ret != 0) {
+ ext4_ext_drop_refs(path);
+ kfree(path);
+ if (ret == 1)
+ ret = 0;
+ break;
+ }
+ }
+ ret = ext4_ext_shift_path_extents(path, shift, inode,
+ handle, &start);
+ ext4_ext_drop_refs(path);
+ kfree(path);
+ if (ret)
+ break;
+ }
+
+ return ret;
+}
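For intuition, a toy model of the shift that ignores journaling, index blocks, and merging (all names hypothetical): every extent starting at or beyond the collapsed range's end moves down by the collapsed length.

#include <stdio.h>

struct toy_extent { unsigned int start, len; };

/* Shift every extent starting at or after 'from' down by 'shift' blocks. */
static void toy_shift(struct toy_extent *ex, int n,
		      unsigned int from, unsigned int shift)
{
	for (int i = 0; i < n; i++)
		if (ex[i].start >= from)
			ex[i].start -= shift;
}

int main(void)
{
	struct toy_extent map[] = { { 0, 8 }, { 12, 4 }, { 20, 6 } };

	/* Collapse blocks [8, 12): extents at 12 and 20 move to 8 and 16. */
	toy_shift(map, 3, 12, 4);
	for (int i = 0; i < 3; i++)
		printf("[%u, +%u)\n", map[i].start, map[i].len);
	return 0;
}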
+
+/*
+ * ext4_collapse_range:
+ * This implements fallocate's FALLOC_FL_COLLAPSE_RANGE functionality
+ * for ext4.
+ * Returns 0 on success, a negative error code on failure.
+ */
+int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
+{
+ struct super_block *sb = inode->i_sb;
+ ext4_lblk_t punch_start, punch_stop;
+ handle_t *handle;
+ unsigned int credits;
+ loff_t new_size, ioffset;
+ int ret;
+
+ /* Collapse range works only on fs block size aligned offsets. */
+ if (offset & (EXT4_BLOCK_SIZE(sb) - 1) ||
+ len & (EXT4_BLOCK_SIZE(sb) - 1))
+ return -EINVAL;
+
+ if (!S_ISREG(inode->i_mode))
+ return -EINVAL;
+
+ if (EXT4_SB(inode->i_sb)->s_cluster_ratio > 1)
+ return -EOPNOTSUPP;
+
+ trace_ext4_collapse_range(inode, offset, len);
+
+ punch_start = offset >> EXT4_BLOCK_SIZE_BITS(sb);
+ punch_stop = (offset + len) >> EXT4_BLOCK_SIZE_BITS(sb);
+
+ /* Call ext4_force_commit to flush all data in case of data=journal. */
+ if (ext4_should_journal_data(inode)) {
+ ret = ext4_force_commit(inode->i_sb);
+ if (ret)
+ return ret;
+ }
+
+ /*
+ * Need to round down offset to be aligned with page size boundary
+ * for page size > block size.
+ */
+ ioffset = round_down(offset, PAGE_SIZE);
+
+ /* Write out all dirty pages */
+ ret = filemap_write_and_wait_range(inode->i_mapping, ioffset,
+ LLONG_MAX);
+ if (ret)
+ return ret;
+
+ /* Take mutex lock */
+ mutex_lock(&inode->i_mutex);
+
+ /*
+ * The collapse range must not overlap or reach EOF; collapsing up to
+ * EOF would effectively be a truncate operation
+ */
+ if (offset + len >= i_size_read(inode)) {
+ ret = -EINVAL;
+ goto out_mutex;
+ }
+
+ /* Currently just for extent based files */
+ if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
+ ret = -EOPNOTSUPP;
+ goto out_mutex;
+ }
+
+ truncate_pagecache(inode, ioffset);
+
+ /* Wait for existing dio to complete */
+ ext4_inode_block_unlocked_dio(inode);
+ inode_dio_wait(inode);
+
+ credits = ext4_writepage_trans_blocks(inode);
+ handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ goto out_dio;
+ }
+
+ down_write(&EXT4_I(inode)->i_data_sem);
+ ext4_discard_preallocations(inode);
+
+ ret = ext4_es_remove_extent(inode, punch_start,
+ EXT_MAX_BLOCKS - punch_start);
+ if (ret) {
+ up_write(&EXT4_I(inode)->i_data_sem);
+ goto out_stop;
+ }
+
+ ret = ext4_ext_remove_space(inode, punch_start, punch_stop - 1);
+ if (ret) {
+ up_write(&EXT4_I(inode)->i_data_sem);
+ goto out_stop;
+ }
+ ext4_discard_preallocations(inode);
+
+ ret = ext4_ext_shift_extents(inode, handle, punch_stop,
+ punch_stop - punch_start);
+ if (ret) {
+ up_write(&EXT4_I(inode)->i_data_sem);
+ goto out_stop;
+ }
+
+ new_size = i_size_read(inode) - len;
+ i_size_write(inode, new_size);
+ EXT4_I(inode)->i_disksize = new_size;
+
+ up_write(&EXT4_I(inode)->i_data_sem);
+ if (IS_SYNC(inode))
+ ext4_handle_sync(handle);
+ inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
+ ext4_mark_inode_dirty(handle, inode);
+
+out_stop:
+ ext4_journal_stop(handle);
+out_dio:
+ ext4_inode_resume_unlocked_dio(inode);
+out_mutex:
+ mutex_unlock(&inode->i_mutex);
+ return ret;
+}
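The userspace entry point for ext4_collapse_range() is fallocate(2) with FALLOC_FL_COLLAPSE_RANGE. A minimal sketch mirroring the zero-range example earlier; per the alignment check above, offset and len must be multiples of the filesystem block size and the range must end before EOF, or ext4 returns -EINVAL:

#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/falloc.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("testfile", O_RDWR);

	if (fd < 0)
		return 1;
	/* Remove one 4 KiB block at offset 8192; later data shifts down. */
	if (fallocate(fd, FALLOC_FL_COLLAPSE_RANGE, 8192, 4096) < 0)
		perror("fallocate");
	close(fd);
	return 0;
}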
diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
index 3981ff78395..0b7e28e7eaa 100644
--- a/fs/ext4/extents_status.c
+++ b/fs/ext4/extents_status.c
@@ -184,7 +184,7 @@ static void ext4_es_print_tree(struct inode *inode)
while (node) {
struct extent_status *es;
es = rb_entry(node, struct extent_status, rb_node);
- printk(KERN_DEBUG " [%u/%u) %llu %llx",
+ printk(KERN_DEBUG " [%u/%u) %llu %x",
es->es_lblk, es->es_len,
ext4_es_pblock(es), ext4_es_status(es));
node = rb_next(node);
@@ -344,8 +344,14 @@ static int ext4_es_can_be_merged(struct extent_status *es1,
if (ext4_es_status(es1) != ext4_es_status(es2))
return 0;
- if (((__u64) es1->es_len) + es2->es_len > 0xFFFFFFFFULL)
+ if (((__u64) es1->es_len) + es2->es_len > EXT_MAX_BLOCKS) {
+ pr_warn("ES assertion failed when merging extents. "
+ "The sum of lengths of es1 (%d) and es2 (%d) "
+ "is bigger than allowed file size (%d)\n",
+ es1->es_len, es2->es_len, EXT_MAX_BLOCKS);
+ WARN_ON(1);
return 0;
+ }
if (((__u64) es1->es_lblk) + es1->es_len != es2->es_lblk)
return 0;
@@ -433,7 +439,7 @@ static void ext4_es_insert_extent_ext_check(struct inode *inode,
ee_start = ext4_ext_pblock(ex);
ee_len = ext4_ext_get_actual_len(ex);
- ee_status = ext4_ext_is_uninitialized(ex) ? 1 : 0;
+ ee_status = ext4_ext_is_unwritten(ex) ? 1 : 0;
es_status = ext4_es_is_unwritten(es) ? 1 : 0;
/*
@@ -445,8 +451,8 @@ static void ext4_es_insert_extent_ext_check(struct inode *inode,
pr_warn("ES insert assertion failed for "
"inode: %lu we can find an extent "
"at block [%d/%d/%llu/%c], but we "
- "want to add an delayed/hole extent "
- "[%d/%d/%llu/%llx]\n",
+ "want to add a delayed/hole extent "
+ "[%d/%d/%llu/%x]\n",
inode->i_ino, ee_block, ee_len,
ee_start, ee_status ? 'u' : 'w',
es->es_lblk, es->es_len,
@@ -486,8 +492,8 @@ static void ext4_es_insert_extent_ext_check(struct inode *inode,
if (!ext4_es_is_delayed(es) && !ext4_es_is_hole(es)) {
pr_warn("ES insert assertion failed for inode: %lu "
"can't find an extent at block %d but we want "
- "to add an written/unwritten extent "
- "[%d/%d/%llu/%llx]\n", inode->i_ino,
+ "to add a written/unwritten extent "
+ "[%d/%d/%llu/%x]\n", inode->i_ino,
es->es_lblk, es->es_lblk, es->es_len,
ext4_es_pblock(es), ext4_es_status(es));
}
@@ -524,7 +530,7 @@ static void ext4_es_insert_extent_ind_check(struct inode *inode,
*/
pr_warn("ES insert assertion failed for inode: %lu "
"We can find blocks but we want to add a "
- "delayed/hole extent [%d/%d/%llu/%llx]\n",
+ "delayed/hole extent [%d/%d/%llu/%x]\n",
inode->i_ino, es->es_lblk, es->es_len,
ext4_es_pblock(es), ext4_es_status(es));
return;
@@ -554,7 +560,7 @@ static void ext4_es_insert_extent_ind_check(struct inode *inode,
if (ext4_es_is_written(es)) {
pr_warn("ES insert assertion failed for inode: %lu "
"We can't find the block but we want to add "
- "an written extent [%d/%d/%llu/%llx]\n",
+ "a written extent [%d/%d/%llu/%x]\n",
inode->i_ino, es->es_lblk, es->es_len,
ext4_es_pblock(es), ext4_es_status(es));
return;
@@ -658,8 +664,7 @@ int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
newes.es_lblk = lblk;
newes.es_len = len;
- ext4_es_store_pblock(&newes, pblk);
- ext4_es_store_status(&newes, status);
+ ext4_es_store_pblock_status(&newes, pblk, status);
trace_ext4_es_insert_extent(inode, &newes);
ext4_es_insert_extent_check(inode, &newes);
@@ -699,8 +704,7 @@ void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk,
newes.es_lblk = lblk;
newes.es_len = len;
- ext4_es_store_pblock(&newes, pblk);
- ext4_es_store_status(&newes, status);
+ ext4_es_store_pblock_status(&newes, pblk, status);
trace_ext4_es_cache_extent(inode, &newes);
if (!len)
@@ -812,13 +816,13 @@ retry:
newes.es_lblk = end + 1;
newes.es_len = len2;
+ block = 0x7FDEADBEEFULL;
if (ext4_es_is_written(&orig_es) ||
- ext4_es_is_unwritten(&orig_es)) {
+ ext4_es_is_unwritten(&orig_es))
block = ext4_es_pblock(&orig_es) +
orig_es.es_len - len2;
- ext4_es_store_pblock(&newes, block);
- }
- ext4_es_store_status(&newes, ext4_es_status(&orig_es));
+ ext4_es_store_pblock_status(&newes, block,
+ ext4_es_status(&orig_es));
err = __es_insert_extent(inode, &newes);
if (err) {
es->es_lblk = orig_es.es_lblk;
@@ -962,10 +966,10 @@ retry:
continue;
}
- if (ei->i_es_lru_nr == 0 || ei == locked_ei)
+ if (ei->i_es_lru_nr == 0 || ei == locked_ei ||
+ !write_trylock(&ei->i_es_lock))
continue;
- write_lock(&ei->i_es_lock);
shrunk = __es_try_to_reclaim_extents(ei, nr_to_scan);
if (ei->i_es_lru_nr == 0)
list_del_init(&ei->i_es_lru);
diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h
index 167f4ab8ecc..f1b62a41992 100644
--- a/fs/ext4/extents_status.h
+++ b/fs/ext4/extents_status.h
@@ -129,6 +129,15 @@ static inline void ext4_es_store_status(struct extent_status *es,
(es->es_pblk & ~ES_MASK));
}
+static inline void ext4_es_store_pblock_status(struct extent_status *es,
+ ext4_fsblk_t pb,
+ unsigned int status)
+{
+ es->es_pblk = (((ext4_fsblk_t)
+ (status & EXTENT_STATUS_FLAGS) << ES_SHIFT) |
+ (pb & ~ES_MASK));
+}
+
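The new ext4_es_store_pblock_status() collapses the two older store helpers into a single assignment, so the pblock and status bits of es_pblk are never written in two separate read-modify-write steps. A freestanding sketch of the packing, with an illustrative shift rather than ext4's real ES_SHIFT/ES_MASK values:

#include <stdint.h>

#define TOY_ES_SHIFT 60
#define TOY_ES_MASK  (~0ULL << TOY_ES_SHIFT)

/* High bits carry the status flags, low bits the physical block number. */
static inline uint64_t toy_store_pblock_status(uint64_t pblk, unsigned status)
{
	return ((uint64_t)status << TOY_ES_SHIFT) | (pblk & ~TOY_ES_MASK);
}

static inline unsigned toy_status(uint64_t es_pblk)
{
	return es_pblk >> TOY_ES_SHIFT;
}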
extern void ext4_es_register_shrinker(struct ext4_sb_info *sbi);
extern void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi);
extern void ext4_es_lru_add(struct inode *inode);
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 3da21945ff1..8695f70af1e 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -57,7 +57,7 @@ static int ext4_release_file(struct inode *inode, struct file *filp)
return 0;
}
-void ext4_unwritten_wait(struct inode *inode)
+static void ext4_unwritten_wait(struct inode *inode)
{
wait_queue_head_t *wq = ext4_ioend_wq(inode);
@@ -74,132 +74,126 @@ void ext4_unwritten_wait(struct inode *inode)
* or one thread will zero the other's data, causing corruption.
*/
static int
-ext4_unaligned_aio(struct inode *inode, const struct iovec *iov,
- unsigned long nr_segs, loff_t pos)
+ext4_unaligned_aio(struct inode *inode, struct iov_iter *from, loff_t pos)
{
struct super_block *sb = inode->i_sb;
int blockmask = sb->s_blocksize - 1;
- size_t count = iov_length(iov, nr_segs);
- loff_t final_size = pos + count;
- if (pos >= inode->i_size)
+ if (pos >= i_size_read(inode))
return 0;
- if ((pos & blockmask) || (final_size & blockmask))
+ if ((pos | iov_iter_alignment(from)) & blockmask)
return 1;
return 0;
}
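The rewritten alignment test above folds the position and iovec checks into one expression: iov_iter_alignment() ORs together every segment's base address and length, so a single AND against the block mask catches any misalignment. A standalone illustration (helper and variable names hypothetical):

#include <assert.h>

/* 'align' stands in for iov_iter_alignment(): the OR of all segment
 * base addresses and lengths. */
static int is_unaligned(unsigned long long pos, unsigned long long align,
			unsigned int blocksize)
{
	return ((pos | align) & (blocksize - 1)) != 0;
}

int main(void)
{
	assert(!is_unaligned(8192, 4096, 4096)); /* fully aligned */
	assert(is_unaligned(8192, 512, 4096));   /* misaligned iovec */
	assert(is_unaligned(100, 0, 4096));      /* misaligned position */
	return 0;
}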
static ssize_t
-ext4_file_dio_write(struct kiocb *iocb, const struct iovec *iov,
- unsigned long nr_segs, loff_t pos)
+ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
struct file *file = iocb->ki_filp;
- struct inode *inode = file->f_mapping->host;
+ struct inode *inode = file_inode(iocb->ki_filp);
+ struct mutex *aio_mutex = NULL;
struct blk_plug plug;
- int unaligned_aio = 0;
- ssize_t ret;
+ int o_direct = file->f_flags & O_DIRECT;
int overwrite = 0;
- size_t length = iov_length(iov, nr_segs);
-
- if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) &&
- !is_sync_kiocb(iocb))
- unaligned_aio = ext4_unaligned_aio(inode, iov, nr_segs, pos);
+ size_t length = iov_iter_count(from);
+ ssize_t ret;
+ loff_t pos = iocb->ki_pos;
- /* Unaligned direct AIO must be serialized; see comment above */
- if (unaligned_aio) {
- mutex_lock(ext4_aio_mutex(inode));
+ /*
+ * Unaligned direct AIO must be serialized; see the comment above.
+ * In the case of O_APPEND, assume that we must always serialize.
+ */
+ if (o_direct &&
+ ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) &&
+ !is_sync_kiocb(iocb) &&
+ (file->f_flags & O_APPEND ||
+ ext4_unaligned_aio(inode, from, pos))) {
+ aio_mutex = ext4_aio_mutex(inode);
+ mutex_lock(aio_mutex);
ext4_unwritten_wait(inode);
}
- BUG_ON(iocb->ki_pos != pos);
-
mutex_lock(&inode->i_mutex);
- blk_start_plug(&plug);
-
- iocb->private = &overwrite;
+ if (file->f_flags & O_APPEND)
+ iocb->ki_pos = pos = i_size_read(inode);
- /* check whether we do a DIO overwrite or not */
- if (ext4_should_dioread_nolock(inode) && !unaligned_aio &&
- !file->f_mapping->nrpages && pos + length <= i_size_read(inode)) {
- struct ext4_map_blocks map;
- unsigned int blkbits = inode->i_blkbits;
- int err, len;
+ /*
+ * If we have encountered a bitmap-format file, the size limit
+ * is smaller than s_maxbytes, which is for extent-mapped files.
+ */
+ if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
- map.m_lblk = pos >> blkbits;
- map.m_len = (EXT4_BLOCK_ALIGN(pos + length, blkbits) >> blkbits)
- - map.m_lblk;
- len = map.m_len;
+ if ((pos > sbi->s_bitmap_maxbytes) ||
+ (pos == sbi->s_bitmap_maxbytes && length > 0)) {
+ mutex_unlock(&inode->i_mutex);
+ ret = -EFBIG;
+ goto errout;
+ }
- err = ext4_map_blocks(NULL, inode, &map, 0);
- /*
- * 'err==len' means that all of blocks has been preallocated no
- * matter they are initialized or not. For excluding
- * uninitialized extents, we need to check m_flags. There are
- * two conditions that indicate for initialized extents.
- * 1) If we hit extent cache, EXT4_MAP_MAPPED flag is returned;
- * 2) If we do a real lookup, non-flags are returned.
- * So we should check these two conditions.
- */
- if (err == len && (map.m_flags & EXT4_MAP_MAPPED))
- overwrite = 1;
+ if (pos + length > sbi->s_bitmap_maxbytes)
+ iov_iter_truncate(from, sbi->s_bitmap_maxbytes - pos);
}
- ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
- mutex_unlock(&inode->i_mutex);
-
- if (ret > 0) {
- ssize_t err;
-
- err = generic_write_sync(file, pos, ret);
- if (err < 0 && ret > 0)
- ret = err;
- }
- blk_finish_plug(&plug);
+ if (o_direct) {
+ blk_start_plug(&plug);
- if (unaligned_aio)
- mutex_unlock(ext4_aio_mutex(inode));
+ iocb->private = &overwrite;
- return ret;
-}
+ /* check whether we do a DIO overwrite or not */
+ if (ext4_should_dioread_nolock(inode) && !aio_mutex &&
+ !file->f_mapping->nrpages && pos + length <= i_size_read(inode)) {
+ struct ext4_map_blocks map;
+ unsigned int blkbits = inode->i_blkbits;
+ int err, len;
-static ssize_t
-ext4_file_write(struct kiocb *iocb, const struct iovec *iov,
- unsigned long nr_segs, loff_t pos)
-{
- struct inode *inode = file_inode(iocb->ki_filp);
- ssize_t ret;
+ map.m_lblk = pos >> blkbits;
+ map.m_len = (EXT4_BLOCK_ALIGN(pos + length, blkbits) >> blkbits)
+ - map.m_lblk;
+ len = map.m_len;
- /*
- * If we have encountered a bitmap-format file, the size limit
- * is smaller than s_maxbytes, which is for extent-mapped files.
- */
+ err = ext4_map_blocks(NULL, inode, &map, 0);
+ /*
+ * 'err==len' means that all of the blocks have
+ * been preallocated, no matter whether they are
+ * initialized or not. To exclude unwritten
+ * extents, we need to check m_flags. There are
+ * two conditions that indicate initialized
+ * extents: 1) if we hit the extent cache, the
+ * EXT4_MAP_MAPPED flag is returned; 2) if we do
+ * a real lookup, no flags are returned. So we
+ * should check these two conditions.
+ */
+ if (err == len && (map.m_flags & EXT4_MAP_MAPPED))
+ overwrite = 1;
+ }
+ }
- if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
- struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
- size_t length = iov_length(iov, nr_segs);
+ ret = __generic_file_write_iter(iocb, from);
+ mutex_unlock(&inode->i_mutex);
- if ((pos > sbi->s_bitmap_maxbytes ||
- (pos == sbi->s_bitmap_maxbytes && length > 0)))
- return -EFBIG;
+ if (ret > 0) {
+ ssize_t err;
- if (pos + length > sbi->s_bitmap_maxbytes) {
- nr_segs = iov_shorten((struct iovec *)iov, nr_segs,
- sbi->s_bitmap_maxbytes - pos);
- }
+ err = generic_write_sync(file, iocb->ki_pos - ret, ret);
+ if (err < 0)
+ ret = err;
}
+ if (o_direct)
+ blk_finish_plug(&plug);
- if (unlikely(iocb->ki_filp->f_flags & O_DIRECT))
- ret = ext4_file_dio_write(iocb, iov, nr_segs, pos);
- else
- ret = generic_file_aio_write(iocb, iov, nr_segs, pos);
-
+errout:
+ if (aio_mutex)
+ mutex_unlock(aio_mutex);
return ret;
}
static const struct vm_operations_struct ext4_file_vm_ops = {
.fault = filemap_fault,
+ .map_pages = filemap_map_pages,
.page_mkwrite = ext4_page_mkwrite,
.remap_pages = generic_file_remap_pages,
};
@@ -243,6 +237,7 @@ static int ext4_file_open(struct inode * inode, struct file * filp)
handle = ext4_journal_start_sb(sb, EXT4_HT_MISC, 1);
if (IS_ERR(handle))
return PTR_ERR(handle);
+ BUFFER_TRACE(sbi->s_sbh, "get_write_access");
err = ext4_journal_get_write_access(handle, sbi->s_sbh);
if (err) {
ext4_journal_stop(handle);
@@ -592,10 +587,10 @@ loff_t ext4_llseek(struct file *file, loff_t offset, int whence)
const struct file_operations ext4_file_operations = {
.llseek = ext4_llseek,
- .read = do_sync_read,
- .write = do_sync_write,
- .aio_read = generic_file_aio_read,
- .aio_write = ext4_file_write,
+ .read = new_sync_read,
+ .write = new_sync_write,
+ .read_iter = generic_file_read_iter,
+ .write_iter = ext4_file_write_iter,
.unlocked_ioctl = ext4_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = ext4_compat_ioctl,
@@ -605,7 +600,7 @@ const struct file_operations ext4_file_operations = {
.release = ext4_release_file,
.fsync = ext4_sync_file,
.splice_read = generic_file_splice_read,
- .splice_write = generic_file_splice_write,
+ .splice_write = iter_file_splice_write,
.fallocate = ext4_fallocate,
};
@@ -617,6 +612,7 @@ const struct inode_operations ext4_file_inode_operations = {
.listxattr = ext4_listxattr,
.removexattr = generic_removexattr,
.get_acl = ext4_get_acl,
+ .set_acl = ext4_set_acl,
.fiemap = ext4_fiemap,
};
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 137193ff389..5b87fc36aab 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -71,6 +71,7 @@ static unsigned ext4_init_inode_bitmap(struct super_block *sb,
struct ext4_group_desc *gdp)
{
struct ext4_group_info *grp;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
J_ASSERT_BH(bh, buffer_locked(bh));
/* If checksum is bad mark all blocks and inodes use to prevent
@@ -78,7 +79,16 @@ static unsigned ext4_init_inode_bitmap(struct super_block *sb,
if (!ext4_group_desc_csum_verify(sb, block_group, gdp)) {
ext4_error(sb, "Checksum bad for group %u", block_group);
grp = ext4_get_group_info(sb, block_group);
+ if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp))
+ percpu_counter_sub(&sbi->s_freeclusters_counter,
+ grp->bb_free);
set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state);
+ if (!EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) {
+ int count;
+ count = ext4_free_inodes_count(sb, gdp);
+ percpu_counter_sub(&sbi->s_freeinodes_counter,
+ count);
+ }
set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state);
return 0;
}
@@ -116,6 +126,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
struct buffer_head *bh = NULL;
ext4_fsblk_t bitmap_blk;
struct ext4_group_info *grp;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
desc = ext4_get_group_desc(sb, block_group, NULL);
if (!desc)
@@ -185,6 +196,12 @@ verify:
ext4_error(sb, "Corrupt inode bitmap - block_group = %u, "
"inode_bitmap = %llu", block_group, bitmap_blk);
grp = ext4_get_group_info(sb, block_group);
+ if (!EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) {
+ int count;
+ count = ext4_free_inodes_count(sb, desc);
+ percpu_counter_sub(&sbi->s_freeinodes_counter,
+ count);
+ }
set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state);
return NULL;
}
@@ -321,6 +338,12 @@ out:
fatal = err;
} else {
ext4_error(sb, "bit already cleared for inode %lu", ino);
+ if (gdp && !EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) {
+ int count;
+ count = ext4_free_inodes_count(sb, gdp);
+ percpu_counter_sub(&sbi->s_freeinodes_counter,
+ count);
+ }
set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state);
}
@@ -432,7 +455,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
ext4fs_dirhash(qstr->name, qstr->len, &hinfo);
grp = hinfo.hash;
} else
- get_random_bytes(&grp, sizeof(grp));
+ grp = prandom_u32();
parent_group = (unsigned)grp % ngroups;
for (i = 0; i < ngroups; i++) {
g = (parent_group + i) % ngroups;
@@ -851,6 +874,13 @@ got:
goto out;
}
+ BUFFER_TRACE(group_desc_bh, "get_write_access");
+ err = ext4_journal_get_write_access(handle, group_desc_bh);
+ if (err) {
+ ext4_std_error(sb, err);
+ goto out;
+ }
+
/* We may have to initialize the block bitmap if it isn't already */
if (ext4_has_group_desc_csum(sb) &&
gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
@@ -887,13 +917,6 @@ got:
}
}
- BUFFER_TRACE(group_desc_bh, "get_write_access");
- err = ext4_journal_get_write_access(handle, group_desc_bh);
- if (err) {
- ext4_std_error(sb, err);
- goto out;
- }
-
/* Update the relevant bg descriptor fields */
if (ext4_has_group_desc_csum(sb)) {
int free;
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index 594009f5f52..fd69da19482 100644
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -389,7 +389,13 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
return 0;
failed:
for (; i >= 0; i--) {
- if (i != indirect_blks && branch[i].bh)
+ /*
+ * We want to ext4_forget() only freshly allocated indirect
+ * blocks. Buffer for new_blocks[i-1] is at branch[i].bh and
+ * buffer at branch[0].bh is indirect block / inode already
+ * existing before ext4_alloc_branch() was called.
+ */
+ if (i > 0 && i != indirect_blks && branch[i].bh)
ext4_forget(handle, 1, inode, branch[i].bh,
branch[i].bh->b_blocknr);
ext4_free_blocks(handle, inode, NULL, new_blocks[i],
@@ -639,8 +645,7 @@ out:
* VFS code falls back into buffered path in that case so we are safe.
*/
ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,
- const struct iovec *iov, loff_t offset,
- unsigned long nr_segs)
+ struct iov_iter *iter, loff_t offset)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file->f_mapping->host;
@@ -648,7 +653,7 @@ ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,
handle_t *handle;
ssize_t ret;
int orphan = 0;
- size_t count = iov_length(iov, nr_segs);
+ size_t count = iov_iter_count(iter);
int retries = 0;
if (rw == WRITE) {
@@ -687,18 +692,17 @@ retry:
goto locked;
}
ret = __blockdev_direct_IO(rw, iocb, inode,
- inode->i_sb->s_bdev, iov,
- offset, nr_segs,
+ inode->i_sb->s_bdev, iter, offset,
ext4_get_block, NULL, NULL, 0);
inode_dio_done(inode);
} else {
locked:
- ret = blockdev_direct_IO(rw, iocb, inode, iov,
- offset, nr_segs, ext4_get_block);
+ ret = blockdev_direct_IO(rw, iocb, inode, iter,
+ offset, ext4_get_block);
if (unlikely((rw & WRITE) && ret < 0)) {
loff_t isize = i_size_read(inode);
- loff_t end = offset + iov_length(iov, nr_segs);
+ loff_t end = offset + count;
if (end > isize)
ext4_truncate_failed_write(inode);
@@ -1312,16 +1316,24 @@ static int free_hole_blocks(handle_t *handle, struct inode *inode,
blk = *i_data;
if (level > 0) {
ext4_lblk_t first2;
+ ext4_lblk_t count2;
+
bh = sb_bread(inode->i_sb, le32_to_cpu(blk));
if (!bh) {
EXT4_ERROR_INODE_BLOCK(inode, le32_to_cpu(blk),
"Read failure");
return -EIO;
}
- first2 = (first > offset) ? first - offset : 0;
+ if (first > offset) {
+ first2 = first - offset;
+ count2 = count;
+ } else {
+ first2 = 0;
+ count2 = count - (offset - first);
+ }
ret = free_hole_blocks(handle, inode, bh,
(__le32 *)bh->b_data, level - 1,
- first2, count - offset,
+ first2, count2,
inode->i_sb->s_blocksize >> 2);
if (ret) {
brelse(bh);
@@ -1331,8 +1343,8 @@ static int free_hole_blocks(handle_t *handle, struct inode *inode,
if (level == 0 ||
(bh && all_zeroes((__le32 *)bh->b_data,
(__le32 *)bh->b_data + addr_per_block))) {
- ext4_free_data(handle, inode, parent_bh, &blk, &blk+1);
- *i_data = 0;
+ ext4_free_data(handle, inode, parent_bh,
+ i_data, i_data + 1);
}
brelse(bh);
bh = NULL;
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index d9ecbf1113a..645205d8ada 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -22,7 +22,7 @@
#define EXT4_INLINE_DOTDOT_OFFSET 2
#define EXT4_INLINE_DOTDOT_SIZE 4
-int ext4_get_inline_size(struct inode *inode)
+static int ext4_get_inline_size(struct inode *inode)
{
if (EXT4_I(inode)->i_inline_off)
return EXT4_I(inode)->i_inline_size;
@@ -211,8 +211,8 @@ out:
* value since it is already handled by ext4_xattr_ibody_inline_set.
* That saves us one memcpy.
*/
-void ext4_write_inline_data(struct inode *inode, struct ext4_iloc *iloc,
- void *buffer, loff_t pos, unsigned int len)
+static void ext4_write_inline_data(struct inode *inode, struct ext4_iloc *iloc,
+ void *buffer, loff_t pos, unsigned int len)
{
struct ext4_xattr_entry *entry;
struct ext4_xattr_ibody_header *header;
@@ -264,6 +264,7 @@ static int ext4_create_inline_data(handle_t *handle,
if (error)
return error;
+ BUFFER_TRACE(is.iloc.bh, "get_write_access");
error = ext4_journal_get_write_access(handle, is.iloc.bh);
if (error)
goto out;
@@ -347,6 +348,7 @@ static int ext4_update_inline_data(handle_t *handle, struct inode *inode,
if (error == -ENODATA)
goto out;
+ BUFFER_TRACE(is.iloc.bh, "get_write_access");
error = ext4_journal_get_write_access(handle, is.iloc.bh);
if (error)
goto out;
@@ -373,8 +375,8 @@ out:
return error;
}
-int ext4_prepare_inline_data(handle_t *handle, struct inode *inode,
- unsigned int len)
+static int ext4_prepare_inline_data(handle_t *handle, struct inode *inode,
+ unsigned int len)
{
int ret, size;
struct ext4_inode_info *ei = EXT4_I(inode);
@@ -424,6 +426,7 @@ static int ext4_destroy_inline_data_nolock(handle_t *handle,
if (error)
goto out;
+ BUFFER_TRACE(is.iloc.bh, "get_write_access");
error = ext4_journal_get_write_access(handle, is.iloc.bh);
if (error)
goto out;
@@ -849,15 +852,16 @@ int ext4_da_write_inline_data_begin(struct address_space *mapping,
handle_t *handle;
struct page *page;
struct ext4_iloc iloc;
+ int retries;
ret = ext4_get_inode_loc(inode, &iloc);
if (ret)
return ret;
+retry_journal:
handle = ext4_journal_start(inode, EXT4_HT_INODE, 1);
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
- handle = NULL;
goto out;
}
@@ -867,7 +871,7 @@ int ext4_da_write_inline_data_begin(struct address_space *mapping,
if (inline_size >= pos + len) {
ret = ext4_prepare_inline_data(handle, inode, pos + len);
if (ret && ret != -ENOSPC)
- goto out;
+ goto out_journal;
}
if (ret == -ENOSPC) {
@@ -875,6 +879,10 @@ int ext4_da_write_inline_data_begin(struct address_space *mapping,
inode,
flags,
fsdata);
+ ext4_journal_stop(handle);
+ if (ret == -ENOSPC &&
+ ext4_should_retry_alloc(inode->i_sb, &retries))
+ goto retry_journal;
goto out;
}
@@ -887,7 +895,7 @@ int ext4_da_write_inline_data_begin(struct address_space *mapping,
page = grab_cache_page_write_begin(mapping, 0, flags);
if (!page) {
ret = -ENOMEM;
- goto out;
+ goto out_journal;
}
down_read(&EXT4_I(inode)->xattr_sem);
@@ -904,16 +912,15 @@ int ext4_da_write_inline_data_begin(struct address_space *mapping,
up_read(&EXT4_I(inode)->xattr_sem);
*pagep = page;
- handle = NULL;
brelse(iloc.bh);
return 1;
out_release_page:
up_read(&EXT4_I(inode)->xattr_sem);
unlock_page(page);
page_cache_release(page);
+out_journal:
+ ext4_journal_stop(handle);
out:
- if (handle)
- ext4_journal_stop(handle);
brelse(iloc.bh);
return ret;
}
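The retry_journal label turns the ENOSPC path into the standard ext4 retry loop: stop the handle, ask whether a pending journal commit might free space, and restart from the handle allocation. A hedged userspace model of that control flow; the helpers here are stand-ins, not the real ext4 functions:

#include <stdio.h>
#include <errno.h>

#define MAX_RETRIES 3			/* stand-in retry policy */

/* Stand-in allocator: succeeds on the third attempt. */
static int do_allocation(int attempt)
{
	return attempt < 2 ? -ENOSPC : 0;
}

/* Stand-in for ext4_should_retry_alloc(). */
static int should_retry_alloc(int *retries)
{
	return (*retries)++ < MAX_RETRIES;
}

int main(void)
{
	int retries = 0, attempt = 0, ret;

retry_journal:
	/* ext4_journal_start() would go here */
	ret = do_allocation(attempt++);
	if (ret == -ENOSPC && should_retry_alloc(&retries)) {
		/* ext4_journal_stop() before looping back */
		goto retry_journal;
	}
	printf("ret=%d after %d attempts\n", ret, attempt);
	return 0;
}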
@@ -994,17 +1001,16 @@ static int ext4_add_dirent_to_inline(handle_t *handle,
struct inode *dir = dentry->d_parent->d_inode;
const char *name = dentry->d_name.name;
int namelen = dentry->d_name.len;
- unsigned short reclen;
int err;
struct ext4_dir_entry_2 *de;
- reclen = EXT4_DIR_REC_LEN(namelen);
err = ext4_find_dest_de(dir, inode, iloc->bh,
inline_start, inline_size,
name, namelen, &de);
if (err)
return err;
+ BUFFER_TRACE(iloc->bh, "get_write_access");
err = ext4_journal_get_write_access(handle, iloc->bh);
if (err)
return err;
@@ -1442,6 +1448,7 @@ int ext4_read_inline_dir(struct file *file,
if (ret < 0)
goto out;
+ ret = 0;
sb = inode->i_sb;
parent_ino = le32_to_cpu(((struct ext4_dir_entry_2 *)dir_buf)->inode);
offset = ctx->pos;
@@ -1666,6 +1673,7 @@ int ext4_delete_inline_entry(handle_t *handle,
EXT4_MIN_INLINE_DATA_SIZE;
}
+ BUFFER_TRACE(bh, "get_write_access");
err = ext4_journal_get_write_access(handle, bh);
if (err)
goto out;
@@ -1838,7 +1846,6 @@ int ext4_try_to_evict_inline_data(handle_t *handle,
{
int error;
struct ext4_xattr_entry *entry;
- struct ext4_xattr_ibody_header *header;
struct ext4_inode *raw_inode;
struct ext4_iloc iloc;
@@ -1847,7 +1854,6 @@ int ext4_try_to_evict_inline_data(handle_t *handle,
return error;
raw_inode = ext4_raw_inode(&iloc);
- header = IHDR(inode, raw_inode);
entry = (struct ext4_xattr_entry *)((void *)raw_inode +
EXT4_I(inode)->i_inline_off);
if (EXT4_XATTR_LEN(entry->e_name_len) +
@@ -1925,9 +1931,11 @@ void ext4_inline_data_truncate(struct inode *inode, int *has_inline)
}
/* Clear the content within i_blocks. */
- if (i_size < EXT4_MIN_INLINE_DATA_SIZE)
- memset(ext4_raw_inode(&is.iloc)->i_block + i_size, 0,
- EXT4_MIN_INLINE_DATA_SIZE - i_size);
+ if (i_size < EXT4_MIN_INLINE_DATA_SIZE) {
+ void *p = (void *) ext4_raw_inode(&is.iloc)->i_block;
+ memset(p + i_size, 0,
+ EXT4_MIN_INLINE_DATA_SIZE - i_size);
+ }
EXT4_I(inode)->i_inline_size = i_size <
EXT4_MIN_INLINE_DATA_SIZE ?
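The void * cast above is the point of this fix: i_block is an array of __le32, so the old i_block + i_size advanced i_size * 4 bytes instead of i_size bytes. A standalone illustration, assuming the usual 60-byte (15-word) i_block array:

#include <stdio.h>
#include <stdint.h>
#include <string.h>

#define EXT4_MIN_INLINE_DATA_SIZE 60	/* 15 * sizeof(__le32) */

int main(void)
{
	uint32_t i_block[15];
	size_t i_size = 10;		/* bytes of inline data kept */

	memset(i_block, 0xff, sizeof(i_block));

	/* Wrong: uint32_t pointer arithmetic steps in 4-byte units, so
	 * i_block + i_size would land 40 bytes in, not 10:
	 *   memset(i_block + i_size, 0, EXT4_MIN_INLINE_DATA_SIZE - i_size);
	 * Right: byte arithmetic through a char/void pointer. */
	memset((char *)i_block + i_size, 0,
	       EXT4_MIN_INLINE_DATA_SIZE - i_size);

	printf("byte 9: %02x, byte 10: %02x\n",
	       ((unsigned char *)i_block)[9],
	       ((unsigned char *)i_block)[10]);	/* ff, 00 */
	return 0;
}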
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 0d424d7ac02..8a064734e6e 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -38,6 +38,7 @@
#include <linux/slab.h>
#include <linux/ratelimit.h>
#include <linux/aio.h>
+#include <linux/bitops.h>
#include "ext4_jbd2.h"
#include "xattr.h"
@@ -144,8 +145,11 @@ static int ext4_meta_trans_blocks(struct inode *inode, int lblocks,
*/
static int ext4_inode_is_fast_symlink(struct inode *inode)
{
- int ea_blocks = EXT4_I(inode)->i_file_acl ?
- (inode->i_sb->s_blocksize >> 9) : 0;
+ int ea_blocks = EXT4_I(inode)->i_file_acl ?
+ EXT4_CLUSTER_SIZE(inode->i_sb) >> 9 : 0;
+
+ if (ext4_has_inline_data(inode))
+ return 0;
return (S_ISLNK(inode->i_mode) && inode->i_blocks - ea_blocks == 0);
}
@@ -214,7 +218,7 @@ void ext4_evict_inode(struct inode *inode)
jbd2_complete_transaction(journal, commit_tid);
filemap_write_and_wait(&inode->i_data);
}
- truncate_inode_pages(&inode->i_data, 0);
+ truncate_inode_pages_final(&inode->i_data);
WARN_ON(atomic_read(&EXT4_I(inode)->i_ioend_count));
goto no_delete;
@@ -225,7 +229,7 @@ void ext4_evict_inode(struct inode *inode)
if (ext4_should_order_data(inode))
ext4_begin_ordered_truncate(inode, 0);
- truncate_inode_pages(&inode->i_data, 0);
+ truncate_inode_pages_final(&inode->i_data);
WARN_ON(atomic_read(&EXT4_I(inode)->i_ioend_count));
if (is_bad_inode(inode))
@@ -442,7 +446,7 @@ static void ext4_map_blocks_es_recheck(handle_t *handle,
* could be converted.
*/
if (!(flags & EXT4_GET_BLOCKS_NO_LOCK))
- down_read((&EXT4_I(inode)->i_data_sem));
+ down_read(&EXT4_I(inode)->i_data_sem);
if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
retval = ext4_ext_map_blocks(handle, inode, map, flags &
EXT4_GET_BLOCKS_KEEP_SIZE);
@@ -488,8 +492,8 @@ static void ext4_map_blocks_es_recheck(handle_t *handle,
* Otherwise, call with ext4_ind_map_blocks() to handle indirect mapping
* based files
*
- * On success, it returns the number of blocks being mapped or allocate.
- * if create==0 and the blocks are pre-allocated and uninitialized block,
+ * On success, it returns the number of blocks being mapped or allocated.
+ * if create==0 and the blocks are pre-allocated and unwritten,
* the result buffer head is unmapped. If create == 1, it will make sure
* the buffer head is mapped.
*
@@ -503,6 +507,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
{
struct extent_status es;
int retval;
+ int ret = 0;
#ifdef ES_AGGRESSIVE_TEST
struct ext4_map_blocks orig_map;
@@ -514,6 +519,16 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
"logical block %lu\n", inode->i_ino, flags, map->m_len,
(unsigned long) map->m_lblk);
+ /*
+ * ext4_map_blocks returns an int, and m_len is an unsigned int
+ */
+ if (unlikely(map->m_len > INT_MAX))
+ map->m_len = INT_MAX;
+
+ /* We can only handle block numbers less than EXT_MAX_BLOCKS */
+ if (unlikely(map->m_lblk >= EXT_MAX_BLOCKS))
+ return -EIO;
+
/* Lookup extent status tree firstly */
if (ext4_es_lookup_extent(inode, map->m_lblk, &es)) {
ext4_es_lru_add(inode);
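The two new guards bound the request before any lookup: m_len is unsigned but the function's return type is int, so oversized lengths are clamped to INT_MAX, and logical blocks at or beyond EXT_MAX_BLOCKS are rejected with -EIO. The same checks in isolation (the limit value is illustrative):

#include <stdio.h>
#include <limits.h>
#include <errno.h>

#define EXT_MAX_BLOCKS 0xffffffffU	/* illustrative logical-block limit */

/* Model of the two new guards at the top of ext4_map_blocks(). */
static int map_blocks(unsigned int lblk, unsigned int *len)
{
	/* the function returns an int, so clamp the unsigned length */
	if (*len > INT_MAX)
		*len = INT_MAX;
	/* logical block numbers must stay below EXT_MAX_BLOCKS */
	if (lblk >= EXT_MAX_BLOCKS)
		return -EIO;
	return (int)*len;		/* pretend the whole range mapped */
}

int main(void)
{
	unsigned int len = 0x90000000u;
	int ret = map_blocks(42, &len);

	printf("mapped %d of clamped len %u\n", ret, len);
	return 0;
}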
@@ -543,7 +558,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
* file system block.
*/
if (!(flags & EXT4_GET_BLOCKS_NO_LOCK))
- down_read((&EXT4_I(inode)->i_data_sem));
+ down_read(&EXT4_I(inode)->i_data_sem);
if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
retval = ext4_ext_map_blocks(handle, inode, map, flags &
EXT4_GET_BLOCKS_KEEP_SIZE);
@@ -552,7 +567,6 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
EXT4_GET_BLOCKS_KEEP_SIZE);
}
if (retval > 0) {
- int ret;
unsigned int status;
if (unlikely(retval != map->m_len)) {
@@ -579,7 +593,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
found:
if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
- int ret = check_block_validity(inode, map);
+ ret = check_block_validity(inode, map);
if (ret != 0)
return ret;
}
@@ -596,7 +610,13 @@ found:
* with buffer head unmapped.
*/
if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED)
- return retval;
+ /*
+ * If we need to convert the extent to unwritten,
+ * we continue and do the actual work in
+ * ext4_ext_map_blocks()
+ */
+ if (!(flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN))
+ return retval;
/*
* Here we clear m_flags because after allocating a new extent,
@@ -605,12 +625,12 @@ found:
map->m_flags &= ~EXT4_MAP_FLAGS;
/*
- * New blocks allocate and/or writing to uninitialized extent
+ * New blocks allocate and/or writing to unwritten extent
* will possibly result in updating i_data, so we take
* the write lock of i_data_sem, and call get_blocks()
* with create == 1 flag.
*/
- down_write((&EXT4_I(inode)->i_data_sem));
+ down_write(&EXT4_I(inode)->i_data_sem);
/*
* if the caller is from delayed allocation writeout path
@@ -652,7 +672,6 @@ found:
ext4_clear_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED);
if (retval > 0) {
- int ret;
unsigned int status;
if (unlikely(retval != map->m_len)) {
@@ -687,7 +706,7 @@ found:
has_zeroout:
up_write((&EXT4_I(inode)->i_data_sem));
if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
- int ret = check_block_validity(inode, map);
+ ret = check_block_validity(inode, map);
if (ret != 0)
return ret;
}
@@ -906,6 +925,7 @@ int do_journal_get_write_access(handle_t *handle,
*/
if (dirty)
clear_buffer_dirty(bh);
+ BUFFER_TRACE(bh, "get write access");
ret = ext4_journal_get_write_access(handle, bh);
if (!ret && dirty)
ret = ext4_handle_dirty_metadata(handle, NULL, bh);
@@ -1206,7 +1226,6 @@ static int ext4_journalled_write_end(struct file *file,
*/
static int ext4_da_reserve_metadata(struct inode *inode, ext4_lblk_t lblock)
{
- int retries = 0;
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
struct ext4_inode_info *ei = EXT4_I(inode);
unsigned int md_needed;
@@ -1218,7 +1237,6 @@ static int ext4_da_reserve_metadata(struct inode *inode, ext4_lblk_t lblock)
* in order to allocate nrblocks
* worse case is one extent per block
*/
-repeat:
spin_lock(&ei->i_block_reservation_lock);
/*
* ext4_calc_metadata_amount() has side effects, which we have
@@ -1238,10 +1256,6 @@ repeat:
ei->i_da_metadata_calc_len = save_len;
ei->i_da_metadata_calc_last_lblock = save_last_lblock;
spin_unlock(&ei->i_block_reservation_lock);
- if (ext4_should_retry_alloc(inode->i_sb, &retries)) {
- cond_resched();
- goto repeat;
- }
return -ENOSPC;
}
ei->i_reserved_meta_blocks += md_needed;
@@ -1255,7 +1269,6 @@ repeat:
*/
static int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock)
{
- int retries = 0;
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
struct ext4_inode_info *ei = EXT4_I(inode);
unsigned int md_needed;
@@ -1277,7 +1290,6 @@ static int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock)
* in order to allocate nrblocks
* worse case is one extent per block
*/
-repeat:
spin_lock(&ei->i_block_reservation_lock);
/*
* ext4_calc_metadata_amount() has side effects, which we have
@@ -1297,10 +1309,6 @@ repeat:
ei->i_da_metadata_calc_len = save_len;
ei->i_da_metadata_calc_last_lblock = save_last_lblock;
spin_unlock(&ei->i_block_reservation_lock);
- if (ext4_should_retry_alloc(inode->i_sb, &retries)) {
- cond_resched();
- goto repeat;
- }
dquot_release_reservation_block(inode, EXT4_C2B(sbi, 1));
return -ENOSPC;
}
@@ -1536,7 +1544,7 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
ext4_es_lru_add(inode);
if (ext4_es_is_hole(&es)) {
retval = 0;
- down_read((&EXT4_I(inode)->i_data_sem));
+ down_read(&EXT4_I(inode)->i_data_sem);
goto add_delayed;
}
@@ -1573,7 +1581,7 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
* Try to see if we can get the block without requesting a new
* file system block.
*/
- down_read((&EXT4_I(inode)->i_data_sem));
+ down_read(&EXT4_I(inode)->i_data_sem);
if (ext4_has_inline_data(inode)) {
/*
* We will soon create blocks for this page, and let
@@ -1765,6 +1773,7 @@ static int __ext4_journalled_writepage(struct page *page,
BUG_ON(!ext4_handle_valid(handle));
if (inline_data) {
+ BUFFER_TRACE(inode_bh, "get write access");
ret = ext4_journal_get_write_access(handle, inode_bh);
err = ext4_handle_dirty_metadata(handle, inode, inode_bh);
@@ -1784,7 +1793,7 @@ static int __ext4_journalled_writepage(struct page *page,
ret = err;
if (!ext4_has_inline_data(inode))
- ext4_walk_page_buffers(handle, page_bufs, 0, len,
+ ext4_walk_page_buffers(NULL, page_bufs, 0, len,
NULL, bput_one);
ext4_set_inode_state(inode, EXT4_STATE_JDATA);
out:
@@ -1842,6 +1851,7 @@ static int ext4_writepage(struct page *page,
struct buffer_head *page_bufs = NULL;
struct inode *inode = page->mapping->host;
struct ext4_io_submit io_submit;
+ bool keep_towrite = false;
trace_ext4_writepage(page);
size = i_size_read(inode);
@@ -1872,6 +1882,7 @@ static int ext4_writepage(struct page *page,
unlock_page(page);
return 0;
}
+ keep_towrite = true;
}
if (PageChecked(page) && ext4_should_journal_data(inode))
@@ -1888,7 +1899,7 @@ static int ext4_writepage(struct page *page,
unlock_page(page);
return -ENOMEM;
}
- ret = ext4_bio_write_page(&io_submit, page, len, wbc);
+ ret = ext4_bio_write_page(&io_submit, page, len, wbc, keep_towrite);
ext4_io_submit(&io_submit);
/* Drop io_end reference we got from init */
ext4_put_io_end_defer(io_submit.io_end);
@@ -1907,7 +1918,7 @@ static int mpage_submit_page(struct mpage_da_data *mpd, struct page *page)
else
len = PAGE_CACHE_SIZE;
clear_page_dirty_for_io(page);
- err = ext4_bio_write_page(&mpd->io_submit, page, len, mpd->wbc);
+ err = ext4_bio_write_page(&mpd->io_submit, page, len, mpd->wbc, false);
if (!err)
mpd->wbc->nr_to_write--;
mpd->first_page++;
@@ -2028,7 +2039,7 @@ static int mpage_process_page_bufs(struct mpage_da_data *mpd,
* Scan buffers corresponding to changed extent (we expect corresponding pages
* to be already locked) and update buffer state according to new extent state.
* We map delalloc buffers to their physical location, clear unwritten bits,
- * and mark buffers as uninit when we perform writes to uninitialized extents
+ * and mark buffers as uninit when we perform writes to unwritten extents
* and do extent conversion after IO is finished. If the last page is not fully
* mapped, we update @map to the next extent in the last page that needs
* mapping. Otherwise we submit the page for IO.
@@ -2122,12 +2133,12 @@ static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd)
struct inode *inode = mpd->inode;
struct ext4_map_blocks *map = &mpd->map;
int get_blocks_flags;
- int err;
+ int err, dioread_nolock;
trace_ext4_da_write_pages_extent(inode, map);
/*
* Call ext4_map_blocks() to allocate any delayed allocation blocks, or
- * to convert an uninitialized extent to be initialized (in the case
+ * to convert an unwritten extent to be initialized (in the case
* where we have written into one or more preallocated blocks). It is
* possible that we're going to need more metadata blocks than
* previously reserved. However we must not fail because we're in
@@ -2144,7 +2155,8 @@ static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd)
*/
get_blocks_flags = EXT4_GET_BLOCKS_CREATE |
EXT4_GET_BLOCKS_METADATA_NOFAIL;
- if (ext4_should_dioread_nolock(inode))
+ dioread_nolock = ext4_should_dioread_nolock(inode);
+ if (dioread_nolock)
get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT;
if (map->m_flags & (1 << BH_Delay))
get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE;
@@ -2152,7 +2164,7 @@ static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd)
err = ext4_map_blocks(handle, inode, map, get_blocks_flags);
if (err < 0)
return err;
- if (map->m_flags & EXT4_MAP_UNINIT) {
+ if (dioread_nolock && (map->m_flags & EXT4_MAP_UNWRITTEN)) {
if (!mpd->io_submit.io_end->handle &&
ext4_handle_valid(handle)) {
mpd->io_submit.io_end->handle = handle->h_rsv_handle;
@@ -2178,6 +2190,9 @@ static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd)
*
* @handle - handle for journal operations
* @mpd - extent to map
+ * @give_up_on_write - we set this to true iff there is a fatal error and there
+ * is no hope of writing the data. The caller should discard
+ * dirty pages to avoid infinite loops.
*
* The function maps extent starting at mpd->lblk of length mpd->len. If it is
* delayed, blocks are allocated, if it is unwritten, we may need to convert
@@ -2240,13 +2255,23 @@ static int mpage_map_and_submit_extent(handle_t *handle,
return err;
} while (map->m_len);
- /* Update on-disk size after IO is submitted */
+ /*
+ * Update on-disk size after IO is submitted. Races with
+ * truncate are avoided by checking i_size under i_data_sem.
+ */
disksize = ((loff_t)mpd->first_page) << PAGE_CACHE_SHIFT;
if (disksize > EXT4_I(inode)->i_disksize) {
int err2;
-
- ext4_wb_update_i_disksize(inode, disksize);
+ loff_t i_size;
+
+ down_write(&EXT4_I(inode)->i_data_sem);
+ i_size = i_size_read(inode);
+ if (disksize > i_size)
+ disksize = i_size;
+ if (disksize > EXT4_I(inode)->i_disksize)
+ EXT4_I(inode)->i_disksize = disksize;
err2 = ext4_mark_inode_dirty(handle, inode);
+ up_write(&EXT4_I(inode)->i_data_sem);
if (err2)
ext4_error(inode->i_sb,
"Failed to mark inode %lu dirty",
@@ -2295,6 +2320,7 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd)
struct address_space *mapping = mpd->inode->i_mapping;
struct pagevec pvec;
unsigned int nr_pages;
+ long left = mpd->wbc->nr_to_write;
pgoff_t index = mpd->first_page;
pgoff_t end = mpd->last_page;
int tag;
@@ -2330,6 +2356,17 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd)
if (page->index > end)
goto out;
+ /*
+ * Accumulated enough dirty pages? This doesn't apply
+ * to WB_SYNC_ALL mode. For integrity sync we have to
+ * keep going because someone may be concurrently
+ * dirtying pages, and we might have synced a lot of
+ * newly appeared dirty pages, but have not synced all
+ * of the old dirty pages.
+ */
+ if (mpd->wbc->sync_mode == WB_SYNC_NONE && left <= 0)
+ goto out;
+
/* If we can't merge this page, we are done. */
if (mpd->map.m_len > 0 && mpd->next_page != page->index)
goto out;
@@ -2364,19 +2401,7 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd)
if (err <= 0)
goto out;
err = 0;
-
- /*
- * Accumulated enough dirty pages? This doesn't apply
- * to WB_SYNC_ALL mode. For integrity sync we have to
- * keep going because someone may be concurrently
- * dirtying pages, and we might have synced a lot of
- * newly appeared dirty pages, but have not synced all
- * of the old dirty pages.
- */
- if (mpd->wbc->sync_mode == WB_SYNC_NONE &&
- mpd->next_page - mpd->first_page >=
- mpd->wbc->nr_to_write)
- goto out;
+ left--;
}
pagevec_release(&pvec);
cond_resched();
@@ -2420,16 +2445,15 @@ static int ext4_writepages(struct address_space *mapping,
* because that could violate lock ordering on umount
*/
if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
- return 0;
+ goto out_writepages;
if (ext4_should_journal_data(inode)) {
struct blk_plug plug;
- int ret;
blk_start_plug(&plug);
ret = write_cache_pages(mapping, wbc, __writepage, mapping);
blk_finish_plug(&plug);
- return ret;
+ goto out_writepages;
}
/*
@@ -2442,8 +2466,10 @@ static int ext4_writepages(struct address_space *mapping,
* *never* be called, so if that ever happens, we would want
* the stack trace.
*/
- if (unlikely(sbi->s_mount_flags & EXT4_MF_FS_ABORTED))
- return -EROFS;
+ if (unlikely(sbi->s_mount_flags & EXT4_MF_FS_ABORTED)) {
+ ret = -EROFS;
+ goto out_writepages;
+ }
if (ext4_should_dioread_nolock(inode)) {
/*
@@ -2563,7 +2589,7 @@ retry:
break;
}
blk_finish_plug(&plug);
- if (!ret && !cycled) {
+ if (!ret && !cycled && wbc->nr_to_write > 0) {
cycled = 1;
mpd.last_page = writeback_index - 1;
mpd.first_page = 0;
@@ -3052,9 +3078,9 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
* preallocated extents, and those write extend the file, no need to
* fall back to buffered IO.
*
- * For holes, we fallocate those blocks, mark them as uninitialized
+ * For holes, we fallocate those blocks, mark them as unwritten.
* If those blocks were preallocated, we make sure they are split, but
- * still keep the range to write as uninitialized.
+ * still keep the range to write as unwritten.
*
* The unwritten extents will be converted to written when DIO is completed.
* For async direct IO, since the IO may still be pending when we return, we
@@ -3067,13 +3093,12 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
*
*/
static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
- const struct iovec *iov, loff_t offset,
- unsigned long nr_segs)
+ struct iov_iter *iter, loff_t offset)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file->f_mapping->host;
ssize_t ret;
- size_t count = iov_length(iov, nr_segs);
+ size_t count = iov_iter_count(iter);
int overwrite = 0;
get_block_t *get_block_func = NULL;
int dio_flags = 0;
@@ -3082,7 +3107,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
/* Use the old path for reads and writes beyond i_size. */
if (rw != WRITE || final_size > inode->i_size)
- return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs);
+ return ext4_ind_direct_IO(rw, iocb, iter, offset);
BUG_ON(iocb->private == NULL);
@@ -3106,12 +3131,12 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
* We could direct write to holes and fallocate.
*
* Allocated blocks to fill the hole are marked as
- * uninitialized to prevent parallel buffered read to expose
+ * unwritten to prevent a parallel buffered read from exposing
* the stale data before DIO completes the data IO.
*
* As to previously fallocated extents, ext4 get_block will
* simply mark the buffer mapped but still keep the
- * extents uninitialized.
+ * extents unwritten.
*
* For the non-AIO case, we will convert those unwritten extents
* to written after returning from blockdev_direct_IO.
@@ -3149,8 +3174,8 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
dio_flags = DIO_LOCKING;
}
ret = __blockdev_direct_IO(rw, iocb, inode,
- inode->i_sb->s_bdev, iov,
- offset, nr_segs,
+ inode->i_sb->s_bdev, iter,
+ offset,
get_block_func,
ext4_end_io_dio,
NULL,
@@ -3204,11 +3229,11 @@ retake_lock:
}
static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
- const struct iovec *iov, loff_t offset,
- unsigned long nr_segs)
+ struct iov_iter *iter, loff_t offset)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file->f_mapping->host;
+ size_t count = iov_iter_count(iter);
ssize_t ret;
/*
@@ -3221,13 +3246,12 @@ static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
if (ext4_has_inline_data(inode))
return 0;
- trace_ext4_direct_IO_enter(inode, offset, iov_length(iov, nr_segs), rw);
+ trace_ext4_direct_IO_enter(inode, offset, count, rw);
if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
- ret = ext4_ext_direct_IO(rw, iocb, iov, offset, nr_segs);
+ ret = ext4_ext_direct_IO(rw, iocb, iter, offset);
else
- ret = ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs);
- trace_ext4_direct_IO_exit(inode, offset,
- iov_length(iov, nr_segs), rw, ret);
+ ret = ext4_ind_direct_IO(rw, iocb, iter, offset);
+ trace_ext4_direct_IO_exit(inode, offset, count, rw, ret);
return ret;
}
@@ -3320,33 +3344,13 @@ void ext4_set_aops(struct inode *inode)
}
/*
- * ext4_block_truncate_page() zeroes out a mapping from file offset `from'
- * up to the end of the block which corresponds to `from'.
- * This required during truncate. We need to physically zero the tail end
- * of that block so it doesn't yield old data if the file is later grown.
- */
-int ext4_block_truncate_page(handle_t *handle,
- struct address_space *mapping, loff_t from)
-{
- unsigned offset = from & (PAGE_CACHE_SIZE-1);
- unsigned length;
- unsigned blocksize;
- struct inode *inode = mapping->host;
-
- blocksize = inode->i_sb->s_blocksize;
- length = blocksize - (offset & (blocksize - 1));
-
- return ext4_block_zero_page_range(handle, mapping, from, length);
-}
-
-/*
* ext4_block_zero_page_range() zeros out a mapping of length 'length'
* starting from file offset 'from'. The range to be zero'd must
* be contained within one block. If the specified range exceeds
* the end of the block, it will be shortened to the end of the block
* that corresponds to 'from'.
*/
-int ext4_block_zero_page_range(handle_t *handle,
+static int ext4_block_zero_page_range(handle_t *handle,
struct address_space *mapping, loff_t from, loff_t length)
{
ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT;
@@ -3436,6 +3440,26 @@ unlock:
return err;
}
+/*
+ * ext4_block_truncate_page() zeroes out a mapping from file offset `from'
+ * up to the end of the block which corresponds to `from'.
+ * This is required during truncate. We need to physically zero the tail end
+ * of that block so it doesn't yield old data if the file is later grown.
+ */
+static int ext4_block_truncate_page(handle_t *handle,
+ struct address_space *mapping, loff_t from)
+{
+ unsigned offset = from & (PAGE_CACHE_SIZE-1);
+ unsigned length;
+ unsigned blocksize;
+ struct inode *inode = mapping->host;
+
+ blocksize = inode->i_sb->s_blocksize;
+ length = blocksize - (offset & (blocksize - 1));
+
+ return ext4_block_zero_page_range(handle, mapping, from, length);
+}
+
int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode,
loff_t lstart, loff_t length)
{
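The relocated ext4_block_truncate_page() above is a thin wrapper that turns a file offset into a (from, length) pair covering the rest of the containing block. Worked numbers, assuming a 4096-byte page and a 1024-byte block: from = 10000 gives offset = 10000 & 4095 = 1808 and length = 1024 - (1808 & 1023) = 240, so bytes 10000..10239 are zeroed. In standalone form:

#include <stdio.h>

#define PAGE_CACHE_SIZE 4096u

int main(void)
{
	unsigned blocksize = 1024;		/* illustrative 1k block */
	unsigned long long from = 10000;

	unsigned offset = from & (PAGE_CACHE_SIZE - 1);
	unsigned length = blocksize - (offset & (blocksize - 1));

	printf("zero %u bytes at offset %llu\n", length, from);
	/* 240 bytes: 10000 .. 10239, i.e. to the end of the block */
	return 0;
}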
@@ -3509,12 +3533,7 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
if (!S_ISREG(inode->i_mode))
return -EOPNOTSUPP;
- if (EXT4_SB(sb)->s_cluster_ratio > 1) {
- /* TODO: Add support for bigalloc file systems */
- return -EOPNOTSUPP;
- }
-
- trace_ext4_punch_hole(inode, offset, length);
+ trace_ext4_punch_hole(inode, offset, length, 0);
/*
* Write out all dirty pages to avoid race conditions
@@ -3528,15 +3547,6 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
}
mutex_lock(&inode->i_mutex);
- /* It's not possible punch hole on append only file */
- if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) {
- ret = -EPERM;
- goto out_mutex;
- }
- if (IS_SWAPFILE(inode)) {
- ret = -ETXTBSY;
- goto out_mutex;
- }
/* No need to punch hole beyond i_size */
if (offset >= inode->i_size)
@@ -3617,10 +3627,15 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
ret = ext4_free_hole_blocks(handle, inode, first_block,
stop_block);
- ext4_discard_preallocations(inode);
up_write(&EXT4_I(inode)->i_data_sem);
if (IS_SYNC(inode))
ext4_handle_sync(handle);
+
+ /* Now release the pages again to reduce race window */
+ if (last_block_offset > first_block_offset)
+ truncate_pagecache_range(inode, first_block_offset,
+ last_block_offset);
+
inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
ext4_mark_inode_dirty(handle, inode);
out_stop:
@@ -3694,7 +3709,7 @@ void ext4_truncate(struct inode *inode)
/*
* There is a possibility that we're either freeing the inode
- * or it completely new indode. In those cases we might not
+ * or it's a completely new inode. In those cases we might not
* have i_mutex locked because it's not necessary.
*/
if (!(inode->i_state & (I_NEW|I_FREEING)))
@@ -3934,18 +3949,20 @@ int ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc)
void ext4_set_inode_flags(struct inode *inode)
{
unsigned int flags = EXT4_I(inode)->i_flags;
+ unsigned int new_fl = 0;
- inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC);
if (flags & EXT4_SYNC_FL)
- inode->i_flags |= S_SYNC;
+ new_fl |= S_SYNC;
if (flags & EXT4_APPEND_FL)
- inode->i_flags |= S_APPEND;
+ new_fl |= S_APPEND;
if (flags & EXT4_IMMUTABLE_FL)
- inode->i_flags |= S_IMMUTABLE;
+ new_fl |= S_IMMUTABLE;
if (flags & EXT4_NOATIME_FL)
- inode->i_flags |= S_NOATIME;
+ new_fl |= S_NOATIME;
if (flags & EXT4_DIRSYNC_FL)
- inode->i_flags |= S_DIRSYNC;
+ new_fl |= S_DIRSYNC;
+ inode_set_flags(inode, new_fl,
+ S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC);
}
/* Propagate flags from i_flags to EXT4_I(inode)->i_flags */
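Building new_fl on the stack and handing it to inode_set_flags() replaces the clear-then-set sequence on the live inode, whose intermediate state a lockless reader could observe. The translation step in miniature (flag values here are illustrative, not the kernel's):

#include <stdio.h>

/* illustrative on-disk flags */
#define EXT4_SYNC_FL      0x08u
#define EXT4_IMMUTABLE_FL 0x10u
/* illustrative in-core flags */
#define S_SYNC            0x01u
#define S_IMMUTABLE       0x02u

static unsigned translate_flags(unsigned ext4_flags)
{
	unsigned new_fl = 0;	/* built privately, published in one shot */

	if (ext4_flags & EXT4_SYNC_FL)
		new_fl |= S_SYNC;
	if (ext4_flags & EXT4_IMMUTABLE_FL)
		new_fl |= S_IMMUTABLE;
	return new_fl;
}

int main(void)
{
	printf("%#x\n", translate_flags(EXT4_SYNC_FL | EXT4_IMMUTABLE_FL));
	return 0;
}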
@@ -4164,11 +4181,13 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
EXT4_INODE_GET_XTIME(i_atime, inode, raw_inode);
EXT4_EINODE_GET_XTIME(i_crtime, ei, raw_inode);
- inode->i_version = le32_to_cpu(raw_inode->i_disk_version);
- if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
- if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi))
- inode->i_version |=
- (__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32;
+ if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) {
+ inode->i_version = le32_to_cpu(raw_inode->i_disk_version);
+ if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
+ if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi))
+ inode->i_version |=
+ (__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32;
+ }
}
ret = 0;
@@ -4291,12 +4310,15 @@ static int ext4_do_update_inode(handle_t *handle,
struct ext4_inode *raw_inode = ext4_raw_inode(iloc);
struct ext4_inode_info *ei = EXT4_I(inode);
struct buffer_head *bh = iloc->bh;
+ struct super_block *sb = inode->i_sb;
int err = 0, rc, block;
- int need_datasync = 0;
+ int need_datasync = 0, set_large_file = 0;
uid_t i_uid;
gid_t i_gid;
- /* For fields not not tracking in the in-memory inode,
+ spin_lock(&ei->i_raw_lock);
+
+ /* For fields not tracked in the in-memory inode,
* initialise them to zero for new inodes. */
if (ext4_test_inode_state(inode, EXT4_STATE_NEW))
memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size);
@@ -4334,12 +4356,13 @@ static int ext4_do_update_inode(handle_t *handle,
EXT4_INODE_SET_XTIME(i_atime, inode, raw_inode);
EXT4_EINODE_SET_XTIME(i_crtime, ei, raw_inode);
- if (ext4_inode_blocks_set(handle, raw_inode, ei))
+ if (ext4_inode_blocks_set(handle, raw_inode, ei)) {
+ spin_unlock(&ei->i_raw_lock);
goto out_brelse;
+ }
raw_inode->i_dtime = cpu_to_le32(ei->i_dtime);
raw_inode->i_flags = cpu_to_le32(ei->i_flags & 0xFFFFFFFF);
- if (EXT4_SB(inode->i_sb)->s_es->s_creator_os !=
- cpu_to_le32(EXT4_OS_HURD))
+ if (likely(!test_opt2(inode->i_sb, HURD_COMPAT)))
raw_inode->i_file_acl_high =
cpu_to_le16(ei->i_file_acl >> 32);
raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl);
@@ -4348,24 +4371,11 @@ static int ext4_do_update_inode(handle_t *handle,
need_datasync = 1;
}
if (ei->i_disksize > 0x7fffffffULL) {
- struct super_block *sb = inode->i_sb;
if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
EXT4_FEATURE_RO_COMPAT_LARGE_FILE) ||
EXT4_SB(sb)->s_es->s_rev_level ==
- cpu_to_le32(EXT4_GOOD_OLD_REV)) {
- /* If this is the first large file
- * created, add a flag to the superblock.
- */
- err = ext4_journal_get_write_access(handle,
- EXT4_SB(sb)->s_sbh);
- if (err)
- goto out_brelse;
- ext4_update_dynamic_rev(sb);
- EXT4_SET_RO_COMPAT_FEATURE(sb,
- EXT4_FEATURE_RO_COMPAT_LARGE_FILE);
- ext4_handle_sync(handle);
- err = ext4_handle_dirty_super(handle, sb);
- }
+ cpu_to_le32(EXT4_GOOD_OLD_REV))
+ set_large_file = 1;
}
raw_inode->i_generation = cpu_to_le32(inode->i_generation);
if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {
@@ -4384,22 +4394,37 @@ static int ext4_do_update_inode(handle_t *handle,
raw_inode->i_block[block] = ei->i_data[block];
}
- raw_inode->i_disk_version = cpu_to_le32(inode->i_version);
- if (ei->i_extra_isize) {
- if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi))
- raw_inode->i_version_hi =
- cpu_to_le32(inode->i_version >> 32);
- raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize);
+ if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) {
+ raw_inode->i_disk_version = cpu_to_le32(inode->i_version);
+ if (ei->i_extra_isize) {
+ if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi))
+ raw_inode->i_version_hi =
+ cpu_to_le32(inode->i_version >> 32);
+ raw_inode->i_extra_isize =
+ cpu_to_le16(ei->i_extra_isize);
+ }
}
ext4_inode_csum_set(inode, raw_inode, ei);
+ spin_unlock(&ei->i_raw_lock);
+
BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
rc = ext4_handle_dirty_metadata(handle, NULL, bh);
if (!err)
err = rc;
ext4_clear_inode_state(inode, EXT4_STATE_NEW);
-
+ if (set_large_file) {
+ BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get write access");
+ err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh);
+ if (err)
+ goto out_brelse;
+ ext4_update_dynamic_rev(sb);
+ EXT4_SET_RO_COMPAT_FEATURE(sb,
+ EXT4_FEATURE_RO_COMPAT_LARGE_FILE);
+ ext4_handle_sync(handle);
+ err = ext4_handle_dirty_super(handle, sb);
+ }
ext4_update_inode_fsync_trans(handle, inode, need_datasync);
out_brelse:
brelse(bh);
@@ -4412,21 +4437,20 @@ out_brelse:
*
* We are called from a few places:
*
- * - Within generic_file_write() for O_SYNC files.
+ * - Within generic_file_aio_write() -> generic_write_sync() for O_SYNC files.
* Here, there will be no transaction running. We wait for any running
* transaction to commit.
*
- * - Within sys_sync(), kupdate and such.
- * We wait on commit, if tol to.
+ * - Within flush work (sys_sync(), kupdate and such).
+ * We wait on commit, if told to.
*
- * - Within prune_icache() (PF_MEMALLOC == true)
- * Here we simply return. We can't afford to block kswapd on the
- * journal commit.
+ * - Within iput_final() -> write_inode_now()
+ * We wait on commit, if told to.
*
* In all cases it is actually safe for us to return without doing anything,
* because the inode has been copied into a raw inode buffer in
- * ext4_mark_inode_dirty(). This is a correctness thing for O_SYNC and for
- * knfsd.
+ * ext4_mark_inode_dirty(). This is a correctness thing for WB_SYNC_ALL
+ * writeback.
*
* Note that we are absolutely dependent upon all inode dirtiers doing the
* right thing: they *must* call mark_inode_dirty() after dirtying info in
@@ -4438,15 +4462,15 @@ out_brelse:
* stuff();
* inode->i_size = expr;
*
- * is in error because a kswapd-driven write_inode() could occur while
- * `stuff()' is running, and the new i_size will be lost. Plus the inode
- * will no longer be on the superblock's dirty inode list.
+ * is in error because write_inode() could occur while `stuff()' is running,
+ * and the new i_size will be lost. Plus the inode will no longer be on the
+ * superblock's dirty inode list.
*/
int ext4_write_inode(struct inode *inode, struct writeback_control *wbc)
{
int err;
- if (current->flags & PF_MEMALLOC)
+ if (WARN_ON_ONCE(current->flags & PF_MEMALLOC))
return 0;
if (EXT4_SB(inode->i_sb)->s_journal) {
@@ -4456,7 +4480,12 @@ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc)
return -EIO;
}
- if (wbc->sync_mode != WB_SYNC_ALL)
+ /*
+ * No need to force transaction in WB_SYNC_NONE mode. Also
+ * ext4_sync_fs() will force the commit after everything is
+ * written.
+ */
+ if (wbc->sync_mode != WB_SYNC_ALL || wbc->for_sync)
return 0;
err = ext4_force_commit(inode->i_sb);
@@ -4466,7 +4495,11 @@ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc)
err = __ext4_get_inode_loc(inode, &iloc, 0);
if (err)
return err;
- if (wbc->sync_mode == WB_SYNC_ALL)
+ /*
+ * sync(2) will flush the whole buffer cache. No need to do
+ * it here separately for each inode.
+ */
+ if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync)
sync_dirty_buffer(iloc.bh);
if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) {
EXT4_ERROR_INODE_BLOCK(inode, iloc.bh->b_blocknr,
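With the for_sync checks, the journal commit is forced only for WB_SYNC_ALL writeback that is not driven by sys_sync(), since ext4_sync_fs() will issue one commit at the end anyway. The decision in table form, with field names mirroring writeback_control (a sketch, not the kernel structure):

#include <stdio.h>
#include <stdbool.h>

enum sync_mode { WB_SYNC_NONE, WB_SYNC_ALL };

struct writeback_control {
	enum sync_mode sync_mode;
	bool for_sync;		/* set when called from sys_sync() */
};

static bool must_force_commit(const struct writeback_control *wbc)
{
	/* journal case: skip for WB_SYNC_NONE and for sys_sync() */
	return wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync;
}

int main(void)
{
	struct writeback_control cases[] = {
		{ WB_SYNC_NONE, false },	/* background: skip */
		{ WB_SYNC_ALL,  true  },	/* sys_sync:   skip */
		{ WB_SYNC_ALL,  false },	/* fsync path: force */
	};
	unsigned i;

	for (i = 0; i < 3; i++)
		printf("case %u: %s\n", i,
		       must_force_commit(&cases[i]) ? "force" : "skip");
	return 0;
}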
@@ -4594,6 +4627,10 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
if (attr->ia_size > sbi->s_bitmap_maxbytes)
return -EFBIG;
}
+
+ if (IS_I_VERSION(inode) && attr->ia_size != inode->i_size)
+ inode_inc_iversion(inode);
+
if (S_ISREG(inode->i_mode) &&
(attr->ia_size < inode->i_size)) {
if (ext4_should_order_data(inode)) {
@@ -4671,7 +4708,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
ext4_orphan_del(NULL, inode);
if (!rc && (ia_valid & ATTR_MODE))
- rc = ext4_acl_chmod(inode);
+ rc = posix_acl_chmod(inode, inode->i_mode);
err_out:
ext4_std_error(inode->i_sb, error);
@@ -4690,6 +4727,15 @@ int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,
generic_fillattr(inode, stat);
/*
+ * If there is inline data in the inode, the inode will normally not
+ * have data blocks allocated (it may have an external xattr block).
+ * Report at least one sector for such files, so that tools like tar,
+ * rsync, and others don't incorrectly think the file is completely sparse.
+ */
+ if (unlikely(ext4_has_inline_data(inode)))
+ stat->blocks += (stat->size + 511) >> 9;
+
+ /*
* We can't update i_blocks if the block allocation is delayed
* otherwise in the case of system crash before the real block
* allocation is done, we will have i_blocks inconsistent with
@@ -4700,9 +4746,8 @@ int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,
* blocks for this file.
*/
delalloc_blocks = EXT4_C2B(EXT4_SB(inode->i_sb),
- EXT4_I(inode)->i_reserved_data_blocks);
-
- stat->blocks += delalloc_blocks << (inode->i_sb->s_blocksize_bits-9);
+ EXT4_I(inode)->i_reserved_data_blocks);
+ stat->blocks += delalloc_blocks << (inode->i_sb->s_blocksize_bits - 9);
return 0;
}
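The getattr fix above rounds the inline size up to 512-byte sectors: a 600-byte inline file reports (600 + 511) >> 9 = 2 sectors instead of zero, so sparse-aware tools copy the data. The rounding in isolation:

#include <stdio.h>

/* Round a byte count up to 512-byte sectors, as in the getattr fix. */
static unsigned long long bytes_to_sectors(unsigned long long size)
{
	return (size + 511) >> 9;
}

int main(void)
{
	printf("%llu\n", bytes_to_sectors(600));	/* 2 */
	printf("%llu\n", bytes_to_sectors(0));		/* 0 */
	return 0;
}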
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index a569d335f80..0f2252ec274 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -101,28 +101,18 @@ static long swap_inode_boot_loader(struct super_block *sb,
handle_t *handle;
int err;
struct inode *inode_bl;
- struct ext4_inode_info *ei;
struct ext4_inode_info *ei_bl;
- struct ext4_sb_info *sbi;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
- if (inode->i_nlink != 1 || !S_ISREG(inode->i_mode)) {
- err = -EINVAL;
- goto swap_boot_out;
- }
-
- if (!inode_owner_or_capable(inode) || !capable(CAP_SYS_ADMIN)) {
- err = -EPERM;
- goto swap_boot_out;
- }
+ if (inode->i_nlink != 1 || !S_ISREG(inode->i_mode))
+ return -EINVAL;
- sbi = EXT4_SB(sb);
- ei = EXT4_I(inode);
+ if (!inode_owner_or_capable(inode) || !capable(CAP_SYS_ADMIN))
+ return -EPERM;
inode_bl = ext4_iget(sb, EXT4_BOOT_LOADER_INO);
- if (IS_ERR(inode_bl)) {
- err = PTR_ERR(inode_bl);
- goto swap_boot_out;
- }
+ if (IS_ERR(inode_bl))
+ return PTR_ERR(inode_bl);
ei_bl = EXT4_I(inode_bl);
filemap_flush(inode->i_mapping);
@@ -130,7 +120,7 @@ static long swap_inode_boot_loader(struct super_block *sb,
/* Protect the original inodes against truncate and make sure
* that only one swap_inode_boot_loader is running. */
- ext4_inode_double_lock(inode, inode_bl);
+ lock_two_nondirectories(inode, inode_bl);
truncate_inode_pages(&inode->i_data, 0);
truncate_inode_pages(&inode_bl->i_data, 0);
@@ -144,7 +134,7 @@ static long swap_inode_boot_loader(struct super_block *sb,
handle = ext4_journal_start(inode_bl, EXT4_HT_MOVE_EXTENTS, 2);
if (IS_ERR(handle)) {
err = -EINVAL;
- goto swap_boot_out;
+ goto journal_err_out;
}
/* Protect extent tree against block allocations via delalloc */
@@ -197,19 +187,14 @@ static long swap_inode_boot_loader(struct super_block *sb,
ext4_mark_inode_dirty(handle, inode);
}
}
-
ext4_journal_stop(handle);
-
ext4_double_up_write_data_sem(inode, inode_bl);
+journal_err_out:
ext4_inode_resume_unlocked_dio(inode);
ext4_inode_resume_unlocked_dio(inode_bl);
-
- ext4_inode_double_unlock(inode, inode_bl);
-
+ unlock_two_nondirectories(inode, inode_bl);
iput(inode_bl);
-
-swap_boot_out:
return err;
}
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index a41e3ba8cfa..2dcb936be90 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -722,6 +722,7 @@ void ext4_mb_generate_buddy(struct super_block *sb,
void *buddy, void *bitmap, ext4_group_t group)
{
struct ext4_group_info *grp = ext4_get_group_info(sb, group);
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
ext4_grpblk_t max = EXT4_CLUSTERS_PER_GROUP(sb);
ext4_grpblk_t i = 0;
ext4_grpblk_t first;
@@ -751,14 +752,17 @@ void ext4_mb_generate_buddy(struct super_block *sb,
if (free != grp->bb_free) {
ext4_grp_locked_error(sb, group, 0, 0,
- "%u clusters in bitmap, %u in gd; "
- "block bitmap corrupt.",
+ "block bitmap and bg descriptor "
+ "inconsistent: %u vs %u free clusters",
free, grp->bb_free);
/*
* If we intend to continue, we consider the group descriptor
* corrupt and update bb_free using the bitmap value
*/
grp->bb_free = free;
+ if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp))
+ percpu_counter_sub(&sbi->s_freeclusters_counter,
+ grp->bb_free);
set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state);
}
mb_set_largest_free_order(sb, grp);
@@ -989,7 +993,7 @@ static int ext4_mb_get_buddy_page_lock(struct super_block *sb,
poff = block % blocks_per_page;
page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
if (!page)
- return -EIO;
+ return -ENOMEM;
BUG_ON(page->mapping != inode->i_mapping);
e4b->bd_bitmap_page = page;
e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize);
@@ -1003,7 +1007,7 @@ static int ext4_mb_get_buddy_page_lock(struct super_block *sb,
pnum = block / blocks_per_page;
page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
if (!page)
- return -EIO;
+ return -ENOMEM;
BUG_ON(page->mapping != inode->i_mapping);
e4b->bd_buddy_page = page;
return 0;
@@ -1044,6 +1048,8 @@ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
* allocating. If we are looking at the buddy cache we would
* have taken a reference using ext4_mb_load_buddy and that
* would have pinned buddy page to page cache.
+ * The call to ext4_mb_get_buddy_page_lock will mark the
+ * page accessed.
*/
ret = ext4_mb_get_buddy_page_lock(sb, group, &e4b);
if (ret || !EXT4_MB_GRP_NEED_INIT(this_grp)) {
@@ -1062,7 +1068,6 @@ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
ret = -EIO;
goto err;
}
- mark_page_accessed(page);
if (e4b.bd_buddy_page == NULL) {
/*
@@ -1082,7 +1087,6 @@ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
ret = -EIO;
goto err;
}
- mark_page_accessed(page);
err:
ext4_mb_put_buddy_page_lock(&e4b);
return ret;
@@ -1141,7 +1145,7 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
/* we could use find_or_create_page(), but it locks the page,
* which we'd like to avoid in the fast path ... */
- page = find_get_page(inode->i_mapping, pnum);
+ page = find_get_page_flags(inode->i_mapping, pnum, FGP_ACCESSED);
if (page == NULL || !PageUptodate(page)) {
if (page)
/*
@@ -1168,19 +1172,24 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
unlock_page(page);
}
}
- if (page == NULL || !PageUptodate(page)) {
+ if (page == NULL) {
+ ret = -ENOMEM;
+ goto err;
+ }
+ if (!PageUptodate(page)) {
ret = -EIO;
goto err;
}
+
+ /* Pages marked accessed already */
e4b->bd_bitmap_page = page;
e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize);
- mark_page_accessed(page);
block++;
pnum = block / blocks_per_page;
poff = block % blocks_per_page;
- page = find_get_page(inode->i_mapping, pnum);
+ page = find_get_page_flags(inode->i_mapping, pnum, FGP_ACCESSED);
if (page == NULL || !PageUptodate(page)) {
if (page)
page_cache_release(page);
@@ -1197,13 +1206,18 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
unlock_page(page);
}
}
- if (page == NULL || !PageUptodate(page)) {
+ if (page == NULL) {
+ ret = -ENOMEM;
+ goto err;
+ }
+ if (!PageUptodate(page)) {
ret = -EIO;
goto err;
}
+
+ /* Pages marked accessed already */
e4b->bd_buddy_page = page;
e4b->bd_buddy = page_address(page) + (poff * sb->s_blocksize);
- mark_page_accessed(page);
BUG_ON(e4b->bd_bitmap_page == NULL);
BUG_ON(e4b->bd_buddy_page == NULL);
@@ -1421,6 +1435,7 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
right_is_free = !mb_test_bit(last + 1, e4b->bd_bitmap);
if (unlikely(block != -1)) {
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
ext4_fsblk_t blocknr;
blocknr = ext4_group_first_block_no(sb, e4b->bd_group);
@@ -1431,6 +1446,9 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
"freeing already freed block "
"(bit %u); block bitmap corrupt.",
block);
+ if (!EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info))
+ percpu_counter_sub(&sbi->s_freeclusters_counter,
+ e4b->bd_info->bb_free);
/* Mark the block group as corrupt. */
set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT,
&e4b->bd_info->bb_state);
@@ -1808,6 +1826,7 @@ int ext4_mb_find_by_goal(struct ext4_allocation_context *ac,
ext4_lock_group(ac->ac_sb, group);
max = mb_find_extent(e4b, ac->ac_g_ex.fe_start,
ac->ac_g_ex.fe_len, &ex);
+ ex.fe_logical = 0xDEADFA11; /* debug value */
if (max >= ac->ac_g_ex.fe_len && ac->ac_g_ex.fe_len == sbi->s_stripe) {
ext4_fsblk_t start;
@@ -1936,7 +1955,7 @@ void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
*/
break;
}
-
+ ex.fe_logical = 0xDEADC0DE; /* debug value */
ext4_mb_measure_extent(ac, &ex, e4b);
i += ex.fe_len;
@@ -1977,6 +1996,7 @@ void ext4_mb_scan_aligned(struct ext4_allocation_context *ac,
max = mb_find_extent(e4b, i, sbi->s_stripe, &ex);
if (max >= sbi->s_stripe) {
ac->ac_found++;
+ ex.fe_logical = 0xDEADF00D; /* debug value */
ac->ac_b_ex = ex;
ext4_mb_use_best_found(ac, e4b);
break;
@@ -2607,7 +2627,7 @@ int ext4_mb_init(struct super_block *sb)
sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group);
if (sbi->s_locality_groups == NULL) {
ret = -ENOMEM;
- goto out_free_groupinfo_slab;
+ goto out;
}
for_each_possible_cpu(i) {
struct ext4_locality_group *lg;
@@ -2632,8 +2652,6 @@ int ext4_mb_init(struct super_block *sb)
out_free_locality_groups:
free_percpu(sbi->s_locality_groups);
sbi->s_locality_groups = NULL;
-out_free_groupinfo_slab:
- ext4_groupinfo_destroy_slabs();
out:
kfree(sbi->s_mb_offsets);
sbi->s_mb_offsets = NULL;
@@ -2866,6 +2884,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
if (!bitmap_bh)
goto out_err;
+ BUFFER_TRACE(bitmap_bh, "getting write access");
err = ext4_journal_get_write_access(handle, bitmap_bh);
if (err)
goto out_err;
@@ -2878,6 +2897,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
ext4_debug("using block group %u(%d)\n", ac->ac_b_ex.fe_group,
ext4_free_group_clusters(sb, gdp));
+ BUFFER_TRACE(gdp_bh, "get_write_access");
err = ext4_journal_get_write_access(handle, gdp_bh);
if (err)
goto out_err;
@@ -3135,7 +3155,7 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
}
BUG_ON(start + size <= ac->ac_o_ex.fe_logical &&
start > ac->ac_o_ex.fe_logical);
- BUG_ON(size <= 0 || size > EXT4_CLUSTERS_PER_GROUP(ac->ac_sb));
+ BUG_ON(size <= 0 || size > EXT4_BLOCKS_PER_GROUP(ac->ac_sb));
/* now prepare goal request */
@@ -3442,6 +3462,9 @@ static void ext4_mb_pa_callback(struct rcu_head *head)
{
struct ext4_prealloc_space *pa;
pa = container_of(head, struct ext4_prealloc_space, u.pa_rcu);
+
+ BUG_ON(atomic_read(&pa->pa_count));
+ BUG_ON(pa->pa_deleted == 0);
kmem_cache_free(ext4_pspace_cachep, pa);
}
@@ -3455,11 +3478,13 @@ static void ext4_mb_put_pa(struct ext4_allocation_context *ac,
ext4_group_t grp;
ext4_fsblk_t grp_blk;
- if (!atomic_dec_and_test(&pa->pa_count) || pa->pa_free != 0)
- return;
-
/* in this short window concurrent discard can set pa_deleted */
spin_lock(&pa->pa_lock);
+ if (!atomic_dec_and_test(&pa->pa_count) || pa->pa_free != 0) {
+ spin_unlock(&pa->pa_lock);
+ return;
+ }
+
if (pa->pa_deleted == 1) {
spin_unlock(&pa->pa_lock);
return;
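The reordering above closes a race: dropping the last pa_count reference before taking pa_lock left a window in which a concurrent discard could see the count hit zero and free the preallocation underneath us. A hedged userspace model of the corrected ordering (stand-in types, not the mballoc structures):

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <stdbool.h>

struct pa {
	pthread_mutex_t lock;
	atomic_int count;
	int deleted;
	int free;
};

/* Corrected put: the final decrement happens with pa->lock held, so a
 * concurrent discard cannot observe count == 0 and free pa under us. */
static bool pa_put(struct pa *pa)
{
	pthread_mutex_lock(&pa->lock);
	if (atomic_fetch_sub(&pa->count, 1) != 1 || pa->free != 0) {
		pthread_mutex_unlock(&pa->lock);
		return false;		/* not the last reference */
	}
	if (pa->deleted) {
		pthread_mutex_unlock(&pa->lock);
		return false;		/* discard already claimed it */
	}
	pa->deleted = 1;
	pthread_mutex_unlock(&pa->lock);
	return true;			/* caller tears pa down */
}

int main(void)
{
	struct pa pa = { PTHREAD_MUTEX_INITIALIZER, 1, 0, 0 };

	printf("last put: %d\n", pa_put(&pa));	/* 1 */
	return 0;
}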
@@ -4001,8 +4026,7 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac)
(unsigned long)ac->ac_b_ex.fe_len,
(unsigned long)ac->ac_b_ex.fe_logical,
(int)ac->ac_criteria);
- ext4_msg(ac->ac_sb, KERN_ERR, "%lu scanned, %d found",
- ac->ac_ex_scanned, ac->ac_found);
+ ext4_msg(ac->ac_sb, KERN_ERR, "%d found", ac->ac_found);
ext4_msg(ac->ac_sb, KERN_ERR, "groups: ");
ngroups = ext4_get_groups_count(sb);
for (i = 0; i < ngroups; i++) {
@@ -4121,7 +4145,7 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac,
ext4_get_group_no_and_offset(sb, goal, &group, &block);
/* set up allocation goals */
- ac->ac_b_ex.fe_logical = ar->logical & ~(sbi->s_cluster_ratio - 1);
+ ac->ac_b_ex.fe_logical = EXT4_LBLK_CMASK(sbi, ar->logical);
ac->ac_status = AC_STATUS_CONTINUE;
ac->ac_sb = sb;
ac->ac_inode = ar->inode;
@@ -4663,7 +4687,7 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
* blocks at the beginning or the end unless we are explicitly
* requested to avoid doing so.
*/
- overflow = block & (sbi->s_cluster_ratio - 1);
+ overflow = EXT4_PBLK_COFF(sbi, block);
if (overflow) {
if (flags & EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER) {
overflow = sbi->s_cluster_ratio - overflow;
@@ -4677,7 +4701,7 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
count += overflow;
}
}
- overflow = count & (sbi->s_cluster_ratio - 1);
+ overflow = EXT4_LBLK_COFF(sbi, count);
if (overflow) {
if (flags & EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER) {
if (count > overflow)
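EXT4_PBLK_COFF()/EXT4_LBLK_COFF() name what the open-coded masks computed: the offset of a block within its cluster. With a cluster ratio of 16, block 1000 sits 1000 & 15 = 8 blocks into its cluster, so an unaligned free request has 8 head blocks to deal with. The macros reduce to a simple mask (values illustrative):

#include <stdio.h>

/* Illustrative reimplementation of the cluster-offset calculation. */
#define CLUSTER_RATIO	16u
#define PBLK_COFF(blk)	((blk) & (CLUSTER_RATIO - 1))

int main(void)
{
	unsigned long long block = 1000;

	printf("block %llu is %llu blocks into its cluster\n",
	       block, (unsigned long long)PBLK_COFF(block));	/* 8 */
	return 0;
}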
@@ -4794,8 +4818,8 @@ do_more:
" group:%d block:%d count:%lu failed"
" with %d", block_group, bit, count,
err);
- }
-
+ } else
+ EXT4_MB_GRP_CLEAR_TRIMMED(e4b.bd_info);
ext4_lock_group(sb, block_group);
mb_clear_bits(bitmap_bh->b_data, bit, count_clusters);
@@ -5002,6 +5026,8 @@ error_return:
*/
static int ext4_trim_extent(struct super_block *sb, int start, int count,
ext4_group_t group, struct ext4_buddy *e4b)
+__releases(bitlock)
+__acquires(bitlock)
{
struct ext4_free_extent ex;
int ret = 0;
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index 08481ee84cd..d634e183b4d 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -48,7 +48,7 @@ extern ushort ext4_mballoc_debug;
} \
} while (0)
#else
-#define mb_debug(n, fmt, a...)
+#define mb_debug(n, fmt, a...) no_printk(fmt, ## a)
#endif
#define EXT4_MB_HISTORY_ALLOC 1 /* allocation */
@@ -175,8 +175,6 @@ struct ext4_allocation_context {
/* copy of the best found extent taken before preallocation efforts */
struct ext4_free_extent ac_f_ex;
- /* number of iterations done. we have to track to limit searching */
- unsigned long ac_ex_scanned;
__u16 ac_groups_scanned;
__u16 ac_found;
__u16 ac_tail;
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index 2ae73a80c19..ec092437d3e 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -505,7 +505,7 @@ int ext4_ext_migrate(struct inode *inode)
* with i_data_sem held to prevent racing with block
* allocation.
*/
- down_read((&EXT4_I(inode)->i_data_sem));
+ down_read(&EXT4_I(inode)->i_data_sem);
ext4_set_inode_state(inode, EXT4_STATE_EXT_MIGRATE);
up_read((&EXT4_I(inode)->i_data_sem));
diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c
index 214461e42a0..32bce844c2e 100644
--- a/fs/ext4/mmp.c
+++ b/fs/ext4/mmp.c
@@ -18,7 +18,7 @@ static __le32 ext4_mmp_csum(struct super_block *sb, struct mmp_struct *mmp)
return cpu_to_le32(csum);
}
-int ext4_mmp_csum_verify(struct super_block *sb, struct mmp_struct *mmp)
+static int ext4_mmp_csum_verify(struct super_block *sb, struct mmp_struct *mmp)
{
if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
@@ -27,7 +27,7 @@ int ext4_mmp_csum_verify(struct super_block *sb, struct mmp_struct *mmp)
return mmp->mmp_checksum == ext4_mmp_csum(sb, mmp);
}
-void ext4_mmp_csum_set(struct super_block *sb, struct mmp_struct *mmp)
+static void ext4_mmp_csum_set(struct super_block *sb, struct mmp_struct *mmp)
{
if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
@@ -259,7 +259,7 @@ static unsigned int mmp_new_seq(void)
u32 new_seq;
do {
- get_random_bytes(&new_seq, sizeof(u32));
+ new_seq = prandom_u32();
} while (new_seq > EXT4_MMP_SEQ_MAX);
return new_seq;
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index 7fa4d855dbd..2484c7ec6a7 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -57,8 +57,8 @@ get_ext_path(struct inode *inode, ext4_lblk_t lblock,
static void
copy_extent_status(struct ext4_extent *src, struct ext4_extent *dest)
{
- if (ext4_ext_is_uninitialized(src))
- ext4_ext_mark_uninitialized(dest);
+ if (ext4_ext_is_unwritten(src))
+ ext4_ext_mark_unwritten(dest);
else
dest->ee_len = cpu_to_le16(ext4_ext_get_actual_len(dest));
}
@@ -76,7 +76,7 @@ copy_extent_status(struct ext4_extent *src, struct ext4_extent *dest)
* ext4_ext_path structure refers to the last extent, or a negative error
* value on failure.
*/
-static int
+int
mext_next_extent(struct inode *inode, struct ext4_ext_path *path,
struct ext4_extent **extent)
{
@@ -391,6 +391,7 @@ mext_insert_extents(handle_t *handle, struct inode *orig_inode,
if (depth) {
/* Register to journal */
+ BUFFER_TRACE(orig_path->p_bh, "get_write_access");
ret = ext4_journal_get_write_access(handle, orig_path->p_bh);
if (ret)
return ret;
@@ -593,14 +594,14 @@ mext_calc_swap_extents(struct ext4_extent *tmp_dext,
* @inode: inode in question
* @from: block offset of inode
* @count: block count to be checked
- * @uninit: extents expected to be uninitialized
+ * @unwritten: extents expected to be unwritten
* @err: pointer to save error value
*
* Return 1 if all extents in the range have the expected type, and zero otherwise.
*/
static int
mext_check_coverage(struct inode *inode, ext4_lblk_t from, ext4_lblk_t count,
- int uninit, int *err)
+ int unwritten, int *err)
{
struct ext4_ext_path *path = NULL;
struct ext4_extent *ext;
@@ -611,7 +612,7 @@ mext_check_coverage(struct inode *inode, ext4_lblk_t from, ext4_lblk_t count,
if (*err)
goto out;
ext = path[ext_depth(inode)].p_ext;
- if (uninit != ext4_ext_is_uninitialized(ext))
+ if (unwritten != ext4_ext_is_unwritten(ext))
goto out;
from += ext4_ext_get_actual_len(ext);
ext4_ext_drop_refs(path);
@@ -861,8 +862,7 @@ mext_page_mkuptodate(struct page *page, unsigned from, unsigned to)
}
if (!buffer_mapped(bh)) {
zero_user(page, block_start, blocksize);
- if (!err)
- set_buffer_uptodate(bh);
+ set_buffer_uptodate(bh);
continue;
}
}
@@ -895,7 +895,7 @@ out:
* @orig_page_offset: page index on original file
* @data_offset_in_page: block index where data swapping starts
* @block_len_in_page: the number of blocks to be swapped
- * @uninit: orig extent is uninitialized or not
+ * @unwritten: orig extent is unwritten or not
* @err: pointer to save return value
*
* Save the data in original inode blocks and replace original inode extents
@@ -906,7 +906,7 @@ out:
static int
move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
pgoff_t orig_page_offset, int data_offset_in_page,
- int block_len_in_page, int uninit, int *err)
+ int block_len_in_page, int unwritten, int *err)
{
struct inode *orig_inode = file_inode(o_filp);
struct page *pagep[2] = {NULL, NULL};
@@ -963,27 +963,27 @@ again:
if (unlikely(*err < 0))
goto stop_journal;
/*
- * If orig extent was uninitialized it can become initialized
+ * If the orig extent was unwritten, it can become initialized
* at any time after i_data_sem is dropped. In order to
* serialize with delalloc, we recheck the extent while we
* hold the page's lock; if it is still the case, a data copy is not
* necessary: just swap the data blocks between orig and donor.
*/
- if (uninit) {
+ if (unwritten) {
ext4_double_down_write_data_sem(orig_inode, donor_inode);
/* If any of the extents in the range became initialized we have to
* fall back to data copying */
- uninit = mext_check_coverage(orig_inode, orig_blk_offset,
- block_len_in_page, 1, err);
+ unwritten = mext_check_coverage(orig_inode, orig_blk_offset,
+ block_len_in_page, 1, err);
if (*err)
goto drop_data_sem;
- uninit &= mext_check_coverage(donor_inode, orig_blk_offset,
- block_len_in_page, 1, err);
+ unwritten &= mext_check_coverage(donor_inode, orig_blk_offset,
+ block_len_in_page, 1, err);
if (*err)
goto drop_data_sem;
- if (!uninit) {
+ if (!unwritten) {
ext4_double_up_write_data_sem(orig_inode, donor_inode);
goto data_copy;
}
@@ -1203,42 +1203,6 @@ mext_check_arguments(struct inode *orig_inode,
}
/**
- * ext4_inode_double_lock - Lock i_mutex on both @inode1 and @inode2
- *
- * @inode1: the inode structure
- * @inode2: the inode structure
- *
- * Lock two inodes' i_mutex
- */
-void
-ext4_inode_double_lock(struct inode *inode1, struct inode *inode2)
-{
- BUG_ON(inode1 == inode2);
- if (inode1 < inode2) {
- mutex_lock_nested(&inode1->i_mutex, I_MUTEX_PARENT);
- mutex_lock_nested(&inode2->i_mutex, I_MUTEX_CHILD);
- } else {
- mutex_lock_nested(&inode2->i_mutex, I_MUTEX_PARENT);
- mutex_lock_nested(&inode1->i_mutex, I_MUTEX_CHILD);
- }
-}
-
-/**
- * ext4_inode_double_unlock - Release i_mutex on both @inode1 and @inode2
- *
- * @inode1: the inode that is released first
- * @inode2: the inode that is released second
- *
- */
-
-void
-ext4_inode_double_unlock(struct inode *inode1, struct inode *inode2)
-{
- mutex_unlock(&inode1->i_mutex);
- mutex_unlock(&inode2->i_mutex);
-}
-
-/**
* ext4_move_extents - Exchange the specified range of a file
*
* @o_filp: file structure of the original file
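The removed helpers above duplicated what the VFS now provides as lock_two_nondirectories(): take both i_mutex locks in a fixed order, lowest address first, so two tasks locking the same pair in opposite argument order cannot deadlock. The pattern in miniature:

#include <pthread.h>
#include <stdio.h>

/* Lock two mutexes in address order, as lock_two_nondirectories()
 * does for inode i_mutex, so A-B and B-A callers cannot deadlock. */
static void lock_pair(pthread_mutex_t *a, pthread_mutex_t *b)
{
	if (a < b) {
		pthread_mutex_lock(a);
		pthread_mutex_lock(b);
	} else {
		pthread_mutex_lock(b);
		pthread_mutex_lock(a);
	}
}

int main(void)
{
	pthread_mutex_t m1 = PTHREAD_MUTEX_INITIALIZER;
	pthread_mutex_t m2 = PTHREAD_MUTEX_INITIALIZER;

	lock_pair(&m2, &m1);	/* argument order does not matter */
	puts("both held");
	pthread_mutex_unlock(&m1);
	pthread_mutex_unlock(&m2);
	return 0;
}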
@@ -1296,7 +1260,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits;
int data_offset_in_page;
int block_len_in_page;
- int uninit;
+ int unwritten;
if (orig_inode->i_sb != donor_inode->i_sb) {
ext4_debug("ext4 move extent: The argument files "
@@ -1327,7 +1291,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
return -EINVAL;
}
/* Protect orig and donor inodes against a truncate */
- ext4_inode_double_lock(orig_inode, donor_inode);
+ lock_two_nondirectories(orig_inode, donor_inode);
/* Wait for all existing dio workers */
ext4_inode_block_unlocked_dio(orig_inode);
@@ -1428,8 +1392,8 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
!last_extent)
continue;
- /* Is original extent is uninitialized */
- uninit = ext4_ext_is_uninitialized(ext_prev);
+ /* Is the original extent unwritten? */
+ unwritten = ext4_ext_is_unwritten(ext_prev);
data_offset_in_page = seq_start % blocks_per_page;
@@ -1469,8 +1433,8 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
o_filp, donor_inode,
orig_page_offset,
data_offset_in_page,
- block_len_in_page, uninit,
- &ret);
+ block_len_in_page,
+ unwritten, &ret);
/* Count how many blocks we have exchanged */
*moved_len += block_len_in_page;
@@ -1535,7 +1499,7 @@ out:
ext4_double_up_write_data_sem(orig_inode, donor_inode);
ext4_inode_resume_unlocked_dio(orig_inode);
ext4_inode_resume_unlocked_dio(donor_inode);
- ext4_inode_double_unlock(orig_inode, donor_inode);
+ unlock_two_nondirectories(orig_inode, donor_inode);
return ret;
}
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 1bec5a5c1e4..3520ab8a663 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -67,6 +67,7 @@ static struct buffer_head *ext4_append(handle_t *handle,
return ERR_PTR(err);
inode->i_size += inode->i_sb->s_blocksize;
EXT4_I(inode)->i_disksize = inode->i_size;
+ BUFFER_TRACE(bh, "get_write_access");
err = ext4_journal_get_write_access(handle, bh);
if (err) {
brelse(bh);
@@ -1425,9 +1426,8 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsi
return ERR_PTR(-EIO);
}
if (unlikely(ino == dir->i_ino)) {
- EXT4_ERROR_INODE(dir, "'%.*s' linked to parent dir",
- dentry->d_name.len,
- dentry->d_name.name);
+ EXT4_ERROR_INODE(dir, "'%pd' linked to parent dir",
+ dentry);
return ERR_PTR(-EIO);
}
inode = ext4_iget(dir->i_sb, ino);
@@ -1779,6 +1779,7 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
blocksize = dir->i_sb->s_blocksize;
dxtrace(printk(KERN_DEBUG "Creating index: inode %lu\n", dir->i_ino));
+ BUFFER_TRACE(bh, "get_write_access");
retval = ext4_journal_get_write_access(handle, bh);
if (retval) {
ext4_std_error(dir->i_sb, retval);
@@ -2319,7 +2320,7 @@ retry:
d_tmpfile(dentry, inode);
err = ext4_orphan_add(handle, inode);
if (err)
- goto err_drop_inode;
+ goto err_unlock_inode;
mark_inode_dirty(inode);
unlock_new_inode(inode);
}
@@ -2328,10 +2329,9 @@ retry:
if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
goto retry;
return err;
-err_drop_inode:
+err_unlock_inode:
ext4_journal_stop(handle);
unlock_new_inode(inode);
- iput(inode);
return err;
}
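This hunk fixes a reference double-drop in the tmpfile error path: d_tmpfile() consumes the inode reference, so when ext4_orphan_add() fails afterwards only the journal stop and the unlock remain to be undone; the old iput() released a reference the dentry already owned. A self-contained model of that ownership rule (names are illustrative):

    struct ref { int count; };

    static void get(struct ref *r) { r->count++; }
    static void put(struct ref *r) { r->count--; }

    /* attach() consumes the caller's reference on success, the way
     * d_tmpfile() consumes the inode reference. */
    static int attach(struct ref *r) { (void)r; return 0; }

    static int later_step(void) { return -1; /* force the error path */ }

    static int create(struct ref *r)
    {
            get(r);
            if (attach(r) != 0) {
                    put(r);         /* attach failed: still our reference */
                    return -1;
            }
            if (later_step() != 0)
                    return -1;      /* owned elsewhere now: no put() here */
            return 0;
    }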
@@ -2512,8 +2512,7 @@ static int empty_dir(struct inode *inode)
ext4_rec_len_from_disk(de1->rec_len, sb->s_blocksize);
de = ext4_next_entry(de1, sb->s_blocksize);
while (offset < inode->i_size) {
- if (!bh ||
- (void *) de >= (void *) (bh->b_data+sb->s_blocksize)) {
+ if ((void *) de >= (void *) (bh->b_data+sb->s_blocksize)) {
unsigned int lblock;
err = 0;
brelse(bh);
@@ -2541,26 +2540,37 @@ static int empty_dir(struct inode *inode)
return 1;
}
-/* ext4_orphan_add() links an unlinked or truncated inode into a list of
+/*
+ * ext4_orphan_add() links an unlinked or truncated inode into a list of
* such inodes, starting at the superblock, in case we crash before the
* file is closed/deleted, or in case the inode truncate spans multiple
* transactions and the last transaction is not recovered after a crash.
*
* At filesystem recovery time, we walk this list deleting unlinked
* inodes and truncating linked inodes in ext4_orphan_cleanup().
+ *
+ * Orphan list manipulation functions must be called under i_mutex unless
+ * we are just creating the inode or deleting it.
*/
int ext4_orphan_add(handle_t *handle, struct inode *inode)
{
struct super_block *sb = inode->i_sb;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
struct ext4_iloc iloc;
int err = 0, rc;
+ bool dirty = false;
- if (!EXT4_SB(sb)->s_journal)
+ if (!sbi->s_journal)
return 0;
- mutex_lock(&EXT4_SB(sb)->s_orphan_lock);
+ WARN_ON_ONCE(!(inode->i_state & (I_NEW | I_FREEING)) &&
+ !mutex_is_locked(&inode->i_mutex));
+ /*
+	 * Exit early if the inode is already on the orphan list. This is a big
+ * since we don't have to contend on the global s_orphan_lock.
+ */
if (!list_empty(&EXT4_I(inode)->i_orphan))
- goto out_unlock;
+ return 0;
/*
* Orphan handling is only valid for files with data blocks
@@ -2571,48 +2581,51 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode)
J_ASSERT((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
S_ISLNK(inode->i_mode)) || inode->i_nlink == 0);
- BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get_write_access");
- err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh);
+ BUFFER_TRACE(sbi->s_sbh, "get_write_access");
+ err = ext4_journal_get_write_access(handle, sbi->s_sbh);
if (err)
- goto out_unlock;
+ goto out;
err = ext4_reserve_inode_write(handle, inode, &iloc);
if (err)
- goto out_unlock;
+ goto out;
+
+ mutex_lock(&sbi->s_orphan_lock);
/*
 * Due to previous errors the inode may already be a part of the on-disk
 * orphan list. If so, skip the on-disk list modification.
*/
- if (NEXT_ORPHAN(inode) && NEXT_ORPHAN(inode) <=
- (le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count)))
- goto mem_insert;
-
- /* Insert this inode at the head of the on-disk orphan list... */
- NEXT_ORPHAN(inode) = le32_to_cpu(EXT4_SB(sb)->s_es->s_last_orphan);
- EXT4_SB(sb)->s_es->s_last_orphan = cpu_to_le32(inode->i_ino);
- err = ext4_handle_dirty_super(handle, sb);
- rc = ext4_mark_iloc_dirty(handle, inode, &iloc);
- if (!err)
- err = rc;
-
- /* Only add to the head of the in-memory list if all the
- * previous operations succeeded. If the orphan_add is going to
- * fail (possibly taking the journal offline), we can't risk
- * leaving the inode on the orphan list: stray orphan-list
- * entries can cause panics at unmount time.
- *
- * This is safe: on error we're going to ignore the orphan list
- * anyway on the next recovery. */
-mem_insert:
- if (!err)
- list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan);
-
+ if (!NEXT_ORPHAN(inode) || NEXT_ORPHAN(inode) >
+ (le32_to_cpu(sbi->s_es->s_inodes_count))) {
+ /* Insert this inode at the head of the on-disk orphan list */
+ NEXT_ORPHAN(inode) = le32_to_cpu(sbi->s_es->s_last_orphan);
+ sbi->s_es->s_last_orphan = cpu_to_le32(inode->i_ino);
+ dirty = true;
+ }
+ list_add(&EXT4_I(inode)->i_orphan, &sbi->s_orphan);
+ mutex_unlock(&sbi->s_orphan_lock);
+
+ if (dirty) {
+ err = ext4_handle_dirty_super(handle, sb);
+ rc = ext4_mark_iloc_dirty(handle, inode, &iloc);
+ if (!err)
+ err = rc;
+ if (err) {
+ /*
+ * We have to remove inode from in-memory list if
+ * addition to on disk orphan list failed. Stray orphan
+ * list entries can cause panics at unmount time.
+ */
+ mutex_lock(&sbi->s_orphan_lock);
+ list_del(&EXT4_I(inode)->i_orphan);
+ mutex_unlock(&sbi->s_orphan_lock);
+ }
+ }
jbd_debug(4, "superblock will point to %lu\n", inode->i_ino);
jbd_debug(4, "orphan inode %lu will point to %d\n",
inode->i_ino, NEXT_ORPHAN(inode));
-out_unlock:
- mutex_unlock(&EXT4_SB(sb)->s_orphan_lock);
- ext4_std_error(inode->i_sb, err);
+out:
+ ext4_std_error(sb, err);
return err;
}
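The rewrite shrinks the s_orphan_lock critical section: the already-orphaned fast path no longer takes the global lock at all (safe because i_mutex serializes the callers, which the new WARN_ON_ONCE documents), journal write access is obtained before the lock, and the on-disk update is dirtied after dropping it, with an in-memory rollback if that update fails. A compact userspace model of the protocol under those simplifying assumptions:

    #include <pthread.h>
    #include <stdbool.h>

    struct onode {
            struct onode *next;
            bool on_list;           /* serialized by the caller's i_mutex */
    };

    struct orphan_list {
            pthread_mutex_t lock;   /* models the global s_orphan_lock */
            struct onode *head;
    };

    static int orphan_add(struct orphan_list *ol, struct onode *n,
                          int (*write_on_disk)(struct onode *))
    {
            int err;

            if (n->on_list)                 /* lockless fast path */
                    return 0;

            pthread_mutex_lock(&ol->lock);  /* short critical section */
            n->next = ol->head;
            ol->head = n;
            n->on_list = true;
            pthread_mutex_unlock(&ol->lock);

            err = write_on_disk(n);         /* slow work, lock dropped */
            if (err) {
                    /* Undo the in-memory insert so a stray entry cannot
                     * outlive the failed on-disk update; the real code
                     * uses list_del(), head-removal is a toy here. */
                    pthread_mutex_lock(&ol->lock);
                    ol->head = n->next;
                    n->on_list = false;
                    pthread_mutex_unlock(&ol->lock);
            }
            return err;
    }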
@@ -2624,45 +2637,51 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode)
{
struct list_head *prev;
struct ext4_inode_info *ei = EXT4_I(inode);
- struct ext4_sb_info *sbi;
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
__u32 ino_next;
struct ext4_iloc iloc;
int err = 0;
- if ((!EXT4_SB(inode->i_sb)->s_journal) &&
- !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS))
+ if (!sbi->s_journal && !(sbi->s_mount_state & EXT4_ORPHAN_FS))
return 0;
- mutex_lock(&EXT4_SB(inode->i_sb)->s_orphan_lock);
+ WARN_ON_ONCE(!(inode->i_state & (I_NEW | I_FREEING)) &&
+ !mutex_is_locked(&inode->i_mutex));
+ /* Do this quick check before taking global s_orphan_lock. */
if (list_empty(&ei->i_orphan))
- goto out;
+ return 0;
- ino_next = NEXT_ORPHAN(inode);
- prev = ei->i_orphan.prev;
- sbi = EXT4_SB(inode->i_sb);
+ if (handle) {
+ /* Grab inode buffer early before taking global s_orphan_lock */
+ err = ext4_reserve_inode_write(handle, inode, &iloc);
+ }
+ mutex_lock(&sbi->s_orphan_lock);
jbd_debug(4, "remove inode %lu from orphan list\n", inode->i_ino);
+ prev = ei->i_orphan.prev;
list_del_init(&ei->i_orphan);
/* If we're on an error path, we may not have a valid
* transaction handle with which to update the orphan list on
* disk, but we still need to remove the inode from the linked
* list in memory. */
- if (!handle)
- goto out;
-
- err = ext4_reserve_inode_write(handle, inode, &iloc);
- if (err)
+ if (!handle || err) {
+ mutex_unlock(&sbi->s_orphan_lock);
goto out_err;
+ }
+ ino_next = NEXT_ORPHAN(inode);
if (prev == &sbi->s_orphan) {
jbd_debug(4, "superblock will point to %u\n", ino_next);
BUFFER_TRACE(sbi->s_sbh, "get_write_access");
err = ext4_journal_get_write_access(handle, sbi->s_sbh);
- if (err)
+ if (err) {
+ mutex_unlock(&sbi->s_orphan_lock);
goto out_brelse;
+ }
sbi->s_es->s_last_orphan = cpu_to_le32(ino_next);
+ mutex_unlock(&sbi->s_orphan_lock);
err = ext4_handle_dirty_super(handle, inode->i_sb);
} else {
struct ext4_iloc iloc2;
@@ -2672,20 +2691,20 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode)
jbd_debug(4, "orphan inode %lu will point to %u\n",
i_prev->i_ino, ino_next);
err = ext4_reserve_inode_write(handle, i_prev, &iloc2);
- if (err)
+ if (err) {
+ mutex_unlock(&sbi->s_orphan_lock);
goto out_brelse;
+ }
NEXT_ORPHAN(i_prev) = ino_next;
err = ext4_mark_iloc_dirty(handle, i_prev, &iloc2);
+ mutex_unlock(&sbi->s_orphan_lock);
}
if (err)
goto out_brelse;
NEXT_ORPHAN(inode) = 0;
err = ext4_mark_iloc_dirty(handle, inode, &iloc);
-
out_err:
ext4_std_error(inode->i_sb, err);
-out:
- mutex_unlock(&EXT4_SB(inode->i_sb)->s_orphan_lock);
return err;
out_brelse:
@@ -3002,6 +3021,154 @@ static struct buffer_head *ext4_get_first_dir_block(handle_t *handle,
return ext4_get_first_inline_block(inode, parent_de, retval);
}
+struct ext4_renament {
+ struct inode *dir;
+ struct dentry *dentry;
+ struct inode *inode;
+ bool is_dir;
+ int dir_nlink_delta;
+
+ /* entry for "dentry" */
+ struct buffer_head *bh;
+ struct ext4_dir_entry_2 *de;
+ int inlined;
+
+ /* entry for ".." in inode if it's a directory */
+ struct buffer_head *dir_bh;
+ struct ext4_dir_entry_2 *parent_de;
+ int dir_inlined;
+};
+
+static int ext4_rename_dir_prepare(handle_t *handle, struct ext4_renament *ent)
+{
+ int retval;
+
+ ent->dir_bh = ext4_get_first_dir_block(handle, ent->inode,
+ &retval, &ent->parent_de,
+ &ent->dir_inlined);
+ if (!ent->dir_bh)
+ return retval;
+ if (le32_to_cpu(ent->parent_de->inode) != ent->dir->i_ino)
+ return -EIO;
+ BUFFER_TRACE(ent->dir_bh, "get_write_access");
+ return ext4_journal_get_write_access(handle, ent->dir_bh);
+}
+
+static int ext4_rename_dir_finish(handle_t *handle, struct ext4_renament *ent,
+ unsigned dir_ino)
+{
+ int retval;
+
+ ent->parent_de->inode = cpu_to_le32(dir_ino);
+ BUFFER_TRACE(ent->dir_bh, "call ext4_handle_dirty_metadata");
+ if (!ent->dir_inlined) {
+ if (is_dx(ent->inode)) {
+ retval = ext4_handle_dirty_dx_node(handle,
+ ent->inode,
+ ent->dir_bh);
+ } else {
+ retval = ext4_handle_dirty_dirent_node(handle,
+ ent->inode,
+ ent->dir_bh);
+ }
+ } else {
+ retval = ext4_mark_inode_dirty(handle, ent->inode);
+ }
+ if (retval) {
+ ext4_std_error(ent->dir->i_sb, retval);
+ return retval;
+ }
+ return 0;
+}
+
+static int ext4_setent(handle_t *handle, struct ext4_renament *ent,
+ unsigned ino, unsigned file_type)
+{
+ int retval;
+
+ BUFFER_TRACE(ent->bh, "get write access");
+ retval = ext4_journal_get_write_access(handle, ent->bh);
+ if (retval)
+ return retval;
+ ent->de->inode = cpu_to_le32(ino);
+ if (EXT4_HAS_INCOMPAT_FEATURE(ent->dir->i_sb,
+ EXT4_FEATURE_INCOMPAT_FILETYPE))
+ ent->de->file_type = file_type;
+ ent->dir->i_version++;
+ ent->dir->i_ctime = ent->dir->i_mtime =
+ ext4_current_time(ent->dir);
+ ext4_mark_inode_dirty(handle, ent->dir);
+ BUFFER_TRACE(ent->bh, "call ext4_handle_dirty_metadata");
+ if (!ent->inlined) {
+ retval = ext4_handle_dirty_dirent_node(handle,
+ ent->dir, ent->bh);
+ if (unlikely(retval)) {
+ ext4_std_error(ent->dir->i_sb, retval);
+ return retval;
+ }
+ }
+ brelse(ent->bh);
+ ent->bh = NULL;
+
+ return 0;
+}
+
+static int ext4_find_delete_entry(handle_t *handle, struct inode *dir,
+ const struct qstr *d_name)
+{
+ int retval = -ENOENT;
+ struct buffer_head *bh;
+ struct ext4_dir_entry_2 *de;
+
+ bh = ext4_find_entry(dir, d_name, &de, NULL);
+ if (bh) {
+ retval = ext4_delete_entry(handle, dir, de, bh);
+ brelse(bh);
+ }
+ return retval;
+}
+
+static void ext4_rename_delete(handle_t *handle, struct ext4_renament *ent)
+{
+ int retval;
+ /*
+ * ent->de could have moved from under us during htree split, so make
+ * sure that we are deleting the right entry. We might also be pointing
+ * to a stale entry in the unused part of ent->bh so just checking inum
+ * and the name isn't enough.
+ */
+ if (le32_to_cpu(ent->de->inode) != ent->inode->i_ino ||
+ ent->de->name_len != ent->dentry->d_name.len ||
+ strncmp(ent->de->name, ent->dentry->d_name.name,
+ ent->de->name_len)) {
+ retval = ext4_find_delete_entry(handle, ent->dir,
+ &ent->dentry->d_name);
+ } else {
+ retval = ext4_delete_entry(handle, ent->dir, ent->de, ent->bh);
+ if (retval == -ENOENT) {
+ retval = ext4_find_delete_entry(handle, ent->dir,
+ &ent->dentry->d_name);
+ }
+ }
+
+ if (retval) {
+ ext4_warning(ent->dir->i_sb,
+ "Deleting old file (%lu), %d, error=%d",
+ ent->dir->i_ino, ent->dir->i_nlink, retval);
+ }
+}
+
+static void ext4_update_dir_count(handle_t *handle, struct ext4_renament *ent)
+{
+ if (ent->dir_nlink_delta) {
+ if (ent->dir_nlink_delta == -1)
+ ext4_dec_count(handle, ent->dir);
+ else
+ ext4_inc_count(handle, ent->dir);
+ ext4_mark_inode_dirty(handle, ent->dir);
+ }
+}
+
/*
* Anybody can rename anything with this: the permission checks are left to the
* higher-level routines.
@@ -3014,198 +3181,267 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
struct inode *new_dir, struct dentry *new_dentry)
{
handle_t *handle = NULL;
- struct inode *old_inode, *new_inode;
- struct buffer_head *old_bh, *new_bh, *dir_bh;
- struct ext4_dir_entry_2 *old_de, *new_de;
+ struct ext4_renament old = {
+ .dir = old_dir,
+ .dentry = old_dentry,
+ .inode = old_dentry->d_inode,
+ };
+ struct ext4_renament new = {
+ .dir = new_dir,
+ .dentry = new_dentry,
+ .inode = new_dentry->d_inode,
+ };
int retval;
- int inlined = 0, new_inlined = 0;
- struct ext4_dir_entry_2 *parent_de;
- dquot_initialize(old_dir);
- dquot_initialize(new_dir);
-
- old_bh = new_bh = dir_bh = NULL;
+ dquot_initialize(old.dir);
+ dquot_initialize(new.dir);
/* Initialize quotas before so that eventual writes go
* in separate transaction */
- if (new_dentry->d_inode)
- dquot_initialize(new_dentry->d_inode);
+ if (new.inode)
+ dquot_initialize(new.inode);
- old_bh = ext4_find_entry(old_dir, &old_dentry->d_name, &old_de, NULL);
+ old.bh = ext4_find_entry(old.dir, &old.dentry->d_name, &old.de, NULL);
/*
 * The check for the inode number is _not_ due to possible IO errors.
* We might rmdir the source, keep it as pwd of some process
* and merrily kill the link to whatever was created under the
* same name. Goodbye sticky bit ;-<
*/
- old_inode = old_dentry->d_inode;
retval = -ENOENT;
- if (!old_bh || le32_to_cpu(old_de->inode) != old_inode->i_ino)
+ if (!old.bh || le32_to_cpu(old.de->inode) != old.inode->i_ino)
goto end_rename;
- new_inode = new_dentry->d_inode;
- new_bh = ext4_find_entry(new_dir, &new_dentry->d_name,
- &new_de, &new_inlined);
- if (new_bh) {
- if (!new_inode) {
- brelse(new_bh);
- new_bh = NULL;
+ new.bh = ext4_find_entry(new.dir, &new.dentry->d_name,
+ &new.de, &new.inlined);
+ if (new.bh) {
+ if (!new.inode) {
+ brelse(new.bh);
+ new.bh = NULL;
}
}
- if (new_inode && !test_opt(new_dir->i_sb, NO_AUTO_DA_ALLOC))
- ext4_alloc_da_blocks(old_inode);
+ if (new.inode && !test_opt(new.dir->i_sb, NO_AUTO_DA_ALLOC))
+ ext4_alloc_da_blocks(old.inode);
- handle = ext4_journal_start(old_dir, EXT4_HT_DIR,
- (2 * EXT4_DATA_TRANS_BLOCKS(old_dir->i_sb) +
+ handle = ext4_journal_start(old.dir, EXT4_HT_DIR,
+ (2 * EXT4_DATA_TRANS_BLOCKS(old.dir->i_sb) +
EXT4_INDEX_EXTRA_TRANS_BLOCKS + 2));
if (IS_ERR(handle))
return PTR_ERR(handle);
- if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir))
+ if (IS_DIRSYNC(old.dir) || IS_DIRSYNC(new.dir))
ext4_handle_sync(handle);
- if (S_ISDIR(old_inode->i_mode)) {
- if (new_inode) {
+ if (S_ISDIR(old.inode->i_mode)) {
+ if (new.inode) {
retval = -ENOTEMPTY;
- if (!empty_dir(new_inode))
+ if (!empty_dir(new.inode))
+ goto end_rename;
+ } else {
+ retval = -EMLINK;
+ if (new.dir != old.dir && EXT4_DIR_LINK_MAX(new.dir))
goto end_rename;
}
- retval = -EIO;
- dir_bh = ext4_get_first_dir_block(handle, old_inode,
- &retval, &parent_de,
- &inlined);
- if (!dir_bh)
- goto end_rename;
- if (le32_to_cpu(parent_de->inode) != old_dir->i_ino)
- goto end_rename;
- retval = -EMLINK;
- if (!new_inode && new_dir != old_dir &&
- EXT4_DIR_LINK_MAX(new_dir))
- goto end_rename;
- BUFFER_TRACE(dir_bh, "get_write_access");
- retval = ext4_journal_get_write_access(handle, dir_bh);
+ retval = ext4_rename_dir_prepare(handle, &old);
if (retval)
goto end_rename;
}
- if (!new_bh) {
- retval = ext4_add_entry(handle, new_dentry, old_inode);
+ if (!new.bh) {
+ retval = ext4_add_entry(handle, new.dentry, old.inode);
if (retval)
goto end_rename;
} else {
- BUFFER_TRACE(new_bh, "get write access");
- retval = ext4_journal_get_write_access(handle, new_bh);
+ retval = ext4_setent(handle, &new,
+ old.inode->i_ino, old.de->file_type);
if (retval)
goto end_rename;
- new_de->inode = cpu_to_le32(old_inode->i_ino);
- if (EXT4_HAS_INCOMPAT_FEATURE(new_dir->i_sb,
- EXT4_FEATURE_INCOMPAT_FILETYPE))
- new_de->file_type = old_de->file_type;
- new_dir->i_version++;
- new_dir->i_ctime = new_dir->i_mtime =
- ext4_current_time(new_dir);
- ext4_mark_inode_dirty(handle, new_dir);
- BUFFER_TRACE(new_bh, "call ext4_handle_dirty_metadata");
- if (!new_inlined) {
- retval = ext4_handle_dirty_dirent_node(handle,
- new_dir, new_bh);
- if (unlikely(retval)) {
- ext4_std_error(new_dir->i_sb, retval);
- goto end_rename;
- }
- }
- brelse(new_bh);
- new_bh = NULL;
}
/*
* Like most other Unix systems, set the ctime for inodes on a
* rename.
*/
- old_inode->i_ctime = ext4_current_time(old_inode);
- ext4_mark_inode_dirty(handle, old_inode);
+ old.inode->i_ctime = ext4_current_time(old.inode);
+ ext4_mark_inode_dirty(handle, old.inode);
/*
* ok, that's it
*/
- if (le32_to_cpu(old_de->inode) != old_inode->i_ino ||
- old_de->name_len != old_dentry->d_name.len ||
- strncmp(old_de->name, old_dentry->d_name.name, old_de->name_len) ||
- (retval = ext4_delete_entry(handle, old_dir,
- old_de, old_bh)) == -ENOENT) {
- /* old_de could have moved from under us during htree split, so
- * make sure that we are deleting the right entry. We might
- * also be pointing to a stale entry in the unused part of
- * old_bh so just checking inum and the name isn't enough. */
- struct buffer_head *old_bh2;
- struct ext4_dir_entry_2 *old_de2;
-
- old_bh2 = ext4_find_entry(old_dir, &old_dentry->d_name,
- &old_de2, NULL);
- if (old_bh2) {
- retval = ext4_delete_entry(handle, old_dir,
- old_de2, old_bh2);
- brelse(old_bh2);
- }
+ ext4_rename_delete(handle, &old);
+
+ if (new.inode) {
+ ext4_dec_count(handle, new.inode);
+ new.inode->i_ctime = ext4_current_time(new.inode);
}
- if (retval) {
- ext4_warning(old_dir->i_sb,
- "Deleting old file (%lu), %d, error=%d",
- old_dir->i_ino, old_dir->i_nlink, retval);
- }
-
- if (new_inode) {
- ext4_dec_count(handle, new_inode);
- new_inode->i_ctime = ext4_current_time(new_inode);
- }
- old_dir->i_ctime = old_dir->i_mtime = ext4_current_time(old_dir);
- ext4_update_dx_flag(old_dir);
- if (dir_bh) {
- parent_de->inode = cpu_to_le32(new_dir->i_ino);
- BUFFER_TRACE(dir_bh, "call ext4_handle_dirty_metadata");
- if (!inlined) {
- if (is_dx(old_inode)) {
- retval = ext4_handle_dirty_dx_node(handle,
- old_inode,
- dir_bh);
- } else {
- retval = ext4_handle_dirty_dirent_node(handle,
- old_inode, dir_bh);
- }
- } else {
- retval = ext4_mark_inode_dirty(handle, old_inode);
- }
- if (retval) {
- ext4_std_error(old_dir->i_sb, retval);
+ old.dir->i_ctime = old.dir->i_mtime = ext4_current_time(old.dir);
+ ext4_update_dx_flag(old.dir);
+ if (old.dir_bh) {
+ retval = ext4_rename_dir_finish(handle, &old, new.dir->i_ino);
+ if (retval)
goto end_rename;
- }
- ext4_dec_count(handle, old_dir);
- if (new_inode) {
+
+ ext4_dec_count(handle, old.dir);
+ if (new.inode) {
/* checked empty_dir above, can't have another parent,
* ext4_dec_count() won't work for many-linked dirs */
- clear_nlink(new_inode);
+ clear_nlink(new.inode);
} else {
- ext4_inc_count(handle, new_dir);
- ext4_update_dx_flag(new_dir);
- ext4_mark_inode_dirty(handle, new_dir);
+ ext4_inc_count(handle, new.dir);
+ ext4_update_dx_flag(new.dir);
+ ext4_mark_inode_dirty(handle, new.dir);
}
}
- ext4_mark_inode_dirty(handle, old_dir);
- if (new_inode) {
- ext4_mark_inode_dirty(handle, new_inode);
- if (!new_inode->i_nlink)
- ext4_orphan_add(handle, new_inode);
+ ext4_mark_inode_dirty(handle, old.dir);
+ if (new.inode) {
+ ext4_mark_inode_dirty(handle, new.inode);
+ if (!new.inode->i_nlink)
+ ext4_orphan_add(handle, new.inode);
+ }
+ retval = 0;
+
+end_rename:
+ brelse(old.dir_bh);
+ brelse(old.bh);
+ brelse(new.bh);
+ if (handle)
+ ext4_journal_stop(handle);
+ return retval;
+}
+
+static int ext4_cross_rename(struct inode *old_dir, struct dentry *old_dentry,
+ struct inode *new_dir, struct dentry *new_dentry)
+{
+ handle_t *handle = NULL;
+ struct ext4_renament old = {
+ .dir = old_dir,
+ .dentry = old_dentry,
+ .inode = old_dentry->d_inode,
+ };
+ struct ext4_renament new = {
+ .dir = new_dir,
+ .dentry = new_dentry,
+ .inode = new_dentry->d_inode,
+ };
+ u8 new_file_type;
+ int retval;
+
+ dquot_initialize(old.dir);
+ dquot_initialize(new.dir);
+
+ old.bh = ext4_find_entry(old.dir, &old.dentry->d_name,
+ &old.de, &old.inlined);
+ /*
+	 * The check for the inode number is _not_ due to possible IO errors.
+ * We might rmdir the source, keep it as pwd of some process
+ * and merrily kill the link to whatever was created under the
+ * same name. Goodbye sticky bit ;-<
+ */
+ retval = -ENOENT;
+ if (!old.bh || le32_to_cpu(old.de->inode) != old.inode->i_ino)
+ goto end_rename;
+
+ new.bh = ext4_find_entry(new.dir, &new.dentry->d_name,
+ &new.de, &new.inlined);
+
+ /* RENAME_EXCHANGE case: old *and* new must both exist */
+ if (!new.bh || le32_to_cpu(new.de->inode) != new.inode->i_ino)
+ goto end_rename;
+
+ handle = ext4_journal_start(old.dir, EXT4_HT_DIR,
+ (2 * EXT4_DATA_TRANS_BLOCKS(old.dir->i_sb) +
+ 2 * EXT4_INDEX_EXTRA_TRANS_BLOCKS + 2));
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
+
+ if (IS_DIRSYNC(old.dir) || IS_DIRSYNC(new.dir))
+ ext4_handle_sync(handle);
+
+ if (S_ISDIR(old.inode->i_mode)) {
+ old.is_dir = true;
+ retval = ext4_rename_dir_prepare(handle, &old);
+ if (retval)
+ goto end_rename;
+ }
+ if (S_ISDIR(new.inode->i_mode)) {
+ new.is_dir = true;
+ retval = ext4_rename_dir_prepare(handle, &new);
+ if (retval)
+ goto end_rename;
}
+
+ /*
+ * Other than the special case of overwriting a directory, parents'
+ * nlink only needs to be modified if this is a cross directory rename.
+ */
+ if (old.dir != new.dir && old.is_dir != new.is_dir) {
+ old.dir_nlink_delta = old.is_dir ? -1 : 1;
+ new.dir_nlink_delta = -old.dir_nlink_delta;
+ retval = -EMLINK;
+ if ((old.dir_nlink_delta > 0 && EXT4_DIR_LINK_MAX(old.dir)) ||
+ (new.dir_nlink_delta > 0 && EXT4_DIR_LINK_MAX(new.dir)))
+ goto end_rename;
+ }
+
+ new_file_type = new.de->file_type;
+ retval = ext4_setent(handle, &new, old.inode->i_ino, old.de->file_type);
+ if (retval)
+ goto end_rename;
+
+ retval = ext4_setent(handle, &old, new.inode->i_ino, new_file_type);
+ if (retval)
+ goto end_rename;
+
+ /*
+ * Like most other Unix systems, set the ctime for inodes on a
+ * rename.
+ */
+ old.inode->i_ctime = ext4_current_time(old.inode);
+ new.inode->i_ctime = ext4_current_time(new.inode);
+ ext4_mark_inode_dirty(handle, old.inode);
+ ext4_mark_inode_dirty(handle, new.inode);
+
+ if (old.dir_bh) {
+ retval = ext4_rename_dir_finish(handle, &old, new.dir->i_ino);
+ if (retval)
+ goto end_rename;
+ }
+ if (new.dir_bh) {
+ retval = ext4_rename_dir_finish(handle, &new, old.dir->i_ino);
+ if (retval)
+ goto end_rename;
+ }
+ ext4_update_dir_count(handle, &old);
+ ext4_update_dir_count(handle, &new);
retval = 0;
end_rename:
- brelse(dir_bh);
- brelse(old_bh);
- brelse(new_bh);
+ brelse(old.dir_bh);
+ brelse(new.dir_bh);
+ brelse(old.bh);
+ brelse(new.bh);
if (handle)
ext4_journal_stop(handle);
return retval;
}
+static int ext4_rename2(struct inode *old_dir, struct dentry *old_dentry,
+ struct inode *new_dir, struct dentry *new_dentry,
+ unsigned int flags)
+{
+ if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE))
+ return -EINVAL;
+
+ if (flags & RENAME_EXCHANGE) {
+ return ext4_cross_rename(old_dir, old_dentry,
+ new_dir, new_dentry);
+ }
+ /*
+	 * The VFS has already done the existence checking, so RENAME_NOREPLACE
+	 * is equivalent to a regular rename here.
+ */
+ return ext4_rename(old_dir, old_dentry, new_dir, new_dentry);
+}
+
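ext4_rename2() backs the new ->rename2 inode operation; userspace reaches the RENAME_EXCHANGE path through the renameat2(2) syscall. A hedged example (glibc of this vintage ships no wrapper, so it uses the raw syscall; SYS_renameat2 must be present in your kernel headers):

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    #ifndef RENAME_EXCHANGE
    #define RENAME_EXCHANGE (1 << 1)        /* from linux/fs.h */
    #endif

    int main(void)
    {
            /* Atomically swap the names "a" and "b"; both must already
             * exist, matching the "old *and* new must both exist" check
             * in ext4_cross_rename(). */
            if (syscall(SYS_renameat2, AT_FDCWD, "a",
                        AT_FDCWD, "b", RENAME_EXCHANGE) != 0) {
                    perror("renameat2");
                    return 1;
            }
            return 0;
    }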
/*
* directories can handle most operations...
*/
@@ -3220,12 +3456,14 @@ const struct inode_operations ext4_dir_inode_operations = {
.mknod = ext4_mknod,
.tmpfile = ext4_tmpfile,
.rename = ext4_rename,
+ .rename2 = ext4_rename2,
.setattr = ext4_setattr,
.setxattr = generic_setxattr,
.getxattr = generic_getxattr,
.listxattr = ext4_listxattr,
.removexattr = generic_removexattr,
.get_acl = ext4_get_acl,
+ .set_acl = ext4_set_acl,
.fiemap = ext4_fiemap,
};
@@ -3236,4 +3474,5 @@ const struct inode_operations ext4_special_inode_operations = {
.listxattr = ext4_listxattr,
.removexattr = generic_removexattr,
.get_acl = ext4_get_acl,
+ .set_acl = ext4_set_acl,
};
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index d7d0c7b46ed..b24a2541a9b 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -65,9 +65,9 @@ static void ext4_finish_bio(struct bio *bio)
{
int i;
int error = !test_bit(BIO_UPTODATE, &bio->bi_flags);
+ struct bio_vec *bvec;
- for (i = 0; i < bio->bi_vcnt; i++) {
- struct bio_vec *bvec = &bio->bi_io_vec[i];
+ bio_for_each_segment_all(bvec, bio, i) {
struct page *page = bvec->bv_page;
struct buffer_head *bh, *head;
unsigned bio_start = bvec->bv_offset;
@@ -197,14 +197,15 @@ static void dump_completed_IO(struct inode *inode, struct list_head *head)
static void ext4_add_complete_io(ext4_io_end_t *io_end)
{
struct ext4_inode_info *ei = EXT4_I(io_end->inode);
+ struct ext4_sb_info *sbi = EXT4_SB(io_end->inode->i_sb);
struct workqueue_struct *wq;
unsigned long flags;
/* Only reserved conversions from writeback should enter here */
WARN_ON(!(io_end->flag & EXT4_IO_END_UNWRITTEN));
- WARN_ON(!io_end->handle);
+ WARN_ON(!io_end->handle && sbi->s_journal);
spin_lock_irqsave(&ei->i_completed_io_lock, flags);
- wq = EXT4_SB(io_end->inode->i_sb)->rsv_conversion_wq;
+ wq = sbi->rsv_conversion_wq;
if (list_empty(&ei->i_rsv_conversion_list))
queue_work(wq, &ei->i_rsv_conversion_work);
list_add_tail(&io_end->list, &ei->i_rsv_conversion_list);
@@ -297,7 +298,7 @@ ext4_io_end_t *ext4_get_io_end(ext4_io_end_t *io_end)
static void ext4_end_bio(struct bio *bio, int error)
{
ext4_io_end_t *io_end = bio->bi_private;
- sector_t bi_sector = bio->bi_sector;
+ sector_t bi_sector = bio->bi_iter.bi_sector;
BUG_ON(!io_end);
bio->bi_end_io = NULL;
@@ -307,13 +308,14 @@ static void ext4_end_bio(struct bio *bio, int error)
if (error) {
struct inode *inode = io_end->inode;
- ext4_warning(inode->i_sb, "I/O error writing to inode %lu "
+ ext4_warning(inode->i_sb, "I/O error %d writing to inode %lu "
"(offset %llu size %ld starting block %llu)",
- inode->i_ino,
+ error, inode->i_ino,
(unsigned long long) io_end->offset,
(long) io_end->size,
(unsigned long long)
bi_sector >> (inode->i_blkbits - 9));
+ mapping_set_error(inode->i_mapping, error);
}
if (io_end->flag & EXT4_IO_END_UNWRITTEN) {
@@ -365,7 +367,7 @@ static int io_submit_init_bio(struct ext4_io_submit *io,
bio = bio_alloc(GFP_NOIO, min(nvecs, BIO_MAX_PAGES));
if (!bio)
return -ENOMEM;
- bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
+ bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
bio->bi_bdev = bh->b_bdev;
bio->bi_end_io = ext4_end_bio;
bio->bi_private = ext4_get_io_end(io->io_end);
@@ -399,7 +401,8 @@ submit_and_retry:
int ext4_bio_write_page(struct ext4_io_submit *io,
struct page *page,
int len,
- struct writeback_control *wbc)
+ struct writeback_control *wbc,
+ bool keep_towrite)
{
struct inode *inode = page->mapping->host;
unsigned block_start, blocksize;
@@ -412,10 +415,24 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
BUG_ON(!PageLocked(page));
BUG_ON(PageWriteback(page));
- set_page_writeback(page);
+ if (keep_towrite)
+ set_page_writeback_keepwrite(page);
+ else
+ set_page_writeback(page);
ClearPageError(page);
/*
+ * Comments copied from block_write_full_page:
+ *
+ * The page straddles i_size. It must be zeroed out on each and every
+ * writepage invocation because it may be mmapped. "A file is mapped
+ * in multiples of the page size. For a file that is not a multiple of
+ * the page size, the remaining memory is zeroed when mapped, and
+ * writes to that region are not written out to the file."
+ */
+ if (len < PAGE_CACHE_SIZE)
+ zero_user_segment(page, len, PAGE_CACHE_SIZE);
+ /*
* In the first loop we prepare and mark buffers to submit. We have to
* mark all buffers in the page before submitting so that
* end_page_writeback() cannot be called from ext4_bio_end_io() when IO
@@ -426,19 +443,6 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
do {
block_start = bh_offset(bh);
if (block_start >= len) {
- /*
- * Comments copied from block_write_full_page_endio:
- *
- * The page straddles i_size. It must be zeroed out on
- * each and every writepage invocation because it may
- * be mmapped. "A file is mapped in multiples of the
- * page size. For a file that is not a multiple of
- * the page size, the remaining memory is zeroed when
- * mapped, and writes to that region are not written
- * out to the file."
- */
- zero_user_segment(page, block_start,
- block_start + blocksize);
clear_buffer_dirty(bh);
set_buffer_uptodate(bh);
continue;
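The zeroing of a page that straddles i_size now happens once, up front, over the whole span from len to the end of the page, rather than buffer by buffer inside the submission loop; as the copied comment notes, the tail must be wiped on every writepage because the page may be mmapped. A userspace model of the rule, assuming a 4k page:

    #include <stddef.h>
    #include <string.h>

    #define PAGE_SIZE 4096

    /* Only bytes below 'len' (the part of the page inside i_size) may
     * reach disk; the rest is re-zeroed on each writeback because the
     * mapped tail is writable even though "writes to that region are
     * not written out to the file". */
    static void prepare_tail_page(unsigned char page[PAGE_SIZE], size_t len)
    {
            if (len < PAGE_SIZE)
                    memset(page + len, 0, PAGE_SIZE - len);
    }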
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index c5adbb318a9..bb0e80f03e2 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -42,7 +42,7 @@ int ext4_resize_begin(struct super_block *sb)
void ext4_resize_end(struct super_block *sb)
{
clear_bit_unlock(EXT4_RESIZING, &EXT4_SB(sb)->s_resize_flags);
- smp_mb__after_clear_bit();
+ smp_mb__after_atomic();
}
static ext4_group_t ext4_meta_bg_first_group(struct super_block *sb,
@@ -243,6 +243,7 @@ static int ext4_alloc_group_tables(struct super_block *sb,
ext4_group_t group;
ext4_group_t last_group;
unsigned overhead;
+ __u16 uninit_mask = (flexbg_size > 1) ? ~EXT4_BG_BLOCK_UNINIT : ~0;
BUG_ON(flex_gd->count == 0 || group_data == NULL);
@@ -266,7 +267,7 @@ next_group:
src_group++;
for (; src_group <= last_group; src_group++) {
overhead = ext4_group_overhead_blocks(sb, src_group);
- if (overhead != 0)
+ if (overhead == 0)
last_blk += group_data[src_group - group].blocks_count;
else
break;
@@ -280,8 +281,7 @@ next_group:
group = ext4_get_group_number(sb, start_blk - 1);
group -= group_data[0].group;
group_data[group].free_blocks_count--;
- if (flexbg_size > 1)
- flex_gd->bg_flags[group] &= ~EXT4_BG_BLOCK_UNINIT;
+ flex_gd->bg_flags[group] &= uninit_mask;
}
/* Allocate inode bitmaps */
@@ -292,22 +292,30 @@ next_group:
group = ext4_get_group_number(sb, start_blk - 1);
group -= group_data[0].group;
group_data[group].free_blocks_count--;
- if (flexbg_size > 1)
- flex_gd->bg_flags[group] &= ~EXT4_BG_BLOCK_UNINIT;
+ flex_gd->bg_flags[group] &= uninit_mask;
}
/* Allocate inode tables */
for (; it_index < flex_gd->count; it_index++) {
- if (start_blk + EXT4_SB(sb)->s_itb_per_group > last_blk)
+ unsigned int itb = EXT4_SB(sb)->s_itb_per_group;
+ ext4_fsblk_t next_group_start;
+
+ if (start_blk + itb > last_blk)
goto next_group;
group_data[it_index].inode_table = start_blk;
- group = ext4_get_group_number(sb, start_blk - 1);
+ group = ext4_get_group_number(sb, start_blk);
+ next_group_start = ext4_group_first_block_no(sb, group + 1);
group -= group_data[0].group;
- group_data[group].free_blocks_count -=
- EXT4_SB(sb)->s_itb_per_group;
- if (flexbg_size > 1)
- flex_gd->bg_flags[group] &= ~EXT4_BG_BLOCK_UNINIT;
+ if (start_blk + itb > next_group_start) {
+ flex_gd->bg_flags[group + 1] &= uninit_mask;
+ overhead = start_blk + itb - next_group_start;
+ group_data[group + 1].free_blocks_count -= overhead;
+ itb -= overhead;
+ }
+
+ group_data[group].free_blocks_count -= itb;
+ flex_gd->bg_flags[group] &= uninit_mask;
start_blk += EXT4_SB(sb)->s_itb_per_group;
}
@@ -340,6 +348,7 @@ static struct buffer_head *bclean(handle_t *handle, struct super_block *sb,
bh = sb_getblk(sb, blk);
if (unlikely(!bh))
return ERR_PTR(-ENOMEM);
+ BUFFER_TRACE(bh, "get_write_access");
if ((err = ext4_journal_get_write_access(handle, bh))) {
brelse(bh);
bh = ERR_PTR(err);
@@ -401,7 +410,7 @@ static int set_flexbg_block_bitmap(struct super_block *sb, handle_t *handle,
start = ext4_group_first_block_no(sb, group);
group -= flex_gd->groups[0].group;
- count2 = sb->s_blocksize * 8 - (block - start);
+ count2 = EXT4_BLOCKS_PER_GROUP(sb) - (block - start);
if (count2 > count)
count2 = count;
@@ -418,6 +427,7 @@ static int set_flexbg_block_bitmap(struct super_block *sb, handle_t *handle,
if (unlikely(!bh))
return -ENOMEM;
+ BUFFER_TRACE(bh, "get_write_access");
err = ext4_journal_get_write_access(handle, bh);
if (err)
return err;
@@ -510,6 +520,7 @@ static int setup_new_flex_group_blocks(struct super_block *sb,
goto out;
}
+ BUFFER_TRACE(gdb, "get_write_access");
err = ext4_journal_get_write_access(handle, gdb);
if (err) {
brelse(gdb);
@@ -620,7 +631,7 @@ handle_ib:
if (err)
goto out;
count = group_table_count[j];
- start = group_data[i].block_bitmap;
+ start = (&group_data[i].block_bitmap)[j];
block = start;
}
@@ -782,14 +793,17 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
goto exit_dind;
}
+ BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get_write_access");
err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh);
if (unlikely(err))
goto exit_dind;
+ BUFFER_TRACE(gdb_bh, "get_write_access");
err = ext4_journal_get_write_access(handle, gdb_bh);
if (unlikely(err))
goto exit_dind;
+ BUFFER_TRACE(dind, "get_write_access");
err = ext4_journal_get_write_access(handle, dind);
if (unlikely(err))
ext4_std_error(sb, err);
@@ -894,6 +908,7 @@ static int add_new_gdb_meta_bg(struct super_block *sb,
EXT4_SB(sb)->s_group_desc = n_group_desc;
EXT4_SB(sb)->s_gdb_count++;
ext4_kvfree(o_group_desc);
+ BUFFER_TRACE(gdb_bh, "get_write_access");
err = ext4_journal_get_write_access(handle, gdb_bh);
if (unlikely(err))
brelse(gdb_bh);
@@ -969,6 +984,7 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode,
}
for (i = 0; i < reserved_gdb; i++) {
+ BUFFER_TRACE(primary[i], "get_write_access");
if ((err = ext4_journal_get_write_access(handle, primary[i])))
goto exit_bh;
}
@@ -1076,6 +1092,7 @@ static void update_backups(struct super_block *sb, int blk_off, char *data,
ext4_debug("update metadata backup %llu(+%llu)\n",
backup_block, backup_block -
ext4_group_first_block_no(sb, group));
+ BUFFER_TRACE(bh, "get_write_access");
if ((err = ext4_journal_get_write_access(handle, bh)))
break;
lock_buffer(bh);
@@ -1155,6 +1172,7 @@ static int ext4_add_new_descs(handle_t *handle, struct super_block *sb,
*/
if (gdb_off) {
gdb_bh = sbi->s_group_desc[gdb_num];
+ BUFFER_TRACE(gdb_bh, "get_write_access");
err = ext4_journal_get_write_access(handle, gdb_bh);
if (!err && reserved_gdb && ext4_bg_num_gdb(sb, group))
@@ -1425,6 +1443,7 @@ static int ext4_flex_group_add(struct super_block *sb,
goto exit;
}
+ BUFFER_TRACE(sbi->s_sbh, "get_write_access");
err = ext4_journal_get_write_access(handle, sbi->s_sbh);
if (err)
goto exit_journal;
@@ -1637,6 +1656,7 @@ static int ext4_group_extend_no_check(struct super_block *sb,
return err;
}
+ BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get_write_access");
err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh);
if (err) {
ext4_warning(sb, "error %d on journal write access", err);
@@ -1796,6 +1816,7 @@ static int ext4_convert_meta_bg(struct super_block *sb, struct inode *inode)
if (IS_ERR(handle))
return PTR_ERR(handle);
+ BUFFER_TRACE(sbi->s_sbh, "get_write_access");
err = ext4_journal_get_write_access(handle, sbi->s_sbh);
if (err)
goto errout;
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 2c2e6cbc6be..6df7bc611db 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -59,6 +59,7 @@ static struct kset *ext4_kset;
static struct ext4_lazy_init *ext4_li_info;
static struct mutex ext4_li_mtx;
static struct ext4_features *ext4_feat;
+static int ext4_mballoc_ready;
static int ext4_load_journal(struct super_block *, struct ext4_super_block *,
unsigned long journal_devnum);
@@ -137,8 +138,8 @@ static __le32 ext4_superblock_csum(struct super_block *sb,
return cpu_to_le32(csum);
}
-int ext4_superblock_csum_verify(struct super_block *sb,
- struct ext4_super_block *es)
+static int ext4_superblock_csum_verify(struct super_block *sb,
+ struct ext4_super_block *es)
{
if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
@@ -411,20 +412,26 @@ static void ext4_handle_error(struct super_block *sb)
sb->s_id);
}
+#define ext4_error_ratelimit(sb) \
+ ___ratelimit(&(EXT4_SB(sb)->s_err_ratelimit_state), \
+ "EXT4-fs error")
+
void __ext4_error(struct super_block *sb, const char *function,
unsigned int line, const char *fmt, ...)
{
struct va_format vaf;
va_list args;
- va_start(args, fmt);
- vaf.fmt = fmt;
- vaf.va = &args;
- printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: comm %s: %pV\n",
- sb->s_id, function, line, current->comm, &vaf);
- va_end(args);
+ if (ext4_error_ratelimit(sb)) {
+ va_start(args, fmt);
+ vaf.fmt = fmt;
+ vaf.va = &args;
+ printk(KERN_CRIT
+ "EXT4-fs error (device %s): %s:%d: comm %s: %pV\n",
+ sb->s_id, function, line, current->comm, &vaf);
+ va_end(args);
+ }
save_error_info(sb, function, line);
-
ext4_handle_error(sb);
}
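Every error printer is now gated on ___ratelimit() via the ext4_error_ratelimit() macro, while save_error_info() stays outside the gate so the superblock error counters are updated even when the printk is suppressed. A self-contained userspace model of that split; the 10-messages-per-5-seconds default mirrors the ratelimit_state_init() calls added later in this patch:

    #include <stdio.h>
    #include <time.h>

    struct ratelimit {
            time_t window_start;
            int interval;           /* seconds per window */
            int burst;              /* messages allowed per window */
            int printed;
    };

    static int ratelimit_ok(struct ratelimit *rs)
    {
            time_t now = time(NULL);

            if (now - rs->window_start >= rs->interval) {
                    rs->window_start = now;     /* new window */
                    rs->printed = 0;
            }
            return rs->printed++ < rs->burst;
    }

    static void fs_error(struct ratelimit *rs, const char *msg)
    {
            if (ratelimit_ok(rs))   /* gated, like the printk */
                    fprintf(stderr, "EXT4-fs error: %s\n", msg);
            /* the save_error_info() equivalent would run here,
             * unconditionally, exactly as in the patched kernel code */
    }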
@@ -438,22 +445,23 @@ void __ext4_error_inode(struct inode *inode, const char *function,
es->s_last_error_ino = cpu_to_le32(inode->i_ino);
es->s_last_error_block = cpu_to_le64(block);
+ if (ext4_error_ratelimit(inode->i_sb)) {
+ va_start(args, fmt);
+ vaf.fmt = fmt;
+ vaf.va = &args;
+ if (block)
+ printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: "
+ "inode #%lu: block %llu: comm %s: %pV\n",
+ inode->i_sb->s_id, function, line, inode->i_ino,
+ block, current->comm, &vaf);
+ else
+ printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: "
+ "inode #%lu: comm %s: %pV\n",
+ inode->i_sb->s_id, function, line, inode->i_ino,
+ current->comm, &vaf);
+ va_end(args);
+ }
save_error_info(inode->i_sb, function, line);
- va_start(args, fmt);
- vaf.fmt = fmt;
- vaf.va = &args;
- if (block)
- printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: "
- "inode #%lu: block %llu: comm %s: %pV\n",
- inode->i_sb->s_id, function, line, inode->i_ino,
- block, current->comm, &vaf);
- else
- printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: "
- "inode #%lu: comm %s: %pV\n",
- inode->i_sb->s_id, function, line, inode->i_ino,
- current->comm, &vaf);
- va_end(args);
-
ext4_handle_error(inode->i_sb);
}
@@ -469,27 +477,28 @@ void __ext4_error_file(struct file *file, const char *function,
es = EXT4_SB(inode->i_sb)->s_es;
es->s_last_error_ino = cpu_to_le32(inode->i_ino);
+ if (ext4_error_ratelimit(inode->i_sb)) {
+ path = d_path(&(file->f_path), pathname, sizeof(pathname));
+ if (IS_ERR(path))
+ path = "(unknown)";
+ va_start(args, fmt);
+ vaf.fmt = fmt;
+ vaf.va = &args;
+ if (block)
+ printk(KERN_CRIT
+ "EXT4-fs error (device %s): %s:%d: inode #%lu: "
+ "block %llu: comm %s: path %s: %pV\n",
+ inode->i_sb->s_id, function, line, inode->i_ino,
+ block, current->comm, path, &vaf);
+ else
+ printk(KERN_CRIT
+ "EXT4-fs error (device %s): %s:%d: inode #%lu: "
+ "comm %s: path %s: %pV\n",
+ inode->i_sb->s_id, function, line, inode->i_ino,
+ current->comm, path, &vaf);
+ va_end(args);
+ }
save_error_info(inode->i_sb, function, line);
- path = d_path(&(file->f_path), pathname, sizeof(pathname));
- if (IS_ERR(path))
- path = "(unknown)";
- va_start(args, fmt);
- vaf.fmt = fmt;
- vaf.va = &args;
- if (block)
- printk(KERN_CRIT
- "EXT4-fs error (device %s): %s:%d: inode #%lu: "
- "block %llu: comm %s: path %s: %pV\n",
- inode->i_sb->s_id, function, line, inode->i_ino,
- block, current->comm, path, &vaf);
- else
- printk(KERN_CRIT
- "EXT4-fs error (device %s): %s:%d: inode #%lu: "
- "comm %s: path %s: %pV\n",
- inode->i_sb->s_id, function, line, inode->i_ino,
- current->comm, path, &vaf);
- va_end(args);
-
ext4_handle_error(inode->i_sb);
}
@@ -543,11 +552,13 @@ void __ext4_std_error(struct super_block *sb, const char *function,
(sb->s_flags & MS_RDONLY))
return;
- errstr = ext4_decode_error(sb, errno, nbuf);
- printk(KERN_CRIT "EXT4-fs error (device %s) in %s:%d: %s\n",
- sb->s_id, function, line, errstr);
- save_error_info(sb, function, line);
+ if (ext4_error_ratelimit(sb)) {
+ errstr = ext4_decode_error(sb, errno, nbuf);
+ printk(KERN_CRIT "EXT4-fs error (device %s) in %s:%d: %s\n",
+ sb->s_id, function, line, errstr);
+ }
+ save_error_info(sb, function, line);
ext4_handle_error(sb);
}
@@ -597,6 +608,9 @@ void __ext4_msg(struct super_block *sb,
struct va_format vaf;
va_list args;
+ if (!___ratelimit(&(EXT4_SB(sb)->s_msg_ratelimit_state), "EXT4-fs"))
+ return;
+
va_start(args, fmt);
vaf.fmt = fmt;
vaf.va = &args;
@@ -610,6 +624,10 @@ void __ext4_warning(struct super_block *sb, const char *function,
struct va_format vaf;
va_list args;
+ if (!___ratelimit(&(EXT4_SB(sb)->s_warning_ratelimit_state),
+ "EXT4-fs warning"))
+ return;
+
va_start(args, fmt);
vaf.fmt = fmt;
vaf.va = &args;
@@ -633,18 +651,20 @@ __acquires(bitlock)
es->s_last_error_block = cpu_to_le64(block);
__save_error_info(sb, function, line);
- va_start(args, fmt);
-
- vaf.fmt = fmt;
- vaf.va = &args;
- printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: group %u, ",
- sb->s_id, function, line, grp);
- if (ino)
- printk(KERN_CONT "inode %lu: ", ino);
- if (block)
- printk(KERN_CONT "block %llu:", (unsigned long long) block);
- printk(KERN_CONT "%pV\n", &vaf);
- va_end(args);
+ if (ext4_error_ratelimit(sb)) {
+ va_start(args, fmt);
+ vaf.fmt = fmt;
+ vaf.va = &args;
+ printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: group %u, ",
+ sb->s_id, function, line, grp);
+ if (ino)
+ printk(KERN_CONT "inode %lu: ", ino);
+ if (block)
+ printk(KERN_CONT "block %llu:",
+ (unsigned long long) block);
+ printk(KERN_CONT "%pV\n", &vaf);
+ va_end(args);
+ }
if (test_opt(sb, ERRORS_CONT)) {
ext4_commit_super(sb, 0);
@@ -773,7 +793,7 @@ static void ext4_put_super(struct super_block *sb)
}
ext4_es_unregister_shrinker(sbi);
- del_timer(&sbi->s_err_report);
+ del_timer_sync(&sbi->s_err_report);
ext4_release_system_zone(sb);
ext4_mb_release(sb);
ext4_ext_release(sb);
@@ -826,6 +846,10 @@ static void ext4_put_super(struct super_block *sb)
invalidate_bdev(sbi->journal_bdev);
ext4_blkdev_remove(sbi);
}
+ if (sbi->s_mb_cache) {
+ ext4_xattr_destroy_cache(sbi->s_mb_cache);
+ sbi->s_mb_cache = NULL;
+ }
if (sbi->s_mmp_tsk)
kthread_stop(sbi->s_mmp_tsk);
sb->s_fs_info = NULL;
@@ -855,6 +879,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
return NULL;
ei->vfs_inode.i_version = 1;
+ spin_lock_init(&ei->i_raw_lock);
INIT_LIST_HEAD(&ei->i_prealloc_list);
spin_lock_init(&ei->i_prealloc_lock);
ext4_es_init_tree(&ei->i_es_tree);
@@ -921,7 +946,7 @@ static void init_once(void *foo)
inode_init_once(&ei->vfs_inode);
}
-static int init_inodecache(void)
+static int __init init_inodecache(void)
{
ext4_inode_cachep = kmem_cache_create("ext4_inode_cache",
sizeof(struct ext4_inode_info),
@@ -1500,8 +1525,6 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token,
arg = JBD2_DEFAULT_MAX_COMMIT_AGE;
sbi->s_commit_interval = HZ * arg;
} else if (token == Opt_max_batch_time) {
- if (arg == 0)
- arg = EXT4_DEF_MAX_BATCH_TIME;
sbi->s_max_batch_time = arg;
} else if (token == Opt_min_batch_time) {
sbi->s_min_batch_time = arg;
@@ -1879,7 +1902,7 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
if (!(sbi->s_mount_state & EXT4_VALID_FS))
ext4_msg(sb, KERN_WARNING, "warning: mounting unchecked fs, "
"running e2fsck is recommended");
- else if ((sbi->s_mount_state & EXT4_ERROR_FS))
+ else if (sbi->s_mount_state & EXT4_ERROR_FS)
ext4_msg(sb, KERN_WARNING,
"warning: mounting fs with errors, "
"running e2fsck is recommended");
@@ -2380,6 +2403,16 @@ static ext4_fsblk_t descriptor_loc(struct super_block *sb,
if (ext4_bg_has_super(sb, bg))
has_super = 1;
+ /*
+ * If we have a meta_bg fs with 1k blocks, group 0's GDT is at
+ * block 2, not 1. If s_first_data_block == 0 (bigalloc is enabled
+ * on modern mke2fs or blksize > 1k on older mke2fs) then we must
+ * compensate.
+ */
+ if (sb->s_blocksize == 1024 && nr == 0 &&
+ le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block) == 0)
+ has_super++;
+
return (has_super + ext4_group_first_block_no(sb, bg));
}
@@ -2606,6 +2639,12 @@ EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc);
EXT4_DEPRECATED_ATTR(max_writeback_mb_bump, 128);
EXT4_RW_ATTR_SBI_UI(extent_max_zeroout_kb, s_extent_max_zeroout_kb);
EXT4_ATTR(trigger_fs_error, 0200, NULL, trigger_test_error);
+EXT4_RW_ATTR_SBI_UI(err_ratelimit_interval_ms, s_err_ratelimit_state.interval);
+EXT4_RW_ATTR_SBI_UI(err_ratelimit_burst, s_err_ratelimit_state.burst);
+EXT4_RW_ATTR_SBI_UI(warning_ratelimit_interval_ms, s_warning_ratelimit_state.interval);
+EXT4_RW_ATTR_SBI_UI(warning_ratelimit_burst, s_warning_ratelimit_state.burst);
+EXT4_RW_ATTR_SBI_UI(msg_ratelimit_interval_ms, s_msg_ratelimit_state.interval);
+EXT4_RW_ATTR_SBI_UI(msg_ratelimit_burst, s_msg_ratelimit_state.burst);
static struct attribute *ext4_attrs[] = {
ATTR_LIST(delayed_allocation_blocks),
@@ -2623,6 +2662,12 @@ static struct attribute *ext4_attrs[] = {
ATTR_LIST(max_writeback_mb_bump),
ATTR_LIST(extent_max_zeroout_kb),
ATTR_LIST(trigger_fs_error),
+ ATTR_LIST(err_ratelimit_interval_ms),
+ ATTR_LIST(err_ratelimit_burst),
+ ATTR_LIST(warning_ratelimit_interval_ms),
+ ATTR_LIST(warning_ratelimit_burst),
+ ATTR_LIST(msg_ratelimit_interval_ms),
+ ATTR_LIST(msg_ratelimit_burst),
NULL,
};
@@ -2762,10 +2807,11 @@ static void print_daily_error_info(unsigned long arg)
es = sbi->s_es;
if (es->s_error_count)
- ext4_msg(sb, KERN_NOTICE, "error count: %u",
+ /* fsck newer than v1.41.13 is needed to clean this condition. */
+ ext4_msg(sb, KERN_NOTICE, "error count since last fsck: %u",
le32_to_cpu(es->s_error_count));
if (es->s_first_error_time) {
- printk(KERN_NOTICE "EXT4-fs (%s): initial error at %u: %.*s:%d",
+ printk(KERN_NOTICE "EXT4-fs (%s): initial error at time %u: %.*s:%d",
sb->s_id, le32_to_cpu(es->s_first_error_time),
(int) sizeof(es->s_first_error_func),
es->s_first_error_func,
@@ -2779,7 +2825,7 @@ static void print_daily_error_info(unsigned long arg)
printk("\n");
}
if (es->s_last_error_time) {
- printk(KERN_NOTICE "EXT4-fs (%s): last error at %u: %.*s:%d",
+ printk(KERN_NOTICE "EXT4-fs (%s): last error at time %u: %.*s:%d",
sb->s_id, le32_to_cpu(es->s_last_error_time),
(int) sizeof(es->s_last_error_func),
es->s_last_error_func,
@@ -3037,7 +3083,6 @@ static struct ext4_li_request *ext4_li_request_new(struct super_block *sb,
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
struct ext4_li_request *elr;
- unsigned long rnd;
elr = kzalloc(sizeof(*elr), GFP_KERNEL);
if (!elr)
@@ -3052,10 +3097,8 @@ static struct ext4_li_request *ext4_li_request_new(struct super_block *sb,
* spread the inode table initialization requests
* better.
*/
- get_random_bytes(&rnd, sizeof(rnd));
- elr->lr_next_sched = jiffies + (unsigned long)rnd %
- (EXT4_DEF_LI_MAX_START_DELAY * HZ);
-
+ elr->lr_next_sched = jiffies + (prandom_u32() %
+ (EXT4_DEF_LI_MAX_START_DELAY * HZ));
return elr;
}
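get_random_bytes() draws from the kernel's crypto-quality pool, which is overkill for start-time jitter; prandom_u32() is the cheap pseudo-random source meant for cases like spreading the lazy-init passes. The replaced computation reduces to (userspace sketch, rand() standing in for prandom_u32()):

    #include <stdlib.h>

    /* Next pass at now + a random delay below max_delay ticks; the
     * slight modulo bias is irrelevant for scheduling jitter. */
    static unsigned long next_sched(unsigned long now, unsigned long max_delay)
    {
            return now + ((unsigned long)rand() % max_delay);
    }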
@@ -3288,19 +3331,28 @@ int ext4_calculate_overhead(struct super_block *sb)
}
-static ext4_fsblk_t ext4_calculate_resv_clusters(struct ext4_sb_info *sbi)
+static ext4_fsblk_t ext4_calculate_resv_clusters(struct super_block *sb)
{
ext4_fsblk_t resv_clusters;
/*
+ * There's no need to reserve anything when we aren't using extents.
+ * The space estimates are exact, there are no unwritten extents,
+ * hole punching doesn't need new metadata... This is needed especially
+ * to keep ext2/3 backward compatibility.
+ */
+ if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS))
+ return 0;
+ /*
* By default we reserve 2% or 4096 clusters, whichever is smaller.
* This should cover the situations where we can not afford to run
* out of space like for example punch hole, or converting
- * uninitialized extents in delalloc path. In most cases such
+ * unwritten extents in delalloc path. In most cases such
* allocation would require 1, or 2 blocks, higher numbers are
* very rare.
*/
- resv_clusters = ext4_blocks_count(sbi->s_es) >> sbi->s_cluster_bits;
+ resv_clusters = ext4_blocks_count(EXT4_SB(sb)->s_es) >>
+ EXT4_SB(sb)->s_cluster_bits;
do_div(resv_clusters, 50);
resv_clusters = min_t(ext4_fsblk_t, resv_clusters, 4096);
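The sizing rule is min(2% of clusters, 4096): do_div() divides resv_clusters by 50 in place and min_t() applies the cap, with the new early return skipping the reservation entirely on extent-less (ext2/3 compatible) filesystems. Restated as plain arithmetic (a sketch, not the kernel helper):

    #include <stdint.h>

    static uint64_t calc_resv_clusters(uint64_t blocks,
                                       unsigned int cluster_bits,
                                       int has_extents)
    {
            uint64_t resv;

            if (!has_extents)       /* exact accounting, reserve nothing */
                    return 0;
            resv = (blocks >> cluster_bits) / 50;   /* 2% of clusters */
            return resv < 4096 ? resv : 4096;       /* capped at 4096 */
    }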
@@ -3538,6 +3590,16 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
"feature flags set on rev 0 fs, "
"running e2fsck is recommended");
+ if (es->s_creator_os == cpu_to_le32(EXT4_OS_HURD)) {
+ set_opt2(sb, HURD_COMPAT);
+ if (EXT4_HAS_INCOMPAT_FEATURE(sb,
+ EXT4_FEATURE_INCOMPAT_64BIT)) {
+ ext4_msg(sb, KERN_ERR,
+ "The Hurd can't support 64-bit file systems");
+ goto failed_mount;
+ }
+ }
+
if (IS_EXT2_SB(sb)) {
if (ext2_feature_set_ok(sb))
ext4_msg(sb, KERN_INFO, "mounting ext2 file system "
@@ -3658,16 +3720,22 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
for (i = 0; i < 4; i++)
sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]);
sbi->s_def_hash_version = es->s_def_hash_version;
- i = le32_to_cpu(es->s_flags);
- if (i & EXT2_FLAGS_UNSIGNED_HASH)
- sbi->s_hash_unsigned = 3;
- else if ((i & EXT2_FLAGS_SIGNED_HASH) == 0) {
+ if (EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_DIR_INDEX)) {
+ i = le32_to_cpu(es->s_flags);
+ if (i & EXT2_FLAGS_UNSIGNED_HASH)
+ sbi->s_hash_unsigned = 3;
+ else if ((i & EXT2_FLAGS_SIGNED_HASH) == 0) {
#ifdef __CHAR_UNSIGNED__
- es->s_flags |= cpu_to_le32(EXT2_FLAGS_UNSIGNED_HASH);
- sbi->s_hash_unsigned = 3;
+ if (!(sb->s_flags & MS_RDONLY))
+ es->s_flags |=
+ cpu_to_le32(EXT2_FLAGS_UNSIGNED_HASH);
+ sbi->s_hash_unsigned = 3;
#else
- es->s_flags |= cpu_to_le32(EXT2_FLAGS_SIGNED_HASH);
+ if (!(sb->s_flags & MS_RDONLY))
+ es->s_flags |=
+ cpu_to_le32(EXT2_FLAGS_SIGNED_HASH);
#endif
+ }
}
/* Handle clustersize */
@@ -3967,6 +4035,14 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
percpu_counter_set(&sbi->s_dirtyclusters_counter, 0);
no_journal:
+ if (ext4_mballoc_ready) {
+ sbi->s_mb_cache = ext4_xattr_create_cache(sb->s_id);
+ if (!sbi->s_mb_cache) {
+ ext4_msg(sb, KERN_ERR, "Failed to create an mb_cache");
+ goto failed_mount_wq;
+ }
+ }
+
/*
* Get the # of file system overhead blocks from the
* superblock if present.
@@ -4043,10 +4119,10 @@ no_journal:
"available");
}
- err = ext4_reserve_clusters(sbi, ext4_calculate_resv_clusters(sbi));
+ err = ext4_reserve_clusters(sbi, ext4_calculate_resv_clusters(sb));
if (err) {
ext4_msg(sb, KERN_ERR, "failed to reserve %llu clusters for "
- "reserved pool", ext4_calculate_resv_clusters(sbi));
+ "reserved pool", ext4_calculate_resv_clusters(sb));
goto failed_mount4a;
}
@@ -4118,6 +4194,11 @@ no_journal:
if (es->s_error_count)
mod_timer(&sbi->s_err_report, jiffies + 300*HZ); /* 5 minutes */
+ /* Enable message ratelimiting. Default is 10 messages per 5 secs. */
+ ratelimit_state_init(&sbi->s_err_ratelimit_state, 5 * HZ, 10);
+ ratelimit_state_init(&sbi->s_warning_ratelimit_state, 5 * HZ, 10);
+ ratelimit_state_init(&sbi->s_msg_ratelimit_state, 5 * HZ, 10);
+
kfree(orig_data);
return 0;
@@ -4151,7 +4232,7 @@ failed_mount_wq:
}
failed_mount3:
ext4_es_unregister_shrinker(sbi);
- del_timer(&sbi->s_err_report);
+ del_timer_sync(&sbi->s_err_report);
if (sbi->s_flex_groups)
ext4_kvfree(sbi->s_flex_groups);
percpu_counter_destroy(&sbi->s_freeclusters_counter);
@@ -4787,6 +4868,9 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
}
if (*flags & MS_RDONLY) {
+ err = sync_filesystem(sb);
+ if (err < 0)
+ goto restore_opts;
err = dquot_suspend(sb, -1);
if (err < 0)
goto restore_opts;
@@ -5293,6 +5377,7 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
bh = ext4_bread(handle, inode, blk, 1, &err);
if (!bh)
goto out;
+ BUFFER_TRACE(bh, "get write access");
err = ext4_journal_get_write_access(handle, bh);
if (err) {
brelse(bh);
@@ -5468,11 +5553,9 @@ static int __init ext4_init_fs(void)
err = ext4_init_mballoc();
if (err)
- goto out3;
-
- err = ext4_init_xattr();
- if (err)
goto out2;
+ else
+ ext4_mballoc_ready = 1;
err = init_inodecache();
if (err)
goto out1;
@@ -5488,10 +5571,9 @@ out:
unregister_as_ext3();
destroy_inodecache();
out1:
- ext4_exit_xattr();
-out2:
+ ext4_mballoc_ready = 0;
ext4_exit_mballoc();
-out3:
+out2:
ext4_exit_feat_adverts();
out4:
if (ext4_proc_root)
@@ -5514,7 +5596,6 @@ static void __exit ext4_exit_fs(void)
unregister_as_ext3();
unregister_filesystem(&ext4_fs_type);
destroy_inodecache();
- ext4_exit_xattr();
ext4_exit_mballoc();
ext4_exit_feat_adverts();
remove_proc_entry("fs/ext4", NULL);
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index c081e34f717..e7387337060 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -81,7 +81,7 @@
# define ea_bdebug(bh, fmt, ...) no_printk(fmt, ##__VA_ARGS__)
#endif
-static void ext4_xattr_cache_insert(struct buffer_head *);
+static void ext4_xattr_cache_insert(struct mb_cache *, struct buffer_head *);
static struct buffer_head *ext4_xattr_cache_find(struct inode *,
struct ext4_xattr_header *,
struct mb_cache_entry **);
@@ -90,13 +90,11 @@ static void ext4_xattr_rehash(struct ext4_xattr_header *,
static int ext4_xattr_list(struct dentry *dentry, char *buffer,
size_t buffer_size);
-static struct mb_cache *ext4_xattr_cache;
-
static const struct xattr_handler *ext4_xattr_handler_map[] = {
[EXT4_XATTR_INDEX_USER] = &ext4_xattr_user_handler,
#ifdef CONFIG_EXT4_FS_POSIX_ACL
- [EXT4_XATTR_INDEX_POSIX_ACL_ACCESS] = &ext4_xattr_acl_access_handler,
- [EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT] = &ext4_xattr_acl_default_handler,
+ [EXT4_XATTR_INDEX_POSIX_ACL_ACCESS] = &posix_acl_access_xattr_handler,
+ [EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT] = &posix_acl_default_xattr_handler,
#endif
[EXT4_XATTR_INDEX_TRUSTED] = &ext4_xattr_trusted_handler,
#ifdef CONFIG_EXT4_FS_SECURITY
@@ -108,8 +106,8 @@ const struct xattr_handler *ext4_xattr_handlers[] = {
&ext4_xattr_user_handler,
&ext4_xattr_trusted_handler,
#ifdef CONFIG_EXT4_FS_POSIX_ACL
- &ext4_xattr_acl_access_handler,
- &ext4_xattr_acl_default_handler,
+ &posix_acl_access_xattr_handler,
+ &posix_acl_default_xattr_handler,
#endif
#ifdef CONFIG_EXT4_FS_SECURITY
&ext4_xattr_security_handler,
@@ -117,6 +115,9 @@ const struct xattr_handler *ext4_xattr_handlers[] = {
NULL
};
+#define EXT4_GET_MB_CACHE(inode) (((struct ext4_sb_info *) \
+ inode->i_sb->s_fs_info)->s_mb_cache)
+
static __le32 ext4_xattr_block_csum(struct inode *inode,
sector_t block_nr,
struct ext4_xattr_header *hdr)
@@ -265,6 +266,7 @@ ext4_xattr_block_get(struct inode *inode, int name_index, const char *name,
struct ext4_xattr_entry *entry;
size_t size;
int error;
+ struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode);
ea_idebug(inode, "name=%d.%s, buffer=%p, buffer_size=%ld",
name_index, name, buffer, (long)buffer_size);
@@ -286,7 +288,7 @@ bad_block:
error = -EIO;
goto cleanup;
}
- ext4_xattr_cache_insert(bh);
+ ext4_xattr_cache_insert(ext4_mb_cache, bh);
entry = BFIRST(bh);
error = ext4_xattr_find_entry(&entry, name_index, name, bh->b_size, 1);
if (error == -EIO)
@@ -367,6 +369,9 @@ ext4_xattr_get(struct inode *inode, int name_index, const char *name,
{
int error;
+ if (strlen(name) > 255)
+ return -ERANGE;
+
down_read(&EXT4_I(inode)->xattr_sem);
error = ext4_xattr_ibody_get(inode, name_index, name, buffer,
buffer_size);
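The new length guard matches the on-disk format: an xattr entry stores its name length in a single byte, so no name longer than 255 bytes can exist and the lookup can fail fast with -ERANGE instead of searching. From userspace the same cap is already imposed by the VFS (XATTR_NAME_MAX is 255), which this hedged demo will hit first:

    #include <stdio.h>
    #include <string.h>
    #include <sys/xattr.h>

    int main(void)
    {
            char name[300] = "user.";   /* 285-byte name, over the cap */
            char buf[64];

            memset(name + strlen(name), 'x', 280);
            name[285] = '\0';
            /* "somefile" is any existing file on an ext4 mount */
            if (getxattr("somefile", name, buf, sizeof(buf)) < 0)
                    perror("getxattr");         /* expect ERANGE */
            return 0;
    }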
@@ -409,6 +414,7 @@ ext4_xattr_block_list(struct dentry *dentry, char *buffer, size_t buffer_size)
struct inode *inode = dentry->d_inode;
struct buffer_head *bh = NULL;
int error;
+ struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode);
ea_idebug(inode, "buffer=%p, buffer_size=%ld",
buffer, (long)buffer_size);
@@ -430,7 +436,7 @@ ext4_xattr_block_list(struct dentry *dentry, char *buffer, size_t buffer_size)
error = -EIO;
goto cleanup;
}
- ext4_xattr_cache_insert(bh);
+ ext4_xattr_cache_insert(ext4_mb_cache, bh);
error = ext4_xattr_list_entries(dentry, BFIRST(bh), buffer, buffer_size);
cleanup:
@@ -510,6 +516,7 @@ static void ext4_xattr_update_super_block(handle_t *handle,
if (EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_EXT_ATTR))
return;
+ BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get_write_access");
if (ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh) == 0) {
EXT4_SET_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_EXT_ATTR);
ext4_handle_dirty_super(handle, sb);
@@ -517,8 +524,8 @@ static void ext4_xattr_update_super_block(handle_t *handle,
}
/*
- * Release the xattr block BH: If the reference count is > 1, decrement
- * it; otherwise free the block.
+ * Release the xattr block BH: If the reference count is > 1, decrement it;
+ * otherwise free the block.
*/
static void
ext4_xattr_release_block(handle_t *handle, struct inode *inode,
@@ -526,8 +533,10 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode,
{
struct mb_cache_entry *ce = NULL;
int error = 0;
+ struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode);
- ce = mb_cache_entry_get(ext4_xattr_cache, bh->b_bdev, bh->b_blocknr);
+ ce = mb_cache_entry_get(ext4_mb_cache, bh->b_bdev, bh->b_blocknr);
+ BUFFER_TRACE(bh, "get_write_access");
error = ext4_journal_get_write_access(handle, bh);
if (error)
goto out;
@@ -538,16 +547,31 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode,
if (ce)
mb_cache_entry_free(ce);
get_bh(bh);
+ unlock_buffer(bh);
ext4_free_blocks(handle, inode, bh, 0, 1,
EXT4_FREE_BLOCKS_METADATA |
EXT4_FREE_BLOCKS_FORGET);
- unlock_buffer(bh);
} else {
le32_add_cpu(&BHDR(bh)->h_refcount, -1);
if (ce)
mb_cache_entry_release(ce);
+ /*
+ * Beware of this ugliness: Releasing of xattr block references
+ * from different inodes can race and so we have to protect
+ * from a race where someone else frees the block (and releases
+ * its journal_head) before we are done dirtying the buffer. In
+ * nojournal mode this race is harmless and we actually cannot
+ * call ext4_handle_dirty_xattr_block() with locked buffer as
+ * that function can call sync_dirty_buffer() so for that case
+ * we handle the dirtying after unlocking the buffer.
+ */
+ if (ext4_handle_valid(handle))
+ error = ext4_handle_dirty_xattr_block(handle, inode,
+ bh);
unlock_buffer(bh);
- error = ext4_handle_dirty_xattr_block(handle, inode, bh);
+ if (!ext4_handle_valid(handle))
+ error = ext4_handle_dirty_xattr_block(handle, inode,
+ bh);
if (IS_SYNC(inode))
ext4_handle_sync(handle);
dquot_free_block(inode, EXT4_C2B(EXT4_SB(inode->i_sb), 1));
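The reordering in this hunk encodes the rule the comment spells out: with a real journal handle, dirty the buffer while it is still locked so a racing release from another inode cannot free the block (and drop its journal_head) first; in nojournal mode, dirtying may block in sync_dirty_buffer(), so it must happen after the unlock, where the race is harmless. A plain-C sketch of that branch ordering, with pthread-free stand-ins replacing the kernel primitives:

#include <stdbool.h>
#include <stdio.h>

struct buffer { bool locked; };

static void lock_buf(struct buffer *b)   { b->locked = true; }
static void unlock_buf(struct buffer *b) { b->locked = false; }

/* stand-in for ext4_handle_dirty_xattr_block(); the real function can
 * call sync_dirty_buffer() in nojournal mode, hence the ordering rule */
static int dirty_block(struct buffer *b, bool journalled)
{
	printf("dirtied (locked=%d, journalled=%d)\n", b->locked, journalled);
	return 0;
}

static void release_ref(struct buffer *b, bool journalled)
{
	int err = 0;

	lock_buf(b);
	/* ... decrement the refcount under the buffer lock ... */
	if (journalled)
		err = dirty_block(b, true);	/* lock held: racing free excluded */
	unlock_buf(b);
	if (!journalled)
		err = dirty_block(b, false);	/* may sleep, so lock released */
	(void)err;
}

int main(void)
{
	struct buffer b = { false };

	release_ref(&b, true);
	release_ref(&b, false);
	return 0;
}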
@@ -567,12 +591,13 @@ static size_t ext4_xattr_free_space(struct ext4_xattr_entry *last,
size_t *min_offs, void *base, int *total)
{
for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) {
- *total += EXT4_XATTR_LEN(last->e_name_len);
if (!last->e_value_block && last->e_value_size) {
size_t offs = le16_to_cpu(last->e_value_offs);
if (offs < *min_offs)
*min_offs = offs;
}
+ if (total)
+ *total += EXT4_XATTR_LEN(last->e_name_len);
}
return (*min_offs - ((void *)last - base) - sizeof(__u32));
}
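This hunk makes the running total of entry-header sizes optional: the walk always tracks the lowest value offset, but a caller that only wants the free gap (the block pass in ext4_expand_extra_isize_ea(), below) now passes NULL. A simplified, compilable model of that contract, with entries packed upward from the start of the region and values packed downward from its end; the types here flatten the real on-disk layout:

#include <stddef.h>
#include <stdio.h>

struct entry { size_t hdr_len; size_t value_offs; size_t value_size; };

static size_t free_space(const struct entry *e, size_t n,
			 size_t region_end, int *total)
{
	size_t min_offs = region_end;
	size_t hdrs_end = 0;

	for (size_t i = 0; i < n; i++) {
		if (e[i].value_size && e[i].value_offs < min_offs)
			min_offs = e[i].value_offs;
		hdrs_end += e[i].hdr_len;
		if (total)			/* the total is now optional */
			*total += (int)e[i].hdr_len;
	}
	return min_offs - hdrs_end;	/* gap between headers and values */
}

int main(void)
{
	struct entry e[] = { { 16, 96, 8 }, { 20, 64, 32 } };

	printf("free=%zu\n", free_space(e, 2, 128, NULL));
	return 0;
}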
@@ -745,14 +770,16 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
struct ext4_xattr_search *s = &bs->s;
struct mb_cache_entry *ce = NULL;
int error = 0;
+ struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode);
#define header(x) ((struct ext4_xattr_header *)(x))
if (i->value && i->value_len > sb->s_blocksize)
return -ENOSPC;
if (s->base) {
- ce = mb_cache_entry_get(ext4_xattr_cache, bs->bh->b_bdev,
+ ce = mb_cache_entry_get(ext4_mb_cache, bs->bh->b_bdev,
bs->bh->b_blocknr);
+ BUFFER_TRACE(bs->bh, "get_write_access");
error = ext4_journal_get_write_access(handle, bs->bh);
if (error)
goto cleanup;
@@ -769,7 +796,8 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
if (!IS_LAST_ENTRY(s->first))
ext4_xattr_rehash(header(s->base),
s->here);
- ext4_xattr_cache_insert(bs->bh);
+ ext4_xattr_cache_insert(ext4_mb_cache,
+ bs->bh);
}
unlock_buffer(bs->bh);
if (error == -EIO)
@@ -837,6 +865,7 @@ inserted:
EXT4_C2B(EXT4_SB(sb), 1));
if (error)
goto cleanup;
+ BUFFER_TRACE(new_bh, "get_write_access");
error = ext4_journal_get_write_access(handle,
new_bh);
if (error)
@@ -874,7 +903,7 @@ inserted:
* take i_data_sem because we will test
* i_delalloc_reserved_flag in ext4_mb_new_blocks
*/
- down_read((&EXT4_I(inode)->i_data_sem));
+ down_read(&EXT4_I(inode)->i_data_sem);
block = ext4_new_meta_blocks(handle, inode, goal, 0,
NULL, &error);
up_read((&EXT4_I(inode)->i_data_sem));
@@ -905,7 +934,7 @@ getblk_failed:
memcpy(new_bh->b_data, s->base, new_bh->b_size);
set_buffer_uptodate(new_bh);
unlock_buffer(new_bh);
- ext4_xattr_cache_insert(new_bh);
+ ext4_xattr_cache_insert(ext4_mb_cache, new_bh);
error = ext4_handle_dirty_xattr_block(handle,
inode, new_bh);
if (error)
@@ -1228,7 +1257,7 @@ int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
struct ext4_xattr_block_find *bs = NULL;
char *buffer = NULL, *b_entry_name = NULL;
size_t min_offs, free;
- int total_ino, total_blk;
+ int total_ino;
void *base, *start, *end;
int extra_isize = 0, error = 0, tried_min_extra_isize = 0;
int s_min_extra_isize = le16_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_min_extra_isize);
@@ -1286,8 +1315,7 @@ retry:
first = BFIRST(bh);
end = bh->b_data + bh->b_size;
min_offs = end - base;
- free = ext4_xattr_free_space(first, &min_offs, base,
- &total_blk);
+ free = ext4_xattr_free_space(first, &min_offs, base, NULL);
if (free < new_extra_isize) {
if (!tried_min_extra_isize && s_min_extra_isize) {
tried_min_extra_isize++;
@@ -1350,6 +1378,9 @@ retry:
s_min_extra_isize) {
tried_min_extra_isize++;
new_extra_isize = s_min_extra_isize;
+ kfree(is); is = NULL;
+ kfree(bs); bs = NULL;
+ brelse(bh);
goto retry;
}
error = -1;
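The three added lines in this hunk fix a leak on the retry path: the find contexts (is, bs) and the buffer reference (bh) allocated during the failed attempt are now released before jumping back to retry, so each pass starts clean. A user-space sketch of the pattern, with calloc/free standing in for kzalloc/kfree and a fabricated try_expand() helper that fails once to exercise the loop:

#include <stdio.h>
#include <stdlib.h>

struct ctx { char pad[32]; };	/* stands in for the find contexts */

static int attempts;

/* pretend helper: fail the first pass, succeed once retried */
static int try_expand(int size)
{
	(void)size;
	return attempts++ == 0 ? -1 : 0;
}

static int expand_with_retry(int size, int min_size)
{
	struct ctx *is = NULL, *bs = NULL;
	int tried_min = 0, err;

retry:
	is = calloc(1, sizeof(*is));
	bs = calloc(1, sizeof(*bs));
	if (!is || !bs) {
		err = -1;
		goto out;
	}
	err = try_expand(size);
	if (err && !tried_min && min_size) {
		tried_min = 1;
		size = min_size;
		free(is); is = NULL;	/* drop this attempt's state... */
		free(bs); bs = NULL;	/* ...before looping, as the fix does */
		goto retry;
	}
out:
	free(is);
	free(bs);
	return err;
}

int main(void)
{
	printf("%d\n", expand_with_retry(64, 32));
	return 0;
}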
@@ -1492,13 +1523,13 @@ ext4_xattr_put_super(struct super_block *sb)
* Insertion is best-effort: the function is void, and allocation or
* insertion failures are ignored (at most a debug message is emitted).
*/
static void
-ext4_xattr_cache_insert(struct buffer_head *bh)
+ext4_xattr_cache_insert(struct mb_cache *ext4_mb_cache, struct buffer_head *bh)
{
__u32 hash = le32_to_cpu(BHDR(bh)->h_hash);
struct mb_cache_entry *ce;
int error;
- ce = mb_cache_entry_alloc(ext4_xattr_cache, GFP_NOFS);
+ ce = mb_cache_entry_alloc(ext4_mb_cache, GFP_NOFS);
if (!ce) {
ea_bdebug(bh, "out of memory");
return;
@@ -1570,12 +1601,13 @@ ext4_xattr_cache_find(struct inode *inode, struct ext4_xattr_header *header,
{
__u32 hash = le32_to_cpu(header->h_hash);
struct mb_cache_entry *ce;
+ struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode);
if (!header->h_hash)
return NULL; /* never share */
ea_idebug(inode, "looking for cached blocks [%x]", (int)hash);
again:
- ce = mb_cache_entry_find_first(ext4_xattr_cache, inode->i_sb->s_bdev,
+ ce = mb_cache_entry_find_first(ext4_mb_cache, inode->i_sb->s_bdev,
hash);
while (ce) {
struct buffer_head *bh;
@@ -1673,19 +1705,17 @@ static void ext4_xattr_rehash(struct ext4_xattr_header *header,
#undef BLOCK_HASH_SHIFT
-int __init
-ext4_init_xattr(void)
+#define HASH_BUCKET_BITS 10
+
+struct mb_cache *
+ext4_xattr_create_cache(char *name)
{
- ext4_xattr_cache = mb_cache_create("ext4_xattr", 6);
- if (!ext4_xattr_cache)
- return -ENOMEM;
- return 0;
+ return mb_cache_create(name, HASH_BUCKET_BITS);
}
-void
-ext4_exit_xattr(void)
+void ext4_xattr_destroy_cache(struct mb_cache *cache)
{
- if (ext4_xattr_cache)
- mb_cache_destroy(ext4_xattr_cache);
- ext4_xattr_cache = NULL;
+ if (cache)
+ mb_cache_destroy(cache);
}
+
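Taken together, the tail of this file replaces the module-global init/exit pair with a create/destroy pair the superblock code can call per mount, and grows the hash from 2^6 to 2^10 buckets (HASH_BUCKET_BITS). A user-space sketch of the new lifecycle, with mb_cache internals stubbed out and cache_create/cache_destroy as stand-in names:

#include <stdlib.h>

#define HASH_BUCKET_BITS 10	/* matches the value the patch picks */

struct mb_cache { char *name; unsigned long nbuckets; };

static struct mb_cache *cache_create(char *name)
{
	struct mb_cache *c = malloc(sizeof(*c));

	if (!c)
		return NULL;
	c->name = name;
	c->nbuckets = 1UL << HASH_BUCKET_BITS;
	return c;
}

static void cache_destroy(struct mb_cache *c)
{
	if (c)			/* mirrors the NULL check in the patch */
		free(c);
}

int main(void)
{
	struct mb_cache *c = cache_create("ext4_xattr");	/* at mount */

	cache_destroy(c);					/* at unmount */
	return 0;
}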
diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
index c767dbdd7fc..29bedf5589f 100644
--- a/fs/ext4/xattr.h
+++ b/fs/ext4/xattr.h
@@ -96,8 +96,6 @@ struct ext4_xattr_ibody_find {
extern const struct xattr_handler ext4_xattr_user_handler;
extern const struct xattr_handler ext4_xattr_trusted_handler;
-extern const struct xattr_handler ext4_xattr_acl_access_handler;
-extern const struct xattr_handler ext4_xattr_acl_default_handler;
extern const struct xattr_handler ext4_xattr_security_handler;
extern ssize_t ext4_listxattr(struct dentry *, char *, size_t);
@@ -112,9 +110,6 @@ extern void ext4_xattr_put_super(struct super_block *);
extern int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
struct ext4_inode *raw_inode, handle_t *handle);
-extern int __init ext4_init_xattr(void);
-extern void ext4_exit_xattr(void);
-
extern const struct xattr_handler *ext4_xattr_handlers[];
extern int ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i,
@@ -126,6 +121,9 @@ extern int ext4_xattr_ibody_inline_set(handle_t *handle, struct inode *inode,
struct ext4_xattr_info *i,
struct ext4_xattr_ibody_find *is);
+extern struct mb_cache *ext4_xattr_create_cache(char *name);
+extern void ext4_xattr_destroy_cache(struct mb_cache *);
+
#ifdef CONFIG_EXT4_FS_SECURITY
extern int ext4_init_security(handle_t *handle, struct inode *inode,
struct inode *dir, const struct qstr *qstr);