diff options
Diffstat (limited to 'fs/ext4')
35 files changed, 15371 insertions, 7605 deletions
diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig index 9ed1bb1f319..efea5d5c44c 100644 --- a/fs/ext4/Kconfig +++ b/fs/ext4/Kconfig @@ -2,6 +2,8 @@ config EXT4_FS tristate "The Extended 4 (ext4) filesystem" select JBD2 select CRC16 + select CRYPTO + select CRYPTO_CRC32C help This is the next generation of the ext3 filesystem. @@ -37,22 +39,9 @@ config EXT4_USE_FOR_EXT23 compiled kernel size by using one file system driver for ext2, ext3, and ext4 file systems. -config EXT4_FS_XATTR - bool "Ext4 extended attributes" - depends on EXT4_FS - default y - help - Extended attributes are name:value pairs associated with inodes by - the kernel or by users (see the attr(5) manual page, or visit - <http://acl.bestbits.at/> for details). - - If unsure, say N. - - You need this for POSIX ACL support on ext4. - config EXT4_FS_POSIX_ACL bool "Ext4 POSIX Access Control Lists" - depends on EXT4_FS_XATTR + depends on EXT4_FS select FS_POSIX_ACL help POSIX Access Control Lists (ACLs) support permissions for users and @@ -65,7 +54,7 @@ config EXT4_FS_POSIX_ACL config EXT4_FS_SECURITY bool "Ext4 Security Labels" - depends on EXT4_FS_XATTR + depends on EXT4_FS help Security labels support alternative access control models implemented by security modules like SELinux. This option @@ -82,4 +71,5 @@ config EXT4_DEBUG Enables run-time debugging support for the ext4 filesystem. If you select Y here, then you will be able to turn on debugging - with a command such as "echo 1 > /sys/kernel/debug/ext4/mballoc-debug" + with a command such as: + echo 1 > /sys/module/ext4/parameters/mballoc_debug diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile index 56fd8f86593..0310fec2ee3 100644 --- a/fs/ext4/Makefile +++ b/fs/ext4/Makefile @@ -7,8 +7,8 @@ obj-$(CONFIG_EXT4_FS) += ext4.o ext4-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o page-io.o \ ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \ ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o \ - mmp.o indirect.o + mmp.o indirect.o extents_status.o xattr.o xattr_user.o \ + xattr_trusted.o inline.o -ext4-$(CONFIG_EXT4_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o ext4-$(CONFIG_EXT4_FS_POSIX_ACL) += acl.o ext4-$(CONFIG_EXT4_FS_SECURITY) += xattr_security.o diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c index a5c29bb3b83..d40c8dbbb0d 100644 --- a/fs/ext4/acl.c +++ b/fs/ext4/acl.c @@ -55,16 +55,23 @@ ext4_acl_from_disk(const void *value, size_t size) case ACL_OTHER: value = (char *)value + sizeof(ext4_acl_entry_short); - acl->a_entries[n].e_id = ACL_UNDEFINED_ID; break; case ACL_USER: + value = (char *)value + sizeof(ext4_acl_entry); + if ((char *)value > end) + goto fail; + acl->a_entries[n].e_uid = + make_kuid(&init_user_ns, + le32_to_cpu(entry->e_id)); + break; case ACL_GROUP: value = (char *)value + sizeof(ext4_acl_entry); if ((char *)value > end) goto fail; - acl->a_entries[n].e_id = - le32_to_cpu(entry->e_id); + acl->a_entries[n].e_gid = + make_kgid(&init_user_ns, + le32_to_cpu(entry->e_id)); break; default: @@ -98,13 +105,19 @@ ext4_acl_to_disk(const struct posix_acl *acl, size_t *size) ext_acl->a_version = cpu_to_le32(EXT4_ACL_VERSION); e = (char *)ext_acl + sizeof(ext4_acl_header); for (n = 0; n < acl->a_count; n++) { + const struct posix_acl_entry *acl_e = &acl->a_entries[n]; ext4_acl_entry *entry = (ext4_acl_entry *)e; - entry->e_tag = cpu_to_le16(acl->a_entries[n].e_tag); - entry->e_perm = cpu_to_le16(acl->a_entries[n].e_perm); - switch (acl->a_entries[n].e_tag) { + entry->e_tag = cpu_to_le16(acl_e->e_tag); + entry->e_perm = cpu_to_le16(acl_e->e_perm); + switch (acl_e->e_tag) { case ACL_USER: + entry->e_id = cpu_to_le32( + from_kuid(&init_user_ns, acl_e->e_uid)); + e += sizeof(ext4_acl_entry); + break; case ACL_GROUP: - entry->e_id = cpu_to_le32(acl->a_entries[n].e_id); + entry->e_id = cpu_to_le32( + from_kgid(&init_user_ns, acl_e->e_gid)); e += sizeof(ext4_acl_entry); break; @@ -139,13 +152,6 @@ ext4_get_acl(struct inode *inode, int type) struct posix_acl *acl; int retval; - if (!test_opt(inode->i_sb, POSIX_ACL)) - return NULL; - - acl = get_cached_acl(inode, type); - if (acl != ACL_NOT_CACHED) - return acl; - switch (type) { case ACL_TYPE_ACCESS: name_index = EXT4_XATTR_INDEX_POSIX_ACL_ACCESS; @@ -183,7 +189,7 @@ ext4_get_acl(struct inode *inode, int type) * inode->i_mutex: down unless called from ext4_new_inode */ static int -ext4_set_acl(handle_t *handle, struct inode *inode, int type, +__ext4_set_acl(handle_t *handle, struct inode *inode, int type, struct posix_acl *acl) { int name_index; @@ -191,9 +197,6 @@ ext4_set_acl(handle_t *handle, struct inode *inode, int type, size_t size = 0; int error; - if (S_ISLNK(inode->i_mode)) - return -EOPNOTSUPP; - switch (type) { case ACL_TYPE_ACCESS: name_index = EXT4_XATTR_INDEX_POSIX_ACL_ACCESS; @@ -235,205 +238,51 @@ ext4_set_acl(handle_t *handle, struct inode *inode, int type, return error; } -/* - * Initialize the ACLs of a new inode. Called from ext4_new_inode. - * - * dir->i_mutex: down - * inode->i_mutex: up (access to inode is still exclusive) - */ int -ext4_init_acl(handle_t *handle, struct inode *inode, struct inode *dir) +ext4_set_acl(struct inode *inode, struct posix_acl *acl, int type) { - struct posix_acl *acl = NULL; - int error = 0; + handle_t *handle; + int error, retries = 0; - if (!S_ISLNK(inode->i_mode)) { - if (test_opt(dir->i_sb, POSIX_ACL)) { - acl = ext4_get_acl(dir, ACL_TYPE_DEFAULT); - if (IS_ERR(acl)) - return PTR_ERR(acl); - } - if (!acl) - inode->i_mode &= ~current_umask(); - } - if (test_opt(inode->i_sb, POSIX_ACL) && acl) { - if (S_ISDIR(inode->i_mode)) { - error = ext4_set_acl(handle, inode, - ACL_TYPE_DEFAULT, acl); - if (error) - goto cleanup; - } - error = posix_acl_create(&acl, GFP_NOFS, &inode->i_mode); - if (error < 0) - return error; +retry: + handle = ext4_journal_start(inode, EXT4_HT_XATTR, + ext4_jbd2_credits_xattr(inode)); + if (IS_ERR(handle)) + return PTR_ERR(handle); - if (error > 0) { - /* This is an extended ACL */ - error = ext4_set_acl(handle, inode, ACL_TYPE_ACCESS, acl); - } - } -cleanup: - posix_acl_release(acl); + error = __ext4_set_acl(handle, inode, type, acl); + ext4_journal_stop(handle); + if (error == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) + goto retry; return error; } /* - * Does chmod for an inode that may have an Access Control List. The - * inode->i_mode field must be updated to the desired value by the caller - * before calling this function. - * Returns 0 on success, or a negative error number. - * - * We change the ACL rather than storing some ACL entries in the file - * mode permission bits (which would be more efficient), because that - * would break once additional permissions (like ACL_APPEND, ACL_DELETE - * for directories) are added. There are no more bits available in the - * file mode. + * Initialize the ACLs of a new inode. Called from ext4_new_inode. * - * inode->i_mutex: down + * dir->i_mutex: down + * inode->i_mutex: up (access to inode is still exclusive) */ int -ext4_acl_chmod(struct inode *inode) +ext4_init_acl(handle_t *handle, struct inode *inode, struct inode *dir) { - struct posix_acl *acl; - handle_t *handle; - int retries = 0; + struct posix_acl *default_acl, *acl; int error; - - if (S_ISLNK(inode->i_mode)) - return -EOPNOTSUPP; - if (!test_opt(inode->i_sb, POSIX_ACL)) - return 0; - acl = ext4_get_acl(inode, ACL_TYPE_ACCESS); - if (IS_ERR(acl) || !acl) - return PTR_ERR(acl); - error = posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode); + error = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl); if (error) return error; -retry: - handle = ext4_journal_start(inode, - EXT4_DATA_TRANS_BLOCKS(inode->i_sb)); - if (IS_ERR(handle)) { - error = PTR_ERR(handle); - ext4_std_error(inode->i_sb, error); - goto out; - } - error = ext4_set_acl(handle, inode, ACL_TYPE_ACCESS, acl); - ext4_journal_stop(handle); - if (error == -ENOSPC && - ext4_should_retry_alloc(inode->i_sb, &retries)) - goto retry; -out: - posix_acl_release(acl); - return error; -} -/* - * Extended attribute handlers - */ -static size_t -ext4_xattr_list_acl_access(struct dentry *dentry, char *list, size_t list_len, - const char *name, size_t name_len, int type) -{ - const size_t size = sizeof(POSIX_ACL_XATTR_ACCESS); - - if (!test_opt(dentry->d_sb, POSIX_ACL)) - return 0; - if (list && size <= list_len) - memcpy(list, POSIX_ACL_XATTR_ACCESS, size); - return size; -} - -static size_t -ext4_xattr_list_acl_default(struct dentry *dentry, char *list, size_t list_len, - const char *name, size_t name_len, int type) -{ - const size_t size = sizeof(POSIX_ACL_XATTR_DEFAULT); - - if (!test_opt(dentry->d_sb, POSIX_ACL)) - return 0; - if (list && size <= list_len) - memcpy(list, POSIX_ACL_XATTR_DEFAULT, size); - return size; -} - -static int -ext4_xattr_get_acl(struct dentry *dentry, const char *name, void *buffer, - size_t size, int type) -{ - struct posix_acl *acl; - int error; - - if (strcmp(name, "") != 0) - return -EINVAL; - if (!test_opt(dentry->d_sb, POSIX_ACL)) - return -EOPNOTSUPP; - - acl = ext4_get_acl(dentry->d_inode, type); - if (IS_ERR(acl)) - return PTR_ERR(acl); - if (acl == NULL) - return -ENODATA; - error = posix_acl_to_xattr(acl, buffer, size); - posix_acl_release(acl); - - return error; -} - -static int -ext4_xattr_set_acl(struct dentry *dentry, const char *name, const void *value, - size_t size, int flags, int type) -{ - struct inode *inode = dentry->d_inode; - handle_t *handle; - struct posix_acl *acl; - int error, retries = 0; - - if (strcmp(name, "") != 0) - return -EINVAL; - if (!test_opt(inode->i_sb, POSIX_ACL)) - return -EOPNOTSUPP; - if (!inode_owner_or_capable(inode)) - return -EPERM; - - if (value) { - acl = posix_acl_from_xattr(value, size); - if (IS_ERR(acl)) - return PTR_ERR(acl); - else if (acl) { - error = posix_acl_valid(acl); - if (error) - goto release_and_out; - } - } else - acl = NULL; - -retry: - handle = ext4_journal_start(inode, EXT4_DATA_TRANS_BLOCKS(inode->i_sb)); - if (IS_ERR(handle)) - return PTR_ERR(handle); - error = ext4_set_acl(handle, inode, type, acl); - ext4_journal_stop(handle); - if (error == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) - goto retry; - -release_and_out: - posix_acl_release(acl); + if (default_acl) { + error = __ext4_set_acl(handle, inode, ACL_TYPE_DEFAULT, + default_acl); + posix_acl_release(default_acl); + } + if (acl) { + if (!error) + error = __ext4_set_acl(handle, inode, ACL_TYPE_ACCESS, + acl); + posix_acl_release(acl); + } return error; } - -const struct xattr_handler ext4_xattr_acl_access_handler = { - .prefix = POSIX_ACL_XATTR_ACCESS, - .flags = ACL_TYPE_ACCESS, - .list = ext4_xattr_list_acl_access, - .get = ext4_xattr_get_acl, - .set = ext4_xattr_set_acl, -}; - -const struct xattr_handler ext4_xattr_acl_default_handler = { - .prefix = POSIX_ACL_XATTR_DEFAULT, - .flags = ACL_TYPE_DEFAULT, - .list = ext4_xattr_list_acl_default, - .get = ext4_xattr_get_acl, - .set = ext4_xattr_set_acl, -}; diff --git a/fs/ext4/acl.h b/fs/ext4/acl.h index 18cb39ed7c7..da2c79577d7 100644 --- a/fs/ext4/acl.h +++ b/fs/ext4/acl.h @@ -55,18 +55,13 @@ static inline int ext4_acl_count(size_t size) /* acl.c */ struct posix_acl *ext4_get_acl(struct inode *inode, int type); -extern int ext4_acl_chmod(struct inode *); +int ext4_set_acl(struct inode *inode, struct posix_acl *acl, int type); extern int ext4_init_acl(handle_t *, struct inode *, struct inode *); #else /* CONFIG_EXT4_FS_POSIX_ACL */ #include <linux/sched.h> #define ext4_get_acl NULL - -static inline int -ext4_acl_chmod(struct inode *inode) -{ - return 0; -} +#define ext4_set_acl NULL static inline int ext4_init_acl(handle_t *handle, struct inode *inode, struct inode *dir) diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index f9e2cd8cf71..fca382037dd 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -30,6 +30,23 @@ static unsigned ext4_num_base_meta_clusters(struct super_block *sb, */ /* + * Calculate block group number for a given block number + */ +ext4_group_t ext4_get_group_number(struct super_block *sb, + ext4_fsblk_t block) +{ + ext4_group_t group; + + if (test_opt2(sb, STD_GROUP_SIZE)) + group = (block - + le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block)) >> + (EXT4_BLOCK_SIZE_BITS(sb) + EXT4_CLUSTER_BITS(sb) + 3); + else + ext4_get_group_no_and_offset(sb, block, &group, NULL); + return group; +} + +/* * Calculate the block group number and offset into the block/cluster * allocation bitmap, given a block number */ @@ -49,22 +66,26 @@ void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr, } -static int ext4_block_in_group(struct super_block *sb, ext4_fsblk_t block, - ext4_group_t block_group) +/* + * Check whether the 'block' lives within the 'block_group'. Returns 1 if so + * and 0 otherwise. + */ +static inline int ext4_block_in_group(struct super_block *sb, + ext4_fsblk_t block, + ext4_group_t block_group) { ext4_group_t actual_group; - ext4_get_group_no_and_offset(sb, block, &actual_group, NULL); - if (actual_group == block_group) - return 1; - return 0; + + actual_group = ext4_get_group_number(sb, block); + return (actual_group == block_group) ? 1 : 0; } /* Return the number of clusters used for file system metadata; this * represents the overhead needed by the file system. */ -unsigned ext4_num_overhead_clusters(struct super_block *sb, - ext4_group_t block_group, - struct ext4_group_desc *gdp) +static unsigned ext4_num_overhead_clusters(struct super_block *sb, + ext4_group_t block_group, + struct ext4_group_desc *gdp) { unsigned num_clusters; int block_cluster = -1, inode_cluster = -1, itbl_cluster = -1, i, c; @@ -90,8 +111,8 @@ unsigned ext4_num_overhead_clusters(struct super_block *sb, * unusual file system layouts. */ if (ext4_block_in_group(sb, ext4_block_bitmap(sb, gdp), block_group)) { - block_cluster = EXT4_B2C(sbi, (start - - ext4_block_bitmap(sb, gdp))); + block_cluster = EXT4_B2C(sbi, + ext4_block_bitmap(sb, gdp) - start); if (block_cluster < num_clusters) block_cluster = -1; else if (block_cluster == num_clusters) { @@ -102,7 +123,7 @@ unsigned ext4_num_overhead_clusters(struct super_block *sb, if (ext4_block_in_group(sb, ext4_inode_bitmap(sb, gdp), block_group)) { inode_cluster = EXT4_B2C(sbi, - start - ext4_inode_bitmap(sb, gdp)); + ext4_inode_bitmap(sb, gdp) - start); if (inode_cluster < num_clusters) inode_cluster = -1; else if (inode_cluster == num_clusters) { @@ -114,7 +135,7 @@ unsigned ext4_num_overhead_clusters(struct super_block *sb, itbl_blk = ext4_inode_table(sb, gdp); for (i = 0; i < sbi->s_itb_per_group; i++) { if (ext4_block_in_group(sb, itbl_blk + i, block_group)) { - c = EXT4_B2C(sbi, start - itbl_blk + i); + c = EXT4_B2C(sbi, itbl_blk + i - start); if ((c < num_clusters) || (c == inode_cluster) || (c == block_cluster) || (c == itbl_cluster)) continue; @@ -155,25 +176,35 @@ static unsigned int num_clusters_in_group(struct super_block *sb, } /* Initializes an uninitialized block bitmap */ -void ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh, - ext4_group_t block_group, - struct ext4_group_desc *gdp) +static void ext4_init_block_bitmap(struct super_block *sb, + struct buffer_head *bh, + ext4_group_t block_group, + struct ext4_group_desc *gdp) { unsigned int bit, bit_max; struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_fsblk_t start, tmp; int flex_bg = 0; + struct ext4_group_info *grp; J_ASSERT_BH(bh, buffer_locked(bh)); /* If checksum is bad mark all blocks used to prevent allocation * essentially implementing a per-group read-only flag. */ - if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) { + if (!ext4_group_desc_csum_verify(sb, block_group, gdp)) { ext4_error(sb, "Checksum bad for group %u", block_group); - ext4_free_group_clusters_set(sb, gdp, 0); - ext4_free_inodes_set(sb, gdp, 0); - ext4_itable_unused_set(sb, gdp, 0); - memset(bh->b_data, 0xff, sb->s_blocksize); + grp = ext4_get_group_info(sb, block_group); + if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) + percpu_counter_sub(&sbi->s_freeclusters_counter, + grp->bb_free); + set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state); + if (!EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) { + int count; + count = ext4_free_inodes_count(sb, gdp); + percpu_counter_sub(&sbi->s_freeinodes_counter, + count); + } + set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state); return; } memset(bh->b_data, 0, sb->s_blocksize); @@ -210,6 +241,8 @@ void ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh, */ ext4_mark_bitmap_end(num_clusters_in_group(sb, block_group), sb->s_blocksize * 8, bh->b_data); + ext4_block_bitmap_csum_set(sb, block_group, gdp, bh); + ext4_group_desc_csum_set(sb, block_group, gdp); } /* Return the number of free blocks in a block group. It is used when @@ -275,14 +308,19 @@ struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb, return desc; } -static int ext4_valid_block_bitmap(struct super_block *sb, - struct ext4_group_desc *desc, - unsigned int block_group, - struct buffer_head *bh) +/* + * Return the block number which was discovered to be invalid, or 0 if + * the block bitmap is valid. + */ +static ext4_fsblk_t ext4_valid_block_bitmap(struct super_block *sb, + struct ext4_group_desc *desc, + ext4_group_t block_group, + struct buffer_head *bh) { + struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_grpblk_t offset; ext4_grpblk_t next_zero_bit; - ext4_fsblk_t bitmap_blk; + ext4_fsblk_t blk; ext4_fsblk_t group_first_block; if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) { @@ -292,41 +330,77 @@ static int ext4_valid_block_bitmap(struct super_block *sb, * or it has to also read the block group where the bitmaps * are located to verify they are set. */ - return 1; + return 0; } group_first_block = ext4_group_first_block_no(sb, block_group); /* check whether block bitmap block number is set */ - bitmap_blk = ext4_block_bitmap(sb, desc); - offset = bitmap_blk - group_first_block; - if (!ext4_test_bit(offset, bh->b_data)) + blk = ext4_block_bitmap(sb, desc); + offset = blk - group_first_block; + if (!ext4_test_bit(EXT4_B2C(sbi, offset), bh->b_data)) /* bad block bitmap */ - goto err_out; + return blk; /* check whether the inode bitmap block number is set */ - bitmap_blk = ext4_inode_bitmap(sb, desc); - offset = bitmap_blk - group_first_block; - if (!ext4_test_bit(offset, bh->b_data)) + blk = ext4_inode_bitmap(sb, desc); + offset = blk - group_first_block; + if (!ext4_test_bit(EXT4_B2C(sbi, offset), bh->b_data)) /* bad block bitmap */ - goto err_out; + return blk; /* check whether the inode table block number is set */ - bitmap_blk = ext4_inode_table(sb, desc); - offset = bitmap_blk - group_first_block; + blk = ext4_inode_table(sb, desc); + offset = blk - group_first_block; next_zero_bit = ext4_find_next_zero_bit(bh->b_data, - offset + EXT4_SB(sb)->s_itb_per_group, - offset); - if (next_zero_bit >= offset + EXT4_SB(sb)->s_itb_per_group) - /* good bitmap for inode tables */ - return 1; - -err_out: - ext4_error(sb, "Invalid block bitmap - block_group = %d, block = %llu", - block_group, bitmap_blk); + EXT4_B2C(sbi, offset + EXT4_SB(sb)->s_itb_per_group), + EXT4_B2C(sbi, offset)); + if (next_zero_bit < + EXT4_B2C(sbi, offset + EXT4_SB(sb)->s_itb_per_group)) + /* bad bitmap for inode tables */ + return blk; return 0; } + +static void ext4_validate_block_bitmap(struct super_block *sb, + struct ext4_group_desc *desc, + ext4_group_t block_group, + struct buffer_head *bh) +{ + ext4_fsblk_t blk; + struct ext4_group_info *grp = ext4_get_group_info(sb, block_group); + struct ext4_sb_info *sbi = EXT4_SB(sb); + + if (buffer_verified(bh)) + return; + + ext4_lock_group(sb, block_group); + blk = ext4_valid_block_bitmap(sb, desc, block_group, bh); + if (unlikely(blk != 0)) { + ext4_unlock_group(sb, block_group); + ext4_error(sb, "bg %u: block %llu: invalid block bitmap", + block_group, blk); + if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) + percpu_counter_sub(&sbi->s_freeclusters_counter, + grp->bb_free); + set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state); + return; + } + if (unlikely(!ext4_block_bitmap_csum_verify(sb, block_group, + desc, bh))) { + ext4_unlock_group(sb, block_group); + ext4_error(sb, "bg %u: bad block bitmap checksum", block_group); + if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) + percpu_counter_sub(&sbi->s_freeclusters_counter, + grp->bb_free); + set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state); + return; + } + set_buffer_verified(bh); + ext4_unlock_group(sb, block_group); +} + /** - * ext4_read_block_bitmap() + * ext4_read_block_bitmap_nowait() * @sb: super block * @block_group: given block group * @@ -336,10 +410,10 @@ err_out: * Return buffer_head on success or NULL in case of failure. */ struct buffer_head * -ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group) +ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group) { struct ext4_group_desc *desc; - struct buffer_head *bh = NULL; + struct buffer_head *bh; ext4_fsblk_t bitmap_blk; desc = ext4_get_group_desc(sb, block_group, NULL); @@ -348,19 +422,19 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group) bitmap_blk = ext4_block_bitmap(sb, desc); bh = sb_getblk(sb, bitmap_blk); if (unlikely(!bh)) { - ext4_error(sb, "Cannot read block bitmap - " - "block_group = %u, block_bitmap = %llu", - block_group, bitmap_blk); + ext4_error(sb, "Cannot get buffer for block bitmap - " + "block_group = %u, block_bitmap = %llu", + block_group, bitmap_blk); return NULL; } if (bitmap_uptodate(bh)) - return bh; + goto verify; lock_buffer(bh); if (bitmap_uptodate(bh)) { unlock_buffer(bh); - return bh; + goto verify; } ext4_lock_group(sb, block_group); if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { @@ -379,28 +453,62 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group) */ set_bitmap_uptodate(bh); unlock_buffer(bh); - return bh; + goto verify; } /* - * submit the buffer_head for read. We can - * safely mark the bitmap as uptodate now. - * We do it here so the bitmap uptodate bit - * get set with buffer lock held. + * submit the buffer_head for reading */ + set_buffer_new(bh); trace_ext4_read_block_bitmap_load(sb, block_group); - set_bitmap_uptodate(bh); - if (bh_submit_read(bh) < 0) { - put_bh(bh); + bh->b_end_io = ext4_end_bitmap_read; + get_bh(bh); + submit_bh(READ | REQ_META | REQ_PRIO, bh); + return bh; +verify: + ext4_validate_block_bitmap(sb, desc, block_group, bh); + if (buffer_verified(bh)) + return bh; + put_bh(bh); + return NULL; +} + +/* Returns 0 on success, 1 on error */ +int ext4_wait_block_bitmap(struct super_block *sb, ext4_group_t block_group, + struct buffer_head *bh) +{ + struct ext4_group_desc *desc; + + if (!buffer_new(bh)) + return 0; + desc = ext4_get_group_desc(sb, block_group, NULL); + if (!desc) + return 1; + wait_on_buffer(bh); + if (!buffer_uptodate(bh)) { ext4_error(sb, "Cannot read block bitmap - " - "block_group = %u, block_bitmap = %llu", - block_group, bitmap_blk); + "block_group = %u, block_bitmap = %llu", + block_group, (unsigned long long) bh->b_blocknr); + return 1; + } + clear_buffer_new(bh); + /* Panic or remount fs read-only if block bitmap is invalid */ + ext4_validate_block_bitmap(sb, desc, block_group, bh); + /* ...but check for error just in case errors=continue. */ + return !buffer_verified(bh); +} + +struct buffer_head * +ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group) +{ + struct buffer_head *bh; + + bh = ext4_read_block_bitmap_nowait(sb, block_group); + if (!bh) + return NULL; + if (ext4_wait_block_bitmap(sb, block_group, bh)) { + put_bh(bh); return NULL; } - ext4_valid_block_bitmap(sb, desc, block_group, bh); - /* - * file system mounted not to panic on error, - * continue with corrupt bitmap - */ return bh; } @@ -416,31 +524,44 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group) static int ext4_has_free_clusters(struct ext4_sb_info *sbi, s64 nclusters, unsigned int flags) { - s64 free_clusters, dirty_clusters, root_clusters; + s64 free_clusters, dirty_clusters, rsv, resv_clusters; struct percpu_counter *fcc = &sbi->s_freeclusters_counter; struct percpu_counter *dcc = &sbi->s_dirtyclusters_counter; free_clusters = percpu_counter_read_positive(fcc); dirty_clusters = percpu_counter_read_positive(dcc); - root_clusters = EXT4_B2C(sbi, ext4_r_blocks_count(sbi->s_es)); + resv_clusters = atomic64_read(&sbi->s_resv_clusters); + + /* + * r_blocks_count should always be multiple of the cluster ratio so + * we are safe to do a plane bit shift only. + */ + rsv = (ext4_r_blocks_count(sbi->s_es) >> sbi->s_cluster_bits) + + resv_clusters; - if (free_clusters - (nclusters + root_clusters + dirty_clusters) < + if (free_clusters - (nclusters + rsv + dirty_clusters) < EXT4_FREECLUSTERS_WATERMARK) { - free_clusters = EXT4_C2B(sbi, percpu_counter_sum_positive(fcc)); + free_clusters = percpu_counter_sum_positive(fcc); dirty_clusters = percpu_counter_sum_positive(dcc); } /* Check whether we have space after accounting for current * dirty clusters & root reserved clusters. */ - if (free_clusters >= ((root_clusters + nclusters) + dirty_clusters)) + if (free_clusters >= (rsv + nclusters + dirty_clusters)) return 1; /* Hm, nope. Are (enough) root reserved clusters available? */ - if (sbi->s_resuid == current_fsuid() || - ((sbi->s_resgid != 0) && in_group_p(sbi->s_resgid)) || + if (uid_eq(sbi->s_resuid, current_fsuid()) || + (!gid_eq(sbi->s_resgid, GLOBAL_ROOT_GID) && in_group_p(sbi->s_resgid)) || capable(CAP_SYS_RESOURCE) || - (flags & EXT4_MB_USE_ROOT_BLOCKS)) { + (flags & EXT4_MB_USE_ROOT_BLOCKS)) { + if (free_clusters >= (nclusters + dirty_clusters + + resv_clusters)) + return 1; + } + /* No free blocks. Let's see if we can dip into reserved pool */ + if (flags & EXT4_MB_USE_RESERVED) { if (free_clusters >= (nclusters + dirty_clusters)) return 1; } @@ -538,6 +659,7 @@ ext4_fsblk_t ext4_count_free_clusters(struct super_block *sb) struct ext4_group_desc *gdp; ext4_group_t i; ext4_group_t ngroups = ext4_get_groups_count(sb); + struct ext4_group_info *grp; #ifdef EXT4FS_DEBUG struct ext4_super_block *es; ext4_fsblk_t bitmap_count; @@ -553,13 +675,18 @@ ext4_fsblk_t ext4_count_free_clusters(struct super_block *sb) gdp = ext4_get_group_desc(sb, i, NULL); if (!gdp) continue; - desc_count += ext4_free_group_clusters(sb, gdp); + grp = NULL; + if (EXT4_SB(sb)->s_group_info) + grp = ext4_get_group_info(sb, i); + if (!grp || !EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) + desc_count += ext4_free_group_clusters(sb, gdp); brelse(bitmap_bh); bitmap_bh = ext4_read_block_bitmap(sb, i); if (bitmap_bh == NULL) continue; - x = ext4_count_free(bitmap_bh, sb->s_blocksize); + x = ext4_count_free(bitmap_bh->b_data, + EXT4_CLUSTERS_PER_GROUP(sb) / 8); printk(KERN_DEBUG "group %u: stored = %d, counted = %u\n", i, ext4_free_group_clusters(sb, gdp), x); bitmap_count += x; @@ -567,7 +694,7 @@ ext4_fsblk_t ext4_count_free_clusters(struct super_block *sb) brelse(bitmap_bh); printk(KERN_DEBUG "ext4_count_free_clusters: stored = %llu" ", computed = %llu, %llu\n", - EXT4_B2C(EXT4_SB(sb), ext4_free_blocks_count(es)), + EXT4_NUM_B2C(EXT4_SB(sb), ext4_free_blocks_count(es)), desc_count, bitmap_count); return bitmap_count; #else @@ -576,7 +703,11 @@ ext4_fsblk_t ext4_count_free_clusters(struct super_block *sb) gdp = ext4_get_group_desc(sb, i, NULL); if (!gdp) continue; - desc_count += ext4_free_group_clusters(sb, gdp); + grp = NULL; + if (EXT4_SB(sb)->s_group_info) + grp = ext4_get_group_info(sb, i); + if (!grp || !EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) + desc_count += ext4_free_group_clusters(sb, gdp); } return desc_count; @@ -585,21 +716,15 @@ ext4_fsblk_t ext4_count_free_clusters(struct super_block *sb) static inline int test_root(ext4_group_t a, int b) { - int num = b; - - while (a > num) - num *= b; - return num == a; -} - -static int ext4_group_sparse(ext4_group_t group) -{ - if (group <= 1) - return 1; - if (!(group & 1)) - return 0; - return (test_root(group, 7) || test_root(group, 5) || - test_root(group, 3)); + while (1) { + if (a < b) + return 0; + if (a == b) + return 1; + if ((a % b) != 0) + return 0; + a = a / b; + } } /** @@ -612,11 +737,26 @@ static int ext4_group_sparse(ext4_group_t group) */ int ext4_bg_has_super(struct super_block *sb, ext4_group_t group) { - if (EXT4_HAS_RO_COMPAT_FEATURE(sb, - EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER) && - !ext4_group_sparse(group)) + struct ext4_super_block *es = EXT4_SB(sb)->s_es; + + if (group == 0) + return 1; + if (EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_SPARSE_SUPER2)) { + if (group == le32_to_cpu(es->s_backup_bgs[0]) || + group == le32_to_cpu(es->s_backup_bgs[1])) + return 1; return 0; - return 1; + } + if ((group <= 1) || !EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER)) + return 1; + if (!(group & 1)) + return 0; + if (test_root(group, 3) || (test_root(group, 5)) || + test_root(group, 7)) + return 1; + + return 0; } static unsigned long ext4_bg_num_gdb_meta(struct super_block *sb, diff --git a/fs/ext4/bitmap.c b/fs/ext4/bitmap.c index fa3af81ac56..3285aa5a706 100644 --- a/fs/ext4/bitmap.c +++ b/fs/ext4/bitmap.c @@ -11,21 +11,92 @@ #include <linux/jbd2.h> #include "ext4.h" -#ifdef EXT4FS_DEBUG +unsigned int ext4_count_free(char *bitmap, unsigned int numchars) +{ + return numchars * BITS_PER_BYTE - memweight(bitmap, numchars); +} + +int ext4_inode_bitmap_csum_verify(struct super_block *sb, ext4_group_t group, + struct ext4_group_desc *gdp, + struct buffer_head *bh, int sz) +{ + __u32 hi; + __u32 provided, calculated; + struct ext4_sb_info *sbi = EXT4_SB(sb); + + if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + return 1; -static const int nibblemap[] = {4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0}; + provided = le16_to_cpu(gdp->bg_inode_bitmap_csum_lo); + calculated = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)bh->b_data, sz); + if (sbi->s_desc_size >= EXT4_BG_INODE_BITMAP_CSUM_HI_END) { + hi = le16_to_cpu(gdp->bg_inode_bitmap_csum_hi); + provided |= (hi << 16); + } else + calculated &= 0xFFFF; -unsigned int ext4_count_free(struct buffer_head *map, unsigned int numchars) + return provided == calculated; +} + +void ext4_inode_bitmap_csum_set(struct super_block *sb, ext4_group_t group, + struct ext4_group_desc *gdp, + struct buffer_head *bh, int sz) { - unsigned int i, sum = 0; - - if (!map) - return 0; - for (i = 0; i < numchars; i++) - sum += nibblemap[map->b_data[i] & 0xf] + - nibblemap[(map->b_data[i] >> 4) & 0xf]; - return sum; + __u32 csum; + struct ext4_sb_info *sbi = EXT4_SB(sb); + + if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + return; + + csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)bh->b_data, sz); + gdp->bg_inode_bitmap_csum_lo = cpu_to_le16(csum & 0xFFFF); + if (sbi->s_desc_size >= EXT4_BG_INODE_BITMAP_CSUM_HI_END) + gdp->bg_inode_bitmap_csum_hi = cpu_to_le16(csum >> 16); } -#endif /* EXT4FS_DEBUG */ +int ext4_block_bitmap_csum_verify(struct super_block *sb, ext4_group_t group, + struct ext4_group_desc *gdp, + struct buffer_head *bh) +{ + __u32 hi; + __u32 provided, calculated; + struct ext4_sb_info *sbi = EXT4_SB(sb); + int sz = EXT4_CLUSTERS_PER_GROUP(sb) / 8; + if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + return 1; + + provided = le16_to_cpu(gdp->bg_block_bitmap_csum_lo); + calculated = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)bh->b_data, sz); + if (sbi->s_desc_size >= EXT4_BG_BLOCK_BITMAP_CSUM_HI_END) { + hi = le16_to_cpu(gdp->bg_block_bitmap_csum_hi); + provided |= (hi << 16); + } else + calculated &= 0xFFFF; + + if (provided == calculated) + return 1; + + return 0; +} + +void ext4_block_bitmap_csum_set(struct super_block *sb, ext4_group_t group, + struct ext4_group_desc *gdp, + struct buffer_head *bh) +{ + int sz = EXT4_CLUSTERS_PER_GROUP(sb) / 8; + __u32 csum; + struct ext4_sb_info *sbi = EXT4_SB(sb); + + if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + return; + + csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)bh->b_data, sz); + gdp->bg_block_bitmap_csum_lo = cpu_to_le16(csum & 0xFFFF); + if (sbi->s_desc_size >= EXT4_BG_BLOCK_BITMAP_CSUM_HI_END) + gdp->bg_block_bitmap_csum_hi = cpu_to_le16(csum >> 16); +} diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c index 3f11656bd72..41eb9dcfac7 100644 --- a/fs/ext4/block_validity.c +++ b/fs/ext4/block_validity.c @@ -180,37 +180,12 @@ int ext4_setup_system_zone(struct super_block *sb) /* Called when the filesystem is unmounted */ void ext4_release_system_zone(struct super_block *sb) { - struct rb_node *n = EXT4_SB(sb)->system_blks.rb_node; - struct rb_node *parent; - struct ext4_system_zone *entry; + struct ext4_system_zone *entry, *n; - while (n) { - /* Do the node's children first */ - if (n->rb_left) { - n = n->rb_left; - continue; - } - if (n->rb_right) { - n = n->rb_right; - continue; - } - /* - * The node has no children; free it, and then zero - * out parent's link to it. Finally go to the - * beginning of the loop and try to free the parent - * node. - */ - parent = rb_parent(n); - entry = rb_entry(n, struct ext4_system_zone, node); + rbtree_postorder_for_each_entry_safe(entry, n, + &EXT4_SB(sb)->system_blks, node) kmem_cache_free(ext4_system_zone_cachep, entry); - if (!parent) - EXT4_SB(sb)->system_blks = RB_ROOT; - else if (parent->rb_left == n) - parent->rb_left = NULL; - else if (parent->rb_right == n) - parent->rb_right = NULL; - n = parent; - } + EXT4_SB(sb)->system_blks = RB_ROOT; } diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index 164c56092e5..ef1bed66c14 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -27,48 +27,43 @@ #include <linux/slab.h> #include <linux/rbtree.h> #include "ext4.h" +#include "xattr.h" -static unsigned char ext4_filetype_table[] = { - DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK -}; - -static int ext4_readdir(struct file *, void *, filldir_t); -static int ext4_dx_readdir(struct file *filp, - void *dirent, filldir_t filldir); -static int ext4_release_dir(struct inode *inode, - struct file *filp); - -const struct file_operations ext4_dir_operations = { - .llseek = ext4_llseek, - .read = generic_read_dir, - .readdir = ext4_readdir, /* we take BKL. needed?*/ - .unlocked_ioctl = ext4_ioctl, -#ifdef CONFIG_COMPAT - .compat_ioctl = ext4_compat_ioctl, -#endif - .fsync = ext4_sync_file, - .release = ext4_release_dir, -}; - +static int ext4_dx_readdir(struct file *, struct dir_context *); -static unsigned char get_dtype(struct super_block *sb, int filetype) +/** + * Check if the given dir-inode refers to an htree-indexed directory + * (or a directory which could potentially get converted to use htree + * indexing). + * + * Return 1 if it is a dx dir, 0 if not + */ +static int is_dx_dir(struct inode *inode) { - if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FILETYPE) || - (filetype >= EXT4_FT_MAX)) - return DT_UNKNOWN; + struct super_block *sb = inode->i_sb; - return (ext4_filetype_table[filetype]); + if (EXT4_HAS_COMPAT_FEATURE(inode->i_sb, + EXT4_FEATURE_COMPAT_DIR_INDEX) && + ((ext4_test_inode_flag(inode, EXT4_INODE_INDEX)) || + ((inode->i_size >> sb->s_blocksize_bits) == 1) || + ext4_has_inline_data(inode))) + return 1; + + return 0; } /* * Return 0 if the directory entry is OK, and 1 if there is a problem * * Note: this is the opposite of what ext2 and ext3 historically returned... + * + * bh passed here can be an inode block or a dir data block, depending + * on the inode inline data flag. */ int __ext4_check_dir_entry(const char *function, unsigned int line, struct inode *dir, struct file *filp, struct ext4_dir_entry_2 *de, - struct buffer_head *bh, + struct buffer_head *bh, char *buf, int size, unsigned int offset) { const char *error_msg = NULL; @@ -81,9 +76,8 @@ int __ext4_check_dir_entry(const char *function, unsigned int line, error_msg = "rec_len % 4 != 0"; else if (unlikely(rlen < EXT4_DIR_REC_LEN(de->name_len))) error_msg = "rec_len is too small for name_len"; - else if (unlikely(((char *) de - bh->b_data) + rlen > - dir->i_sb->s_blocksize)) - error_msg = "directory entry across blocks"; + else if (unlikely(((char *) de - buf) + rlen > size)) + error_msg = "directory entry across range"; else if (unlikely(le32_to_cpu(de->inode) > le32_to_cpu(EXT4_SB(dir->i_sb)->s_es->s_inodes_count))) error_msg = "inode out of bounds"; @@ -91,73 +85,72 @@ int __ext4_check_dir_entry(const char *function, unsigned int line, return 0; if (filp) - ext4_error_file(filp, function, line, bh ? bh->b_blocknr : 0, + ext4_error_file(filp, function, line, bh->b_blocknr, "bad entry in directory: %s - offset=%u(%u), " "inode=%u, rec_len=%d, name_len=%d", - error_msg, (unsigned) (offset%bh->b_size), + error_msg, (unsigned) (offset % size), offset, le32_to_cpu(de->inode), rlen, de->name_len); else - ext4_error_inode(dir, function, line, bh ? bh->b_blocknr : 0, + ext4_error_inode(dir, function, line, bh->b_blocknr, "bad entry in directory: %s - offset=%u(%u), " "inode=%u, rec_len=%d, name_len=%d", - error_msg, (unsigned) (offset%bh->b_size), + error_msg, (unsigned) (offset % size), offset, le32_to_cpu(de->inode), rlen, de->name_len); return 1; } -static int ext4_readdir(struct file *filp, - void *dirent, filldir_t filldir) +static int ext4_readdir(struct file *file, struct dir_context *ctx) { - int error = 0; unsigned int offset; - int i, stored; + int i; struct ext4_dir_entry_2 *de; - struct super_block *sb; int err; - struct inode *inode = filp->f_path.dentry->d_inode; - int ret = 0; + struct inode *inode = file_inode(file); + struct super_block *sb = inode->i_sb; int dir_has_error = 0; - sb = inode->i_sb; - - if (EXT4_HAS_COMPAT_FEATURE(inode->i_sb, - EXT4_FEATURE_COMPAT_DIR_INDEX) && - ((ext4_test_inode_flag(inode, EXT4_INODE_INDEX)) || - ((inode->i_size >> sb->s_blocksize_bits) == 1))) { - err = ext4_dx_readdir(filp, dirent, filldir); + if (is_dx_dir(inode)) { + err = ext4_dx_readdir(file, ctx); if (err != ERR_BAD_DX_DIR) { - ret = err; - goto out; + return err; } /* * We don't set the inode dirty flag since it's not * critical that it get flushed back to the disk. */ - ext4_clear_inode_flag(filp->f_path.dentry->d_inode, + ext4_clear_inode_flag(file_inode(file), EXT4_INODE_INDEX); } - stored = 0; - offset = filp->f_pos & (sb->s_blocksize - 1); - while (!error && !stored && filp->f_pos < inode->i_size) { + if (ext4_has_inline_data(inode)) { + int has_inline_data = 1; + int ret = ext4_read_inline_dir(file, ctx, + &has_inline_data); + if (has_inline_data) + return ret; + } + + offset = ctx->pos & (sb->s_blocksize - 1); + + while (ctx->pos < inode->i_size) { struct ext4_map_blocks map; struct buffer_head *bh = NULL; - map.m_lblk = filp->f_pos >> EXT4_BLOCK_SIZE_BITS(sb); + map.m_lblk = ctx->pos >> EXT4_BLOCK_SIZE_BITS(sb); map.m_len = 1; err = ext4_map_blocks(NULL, inode, &map, 0); if (err > 0) { pgoff_t index = map.m_pblk >> (PAGE_CACHE_SHIFT - inode->i_blkbits); - if (!ra_has_index(&filp->f_ra, index)) + if (!ra_has_index(&file->f_ra, index)) page_cache_sync_readahead( sb->s_bdev->bd_inode->i_mapping, - &filp->f_ra, filp, + &file->f_ra, file, index, 1); - filp->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT; + file->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT; bh = ext4_bread(NULL, inode, map.m_lblk, 0, &err); } @@ -167,25 +160,37 @@ static int ext4_readdir(struct file *filp, */ if (!bh) { if (!dir_has_error) { - EXT4_ERROR_FILE(filp, 0, + EXT4_ERROR_FILE(file, 0, "directory contains a " "hole at offset %llu", - (unsigned long long) filp->f_pos); + (unsigned long long) ctx->pos); dir_has_error = 1; } /* corrupt size? Maybe no more blocks to read */ - if (filp->f_pos > inode->i_blocks << 9) + if (ctx->pos > inode->i_blocks << 9) break; - filp->f_pos += sb->s_blocksize - offset; + ctx->pos += sb->s_blocksize - offset; continue; } -revalidate: + /* Check the checksum */ + if (!buffer_verified(bh) && + !ext4_dirent_csum_verify(inode, + (struct ext4_dir_entry *)bh->b_data)) { + EXT4_ERROR_FILE(file, 0, "directory fails checksum " + "at offset %llu", + (unsigned long long)ctx->pos); + ctx->pos += sb->s_blocksize - offset; + brelse(bh); + continue; + } + set_buffer_verified(bh); + /* If the dir block has changed since the last call to * readdir(2), then we might be pointing to an invalid * dirent right now. Scan from the start of the block * to make sure. */ - if (filp->f_version != inode->i_version) { + if (file->f_version != inode->i_version) { for (i = 0; i < sb->s_blocksize && i < offset; ) { de = (struct ext4_dir_entry_2 *) (bh->b_data + i); @@ -202,74 +207,129 @@ revalidate: sb->s_blocksize); } offset = i; - filp->f_pos = (filp->f_pos & ~(sb->s_blocksize - 1)) + ctx->pos = (ctx->pos & ~(sb->s_blocksize - 1)) | offset; - filp->f_version = inode->i_version; + file->f_version = inode->i_version; } - while (!error && filp->f_pos < inode->i_size + while (ctx->pos < inode->i_size && offset < sb->s_blocksize) { de = (struct ext4_dir_entry_2 *) (bh->b_data + offset); - if (ext4_check_dir_entry(inode, filp, de, - bh, offset)) { + if (ext4_check_dir_entry(inode, file, de, bh, + bh->b_data, bh->b_size, + offset)) { /* - * On error, skip the f_pos to the next block + * On error, skip to the next block */ - filp->f_pos = (filp->f_pos | + ctx->pos = (ctx->pos | (sb->s_blocksize - 1)) + 1; - brelse(bh); - ret = stored; - goto out; + break; } offset += ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize); if (le32_to_cpu(de->inode)) { - /* We might block in the next section - * if the data destination is - * currently swapped out. So, use a - * version stamp to detect whether or - * not the directory has been modified - * during the copy operation. - */ - u64 version = filp->f_version; - - error = filldir(dirent, de->name, + if (!dir_emit(ctx, de->name, de->name_len, - filp->f_pos, le32_to_cpu(de->inode), - get_dtype(sb, de->file_type)); - if (error) - break; - if (version != filp->f_version) - goto revalidate; - stored++; + get_dtype(sb, de->file_type))) { + brelse(bh); + return 0; + } } - filp->f_pos += ext4_rec_len_from_disk(de->rec_len, + ctx->pos += ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize); } offset = 0; brelse(bh); + if (ctx->pos < inode->i_size) { + if (!dir_relax(inode)) + return 0; + } } -out: - return ret; + return 0; +} + +static inline int is_32bit_api(void) +{ +#ifdef CONFIG_COMPAT + return is_compat_task(); +#else + return (BITS_PER_LONG == 32); +#endif } /* * These functions convert from the major/minor hash to an f_pos - * value. + * value for dx directories + * + * Upper layer (for example NFS) should specify FMODE_32BITHASH or + * FMODE_64BITHASH explicitly. On the other hand, we allow ext4 to be mounted + * directly on both 32-bit and 64-bit nodes, under such case, neither + * FMODE_32BITHASH nor FMODE_64BITHASH is specified. + */ +static inline loff_t hash2pos(struct file *filp, __u32 major, __u32 minor) +{ + if ((filp->f_mode & FMODE_32BITHASH) || + (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api())) + return major >> 1; + else + return ((__u64)(major >> 1) << 32) | (__u64)minor; +} + +static inline __u32 pos2maj_hash(struct file *filp, loff_t pos) +{ + if ((filp->f_mode & FMODE_32BITHASH) || + (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api())) + return (pos << 1) & 0xffffffff; + else + return ((pos >> 32) << 1) & 0xffffffff; +} + +static inline __u32 pos2min_hash(struct file *filp, loff_t pos) +{ + if ((filp->f_mode & FMODE_32BITHASH) || + (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api())) + return 0; + else + return pos & 0xffffffff; +} + +/* + * Return 32- or 64-bit end-of-file for dx directories + */ +static inline loff_t ext4_get_htree_eof(struct file *filp) +{ + if ((filp->f_mode & FMODE_32BITHASH) || + (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api())) + return EXT4_HTREE_EOF_32BIT; + else + return EXT4_HTREE_EOF_64BIT; +} + + +/* + * ext4_dir_llseek() calls generic_file_llseek_size to handle htree + * directories, where the "offset" is in terms of the filename hash + * value instead of the byte offset. + * + * Because we may return a 64-bit hash that is well beyond offset limits, + * we need to pass the max hash as the maximum allowable offset in + * the htree directory case. * - * Currently we only use major hash numer. This is unfortunate, but - * on 32-bit machines, the same VFS interface is used for lseek and - * llseek, so if we use the 64 bit offset, then the 32-bit versions of - * lseek/telldir/seekdir will blow out spectacularly, and from within - * the ext2 low-level routine, we don't know if we're being called by - * a 64-bit version of the system call or the 32-bit version of the - * system call. Worse yet, NFSv2 only allows for a 32-bit readdir - * cookie. Sigh. + * For non-htree, ext4_llseek already chooses the proper max offset. */ -#define hash2pos(major, minor) (major >> 1) -#define pos2maj_hash(pos) ((pos << 1) & 0xffffffff) -#define pos2min_hash(pos) (0) +static loff_t ext4_dir_llseek(struct file *file, loff_t offset, int whence) +{ + struct inode *inode = file->f_mapping->host; + int dx_dir = is_dx_dir(inode); + loff_t htree_max = ext4_get_htree_eof(file); + + if (likely(dx_dir)) + return generic_file_llseek_size(file, offset, whence, + htree_max, htree_max); + else + return ext4_llseek(file, offset, whence); +} /* * This structure holds the nodes of the red-black tree used to store @@ -292,53 +352,29 @@ struct fname { */ static void free_rb_tree_fname(struct rb_root *root) { - struct rb_node *n = root->rb_node; - struct rb_node *parent; - struct fname *fname; - - while (n) { - /* Do the node's children first */ - if (n->rb_left) { - n = n->rb_left; - continue; - } - if (n->rb_right) { - n = n->rb_right; - continue; - } - /* - * The node has no children; free it, and then zero - * out parent's link to it. Finally go to the - * beginning of the loop and try to free the parent - * node. - */ - parent = rb_parent(n); - fname = rb_entry(n, struct fname, rb_hash); + struct fname *fname, *next; + + rbtree_postorder_for_each_entry_safe(fname, next, root, rb_hash) while (fname) { struct fname *old = fname; fname = fname->next; kfree(old); } - if (!parent) - *root = RB_ROOT; - else if (parent->rb_left == n) - parent->rb_left = NULL; - else if (parent->rb_right == n) - parent->rb_right = NULL; - n = parent; - } + + *root = RB_ROOT; } -static struct dir_private_info *ext4_htree_create_dir_info(loff_t pos) +static struct dir_private_info *ext4_htree_create_dir_info(struct file *filp, + loff_t pos) { struct dir_private_info *p; p = kzalloc(sizeof(struct dir_private_info), GFP_KERNEL); if (!p) return NULL; - p->curr_hash = pos2maj_hash(pos); - p->curr_minor_hash = pos2min_hash(pos); + p->curr_hash = pos2maj_hash(filp, pos); + p->curr_minor_hash = pos2min_hash(filp, pos); return p; } @@ -413,63 +449,57 @@ int ext4_htree_store_dirent(struct file *dir_file, __u32 hash, * for all entres on the fname linked list. (Normally there is only * one entry on the linked list, unless there are 62 bit hash collisions.) */ -static int call_filldir(struct file *filp, void *dirent, - filldir_t filldir, struct fname *fname) +static int call_filldir(struct file *file, struct dir_context *ctx, + struct fname *fname) { - struct dir_private_info *info = filp->private_data; - loff_t curr_pos; - struct inode *inode = filp->f_path.dentry->d_inode; - struct super_block *sb; - int error; - - sb = inode->i_sb; + struct dir_private_info *info = file->private_data; + struct inode *inode = file_inode(file); + struct super_block *sb = inode->i_sb; if (!fname) { - printk(KERN_ERR "EXT4-fs: call_filldir: called with " - "null fname?!?\n"); + ext4_msg(sb, KERN_ERR, "%s:%d: inode #%lu: comm %s: " + "called with null fname?!?", __func__, __LINE__, + inode->i_ino, current->comm); return 0; } - curr_pos = hash2pos(fname->hash, fname->minor_hash); + ctx->pos = hash2pos(file, fname->hash, fname->minor_hash); while (fname) { - error = filldir(dirent, fname->name, - fname->name_len, curr_pos, + if (!dir_emit(ctx, fname->name, + fname->name_len, fname->inode, - get_dtype(sb, fname->file_type)); - if (error) { - filp->f_pos = curr_pos; + get_dtype(sb, fname->file_type))) { info->extra_fname = fname; - return error; + return 1; } fname = fname->next; } return 0; } -static int ext4_dx_readdir(struct file *filp, - void *dirent, filldir_t filldir) +static int ext4_dx_readdir(struct file *file, struct dir_context *ctx) { - struct dir_private_info *info = filp->private_data; - struct inode *inode = filp->f_path.dentry->d_inode; + struct dir_private_info *info = file->private_data; + struct inode *inode = file_inode(file); struct fname *fname; int ret; if (!info) { - info = ext4_htree_create_dir_info(filp->f_pos); + info = ext4_htree_create_dir_info(file, ctx->pos); if (!info) return -ENOMEM; - filp->private_data = info; + file->private_data = info; } - if (filp->f_pos == EXT4_HTREE_EOF) + if (ctx->pos == ext4_get_htree_eof(file)) return 0; /* EOF */ /* Some one has messed with f_pos; reset the world */ - if (info->last_pos != filp->f_pos) { + if (info->last_pos != ctx->pos) { free_rb_tree_fname(&info->root); info->curr_node = NULL; info->extra_fname = NULL; - info->curr_hash = pos2maj_hash(filp->f_pos); - info->curr_minor_hash = pos2min_hash(filp->f_pos); + info->curr_hash = pos2maj_hash(file, ctx->pos); + info->curr_minor_hash = pos2min_hash(file, ctx->pos); } /* @@ -477,7 +507,7 @@ static int ext4_dx_readdir(struct file *filp, * chain, return them first. */ if (info->extra_fname) { - if (call_filldir(filp, dirent, filldir, info->extra_fname)) + if (call_filldir(file, ctx, info->extra_fname)) goto finished; info->extra_fname = NULL; goto next_node; @@ -491,17 +521,17 @@ static int ext4_dx_readdir(struct file *filp, * cached entries. */ if ((!info->curr_node) || - (filp->f_version != inode->i_version)) { + (file->f_version != inode->i_version)) { info->curr_node = NULL; free_rb_tree_fname(&info->root); - filp->f_version = inode->i_version; - ret = ext4_htree_fill_tree(filp, info->curr_hash, + file->f_version = inode->i_version; + ret = ext4_htree_fill_tree(file, info->curr_hash, info->curr_minor_hash, &info->next_hash); if (ret < 0) return ret; if (ret == 0) { - filp->f_pos = EXT4_HTREE_EOF; + ctx->pos = ext4_get_htree_eof(file); break; } info->curr_node = rb_first(&info->root); @@ -510,7 +540,7 @@ static int ext4_dx_readdir(struct file *filp, fname = rb_entry(info->curr_node, struct fname, rb_hash); info->curr_hash = fname->hash; info->curr_minor_hash = fname->minor_hash; - if (call_filldir(filp, dirent, filldir, fname)) + if (call_filldir(file, ctx, fname)) break; next_node: info->curr_node = rb_next(info->curr_node); @@ -521,7 +551,7 @@ static int ext4_dx_readdir(struct file *filp, info->curr_minor_hash = fname->minor_hash; } else { if (info->next_hash == ~0) { - filp->f_pos = EXT4_HTREE_EOF; + ctx->pos = ext4_get_htree_eof(file); break; } info->curr_hash = info->next_hash; @@ -529,7 +559,7 @@ static int ext4_dx_readdir(struct file *filp, } } finished: - info->last_pos = filp->f_pos; + info->last_pos = ctx->pos; return 0; } @@ -540,3 +570,15 @@ static int ext4_release_dir(struct inode *inode, struct file *filp) return 0; } + +const struct file_operations ext4_dir_operations = { + .llseek = ext4_dir_llseek, + .read = generic_read_dir, + .iterate = ext4_readdir, + .unlocked_ioctl = ext4_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = ext4_compat_ioctl, +#endif + .fsync = ext4_sync_file, + .release = ext4_release_dir, +}; diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 513004fc3d8..7cc5a0e2368 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -29,6 +29,9 @@ #include <linux/wait.h> #include <linux/blockgroup_lock.h> #include <linux/percpu_counter.h> +#include <linux/ratelimit.h> +#include <crypto/hash.h> +#include <linux/falloc.h> #ifdef __KERNEL__ #include <linux/compat.h> #endif @@ -53,7 +56,17 @@ printk(KERN_DEBUG f, ## a); \ } while (0) #else -#define ext4_debug(f, a...) do {} while (0) +#define ext4_debug(fmt, ...) no_printk(fmt, ##__VA_ARGS__) +#endif + +/* + * Turn on EXT_DEBUG to get lots of info about extents operations. + */ +#define EXT_DEBUG__ +#ifdef EXT_DEBUG +#define ext_debug(fmt, ...) printk(fmt, ##__VA_ARGS__) +#else +#define ext_debug(fmt, ...) no_printk(fmt, ##__VA_ARGS__) #endif #define EXT4_ERROR_INODE(inode, fmt, a...) \ @@ -110,6 +123,8 @@ typedef unsigned int ext4_group_t; #define EXT4_MB_STREAM_ALLOC 0x0800 /* Use reserved root blocks if needed */ #define EXT4_MB_USE_ROOT_BLOCKS 0x1000 +/* Use blocks from reserved pool */ +#define EXT4_MB_USE_RESERVED 0x2000 struct ext4_allocation_request { /* target inode for block we're allocating */ @@ -143,7 +158,6 @@ struct ext4_allocation_request { #define EXT4_MAP_MAPPED (1 << BH_Mapped) #define EXT4_MAP_UNWRITTEN (1 << BH_Unwritten) #define EXT4_MAP_BOUNDARY (1 << BH_Boundary) -#define EXT4_MAP_UNINIT (1 << BH_Uninit) /* Sometimes (in the bigalloc case, from ext4_da_get_block_prep) the caller of * ext4_map_blocks wants to know whether or not the underlying cluster has * already been accounted for. EXT4_MAP_FROM_CLUSTER conveys to the caller that @@ -154,7 +168,7 @@ struct ext4_allocation_request { #define EXT4_MAP_FROM_CLUSTER (1 << BH_AllocFromCluster) #define EXT4_MAP_FLAGS (EXT4_MAP_NEW | EXT4_MAP_MAPPED |\ EXT4_MAP_UNWRITTEN | EXT4_MAP_BOUNDARY |\ - EXT4_MAP_UNINIT | EXT4_MAP_FROM_CLUSTER) + EXT4_MAP_FROM_CLUSTER) struct ext4_map_blocks { ext4_fsblk_t m_pblk; @@ -164,53 +178,31 @@ struct ext4_map_blocks { }; /* - * For delayed allocation tracking - */ -struct mpage_da_data { - struct inode *inode; - sector_t b_blocknr; /* start block number of extent */ - size_t b_size; /* size of extent */ - unsigned long b_state; /* state of the extent */ - unsigned long first_page, next_page; /* extent of pages */ - struct writeback_control *wbc; - int io_done; - int pages_written; - int retval; -}; - -/* * Flags for ext4_io_end->flags */ #define EXT4_IO_END_UNWRITTEN 0x0001 -#define EXT4_IO_END_ERROR 0x0002 -#define EXT4_IO_END_QUEUED 0x0004 - -struct ext4_io_page { - struct page *p_page; - atomic_t p_count; -}; - -#define MAX_IO_PAGES 128 +/* + * For converting unwritten extents on a work queue. 'handle' is used for + * buffered writeback. + */ typedef struct ext4_io_end { struct list_head list; /* per-file finished IO list */ + handle_t *handle; /* handle reserved for extent + * conversion */ struct inode *inode; /* file being written to */ + struct bio *bio; /* Linked list of completed + * bios covering the extent */ unsigned int flag; /* unwritten or not */ - struct page *page; /* page struct for buffer write */ loff_t offset; /* offset in the file */ ssize_t size; /* size of the extent */ - struct work_struct work; /* data work queue */ - struct kiocb *iocb; /* iocb struct for AIO */ - int result; /* error value for AIO */ - int num_io_pages; - struct ext4_io_page *pages[MAX_IO_PAGES]; + atomic_t count; /* reference counter */ } ext4_io_end_t; struct ext4_io_submit { int io_op; struct bio *io_bio; ext4_io_end_t *io_end; - struct ext4_io_page *io_page; sector_t io_next_block; }; @@ -276,6 +268,16 @@ struct ext4_io_submit { /* Translate # of blks to # of clusters */ #define EXT4_NUM_B2C(sbi, blks) (((blks) + (sbi)->s_cluster_ratio - 1) >> \ (sbi)->s_cluster_bits) +/* Mask out the low bits to get the starting block of the cluster */ +#define EXT4_PBLK_CMASK(s, pblk) ((pblk) & \ + ~((ext4_fsblk_t) (s)->s_cluster_ratio - 1)) +#define EXT4_LBLK_CMASK(s, lblk) ((lblk) & \ + ~((ext4_lblk_t) (s)->s_cluster_ratio - 1)) +/* Get the cluster offset */ +#define EXT4_PBLK_COFF(s, pblk) ((pblk) & \ + ((ext4_fsblk_t) (s)->s_cluster_ratio - 1)) +#define EXT4_LBLK_COFF(s, lblk) ((lblk) & \ + ((ext4_lblk_t) (s)->s_cluster_ratio - 1)) /* * Structure of a blocks group descriptor @@ -289,7 +291,9 @@ struct ext4_group_desc __le16 bg_free_inodes_count_lo;/* Free inodes count */ __le16 bg_used_dirs_count_lo; /* Directories count */ __le16 bg_flags; /* EXT4_BG_flags (INODE_UNINIT, etc) */ - __u32 bg_reserved[2]; /* Likely block/inode bitmap checksum */ + __le32 bg_exclude_bitmap_lo; /* Exclude bitmap for snapshots */ + __le16 bg_block_bitmap_csum_lo;/* crc32c(s_uuid+grp_num+bbitmap) LE */ + __le16 bg_inode_bitmap_csum_lo;/* crc32c(s_uuid+grp_num+ibitmap) LE */ __le16 bg_itable_unused_lo; /* Unused inodes count */ __le16 bg_checksum; /* crc16(sb_uuid+group+desc) */ __le32 bg_block_bitmap_hi; /* Blocks bitmap block MSB */ @@ -299,17 +303,27 @@ struct ext4_group_desc __le16 bg_free_inodes_count_hi;/* Free inodes count MSB */ __le16 bg_used_dirs_count_hi; /* Directories count MSB */ __le16 bg_itable_unused_hi; /* Unused inodes count MSB */ - __u32 bg_reserved2[3]; + __le32 bg_exclude_bitmap_hi; /* Exclude bitmap block MSB */ + __le16 bg_block_bitmap_csum_hi;/* crc32c(s_uuid+grp_num+bbitmap) BE */ + __le16 bg_inode_bitmap_csum_hi;/* crc32c(s_uuid+grp_num+ibitmap) BE */ + __u32 bg_reserved; }; +#define EXT4_BG_INODE_BITMAP_CSUM_HI_END \ + (offsetof(struct ext4_group_desc, bg_inode_bitmap_csum_hi) + \ + sizeof(__le16)) +#define EXT4_BG_BLOCK_BITMAP_CSUM_HI_END \ + (offsetof(struct ext4_group_desc, bg_block_bitmap_csum_hi) + \ + sizeof(__le16)) + /* * Structure of a flex block group info */ struct flex_groups { - atomic_t free_inodes; - atomic_t free_clusters; - atomic_t used_dirs; + atomic64_t free_clusters; + atomic_t free_inodes; + atomic_t used_dirs; }; #define EXT4_BG_INODE_UNINIT 0x0001 /* Inode table/bitmap not in use */ @@ -371,10 +385,11 @@ struct flex_groups { #define EXT4_EXTENTS_FL 0x00080000 /* Inode uses extents */ #define EXT4_EA_INODE_FL 0x00200000 /* Inode used for large EA */ #define EXT4_EOFBLOCKS_FL 0x00400000 /* Blocks allocated beyond EOF */ +#define EXT4_INLINE_DATA_FL 0x10000000 /* Inode has inline data. */ #define EXT4_RESERVED_FL 0x80000000 /* reserved for ext4 lib */ #define EXT4_FL_USER_VISIBLE 0x004BDFFF /* User visible flags */ -#define EXT4_FL_USER_MODIFIABLE 0x004B80FF /* User modifiable flags */ +#define EXT4_FL_USER_MODIFIABLE 0x004380FF /* User modifiable flags */ /* Flags that should be inherited by new inodes from their parent. */ #define EXT4_FL_INHERITED (EXT4_SECRM_FL | EXT4_UNRM_FL | EXT4_COMPR_FL |\ @@ -427,28 +442,26 @@ enum { EXT4_INODE_EXTENTS = 19, /* Inode uses extents */ EXT4_INODE_EA_INODE = 21, /* Inode used for large EA */ EXT4_INODE_EOFBLOCKS = 22, /* Blocks allocated beyond EOF */ + EXT4_INODE_INLINE_DATA = 28, /* Data in inode. */ EXT4_INODE_RESERVED = 31, /* reserved for ext4 lib */ }; -#define TEST_FLAG_VALUE(FLAG) (EXT4_##FLAG##_FL == (1 << EXT4_INODE_##FLAG)) -#define CHECK_FLAG_VALUE(FLAG) if (!TEST_FLAG_VALUE(FLAG)) { \ - printk(KERN_EMERG "EXT4 flag fail: " #FLAG ": %d %d\n", \ - EXT4_##FLAG##_FL, EXT4_INODE_##FLAG); BUG_ON(1); } - -/* - * Since it's pretty easy to mix up bit numbers and hex values, and we - * can't do a compile-time test for ENUM values, we use a run-time - * test to make sure that EXT4_XXX_FL is consistent with respect to - * EXT4_INODE_XXX. If all is well the printk and BUG_ON will all drop - * out so it won't cost any extra space in the compiled kernel image. - * But it's important that these values are the same, since we are - * using EXT4_INODE_XXX to test for the flag values, but EXT4_XX_FL - * must be consistent with the values of FS_XXX_FL defined in - * include/linux/fs.h and the on-disk values found in ext2, ext3, and - * ext4 filesystems, and of course the values defined in e2fsprogs. +/* + * Since it's pretty easy to mix up bit numbers and hex values, we use a + * build-time check to make sure that EXT4_XXX_FL is consistent with respect to + * EXT4_INODE_XXX. If all is well, the macros will be dropped, so, it won't cost + * any extra space in the compiled kernel image, otherwise, the build will fail. + * It's important that these values are the same, since we are using + * EXT4_INODE_XXX to test for flag values, but EXT4_XXX_FL must be consistent + * with the values of FS_XXX_FL defined in include/linux/fs.h and the on-disk + * values found in ext2, ext3 and ext4 filesystems, and of course the values + * defined in e2fsprogs. * * It's not paranoia if the Murphy's Law really *is* out to get you. :-) */ +#define TEST_FLAG_VALUE(FLAG) (EXT4_##FLAG##_FL == (1 << EXT4_INODE_##FLAG)) +#define CHECK_FLAG_VALUE(FLAG) BUILD_BUG_ON(!TEST_FLAG_VALUE(FLAG)) + static inline void ext4_check_flag_values(void) { CHECK_FLAG_VALUE(SECRM); @@ -473,6 +486,7 @@ static inline void ext4_check_flag_values(void) CHECK_FLAG_VALUE(EXTENTS); CHECK_FLAG_VALUE(EA_INODE); CHECK_FLAG_VALUE(EOFBLOCKS); + CHECK_FLAG_VALUE(INLINE_DATA); CHECK_FLAG_VALUE(RESERVED); } @@ -522,33 +536,51 @@ enum { /* * Flags used by ext4_map_blocks() */ - /* Allocate any needed blocks and/or convert an unitialized + /* Allocate any needed blocks and/or convert an unwritten extent to be an initialized ext4 */ #define EXT4_GET_BLOCKS_CREATE 0x0001 - /* Request the creation of an unitialized extent */ -#define EXT4_GET_BLOCKS_UNINIT_EXT 0x0002 -#define EXT4_GET_BLOCKS_CREATE_UNINIT_EXT (EXT4_GET_BLOCKS_UNINIT_EXT|\ + /* Request the creation of an unwritten extent */ +#define EXT4_GET_BLOCKS_UNWRIT_EXT 0x0002 +#define EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT (EXT4_GET_BLOCKS_UNWRIT_EXT|\ EXT4_GET_BLOCKS_CREATE) - /* Caller is from the delayed allocation writeout path, - so set the magic i_delalloc_reserve_flag after taking the - inode allocation semaphore for */ + /* Caller is from the delayed allocation writeout path + * finally doing the actual allocation of delayed blocks */ #define EXT4_GET_BLOCKS_DELALLOC_RESERVE 0x0004 /* caller is from the direct IO path, request to creation of an - unitialized extents if not allocated, split the uninitialized + unwritten extents if not allocated, split the unwritten extent if blocks has been preallocated already*/ #define EXT4_GET_BLOCKS_PRE_IO 0x0008 #define EXT4_GET_BLOCKS_CONVERT 0x0010 #define EXT4_GET_BLOCKS_IO_CREATE_EXT (EXT4_GET_BLOCKS_PRE_IO|\ - EXT4_GET_BLOCKS_CREATE_UNINIT_EXT) + EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT) /* Convert extent to initialized after IO complete */ #define EXT4_GET_BLOCKS_IO_CONVERT_EXT (EXT4_GET_BLOCKS_CONVERT|\ - EXT4_GET_BLOCKS_CREATE_UNINIT_EXT) - /* Punch out blocks of an extent */ -#define EXT4_GET_BLOCKS_PUNCH_OUT_EXT 0x0020 + EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT) + /* Eventual metadata allocation (due to growing extent tree) + * should not fail, so try to use reserved blocks for that.*/ +#define EXT4_GET_BLOCKS_METADATA_NOFAIL 0x0020 /* Don't normalize allocation size (used for fallocate) */ #define EXT4_GET_BLOCKS_NO_NORMALIZE 0x0040 /* Request will not result in inode size update (user for fallocate) */ #define EXT4_GET_BLOCKS_KEEP_SIZE 0x0080 + /* Do not take i_data_sem locking in ext4_map_blocks */ +#define EXT4_GET_BLOCKS_NO_LOCK 0x0100 + /* Do not put hole in extent cache */ +#define EXT4_GET_BLOCKS_NO_PUT_HOLE 0x0200 + /* Convert written extents to unwritten */ +#define EXT4_GET_BLOCKS_CONVERT_UNWRITTEN 0x0400 + +/* + * The bit position of these flags must not overlap with any of the + * EXT4_GET_BLOCKS_*. They are used by ext4_ext_find_extent(), + * read_extent_tree_block(), ext4_split_extent_at(), + * ext4_ext_insert_extent(), and ext4_ext_create_new_leaf(). + * EXT4_EX_NOCACHE is used to indicate that the we shouldn't be + * caching the extents when reading from the extent tree while a + * truncate or punch hole operation is in progress. + */ +#define EXT4_EX_NOCACHE 0x0400 +#define EXT4_EX_FORCE_CACHE 0x0800 /* * Flags used by ext4_free_blocks @@ -559,11 +591,7 @@ enum { #define EXT4_FREE_BLOCKS_NO_QUOT_UPDATE 0x0008 #define EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER 0x0010 #define EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER 0x0020 - -/* - * Flags used by ext4_discard_partial_page_buffers - */ -#define EXT4_DISCARD_PARTIAL_PG_ZERO_UNMAPPED 0x0001 +#define EXT4_FREE_BLOCKS_RESERVE 0x0040 /* * ioctl commands @@ -584,6 +612,8 @@ enum { #define EXT4_IOC_ALLOC_DA_BLKS _IO('f', 12) #define EXT4_IOC_MOVE_EXT _IOWR('f', 15, struct move_extent) #define EXT4_IOC_RESIZE_FS _IOW('f', 16, __u64) +#define EXT4_IOC_SWAP_BOOT _IO('f', 17) +#define EXT4_IOC_PRECACHE_EXTENTS _IO('f', 18) #if defined(__KERNEL__) && defined(CONFIG_COMPAT) /* @@ -641,7 +671,8 @@ struct ext4_inode { __le16 l_i_file_acl_high; __le16 l_i_uid_high; /* these 2 fields */ __le16 l_i_gid_high; /* were reserved2[0] */ - __u32 l_i_reserved2; + __le16 l_i_checksum_lo;/* crc32c(uuid+inum+inode) LE */ + __le16 l_i_reserved; } linux2; struct { __le16 h_i_reserved1; /* Obsoleted fragment number/size which are removed in ext4 */ @@ -657,7 +688,7 @@ struct ext4_inode { } masix2; } osd2; /* OS dependent 2 */ __le16 i_extra_isize; - __le16 i_pad1; + __le16 i_checksum_hi; /* crc32c(uuid+inum+inode) BE */ __le32 i_ctime_extra; /* extra Change time (nsec << 2 | epoch) */ __le32 i_mtime_extra; /* extra Modification time(nsec << 2 | epoch) */ __le32 i_atime_extra; /* extra Access time (nsec << 2 | epoch) */ @@ -742,6 +773,8 @@ do { \ if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime)) \ (einode)->xtime.tv_sec = \ (signed)le32_to_cpu((raw_inode)->xtime); \ + else \ + (einode)->xtime.tv_sec = 0; \ if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime ## _extra)) \ ext4_decode_extra_time(&(einode)->xtime, \ raw_inode->xtime ## _extra); \ @@ -759,7 +792,7 @@ do { \ #define i_gid_low i_gid #define i_uid_high osd2.linux2.l_i_uid_high #define i_gid_high osd2.linux2.l_i_gid_high -#define i_reserved2 osd2.linux2.l_i_reserved2 +#define i_checksum_lo osd2.linux2.l_i_checksum_lo #elif defined(__GNU__) @@ -776,16 +809,7 @@ do { \ #endif /* defined(__KERNEL__) || defined(__linux__) */ -/* - * storage for cached extent - * If ec_len == 0, then the cache is invalid. - * If ec_start == 0, then the cache represents a gap (null mapping) - */ -struct ext4_ext_cache { - ext4_fsblk_t ec_start; - ext4_lblk_t ec_block; - __u32 ec_len; /* must be 32bit to return holes */ -}; +#include "extents_status.h" /* * fourth extended file system inode data in memory @@ -809,7 +833,6 @@ struct ext4_inode_info { #endif unsigned long i_flags; -#ifdef CONFIG_EXT4_FS_XATTR /* * Extended attributes can be read independently of the main file * data. Taking i_mutex even when reading would cause contention @@ -818,7 +841,6 @@ struct ext4_inode_info { * EAs. */ struct rw_semaphore xattr_sem; -#endif struct list_head i_orphan; /* unlinked but open inodes */ @@ -853,7 +875,8 @@ struct ext4_inode_info { struct inode vfs_inode; struct jbd2_inode *jinode; - struct ext4_ext_cache i_cached_extent; + spinlock_t i_raw_lock; /* protects updates to the raw inode */ + /* * File creation time. Its function is same as that of * struct timespec i_{a,c,m}time in the generic inode. @@ -864,6 +887,13 @@ struct ext4_inode_info { struct list_head i_prealloc_list; spinlock_t i_prealloc_lock; + /* extents status tree */ + struct ext4_es_tree i_es_tree; + rwlock_t i_es_lock; + struct list_head i_es_lru; + unsigned int i_es_lru_nr; /* protected by i_es_lock */ + unsigned long i_touch_when; /* jiffies of last accessing */ + /* ialloc */ ext4_group_t i_last_alloc_group; @@ -878,18 +908,29 @@ struct ext4_inode_info { /* on-disk additional length */ __u16 i_extra_isize; + /* Indicate the inline data space. */ + u16 i_inline_off; + u16 i_inline_size; + #ifdef CONFIG_QUOTA /* quota space reservation, managed internally by quota code */ qsize_t i_reserved_quota; #endif - /* completed IOs that might need unwritten extents handling */ - struct list_head i_completed_io_list; + /* Lock protecting lists below */ spinlock_t i_completed_io_lock; + /* + * Completed IOs that need unwritten extents handling and have + * transaction reserved + */ + struct list_head i_rsv_conversion_list; + /* + * Completed IOs that need unwritten extents handling and don't have + * transaction reserved + */ atomic_t i_ioend_count; /* Number of outstanding io_end structs */ - /* current io_end structure for async DIO write*/ - ext4_io_end_t *cur_aio_dio; - atomic_t i_aiodio_unwritten; /* Nr. of inflight conversions pending */ + atomic_t i_unwritten; /* Nr. of inflight conversions pending */ + struct work_struct i_rsv_conversion_work; spinlock_t i_block_reservation_lock; @@ -899,6 +940,9 @@ struct ext4_inode_info { */ tid_t i_sync_tid; tid_t i_datasync_tid; + + /* Precomputed uuid+inum+igen checksum for seeding inode checksums */ + __u32 i_csum_seed; }; /* @@ -916,13 +960,14 @@ struct ext4_inode_info { #define EXT2_FLAGS_TEST_FILESYS 0x0004 /* to test development code */ /* - * Mount flags + * Mount flags set via mount options or defaults */ #define EXT4_MOUNT_GRPID 0x00004 /* Create files with directory's group */ #define EXT4_MOUNT_DEBUG 0x00008 /* Some debugging messages */ #define EXT4_MOUNT_ERRORS_CONT 0x00010 /* Continue on errors */ #define EXT4_MOUNT_ERRORS_RO 0x00020 /* Remount fs ro on errors */ #define EXT4_MOUNT_ERRORS_PANIC 0x00040 /* Panic on errors */ +#define EXT4_MOUNT_ERRORS_MASK 0x00070 #define EXT4_MOUNT_MINIX_DF 0x00080 /* Mimics the Minix statfs */ #define EXT4_MOUNT_NOLOAD 0x00100 /* Don't use existing journal*/ #define EXT4_MOUNT_DATA_FLAGS 0x00C00 /* Mode for data writes: */ @@ -941,16 +986,24 @@ struct ext4_inode_info { #define EXT4_MOUNT_DIOREAD_NOLOCK 0x400000 /* Enable support for dio read nolocking */ #define EXT4_MOUNT_JOURNAL_CHECKSUM 0x800000 /* Journal checksums */ #define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */ -#define EXT4_MOUNT_I_VERSION 0x2000000 /* i_version support */ -#define EXT4_MOUNT_MBLK_IO_SUBMIT 0x4000000 /* multi-block io submits */ #define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */ #define EXT4_MOUNT_DATA_ERR_ABORT 0x10000000 /* Abort on file data write */ #define EXT4_MOUNT_BLOCK_VALIDITY 0x20000000 /* Block validity checking */ #define EXT4_MOUNT_DISCARD 0x40000000 /* Issue DISCARD requests */ #define EXT4_MOUNT_INIT_INODE_TABLE 0x80000000 /* Initialize uninitialized itables */ +/* + * Mount flags set either automatically (could not be set by mount option) + * based on per file system feature or property or in special cases such as + * distinguishing between explicit mount option definition and default. + */ #define EXT4_MOUNT2_EXPLICIT_DELALLOC 0x00000001 /* User explicitly specified delalloc */ +#define EXT4_MOUNT2_STD_GROUP_SIZE 0x00000002 /* We have standard group + size of blocksize * 8 + blocks */ +#define EXT4_MOUNT2_HURD_COMPAT 0x00000004 /* Support HURD-castrated + file systems */ #define clear_opt(sb, opt) EXT4_SB(sb)->s_mount_opt &= \ ~EXT4_MOUNT_##opt @@ -992,6 +1045,9 @@ extern void ext4_set_bits(void *bm, int cur, int len); #define EXT4_ERRORS_PANIC 3 /* Panic */ #define EXT4_ERRORS_DEFAULT EXT4_ERRORS_CONTINUE +/* Metadata checksum algorithm codes */ +#define EXT4_CRC32C_CHKSUM 1 + /* * Structure of the super block */ @@ -1078,7 +1134,7 @@ struct ext4_super_block { __le64 s_mmp_block; /* Block for multi-mount protection */ __le32 s_raid_stripe_width; /* blocks on all data disks (N*stride)*/ __u8 s_log_groups_per_flex; /* FLEX_BG group size */ - __u8 s_reserved_char_pad; + __u8 s_checksum_type; /* metadata checksum algorithm used */ __le16 s_reserved_pad; __le64 s_kbytes_written; /* nr of lifetime kilobytes written */ __le32 s_snapshot_inum; /* Inode number of active snapshot */ @@ -1104,7 +1160,9 @@ struct ext4_super_block { __le32 s_usr_quota_inum; /* inode for tracking user quota */ __le32 s_grp_quota_inum; /* inode for tracking group quota */ __le32 s_overhead_clusters; /* overhead blocks/clusters in fs */ - __le32 s_reserved[109]; /* Padding to the end of the block */ + __le32 s_backup_bgs[2]; /* groups with sparse_super2 SBs */ + __le32 s_reserved[106]; /* Padding to the end of the block */ + __le32 s_checksum; /* crc32c(superblock) */ }; #define EXT4_S_ERR_LEN (EXT4_S_ERR_END - EXT4_S_ERR_START) @@ -1131,8 +1189,7 @@ struct ext4_sb_info { unsigned long s_desc_per_block; /* Number of group descriptors per block */ ext4_group_t s_groups_count; /* Number of groups in the fs */ ext4_group_t s_blockfile_groups;/* Groups acceptable for non-extent files */ - unsigned long s_overhead_last; /* Last calculated overhead */ - unsigned long s_blocks_last; /* Last seen block count */ + unsigned long s_overhead; /* # of fs overhead clusters */ unsigned int s_cluster_ratio; /* Number of blocks per cluster */ unsigned int s_cluster_bits; /* log2 of s_cluster_ratio */ loff_t s_bitmap_maxbytes; /* max bytes for bitmap files */ @@ -1142,9 +1199,11 @@ struct ext4_sb_info { unsigned int s_mount_opt; unsigned int s_mount_opt2; unsigned int s_mount_flags; + unsigned int s_def_mount_opt; ext4_fsblk_t s_sb_block; - uid_t s_resuid; - gid_t s_resgid; + atomic64_t s_resv_clusters; + kuid_t s_resuid; + kgid_t s_resgid; unsigned short s_mount_state; unsigned short s_pad; int s_addr_per_block_bits; @@ -1166,6 +1225,7 @@ struct ext4_sb_info { struct proc_dir_entry *s_proc; struct kobject s_kobj; struct completion s_kobj_unregister; + struct super_block *s_sb; /* Journaling */ struct journal_s *s_journal; @@ -1193,9 +1253,6 @@ struct ext4_sb_info { unsigned long s_ext_blocks; unsigned long s_ext_extents; #endif - /* ext4 extent cache stats */ - unsigned long extent_cache_hits; - unsigned long extent_cache_misses; /* for buddy allocator */ struct ext4_group_info ***s_group_info; @@ -1203,6 +1260,7 @@ struct ext4_sb_info { spinlock_t s_md_lock; unsigned short *s_mb_offsets; unsigned int *s_mb_maxs; + unsigned int s_group_info_size; /* tunables */ unsigned long s_stripe; @@ -1212,7 +1270,7 @@ struct ext4_sb_info { unsigned int s_mb_stats; unsigned int s_mb_order2_reqs; unsigned int s_mb_group_prealloc; - unsigned int s_max_writeback_mb_bump; + unsigned int s_max_dir_size_kb; /* where last allocation was done - for stream allocation */ unsigned long s_mb_last_group; unsigned long s_mb_last_start; @@ -1240,11 +1298,15 @@ struct ext4_sb_info { unsigned long s_sectors_written_start; u64 s_kbytes_written; + /* the size of zero-out chunk */ + unsigned int s_extent_max_zeroout_kb; + unsigned int s_log_groups_per_flex; struct flex_groups *s_flex_groups; + ext4_group_t s_flex_groups_allocated; - /* workqueue for dio unwritten */ - struct workqueue_struct *dio_unwritten_wq; + /* workqueue for reserved extent conversions (buffered io) */ + struct workqueue_struct *rsv_conversion_wq; /* timer for periodic error stats printing */ struct timer_list s_err_report; @@ -1259,6 +1321,25 @@ struct ext4_sb_info { /* record the last minlen when FITRIM is called. */ atomic_t s_last_trim_minblks; + + /* Reference to checksum algorithm driver via cryptoapi */ + struct crypto_shash *s_chksum_driver; + + /* Precomputed FS UUID checksum for seeding other checksums */ + __u32 s_csum_seed; + + /* Reclaim extents from extent status tree */ + struct shrinker s_es_shrinker; + struct list_head s_es_lru; + unsigned long s_es_last_sorted; + struct percpu_counter s_extent_cache_cnt; + struct mb_cache *s_mb_cache; + spinlock_t s_es_lru_lock ____cacheline_aligned_in_smp; + + /* Ratelimit ext4 messages. */ + struct ratelimit_state s_err_ratelimit_state; + struct ratelimit_state s_warning_ratelimit_state; + struct ratelimit_state s_msg_ratelimit_state; }; static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb) @@ -1279,6 +1360,9 @@ static inline struct timespec ext4_current_time(struct inode *inode) static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino) { return ino == EXT4_ROOT_INO || + ino == EXT4_USR_QUOTA_INO || + ino == EXT4_GRP_QUOTA_INO || + ino == EXT4_BOOT_LOADER_INO || ino == EXT4_JOURNAL_INO || ino == EXT4_RESIZE_INO || (ino >= EXT4_FIRST_INO(sb) && @@ -1290,10 +1374,20 @@ static inline void ext4_set_io_unwritten_flag(struct inode *inode, { if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) { io_end->flag |= EXT4_IO_END_UNWRITTEN; - atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten); + atomic_inc(&EXT4_I(inode)->i_unwritten); } } +static inline ext4_io_end_t *ext4_inode_aio(struct inode *inode) +{ + return inode->i_private; +} + +static inline void ext4_inode_aio_set(struct inode *inode, ext4_io_end_t *io) +{ + inode->i_private = io; +} + /* * Inode dynamic state flags */ @@ -1307,6 +1401,11 @@ enum { EXT4_STATE_DIO_UNWRITTEN, /* need convert on dio done*/ EXT4_STATE_NEWENTRY, /* File just added to dir */ EXT4_STATE_DELALLOC_RESERVED, /* blks already reserved for delalloc */ + EXT4_STATE_DIOREAD_LOCK, /* Disable support for dio read + nolocking */ + EXT4_STATE_MAY_INLINE_DATA, /* may have in-inode data */ + EXT4_STATE_ORDERED_MODE, /* data=ordered mode */ + EXT4_STATE_EXT_PRECACHED, /* extents have been precached */ }; #define EXT4_INODE_BIT_FNS(name, field, offset) \ @@ -1323,7 +1422,18 @@ static inline void ext4_clear_inode_##name(struct inode *inode, int bit) \ clear_bit(bit + (offset), &EXT4_I(inode)->i_##field); \ } +/* Add these declarations here only so that these functions can be + * found by name. Otherwise, they are very hard to locate. */ +static inline int ext4_test_inode_flag(struct inode *inode, int bit); +static inline void ext4_set_inode_flag(struct inode *inode, int bit); +static inline void ext4_clear_inode_flag(struct inode *inode, int bit); EXT4_INODE_BIT_FNS(flag, flags, 0) + +/* Add these declarations here only so that these functions can be + * found by name. Otherwise, they are very hard to locate. */ +static inline int ext4_test_inode_state(struct inode *inode, int bit); +static inline void ext4_set_inode_state(struct inode *inode, int bit); +static inline void ext4_clear_inode_state(struct inode *inode, int bit); #if (BITS_PER_LONG < 64) EXT4_INODE_BIT_FNS(state, state_flags, 0) @@ -1397,6 +1507,7 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei) #define EXT4_FEATURE_COMPAT_EXT_ATTR 0x0008 #define EXT4_FEATURE_COMPAT_RESIZE_INODE 0x0010 #define EXT4_FEATURE_COMPAT_DIR_INDEX 0x0020 +#define EXT4_FEATURE_COMPAT_SPARSE_SUPER2 0x0200 #define EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER 0x0001 #define EXT4_FEATURE_RO_COMPAT_LARGE_FILE 0x0002 @@ -1407,6 +1518,12 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei) #define EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE 0x0040 #define EXT4_FEATURE_RO_COMPAT_QUOTA 0x0100 #define EXT4_FEATURE_RO_COMPAT_BIGALLOC 0x0200 +/* + * METADATA_CSUM also enables group descriptor checksums (GDT_CSUM). When + * METADATA_CSUM is set, group descriptor checksums use the same algorithm as + * all other data structures' checksums. However, the METADATA_CSUM and + * GDT_CSUM bits are mutually exclusive. + */ #define EXT4_FEATURE_RO_COMPAT_METADATA_CSUM 0x0400 #define EXT4_FEATURE_INCOMPAT_COMPRESSION 0x0001 @@ -1420,8 +1537,9 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei) #define EXT4_FEATURE_INCOMPAT_FLEX_BG 0x0200 #define EXT4_FEATURE_INCOMPAT_EA_INODE 0x0400 /* EA in inode */ #define EXT4_FEATURE_INCOMPAT_DIRDATA 0x1000 /* data in dirent */ -#define EXT4_FEATURE_INCOMPAT_INLINEDATA 0x2000 /* data in inode */ +#define EXT4_FEATURE_INCOMPAT_BG_USE_META_CSUM 0x2000 /* use crc32c for bg */ #define EXT4_FEATURE_INCOMPAT_LARGEDIR 0x4000 /* >2GB or 3-lvl htree */ +#define EXT4_FEATURE_INCOMPAT_INLINE_DATA 0x8000 /* data in inode */ #define EXT2_FEATURE_COMPAT_SUPP EXT4_FEATURE_COMPAT_EXT_ATTR #define EXT2_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \ @@ -1445,7 +1563,8 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei) EXT4_FEATURE_INCOMPAT_EXTENTS| \ EXT4_FEATURE_INCOMPAT_64BIT| \ EXT4_FEATURE_INCOMPAT_FLEX_BG| \ - EXT4_FEATURE_INCOMPAT_MMP) + EXT4_FEATURE_INCOMPAT_MMP | \ + EXT4_FEATURE_INCOMPAT_INLINE_DATA) #define EXT4_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \ EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \ EXT4_FEATURE_RO_COMPAT_GDT_CSUM| \ @@ -1453,7 +1572,9 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei) EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE | \ EXT4_FEATURE_RO_COMPAT_BTREE_DIR |\ EXT4_FEATURE_RO_COMPAT_HUGE_FILE |\ - EXT4_FEATURE_RO_COMPAT_BIGALLOC) + EXT4_FEATURE_RO_COMPAT_BIGALLOC |\ + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM|\ + EXT4_FEATURE_RO_COMPAT_QUOTA) /* * Default values for user and/or group using reserved blocks @@ -1519,6 +1640,23 @@ struct ext4_dir_entry_2 { }; /* + * This is a bogus directory entry at the end of each leaf block that + * records checksums. + */ +struct ext4_dir_entry_tail { + __le32 det_reserved_zero1; /* Pretend to be unused */ + __le16 det_rec_len; /* 12 */ + __u8 det_reserved_zero2; /* Zero name length */ + __u8 det_reserved_ft; /* 0xDE, fake file type */ + __le32 det_checksum; /* crc32c(uuid+inum+dirblock) */ +}; + +#define EXT4_DIRENT_TAIL(block, blocksize) \ + ((struct ext4_dir_entry_tail *)(((void *)(block)) + \ + ((blocksize) - \ + sizeof(struct ext4_dir_entry_tail)))) + +/* * Ext4 directory file types. Only the low 3 bits are used. The * other bits are reserved for now. */ @@ -1533,6 +1671,8 @@ struct ext4_dir_entry_2 { #define EXT4_FT_MAX 8 +#define EXT4_FT_DIR_CSUM 0xDE + /* * EXT4_DIR_PAD defines the directory entries boundaries * @@ -1601,6 +1741,27 @@ static inline __le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize) #define DX_HASH_HALF_MD4_UNSIGNED 4 #define DX_HASH_TEA_UNSIGNED 5 +static inline u32 ext4_chksum(struct ext4_sb_info *sbi, u32 crc, + const void *address, unsigned int length) +{ + struct { + struct shash_desc shash; + char ctx[4]; + } desc; + int err; + + BUG_ON(crypto_shash_descsize(sbi->s_chksum_driver)!=sizeof(desc.ctx)); + + desc.shash.tfm = sbi->s_chksum_driver; + desc.shash.flags = 0; + *(u32 *)desc.ctx = crc; + + err = crypto_shash_update(&desc.shash, address, length); + BUG_ON(err); + + return *(u32 *)desc.ctx; +} + #ifdef __KERNEL__ /* hash info structure used by the directory hash */ @@ -1612,7 +1773,11 @@ struct dx_hash_info u32 *seed; }; -#define EXT4_HTREE_EOF 0x7fffffff + +/* 32 and 64 bit signed EOF for dx directories */ +#define EXT4_HTREE_EOF_32BIT ((1UL << (32 - 1)) - 1) +#define EXT4_HTREE_EOF_64BIT ((1ULL << (64 - 1)) - 1) + /* * Control parameters used by ext4_htree_next_block @@ -1663,9 +1828,6 @@ ext4_group_first_block_no(struct super_block *sb, ext4_group_t group_no) */ #define ERR_BAD_DX_DIR -75000 -void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr, - ext4_group_t *blockgrpp, ext4_grpblk_t *offsetp); - /* * Timeout and state flag for lazy initialization inode thread. */ @@ -1729,7 +1891,8 @@ struct mmp_struct { __le16 mmp_check_interval; __le16 mmp_pad1; - __le32 mmp_pad2[227]; + __le32 mmp_pad2[226]; + __le32 mmp_checksum; /* crc32c(uuid+mmp_block) */ }; /* arguments passed to the mmp thread */ @@ -1771,9 +1934,28 @@ struct mmpd_data { # define NORET_AND noreturn, /* bitmap.c */ -extern unsigned int ext4_count_free(struct buffer_head *, unsigned); +extern unsigned int ext4_count_free(char *bitmap, unsigned numchars); +void ext4_inode_bitmap_csum_set(struct super_block *sb, ext4_group_t group, + struct ext4_group_desc *gdp, + struct buffer_head *bh, int sz); +int ext4_inode_bitmap_csum_verify(struct super_block *sb, ext4_group_t group, + struct ext4_group_desc *gdp, + struct buffer_head *bh, int sz); +void ext4_block_bitmap_csum_set(struct super_block *sb, ext4_group_t group, + struct ext4_group_desc *gdp, + struct buffer_head *bh); +int ext4_block_bitmap_csum_verify(struct super_block *sb, ext4_group_t group, + struct ext4_group_desc *gdp, + struct buffer_head *bh); /* balloc.c */ +extern void ext4_get_group_no_and_offset(struct super_block *sb, + ext4_fsblk_t blocknr, + ext4_group_t *blockgrpp, + ext4_grpblk_t *offsetp); +extern ext4_group_t ext4_get_group_number(struct super_block *sb, + ext4_fsblk_t block); + extern unsigned int ext4_block_group(struct super_block *sb, ext4_fsblk_t blocknr); extern ext4_grpblk_t ext4_block_group_offset(struct super_block *sb, @@ -1794,45 +1976,82 @@ extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb, ext4_group_t block_group, struct buffer_head ** bh); extern int ext4_should_retry_alloc(struct super_block *sb, int *retries); -struct buffer_head *ext4_read_block_bitmap(struct super_block *sb, - ext4_group_t block_group); -extern void ext4_init_block_bitmap(struct super_block *sb, - struct buffer_head *bh, - ext4_group_t group, - struct ext4_group_desc *desc); + +extern struct buffer_head *ext4_read_block_bitmap_nowait(struct super_block *sb, + ext4_group_t block_group); +extern int ext4_wait_block_bitmap(struct super_block *sb, + ext4_group_t block_group, + struct buffer_head *bh); +extern struct buffer_head *ext4_read_block_bitmap(struct super_block *sb, + ext4_group_t block_group); extern unsigned ext4_free_clusters_after_init(struct super_block *sb, ext4_group_t block_group, struct ext4_group_desc *gdp); -extern unsigned ext4_num_overhead_clusters(struct super_block *sb, - ext4_group_t block_group, - struct ext4_group_desc *gdp); ext4_fsblk_t ext4_inode_to_goal_block(struct inode *); /* dir.c */ extern int __ext4_check_dir_entry(const char *, unsigned int, struct inode *, struct file *, struct ext4_dir_entry_2 *, - struct buffer_head *, unsigned int); -#define ext4_check_dir_entry(dir, filp, de, bh, offset) \ + struct buffer_head *, char *, int, + unsigned int); +#define ext4_check_dir_entry(dir, filp, de, bh, buf, size, offset) \ unlikely(__ext4_check_dir_entry(__func__, __LINE__, (dir), (filp), \ - (de), (bh), (offset))) + (de), (bh), (buf), (size), (offset))) extern int ext4_htree_store_dirent(struct file *dir_file, __u32 hash, __u32 minor_hash, struct ext4_dir_entry_2 *dirent); extern void ext4_htree_free_dir_info(struct dir_private_info *p); +extern int ext4_find_dest_de(struct inode *dir, struct inode *inode, + struct buffer_head *bh, + void *buf, int buf_size, + const char *name, int namelen, + struct ext4_dir_entry_2 **dest_de); +void ext4_insert_dentry(struct inode *inode, + struct ext4_dir_entry_2 *de, + int buf_size, + const char *name, int namelen); +static inline void ext4_update_dx_flag(struct inode *inode) +{ + if (!EXT4_HAS_COMPAT_FEATURE(inode->i_sb, + EXT4_FEATURE_COMPAT_DIR_INDEX)) + ext4_clear_inode_flag(inode, EXT4_INODE_INDEX); +} +static unsigned char ext4_filetype_table[] = { + DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK +}; + +static inline unsigned char get_dtype(struct super_block *sb, int filetype) +{ + if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FILETYPE) || + (filetype >= EXT4_FT_MAX)) + return DT_UNKNOWN; + + return ext4_filetype_table[filetype]; +} /* fsync.c */ extern int ext4_sync_file(struct file *, loff_t, loff_t, int); -extern int ext4_flush_completed_IO(struct inode *); /* hash.c */ extern int ext4fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo); /* ialloc.c */ -extern struct inode *ext4_new_inode(handle_t *, struct inode *, umode_t, - const struct qstr *qstr, __u32 goal, - uid_t *owner); +extern struct inode *__ext4_new_inode(handle_t *, struct inode *, umode_t, + const struct qstr *qstr, __u32 goal, + uid_t *owner, int handle_type, + unsigned int line_no, int nblocks); + +#define ext4_new_inode(handle, dir, mode, qstr, goal, owner) \ + __ext4_new_inode((handle), (dir), (mode), (qstr), (goal), (owner), \ + 0, 0, 0) +#define ext4_new_inode_start_handle(dir, mode, qstr, goal, owner, \ + type, nblocks) \ + __ext4_new_inode(NULL, (dir), (mode), (qstr), (goal), (owner), \ + (type), __LINE__, (nblocks)) + + extern void ext4_free_inode(handle_t *, struct inode *); extern struct inode * ext4_orphan_get(struct super_block *, unsigned long); extern unsigned long ext4_count_free_inodes(struct super_block *); @@ -1841,11 +2060,12 @@ extern void ext4_check_inodes_bitmap(struct super_block *); extern void ext4_mark_bitmap_end(int start_bit, int end_bit, char *bitmap); extern int ext4_init_inode_table(struct super_block *sb, ext4_group_t group, int barrier); +extern void ext4_end_bitmap_read(struct buffer_head *bh, int uptodate); /* mballoc.c */ extern long ext4_mb_stats; extern long ext4_mb_max_to_scan; -extern int ext4_mb_init(struct super_block *, int); +extern int ext4_mb_init(struct super_block *); extern int ext4_mb_release(struct super_block *); extern ext4_fsblk_t ext4_mb_new_blocks(handle_t *, struct ext4_allocation_request *, int *); @@ -1856,6 +2076,8 @@ extern void ext4_exit_mballoc(void); extern void ext4_free_blocks(handle_t *handle, struct inode *inode, struct buffer_head *bh, ext4_fsblk_t block, unsigned long count, int flags); +extern int ext4_mb_alloc_groupinfo(struct super_block *sb, + ext4_group_t ngroups); extern int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t i, struct ext4_group_desc *desc); extern int ext4_group_add_blocks(handle_t *handle, struct super_block *sb, @@ -1867,8 +2089,23 @@ struct buffer_head *ext4_getblk(handle_t *, struct inode *, ext4_lblk_t, int, int *); struct buffer_head *ext4_bread(handle_t *, struct inode *, ext4_lblk_t, int, int *); +int ext4_get_block_write(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create); int ext4_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create); +int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, + struct buffer_head *bh, int create); +int ext4_walk_page_buffers(handle_t *handle, + struct buffer_head *head, + unsigned from, + unsigned to, + int *partial, + int (*fn)(handle_t *handle, + struct buffer_head *bh)); +int do_journal_get_write_access(handle_t *handle, + struct buffer_head *bh); +#define FALL_BACK_TO_NONDELALLOC 1 +#define CONVERT_INLINE_DATA 2 extern struct inode *ext4_iget(struct super_block *, unsigned long); extern int ext4_write_inode(struct inode *, struct writeback_control *); @@ -1881,9 +2118,10 @@ extern int ext4_sync_inode(handle_t *, struct inode *); extern void ext4_dirty_inode(struct inode *, int); extern int ext4_change_inode_journal_flag(struct inode *, int); extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *); +extern int ext4_inode_attach_jinode(struct inode *inode); extern int ext4_can_truncate(struct inode *inode); extern void ext4_truncate(struct inode *); -extern int ext4_punch_hole(struct file *file, loff_t offset, loff_t length); +extern int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length); extern int ext4_truncate_restart_trans(handle_t *, struct inode *, int nblocks); extern void ext4_set_inode_flags(struct inode *); extern void ext4_get_inode_flags(struct ext4_inode_info *); @@ -1891,9 +2129,8 @@ extern int ext4_alloc_da_blocks(struct inode *inode); extern void ext4_set_aops(struct inode *inode); extern int ext4_writepage_trans_blocks(struct inode *); extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks); -extern int ext4_discard_partial_page_buffers(handle_t *handle, - struct address_space *mapping, loff_t from, - loff_t length, int flags); +extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode, + loff_t lstart, loff_t lend); extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); extern qsize_t *ext4_get_reserved_space(struct inode *inode); extern void ext4_da_update_reserve_space(struct inode *inode, @@ -1903,11 +2140,12 @@ extern void ext4_da_update_reserve_space(struct inode *inode, extern int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, int flags); extern ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb, - const struct iovec *iov, loff_t offset, - unsigned long nr_segs); + struct iov_iter *iter, loff_t offset); extern int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock); -extern int ext4_ind_trans_blocks(struct inode *inode, int nrblocks, int chunk); -extern void ext4_ind_truncate(struct inode *inode); +extern int ext4_ind_trans_blocks(struct inode *inode, int nrblocks); +extern void ext4_ind_truncate(handle_t *, struct inode *inode); +extern int ext4_free_hole_blocks(handle_t *handle, struct inode *inode, + ext4_lblk_t first, ext4_lblk_t stop); /* ioctl.c */ extern long ext4_ioctl(struct file *, unsigned int, unsigned long); @@ -1915,12 +2153,29 @@ extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long); /* migrate.c */ extern int ext4_ext_migrate(struct inode *); +extern int ext4_ind_migrate(struct inode *inode); /* namei.c */ +extern int ext4_dirent_csum_verify(struct inode *inode, + struct ext4_dir_entry *dirent); extern int ext4_orphan_add(handle_t *, struct inode *); extern int ext4_orphan_del(handle_t *, struct inode *); extern int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash, __u32 start_minor_hash, __u32 *next_hash); +extern int search_dir(struct buffer_head *bh, + char *search_buf, + int buf_size, + struct inode *dir, + const struct qstr *d_name, + unsigned int offset, + struct ext4_dir_entry_2 **res_dir); +extern int ext4_generic_delete_entry(handle_t *handle, + struct inode *dir, + struct ext4_dir_entry_2 *de_del, + struct buffer_head *bh, + void *entry_buf, + int buf_size, + int csum_size); /* resize.c */ extern int ext4_group_add(struct super_block *sb, @@ -1931,45 +2186,105 @@ extern int ext4_group_extend(struct super_block *sb, extern int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count); /* super.c */ +extern int ext4_calculate_overhead(struct super_block *sb); +extern void ext4_superblock_csum_set(struct super_block *sb); extern void *ext4_kvmalloc(size_t size, gfp_t flags); extern void *ext4_kvzalloc(size_t size, gfp_t flags); extern void ext4_kvfree(void *ptr); +extern int ext4_alloc_flex_bg_array(struct super_block *sb, + ext4_group_t ngroup); +extern const char *ext4_decode_error(struct super_block *sb, int errno, + char nbuf[16]); + extern __printf(4, 5) void __ext4_error(struct super_block *, const char *, unsigned int, const char *, ...); -#define ext4_error(sb, message...) __ext4_error(sb, __func__, \ - __LINE__, ## message) extern __printf(5, 6) -void ext4_error_inode(struct inode *, const char *, unsigned int, ext4_fsblk_t, +void __ext4_error_inode(struct inode *, const char *, unsigned int, ext4_fsblk_t, const char *, ...); extern __printf(5, 6) -void ext4_error_file(struct file *, const char *, unsigned int, ext4_fsblk_t, +void __ext4_error_file(struct file *, const char *, unsigned int, ext4_fsblk_t, const char *, ...); extern void __ext4_std_error(struct super_block *, const char *, unsigned int, int); extern __printf(4, 5) void __ext4_abort(struct super_block *, const char *, unsigned int, const char *, ...); -#define ext4_abort(sb, message...) __ext4_abort(sb, __func__, \ - __LINE__, ## message) extern __printf(4, 5) void __ext4_warning(struct super_block *, const char *, unsigned int, const char *, ...); -#define ext4_warning(sb, message...) __ext4_warning(sb, __func__, \ - __LINE__, ## message) extern __printf(3, 4) -void ext4_msg(struct super_block *, const char *, const char *, ...); +void __ext4_msg(struct super_block *, const char *, const char *, ...); extern void __dump_mmp_msg(struct super_block *, struct mmp_struct *mmp, const char *, unsigned int, const char *); -#define dump_mmp_msg(sb, mmp, msg) __dump_mmp_msg(sb, mmp, __func__, \ - __LINE__, msg) extern __printf(7, 8) void __ext4_grp_locked_error(const char *, unsigned int, struct super_block *, ext4_group_t, unsigned long, ext4_fsblk_t, const char *, ...); -#define ext4_grp_locked_error(sb, grp, message...) \ - __ext4_grp_locked_error(__func__, __LINE__, (sb), (grp), ## message) + +#ifdef CONFIG_PRINTK + +#define ext4_error_inode(inode, func, line, block, fmt, ...) \ + __ext4_error_inode(inode, func, line, block, fmt, ##__VA_ARGS__) +#define ext4_error_file(file, func, line, block, fmt, ...) \ + __ext4_error_file(file, func, line, block, fmt, ##__VA_ARGS__) +#define ext4_error(sb, fmt, ...) \ + __ext4_error(sb, __func__, __LINE__, fmt, ##__VA_ARGS__) +#define ext4_abort(sb, fmt, ...) \ + __ext4_abort(sb, __func__, __LINE__, fmt, ##__VA_ARGS__) +#define ext4_warning(sb, fmt, ...) \ + __ext4_warning(sb, __func__, __LINE__, fmt, ##__VA_ARGS__) +#define ext4_msg(sb, level, fmt, ...) \ + __ext4_msg(sb, level, fmt, ##__VA_ARGS__) +#define dump_mmp_msg(sb, mmp, msg) \ + __dump_mmp_msg(sb, mmp, __func__, __LINE__, msg) +#define ext4_grp_locked_error(sb, grp, ino, block, fmt, ...) \ + __ext4_grp_locked_error(__func__, __LINE__, sb, grp, ino, block, \ + fmt, ##__VA_ARGS__) + +#else + +#define ext4_error_inode(inode, func, line, block, fmt, ...) \ +do { \ + no_printk(fmt, ##__VA_ARGS__); \ + __ext4_error_inode(inode, "", 0, block, " "); \ +} while (0) +#define ext4_error_file(file, func, line, block, fmt, ...) \ +do { \ + no_printk(fmt, ##__VA_ARGS__); \ + __ext4_error_file(file, "", 0, block, " "); \ +} while (0) +#define ext4_error(sb, fmt, ...) \ +do { \ + no_printk(fmt, ##__VA_ARGS__); \ + __ext4_error(sb, "", 0, " "); \ +} while (0) +#define ext4_abort(sb, fmt, ...) \ +do { \ + no_printk(fmt, ##__VA_ARGS__); \ + __ext4_abort(sb, "", 0, " "); \ +} while (0) +#define ext4_warning(sb, fmt, ...) \ +do { \ + no_printk(fmt, ##__VA_ARGS__); \ + __ext4_warning(sb, "", 0, " "); \ +} while (0) +#define ext4_msg(sb, level, fmt, ...) \ +do { \ + no_printk(fmt, ##__VA_ARGS__); \ + __ext4_msg(sb, "", " "); \ +} while (0) +#define dump_mmp_msg(sb, mmp, msg) \ + __dump_mmp_msg(sb, mmp, "", 0, "") +#define ext4_grp_locked_error(sb, grp, ino, block, fmt, ...) \ +do { \ + no_printk(fmt, ##__VA_ARGS__); \ + __ext4_grp_locked_error("", 0, sb, grp, ino, block, " "); \ +} while (0) + +#endif + extern void ext4_update_dynamic_rev(struct super_block *sb); extern int ext4_update_compat_feature(handle_t *handle, struct super_block *sb, __u32 compat); @@ -2006,10 +2321,19 @@ extern void ext4_used_dirs_set(struct super_block *sb, struct ext4_group_desc *bg, __u32 count); extern void ext4_itable_unused_set(struct super_block *sb, struct ext4_group_desc *bg, __u32 count); -extern __le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 group, - struct ext4_group_desc *gdp); -extern int ext4_group_desc_csum_verify(struct ext4_sb_info *sbi, __u32 group, +extern int ext4_group_desc_csum_verify(struct super_block *sb, __u32 group, struct ext4_group_desc *gdp); +extern void ext4_group_desc_csum_set(struct super_block *sb, __u32 group, + struct ext4_group_desc *gdp); +extern int ext4_register_li_request(struct super_block *sb, + ext4_group_t first_not_zeroed); + +static inline int ext4_has_group_desc_csum(struct super_block *sb) +{ + return EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_GDT_CSUM | + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM); +} static inline ext4_fsblk_t ext4_blocks_count(struct ext4_super_block *es) { @@ -2071,6 +2395,7 @@ struct ext4_group_info *ext4_get_group_info(struct super_block *sb, { struct ext4_group_info ***grp_info; long indexv, indexh; + BUG_ON(group >= EXT4_SB(sb)->s_groups_count); grp_info = EXT4_SB(sb)->s_group_info; indexv = group >> (EXT4_DESC_PER_BLOCK_BITS(sb)); indexh = group & ((EXT4_DESC_PER_BLOCK(sb)) - 1); @@ -2117,16 +2442,15 @@ do { \ #define EXT4_FREECLUSTERS_WATERMARK 0 #endif +/* Update i_disksize. Requires i_mutex to avoid races with truncate */ static inline void ext4_update_i_disksize(struct inode *inode, loff_t newsize) { - /* - * XXX: replace with spinlock if seen contended -bzzz - */ + WARN_ON_ONCE(S_ISREG(inode->i_mode) && + !mutex_is_locked(&inode->i_mutex)); down_write(&EXT4_I(inode)->i_data_sem); if (newsize > EXT4_I(inode)->i_disksize) EXT4_I(inode)->i_disksize = newsize; up_write(&EXT4_I(inode)->i_data_sem); - return ; } struct ext4_group_info { @@ -2149,9 +2473,15 @@ struct ext4_group_info { #define EXT4_GROUP_INFO_NEED_INIT_BIT 0 #define EXT4_GROUP_INFO_WAS_TRIMMED_BIT 1 +#define EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT 2 +#define EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT 3 #define EXT4_MB_GRP_NEED_INIT(grp) \ (test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state))) +#define EXT4_MB_GRP_BBITMAP_CORRUPT(grp) \ + (test_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &((grp)->bb_state))) +#define EXT4_MB_GRP_IBITMAP_CORRUPT(grp) \ + (test_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &((grp)->bb_state))) #define EXT4_MB_GRP_WAS_TRIMMED(grp) \ (test_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state))) @@ -2204,12 +2534,6 @@ static inline void ext4_unlock_group(struct super_block *sb, spin_unlock(ext4_group_lock_ptr(sb, group)); } -static inline void ext4_mark_super_dirty(struct super_block *sb) -{ - if (EXT4_SB(sb)->s_journal == NULL) - sb->s_dirt =1; -} - /* * Block validity checking */ @@ -2235,10 +2559,104 @@ extern const struct inode_operations ext4_file_inode_operations; extern const struct file_operations ext4_file_operations; extern loff_t ext4_llseek(struct file *file, loff_t offset, int origin); +/* inline.c */ +extern int ext4_has_inline_data(struct inode *inode); +extern int ext4_get_max_inline_size(struct inode *inode); +extern int ext4_find_inline_data_nolock(struct inode *inode); +extern int ext4_init_inline_data(handle_t *handle, struct inode *inode, + unsigned int len); +extern int ext4_destroy_inline_data(handle_t *handle, struct inode *inode); + +extern int ext4_readpage_inline(struct inode *inode, struct page *page); +extern int ext4_try_to_write_inline_data(struct address_space *mapping, + struct inode *inode, + loff_t pos, unsigned len, + unsigned flags, + struct page **pagep); +extern int ext4_write_inline_data_end(struct inode *inode, + loff_t pos, unsigned len, + unsigned copied, + struct page *page); +extern struct buffer_head * +ext4_journalled_write_inline_data(struct inode *inode, + unsigned len, + struct page *page); +extern int ext4_da_write_inline_data_begin(struct address_space *mapping, + struct inode *inode, + loff_t pos, unsigned len, + unsigned flags, + struct page **pagep, + void **fsdata); +extern int ext4_da_write_inline_data_end(struct inode *inode, loff_t pos, + unsigned len, unsigned copied, + struct page *page); +extern int ext4_try_add_inline_entry(handle_t *handle, struct dentry *dentry, + struct inode *inode); +extern int ext4_try_create_inline_dir(handle_t *handle, + struct inode *parent, + struct inode *inode); +extern int ext4_read_inline_dir(struct file *filp, + struct dir_context *ctx, + int *has_inline_data); +extern int htree_inlinedir_to_tree(struct file *dir_file, + struct inode *dir, ext4_lblk_t block, + struct dx_hash_info *hinfo, + __u32 start_hash, __u32 start_minor_hash, + int *has_inline_data); +extern struct buffer_head *ext4_find_inline_entry(struct inode *dir, + const struct qstr *d_name, + struct ext4_dir_entry_2 **res_dir, + int *has_inline_data); +extern int ext4_delete_inline_entry(handle_t *handle, + struct inode *dir, + struct ext4_dir_entry_2 *de_del, + struct buffer_head *bh, + int *has_inline_data); +extern int empty_inline_dir(struct inode *dir, int *has_inline_data); +extern struct buffer_head *ext4_get_first_inline_block(struct inode *inode, + struct ext4_dir_entry_2 **parent_de, + int *retval); +extern int ext4_inline_data_fiemap(struct inode *inode, + struct fiemap_extent_info *fieinfo, + int *has_inline); +extern int ext4_try_to_evict_inline_data(handle_t *handle, + struct inode *inode, + int needed); +extern void ext4_inline_data_truncate(struct inode *inode, int *has_inline); + +extern int ext4_convert_inline_data(struct inode *inode); + /* namei.c */ extern const struct inode_operations ext4_dir_inode_operations; extern const struct inode_operations ext4_special_inode_operations; extern struct dentry *ext4_get_parent(struct dentry *child); +extern struct ext4_dir_entry_2 *ext4_init_dot_dotdot(struct inode *inode, + struct ext4_dir_entry_2 *de, + int blocksize, int csum_size, + unsigned int parent_ino, int dotdot_real_len); +extern void initialize_dirent_tail(struct ext4_dir_entry_tail *t, + unsigned int blocksize); +extern int ext4_handle_dirty_dirent_node(handle_t *handle, + struct inode *inode, + struct buffer_head *bh); +#define S_SHIFT 12 +static unsigned char ext4_type_by_mode[S_IFMT >> S_SHIFT] = { + [S_IFREG >> S_SHIFT] = EXT4_FT_REG_FILE, + [S_IFDIR >> S_SHIFT] = EXT4_FT_DIR, + [S_IFCHR >> S_SHIFT] = EXT4_FT_CHRDEV, + [S_IFBLK >> S_SHIFT] = EXT4_FT_BLKDEV, + [S_IFIFO >> S_SHIFT] = EXT4_FT_FIFO, + [S_IFSOCK >> S_SHIFT] = EXT4_FT_SOCK, + [S_IFLNK >> S_SHIFT] = EXT4_FT_SYMLINK, +}; + +static inline void ext4_set_de_type(struct super_block *sb, + struct ext4_dir_entry_2 *de, + umode_t mode) { + if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FILETYPE)) + de->file_type = ext4_type_by_mode[(mode & S_IFMT)>>S_SHIFT]; +} + /* symlink.c */ extern const struct inode_operations ext4_symlink_inode_operations; @@ -2256,66 +2674,100 @@ extern int ext4_check_blockref(const char *, unsigned int, struct inode *, __le32 *, unsigned int); /* extents.c */ +struct ext4_ext_path; +struct ext4_extent; + +/* + * Maximum number of logical blocks in a file; ext4_extent's ee_block is + * __le32. + */ +#define EXT_MAX_BLOCKS 0xffffffff + extern int ext4_ext_tree_init(handle_t *handle, struct inode *); extern int ext4_ext_writepage_trans_blocks(struct inode *, int); -extern int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, - int chunk); +extern int ext4_ext_index_trans_blocks(struct inode *inode, int extents); extern int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, int flags); -extern void ext4_ext_truncate(struct inode *); -extern int ext4_ext_punch_hole(struct file *file, loff_t offset, - loff_t length); +extern void ext4_ext_truncate(handle_t *, struct inode *); +extern int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start, + ext4_lblk_t end); extern void ext4_ext_init(struct super_block *); extern void ext4_ext_release(struct super_block *); extern long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len); -extern int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset, - ssize_t len); +extern int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode, + loff_t offset, ssize_t len); extern int ext4_map_blocks(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, int flags); +extern int ext4_ext_calc_metadata_amount(struct inode *inode, + ext4_lblk_t lblocks); +extern int ext4_extent_tree_init(handle_t *, struct inode *); +extern int ext4_ext_calc_credits_for_single_extent(struct inode *inode, + int num, + struct ext4_ext_path *path); +extern int ext4_can_extents_be_merged(struct inode *inode, + struct ext4_extent *ex1, + struct ext4_extent *ex2); +extern int ext4_ext_insert_extent(handle_t *, struct inode *, + struct ext4_ext_path *, + struct ext4_extent *, int); +extern struct ext4_ext_path *ext4_ext_find_extent(struct inode *, ext4_lblk_t, + struct ext4_ext_path *, + int flags); +extern void ext4_ext_drop_refs(struct ext4_ext_path *); +extern int ext4_ext_check_inode(struct inode *inode); +extern int ext4_find_delalloc_range(struct inode *inode, + ext4_lblk_t lblk_start, + ext4_lblk_t lblk_end); +extern int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk); extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len); +extern int ext4_ext_precache(struct inode *inode); +extern int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len); + /* move_extent.c */ +extern void ext4_double_down_write_data_sem(struct inode *first, + struct inode *second); +extern void ext4_double_up_write_data_sem(struct inode *orig_inode, + struct inode *donor_inode); extern int ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 start_orig, __u64 start_donor, __u64 len, __u64 *moved_len); +extern int mext_next_extent(struct inode *inode, struct ext4_ext_path *path, + struct ext4_extent **extent); /* page-io.c */ extern int __init ext4_init_pageio(void); extern void ext4_exit_pageio(void); -extern void ext4_ioend_wait(struct inode *); -extern void ext4_free_io_end(ext4_io_end_t *io); extern ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags); -extern int ext4_end_io_nolock(ext4_io_end_t *io); +extern ext4_io_end_t *ext4_get_io_end(ext4_io_end_t *io_end); +extern int ext4_put_io_end(ext4_io_end_t *io_end); +extern void ext4_put_io_end_defer(ext4_io_end_t *io_end); +extern void ext4_io_submit_init(struct ext4_io_submit *io, + struct writeback_control *wbc); +extern void ext4_end_io_rsv_work(struct work_struct *work); extern void ext4_io_submit(struct ext4_io_submit *io); extern int ext4_bio_write_page(struct ext4_io_submit *io, struct page *page, int len, - struct writeback_control *wbc); + struct writeback_control *wbc, + bool keep_towrite); /* mmp.c */ extern int ext4_multi_mount_protect(struct super_block *, ext4_fsblk_t); -/* BH_Uninit flag: blocks are allocated but uninitialized on disk */ +/* + * Note that these flags will never ever appear in a buffer_head's state flag. + * See EXT4_MAP_... to see where this is used. + */ enum ext4_state_bits { - BH_Uninit /* blocks are allocated but uninitialized on disk */ - = BH_JBDPrivateStart, - BH_AllocFromCluster, /* allocated blocks were part of already - * allocated cluster. Note that this flag will - * never, ever appear in a buffer_head's state - * flag. See EXT4_MAP_FROM_CLUSTER to see where - * this is used. */ - BH_Da_Mapped, /* Delayed allocated block that now has a mapping. This - * flag is set when ext4_map_blocks is called on a - * delayed allocated block to get its real mapping. */ + BH_AllocFromCluster /* allocated blocks were part of already + * allocated cluster. */ + = BH_JBDPrivateStart }; -BUFFER_FNS(Uninit, uninit) -TAS_BUFFER_FNS(Uninit, uninit) -BUFFER_FNS(Da_Mapped, da_mapped) - /* - * Add new method to test wether block and inode bitmaps are properly + * Add new method to test whether block and inode bitmaps are properly * initialized. With uninit_bg reading the block from disk is not enough * to mark the bitmap uptodate. We need to also zero-out the bitmap */ @@ -2331,6 +2783,21 @@ static inline void set_bitmap_uptodate(struct buffer_head *bh) set_bit(BH_BITMAP_UPTODATE, &(bh)->b_state); } +/* + * Disable DIO read nolock optimization, so new dioreaders will be forced + * to grab i_mutex + */ +static inline void ext4_inode_block_unlocked_dio(struct inode *inode) +{ + ext4_set_inode_state(inode, EXT4_STATE_DIOREAD_LOCK); + smp_mb(); +} +static inline void ext4_inode_resume_unlocked_dio(struct inode *inode) +{ + smp_mb(); + ext4_clear_inode_state(inode, EXT4_STATE_DIOREAD_LOCK); +} + #define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1) /* For ioend & aio unwritten conversion wait queues */ @@ -2348,6 +2815,4 @@ extern void ext4_resize_end(struct super_block *sb); #endif /* __KERNEL__ */ -#include "ext4_extents.h" - #endif /* _EXT4_H */ diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h index a52db3a69a3..a867f5ca999 100644 --- a/fs/ext4/ext4_extents.h +++ b/fs/ext4/ext4_extents.h @@ -43,16 +43,6 @@ #define CHECK_BINSEARCH__ /* - * Turn on EXT_DEBUG to get lots of info about extents operations. - */ -#define EXT_DEBUG__ -#ifdef EXT_DEBUG -#define ext_debug(a...) printk(a) -#else -#define ext_debug(a...) -#endif - -/* * If EXT_STATS is defined then stats numbers are collected. * These number will be displayed at umount time. */ @@ -63,9 +53,22 @@ * ext4_inode has i_block array (60 bytes total). * The first 12 bytes store ext4_extent_header; * the remainder stores an array of ext4_extent. + * For non-inode extent blocks, ext4_extent_tail + * follows the array. */ /* + * This is the extent tail on-disk structure. + * All other extent structures are 12 bytes long. It turns out that + * block_size % 12 >= 4 for at least all powers of 2 greater than 512, which + * covers all valid ext4 block sizes. Therefore, this tail structure can be + * crammed into the end of the block without having to rebalance the tree. + */ +struct ext4_extent_tail { + __le32 et_checksum; /* crc32c(uuid+inum+extent_block) */ +}; + +/* * This is the extent on-disk structure. * It's used at the bottom of the tree. */ @@ -101,6 +104,17 @@ struct ext4_extent_header { #define EXT4_EXT_MAGIC cpu_to_le16(0xf30a) +#define EXT4_EXTENT_TAIL_OFFSET(hdr) \ + (sizeof(struct ext4_extent_header) + \ + (sizeof(struct ext4_extent) * le16_to_cpu((hdr)->eh_max))) + +static inline struct ext4_extent_tail * +find_ext4_extent_tail(struct ext4_extent_header *eh) +{ + return (struct ext4_extent_tail *)(((void *)eh) + + EXT4_EXTENT_TAIL_OFFSET(eh)); +} + /* * Array of ext4_ext_path contains path to some extent. * Creation/lookup routines use it for traversal/splitting/etc. @@ -120,44 +134,24 @@ struct ext4_ext_path { */ /* - * to be called by ext4_ext_walk_space() - * negative retcode - error - * positive retcode - signal for ext4_ext_walk_space(), see below - * callback must return valid extent (passed or newly created) - */ -typedef int (*ext_prepare_callback)(struct inode *, ext4_lblk_t, - struct ext4_ext_cache *, - struct ext4_extent *, void *); - -#define EXT_CONTINUE 0 -#define EXT_BREAK 1 -#define EXT_REPEAT 2 - -/* - * Maximum number of logical blocks in a file; ext4_extent's ee_block is - * __le32. - */ -#define EXT_MAX_BLOCKS 0xffffffff - -/* * EXT_INIT_MAX_LEN is the maximum number of blocks we can have in an * initialized extent. This is 2^15 and not (2^16 - 1), since we use the * MSB of ee_len field in the extent datastructure to signify if this - * particular extent is an initialized extent or an uninitialized (i.e. + * particular extent is an initialized extent or an unwritten (i.e. * preallocated). - * EXT_UNINIT_MAX_LEN is the maximum number of blocks we can have in an - * uninitialized extent. + * EXT_UNWRITTEN_MAX_LEN is the maximum number of blocks we can have in an + * unwritten extent. * If ee_len is <= 0x8000, it is an initialized extent. Otherwise, it is an - * uninitialized one. In other words, if MSB of ee_len is set, it is an - * uninitialized extent with only one special scenario when ee_len = 0x8000. - * In this case we can not have an uninitialized extent of zero length and + * unwritten one. In other words, if MSB of ee_len is set, it is an + * unwritten extent with only one special scenario when ee_len = 0x8000. + * In this case we can not have an unwritten extent of zero length and * thus we make it as a special case of initialized extent with 0x8000 length. * This way we get better extent-to-group alignment for initialized extents. * Hence, the maximum number of blocks we can have in an *initialized* - * extent is 2^15 (32768) and in an *uninitialized* extent is 2^15-1 (32767). + * extent is 2^15 (32768) and in an *unwritten* extent is 2^15-1 (32767). */ #define EXT_INIT_MAX_LEN (1UL << 15) -#define EXT_UNINIT_MAX_LEN (EXT_INIT_MAX_LEN - 1) +#define EXT_UNWRITTEN_MAX_LEN (EXT_INIT_MAX_LEN - 1) #define EXT_FIRST_EXTENT(__hdr__) \ @@ -193,20 +187,14 @@ static inline unsigned short ext_depth(struct inode *inode) return le16_to_cpu(ext_inode_hdr(inode)->eh_depth); } -static inline void -ext4_ext_invalidate_cache(struct inode *inode) +static inline void ext4_ext_mark_unwritten(struct ext4_extent *ext) { - EXT4_I(inode)->i_cached_extent.ec_len = 0; -} - -static inline void ext4_ext_mark_uninitialized(struct ext4_extent *ext) -{ - /* We can not have an uninitialized extent of zero length! */ + /* We can not have an unwritten extent of zero length! */ BUG_ON((le16_to_cpu(ext->ee_len) & ~EXT_INIT_MAX_LEN) == 0); ext->ee_len |= cpu_to_le16(EXT_INIT_MAX_LEN); } -static inline int ext4_ext_is_uninitialized(struct ext4_extent *ext) +static inline int ext4_ext_is_unwritten(struct ext4_extent *ext) { /* Extent with ee_len of 0x8000 is treated as an initialized extent */ return (le16_to_cpu(ext->ee_len) > EXT_INIT_MAX_LEN); @@ -276,21 +264,10 @@ static inline void ext4_idx_store_pblock(struct ext4_extent_idx *ix, 0xffff); } -extern int ext4_ext_calc_metadata_amount(struct inode *inode, - ext4_lblk_t lblocks); -extern int ext4_extent_tree_init(handle_t *, struct inode *); -extern int ext4_ext_calc_credits_for_single_extent(struct inode *inode, - int num, - struct ext4_ext_path *path); -extern int ext4_can_extents_be_merged(struct inode *inode, - struct ext4_extent *ex1, - struct ext4_extent *ex2); -extern int ext4_ext_insert_extent(handle_t *, struct inode *, struct ext4_ext_path *, struct ext4_extent *, int); -extern struct ext4_ext_path *ext4_ext_find_extent(struct inode *, ext4_lblk_t, - struct ext4_ext_path *); -extern void ext4_ext_drop_refs(struct ext4_ext_path *); -extern int ext4_ext_check_inode(struct inode *inode); -extern int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk, - int search_hint_reverse); +#define ext4_ext_dirty(handle, inode, path) \ + __ext4_ext_dirty(__func__, __LINE__, (handle), (inode), (path)) +int __ext4_ext_dirty(const char *where, unsigned int line, handle_t *handle, + struct inode *inode, struct ext4_ext_path *path); + #endif /* _EXT4_EXTENTS */ diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c index aca17901758..0074e0d23d6 100644 --- a/fs/ext4/ext4_jbd2.c +++ b/fs/ext4/ext4_jbd2.c @@ -6,11 +6,154 @@ #include <trace/events/ext4.h> +/* Just increment the non-pointer handle value */ +static handle_t *ext4_get_nojournal(void) +{ + handle_t *handle = current->journal_info; + unsigned long ref_cnt = (unsigned long)handle; + + BUG_ON(ref_cnt >= EXT4_NOJOURNAL_MAX_REF_COUNT); + + ref_cnt++; + handle = (handle_t *)ref_cnt; + + current->journal_info = handle; + return handle; +} + + +/* Decrement the non-pointer handle value */ +static void ext4_put_nojournal(handle_t *handle) +{ + unsigned long ref_cnt = (unsigned long)handle; + + BUG_ON(ref_cnt == 0); + + ref_cnt--; + handle = (handle_t *)ref_cnt; + + current->journal_info = handle; +} + +/* + * Wrappers for jbd2_journal_start/end. + */ +static int ext4_journal_check_start(struct super_block *sb) +{ + journal_t *journal; + + might_sleep(); + if (sb->s_flags & MS_RDONLY) + return -EROFS; + WARN_ON(sb->s_writers.frozen == SB_FREEZE_COMPLETE); + journal = EXT4_SB(sb)->s_journal; + /* + * Special case here: if the journal has aborted behind our + * backs (eg. EIO in the commit thread), then we still need to + * take the FS itself readonly cleanly. + */ + if (journal && is_journal_aborted(journal)) { + ext4_abort(sb, "Detected aborted journal"); + return -EROFS; + } + return 0; +} + +handle_t *__ext4_journal_start_sb(struct super_block *sb, unsigned int line, + int type, int blocks, int rsv_blocks) +{ + journal_t *journal; + int err; + + trace_ext4_journal_start(sb, blocks, rsv_blocks, _RET_IP_); + err = ext4_journal_check_start(sb); + if (err < 0) + return ERR_PTR(err); + + journal = EXT4_SB(sb)->s_journal; + if (!journal) + return ext4_get_nojournal(); + return jbd2__journal_start(journal, blocks, rsv_blocks, GFP_NOFS, + type, line); +} + +int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle) +{ + struct super_block *sb; + int err; + int rc; + + if (!ext4_handle_valid(handle)) { + ext4_put_nojournal(handle); + return 0; + } + sb = handle->h_transaction->t_journal->j_private; + err = handle->h_err; + rc = jbd2_journal_stop(handle); + + if (!err) + err = rc; + if (err) + __ext4_std_error(sb, where, line, err); + return err; +} + +handle_t *__ext4_journal_start_reserved(handle_t *handle, unsigned int line, + int type) +{ + struct super_block *sb; + int err; + + if (!ext4_handle_valid(handle)) + return ext4_get_nojournal(); + + sb = handle->h_journal->j_private; + trace_ext4_journal_start_reserved(sb, handle->h_buffer_credits, + _RET_IP_); + err = ext4_journal_check_start(sb); + if (err < 0) { + jbd2_journal_free_reserved(handle); + return ERR_PTR(err); + } + + err = jbd2_journal_start_reserved(handle, type, line); + if (err < 0) + return ERR_PTR(err); + return handle; +} + +static void ext4_journal_abort_handle(const char *caller, unsigned int line, + const char *err_fn, + struct buffer_head *bh, + handle_t *handle, int err) +{ + char nbuf[16]; + const char *errstr = ext4_decode_error(NULL, err, nbuf); + + BUG_ON(!ext4_handle_valid(handle)); + + if (bh) + BUFFER_TRACE(bh, "abort"); + + if (!handle->h_err) + handle->h_err = err; + + if (is_handle_aborted(handle)) + return; + + printk(KERN_ERR "EXT4-fs: %s:%d: aborting transaction: %s in %s\n", + caller, line, errstr, err_fn); + + jbd2_journal_abort_handle(handle); +} + int __ext4_journal_get_write_access(const char *where, unsigned int line, handle_t *handle, struct buffer_head *bh) { int err = 0; + might_sleep(); + if (ext4_handle_valid(handle)) { err = jbd2_journal_get_write_access(handle, bh); if (err) @@ -107,12 +250,35 @@ int __ext4_handle_dirty_metadata(const char *where, unsigned int line, { int err = 0; + might_sleep(); + + set_buffer_meta(bh); + set_buffer_prio(bh); if (ext4_handle_valid(handle)) { err = jbd2_journal_dirty_metadata(handle, bh); - if (err) { - /* Errors can only happen if there is a bug */ - handle->h_err = err; - __ext4_journal_stop(where, line, handle); + /* Errors can only happen if there is a bug */ + if (WARN_ON_ONCE(err)) { + ext4_journal_abort_handle(where, line, __func__, bh, + handle, err); + if (inode == NULL) { + pr_err("EXT4: jbd2_journal_dirty_metadata " + "failed: handle type %u started at " + "line %u, credits %u/%u, errcode %d", + handle->h_type, + handle->h_line_no, + handle->h_requested_credits, + handle->h_buffer_credits, err); + return err; + } + ext4_error_inode(inode, where, line, + bh->b_blocknr, + "journal_dirty_metadata failed: " + "handle type %u started at line %u, " + "credits %u/%u, errcode %d", + handle->h_type, + handle->h_line_no, + handle->h_requested_credits, + handle->h_buffer_credits, err); } } else { if (inode) @@ -143,12 +309,13 @@ int __ext4_handle_dirty_super(const char *where, unsigned int line, struct buffer_head *bh = EXT4_SB(sb)->s_sbh; int err = 0; + ext4_superblock_csum_set(sb); if (ext4_handle_valid(handle)) { err = jbd2_journal_dirty_metadata(handle, bh); if (err) ext4_journal_abort_handle(where, line, __func__, bh, handle, err); } else - sb->s_dirt = 1; + mark_buffer_dirty(bh); return err; } diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h index 5802fa1dab1..17c00ff202f 100644 --- a/fs/ext4/ext4_jbd2.h +++ b/fs/ext4/ext4_jbd2.h @@ -29,11 +29,13 @@ * block to complete the transaction. * * For extents-enabled fs we may have to allocate and modify up to - * 5 levels of tree + root which are stored in the inode. */ + * 5 levels of tree, data block (for each of these we need bitmap + group + * summaries), root which is stored in the inode, sb + */ #define EXT4_SINGLEDATA_TRANS_BLOCKS(sb) \ (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS) \ - ? 27U : 8U) + ? 20U : 8U) /* Extended attribute operations touch at most two data buffers, * two bitmap buffers, and two group summaries, in addition to the inode @@ -59,12 +61,6 @@ #define EXT4_META_TRANS_BLOCKS(sb) (EXT4_XATTR_TRANS_BLOCKS + \ EXT4_MAXQUOTAS_TRANS_BLOCKS(sb)) -/* Delete operations potentially hit one directory's namespace plus an - * entire inode, plus arbitrary amounts of bitmap/indirection data. Be - * generous. We can grow the delete transaction later if necessary. */ - -#define EXT4_DELETE_TRANS_BLOCKS(sb) (2 * EXT4_DATA_TRANS_BLOCKS(sb) + 64) - /* Define an arbitrary limit for the amount of data we will anticipate * writing to any given transaction. For unbounded transactions such as * write(2) and truncate(2) we can write more than this, but we always @@ -87,14 +83,20 @@ #ifdef CONFIG_QUOTA /* Amount of blocks needed for quota update - we know that the structure was * allocated so we need to update only data block */ -#define EXT4_QUOTA_TRANS_BLOCKS(sb) (test_opt(sb, QUOTA) ? 1 : 0) +#define EXT4_QUOTA_TRANS_BLOCKS(sb) ((test_opt(sb, QUOTA) ||\ + EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA)) ?\ + 1 : 0) /* Amount of blocks needed for quota insert/delete - we do some block writes * but inode, sb and group updates are done only once */ -#define EXT4_QUOTA_INIT_BLOCKS(sb) (test_opt(sb, QUOTA) ? (DQUOT_INIT_ALLOC*\ - (EXT4_SINGLEDATA_TRANS_BLOCKS(sb)-3)+3+DQUOT_INIT_REWRITE) : 0) - -#define EXT4_QUOTA_DEL_BLOCKS(sb) (test_opt(sb, QUOTA) ? (DQUOT_DEL_ALLOC*\ - (EXT4_SINGLEDATA_TRANS_BLOCKS(sb)-3)+3+DQUOT_DEL_REWRITE) : 0) +#define EXT4_QUOTA_INIT_BLOCKS(sb) ((test_opt(sb, QUOTA) ||\ + EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA)) ?\ + (DQUOT_INIT_ALLOC*(EXT4_SINGLEDATA_TRANS_BLOCKS(sb)-3)\ + +3+DQUOT_INIT_REWRITE) : 0) + +#define EXT4_QUOTA_DEL_BLOCKS(sb) ((test_opt(sb, QUOTA) ||\ + EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA)) ?\ + (DQUOT_DEL_ALLOC*(EXT4_SINGLEDATA_TRANS_BLOCKS(sb)-3)\ + +3+DQUOT_DEL_REWRITE) : 0) #else #define EXT4_QUOTA_TRANS_BLOCKS(sb) 0 #define EXT4_QUOTA_INIT_BLOCKS(sb) 0 @@ -104,6 +106,113 @@ #define EXT4_MAXQUOTAS_INIT_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_INIT_BLOCKS(sb)) #define EXT4_MAXQUOTAS_DEL_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_DEL_BLOCKS(sb)) +static inline int ext4_jbd2_credits_xattr(struct inode *inode) +{ + int credits = EXT4_DATA_TRANS_BLOCKS(inode->i_sb); + + /* + * In case of inline data, we may push out the data to a block, + * so we need to reserve credits for this eventuality + */ + if (ext4_has_inline_data(inode)) + credits += ext4_writepage_trans_blocks(inode) + 1; + return credits; +} + + +/* + * Ext4 handle operation types -- for logging purposes + */ +#define EXT4_HT_MISC 0 +#define EXT4_HT_INODE 1 +#define EXT4_HT_WRITE_PAGE 2 +#define EXT4_HT_MAP_BLOCKS 3 +#define EXT4_HT_DIR 4 +#define EXT4_HT_TRUNCATE 5 +#define EXT4_HT_QUOTA 6 +#define EXT4_HT_RESIZE 7 +#define EXT4_HT_MIGRATE 8 +#define EXT4_HT_MOVE_EXTENTS 9 +#define EXT4_HT_XATTR 10 +#define EXT4_HT_EXT_CONVERT 11 +#define EXT4_HT_MAX 12 + +/** + * struct ext4_journal_cb_entry - Base structure for callback information. + * + * This struct is a 'seed' structure for a using with your own callback + * structs. If you are using callbacks you must allocate one of these + * or another struct of your own definition which has this struct + * as it's first element and pass it to ext4_journal_callback_add(). + */ +struct ext4_journal_cb_entry { + /* list information for other callbacks attached to the same handle */ + struct list_head jce_list; + + /* Function to call with this callback structure */ + void (*jce_func)(struct super_block *sb, + struct ext4_journal_cb_entry *jce, int error); + + /* user data goes here */ +}; + +/** + * ext4_journal_callback_add: add a function to call after transaction commit + * @handle: active journal transaction handle to register callback on + * @func: callback function to call after the transaction has committed: + * @sb: superblock of current filesystem for transaction + * @jce: returned journal callback data + * @rc: journal state at commit (0 = transaction committed properly) + * @jce: journal callback data (internal and function private data struct) + * + * The registered function will be called in the context of the journal thread + * after the transaction for which the handle was created has completed. + * + * No locks are held when the callback function is called, so it is safe to + * call blocking functions from within the callback, but the callback should + * not block or run for too long, or the filesystem will be blocked waiting for + * the next transaction to commit. No journaling functions can be used, or + * there is a risk of deadlock. + * + * There is no guaranteed calling order of multiple registered callbacks on + * the same transaction. + */ +static inline void ext4_journal_callback_add(handle_t *handle, + void (*func)(struct super_block *sb, + struct ext4_journal_cb_entry *jce, + int rc), + struct ext4_journal_cb_entry *jce) +{ + struct ext4_sb_info *sbi = + EXT4_SB(handle->h_transaction->t_journal->j_private); + + /* Add the jce to transaction's private list */ + jce->jce_func = func; + spin_lock(&sbi->s_md_lock); + list_add_tail(&jce->jce_list, &handle->h_transaction->t_private_list); + spin_unlock(&sbi->s_md_lock); +} + +/** + * ext4_journal_callback_del: delete a registered callback + * @handle: active journal transaction handle on which callback was registered + * @jce: registered journal callback entry to unregister + * Return true if object was successfully removed + */ +static inline bool ext4_journal_callback_try_del(handle_t *handle, + struct ext4_journal_cb_entry *jce) +{ + bool deleted; + struct ext4_sb_info *sbi = + EXT4_SB(handle->h_transaction->t_journal->j_private); + + spin_lock(&sbi->s_md_lock); + deleted = !list_empty(&jce->jce_list); + list_del_init(&jce->jce_list); + spin_unlock(&sbi->s_md_lock); + return deleted; +} + int ext4_mark_iloc_dirty(handle_t *handle, struct inode *inode, @@ -122,10 +231,6 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode); /* * Wrapper functions with which ext4 calls into JBD. */ -void ext4_journal_abort_handle(const char *caller, unsigned int line, - const char *err_fn, - struct buffer_head *bh, handle_t *handle, int err); - int __ext4_journal_get_write_access(const char *where, unsigned int line, handle_t *handle, struct buffer_head *bh); @@ -156,7 +261,8 @@ int __ext4_handle_dirty_super(const char *where, unsigned int line, #define ext4_handle_dirty_super(handle, sb) \ __ext4_handle_dirty_super(__func__, __LINE__, (handle), (sb)) -handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks); +handle_t *__ext4_journal_start_sb(struct super_block *sb, unsigned int line, + int type, int blocks, int rsv_blocks); int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle); #define EXT4_NOJOURNAL_MAX_REF_COUNT ((unsigned long) 4096) @@ -176,13 +282,6 @@ static inline void ext4_handle_sync(handle_t *handle) handle->h_sync = 1; } -static inline void ext4_handle_release_buffer(handle_t *handle, - struct buffer_head *bh) -{ - if (ext4_handle_valid(handle)) - jbd2_journal_release_buffer(handle, bh); -} - static inline int ext4_handle_is_aborted(handle_t *handle) { if (ext4_handle_valid(handle)) @@ -197,14 +296,38 @@ static inline int ext4_handle_has_enough_credits(handle_t *handle, int needed) return 1; } -static inline handle_t *ext4_journal_start(struct inode *inode, int nblocks) +#define ext4_journal_start_sb(sb, type, nblocks) \ + __ext4_journal_start_sb((sb), __LINE__, (type), (nblocks), 0) + +#define ext4_journal_start(inode, type, nblocks) \ + __ext4_journal_start((inode), __LINE__, (type), (nblocks), 0) + +#define ext4_journal_start_with_reserve(inode, type, blocks, rsv_blocks) \ + __ext4_journal_start((inode), __LINE__, (type), (blocks), (rsv_blocks)) + +static inline handle_t *__ext4_journal_start(struct inode *inode, + unsigned int line, int type, + int blocks, int rsv_blocks) { - return ext4_journal_start_sb(inode->i_sb, nblocks); + return __ext4_journal_start_sb(inode->i_sb, line, type, blocks, + rsv_blocks); } #define ext4_journal_stop(handle) \ __ext4_journal_stop(__func__, __LINE__, (handle)) +#define ext4_journal_start_reserved(handle, type) \ + __ext4_journal_start_reserved((handle), __LINE__, (type)) + +handle_t *__ext4_journal_start_reserved(handle_t *handle, unsigned int line, + int type); + +static inline void ext4_journal_free_reserved(handle_t *handle) +{ + if (ext4_handle_valid(handle)) + jbd2_journal_free_reserved(handle); +} + static inline handle_t *ext4_journal_current_handle(void) { return journal_current_handle(); @@ -261,43 +384,45 @@ static inline void ext4_update_inode_fsync_trans(handle_t *handle, /* super.c */ int ext4_force_commit(struct super_block *sb); -static inline int ext4_should_journal_data(struct inode *inode) +/* + * Ext4 inode journal modes + */ +#define EXT4_INODE_JOURNAL_DATA_MODE 0x01 /* journal data mode */ +#define EXT4_INODE_ORDERED_DATA_MODE 0x02 /* ordered data mode */ +#define EXT4_INODE_WRITEBACK_DATA_MODE 0x04 /* writeback data mode */ + +static inline int ext4_inode_journal_mode(struct inode *inode) { if (EXT4_JOURNAL(inode) == NULL) - return 0; - if (!S_ISREG(inode->i_mode)) - return 1; - if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) - return 1; - if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA)) - return 1; - return 0; + return EXT4_INODE_WRITEBACK_DATA_MODE; /* writeback */ + /* We do not support data journalling with delayed allocation */ + if (!S_ISREG(inode->i_mode) || + test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) + return EXT4_INODE_JOURNAL_DATA_MODE; /* journal data */ + if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA) && + !test_opt(inode->i_sb, DELALLOC)) + return EXT4_INODE_JOURNAL_DATA_MODE; /* journal data */ + if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA) + return EXT4_INODE_ORDERED_DATA_MODE; /* ordered */ + if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA) + return EXT4_INODE_WRITEBACK_DATA_MODE; /* writeback */ + else + BUG(); +} + +static inline int ext4_should_journal_data(struct inode *inode) +{ + return ext4_inode_journal_mode(inode) & EXT4_INODE_JOURNAL_DATA_MODE; } static inline int ext4_should_order_data(struct inode *inode) { - if (EXT4_JOURNAL(inode) == NULL) - return 0; - if (!S_ISREG(inode->i_mode)) - return 0; - if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA)) - return 0; - if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA) - return 1; - return 0; + return ext4_inode_journal_mode(inode) & EXT4_INODE_ORDERED_DATA_MODE; } static inline int ext4_should_writeback_data(struct inode *inode) { - if (EXT4_JOURNAL(inode) == NULL) - return 1; - if (!S_ISREG(inode->i_mode)) - return 0; - if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA)) - return 0; - if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA) - return 1; - return 0; + return ext4_inode_journal_mode(inode) & EXT4_INODE_WRITEBACK_DATA_MODE; } /* diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 74f23c292e1..4da228a0e6d 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -37,13 +37,65 @@ #include <linux/quotaops.h> #include <linux/string.h> #include <linux/slab.h> -#include <linux/falloc.h> #include <asm/uaccess.h> #include <linux/fiemap.h> #include "ext4_jbd2.h" +#include "ext4_extents.h" +#include "xattr.h" #include <trace/events/ext4.h> +/* + * used by extent splitting. + */ +#define EXT4_EXT_MAY_ZEROOUT 0x1 /* safe to zeroout if split fails \ + due to ENOSPC */ +#define EXT4_EXT_MARK_UNWRIT1 0x2 /* mark first half unwritten */ +#define EXT4_EXT_MARK_UNWRIT2 0x4 /* mark second half unwritten */ + +#define EXT4_EXT_DATA_VALID1 0x8 /* first half contains valid data */ +#define EXT4_EXT_DATA_VALID2 0x10 /* second half contains valid data */ + +static __le32 ext4_extent_block_csum(struct inode *inode, + struct ext4_extent_header *eh) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + __u32 csum; + + csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)eh, + EXT4_EXTENT_TAIL_OFFSET(eh)); + return cpu_to_le32(csum); +} + +static int ext4_extent_block_csum_verify(struct inode *inode, + struct ext4_extent_header *eh) +{ + struct ext4_extent_tail *et; + + if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + return 1; + + et = find_ext4_extent_tail(eh); + if (et->et_checksum != ext4_extent_block_csum(inode, eh)) + return 0; + return 1; +} + +static void ext4_extent_block_csum_set(struct inode *inode, + struct ext4_extent_header *eh) +{ + struct ext4_extent_tail *et; + + if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + return; + + et = find_ext4_extent_tail(eh); + et->et_checksum = ext4_extent_block_csum(inode, eh); +} + static int ext4_split_extent(handle_t *handle, struct inode *inode, struct ext4_ext_path *path, @@ -51,6 +103,16 @@ static int ext4_split_extent(handle_t *handle, int split_flag, int flags); +static int ext4_split_extent_at(handle_t *handle, + struct inode *inode, + struct ext4_ext_path *path, + ext4_lblk_t split, + int split_flag, + int flags); + +static int ext4_find_delayed_extent(struct inode *inode, + struct extent_status *newes); + static int ext4_ext_truncate_extend_restart(handle_t *handle, struct inode *inode, int needed) @@ -81,6 +143,7 @@ static int ext4_ext_get_access(handle_t *handle, struct inode *inode, { if (path->p_bh) { /* path points to block */ + BUFFER_TRACE(path->p_bh, "get_write_access"); return ext4_journal_get_write_access(handle, path->p_bh); } /* path points to leaf/index in inode body */ @@ -94,14 +157,12 @@ static int ext4_ext_get_access(handle_t *handle, struct inode *inode, * - ENOMEM * - EIO */ -#define ext4_ext_dirty(handle, inode, path) \ - __ext4_ext_dirty(__func__, __LINE__, (handle), (inode), (path)) -static int __ext4_ext_dirty(const char *where, unsigned int line, - handle_t *handle, struct inode *inode, - struct ext4_ext_path *path) +int __ext4_ext_dirty(const char *where, unsigned int line, handle_t *handle, + struct inode *inode, struct ext4_ext_path *path) { int err; if (path->p_bh) { + ext4_extent_block_csum_set(inode, ext_block_hdr(path->p_bh)); /* path points to block */ err = __ext4_handle_dirty_metadata(where, line, handle, inode, path->p_bh); @@ -299,7 +360,11 @@ static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext) { ext4_fsblk_t block = ext4_ext_pblock(ext); int len = ext4_ext_get_actual_len(ext); + ext4_lblk_t lblock = le32_to_cpu(ext->ee_block); + ext4_lblk_t last = lblock + len - 1; + if (lblock > last) + return 0; return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, len); } @@ -324,11 +389,26 @@ static int ext4_valid_extent_entries(struct inode *inode, if (depth == 0) { /* leaf entries */ struct ext4_extent *ext = EXT_FIRST_EXTENT(eh); + struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es; + ext4_fsblk_t pblock = 0; + ext4_lblk_t lblock = 0; + ext4_lblk_t prev = 0; + int len = 0; while (entries) { if (!ext4_valid_extent(inode, ext)) return 0; + + /* Check for overlapping extents */ + lblock = le32_to_cpu(ext->ee_block); + len = ext4_ext_get_actual_len(ext); + if ((lblock <= prev) && prev) { + pblock = ext4_ext_pblock(ext); + es->s_last_error_block = cpu_to_le64(pblock); + return 0; + } ext++; entries--; + prev = lblock + len - 1; } } else { struct ext4_extent_idx *ext_idx = EXT_FIRST_INDEX(eh); @@ -344,7 +424,7 @@ static int ext4_valid_extent_entries(struct inode *inode, static int __ext4_ext_check(const char *function, unsigned int line, struct inode *inode, struct ext4_extent_header *eh, - int depth) + int depth, ext4_fsblk_t pblk) { const char *error_msg; int max = 0; @@ -374,25 +454,158 @@ static int __ext4_ext_check(const char *function, unsigned int line, error_msg = "invalid extent entries"; goto corrupted; } + /* Verify checksum on non-root extent tree nodes */ + if (ext_depth(inode) != depth && + !ext4_extent_block_csum_verify(inode, eh)) { + error_msg = "extent tree corrupted"; + goto corrupted; + } return 0; corrupted: ext4_error_inode(inode, function, line, 0, - "bad header/extent: %s - magic %x, " - "entries %u, max %u(%u), depth %u(%u)", - error_msg, le16_to_cpu(eh->eh_magic), - le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max), - max, le16_to_cpu(eh->eh_depth), depth); - + "pblk %llu bad header/extent: %s - magic %x, " + "entries %u, max %u(%u), depth %u(%u)", + (unsigned long long) pblk, error_msg, + le16_to_cpu(eh->eh_magic), + le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max), + max, le16_to_cpu(eh->eh_depth), depth); return -EIO; } -#define ext4_ext_check(inode, eh, depth) \ - __ext4_ext_check(__func__, __LINE__, inode, eh, depth) +#define ext4_ext_check(inode, eh, depth, pblk) \ + __ext4_ext_check(__func__, __LINE__, (inode), (eh), (depth), (pblk)) int ext4_ext_check_inode(struct inode *inode) { - return ext4_ext_check(inode, ext_inode_hdr(inode), ext_depth(inode)); + return ext4_ext_check(inode, ext_inode_hdr(inode), ext_depth(inode), 0); +} + +static struct buffer_head * +__read_extent_tree_block(const char *function, unsigned int line, + struct inode *inode, ext4_fsblk_t pblk, int depth, + int flags) +{ + struct buffer_head *bh; + int err; + + bh = sb_getblk(inode->i_sb, pblk); + if (unlikely(!bh)) + return ERR_PTR(-ENOMEM); + + if (!bh_uptodate_or_lock(bh)) { + trace_ext4_ext_load_extent(inode, pblk, _RET_IP_); + err = bh_submit_read(bh); + if (err < 0) + goto errout; + } + if (buffer_verified(bh) && !(flags & EXT4_EX_FORCE_CACHE)) + return bh; + err = __ext4_ext_check(function, line, inode, + ext_block_hdr(bh), depth, pblk); + if (err) + goto errout; + set_buffer_verified(bh); + /* + * If this is a leaf block, cache all of its entries + */ + if (!(flags & EXT4_EX_NOCACHE) && depth == 0) { + struct ext4_extent_header *eh = ext_block_hdr(bh); + struct ext4_extent *ex = EXT_FIRST_EXTENT(eh); + ext4_lblk_t prev = 0; + int i; + + for (i = le16_to_cpu(eh->eh_entries); i > 0; i--, ex++) { + unsigned int status = EXTENT_STATUS_WRITTEN; + ext4_lblk_t lblk = le32_to_cpu(ex->ee_block); + int len = ext4_ext_get_actual_len(ex); + + if (prev && (prev != lblk)) + ext4_es_cache_extent(inode, prev, + lblk - prev, ~0, + EXTENT_STATUS_HOLE); + + if (ext4_ext_is_unwritten(ex)) + status = EXTENT_STATUS_UNWRITTEN; + ext4_es_cache_extent(inode, lblk, len, + ext4_ext_pblock(ex), status); + prev = lblk + len; + } + } + return bh; +errout: + put_bh(bh); + return ERR_PTR(err); + +} + +#define read_extent_tree_block(inode, pblk, depth, flags) \ + __read_extent_tree_block(__func__, __LINE__, (inode), (pblk), \ + (depth), (flags)) + +/* + * This function is called to cache a file's extent information in the + * extent status tree + */ +int ext4_ext_precache(struct inode *inode) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + struct ext4_ext_path *path = NULL; + struct buffer_head *bh; + int i = 0, depth, ret = 0; + + if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) + return 0; /* not an extent-mapped inode */ + + down_read(&ei->i_data_sem); + depth = ext_depth(inode); + + path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 1), + GFP_NOFS); + if (path == NULL) { + up_read(&ei->i_data_sem); + return -ENOMEM; + } + + /* Don't cache anything if there are no external extent blocks */ + if (depth == 0) + goto out; + path[0].p_hdr = ext_inode_hdr(inode); + ret = ext4_ext_check(inode, path[0].p_hdr, depth, 0); + if (ret) + goto out; + path[0].p_idx = EXT_FIRST_INDEX(path[0].p_hdr); + while (i >= 0) { + /* + * If this is a leaf block or we've reached the end of + * the index block, go up + */ + if ((i == depth) || + path[i].p_idx > EXT_LAST_INDEX(path[i].p_hdr)) { + brelse(path[i].p_bh); + path[i].p_bh = NULL; + i--; + continue; + } + bh = read_extent_tree_block(inode, + ext4_idx_pblock(path[i].p_idx++), + depth - i - 1, + EXT4_EX_FORCE_CACHE); + if (IS_ERR(bh)) { + ret = PTR_ERR(bh); + break; + } + i++; + path[i].p_bh = bh; + path[i].p_hdr = ext_block_hdr(bh); + path[i].p_idx = EXT_FIRST_INDEX(path[i].p_hdr); + } + ext4_set_inode_state(inode, EXT4_STATE_EXT_PRECACHED); +out: + up_read(&ei->i_data_sem); + ext4_ext_drop_refs(path); + kfree(path); + return ret; } #ifdef EXT_DEBUG @@ -408,7 +621,7 @@ static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path) } else if (path->p_ext) { ext_debug(" %d:[%d]%d:%llu ", le32_to_cpu(path->p_ext->ee_block), - ext4_ext_is_uninitialized(path->p_ext), + ext4_ext_is_unwritten(path->p_ext), ext4_ext_get_actual_len(path->p_ext), ext4_ext_pblock(path->p_ext)); } else @@ -434,7 +647,7 @@ static void ext4_ext_show_leaf(struct inode *inode, struct ext4_ext_path *path) for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ex++) { ext_debug("%d:[%d]%d:%llu ", le32_to_cpu(ex->ee_block), - ext4_ext_is_uninitialized(ex), + ext4_ext_is_unwritten(ex), ext4_ext_get_actual_len(ex), ext4_ext_pblock(ex)); } ext_debug("\n"); @@ -465,7 +678,7 @@ static void ext4_ext_show_move(struct inode *inode, struct ext4_ext_path *path, ext_debug("move %d:%llu:[%d]%d in new leaf %llu\n", le32_to_cpu(ex->ee_block), ext4_ext_pblock(ex), - ext4_ext_is_uninitialized(ex), + ext4_ext_is_unwritten(ex), ext4_ext_get_actual_len(ex), newblock); ex++; @@ -519,7 +732,7 @@ ext4_ext_binsearch_idx(struct inode *inode, } path->p_idx = l - 1; - ext_debug(" -> %d->%lld ", le32_to_cpu(path->p_idx->ei_block), + ext_debug(" -> %u->%lld ", le32_to_cpu(path->p_idx->ei_block), ext4_idx_pblock(path->p_idx)); #ifdef CHECK_BINSEARCH @@ -590,7 +803,7 @@ ext4_ext_binsearch(struct inode *inode, ext_debug(" -> %d:%llu:[%d]%d ", le32_to_cpu(path->p_ext->ee_block), ext4_ext_pblock(path->p_ext), - ext4_ext_is_uninitialized(path->p_ext), + ext4_ext_is_unwritten(path->p_ext), ext4_ext_get_actual_len(path->p_ext)); #ifdef CHECK_BINSEARCH @@ -622,17 +835,17 @@ int ext4_ext_tree_init(handle_t *handle, struct inode *inode) eh->eh_magic = EXT4_EXT_MAGIC; eh->eh_max = cpu_to_le16(ext4_ext_space_root(inode, 0)); ext4_mark_inode_dirty(handle, inode); - ext4_ext_invalidate_cache(inode); return 0; } struct ext4_ext_path * ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block, - struct ext4_ext_path *path) + struct ext4_ext_path *path, int flags) { struct ext4_extent_header *eh; struct buffer_head *bh; short int depth, i, ppos = 0, alloc = 0; + int ret; eh = ext_inode_hdr(inode); depth = ext_depth(inode); @@ -651,8 +864,6 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block, i = depth; /* walk through the tree */ while (i) { - int need_to_validate = 0; - ext_debug("depth %d: num %d, max %d\n", ppos, le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max)); @@ -661,33 +872,24 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block, path[ppos].p_depth = i; path[ppos].p_ext = NULL; - bh = sb_getblk(inode->i_sb, path[ppos].p_block); - if (unlikely(!bh)) + bh = read_extent_tree_block(inode, path[ppos].p_block, --i, + flags); + if (IS_ERR(bh)) { + ret = PTR_ERR(bh); goto err; - if (!bh_uptodate_or_lock(bh)) { - trace_ext4_ext_load_extent(inode, block, - path[ppos].p_block); - if (bh_submit_read(bh) < 0) { - put_bh(bh); - goto err; - } - /* validate the extent entries */ - need_to_validate = 1; } + eh = ext_block_hdr(bh); ppos++; if (unlikely(ppos > depth)) { put_bh(bh); EXT4_ERROR_INODE(inode, "ppos %d > depth %d", ppos, depth); + ret = -EIO; goto err; } path[ppos].p_bh = bh; path[ppos].p_hdr = eh; - i--; - - if (need_to_validate && ext4_ext_check(inode, eh, i)) - goto err; } path[ppos].p_depth = i; @@ -708,7 +910,7 @@ err: ext4_ext_drop_refs(path); if (alloc) kfree(path); - return ERR_PTR(-EIO); + return ERR_PTR(ret); } /* @@ -862,8 +1064,8 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode, goto cleanup; } bh = sb_getblk(inode->i_sb, newblock); - if (!bh) { - err = -EIO; + if (unlikely(!bh)) { + err = -ENOMEM; goto cleanup; } lock_buffer(bh); @@ -897,6 +1099,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode, le16_add_cpu(&neh->eh_entries, m); } + ext4_extent_block_csum_set(inode, neh); set_buffer_uptodate(bh); unlock_buffer(bh); @@ -934,8 +1137,8 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode, oldblock = newblock; newblock = ablocks[--a]; bh = sb_getblk(inode->i_sb, newblock); - if (!bh) { - err = -EIO; + if (unlikely(!bh)) { + err = -ENOMEM; goto cleanup; } lock_buffer(bh); @@ -975,6 +1178,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode, sizeof(struct ext4_extent_idx) * m); le16_add_cpu(&neh->eh_entries, m); } + ext4_extent_block_csum_set(inode, neh); set_buffer_uptodate(bh); unlock_buffer(bh); @@ -1046,11 +1250,8 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode, return err; bh = sb_getblk(inode->i_sb, newblock); - if (!bh) { - err = -EIO; - ext4_std_error(inode->i_sb, err); - return err; - } + if (unlikely(!bh)) + return -ENOMEM; lock_buffer(bh); err = ext4_journal_get_create_access(handle, bh); @@ -1072,6 +1273,7 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode, else neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0)); neh->eh_magic = EXT4_EXT_MAGIC; + ext4_extent_block_csum_set(inode, neh); set_buffer_uptodate(bh); unlock_buffer(bh); @@ -1094,7 +1296,7 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode, le32_to_cpu(EXT_FIRST_INDEX(neh)->ei_block), ext4_idx_pblock(EXT_FIRST_INDEX(neh))); - neh->eh_depth = cpu_to_le16(le16_to_cpu(neh->eh_depth) + 1); + le16_add_cpu(&neh->eh_depth, 1); ext4_mark_inode_dirty(handle, inode); out: brelse(bh); @@ -1108,7 +1310,8 @@ out: * if no free index is found, then it requests in-depth growing. */ static int ext4_ext_create_new_leaf(handle_t *handle, struct inode *inode, - unsigned int flags, + unsigned int mb_flags, + unsigned int gb_flags, struct ext4_ext_path *path, struct ext4_extent *newext) { @@ -1130,7 +1333,7 @@ repeat: if (EXT_HAS_FREE_INDEX(curp)) { /* if we found index with free entry, then use that * entry: create all needed subtree and add new leaf */ - err = ext4_ext_split(handle, inode, flags, path, newext, i); + err = ext4_ext_split(handle, inode, mb_flags, path, newext, i); if (err) goto out; @@ -1138,12 +1341,12 @@ repeat: ext4_ext_drop_refs(path); path = ext4_ext_find_extent(inode, (ext4_lblk_t)le32_to_cpu(newext->ee_block), - path); + path, gb_flags); if (IS_ERR(path)) err = PTR_ERR(path); } else { /* tree is full, time to grow in depth */ - err = ext4_ext_grow_indepth(handle, inode, flags, newext); + err = ext4_ext_grow_indepth(handle, inode, mb_flags, newext); if (err) goto out; @@ -1151,7 +1354,7 @@ repeat: ext4_ext_drop_refs(path); path = ext4_ext_find_extent(inode, (ext4_lblk_t)le32_to_cpu(newext->ee_block), - path); + path, gb_flags); if (IS_ERR(path)) { err = PTR_ERR(path); goto out; @@ -1322,28 +1525,21 @@ got_index: ix++; block = ext4_idx_pblock(ix); while (++depth < path->p_depth) { - bh = sb_bread(inode->i_sb, block); - if (bh == NULL) - return -EIO; - eh = ext_block_hdr(bh); /* subtract from p_depth to get proper eh_depth */ - if (ext4_ext_check(inode, eh, path->p_depth - depth)) { - put_bh(bh); - return -EIO; - } + bh = read_extent_tree_block(inode, block, + path->p_depth - depth, 0); + if (IS_ERR(bh)) + return PTR_ERR(bh); + eh = ext_block_hdr(bh); ix = EXT_FIRST_INDEX(eh); block = ext4_idx_pblock(ix); put_bh(bh); } - bh = sb_bread(inode->i_sb, block); - if (bh == NULL) - return -EIO; + bh = read_extent_tree_block(inode, block, path->p_depth - depth, 0); + if (IS_ERR(bh)) + return PTR_ERR(bh); eh = ext_block_hdr(bh); - if (ext4_ext_check(inode, eh, path->p_depth - depth)) { - put_bh(bh); - return -EIO; - } ex = EXT_FIRST_EXTENT(eh); found_extent: *logical = le32_to_cpu(ex->ee_block); @@ -1487,20 +1683,17 @@ int ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1, struct ext4_extent *ex2) { - unsigned short ext1_ee_len, ext2_ee_len, max_len; + unsigned short ext1_ee_len, ext2_ee_len; /* - * Make sure that either both extents are uninitialized, or - * both are _not_. + * Make sure that both extents are initialized. We don't merge + * unwritten extents so that we can be sure that end_io code has + * the extent that was written properly split out and conversion to + * initialized is trivial. */ - if (ext4_ext_is_uninitialized(ex1) ^ ext4_ext_is_uninitialized(ex2)) + if (ext4_ext_is_unwritten(ex1) != ext4_ext_is_unwritten(ex2)) return 0; - if (ext4_ext_is_uninitialized(ex1)) - max_len = EXT_UNINIT_MAX_LEN; - else - max_len = EXT_INIT_MAX_LEN; - ext1_ee_len = ext4_ext_get_actual_len(ex1); ext2_ee_len = ext4_ext_get_actual_len(ex2); @@ -1513,7 +1706,12 @@ ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1, * as an RO_COMPAT feature, refuse to merge to extents if * this can result in the top bit of ee_len being set. */ - if (ext1_ee_len + ext2_ee_len > max_len) + if (ext1_ee_len + ext2_ee_len > EXT_INIT_MAX_LEN) + return 0; + if (ext4_ext_is_unwritten(ex1) && + (ext4_test_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN) || + atomic_read(&EXT4_I(inode)->i_unwritten) || + (ext1_ee_len + ext2_ee_len > EXT_UNWRITTEN_MAX_LEN))) return 0; #ifdef AGGRESSIVE_TEST if (ext1_ee_len >= 4) @@ -1538,8 +1736,7 @@ static int ext4_ext_try_to_merge_right(struct inode *inode, { struct ext4_extent_header *eh; unsigned int depth, len; - int merge_done = 0; - int uninitialized = 0; + int merge_done = 0, unwritten; depth = ext_depth(inode); BUG_ON(path[depth].p_hdr == NULL); @@ -1549,12 +1746,11 @@ static int ext4_ext_try_to_merge_right(struct inode *inode, if (!ext4_can_extents_be_merged(inode, ex, ex + 1)) break; /* merge with next extent! */ - if (ext4_ext_is_uninitialized(ex)) - uninitialized = 1; + unwritten = ext4_ext_is_unwritten(ex); ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex) + ext4_ext_get_actual_len(ex + 1)); - if (uninitialized) - ext4_ext_mark_uninitialized(ex); + if (unwritten) + ext4_ext_mark_unwritten(ex); if (ex + 1 < EXT_LAST_EXTENT(eh)) { len = (EXT_LAST_EXTENT(eh) - ex - 1) @@ -1572,16 +1768,61 @@ static int ext4_ext_try_to_merge_right(struct inode *inode, } /* + * This function does a very simple check to see if we can collapse + * an extent tree with a single extent tree leaf block into the inode. + */ +static void ext4_ext_try_to_merge_up(handle_t *handle, + struct inode *inode, + struct ext4_ext_path *path) +{ + size_t s; + unsigned max_root = ext4_ext_space_root(inode, 0); + ext4_fsblk_t blk; + + if ((path[0].p_depth != 1) || + (le16_to_cpu(path[0].p_hdr->eh_entries) != 1) || + (le16_to_cpu(path[1].p_hdr->eh_entries) > max_root)) + return; + + /* + * We need to modify the block allocation bitmap and the block + * group descriptor to release the extent tree block. If we + * can't get the journal credits, give up. + */ + if (ext4_journal_extend(handle, 2)) + return; + + /* + * Copy the extent data up to the inode + */ + blk = ext4_idx_pblock(path[0].p_idx); + s = le16_to_cpu(path[1].p_hdr->eh_entries) * + sizeof(struct ext4_extent_idx); + s += sizeof(struct ext4_extent_header); + + memcpy(path[0].p_hdr, path[1].p_hdr, s); + path[0].p_depth = 0; + path[0].p_ext = EXT_FIRST_EXTENT(path[0].p_hdr) + + (path[1].p_ext - EXT_FIRST_EXTENT(path[1].p_hdr)); + path[0].p_hdr->eh_max = cpu_to_le16(max_root); + + brelse(path[1].p_bh); + ext4_free_blocks(handle, inode, NULL, blk, 1, + EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET | + EXT4_FREE_BLOCKS_RESERVE); +} + +/* * This function tries to merge the @ex extent to neighbours in the tree. * return 1 if merge left else 0. */ -static int ext4_ext_try_to_merge(struct inode *inode, +static void ext4_ext_try_to_merge(handle_t *handle, + struct inode *inode, struct ext4_ext_path *path, struct ext4_extent *ex) { struct ext4_extent_header *eh; unsigned int depth; int merge_done = 0; - int ret = 0; depth = ext_depth(inode); BUG_ON(path[depth].p_hdr == NULL); @@ -1591,9 +1832,9 @@ static int ext4_ext_try_to_merge(struct inode *inode, merge_done = ext4_ext_try_to_merge_right(inode, path, ex - 1); if (!merge_done) - ret = ext4_ext_try_to_merge_right(inode, path, ex); + (void) ext4_ext_try_to_merge_right(inode, path, ex); - return ret; + ext4_ext_try_to_merge_up(handle, inode, path); } /* @@ -1618,8 +1859,7 @@ static unsigned int ext4_ext_check_overlap(struct ext4_sb_info *sbi, depth = ext_depth(inode); if (!path[depth].p_ext) goto out; - b2 = le32_to_cpu(path[depth].p_ext->ee_block); - b2 &= ~(sbi->s_cluster_ratio - 1); + b2 = EXT4_LBLK_CMASK(sbi, le32_to_cpu(path[depth].p_ext->ee_block)); /* * get the next allocated block if the extent in the path @@ -1629,7 +1869,7 @@ static unsigned int ext4_ext_check_overlap(struct ext4_sb_info *sbi, b2 = ext4_ext_next_allocated_block(path); if (b2 == EXT_MAX_BLOCKS) goto out; - b2 &= ~(sbi->s_cluster_ratio - 1); + b2 = EXT4_LBLK_CMASK(sbi, b2); } /* check for wrap through zero on extent logical start block*/ @@ -1656,7 +1896,7 @@ out: */ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode, struct ext4_ext_path *path, - struct ext4_extent *newext, int flag) + struct ext4_extent *newext, int gb_flags) { struct ext4_extent_header *eh; struct ext4_extent *ex, *fex; @@ -1664,8 +1904,7 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode, struct ext4_ext_path *npath = NULL; int depth, len, err; ext4_lblk_t next; - unsigned uninitialized = 0; - int flags = 0; + int mb_flags = 0, unwritten; if (unlikely(ext4_ext_get_actual_len(newext) == 0)) { EXT4_ERROR_INODE(inode, "ext4_ext_get_actual_len(newext) == 0"); @@ -1673,39 +1912,86 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode, } depth = ext_depth(inode); ex = path[depth].p_ext; + eh = path[depth].p_hdr; if (unlikely(path[depth].p_hdr == NULL)) { EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth); return -EIO; } /* try to insert block into found extent and return */ - if (ex && !(flag & EXT4_GET_BLOCKS_PRE_IO) - && ext4_can_extents_be_merged(inode, ex, newext)) { - ext_debug("append [%d]%d block to %u:[%d]%d (from %llu)\n", - ext4_ext_is_uninitialized(newext), - ext4_ext_get_actual_len(newext), - le32_to_cpu(ex->ee_block), - ext4_ext_is_uninitialized(ex), - ext4_ext_get_actual_len(ex), - ext4_ext_pblock(ex)); - err = ext4_ext_get_access(handle, inode, path + depth); - if (err) - return err; + if (ex && !(gb_flags & EXT4_GET_BLOCKS_PRE_IO)) { /* - * ext4_can_extents_be_merged should have checked that either - * both extents are uninitialized, or both aren't. Thus we - * need to check only one of them here. + * Try to see whether we should rather test the extent on + * right from ex, or from the left of ex. This is because + * ext4_ext_find_extent() can return either extent on the + * left, or on the right from the searched position. This + * will make merging more effective. */ - if (ext4_ext_is_uninitialized(ex)) - uninitialized = 1; - ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex) + if (ex < EXT_LAST_EXTENT(eh) && + (le32_to_cpu(ex->ee_block) + + ext4_ext_get_actual_len(ex) < + le32_to_cpu(newext->ee_block))) { + ex += 1; + goto prepend; + } else if ((ex > EXT_FIRST_EXTENT(eh)) && + (le32_to_cpu(newext->ee_block) + + ext4_ext_get_actual_len(newext) < + le32_to_cpu(ex->ee_block))) + ex -= 1; + + /* Try to append newex to the ex */ + if (ext4_can_extents_be_merged(inode, ex, newext)) { + ext_debug("append [%d]%d block to %u:[%d]%d" + "(from %llu)\n", + ext4_ext_is_unwritten(newext), + ext4_ext_get_actual_len(newext), + le32_to_cpu(ex->ee_block), + ext4_ext_is_unwritten(ex), + ext4_ext_get_actual_len(ex), + ext4_ext_pblock(ex)); + err = ext4_ext_get_access(handle, inode, + path + depth); + if (err) + return err; + unwritten = ext4_ext_is_unwritten(ex); + ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex) + ext4_ext_get_actual_len(newext)); - if (uninitialized) - ext4_ext_mark_uninitialized(ex); - eh = path[depth].p_hdr; - nearex = ex; - goto merge; + if (unwritten) + ext4_ext_mark_unwritten(ex); + eh = path[depth].p_hdr; + nearex = ex; + goto merge; + } + +prepend: + /* Try to prepend newex to the ex */ + if (ext4_can_extents_be_merged(inode, newext, ex)) { + ext_debug("prepend %u[%d]%d block to %u:[%d]%d" + "(from %llu)\n", + le32_to_cpu(newext->ee_block), + ext4_ext_is_unwritten(newext), + ext4_ext_get_actual_len(newext), + le32_to_cpu(ex->ee_block), + ext4_ext_is_unwritten(ex), + ext4_ext_get_actual_len(ex), + ext4_ext_pblock(ex)); + err = ext4_ext_get_access(handle, inode, + path + depth); + if (err) + return err; + + unwritten = ext4_ext_is_unwritten(ex); + ex->ee_block = newext->ee_block; + ext4_ext_store_pblock(ex, ext4_ext_pblock(newext)); + ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex) + + ext4_ext_get_actual_len(newext)); + if (unwritten) + ext4_ext_mark_unwritten(ex); + eh = path[depth].p_hdr; + nearex = ex; + goto merge; + } } depth = ext_depth(inode); @@ -1721,7 +2007,7 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode, if (next != EXT_MAX_BLOCKS) { ext_debug("next leaf block - %u\n", next); BUG_ON(npath != NULL); - npath = ext4_ext_find_extent(inode, next, NULL); + npath = ext4_ext_find_extent(inode, next, NULL, 0); if (IS_ERR(npath)) return PTR_ERR(npath); BUG_ON(npath->p_depth != path->p_depth); @@ -1740,9 +2026,10 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode, * There is no free space in the found leaf. * We're gonna add a new leaf in the tree. */ - if (flag & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) - flags = EXT4_MB_USE_ROOT_BLOCKS; - err = ext4_ext_create_new_leaf(handle, inode, flags, path, newext); + if (gb_flags & EXT4_GET_BLOCKS_METADATA_NOFAIL) + mb_flags = EXT4_MB_USE_RESERVED; + err = ext4_ext_create_new_leaf(handle, inode, mb_flags, gb_flags, + path, newext); if (err) goto cleanup; depth = ext_depth(inode); @@ -1760,7 +2047,7 @@ has_space: ext_debug("first extent in the leaf: %u:%llu:[%d]%d\n", le32_to_cpu(newext->ee_block), ext4_ext_pblock(newext), - ext4_ext_is_uninitialized(newext), + ext4_ext_is_unwritten(newext), ext4_ext_get_actual_len(newext)); nearex = EXT_FIRST_EXTENT(eh); } else { @@ -1771,7 +2058,7 @@ has_space: "nearest %p\n", le32_to_cpu(newext->ee_block), ext4_ext_pblock(newext), - ext4_ext_is_uninitialized(newext), + ext4_ext_is_unwritten(newext), ext4_ext_get_actual_len(newext), nearex); nearex++; @@ -1782,7 +2069,7 @@ has_space: "nearest %p\n", le32_to_cpu(newext->ee_block), ext4_ext_pblock(newext), - ext4_ext_is_uninitialized(newext), + ext4_ext_is_unwritten(newext), ext4_ext_get_actual_len(newext), nearex); } @@ -1792,7 +2079,7 @@ has_space: "move %d extents from 0x%p to 0x%p\n", le32_to_cpu(newext->ee_block), ext4_ext_pblock(newext), - ext4_ext_is_uninitialized(newext), + ext4_ext_is_unwritten(newext), ext4_ext_get_actual_len(newext), len, nearex, nearex + 1); memmove(nearex + 1, nearex, @@ -1807,49 +2094,53 @@ has_space: nearex->ee_len = newext->ee_len; merge: - /* try to merge extents to the right */ - if (!(flag & EXT4_GET_BLOCKS_PRE_IO)) - ext4_ext_try_to_merge(inode, path, nearex); + /* try to merge extents */ + if (!(gb_flags & EXT4_GET_BLOCKS_PRE_IO)) + ext4_ext_try_to_merge(handle, inode, path, nearex); - /* try to merge extents to the left */ /* time to correct all indexes above */ err = ext4_ext_correct_indexes(handle, inode, path); if (err) goto cleanup; - err = ext4_ext_dirty(handle, inode, path + depth); + err = ext4_ext_dirty(handle, inode, path + path->p_depth); cleanup: if (npath) { ext4_ext_drop_refs(npath); kfree(npath); } - ext4_ext_invalidate_cache(inode); return err; } -static int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block, - ext4_lblk_t num, ext_prepare_callback func, - void *cbdata) +static int ext4_fill_fiemap_extents(struct inode *inode, + ext4_lblk_t block, ext4_lblk_t num, + struct fiemap_extent_info *fieinfo) { struct ext4_ext_path *path = NULL; - struct ext4_ext_cache cbex; struct ext4_extent *ex; - ext4_lblk_t next, start = 0, end = 0; + struct extent_status es; + ext4_lblk_t next, next_del, start = 0, end = 0; ext4_lblk_t last = block + num; - int depth, exists, err = 0; - - BUG_ON(func == NULL); - BUG_ON(inode == NULL); + int exists, depth = 0, err = 0; + unsigned int flags = 0; + unsigned char blksize_bits = inode->i_sb->s_blocksize_bits; while (block < last && block != EXT_MAX_BLOCKS) { num = last - block; /* find extent for this block */ down_read(&EXT4_I(inode)->i_data_sem); - path = ext4_ext_find_extent(inode, block, path); - up_read(&EXT4_I(inode)->i_data_sem); + + if (path && ext_depth(inode) != depth) { + /* depth was changed. we have to realloc path */ + kfree(path); + path = NULL; + } + + path = ext4_ext_find_extent(inode, block, path, 0); if (IS_ERR(path)) { + up_read(&EXT4_I(inode)->i_data_sem); err = PTR_ERR(path); path = NULL; break; @@ -1857,13 +2148,16 @@ static int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block, depth = ext_depth(inode); if (unlikely(path[depth].p_hdr == NULL)) { + up_read(&EXT4_I(inode)->i_data_sem); EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth); err = -EIO; break; } ex = path[depth].p_ext; next = ext4_ext_next_allocated_block(path); + ext4_ext_drop_refs(path); + flags = 0; exists = 0; if (!ex) { /* there is no extent yet, so try to allocate @@ -1900,40 +2194,75 @@ static int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block, BUG_ON(end <= start); if (!exists) { - cbex.ec_block = start; - cbex.ec_len = end - start; - cbex.ec_start = 0; + es.es_lblk = start; + es.es_len = end - start; + es.es_pblk = 0; } else { - cbex.ec_block = le32_to_cpu(ex->ee_block); - cbex.ec_len = ext4_ext_get_actual_len(ex); - cbex.ec_start = ext4_ext_pblock(ex); + es.es_lblk = le32_to_cpu(ex->ee_block); + es.es_len = ext4_ext_get_actual_len(ex); + es.es_pblk = ext4_ext_pblock(ex); + if (ext4_ext_is_unwritten(ex)) + flags |= FIEMAP_EXTENT_UNWRITTEN; } - if (unlikely(cbex.ec_len == 0)) { - EXT4_ERROR_INODE(inode, "cbex.ec_len == 0"); - err = -EIO; - break; + /* + * Find delayed extent and update es accordingly. We call + * it even in !exists case to find out whether es is the + * last existing extent or not. + */ + next_del = ext4_find_delayed_extent(inode, &es); + if (!exists && next_del) { + exists = 1; + flags |= (FIEMAP_EXTENT_DELALLOC | + FIEMAP_EXTENT_UNKNOWN); } - err = func(inode, next, &cbex, ex, cbdata); - ext4_ext_drop_refs(path); + up_read(&EXT4_I(inode)->i_data_sem); - if (err < 0) + if (unlikely(es.es_len == 0)) { + EXT4_ERROR_INODE(inode, "es.es_len == 0"); + err = -EIO; break; + } - if (err == EXT_REPEAT) - continue; - else if (err == EXT_BREAK) { - err = 0; - break; + /* + * This is possible iff next == next_del == EXT_MAX_BLOCKS. + * we need to check next == EXT_MAX_BLOCKS because it is + * possible that an extent is with unwritten and delayed + * status due to when an extent is delayed allocated and + * is allocated by fallocate status tree will track both of + * them in a extent. + * + * So we could return a unwritten and delayed extent, and + * its block is equal to 'next'. + */ + if (next == next_del && next == EXT_MAX_BLOCKS) { + flags |= FIEMAP_EXTENT_LAST; + if (unlikely(next_del != EXT_MAX_BLOCKS || + next != EXT_MAX_BLOCKS)) { + EXT4_ERROR_INODE(inode, + "next extent == %u, next " + "delalloc extent = %u", + next, next_del); + err = -EIO; + break; + } } - if (ext_depth(inode) != depth) { - /* depth was changed. we have to realloc path */ - kfree(path); - path = NULL; + if (exists) { + err = fiemap_fill_next_extent(fieinfo, + (__u64)es.es_lblk << blksize_bits, + (__u64)es.es_pblk << blksize_bits, + (__u64)es.es_len << blksize_bits, + flags); + if (err < 0) + break; + if (err == 1) { + err = 0; + break; + } } - block = cbex.ec_block + cbex.ec_len; + block = es.es_lblk + es.es_len; } if (path) { @@ -1944,21 +2273,6 @@ static int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block, return err; } -static void -ext4_ext_put_in_cache(struct inode *inode, ext4_lblk_t block, - __u32 len, ext4_fsblk_t start) -{ - struct ext4_ext_cache *cex; - BUG_ON(len == 0); - spin_lock(&EXT4_I(inode)->i_block_reservation_lock); - trace_ext4_ext_put_in_cache(inode, block, len, start); - cex = &EXT4_I(inode)->i_cached_extent; - cex->ec_block = block; - cex->ec_len = len; - cex->ec_start = start; - spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); -} - /* * ext4_ext_put_gap_in_cache: * calculate boundaries of the gap that the requested block fits into @@ -1969,15 +2283,16 @@ ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path, ext4_lblk_t block) { int depth = ext_depth(inode); - unsigned long len; - ext4_lblk_t lblock; + unsigned long len = 0; + ext4_lblk_t lblock = 0; struct ext4_extent *ex; ex = path[depth].p_ext; if (ex == NULL) { - /* there is no extent yet, so gap is [0;-] */ - lblock = 0; - len = EXT_MAX_BLOCKS; + /* + * there is no extent yet, so gap is [0;-] and we + * don't cache it + */ ext_debug("cache gap(whole file):"); } else if (block < le32_to_cpu(ex->ee_block)) { lblock = block; @@ -1986,6 +2301,9 @@ ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path, block, le32_to_cpu(ex->ee_block), ext4_ext_get_actual_len(ex)); + if (!ext4_find_delalloc_range(inode, lblock, lblock + len - 1)) + ext4_es_insert_extent(inode, lblock, len, ~0, + EXTENT_STATUS_HOLE); } else if (block >= le32_to_cpu(ex->ee_block) + ext4_ext_get_actual_len(ex)) { ext4_lblk_t next; @@ -1999,108 +2317,29 @@ ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path, block); BUG_ON(next == lblock); len = next - lblock; + if (!ext4_find_delalloc_range(inode, lblock, lblock + len - 1)) + ext4_es_insert_extent(inode, lblock, len, ~0, + EXTENT_STATUS_HOLE); } else { - lblock = len = 0; BUG(); } ext_debug(" -> %u:%lu\n", lblock, len); - ext4_ext_put_in_cache(inode, lblock, len, 0); -} - -/* - * ext4_ext_check_cache() - * Checks to see if the given block is in the cache. - * If it is, the cached extent is stored in the given - * cache extent pointer. If the cached extent is a hole, - * this routine should be used instead of - * ext4_ext_in_cache if the calling function needs to - * know the size of the hole. - * - * @inode: The files inode - * @block: The block to look for in the cache - * @ex: Pointer where the cached extent will be stored - * if it contains block - * - * Return 0 if cache is invalid; 1 if the cache is valid - */ -static int ext4_ext_check_cache(struct inode *inode, ext4_lblk_t block, - struct ext4_ext_cache *ex){ - struct ext4_ext_cache *cex; - struct ext4_sb_info *sbi; - int ret = 0; - - /* - * We borrow i_block_reservation_lock to protect i_cached_extent - */ - spin_lock(&EXT4_I(inode)->i_block_reservation_lock); - cex = &EXT4_I(inode)->i_cached_extent; - sbi = EXT4_SB(inode->i_sb); - - /* has cache valid data? */ - if (cex->ec_len == 0) - goto errout; - - if (in_range(block, cex->ec_block, cex->ec_len)) { - memcpy(ex, cex, sizeof(struct ext4_ext_cache)); - ext_debug("%u cached by %u:%u:%llu\n", - block, - cex->ec_block, cex->ec_len, cex->ec_start); - ret = 1; - } -errout: - if (!ret) - sbi->extent_cache_misses++; - else - sbi->extent_cache_hits++; - trace_ext4_ext_in_cache(inode, block, ret); - spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); - return ret; -} - -/* - * ext4_ext_in_cache() - * Checks to see if the given block is in the cache. - * If it is, the cached extent is stored in the given - * extent pointer. - * - * @inode: The files inode - * @block: The block to look for in the cache - * @ex: Pointer where the cached extent will be stored - * if it contains block - * - * Return 0 if cache is invalid; 1 if the cache is valid - */ -static int -ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block, - struct ext4_extent *ex) -{ - struct ext4_ext_cache cex; - int ret = 0; - - if (ext4_ext_check_cache(inode, block, &cex)) { - ex->ee_block = cpu_to_le32(cex.ec_block); - ext4_ext_store_pblock(ex, cex.ec_start); - ex->ee_len = cpu_to_le16(cex.ec_len); - ret = 1; - } - - return ret; } - /* * ext4_ext_rm_idx: * removes index from the index block. */ static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode, - struct ext4_ext_path *path) + struct ext4_ext_path *path, int depth) { int err; ext4_fsblk_t leaf; /* free index block */ - path--; + depth--; + path = path + depth; leaf = ext4_idx_pblock(path->p_idx); if (unlikely(path->p_hdr->eh_entries == 0)) { EXT4_ERROR_INODE(inode, "path->p_hdr->eh_entries == 0"); @@ -2125,6 +2364,19 @@ static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode, ext4_free_blocks(handle, inode, NULL, leaf, 1, EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); + + while (--depth >= 0) { + if (path->p_idx != EXT_FIRST_INDEX(path->p_hdr)) + break; + path--; + err = ext4_ext_get_access(handle, inode, path); + if (err) + break; + path->p_idx->ei_block = (path+1)->p_idx->ei_block; + err = ext4_ext_dirty(handle, inode, path); + if (err) + break; + } return err; } @@ -2164,22 +2416,26 @@ int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int nrblocks, } /* - * How many index/leaf blocks need to change/allocate to modify nrblocks? + * How many index/leaf blocks need to change/allocate to add @extents extents? * - * if nrblocks are fit in a single extent (chunk flag is 1), then - * in the worse case, each tree level index/leaf need to be changed - * if the tree split due to insert a new extent, then the old tree - * index/leaf need to be updated too + * If we add a single extent, then in the worse case, each tree level + * index/leaf need to be changed in case of the tree split. * - * If the nrblocks are discontiguous, they could cause - * the whole tree split more than once, but this is really rare. + * If more extents are inserted, they could cause the whole tree split more + * than once, but this is really rare. */ -int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, int chunk) +int ext4_ext_index_trans_blocks(struct inode *inode, int extents) { int index; - int depth = ext_depth(inode); + int depth; + + /* If we are converting the inline data, only one is needed here. */ + if (ext4_has_inline_data(inode)) + return 1; - if (chunk) + depth = ext_depth(inode); + + if (extents <= 1) index = depth * 2; else index = depth * 3; @@ -2187,18 +2443,25 @@ int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, int chunk) return index; } +static inline int get_default_free_blocks_flags(struct inode *inode) +{ + if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) + return EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET; + else if (ext4_should_journal_data(inode)) + return EXT4_FREE_BLOCKS_FORGET; + return 0; +} + static int ext4_remove_blocks(handle_t *handle, struct inode *inode, struct ext4_extent *ex, - ext4_fsblk_t *partial_cluster, + long long *partial_cluster, ext4_lblk_t from, ext4_lblk_t to) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); unsigned short ee_len = ext4_ext_get_actual_len(ex); ext4_fsblk_t pblk; - int flags = EXT4_FREE_BLOCKS_FORGET; + int flags = get_default_free_blocks_flags(inode); - if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) - flags |= EXT4_FREE_BLOCKS_METADATA; /* * For bigalloc file systems, we never free a partial cluster * at the beginning of the extent. Instead, we make a note @@ -2215,7 +2478,8 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode, * partial cluster here. */ pblk = ext4_ext_pblock(ex) + ee_len - 1; - if (*partial_cluster && (EXT4_B2C(sbi, pblk) != *partial_cluster)) { + if ((*partial_cluster > 0) && + (EXT4_B2C(sbi, pblk) != *partial_cluster)) { ext4_free_blocks(handle, inode, NULL, EXT4_C2B(sbi, *partial_cluster), sbi->s_cluster_ratio, flags); @@ -2241,41 +2505,46 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode, && to == le32_to_cpu(ex->ee_block) + ee_len - 1) { /* tail removal */ ext4_lblk_t num; + unsigned int unaligned; num = le32_to_cpu(ex->ee_block) + ee_len - from; pblk = ext4_ext_pblock(ex) + ee_len - num; - ext_debug("free last %u blocks starting %llu\n", num, pblk); + /* + * Usually we want to free partial cluster at the end of the + * extent, except for the situation when the cluster is still + * used by any other extent (partial_cluster is negative). + */ + if (*partial_cluster < 0 && + -(*partial_cluster) == EXT4_B2C(sbi, pblk + num - 1)) + flags |= EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER; + + ext_debug("free last %u blocks starting %llu partial %lld\n", + num, pblk, *partial_cluster); ext4_free_blocks(handle, inode, NULL, pblk, num, flags); /* * If the block range to be freed didn't start at the * beginning of a cluster, and we removed the entire - * extent, save the partial cluster here, since we - * might need to delete if we determine that the - * truncate operation has removed all of the blocks in - * the cluster. + * extent and the cluster is not used by any other extent, + * save the partial cluster here, since we might need to + * delete if we determine that the truncate operation has + * removed all of the blocks in the cluster. + * + * On the other hand, if we did not manage to free the whole + * extent, we have to mark the cluster as used (store negative + * cluster number in partial_cluster). */ - if (pblk & (sbi->s_cluster_ratio - 1) && - (ee_len == num)) + unaligned = EXT4_PBLK_COFF(sbi, pblk); + if (unaligned && (ee_len == num) && + (*partial_cluster != -((long long)EXT4_B2C(sbi, pblk)))) *partial_cluster = EXT4_B2C(sbi, pblk); - else + else if (unaligned) + *partial_cluster = -((long long)EXT4_B2C(sbi, pblk)); + else if (*partial_cluster > 0) *partial_cluster = 0; - } else if (from == le32_to_cpu(ex->ee_block) - && to <= le32_to_cpu(ex->ee_block) + ee_len - 1) { - /* head removal */ - ext4_lblk_t num; - ext4_fsblk_t start; - - num = to - from; - start = ext4_ext_pblock(ex); - - ext_debug("free first %u blocks starting %llu\n", num, start); - ext4_free_blocks(handle, inode, NULL, start, num, flags); - - } else { - printk(KERN_INFO "strange request: removal(2) " - "%u-%u from %u:%u\n", - from, to, le32_to_cpu(ex->ee_block), ee_len); - } + } else + ext4_error(sbi->s_sb, "strange request: removal(2) " + "%u-%u from %u:%u\n", + from, to, le32_to_cpu(ex->ee_block), ee_len); return 0; } @@ -2288,12 +2557,16 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode, * @handle: The journal handle * @inode: The files inode * @path: The path to the leaf + * @partial_cluster: The cluster which we'll have to free if all extents + * has been released from it. It gets negative in case + * that the cluster is still used. * @start: The first block to remove * @end: The last block to remove */ static int ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, - struct ext4_ext_path *path, ext4_fsblk_t *partial_cluster, + struct ext4_ext_path *path, + long long *partial_cluster, ext4_lblk_t start, ext4_lblk_t end) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); @@ -2304,11 +2577,12 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, unsigned num; ext4_lblk_t ex_ee_block; unsigned short ex_ee_len; - unsigned uninitialized = 0; + unsigned unwritten = 0; struct ext4_extent *ex; + ext4_fsblk_t pblk; /* the header must be checked already in ext4_ext_remove_space() */ - ext_debug("truncate since %u in leaf\n", start); + ext_debug("truncate since %u in leaf to %u\n", start, end); if (!path[depth].p_hdr) path[depth].p_hdr = ext_block_hdr(path[depth].p_bh); eh = path[depth].p_hdr; @@ -2317,23 +2591,46 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, return -EIO; } /* find where to start removing */ - ex = EXT_LAST_EXTENT(eh); + ex = path[depth].p_ext; + if (!ex) + ex = EXT_LAST_EXTENT(eh); ex_ee_block = le32_to_cpu(ex->ee_block); ex_ee_len = ext4_ext_get_actual_len(ex); + /* + * If we're starting with an extent other than the last one in the + * node, we need to see if it shares a cluster with the extent to + * the right (towards the end of the file). If its leftmost cluster + * is this extent's rightmost cluster and it is not cluster aligned, + * we'll mark it as a partial that is not to be deallocated. + */ + + if (ex != EXT_LAST_EXTENT(eh)) { + ext4_fsblk_t current_pblk, right_pblk; + long long current_cluster, right_cluster; + + current_pblk = ext4_ext_pblock(ex) + ex_ee_len - 1; + current_cluster = (long long)EXT4_B2C(sbi, current_pblk); + right_pblk = ext4_ext_pblock(ex + 1); + right_cluster = (long long)EXT4_B2C(sbi, right_pblk); + if (current_cluster == right_cluster && + EXT4_PBLK_COFF(sbi, right_pblk)) + *partial_cluster = -right_cluster; + } + trace_ext4_ext_rm_leaf(inode, start, ex, *partial_cluster); while (ex >= EXT_FIRST_EXTENT(eh) && ex_ee_block + ex_ee_len > start) { - if (ext4_ext_is_uninitialized(ex)) - uninitialized = 1; + if (ext4_ext_is_unwritten(ex)) + unwritten = 1; else - uninitialized = 0; + unwritten = 0; ext_debug("remove ext %u:[%d]%d\n", ex_ee_block, - uninitialized, ex_ee_len); + unwritten, ex_ee_len); path[depth].p_ext = ex; a = ex_ee_block > start ? ex_ee_block : start; @@ -2343,14 +2640,27 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, ext_debug(" border %u:%u\n", a, b); /* If this extent is beyond the end of the hole, skip it */ - if (end <= ex_ee_block) { + if (end < ex_ee_block) { + /* + * We're going to skip this extent and move to another, + * so if this extent is not cluster aligned we have + * to mark the current cluster as used to avoid + * accidentally freeing it later on + */ + pblk = ext4_ext_pblock(ex); + if (EXT4_PBLK_COFF(sbi, pblk)) + *partial_cluster = + -((long long)EXT4_B2C(sbi, pblk)); ex--; ex_ee_block = le32_to_cpu(ex->ee_block); ex_ee_len = ext4_ext_get_actual_len(ex); continue; } else if (b != ex_ee_block + ex_ee_len - 1) { - EXT4_ERROR_INODE(inode," bad truncate %u:%u\n", - start, end); + EXT4_ERROR_INODE(inode, + "can not handle truncate %u:%u " + "on extent %u:%u", + start, end, ex_ee_block, + ex_ee_block + ex_ee_len - 1); err = -EIO; goto out; } else if (a != ex_ee_block) { @@ -2392,11 +2702,11 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, ex->ee_len = cpu_to_le16(num); /* - * Do not mark uninitialized if all the blocks in the + * Do not mark unwritten if all the blocks in the * extent have been removed. */ - if (uninitialized && num) - ext4_ext_mark_uninitialized(ex); + if (unwritten && num) + ext4_ext_mark_unwritten(ex); /* * If the extent was completely released, * we need to remove it from the leaf @@ -2416,7 +2726,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, sizeof(struct ext4_extent)); } le16_add_cpu(&eh->eh_entries, -1); - } else + } else if (*partial_cluster > 0) *partial_cluster = 0; err = ext4_ext_dirty(handle, inode, path + depth); @@ -2434,17 +2744,18 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, err = ext4_ext_correct_indexes(handle, inode, path); /* - * If there is still a entry in the leaf node, check to see if - * it references the partial cluster. This is the only place - * where it could; if it doesn't, we can free the cluster. + * If there's a partial cluster and at least one extent remains in + * the leaf, free the partial cluster if it isn't shared with the + * current extent. If there's a partial cluster and no extents + * remain in the leaf, it can't be freed here. It can only be + * freed when it's possible to determine if it's not shared with + * any other extent - when the next leaf is processed or when space + * removal is complete. */ - if (*partial_cluster && ex >= EXT_FIRST_EXTENT(eh) && + if (*partial_cluster > 0 && eh->eh_entries && (EXT4_B2C(sbi, ext4_ext_pblock(ex) + ex_ee_len - 1) != *partial_cluster)) { - int flags = EXT4_FREE_BLOCKS_FORGET; - - if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) - flags |= EXT4_FREE_BLOCKS_METADATA; + int flags = get_default_free_blocks_flags(inode); ext4_free_blocks(handle, inode, NULL, EXT4_C2B(sbi, *partial_cluster), @@ -2455,7 +2766,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, /* if this leaf is free, then we should * remove it from index block above */ if (err == 0 && eh->eh_entries == 0 && path[depth].p_bh != NULL) - err = ext4_ext_rm_idx(handle, inode, path + depth); + err = ext4_ext_rm_idx(handle, inode, path, depth); out: return err; @@ -2482,51 +2793,122 @@ ext4_ext_more_to_rm(struct ext4_ext_path *path) return 1; } -static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start) +int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start, + ext4_lblk_t end) { struct super_block *sb = inode->i_sb; int depth = ext_depth(inode); - struct ext4_ext_path *path; - ext4_fsblk_t partial_cluster = 0; + struct ext4_ext_path *path = NULL; + long long partial_cluster = 0; handle_t *handle; - int i, err; + int i = 0, err = 0; - ext_debug("truncate since %u\n", start); + ext_debug("truncate since %u to %u\n", start, end); /* probably first extent we're gonna free will be last in block */ - handle = ext4_journal_start(inode, depth + 1); + handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, depth + 1); if (IS_ERR(handle)) return PTR_ERR(handle); again: - ext4_ext_invalidate_cache(inode); + trace_ext4_ext_remove_space(inode, start, end, depth); + + /* + * Check if we are removing extents inside the extent tree. If that + * is the case, we are going to punch a hole inside the extent tree + * so we have to check whether we need to split the extent covering + * the last block to remove so we can easily remove the part of it + * in ext4_ext_rm_leaf(). + */ + if (end < EXT_MAX_BLOCKS - 1) { + struct ext4_extent *ex; + ext4_lblk_t ee_block; + + /* find extent for this block */ + path = ext4_ext_find_extent(inode, end, NULL, EXT4_EX_NOCACHE); + if (IS_ERR(path)) { + ext4_journal_stop(handle); + return PTR_ERR(path); + } + depth = ext_depth(inode); + /* Leaf not may not exist only if inode has no blocks at all */ + ex = path[depth].p_ext; + if (!ex) { + if (depth) { + EXT4_ERROR_INODE(inode, + "path[%d].p_hdr == NULL", + depth); + err = -EIO; + } + goto out; + } - trace_ext4_ext_remove_space(inode, start, depth); + ee_block = le32_to_cpu(ex->ee_block); + /* + * See if the last block is inside the extent, if so split + * the extent at 'end' block so we can easily remove the + * tail of the first part of the split extent in + * ext4_ext_rm_leaf(). + */ + if (end >= ee_block && + end < ee_block + ext4_ext_get_actual_len(ex) - 1) { + int split_flag = 0; + + if (ext4_ext_is_unwritten(ex)) + split_flag = EXT4_EXT_MARK_UNWRIT1 | + EXT4_EXT_MARK_UNWRIT2; + + /* + * Split the extent in two so that 'end' is the last + * block in the first new extent. Also we should not + * fail removing space due to ENOSPC so try to use + * reserved block if that happens. + */ + err = ext4_split_extent_at(handle, inode, path, + end + 1, split_flag, + EXT4_EX_NOCACHE | + EXT4_GET_BLOCKS_PRE_IO | + EXT4_GET_BLOCKS_METADATA_NOFAIL); + + if (err < 0) + goto out; + } + } /* * We start scanning from right side, freeing all the blocks * after i_size and walking into the tree depth-wise. */ depth = ext_depth(inode); - path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 1), GFP_NOFS); - if (path == NULL) { - ext4_journal_stop(handle); - return -ENOMEM; - } - path[0].p_depth = depth; - path[0].p_hdr = ext_inode_hdr(inode); - if (ext4_ext_check(inode, path[0].p_hdr, depth)) { - err = -EIO; - goto out; + if (path) { + int k = i = depth; + while (--k > 0) + path[k].p_block = + le16_to_cpu(path[k].p_hdr->eh_entries)+1; + } else { + path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 1), + GFP_NOFS); + if (path == NULL) { + ext4_journal_stop(handle); + return -ENOMEM; + } + path[0].p_depth = depth; + path[0].p_hdr = ext_inode_hdr(inode); + i = 0; + + if (ext4_ext_check(inode, path[0].p_hdr, depth, 0)) { + err = -EIO; + goto out; + } } - i = err = 0; + err = 0; while (i >= 0 && err == 0) { if (i == depth) { /* this is leaf block */ err = ext4_ext_rm_leaf(handle, inode, path, &partial_cluster, start, - EXT_MAX_BLOCKS - 1); + end); /* root level has p_bh == NULL, brelse() eats this */ brelse(path[i].p_bh); path[i].p_bh = NULL; @@ -2561,21 +2943,21 @@ again: ext_debug("move to level %d (block %llu)\n", i + 1, ext4_idx_pblock(path[i].p_idx)); memset(path + i + 1, 0, sizeof(*path)); - bh = sb_bread(sb, ext4_idx_pblock(path[i].p_idx)); - if (!bh) { + bh = read_extent_tree_block(inode, + ext4_idx_pblock(path[i].p_idx), depth - i - 1, + EXT4_EX_NOCACHE); + if (IS_ERR(bh)) { /* should we reset i_size? */ - err = -EIO; + err = PTR_ERR(bh); break; } + /* Yield here to deal with large extent trees. + * Should be a no-op if we did IO above. */ + cond_resched(); if (WARN_ON(i + 1 > depth)) { err = -EIO; break; } - if (ext4_ext_check(inode, ext_block_hdr(bh), - depth - i - 1)) { - err = -EIO; - break; - } path[i + 1].p_bh = bh; /* save actual number of indexes since this @@ -2588,7 +2970,7 @@ again: /* index is empty, remove it; * handle must be already prepared by the * truncatei_leaf() */ - err = ext4_ext_rm_idx(handle, inode, path + i); + err = ext4_ext_rm_idx(handle, inode, path, i); } /* root level has p_bh == NULL, brelse() eats this */ brelse(path[i].p_bh); @@ -2598,17 +2980,14 @@ again: } } - trace_ext4_ext_remove_space_done(inode, start, depth, partial_cluster, - path->p_hdr->eh_entries); + trace_ext4_ext_remove_space_done(inode, start, end, depth, + partial_cluster, path->p_hdr->eh_entries); /* If we still have something in the partial cluster and we have removed * even the first extent, then we should free the blocks in the partial * cluster as well. */ - if (partial_cluster && path->p_hdr->eh_entries == 0) { - int flags = EXT4_FREE_BLOCKS_FORGET; - - if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) - flags |= EXT4_FREE_BLOCKS_METADATA; + if (partial_cluster > 0 && path->p_hdr->eh_entries == 0) { + int flags = get_default_free_blocks_flags(inode); ext4_free_blocks(handle, inode, NULL, EXT4_C2B(EXT4_SB(sb), partial_cluster), @@ -2633,8 +3012,10 @@ again: out: ext4_ext_drop_refs(path); kfree(path); - if (err == -EAGAIN) + if (err == -EAGAIN) { + path = NULL; goto again; + } ext4_journal_stop(handle); return err; @@ -2651,17 +3032,17 @@ void ext4_ext_init(struct super_block *sb) if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) { #if defined(AGGRESSIVE_TEST) || defined(CHECK_BINSEARCH) || defined(EXTENTS_STATS) - printk(KERN_INFO "EXT4-fs: file extents enabled"); + printk(KERN_INFO "EXT4-fs: file extents enabled" #ifdef AGGRESSIVE_TEST - printk(", aggressive tests"); + ", aggressive tests" #endif #ifdef CHECK_BINSEARCH - printk(", check binsearch"); + ", check binsearch" #endif #ifdef EXTENTS_STATS - printk(", stats"); + ", stats" #endif - printk("\n"); + "\n"); #endif #ifdef EXTENTS_STATS spin_lock_init(&EXT4_SB(sb)->s_ext_stats_lock); @@ -2691,6 +3072,23 @@ void ext4_ext_release(struct super_block *sb) #endif } +static int ext4_zeroout_es(struct inode *inode, struct ext4_extent *ex) +{ + ext4_lblk_t ee_block; + ext4_fsblk_t ee_pblock; + unsigned int ee_len; + + ee_block = le32_to_cpu(ex->ee_block); + ee_len = ext4_ext_get_actual_len(ex); + ee_pblock = ext4_ext_pblock(ex); + + if (ee_len == 0) + return 0; + + return ext4_es_insert_extent(inode, ee_block, ee_len, ee_pblock, + EXTENT_STATUS_WRITTEN); +} + /* FIXME!! we need to try to merge to left or right after zero-out */ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex) { @@ -2709,14 +3107,6 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex) } /* - * used by extent splitting. - */ -#define EXT4_EXT_MAY_ZEROOUT 0x1 /* safe to zeroout if split fails \ - due to ENOSPC */ -#define EXT4_EXT_MARK_UNINIT1 0x2 /* mark first half uninitialized */ -#define EXT4_EXT_MARK_UNINIT2 0x4 /* mark second half uninitialized */ - -/* * ext4_split_extent_at() splits an extent at given block. * * @handle: the journal handle @@ -2724,7 +3114,7 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex) * @path: the path to the extent * @split: the logical block where the extent is splitted. * @split_flags: indicates if the extent could be zeroout if split fails, and - * the states(init or uninit) of new extents. + * the states(init or unwritten) of new extents. * @flags: flags used to insert new extent to extent tree. * * @@ -2746,11 +3136,14 @@ static int ext4_split_extent_at(handle_t *handle, { ext4_fsblk_t newblock; ext4_lblk_t ee_block; - struct ext4_extent *ex, newex, orig_ex; + struct ext4_extent *ex, newex, orig_ex, zero_ex; struct ext4_extent *ex2 = NULL; unsigned int ee_len, depth; int err = 0; + BUG_ON((split_flag & (EXT4_EXT_DATA_VALID1 | EXT4_EXT_DATA_VALID2)) == + (EXT4_EXT_DATA_VALID1 | EXT4_EXT_DATA_VALID2)); + ext_debug("ext4_split_extents_at: inode %lu, logical" "block %llu\n", inode->i_ino, (unsigned long long)split); @@ -2763,6 +3156,10 @@ static int ext4_split_extent_at(handle_t *handle, newblock = split - ee_block + ext4_ext_pblock(ex); BUG_ON(split < ee_block || split >= (ee_block + ee_len)); + BUG_ON(!ext4_ext_is_unwritten(ex) && + split_flag & (EXT4_EXT_MAY_ZEROOUT | + EXT4_EXT_MARK_UNWRIT1 | + EXT4_EXT_MARK_UNWRIT2)); err = ext4_ext_get_access(handle, inode, path + depth); if (err) @@ -2774,23 +3171,23 @@ static int ext4_split_extent_at(handle_t *handle, * then we just change the state of the extent, and splitting * is not needed. */ - if (split_flag & EXT4_EXT_MARK_UNINIT2) - ext4_ext_mark_uninitialized(ex); + if (split_flag & EXT4_EXT_MARK_UNWRIT2) + ext4_ext_mark_unwritten(ex); else ext4_ext_mark_initialized(ex); if (!(flags & EXT4_GET_BLOCKS_PRE_IO)) - ext4_ext_try_to_merge(inode, path, ex); + ext4_ext_try_to_merge(handle, inode, path, ex); - err = ext4_ext_dirty(handle, inode, path + depth); + err = ext4_ext_dirty(handle, inode, path + path->p_depth); goto out; } /* case a */ memcpy(&orig_ex, ex, sizeof(orig_ex)); ex->ee_len = cpu_to_le16(split - ee_block); - if (split_flag & EXT4_EXT_MARK_UNINIT1) - ext4_ext_mark_uninitialized(ex); + if (split_flag & EXT4_EXT_MARK_UNWRIT1) + ext4_ext_mark_unwritten(ex); /* * path may lead to new leaf, not to original leaf any more @@ -2804,18 +3201,48 @@ static int ext4_split_extent_at(handle_t *handle, ex2->ee_block = cpu_to_le32(split); ex2->ee_len = cpu_to_le16(ee_len - (split - ee_block)); ext4_ext_store_pblock(ex2, newblock); - if (split_flag & EXT4_EXT_MARK_UNINIT2) - ext4_ext_mark_uninitialized(ex2); + if (split_flag & EXT4_EXT_MARK_UNWRIT2) + ext4_ext_mark_unwritten(ex2); err = ext4_ext_insert_extent(handle, inode, path, &newex, flags); if (err == -ENOSPC && (EXT4_EXT_MAY_ZEROOUT & split_flag)) { - err = ext4_ext_zeroout(inode, &orig_ex); + if (split_flag & (EXT4_EXT_DATA_VALID1|EXT4_EXT_DATA_VALID2)) { + if (split_flag & EXT4_EXT_DATA_VALID1) { + err = ext4_ext_zeroout(inode, ex2); + zero_ex.ee_block = ex2->ee_block; + zero_ex.ee_len = cpu_to_le16( + ext4_ext_get_actual_len(ex2)); + ext4_ext_store_pblock(&zero_ex, + ext4_ext_pblock(ex2)); + } else { + err = ext4_ext_zeroout(inode, ex); + zero_ex.ee_block = ex->ee_block; + zero_ex.ee_len = cpu_to_le16( + ext4_ext_get_actual_len(ex)); + ext4_ext_store_pblock(&zero_ex, + ext4_ext_pblock(ex)); + } + } else { + err = ext4_ext_zeroout(inode, &orig_ex); + zero_ex.ee_block = orig_ex.ee_block; + zero_ex.ee_len = cpu_to_le16( + ext4_ext_get_actual_len(&orig_ex)); + ext4_ext_store_pblock(&zero_ex, + ext4_ext_pblock(&orig_ex)); + } + if (err) goto fix_extent_len; /* update the extent length and mark as initialized */ - ex->ee_len = cpu_to_le32(ee_len); - ext4_ext_try_to_merge(inode, path, ex); - err = ext4_ext_dirty(handle, inode, path + depth); + ex->ee_len = cpu_to_le16(ee_len); + ext4_ext_try_to_merge(handle, inode, path, ex); + err = ext4_ext_dirty(handle, inode, path + path->p_depth); + if (err) + goto fix_extent_len; + + /* update extent status tree */ + err = ext4_zeroout_es(inode, &zero_ex); + goto out; } else if (err) goto fix_extent_len; @@ -2834,7 +3261,7 @@ fix_extent_len: * ext4_split_extents() splits an extent and mark extent which is covered * by @map as split_flags indicates * - * It may result in splitting the extent into multiple extents (upto three) + * It may result in splitting the extent into multiple extents (up to three) * There are three possibilities: * a> There is no split required * b> Splits in two extents: Split is happening at either end of the extent @@ -2852,40 +3279,56 @@ static int ext4_split_extent(handle_t *handle, struct ext4_extent *ex; unsigned int ee_len, depth; int err = 0; - int uninitialized; + int unwritten; int split_flag1, flags1; + int allocated = map->m_len; depth = ext_depth(inode); ex = path[depth].p_ext; ee_block = le32_to_cpu(ex->ee_block); ee_len = ext4_ext_get_actual_len(ex); - uninitialized = ext4_ext_is_uninitialized(ex); + unwritten = ext4_ext_is_unwritten(ex); if (map->m_lblk + map->m_len < ee_block + ee_len) { - split_flag1 = split_flag & EXT4_EXT_MAY_ZEROOUT ? - EXT4_EXT_MAY_ZEROOUT : 0; + split_flag1 = split_flag & EXT4_EXT_MAY_ZEROOUT; flags1 = flags | EXT4_GET_BLOCKS_PRE_IO; - if (uninitialized) - split_flag1 |= EXT4_EXT_MARK_UNINIT1 | - EXT4_EXT_MARK_UNINIT2; + if (unwritten) + split_flag1 |= EXT4_EXT_MARK_UNWRIT1 | + EXT4_EXT_MARK_UNWRIT2; + if (split_flag & EXT4_EXT_DATA_VALID2) + split_flag1 |= EXT4_EXT_DATA_VALID1; err = ext4_split_extent_at(handle, inode, path, map->m_lblk + map->m_len, split_flag1, flags1); if (err) goto out; + } else { + allocated = ee_len - (map->m_lblk - ee_block); } - + /* + * Update path is required because previous ext4_split_extent_at() may + * result in split of original leaf or extent zeroout. + */ ext4_ext_drop_refs(path); - path = ext4_ext_find_extent(inode, map->m_lblk, path); + path = ext4_ext_find_extent(inode, map->m_lblk, path, 0); if (IS_ERR(path)) return PTR_ERR(path); + depth = ext_depth(inode); + ex = path[depth].p_ext; + if (!ex) { + EXT4_ERROR_INODE(inode, "unexpected hole at %lu", + (unsigned long) map->m_lblk); + return -EIO; + } + unwritten = ext4_ext_is_unwritten(ex); + split_flag1 = 0; if (map->m_lblk >= ee_block) { - split_flag1 = split_flag & EXT4_EXT_MAY_ZEROOUT ? - EXT4_EXT_MAY_ZEROOUT : 0; - if (uninitialized) - split_flag1 |= EXT4_EXT_MARK_UNINIT1; - if (split_flag & EXT4_EXT_MARK_UNINIT2) - split_flag1 |= EXT4_EXT_MARK_UNINIT2; + split_flag1 = split_flag & EXT4_EXT_DATA_VALID2; + if (unwritten) { + split_flag1 |= EXT4_EXT_MARK_UNWRIT1; + split_flag1 |= split_flag & (EXT4_EXT_MAY_ZEROOUT | + EXT4_EXT_MARK_UNWRIT2); + } err = ext4_split_extent_at(handle, inode, path, map->m_lblk, split_flag1, flags); if (err) @@ -2894,22 +3337,21 @@ static int ext4_split_extent(handle_t *handle, ext4_ext_show_leaf(inode, path); out: - return err ? err : map->m_len; + return err ? err : allocated; } -#define EXT4_EXT_ZERO_LEN 7 /* * This function is called by ext4_ext_map_blocks() if someone tries to write - * to an uninitialized extent. It may result in splitting the uninitialized + * to an unwritten extent. It may result in splitting the unwritten * extent into multiple extents (up to three - one initialized and two - * uninitialized). + * unwritten). * There are three possibilities: * a> There is no split required: Entire extent should be initialized * b> Splits in two extents: Write is happening at either end of the extent * c> Splits in three extents: Somone is writing in middle of the extent * * Pre-conditions: - * - The extent pointed to by 'path' is uninitialized. + * - The extent pointed to by 'path' is unwritten. * - The extent pointed to by 'path' contains a superset * of the logical span [map->m_lblk, map->m_lblk + map->m_len). * @@ -2921,134 +3363,187 @@ out: static int ext4_ext_convert_to_initialized(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, - struct ext4_ext_path *path) + struct ext4_ext_path *path, + int flags) { + struct ext4_sb_info *sbi; struct ext4_extent_header *eh; struct ext4_map_blocks split_map; struct ext4_extent zero_ex; - struct ext4_extent *ex; + struct ext4_extent *ex, *abut_ex; ext4_lblk_t ee_block, eof_block; - unsigned int ee_len, depth; - int allocated; + unsigned int ee_len, depth, map_len = map->m_len; + int allocated = 0, max_zeroout = 0; int err = 0; int split_flag = 0; ext_debug("ext4_ext_convert_to_initialized: inode %lu, logical" "block %llu, max_blocks %u\n", inode->i_ino, - (unsigned long long)map->m_lblk, map->m_len); + (unsigned long long)map->m_lblk, map_len); + sbi = EXT4_SB(inode->i_sb); eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >> inode->i_sb->s_blocksize_bits; - if (eof_block < map->m_lblk + map->m_len) - eof_block = map->m_lblk + map->m_len; + if (eof_block < map->m_lblk + map_len) + eof_block = map->m_lblk + map_len; depth = ext_depth(inode); eh = path[depth].p_hdr; ex = path[depth].p_ext; ee_block = le32_to_cpu(ex->ee_block); ee_len = ext4_ext_get_actual_len(ex); - allocated = ee_len - (map->m_lblk - ee_block); + zero_ex.ee_len = 0; trace_ext4_ext_convert_to_initialized_enter(inode, map, ex); /* Pre-conditions */ - BUG_ON(!ext4_ext_is_uninitialized(ex)); + BUG_ON(!ext4_ext_is_unwritten(ex)); BUG_ON(!in_range(map->m_lblk, ee_block, ee_len)); /* * Attempt to transfer newly initialized blocks from the currently - * uninitialized extent to its left neighbor. This is much cheaper + * unwritten extent to its neighbor. This is much cheaper * than an insertion followed by a merge as those involve costly - * memmove() calls. This is the common case in steady state for - * workloads doing fallocate(FALLOC_FL_KEEP_SIZE) followed by append - * writes. + * memmove() calls. Transferring to the left is the common case in + * steady state for workloads doing fallocate(FALLOC_FL_KEEP_SIZE) + * followed by append writes. * * Limitations of the current logic: - * - L1: we only deal with writes at the start of the extent. - * The approach could be extended to writes at the end - * of the extent but this scenario was deemed less common. - * - L2: we do not deal with writes covering the whole extent. + * - L1: we do not deal with writes covering the whole extent. * This would require removing the extent if the transfer * is possible. - * - L3: we only attempt to merge with an extent stored in the + * - L2: we only attempt to merge with an extent stored in the * same extent tree node. */ - if ((map->m_lblk == ee_block) && /*L1*/ - (map->m_len < ee_len) && /*L2*/ - (ex > EXT_FIRST_EXTENT(eh))) { /*L3*/ - struct ext4_extent *prev_ex; + if ((map->m_lblk == ee_block) && + /* See if we can merge left */ + (map_len < ee_len) && /*L1*/ + (ex > EXT_FIRST_EXTENT(eh))) { /*L2*/ ext4_lblk_t prev_lblk; ext4_fsblk_t prev_pblk, ee_pblk; - unsigned int prev_len, write_len; + unsigned int prev_len; - prev_ex = ex - 1; - prev_lblk = le32_to_cpu(prev_ex->ee_block); - prev_len = ext4_ext_get_actual_len(prev_ex); - prev_pblk = ext4_ext_pblock(prev_ex); + abut_ex = ex - 1; + prev_lblk = le32_to_cpu(abut_ex->ee_block); + prev_len = ext4_ext_get_actual_len(abut_ex); + prev_pblk = ext4_ext_pblock(abut_ex); ee_pblk = ext4_ext_pblock(ex); - write_len = map->m_len; /* - * A transfer of blocks from 'ex' to 'prev_ex' is allowed + * A transfer of blocks from 'ex' to 'abut_ex' is allowed * upon those conditions: - * - C1: prev_ex is initialized, - * - C2: prev_ex is logically abutting ex, - * - C3: prev_ex is physically abutting ex, - * - C4: prev_ex can receive the additional blocks without + * - C1: abut_ex is initialized, + * - C2: abut_ex is logically abutting ex, + * - C3: abut_ex is physically abutting ex, + * - C4: abut_ex can receive the additional blocks without * overflowing the (initialized) length limit. */ - if ((!ext4_ext_is_uninitialized(prev_ex)) && /*C1*/ + if ((!ext4_ext_is_unwritten(abut_ex)) && /*C1*/ ((prev_lblk + prev_len) == ee_block) && /*C2*/ ((prev_pblk + prev_len) == ee_pblk) && /*C3*/ - (prev_len < (EXT_INIT_MAX_LEN - write_len))) { /*C4*/ + (prev_len < (EXT_INIT_MAX_LEN - map_len))) { /*C4*/ err = ext4_ext_get_access(handle, inode, path + depth); if (err) goto out; trace_ext4_ext_convert_to_initialized_fastpath(inode, - map, ex, prev_ex); + map, ex, abut_ex); + + /* Shift the start of ex by 'map_len' blocks */ + ex->ee_block = cpu_to_le32(ee_block + map_len); + ext4_ext_store_pblock(ex, ee_pblk + map_len); + ex->ee_len = cpu_to_le16(ee_len - map_len); + ext4_ext_mark_unwritten(ex); /* Restore the flag */ - /* Shift the start of ex by 'write_len' blocks */ - ex->ee_block = cpu_to_le32(ee_block + write_len); - ext4_ext_store_pblock(ex, ee_pblk + write_len); - ex->ee_len = cpu_to_le16(ee_len - write_len); - ext4_ext_mark_uninitialized(ex); /* Restore the flag */ + /* Extend abut_ex by 'map_len' blocks */ + abut_ex->ee_len = cpu_to_le16(prev_len + map_len); - /* Extend prev_ex by 'write_len' blocks */ - prev_ex->ee_len = cpu_to_le16(prev_len + write_len); + /* Result: number of initialized blocks past m_lblk */ + allocated = map_len; + } + } else if (((map->m_lblk + map_len) == (ee_block + ee_len)) && + (map_len < ee_len) && /*L1*/ + ex < EXT_LAST_EXTENT(eh)) { /*L2*/ + /* See if we can merge right */ + ext4_lblk_t next_lblk; + ext4_fsblk_t next_pblk, ee_pblk; + unsigned int next_len; + + abut_ex = ex + 1; + next_lblk = le32_to_cpu(abut_ex->ee_block); + next_len = ext4_ext_get_actual_len(abut_ex); + next_pblk = ext4_ext_pblock(abut_ex); + ee_pblk = ext4_ext_pblock(ex); - /* Mark the block containing both extents as dirty */ - ext4_ext_dirty(handle, inode, path + depth); + /* + * A transfer of blocks from 'ex' to 'abut_ex' is allowed + * upon those conditions: + * - C1: abut_ex is initialized, + * - C2: abut_ex is logically abutting ex, + * - C3: abut_ex is physically abutting ex, + * - C4: abut_ex can receive the additional blocks without + * overflowing the (initialized) length limit. + */ + if ((!ext4_ext_is_unwritten(abut_ex)) && /*C1*/ + ((map->m_lblk + map_len) == next_lblk) && /*C2*/ + ((ee_pblk + ee_len) == next_pblk) && /*C3*/ + (next_len < (EXT_INIT_MAX_LEN - map_len))) { /*C4*/ + err = ext4_ext_get_access(handle, inode, path + depth); + if (err) + goto out; + + trace_ext4_ext_convert_to_initialized_fastpath(inode, + map, ex, abut_ex); + + /* Shift the start of abut_ex by 'map_len' blocks */ + abut_ex->ee_block = cpu_to_le32(next_lblk - map_len); + ext4_ext_store_pblock(abut_ex, next_pblk - map_len); + ex->ee_len = cpu_to_le16(ee_len - map_len); + ext4_ext_mark_unwritten(ex); /* Restore the flag */ - /* Update path to point to the right extent */ - path[depth].p_ext = prev_ex; + /* Extend abut_ex by 'map_len' blocks */ + abut_ex->ee_len = cpu_to_le16(next_len + map_len); /* Result: number of initialized blocks past m_lblk */ - allocated = write_len; - goto out; + allocated = map_len; } } + if (allocated) { + /* Mark the block containing both extents as dirty */ + ext4_ext_dirty(handle, inode, path + depth); + + /* Update path to point to the right extent */ + path[depth].p_ext = abut_ex; + goto out; + } else + allocated = ee_len - (map->m_lblk - ee_block); WARN_ON(map->m_lblk < ee_block); /* * It is safe to convert extent to initialized via explicit - * zeroout only if extent is fully insde i_size or new_size. + * zeroout only if extent is fully inside i_size or new_size. */ split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0; - /* If extent has less than 2*EXT4_EXT_ZERO_LEN zerout directly */ - if (ee_len <= 2*EXT4_EXT_ZERO_LEN && - (EXT4_EXT_MAY_ZEROOUT & split_flag)) { + if (EXT4_EXT_MAY_ZEROOUT & split_flag) + max_zeroout = sbi->s_extent_max_zeroout_kb >> + (inode->i_sb->s_blocksize_bits - 10); + + /* If extent is less than s_max_zeroout_kb, zeroout directly */ + if (max_zeroout && (ee_len <= max_zeroout)) { err = ext4_ext_zeroout(inode, ex); if (err) goto out; + zero_ex.ee_block = ex->ee_block; + zero_ex.ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)); + ext4_ext_store_pblock(&zero_ex, ext4_ext_pblock(ex)); err = ext4_ext_get_access(handle, inode, path + depth); if (err) goto out; ext4_ext_mark_initialized(ex); - ext4_ext_try_to_merge(inode, path, ex); - err = ext4_ext_dirty(handle, inode, path + depth); + ext4_ext_try_to_merge(handle, inode, path, ex); + err = ext4_ext_dirty(handle, inode, path + path->p_depth); goto out; } @@ -3062,9 +3557,8 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, split_map.m_lblk = map->m_lblk; split_map.m_len = map->m_len; - if (allocated > map->m_len) { - if (allocated <= EXT4_EXT_ZERO_LEN && - (EXT4_EXT_MAY_ZEROOUT & split_flag)) { + if (max_zeroout && (allocated > map->m_len)) { + if (allocated <= max_zeroout) { /* case 3 */ zero_ex.ee_block = cpu_to_le32(map->m_lblk); @@ -3076,9 +3570,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, goto out; split_map.m_lblk = map->m_lblk; split_map.m_len = allocated; - } else if ((map->m_lblk - ee_block + map->m_len < - EXT4_EXT_ZERO_LEN) && - (EXT4_EXT_MAY_ZEROOUT & split_flag)) { + } else if (map->m_lblk - ee_block + map->m_len < max_zeroout) { /* case 2 */ if (map->m_lblk != ee_block) { zero_ex.ee_block = ex->ee_block; @@ -3098,37 +3590,42 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, } allocated = ext4_split_extent(handle, inode, path, - &split_map, split_flag, 0); + &split_map, split_flag, flags); if (allocated < 0) err = allocated; out: + /* If we have gotten a failure, don't zero out status tree */ + if (!err) + err = ext4_zeroout_es(inode, &zero_ex); return err ? err : allocated; } /* * This function is called by ext4_ext_map_blocks() from * ext4_get_blocks_dio_write() when DIO to write - * to an uninitialized extent. + * to an unwritten extent. * - * Writing to an uninitialized extent may result in splitting the uninitialized - * extent into multiple /initialized uninitialized extents (up to three) + * Writing to an unwritten extent may result in splitting the unwritten + * extent into multiple initialized/unwritten extents (up to three) * There are three possibilities: - * a> There is no split required: Entire extent should be uninitialized + * a> There is no split required: Entire extent should be unwritten * b> Splits in two extents: Write is happening at either end of the extent * c> Splits in three extents: Somone is writing in middle of the extent * + * This works the same way in the case of initialized -> unwritten conversion. + * * One of more index blocks maybe needed if the extent tree grow after - * the uninitialized extent split. To prevent ENOSPC occur at the IO - * complete, we need to split the uninitialized extent before DIO submit - * the IO. The uninitialized extent called at this time will be split - * into three uninitialized extent(at most). After IO complete, the part + * the unwritten extent split. To prevent ENOSPC occur at the IO + * complete, we need to split the unwritten extent before DIO submit + * the IO. The unwritten extent called at this time will be split + * into three unwritten extent(at most). After IO complete, the part * being filled will be convert to initialized by the end_io callback function * via ext4_convert_unwritten_extents(). * - * Returns the size of uninitialized extent to be written on success. + * Returns the size of unwritten extent to be written on success. */ -static int ext4_split_unwritten_extents(handle_t *handle, +static int ext4_split_convert_extents(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, struct ext4_ext_path *path, @@ -3140,9 +3637,9 @@ static int ext4_split_unwritten_extents(handle_t *handle, unsigned int ee_len; int split_flag = 0, depth; - ext_debug("ext4_split_unwritten_extents: inode %lu, logical" - "block %llu, max_blocks %u\n", inode->i_ino, - (unsigned long long)map->m_lblk, map->m_len); + ext_debug("%s: inode %lu, logical block %llu, max_blocks %u\n", + __func__, inode->i_ino, + (unsigned long long)map->m_lblk, map->m_len); eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >> inode->i_sb->s_blocksize_bits; @@ -3157,28 +3654,125 @@ static int ext4_split_unwritten_extents(handle_t *handle, ee_block = le32_to_cpu(ex->ee_block); ee_len = ext4_ext_get_actual_len(ex); - split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0; - split_flag |= EXT4_EXT_MARK_UNINIT2; - + /* Convert to unwritten */ + if (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN) { + split_flag |= EXT4_EXT_DATA_VALID1; + /* Convert to initialized */ + } else if (flags & EXT4_GET_BLOCKS_CONVERT) { + split_flag |= ee_block + ee_len <= eof_block ? + EXT4_EXT_MAY_ZEROOUT : 0; + split_flag |= (EXT4_EXT_MARK_UNWRIT2 | EXT4_EXT_DATA_VALID2); + } flags |= EXT4_GET_BLOCKS_PRE_IO; return ext4_split_extent(handle, inode, path, map, split_flag, flags); } +static int ext4_convert_initialized_extents(handle_t *handle, + struct inode *inode, + struct ext4_map_blocks *map, + struct ext4_ext_path *path) +{ + struct ext4_extent *ex; + ext4_lblk_t ee_block; + unsigned int ee_len; + int depth; + int err = 0; + + depth = ext_depth(inode); + ex = path[depth].p_ext; + ee_block = le32_to_cpu(ex->ee_block); + ee_len = ext4_ext_get_actual_len(ex); + + ext_debug("%s: inode %lu, logical" + "block %llu, max_blocks %u\n", __func__, inode->i_ino, + (unsigned long long)ee_block, ee_len); + + if (ee_block != map->m_lblk || ee_len > map->m_len) { + err = ext4_split_convert_extents(handle, inode, map, path, + EXT4_GET_BLOCKS_CONVERT_UNWRITTEN); + if (err < 0) + goto out; + ext4_ext_drop_refs(path); + path = ext4_ext_find_extent(inode, map->m_lblk, path, 0); + if (IS_ERR(path)) { + err = PTR_ERR(path); + goto out; + } + depth = ext_depth(inode); + ex = path[depth].p_ext; + if (!ex) { + EXT4_ERROR_INODE(inode, "unexpected hole at %lu", + (unsigned long) map->m_lblk); + err = -EIO; + goto out; + } + } + + err = ext4_ext_get_access(handle, inode, path + depth); + if (err) + goto out; + /* first mark the extent as unwritten */ + ext4_ext_mark_unwritten(ex); + + /* note: ext4_ext_correct_indexes() isn't needed here because + * borders are not changed + */ + ext4_ext_try_to_merge(handle, inode, path, ex); + + /* Mark modified extent as dirty */ + err = ext4_ext_dirty(handle, inode, path + path->p_depth); +out: + ext4_ext_show_leaf(inode, path); + return err; +} + + static int ext4_convert_unwritten_extents_endio(handle_t *handle, - struct inode *inode, - struct ext4_ext_path *path) + struct inode *inode, + struct ext4_map_blocks *map, + struct ext4_ext_path *path) { struct ext4_extent *ex; + ext4_lblk_t ee_block; + unsigned int ee_len; int depth; int err = 0; depth = ext_depth(inode); ex = path[depth].p_ext; + ee_block = le32_to_cpu(ex->ee_block); + ee_len = ext4_ext_get_actual_len(ex); ext_debug("ext4_convert_unwritten_extents_endio: inode %lu, logical" "block %llu, max_blocks %u\n", inode->i_ino, - (unsigned long long)le32_to_cpu(ex->ee_block), - ext4_ext_get_actual_len(ex)); + (unsigned long long)ee_block, ee_len); + + /* If extent is larger than requested it is a clear sign that we still + * have some extent state machine issues left. So extent_split is still + * required. + * TODO: Once all related issues will be fixed this situation should be + * illegal. + */ + if (ee_block != map->m_lblk || ee_len > map->m_len) { +#ifdef EXT4_DEBUG + ext4_warning("Inode (%ld) finished: extent logical block %llu," + " len %u; IO logical block %llu, len %u\n", + inode->i_ino, (unsigned long long)ee_block, ee_len, + (unsigned long long)map->m_lblk, map->m_len); +#endif + err = ext4_split_convert_extents(handle, inode, map, path, + EXT4_GET_BLOCKS_CONVERT); + if (err < 0) + goto out; + ext4_ext_drop_refs(path); + path = ext4_ext_find_extent(inode, map->m_lblk, path, 0); + if (IS_ERR(path)) { + err = PTR_ERR(path); + goto out; + } + depth = ext_depth(inode); + ex = path[depth].p_ext; + } err = ext4_ext_get_access(handle, inode, path + depth); if (err) @@ -3189,10 +3783,10 @@ static int ext4_convert_unwritten_extents_endio(handle_t *handle, /* note: ext4_ext_correct_indexes() isn't needed here because * borders are not changed */ - ext4_ext_try_to_merge(inode, path, ex); + ext4_ext_try_to_merge(handle, inode, path, ex); /* Mark modified extent as dirty */ - err = ext4_ext_dirty(handle, inode, path + depth); + err = ext4_ext_dirty(handle, inode, path + path->p_depth); out: ext4_ext_show_leaf(inode, path); return err; @@ -3224,11 +3818,13 @@ static int check_eofblocks_fl(handle_t *handle, struct inode *inode, depth = ext_depth(inode); eh = path[depth].p_hdr; - if (unlikely(!eh->eh_entries)) { - EXT4_ERROR_INODE(inode, "eh->eh_entries == 0 and " - "EOFBLOCKS_FL set"); - return -EIO; - } + /* + * We're going to remove EOFBLOCKS_FL entirely in future so we + * do not care for this case anymore. Simply remove the flag + * if there are no extents. + */ + if (unlikely(!eh->eh_entries)) + goto out; last_ex = EXT_LAST_EXTENT(eh); /* * We should clear the EOFBLOCKS_FL flag if we are writing the @@ -3252,6 +3848,7 @@ static int check_eofblocks_fl(handle_t *handle, struct inode *inode, for (i = depth-1; i >= 0; i--) if (path[i].p_idx != EXT_LAST_INDEX(path[i].p_hdr)) return 0; +out: ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS); return ext4_mark_inode_dirty(handle, inode); } @@ -3259,115 +3856,34 @@ static int check_eofblocks_fl(handle_t *handle, struct inode *inode, /** * ext4_find_delalloc_range: find delayed allocated block in the given range. * - * Goes through the buffer heads in the range [lblk_start, lblk_end] and returns - * whether there are any buffers marked for delayed allocation. It returns '1' - * on the first delalloc'ed buffer head found. If no buffer head in the given - * range is marked for delalloc, it returns 0. - * lblk_start should always be <= lblk_end. - * search_hint_reverse is to indicate that searching in reverse from lblk_end to - * lblk_start might be more efficient (i.e., we will likely hit the delalloc'ed - * block sooner). This is useful when blocks are truncated sequentially from - * lblk_start towards lblk_end. + * Return 1 if there is a delalloc block in the range, otherwise 0. */ -static int ext4_find_delalloc_range(struct inode *inode, - ext4_lblk_t lblk_start, - ext4_lblk_t lblk_end, - int search_hint_reverse) +int ext4_find_delalloc_range(struct inode *inode, + ext4_lblk_t lblk_start, + ext4_lblk_t lblk_end) { - struct address_space *mapping = inode->i_mapping; - struct buffer_head *head, *bh = NULL; - struct page *page; - ext4_lblk_t i, pg_lblk; - pgoff_t index; + struct extent_status es; - if (!test_opt(inode->i_sb, DELALLOC)) - return 0; - - /* reverse search wont work if fs block size is less than page size */ - if (inode->i_blkbits < PAGE_CACHE_SHIFT) - search_hint_reverse = 0; - - if (search_hint_reverse) - i = lblk_end; + ext4_es_find_delayed_extent_range(inode, lblk_start, lblk_end, &es); + if (es.es_len == 0) + return 0; /* there is no delay extent in this tree */ + else if (es.es_lblk <= lblk_start && + lblk_start < es.es_lblk + es.es_len) + return 1; + else if (lblk_start <= es.es_lblk && es.es_lblk <= lblk_end) + return 1; else - i = lblk_start; - - index = i >> (PAGE_CACHE_SHIFT - inode->i_blkbits); - - while ((i >= lblk_start) && (i <= lblk_end)) { - page = find_get_page(mapping, index); - if (!page) - goto nextpage; - - if (!page_has_buffers(page)) - goto nextpage; - - head = page_buffers(page); - if (!head) - goto nextpage; - - bh = head; - pg_lblk = index << (PAGE_CACHE_SHIFT - - inode->i_blkbits); - do { - if (unlikely(pg_lblk < lblk_start)) { - /* - * This is possible when fs block size is less - * than page size and our cluster starts/ends in - * middle of the page. So we need to skip the - * initial few blocks till we reach the 'lblk' - */ - pg_lblk++; - continue; - } - - /* Check if the buffer is delayed allocated and that it - * is not yet mapped. (when da-buffers are mapped during - * their writeout, their da_mapped bit is set.) - */ - if (buffer_delay(bh) && !buffer_da_mapped(bh)) { - page_cache_release(page); - trace_ext4_find_delalloc_range(inode, - lblk_start, lblk_end, - search_hint_reverse, - 1, i); - return 1; - } - if (search_hint_reverse) - i--; - else - i++; - } while ((i >= lblk_start) && (i <= lblk_end) && - ((bh = bh->b_this_page) != head)); -nextpage: - if (page) - page_cache_release(page); - /* - * Move to next page. 'i' will be the first lblk in the next - * page. - */ - if (search_hint_reverse) - index--; - else - index++; - i = index << (PAGE_CACHE_SHIFT - inode->i_blkbits); - } - - trace_ext4_find_delalloc_range(inode, lblk_start, lblk_end, - search_hint_reverse, 0, 0); - return 0; + return 0; } -int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk, - int search_hint_reverse) +int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); ext4_lblk_t lblk_start, lblk_end; - lblk_start = lblk & (~(sbi->s_cluster_ratio - 1)); + lblk_start = EXT4_LBLK_CMASK(sbi, lblk); lblk_end = lblk_start + sbi->s_cluster_ratio - 1; - return ext4_find_delalloc_range(inode, lblk_start, lblk_end, - search_hint_reverse); + return ext4_find_delalloc_range(inode, lblk_start, lblk_end); } /** @@ -3423,22 +3939,22 @@ get_reserved_cluster_alloc(struct inode *inode, ext4_lblk_t lblk_start, trace_ext4_get_reserved_cluster_alloc(inode, lblk_start, num_blks); /* Check towards left side */ - c_offset = lblk_start & (sbi->s_cluster_ratio - 1); + c_offset = EXT4_LBLK_COFF(sbi, lblk_start); if (c_offset) { - lblk_from = lblk_start & (~(sbi->s_cluster_ratio - 1)); + lblk_from = EXT4_LBLK_CMASK(sbi, lblk_start); lblk_to = lblk_from + c_offset - 1; - if (ext4_find_delalloc_range(inode, lblk_from, lblk_to, 0)) + if (ext4_find_delalloc_range(inode, lblk_from, lblk_to)) allocated_clusters--; } /* Now check towards right. */ - c_offset = (lblk_start + num_blks) & (sbi->s_cluster_ratio - 1); + c_offset = EXT4_LBLK_COFF(sbi, lblk_start + num_blks); if (allocated_clusters && c_offset) { lblk_from = lblk_start + num_blks; lblk_to = lblk_from + (sbi->s_cluster_ratio - c_offset) - 1; - if (ext4_find_delalloc_range(inode, lblk_from, lblk_to, 0)) + if (ext4_find_delalloc_range(inode, lblk_from, lblk_to)) allocated_clusters--; } @@ -3446,28 +3962,68 @@ get_reserved_cluster_alloc(struct inode *inode, ext4_lblk_t lblk_start, } static int -ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, +ext4_ext_convert_initialized_extent(handle_t *handle, struct inode *inode, + struct ext4_map_blocks *map, + struct ext4_ext_path *path, int flags, + unsigned int allocated, ext4_fsblk_t newblock) +{ + int ret = 0; + int err = 0; + + /* + * Make sure that the extent is no bigger than we support with + * unwritten extent + */ + if (map->m_len > EXT_UNWRITTEN_MAX_LEN) + map->m_len = EXT_UNWRITTEN_MAX_LEN / 2; + + ret = ext4_convert_initialized_extents(handle, inode, map, + path); + if (ret >= 0) { + ext4_update_inode_fsync_trans(handle, inode, 1); + err = check_eofblocks_fl(handle, inode, map->m_lblk, + path, map->m_len); + } else + err = ret; + map->m_flags |= EXT4_MAP_UNWRITTEN; + if (allocated > map->m_len) + allocated = map->m_len; + map->m_len = allocated; + + return err ? err : allocated; +} + +static int +ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, struct ext4_ext_path *path, int flags, unsigned int allocated, ext4_fsblk_t newblock) { int ret = 0; int err = 0; - ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio; + ext4_io_end_t *io = ext4_inode_aio(inode); - ext_debug("ext4_ext_handle_uninitialized_extents: inode %lu, logical " + ext_debug("ext4_ext_handle_unwritten_extents: inode %lu, logical " "block %llu, max_blocks %u, flags %x, allocated %u\n", inode->i_ino, (unsigned long long)map->m_lblk, map->m_len, flags, allocated); ext4_ext_show_leaf(inode, path); - trace_ext4_ext_handle_uninitialized_extents(inode, map, allocated, - newblock); + /* + * When writing into unwritten space, we should not fail to + * allocate metadata blocks for the new extent block if needed. + */ + flags |= EXT4_GET_BLOCKS_METADATA_NOFAIL; + + trace_ext4_ext_handle_unwritten_extents(inode, map, flags, + allocated, newblock); /* get_block() before submit the IO, split the extent */ - if ((flags & EXT4_GET_BLOCKS_PRE_IO)) { - ret = ext4_split_unwritten_extents(handle, inode, map, - path, flags); + if (flags & EXT4_GET_BLOCKS_PRE_IO) { + ret = ext4_split_convert_extents(handle, inode, map, + path, flags | EXT4_GET_BLOCKS_CONVERT); + if (ret <= 0) + goto out; /* * Flag the inode(non aio case) or end_io struct (aio case) * that this IO needs to conversion to written when IO is @@ -3477,13 +4033,12 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, ext4_set_io_unwritten_flag(inode, io); else ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN); - if (ext4_should_dioread_nolock(inode)) - map->m_flags |= EXT4_MAP_UNINIT; + map->m_flags |= EXT4_MAP_UNWRITTEN; goto out; } /* IO end_io complete, convert the filled extent to written */ - if ((flags & EXT4_GET_BLOCKS_CONVERT)) { - ret = ext4_convert_unwritten_extents_endio(handle, inode, + if (flags & EXT4_GET_BLOCKS_CONVERT) { + ret = ext4_convert_unwritten_extents_endio(handle, inode, map, path); if (ret >= 0) { ext4_update_inode_fsync_trans(handle, inode, 1); @@ -3491,6 +4046,11 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, path, map->m_len); } else err = ret; + map->m_flags |= EXT4_MAP_MAPPED; + map->m_pblk = newblock; + if (allocated > map->m_len) + allocated = map->m_len; + map->m_len = allocated; goto out2; } /* buffered IO case */ @@ -3498,8 +4058,10 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, * repeat fallocate creation request * we already have an unwritten extent */ - if (flags & EXT4_GET_BLOCKS_UNINIT_EXT) + if (flags & EXT4_GET_BLOCKS_UNWRIT_EXT) { + map->m_flags |= EXT4_MAP_UNWRITTEN; goto map_out; + } /* buffered READ or buffered write_begin() lookup */ if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { @@ -3515,7 +4077,7 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, } /* buffered write, writepage time, convert*/ - ret = ext4_ext_convert_to_initialized(handle, inode, map, path); + ret = ext4_ext_convert_to_initialized(handle, inode, map, path, flags); if (ret >= 0) ext4_update_inode_fsync_trans(handle, inode, 1); out: @@ -3538,6 +4100,7 @@ out: allocated - map->m_len); allocated = map->m_len; } + map->m_len = allocated; /* * If we have done fallocate with the offset that is already @@ -3571,10 +4134,6 @@ out1: map->m_pblk = newblock; map->m_len = allocated; out2: - if (path) { - ext4_ext_drop_refs(path); - kfree(path); - } return err ? err : allocated; } @@ -3625,7 +4184,7 @@ static int get_implied_cluster_alloc(struct super_block *sb, struct ext4_ext_path *path) { struct ext4_sb_info *sbi = EXT4_SB(sb); - ext4_lblk_t c_offset = map->m_lblk & (sbi->s_cluster_ratio-1); + ext4_lblk_t c_offset = EXT4_LBLK_COFF(sbi, map->m_lblk); ext4_lblk_t ex_cluster_start, ex_cluster_end; ext4_lblk_t rr_cluster_start; ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block); @@ -3643,8 +4202,7 @@ static int get_implied_cluster_alloc(struct super_block *sb, (rr_cluster_start == ex_cluster_start)) { if (rr_cluster_start == ex_cluster_end) ee_start += ee_len - 1; - map->m_pblk = (ee_start & ~(sbi->s_cluster_ratio - 1)) + - c_offset; + map->m_pblk = EXT4_PBLK_CMASK(sbi, ee_start) + c_offset; map->m_len = min(map->m_len, (unsigned) sbi->s_cluster_ratio - c_offset); /* @@ -3710,48 +4268,17 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, int free_on_err = 0, err = 0, depth, ret; unsigned int allocated = 0, offset = 0; unsigned int allocated_clusters = 0; - unsigned int punched_out = 0; - unsigned int result = 0; struct ext4_allocation_request ar; - ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio; + ext4_io_end_t *io = ext4_inode_aio(inode); ext4_lblk_t cluster_offset; + int set_unwritten = 0; ext_debug("blocks %u/%u requested for inode %lu\n", map->m_lblk, map->m_len, inode->i_ino); trace_ext4_ext_map_blocks_enter(inode, map->m_lblk, map->m_len, flags); - /* check in cache */ - if (!(flags & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) && - ext4_ext_in_cache(inode, map->m_lblk, &newex)) { - if (!newex.ee_start_lo && !newex.ee_start_hi) { - if ((sbi->s_cluster_ratio > 1) && - ext4_find_delalloc_cluster(inode, map->m_lblk, 0)) - map->m_flags |= EXT4_MAP_FROM_CLUSTER; - - if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { - /* - * block isn't allocated yet and - * user doesn't want to allocate it - */ - goto out2; - } - /* we should allocate requested block */ - } else { - /* block is already allocated */ - if (sbi->s_cluster_ratio > 1) - map->m_flags |= EXT4_MAP_FROM_CLUSTER; - newblock = map->m_lblk - - le32_to_cpu(newex.ee_block) - + ext4_ext_pblock(&newex); - /* number of remaining blocks in the extent */ - allocated = ext4_ext_get_actual_len(&newex) - - (map->m_lblk - le32_to_cpu(newex.ee_block)); - goto out; - } - } - /* find extent for this block */ - path = ext4_ext_find_extent(inode, map->m_lblk, NULL); + path = ext4_ext_find_extent(inode, map->m_lblk, NULL, 0); if (IS_ERR(path)) { err = PTR_ERR(path); path = NULL; @@ -3780,8 +4307,9 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, ext4_fsblk_t ee_start = ext4_ext_pblock(ex); unsigned short ee_len; + /* - * Uninitialized extents are treated as holes, except that + * unwritten extents are treated as holes, except that * we split out initialized portions during a write. */ ee_len = ext4_ext_get_actual_len(ex); @@ -3790,118 +4318,38 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, /* if found extent covers block, simply return it */ if (in_range(map->m_lblk, ee_block, ee_len)) { - struct ext4_map_blocks punch_map; - ext4_fsblk_t partial_cluster = 0; - newblock = map->m_lblk - ee_block + ee_start; /* number of remaining blocks in the extent */ allocated = ee_len - (map->m_lblk - ee_block); ext_debug("%u fit into %u:%d -> %llu\n", map->m_lblk, ee_block, ee_len, newblock); - if ((flags & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) == 0) { - /* - * Do not put uninitialized extent - * in the cache - */ - if (!ext4_ext_is_uninitialized(ex)) { - ext4_ext_put_in_cache(inode, ee_block, - ee_len, ee_start); - goto out; - } - ret = ext4_ext_handle_uninitialized_extents( - handle, inode, map, path, flags, - allocated, newblock); - return ret; - } - - /* - * Punch out the map length, but only to the - * end of the extent - */ - punched_out = allocated < map->m_len ? - allocated : map->m_len; - /* - * Sense extents need to be converted to - * uninitialized, they must fit in an - * uninitialized extent + * If the extent is initialized check whether the + * caller wants to convert it to unwritten. */ - if (punched_out > EXT_UNINIT_MAX_LEN) - punched_out = EXT_UNINIT_MAX_LEN; - - punch_map.m_lblk = map->m_lblk; - punch_map.m_pblk = newblock; - punch_map.m_len = punched_out; - punch_map.m_flags = 0; - - /* Check to see if the extent needs to be split */ - if (punch_map.m_len != ee_len || - punch_map.m_lblk != ee_block) { - - ret = ext4_split_extent(handle, inode, - path, &punch_map, 0, - EXT4_GET_BLOCKS_PUNCH_OUT_EXT | - EXT4_GET_BLOCKS_PRE_IO); - - if (ret < 0) { - err = ret; - goto out2; - } - /* - * find extent for the block at - * the start of the hole - */ - ext4_ext_drop_refs(path); - kfree(path); - - path = ext4_ext_find_extent(inode, - map->m_lblk, NULL); - if (IS_ERR(path)) { - err = PTR_ERR(path); - path = NULL; - goto out2; - } - - depth = ext_depth(inode); - ex = path[depth].p_ext; - ee_len = ext4_ext_get_actual_len(ex); - ee_block = le32_to_cpu(ex->ee_block); - ee_start = ext4_ext_pblock(ex); - - } - - ext4_ext_mark_uninitialized(ex); - - ext4_ext_invalidate_cache(inode); - - err = ext4_ext_rm_leaf(handle, inode, path, - &partial_cluster, map->m_lblk, - map->m_lblk + punched_out); - - if (!err && path->p_hdr->eh_entries == 0) { - /* - * Punch hole freed all of this sub tree, - * so we need to correct eh_depth - */ - err = ext4_ext_get_access(handle, inode, path); - if (err == 0) { - ext_inode_hdr(inode)->eh_depth = 0; - ext_inode_hdr(inode)->eh_max = - cpu_to_le16(ext4_ext_space_root( - inode, 0)); - - err = ext4_ext_dirty( - handle, inode, path); - } - } + if ((!ext4_ext_is_unwritten(ex)) && + (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN)) { + allocated = ext4_ext_convert_initialized_extent( + handle, inode, map, path, flags, + allocated, newblock); + goto out2; + } else if (!ext4_ext_is_unwritten(ex)) + goto out; + ret = ext4_ext_handle_unwritten_extents( + handle, inode, map, path, flags, + allocated, newblock); + if (ret < 0) + err = ret; + else + allocated = ret; goto out2; } } if ((sbi->s_cluster_ratio > 1) && - ext4_find_delalloc_cluster(inode, map->m_lblk, 0)) + ext4_find_delalloc_cluster(inode, map->m_lblk)) map->m_flags |= EXT4_MAP_FROM_CLUSTER; /* @@ -3913,7 +4361,8 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, * put just found gap into cache to speed up * subsequent requests */ - ext4_ext_put_gap_in_cache(inode, path, map->m_lblk); + if ((flags & EXT4_GET_BLOCKS_NO_PUT_HOLE) == 0) + ext4_ext_put_gap_in_cache(inode, path, map->m_lblk); goto out2; } @@ -3922,7 +4371,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, */ map->m_flags &= ~EXT4_MAP_FROM_CLUSTER; newex.ee_block = cpu_to_le32(map->m_lblk); - cluster_offset = map->m_lblk & (sbi->s_cluster_ratio-1); + cluster_offset = EXT4_LBLK_COFF(sbi, map->m_lblk); /* * If we are doing bigalloc, check to see if the extent returned @@ -3960,15 +4409,15 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, /* * See if request is beyond maximum number of blocks we can have in * a single extent. For an initialized extent this limit is - * EXT_INIT_MAX_LEN and for an uninitialized extent this limit is - * EXT_UNINIT_MAX_LEN. + * EXT_INIT_MAX_LEN and for an unwritten extent this limit is + * EXT_UNWRITTEN_MAX_LEN. */ if (map->m_len > EXT_INIT_MAX_LEN && - !(flags & EXT4_GET_BLOCKS_UNINIT_EXT)) + !(flags & EXT4_GET_BLOCKS_UNWRIT_EXT)) map->m_len = EXT_INIT_MAX_LEN; - else if (map->m_len > EXT_UNINIT_MAX_LEN && - (flags & EXT4_GET_BLOCKS_UNINIT_EXT)) - map->m_len = EXT_UNINIT_MAX_LEN; + else if (map->m_len > EXT_UNWRITTEN_MAX_LEN && + (flags & EXT4_GET_BLOCKS_UNWRIT_EXT)) + map->m_len = EXT_UNWRITTEN_MAX_LEN; /* Check if we can really insert (m_lblk)::(m_lblk + m_len) extent */ newex.ee_len = cpu_to_le16(map->m_len); @@ -3990,7 +4439,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, * needed so that future calls to get_implied_cluster_alloc() * work correctly. */ - offset = map->m_lblk & (sbi->s_cluster_ratio - 1); + offset = EXT4_LBLK_COFF(sbi, map->m_lblk); ar.len = EXT4_NUM_B2C(sbi, offset+allocated); ar.goal -= offset; ar.logical -= offset; @@ -4016,25 +4465,19 @@ got_allocated_blocks: /* try to insert new extent into found leaf and return */ ext4_ext_store_pblock(&newex, newblock + offset); newex.ee_len = cpu_to_le16(ar.len); - /* Mark uninitialized */ - if (flags & EXT4_GET_BLOCKS_UNINIT_EXT){ - ext4_ext_mark_uninitialized(&newex); + /* Mark unwritten */ + if (flags & EXT4_GET_BLOCKS_UNWRIT_EXT){ + ext4_ext_mark_unwritten(&newex); + map->m_flags |= EXT4_MAP_UNWRITTEN; /* * io_end structure was created for every IO write to an - * uninitialized extent. To avoid unnecessary conversion, + * unwritten extent. To avoid unnecessary conversion, * here we flag the IO that really needs the conversion. * For non asycn direct IO case, flag the inode state * that we need to perform conversion when IO is done. */ - if ((flags & EXT4_GET_BLOCKS_PRE_IO)) { - if (io) - ext4_set_io_unwritten_flag(inode, io); - else - ext4_set_inode_state(inode, - EXT4_STATE_DIO_UNWRITTEN); - } - if (ext4_should_dioread_nolock(inode)) - map->m_flags |= EXT4_MAP_UNINIT; + if (flags & EXT4_GET_BLOCKS_PRE_IO) + set_unwritten = 1; } err = 0; @@ -4044,6 +4487,15 @@ got_allocated_blocks: if (!err) err = ext4_ext_insert_extent(handle, inode, path, &newex, flags); + + if (!err && set_unwritten) { + if (io) + ext4_set_io_unwritten_flag(inode, io); + else + ext4_set_inode_state(inode, + EXT4_STATE_DIO_UNWRITTEN); + } + if (err && free_on_err) { int fb_flags = flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE ? EXT4_FREE_BLOCKS_NO_QUOT_UPDATE : 0; @@ -4051,8 +4503,8 @@ got_allocated_blocks: /* not a good idea to call discard here directly, * but otherwise we'd need to call it every free() */ ext4_discard_preallocations(inode); - ext4_free_blocks(handle, inode, NULL, ext4_ext_pblock(&newex), - ext4_ext_get_actual_len(&newex), fb_flags); + ext4_free_blocks(handle, inode, NULL, newblock, + EXT4_C2B(sbi, allocated_clusters), fb_flags); goto out2; } @@ -4088,9 +4540,6 @@ got_allocated_blocks: } } else { BUG_ON(allocated_clusters < reserved_clusters); - /* We will claim quota for all newly allocated blocks.*/ - ext4_da_update_reserve_space(inode, allocated_clusters, - 1); if (reserved_clusters < allocated_clusters) { struct ext4_inode_info *ei = EXT4_I(inode); int reservation = allocated_clusters - @@ -4141,17 +4590,25 @@ got_allocated_blocks: ei->i_reserved_data_blocks += reservation; spin_unlock(&ei->i_block_reservation_lock); } + /* + * We will claim quota for all newly allocated blocks. + * We're updating the reserved space *after* the + * correction above so we do not accidentally free + * all the metadata reservation because we might + * actually need it later on. + */ + ext4_da_update_reserve_space(inode, allocated_clusters, + 1); } } /* * Cache the extent and update transaction to commit on fdatasync only - * when it is _not_ an uninitialized extent. + * when it is _not_ an unwritten extent. */ - if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0) { - ext4_ext_put_in_cache(inode, map->m_lblk, allocated, newblock); + if ((flags & EXT4_GET_BLOCKS_UNWRIT_EXT) == 0) ext4_update_inode_fsync_trans(handle, inode, 1); - } else + else ext4_update_inode_fsync_trans(handle, inode, 0); out: if (allocated > map->m_len) @@ -4165,58 +4622,20 @@ out2: ext4_ext_drop_refs(path); kfree(path); } - result = (flags & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) ? - punched_out : allocated; - - trace_ext4_ext_map_blocks_exit(inode, map->m_lblk, - newblock, map->m_len, err ? err : result); - return err ? err : result; + trace_ext4_ext_map_blocks_exit(inode, flags, map, + err ? err : allocated); + ext4_es_lru_add(inode); + return err ? err : allocated; } -void ext4_ext_truncate(struct inode *inode) +void ext4_ext_truncate(handle_t *handle, struct inode *inode) { - struct address_space *mapping = inode->i_mapping; struct super_block *sb = inode->i_sb; ext4_lblk_t last_block; - handle_t *handle; - loff_t page_len; int err = 0; /* - * finish any pending end_io work so we won't run the risk of - * converting any truncated blocks to initialized later - */ - ext4_flush_completed_IO(inode); - - /* - * probably first extent we're gonna free will be last in block - */ - err = ext4_writepage_trans_blocks(inode); - handle = ext4_journal_start(inode, err); - if (IS_ERR(handle)) - return; - - if (inode->i_size % PAGE_CACHE_SIZE != 0) { - page_len = PAGE_CACHE_SIZE - - (inode->i_size & (PAGE_CACHE_SIZE - 1)); - - err = ext4_discard_partial_page_buffers(handle, - mapping, inode->i_size, page_len, 0); - - if (err) - goto out_stop; - } - - if (ext4_orphan_add(handle, inode)) - goto out_stop; - - down_write(&EXT4_I(inode)->i_data_sem); - ext4_ext_invalidate_cache(inode); - - ext4_discard_preallocations(inode); - - /* * TODO: optimization is possible here. * Probably we need not scan at all, * because page truncation is enough. @@ -4228,60 +4647,226 @@ void ext4_ext_truncate(struct inode *inode) last_block = (inode->i_size + sb->s_blocksize - 1) >> EXT4_BLOCK_SIZE_BITS(sb); - err = ext4_ext_remove_space(inode, last_block); +retry: + err = ext4_es_remove_extent(inode, last_block, + EXT_MAX_BLOCKS - last_block); + if (err == -ENOMEM) { + cond_resched(); + congestion_wait(BLK_RW_ASYNC, HZ/50); + goto retry; + } + if (err) { + ext4_std_error(inode->i_sb, err); + return; + } + err = ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1); + ext4_std_error(inode->i_sb, err); +} - /* In a multi-transaction truncate, we only make the final - * transaction synchronous. - */ - if (IS_SYNC(inode)) - ext4_handle_sync(handle); +static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset, + ext4_lblk_t len, int flags, int mode) +{ + struct inode *inode = file_inode(file); + handle_t *handle; + int ret = 0; + int ret2 = 0; + int retries = 0; + struct ext4_map_blocks map; + unsigned int credits; - up_write(&EXT4_I(inode)->i_data_sem); + map.m_lblk = offset; + /* + * Don't normalize the request if it can fit in one extent so + * that it doesn't get unnecessarily split into multiple + * extents. + */ + if (len <= EXT_UNWRITTEN_MAX_LEN) + flags |= EXT4_GET_BLOCKS_NO_NORMALIZE; -out_stop: /* - * If this was a simple ftruncate() and the file will remain alive, - * then we need to clear up the orphan record which we created above. - * However, if this was a real unlink then we were called by - * ext4_delete_inode(), and we allow that function to clean up the - * orphan info for us. + * credits to insert 1 extent into extent tree */ - if (inode->i_nlink) - ext4_orphan_del(handle, inode); + credits = ext4_chunk_trans_blocks(inode, len); - inode->i_mtime = inode->i_ctime = ext4_current_time(inode); - ext4_mark_inode_dirty(handle, inode); - ext4_journal_stop(handle); +retry: + while (ret >= 0 && ret < len) { + map.m_lblk = map.m_lblk + ret; + map.m_len = len = len - ret; + handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, + credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + break; + } + ret = ext4_map_blocks(handle, inode, &map, flags); + if (ret <= 0) { + ext4_debug("inode #%lu: block %u: len %u: " + "ext4_ext_map_blocks returned %d", + inode->i_ino, map.m_lblk, + map.m_len, ret); + ext4_mark_inode_dirty(handle, inode); + ret2 = ext4_journal_stop(handle); + break; + } + ret2 = ext4_journal_stop(handle); + if (ret2) + break; + } + if (ret == -ENOSPC && + ext4_should_retry_alloc(inode->i_sb, &retries)) { + ret = 0; + goto retry; + } + + return ret > 0 ? ret2 : ret; } -static void ext4_falloc_update_inode(struct inode *inode, - int mode, loff_t new_size, int update_ctime) +static long ext4_zero_range(struct file *file, loff_t offset, + loff_t len, int mode) { - struct timespec now; + struct inode *inode = file_inode(file); + handle_t *handle = NULL; + unsigned int max_blocks; + loff_t new_size = 0; + int ret = 0; + int flags; + int partial; + loff_t start, end; + ext4_lblk_t lblk; + struct address_space *mapping = inode->i_mapping; + unsigned int blkbits = inode->i_blkbits; - if (update_ctime) { - now = current_fs_time(inode->i_sb); - if (!timespec_equal(&inode->i_ctime, &now)) - inode->i_ctime = now; + trace_ext4_zero_range(inode, offset, len, mode); + + if (!S_ISREG(inode->i_mode)) + return -EINVAL; + + /* Call ext4_force_commit to flush all data in case of data=journal. */ + if (ext4_should_journal_data(inode)) { + ret = ext4_force_commit(inode->i_sb); + if (ret) + return ret; } + + /* + * Write out all dirty pages to avoid race conditions + * Then release them. + */ + if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { + ret = filemap_write_and_wait_range(mapping, offset, + offset + len - 1); + if (ret) + return ret; + } + /* - * Update only when preallocation was requested beyond - * the file size. + * Round up offset. This is not fallocate, we neet to zero out + * blocks, so convert interior block aligned part of the range to + * unwritten and possibly manually zero out unaligned parts of the + * range. */ - if (!(mode & FALLOC_FL_KEEP_SIZE)) { + start = round_up(offset, 1 << blkbits); + end = round_down((offset + len), 1 << blkbits); + + if (start < offset || end > offset + len) + return -EINVAL; + partial = (offset + len) & ((1 << blkbits) - 1); + + lblk = start >> blkbits; + max_blocks = (end >> blkbits); + if (max_blocks < lblk) + max_blocks = 0; + else + max_blocks -= lblk; + + flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT | + EXT4_GET_BLOCKS_CONVERT_UNWRITTEN; + if (mode & FALLOC_FL_KEEP_SIZE) + flags |= EXT4_GET_BLOCKS_KEEP_SIZE; + + mutex_lock(&inode->i_mutex); + + /* + * Indirect files do not support unwritten extnets + */ + if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { + ret = -EOPNOTSUPP; + goto out_mutex; + } + + if (!(mode & FALLOC_FL_KEEP_SIZE) && + offset + len > i_size_read(inode)) { + new_size = offset + len; + ret = inode_newsize_ok(inode, new_size); + if (ret) + goto out_mutex; + /* + * If we have a partial block after EOF we have to allocate + * the entire block. + */ + if (partial) + max_blocks += 1; + } + + if (max_blocks > 0) { + + /* Now release the pages and zero block aligned part of pages*/ + truncate_pagecache_range(inode, start, end - 1); + + /* Wait all existing dio workers, newcomers will block on i_mutex */ + ext4_inode_block_unlocked_dio(inode); + inode_dio_wait(inode); + + /* + * Remove entire range from the extent status tree. + */ + ret = ext4_es_remove_extent(inode, lblk, max_blocks); + if (ret) + goto out_dio; + + ret = ext4_alloc_file_blocks(file, lblk, max_blocks, flags, + mode); + if (ret) + goto out_dio; + } + + handle = ext4_journal_start(inode, EXT4_HT_MISC, 4); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + ext4_std_error(inode->i_sb, ret); + goto out_dio; + } + + inode->i_mtime = inode->i_ctime = ext4_current_time(inode); + + if (new_size) { if (new_size > i_size_read(inode)) i_size_write(inode, new_size); if (new_size > EXT4_I(inode)->i_disksize) ext4_update_i_disksize(inode, new_size); } else { /* - * Mark that we allocate beyond EOF so the subsequent truncate - * can proceed even if the new size is the same as i_size. - */ - if (new_size > i_size_read(inode)) + * Mark that we allocate beyond EOF so the subsequent truncate + * can proceed even if the new size is the same as i_size. + */ + if ((offset + len) > i_size_read(inode)) ext4_set_inode_flag(inode, EXT4_INODE_EOFBLOCKS); } + ext4_mark_inode_dirty(handle, inode); + + /* Zero out partial block at the edges of the range */ + ret = ext4_zero_partial_blocks(handle, inode, offset, len); + + if (file->f_flags & O_SYNC) + ext4_handle_sync(handle); + + ext4_journal_stop(handle); +out_dio: + ext4_inode_resume_unlocked_dio(inode); +out_mutex: + mutex_unlock(&inode->i_mutex); + return ret; } /* @@ -4293,16 +4878,27 @@ static void ext4_falloc_update_inode(struct inode *inode, */ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) { - struct inode *inode = file->f_path.dentry->d_inode; + struct inode *inode = file_inode(file); handle_t *handle; - loff_t new_size; + loff_t new_size = 0; unsigned int max_blocks; int ret = 0; - int ret2 = 0; - int retries = 0; int flags; - struct ext4_map_blocks map; - unsigned int credits, blkbits = inode->i_blkbits; + ext4_lblk_t lblk; + struct timespec tv; + unsigned int blkbits = inode->i_blkbits; + + /* Return error if mode is not supported */ + if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | + FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE)) + return -EOPNOTSUPP; + + if (mode & FALLOC_FL_PUNCH_HOLE) + return ext4_punch_hole(inode, offset, len); + + ret = ext4_convert_inline_data(inode); + if (ret) + return ret; /* * currently supporting (pre)allocate mode for extent-based @@ -4311,86 +4907,69 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) return -EOPNOTSUPP; - /* Return error if mode is not supported */ - if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) - return -EOPNOTSUPP; + if (mode & FALLOC_FL_COLLAPSE_RANGE) + return ext4_collapse_range(inode, offset, len); - if (mode & FALLOC_FL_PUNCH_HOLE) - return ext4_punch_hole(file, offset, len); + if (mode & FALLOC_FL_ZERO_RANGE) + return ext4_zero_range(file, offset, len, mode); trace_ext4_fallocate_enter(inode, offset, len, mode); - map.m_lblk = offset >> blkbits; + lblk = offset >> blkbits; /* * We can't just convert len to max_blocks because * If blocksize = 4096 offset = 3072 and len = 2048 */ max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits) - - map.m_lblk; - /* - * credits to insert 1 extent into extent tree - */ - credits = ext4_chunk_trans_blocks(inode, max_blocks); - mutex_lock(&inode->i_mutex); - ret = inode_newsize_ok(inode, (len + offset)); - if (ret) { - mutex_unlock(&inode->i_mutex); - trace_ext4_fallocate_exit(inode, offset, max_blocks, ret); - return ret; - } - flags = EXT4_GET_BLOCKS_CREATE_UNINIT_EXT; + - lblk; + + flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT; if (mode & FALLOC_FL_KEEP_SIZE) flags |= EXT4_GET_BLOCKS_KEEP_SIZE; - /* - * Don't normalize the request if it can fit in one extent so - * that it doesn't get unnecessarily split into multiple - * extents. - */ - if (len <= EXT_UNINIT_MAX_LEN << blkbits) - flags |= EXT4_GET_BLOCKS_NO_NORMALIZE; -retry: - while (ret >= 0 && ret < max_blocks) { - map.m_lblk = map.m_lblk + ret; - map.m_len = max_blocks = max_blocks - ret; - handle = ext4_journal_start(inode, credits); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - break; - } - ret = ext4_map_blocks(handle, inode, &map, flags); - if (ret <= 0) { -#ifdef EXT4FS_DEBUG - WARN_ON(ret <= 0); - printk(KERN_ERR "%s: ext4_ext_map_blocks " - "returned error inode#%lu, block=%u, " - "max_blocks=%u", __func__, - inode->i_ino, map.m_lblk, max_blocks); -#endif - ext4_mark_inode_dirty(handle, inode); - ret2 = ext4_journal_stop(handle); - break; - } - if ((map.m_lblk + ret) >= (EXT4_BLOCK_ALIGN(offset + len, - blkbits) >> blkbits)) - new_size = offset + len; - else - new_size = ((loff_t) map.m_lblk + ret) << blkbits; - ext4_falloc_update_inode(inode, mode, new_size, - (map.m_flags & EXT4_MAP_NEW)); - ext4_mark_inode_dirty(handle, inode); - ret2 = ext4_journal_stop(handle); - if (ret2) - break; + mutex_lock(&inode->i_mutex); + + if (!(mode & FALLOC_FL_KEEP_SIZE) && + offset + len > i_size_read(inode)) { + new_size = offset + len; + ret = inode_newsize_ok(inode, new_size); + if (ret) + goto out; } - if (ret == -ENOSPC && - ext4_should_retry_alloc(inode->i_sb, &retries)) { - ret = 0; - goto retry; + + ret = ext4_alloc_file_blocks(file, lblk, max_blocks, flags, mode); + if (ret) + goto out; + + handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); + if (IS_ERR(handle)) + goto out; + + tv = inode->i_ctime = ext4_current_time(inode); + + if (new_size) { + if (new_size > i_size_read(inode)) { + i_size_write(inode, new_size); + inode->i_mtime = tv; + } + if (new_size > EXT4_I(inode)->i_disksize) + ext4_update_i_disksize(inode, new_size); + } else { + /* + * Mark that we allocate beyond EOF so the subsequent truncate + * can proceed even if the new size is the same as i_size. + */ + if ((offset + len) > i_size_read(inode)) + ext4_set_inode_flag(inode, EXT4_INODE_EOFBLOCKS); } + ext4_mark_inode_dirty(handle, inode); + if (file->f_flags & O_SYNC) + ext4_handle_sync(handle); + + ext4_journal_stop(handle); +out: mutex_unlock(&inode->i_mutex); - trace_ext4_fallocate_exit(inode, offset, max_blocks, - ret > 0 ? ret2 : ret); - return ret > 0 ? ret2 : ret; + trace_ext4_fallocate_exit(inode, offset, max_blocks, ret); + return ret; } /* @@ -4403,10 +4982,9 @@ retry: * function, to convert the fallocated extents after IO is completed. * Returns 0 on success. */ -int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset, - ssize_t len) +int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode, + loff_t offset, ssize_t len) { - handle_t *handle; unsigned int max_blocks; int ret = 0; int ret2 = 0; @@ -4421,235 +4999,97 @@ int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset, max_blocks = ((EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits) - map.m_lblk); /* - * credits to insert 1 extent into extent tree + * This is somewhat ugly but the idea is clear: When transaction is + * reserved, everything goes into it. Otherwise we rather start several + * smaller transactions for conversion of each extent separately. */ - credits = ext4_chunk_trans_blocks(inode, max_blocks); + if (handle) { + handle = ext4_journal_start_reserved(handle, + EXT4_HT_EXT_CONVERT); + if (IS_ERR(handle)) + return PTR_ERR(handle); + credits = 0; + } else { + /* + * credits to insert 1 extent into extent tree + */ + credits = ext4_chunk_trans_blocks(inode, max_blocks); + } while (ret >= 0 && ret < max_blocks) { map.m_lblk += ret; map.m_len = (max_blocks -= ret); - handle = ext4_journal_start(inode, credits); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - break; + if (credits) { + handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, + credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + break; + } } ret = ext4_map_blocks(handle, inode, &map, EXT4_GET_BLOCKS_IO_CONVERT_EXT); - if (ret <= 0) { - WARN_ON(ret <= 0); - printk(KERN_ERR "%s: ext4_ext_map_blocks " - "returned error inode#%lu, block=%u, " - "max_blocks=%u", __func__, - inode->i_ino, map.m_lblk, map.m_len); - } + if (ret <= 0) + ext4_warning(inode->i_sb, + "inode #%lu: block %u: len %u: " + "ext4_ext_map_blocks returned %d", + inode->i_ino, map.m_lblk, + map.m_len, ret); ext4_mark_inode_dirty(handle, inode); - ret2 = ext4_journal_stop(handle); - if (ret <= 0 || ret2 ) + if (credits) + ret2 = ext4_journal_stop(handle); + if (ret <= 0 || ret2) break; } + if (!credits) + ret2 = ext4_journal_stop(handle); return ret > 0 ? ret2 : ret; } /* - * Callback function called for each extent to gather FIEMAP information. + * If newes is not existing extent (newes->ec_pblk equals zero) find + * delayed extent at start of newes and update newes accordingly and + * return start of the next delayed extent. + * + * If newes is existing extent (newes->ec_pblk is not equal zero) + * return start of next delayed extent or EXT_MAX_BLOCKS if no delayed + * extent found. Leave newes unmodified. */ -static int ext4_ext_fiemap_cb(struct inode *inode, ext4_lblk_t next, - struct ext4_ext_cache *newex, struct ext4_extent *ex, - void *data) +static int ext4_find_delayed_extent(struct inode *inode, + struct extent_status *newes) { - __u64 logical; - __u64 physical; - __u64 length; - __u32 flags = 0; - int ret = 0; - struct fiemap_extent_info *fieinfo = data; - unsigned char blksize_bits; - - blksize_bits = inode->i_sb->s_blocksize_bits; - logical = (__u64)newex->ec_block << blksize_bits; - - if (newex->ec_start == 0) { + struct extent_status es; + ext4_lblk_t block, next_del; + + if (newes->es_pblk == 0) { + ext4_es_find_delayed_extent_range(inode, newes->es_lblk, + newes->es_lblk + newes->es_len - 1, &es); + /* - * No extent in extent-tree contains block @newex->ec_start, + * No extent in extent-tree contains block @newes->es_pblk, * then the block may stay in 1)a hole or 2)delayed-extent. - * - * Holes or delayed-extents are processed as follows. - * 1. lookup dirty pages with specified range in pagecache. - * If no page is got, then there is no delayed-extent and - * return with EXT_CONTINUE. - * 2. find the 1st mapped buffer, - * 3. check if the mapped buffer is both in the request range - * and a delayed buffer. If not, there is no delayed-extent, - * then return. - * 4. a delayed-extent is found, the extent will be collected. */ - ext4_lblk_t end = 0; - pgoff_t last_offset; - pgoff_t offset; - pgoff_t index; - pgoff_t start_index = 0; - struct page **pages = NULL; - struct buffer_head *bh = NULL; - struct buffer_head *head = NULL; - unsigned int nr_pages = PAGE_SIZE / sizeof(struct page *); - - pages = kmalloc(PAGE_SIZE, GFP_KERNEL); - if (pages == NULL) - return -ENOMEM; - - offset = logical >> PAGE_SHIFT; -repeat: - last_offset = offset; - head = NULL; - ret = find_get_pages_tag(inode->i_mapping, &offset, - PAGECACHE_TAG_DIRTY, nr_pages, pages); - - if (!(flags & FIEMAP_EXTENT_DELALLOC)) { - /* First time, try to find a mapped buffer. */ - if (ret == 0) { -out: - for (index = 0; index < ret; index++) - page_cache_release(pages[index]); - /* just a hole. */ - kfree(pages); - return EXT_CONTINUE; - } - index = 0; - -next_page: - /* Try to find the 1st mapped buffer. */ - end = ((__u64)pages[index]->index << PAGE_SHIFT) >> - blksize_bits; - if (!page_has_buffers(pages[index])) - goto out; - head = page_buffers(pages[index]); - if (!head) - goto out; - - index++; - bh = head; - do { - if (end >= newex->ec_block + - newex->ec_len) - /* The buffer is out of - * the request range. - */ - goto out; - - if (buffer_mapped(bh) && - end >= newex->ec_block) { - start_index = index - 1; - /* get the 1st mapped buffer. */ - goto found_mapped_buffer; - } - - bh = bh->b_this_page; - end++; - } while (bh != head); - - /* No mapped buffer in the range found in this page, - * We need to look up next page. - */ - if (index >= ret) { - /* There is no page left, but we need to limit - * newex->ec_len. - */ - newex->ec_len = end - newex->ec_block; - goto out; - } - goto next_page; - } else { - /*Find contiguous delayed buffers. */ - if (ret > 0 && pages[0]->index == last_offset) - head = page_buffers(pages[0]); - bh = head; - index = 1; - start_index = 0; - } - -found_mapped_buffer: - if (bh != NULL && buffer_delay(bh)) { - /* 1st or contiguous delayed buffer found. */ - if (!(flags & FIEMAP_EXTENT_DELALLOC)) { - /* - * 1st delayed buffer found, record - * the start of extent. - */ - flags |= FIEMAP_EXTENT_DELALLOC; - newex->ec_block = end; - logical = (__u64)end << blksize_bits; - } - /* Find contiguous delayed buffers. */ - do { - if (!buffer_delay(bh)) - goto found_delayed_extent; - bh = bh->b_this_page; - end++; - } while (bh != head); - - for (; index < ret; index++) { - if (!page_has_buffers(pages[index])) { - bh = NULL; - break; - } - head = page_buffers(pages[index]); - if (!head) { - bh = NULL; - break; - } - - if (pages[index]->index != - pages[start_index]->index + index - - start_index) { - /* Blocks are not contiguous. */ - bh = NULL; - break; - } - bh = head; - do { - if (!buffer_delay(bh)) - /* Delayed-extent ends. */ - goto found_delayed_extent; - bh = bh->b_this_page; - end++; - } while (bh != head); - } - } else if (!(flags & FIEMAP_EXTENT_DELALLOC)) - /* a hole found. */ - goto out; + if (es.es_len == 0) + /* A hole found. */ + return 0; -found_delayed_extent: - newex->ec_len = min(end - newex->ec_block, - (ext4_lblk_t)EXT_INIT_MAX_LEN); - if (ret == nr_pages && bh != NULL && - newex->ec_len < EXT_INIT_MAX_LEN && - buffer_delay(bh)) { - /* Have not collected an extent and continue. */ - for (index = 0; index < ret; index++) - page_cache_release(pages[index]); - goto repeat; + if (es.es_lblk > newes->es_lblk) { + /* A hole found. */ + newes->es_len = min(es.es_lblk - newes->es_lblk, + newes->es_len); + return 0; } - for (index = 0; index < ret; index++) - page_cache_release(pages[index]); - kfree(pages); + newes->es_len = es.es_lblk + es.es_len - newes->es_lblk; } - physical = (__u64)newex->ec_start << blksize_bits; - length = (__u64)newex->ec_len << blksize_bits; - - if (ex && ext4_ext_is_uninitialized(ex)) - flags |= FIEMAP_EXTENT_UNWRITTEN; - - if (next == EXT_MAX_BLOCKS) - flags |= FIEMAP_EXTENT_LAST; + block = newes->es_lblk + newes->es_len; + ext4_es_find_delayed_extent_range(inode, block, EXT_MAX_BLOCKS, &es); + if (es.es_len == 0) + next_del = EXT_MAX_BLOCKS; + else + next_del = es.es_lblk; - ret = fiemap_fill_next_extent(fieinfo, logical, physical, - length, flags); - if (ret < 0) - return ret; - if (ret == 1) - return EXT_BREAK; - return EXT_CONTINUE; + return next_del; } /* fiemap flags we can handle specified here */ #define EXT4_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR) @@ -4671,7 +5111,7 @@ static int ext4_xattr_fiemap(struct inode *inode, error = ext4_get_inode_loc(inode, &iloc); if (error) return error; - physical = iloc.bh->b_blocknr << blockbits; + physical = (__u64)iloc.bh->b_blocknr << blockbits; offset = EXT4_GOOD_OLD_INODE_SIZE + EXT4_I(inode)->i_extra_isize; physical += offset; @@ -4679,7 +5119,7 @@ static int ext4_xattr_fiemap(struct inode *inode, flags |= FIEMAP_EXTENT_DATA_INLINE; brelse(iloc.bh); } else { /* external block */ - physical = EXT4_I(inode)->i_file_acl << blockbits; + physical = (__u64)EXT4_I(inode)->i_file_acl << blockbits; length = inode->i_sb->s_blocksize; } @@ -4689,260 +5129,384 @@ static int ext4_xattr_fiemap(struct inode *inode, return (error < 0 ? error : 0); } -/* - * ext4_ext_punch_hole - * - * Punches a hole of "length" bytes in a file starting - * at byte "offset" - * - * @inode: The inode of the file to punch a hole in - * @offset: The starting byte offset of the hole - * @length: The length of the hole - * - * Returns the number of blocks removed or negative on err - */ -int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length) +int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, + __u64 start, __u64 len) { - struct inode *inode = file->f_path.dentry->d_inode; - struct super_block *sb = inode->i_sb; - struct ext4_ext_cache cache_ex; - ext4_lblk_t first_block, last_block, num_blocks, iblock, max_blocks; - struct address_space *mapping = inode->i_mapping; - struct ext4_map_blocks map; - handle_t *handle; - loff_t first_page, last_page, page_len; - loff_t first_page_offset, last_page_offset; - int ret, credits, blocks_released, err = 0; + ext4_lblk_t start_blk; + int error = 0; - /* No need to punch hole beyond i_size */ - if (offset >= inode->i_size) - return 0; + if (ext4_has_inline_data(inode)) { + int has_inline = 1; - /* - * If the hole extends beyond i_size, set the hole - * to end after the page that contains i_size - */ - if (offset + length > inode->i_size) { - length = inode->i_size + - PAGE_CACHE_SIZE - (inode->i_size & (PAGE_CACHE_SIZE - 1)) - - offset; + error = ext4_inline_data_fiemap(inode, fieinfo, &has_inline); + + if (has_inline) + return error; } - first_block = (offset + sb->s_blocksize - 1) >> - EXT4_BLOCK_SIZE_BITS(sb); - last_block = (offset + length) >> EXT4_BLOCK_SIZE_BITS(sb); + if (fieinfo->fi_flags & FIEMAP_FLAG_CACHE) { + error = ext4_ext_precache(inode); + if (error) + return error; + } - first_page = (offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - last_page = (offset + length) >> PAGE_CACHE_SHIFT; + /* fallback to generic here if not in extents fmt */ + if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) + return generic_block_fiemap(inode, fieinfo, start, len, + ext4_get_block); - first_page_offset = first_page << PAGE_CACHE_SHIFT; - last_page_offset = last_page << PAGE_CACHE_SHIFT; + if (fiemap_check_flags(fieinfo, EXT4_FIEMAP_FLAGS)) + return -EBADR; - /* - * Write out all dirty pages to avoid race conditions - * Then release them. - */ - if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { - err = filemap_write_and_wait_range(mapping, - offset, offset + length - 1); + if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) { + error = ext4_xattr_fiemap(inode, fieinfo); + } else { + ext4_lblk_t len_blks; + __u64 last_blk; - if (err) - return err; - } + start_blk = start >> inode->i_sb->s_blocksize_bits; + last_blk = (start + len - 1) >> inode->i_sb->s_blocksize_bits; + if (last_blk >= EXT_MAX_BLOCKS) + last_blk = EXT_MAX_BLOCKS-1; + len_blks = ((ext4_lblk_t) last_blk) - start_blk + 1; - /* Now release the pages */ - if (last_page_offset > first_page_offset) { - truncate_inode_pages_range(mapping, first_page_offset, - last_page_offset-1); + /* + * Walk the extent tree gathering extent information + * and pushing extents back to the user. + */ + error = ext4_fill_fiemap_extents(inode, start_blk, + len_blks, fieinfo); } + ext4_es_lru_add(inode); + return error; +} - /* finish any pending end_io work */ - ext4_flush_completed_IO(inode); - - credits = ext4_writepage_trans_blocks(inode); - handle = ext4_journal_start(inode, credits); - if (IS_ERR(handle)) - return PTR_ERR(handle); +/* + * ext4_access_path: + * Function to access the path buffer for marking it dirty. + * It also checks if there are sufficient credits left in the journal handle + * to update path. + */ +static int +ext4_access_path(handle_t *handle, struct inode *inode, + struct ext4_ext_path *path) +{ + int credits, err; - err = ext4_orphan_add(handle, inode); - if (err) - goto out; + if (!ext4_handle_valid(handle)) + return 0; /* - * Now we need to zero out the non-page-aligned data in the - * pages at the start and tail of the hole, and unmap the buffer - * heads for the block aligned regions of the page that were - * completely zeroed. + * Check if need to extend journal credits + * 3 for leaf, sb, and inode plus 2 (bmap and group + * descriptor) for each block group; assume two block + * groups */ - if (first_page > last_page) { - /* - * If the file space being truncated is contained within a page - * just zero out and unmap the middle of that page - */ - err = ext4_discard_partial_page_buffers(handle, - mapping, offset, length, 0); + if (handle->h_buffer_credits < 7) { + credits = ext4_writepage_trans_blocks(inode); + err = ext4_ext_truncate_extend_restart(handle, inode, credits); + /* EAGAIN is success */ + if (err && err != -EAGAIN) + return err; + } - if (err) - goto out; - } else { - /* - * zero out and unmap the partial page that contains - * the start of the hole - */ - page_len = first_page_offset - offset; - if (page_len > 0) { - err = ext4_discard_partial_page_buffers(handle, mapping, - offset, page_len, 0); + err = ext4_ext_get_access(handle, inode, path); + return err; +} + +/* + * ext4_ext_shift_path_extents: + * Shift the extents of a path structure lying between path[depth].p_ext + * and EXT_LAST_EXTENT(path[depth].p_hdr) downwards, by subtracting shift + * from starting block for each extent. + */ +static int +ext4_ext_shift_path_extents(struct ext4_ext_path *path, ext4_lblk_t shift, + struct inode *inode, handle_t *handle, + ext4_lblk_t *start) +{ + int depth, err = 0; + struct ext4_extent *ex_start, *ex_last; + bool update = 0; + depth = path->p_depth; + + while (depth >= 0) { + if (depth == path->p_depth) { + ex_start = path[depth].p_ext; + if (!ex_start) + return -EIO; + + ex_last = EXT_LAST_EXTENT(path[depth].p_hdr); + if (!ex_last) + return -EIO; + + err = ext4_access_path(handle, inode, path + depth); if (err) goto out; - } - /* - * zero out and unmap the partial page that contains - * the end of the hole - */ - page_len = offset + length - last_page_offset; - if (page_len > 0) { - err = ext4_discard_partial_page_buffers(handle, mapping, - last_page_offset, page_len, 0); + if (ex_start == EXT_FIRST_EXTENT(path[depth].p_hdr)) + update = 1; + + *start = le32_to_cpu(ex_last->ee_block) + + ext4_ext_get_actual_len(ex_last); + + while (ex_start <= ex_last) { + le32_add_cpu(&ex_start->ee_block, -shift); + /* Try to merge to the left. */ + if ((ex_start > + EXT_FIRST_EXTENT(path[depth].p_hdr)) && + ext4_ext_try_to_merge_right(inode, + path, ex_start - 1)) + ex_last--; + else + ex_start++; + } + err = ext4_ext_dirty(handle, inode, path + depth); if (err) goto out; + + if (--depth < 0 || !update) + break; } - } + /* Update index too */ + err = ext4_access_path(handle, inode, path + depth); + if (err) + goto out; - /* - * If i_size is contained in the last page, we need to - * unmap and zero the partial page after i_size - */ - if (inode->i_size >> PAGE_CACHE_SHIFT == last_page && - inode->i_size % PAGE_CACHE_SIZE != 0) { + le32_add_cpu(&path[depth].p_idx->ei_block, -shift); + err = ext4_ext_dirty(handle, inode, path + depth); + if (err) + goto out; - page_len = PAGE_CACHE_SIZE - - (inode->i_size & (PAGE_CACHE_SIZE - 1)); + /* we are done if current index is not a starting index */ + if (path[depth].p_idx != EXT_FIRST_INDEX(path[depth].p_hdr)) + break; - if (page_len > 0) { - err = ext4_discard_partial_page_buffers(handle, - mapping, inode->i_size, page_len, 0); + depth--; + } - if (err) - goto out; - } +out: + return err; +} + +/* + * ext4_ext_shift_extents: + * All the extents which lies in the range from start to the last allocated + * block for the file are shifted downwards by shift blocks. + * On success, 0 is returned, error otherwise. + */ +static int +ext4_ext_shift_extents(struct inode *inode, handle_t *handle, + ext4_lblk_t start, ext4_lblk_t shift) +{ + struct ext4_ext_path *path; + int ret = 0, depth; + struct ext4_extent *extent; + ext4_lblk_t stop_block, current_block; + ext4_lblk_t ex_start, ex_end; + + /* Let path point to the last extent */ + path = ext4_ext_find_extent(inode, EXT_MAX_BLOCKS - 1, NULL, 0); + if (IS_ERR(path)) + return PTR_ERR(path); + + depth = path->p_depth; + extent = path[depth].p_ext; + if (!extent) { + ext4_ext_drop_refs(path); + kfree(path); + return ret; } - /* If there are no blocks to remove, return now */ - if (first_block >= last_block) - goto out; + stop_block = le32_to_cpu(extent->ee_block) + + ext4_ext_get_actual_len(extent); + ext4_ext_drop_refs(path); + kfree(path); - down_write(&EXT4_I(inode)->i_data_sem); - ext4_ext_invalidate_cache(inode); - ext4_discard_preallocations(inode); + /* Nothing to shift, if hole is at the end of file */ + if (start >= stop_block) + return ret; /* - * Loop over all the blocks and identify blocks - * that need to be punched out + * Don't start shifting extents until we make sure the hole is big + * enough to accomodate the shift. */ - iblock = first_block; - blocks_released = 0; - while (iblock < last_block) { - max_blocks = last_block - iblock; - num_blocks = 1; - memset(&map, 0, sizeof(map)); - map.m_lblk = iblock; - map.m_len = max_blocks; - ret = ext4_ext_map_blocks(handle, inode, &map, - EXT4_GET_BLOCKS_PUNCH_OUT_EXT); - - if (ret > 0) { - blocks_released += ret; - num_blocks = ret; - } else if (ret == 0) { - /* - * If map blocks could not find the block, - * then it is in a hole. If the hole was - * not already cached, then map blocks should - * put it in the cache. So we can get the hole - * out of the cache - */ - memset(&cache_ex, 0, sizeof(cache_ex)); - if ((ext4_ext_check_cache(inode, iblock, &cache_ex)) && - !cache_ex.ec_start) { + path = ext4_ext_find_extent(inode, start - 1, NULL, 0); + if (IS_ERR(path)) + return PTR_ERR(path); + depth = path->p_depth; + extent = path[depth].p_ext; + if (extent) { + ex_start = le32_to_cpu(extent->ee_block); + ex_end = le32_to_cpu(extent->ee_block) + + ext4_ext_get_actual_len(extent); + } else { + ex_start = 0; + ex_end = 0; + } + ext4_ext_drop_refs(path); + kfree(path); - /* The hole is cached */ - num_blocks = cache_ex.ec_block + - cache_ex.ec_len - iblock; + if ((start == ex_start && shift > ex_start) || + (shift > start - ex_end)) + return -EINVAL; - } else { - /* The block could not be identified */ - err = -EIO; + /* Its safe to start updating extents */ + while (start < stop_block) { + path = ext4_ext_find_extent(inode, start, NULL, 0); + if (IS_ERR(path)) + return PTR_ERR(path); + depth = path->p_depth; + extent = path[depth].p_ext; + if (!extent) { + EXT4_ERROR_INODE(inode, "unexpected hole at %lu", + (unsigned long) start); + return -EIO; + } + + current_block = le32_to_cpu(extent->ee_block); + if (start > current_block) { + /* Hole, move to the next extent */ + ret = mext_next_extent(inode, path, &extent); + if (ret != 0) { + ext4_ext_drop_refs(path); + kfree(path); + if (ret == 1) + ret = 0; break; } - } else { - /* Map blocks error */ - err = ret; - break; } - - if (num_blocks == 0) { - /* This condition should never happen */ - ext_debug("Block lookup failed"); - err = -EIO; + ret = ext4_ext_shift_path_extents(path, shift, inode, + handle, &start); + ext4_ext_drop_refs(path); + kfree(path); + if (ret) break; - } + } - iblock += num_blocks; + return ret; +} + +/* + * ext4_collapse_range: + * This implements the fallocate's collapse range functionality for ext4 + * Returns: 0 and non-zero on error. + */ +int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) +{ + struct super_block *sb = inode->i_sb; + ext4_lblk_t punch_start, punch_stop; + handle_t *handle; + unsigned int credits; + loff_t new_size, ioffset; + int ret; + + /* Collapse range works only on fs block size aligned offsets. */ + if (offset & (EXT4_BLOCK_SIZE(sb) - 1) || + len & (EXT4_BLOCK_SIZE(sb) - 1)) + return -EINVAL; + + if (!S_ISREG(inode->i_mode)) + return -EINVAL; + + if (EXT4_SB(inode->i_sb)->s_cluster_ratio > 1) + return -EOPNOTSUPP; + + trace_ext4_collapse_range(inode, offset, len); + + punch_start = offset >> EXT4_BLOCK_SIZE_BITS(sb); + punch_stop = (offset + len) >> EXT4_BLOCK_SIZE_BITS(sb); + + /* Call ext4_force_commit to flush all data in case of data=journal. */ + if (ext4_should_journal_data(inode)) { + ret = ext4_force_commit(inode->i_sb); + if (ret) + return ret; } - if (blocks_released > 0) { - ext4_ext_invalidate_cache(inode); - ext4_discard_preallocations(inode); + /* + * Need to round down offset to be aligned with page size boundary + * for page size > block size. + */ + ioffset = round_down(offset, PAGE_SIZE); + + /* Write out all dirty pages */ + ret = filemap_write_and_wait_range(inode->i_mapping, ioffset, + LLONG_MAX); + if (ret) + return ret; + + /* Take mutex lock */ + mutex_lock(&inode->i_mutex); + + /* + * There is no need to overlap collapse range with EOF, in which case + * it is effectively a truncate operation + */ + if (offset + len >= i_size_read(inode)) { + ret = -EINVAL; + goto out_mutex; } - if (IS_SYNC(inode)) - ext4_handle_sync(handle); + /* Currently just for extent based files */ + if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { + ret = -EOPNOTSUPP; + goto out_mutex; + } - up_write(&EXT4_I(inode)->i_data_sem); + truncate_pagecache(inode, ioffset); -out: - ext4_orphan_del(handle, inode); - inode->i_mtime = inode->i_ctime = ext4_current_time(inode); - ext4_mark_inode_dirty(handle, inode); - ext4_journal_stop(handle); - return err; -} -int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, - __u64 start, __u64 len) -{ - ext4_lblk_t start_blk; - int error = 0; + /* Wait for existing dio to complete */ + ext4_inode_block_unlocked_dio(inode); + inode_dio_wait(inode); - /* fallback to generic here if not in extents fmt */ - if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) - return generic_block_fiemap(inode, fieinfo, start, len, - ext4_get_block); + credits = ext4_writepage_trans_blocks(inode); + handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + goto out_dio; + } - if (fiemap_check_flags(fieinfo, EXT4_FIEMAP_FLAGS)) - return -EBADR; + down_write(&EXT4_I(inode)->i_data_sem); + ext4_discard_preallocations(inode); - if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) { - error = ext4_xattr_fiemap(inode, fieinfo); - } else { - ext4_lblk_t len_blks; - __u64 last_blk; + ret = ext4_es_remove_extent(inode, punch_start, + EXT_MAX_BLOCKS - punch_start); + if (ret) { + up_write(&EXT4_I(inode)->i_data_sem); + goto out_stop; + } - start_blk = start >> inode->i_sb->s_blocksize_bits; - last_blk = (start + len - 1) >> inode->i_sb->s_blocksize_bits; - if (last_blk >= EXT_MAX_BLOCKS) - last_blk = EXT_MAX_BLOCKS-1; - len_blks = ((ext4_lblk_t) last_blk) - start_blk + 1; + ret = ext4_ext_remove_space(inode, punch_start, punch_stop - 1); + if (ret) { + up_write(&EXT4_I(inode)->i_data_sem); + goto out_stop; + } + ext4_discard_preallocations(inode); - /* - * Walk the extent tree gathering extent information. - * ext4_ext_fiemap_cb will push extents back to user. - */ - error = ext4_ext_walk_space(inode, start_blk, len_blks, - ext4_ext_fiemap_cb, fieinfo); + ret = ext4_ext_shift_extents(inode, handle, punch_stop, + punch_stop - punch_start); + if (ret) { + up_write(&EXT4_I(inode)->i_data_sem); + goto out_stop; } - return error; + new_size = i_size_read(inode) - len; + i_size_write(inode, new_size); + EXT4_I(inode)->i_disksize = new_size; + + up_write(&EXT4_I(inode)->i_data_sem); + if (IS_SYNC(inode)) + ext4_handle_sync(handle); + inode->i_mtime = inode->i_ctime = ext4_current_time(inode); + ext4_mark_inode_dirty(handle, inode); + +out_stop: + ext4_journal_stop(handle); +out_dio: + ext4_inode_resume_unlocked_dio(inode); +out_mutex: + mutex_unlock(&inode->i_mutex); + return ret; } diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c new file mode 100644 index 00000000000..0b7e28e7eaa --- /dev/null +++ b/fs/ext4/extents_status.c @@ -0,0 +1,1127 @@ +/* + * fs/ext4/extents_status.c + * + * Written by Yongqiang Yang <xiaoqiangnk@gmail.com> + * Modified by + * Allison Henderson <achender@linux.vnet.ibm.com> + * Hugh Dickins <hughd@google.com> + * Zheng Liu <wenqing.lz@taobao.com> + * + * Ext4 extents status tree core functions. + */ +#include <linux/rbtree.h> +#include <linux/list_sort.h> +#include "ext4.h" +#include "extents_status.h" + +#include <trace/events/ext4.h> + +/* + * According to previous discussion in Ext4 Developer Workshop, we + * will introduce a new structure called io tree to track all extent + * status in order to solve some problems that we have met + * (e.g. Reservation space warning), and provide extent-level locking. + * Delay extent tree is the first step to achieve this goal. It is + * original built by Yongqiang Yang. At that time it is called delay + * extent tree, whose goal is only track delayed extents in memory to + * simplify the implementation of fiemap and bigalloc, and introduce + * lseek SEEK_DATA/SEEK_HOLE support. That is why it is still called + * delay extent tree at the first commit. But for better understand + * what it does, it has been rename to extent status tree. + * + * Step1: + * Currently the first step has been done. All delayed extents are + * tracked in the tree. It maintains the delayed extent when a delayed + * allocation is issued, and the delayed extent is written out or + * invalidated. Therefore the implementation of fiemap and bigalloc + * are simplified, and SEEK_DATA/SEEK_HOLE are introduced. + * + * The following comment describes the implemenmtation of extent + * status tree and future works. + * + * Step2: + * In this step all extent status are tracked by extent status tree. + * Thus, we can first try to lookup a block mapping in this tree before + * finding it in extent tree. Hence, single extent cache can be removed + * because extent status tree can do a better job. Extents in status + * tree are loaded on-demand. Therefore, the extent status tree may not + * contain all of the extents in a file. Meanwhile we define a shrinker + * to reclaim memory from extent status tree because fragmented extent + * tree will make status tree cost too much memory. written/unwritten/- + * hole extents in the tree will be reclaimed by this shrinker when we + * are under high memory pressure. Delayed extents will not be + * reclimed because fiemap, bigalloc, and seek_data/hole need it. + */ + +/* + * Extent status tree implementation for ext4. + * + * + * ========================================================================== + * Extent status tree tracks all extent status. + * + * 1. Why we need to implement extent status tree? + * + * Without extent status tree, ext4 identifies a delayed extent by looking + * up page cache, this has several deficiencies - complicated, buggy, + * and inefficient code. + * + * FIEMAP, SEEK_HOLE/DATA, bigalloc, and writeout all need to know if a + * block or a range of blocks are belonged to a delayed extent. + * + * Let us have a look at how they do without extent status tree. + * -- FIEMAP + * FIEMAP looks up page cache to identify delayed allocations from holes. + * + * -- SEEK_HOLE/DATA + * SEEK_HOLE/DATA has the same problem as FIEMAP. + * + * -- bigalloc + * bigalloc looks up page cache to figure out if a block is + * already under delayed allocation or not to determine whether + * quota reserving is needed for the cluster. + * + * -- writeout + * Writeout looks up whole page cache to see if a buffer is + * mapped, If there are not very many delayed buffers, then it is + * time comsuming. + * + * With extent status tree implementation, FIEMAP, SEEK_HOLE/DATA, + * bigalloc and writeout can figure out if a block or a range of + * blocks is under delayed allocation(belonged to a delayed extent) or + * not by searching the extent tree. + * + * + * ========================================================================== + * 2. Ext4 extent status tree impelmentation + * + * -- extent + * A extent is a range of blocks which are contiguous logically and + * physically. Unlike extent in extent tree, this extent in ext4 is + * a in-memory struct, there is no corresponding on-disk data. There + * is no limit on length of extent, so an extent can contain as many + * blocks as they are contiguous logically and physically. + * + * -- extent status tree + * Every inode has an extent status tree and all allocation blocks + * are added to the tree with different status. The extent in the + * tree are ordered by logical block no. + * + * -- operations on a extent status tree + * There are three important operations on a delayed extent tree: find + * next extent, adding a extent(a range of blocks) and removing a extent. + * + * -- race on a extent status tree + * Extent status tree is protected by inode->i_es_lock. + * + * -- memory consumption + * Fragmented extent tree will make extent status tree cost too much + * memory. Hence, we will reclaim written/unwritten/hole extents from + * the tree under a heavy memory pressure. + * + * + * ========================================================================== + * 3. Performance analysis + * + * -- overhead + * 1. There is a cache extent for write access, so if writes are + * not very random, adding space operaions are in O(1) time. + * + * -- gain + * 2. Code is much simpler, more readable, more maintainable and + * more efficient. + * + * + * ========================================================================== + * 4. TODO list + * + * -- Refactor delayed space reservation + * + * -- Extent-level locking + */ + +static struct kmem_cache *ext4_es_cachep; + +static int __es_insert_extent(struct inode *inode, struct extent_status *newes); +static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk, + ext4_lblk_t end); +static int __es_try_to_reclaim_extents(struct ext4_inode_info *ei, + int nr_to_scan); +static int __ext4_es_shrink(struct ext4_sb_info *sbi, int nr_to_scan, + struct ext4_inode_info *locked_ei); + +int __init ext4_init_es(void) +{ + ext4_es_cachep = kmem_cache_create("ext4_extent_status", + sizeof(struct extent_status), + 0, (SLAB_RECLAIM_ACCOUNT), NULL); + if (ext4_es_cachep == NULL) + return -ENOMEM; + return 0; +} + +void ext4_exit_es(void) +{ + if (ext4_es_cachep) + kmem_cache_destroy(ext4_es_cachep); +} + +void ext4_es_init_tree(struct ext4_es_tree *tree) +{ + tree->root = RB_ROOT; + tree->cache_es = NULL; +} + +#ifdef ES_DEBUG__ +static void ext4_es_print_tree(struct inode *inode) +{ + struct ext4_es_tree *tree; + struct rb_node *node; + + printk(KERN_DEBUG "status extents for inode %lu:", inode->i_ino); + tree = &EXT4_I(inode)->i_es_tree; + node = rb_first(&tree->root); + while (node) { + struct extent_status *es; + es = rb_entry(node, struct extent_status, rb_node); + printk(KERN_DEBUG " [%u/%u) %llu %x", + es->es_lblk, es->es_len, + ext4_es_pblock(es), ext4_es_status(es)); + node = rb_next(node); + } + printk(KERN_DEBUG "\n"); +} +#else +#define ext4_es_print_tree(inode) +#endif + +static inline ext4_lblk_t ext4_es_end(struct extent_status *es) +{ + BUG_ON(es->es_lblk + es->es_len < es->es_lblk); + return es->es_lblk + es->es_len - 1; +} + +/* + * search through the tree for an delayed extent with a given offset. If + * it can't be found, try to find next extent. + */ +static struct extent_status *__es_tree_search(struct rb_root *root, + ext4_lblk_t lblk) +{ + struct rb_node *node = root->rb_node; + struct extent_status *es = NULL; + + while (node) { + es = rb_entry(node, struct extent_status, rb_node); + if (lblk < es->es_lblk) + node = node->rb_left; + else if (lblk > ext4_es_end(es)) + node = node->rb_right; + else + return es; + } + + if (es && lblk < es->es_lblk) + return es; + + if (es && lblk > ext4_es_end(es)) { + node = rb_next(&es->rb_node); + return node ? rb_entry(node, struct extent_status, rb_node) : + NULL; + } + + return NULL; +} + +/* + * ext4_es_find_delayed_extent_range: find the 1st delayed extent covering + * @es->lblk if it exists, otherwise, the next extent after @es->lblk. + * + * @inode: the inode which owns delayed extents + * @lblk: the offset where we start to search + * @end: the offset where we stop to search + * @es: delayed extent that we found + */ +void ext4_es_find_delayed_extent_range(struct inode *inode, + ext4_lblk_t lblk, ext4_lblk_t end, + struct extent_status *es) +{ + struct ext4_es_tree *tree = NULL; + struct extent_status *es1 = NULL; + struct rb_node *node; + + BUG_ON(es == NULL); + BUG_ON(end < lblk); + trace_ext4_es_find_delayed_extent_range_enter(inode, lblk); + + read_lock(&EXT4_I(inode)->i_es_lock); + tree = &EXT4_I(inode)->i_es_tree; + + /* find extent in cache firstly */ + es->es_lblk = es->es_len = es->es_pblk = 0; + if (tree->cache_es) { + es1 = tree->cache_es; + if (in_range(lblk, es1->es_lblk, es1->es_len)) { + es_debug("%u cached by [%u/%u) %llu %x\n", + lblk, es1->es_lblk, es1->es_len, + ext4_es_pblock(es1), ext4_es_status(es1)); + goto out; + } + } + + es1 = __es_tree_search(&tree->root, lblk); + +out: + if (es1 && !ext4_es_is_delayed(es1)) { + while ((node = rb_next(&es1->rb_node)) != NULL) { + es1 = rb_entry(node, struct extent_status, rb_node); + if (es1->es_lblk > end) { + es1 = NULL; + break; + } + if (ext4_es_is_delayed(es1)) + break; + } + } + + if (es1 && ext4_es_is_delayed(es1)) { + tree->cache_es = es1; + es->es_lblk = es1->es_lblk; + es->es_len = es1->es_len; + es->es_pblk = es1->es_pblk; + } + + read_unlock(&EXT4_I(inode)->i_es_lock); + + trace_ext4_es_find_delayed_extent_range_exit(inode, es); +} + +static struct extent_status * +ext4_es_alloc_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len, + ext4_fsblk_t pblk) +{ + struct extent_status *es; + es = kmem_cache_alloc(ext4_es_cachep, GFP_ATOMIC); + if (es == NULL) + return NULL; + es->es_lblk = lblk; + es->es_len = len; + es->es_pblk = pblk; + + /* + * We don't count delayed extent because we never try to reclaim them + */ + if (!ext4_es_is_delayed(es)) { + EXT4_I(inode)->i_es_lru_nr++; + percpu_counter_inc(&EXT4_SB(inode->i_sb)->s_extent_cache_cnt); + } + + return es; +} + +static void ext4_es_free_extent(struct inode *inode, struct extent_status *es) +{ + /* Decrease the lru counter when this es is not delayed */ + if (!ext4_es_is_delayed(es)) { + BUG_ON(EXT4_I(inode)->i_es_lru_nr == 0); + EXT4_I(inode)->i_es_lru_nr--; + percpu_counter_dec(&EXT4_SB(inode->i_sb)->s_extent_cache_cnt); + } + + kmem_cache_free(ext4_es_cachep, es); +} + +/* + * Check whether or not two extents can be merged + * Condition: + * - logical block number is contiguous + * - physical block number is contiguous + * - status is equal + */ +static int ext4_es_can_be_merged(struct extent_status *es1, + struct extent_status *es2) +{ + if (ext4_es_status(es1) != ext4_es_status(es2)) + return 0; + + if (((__u64) es1->es_len) + es2->es_len > EXT_MAX_BLOCKS) { + pr_warn("ES assertion failed when merging extents. " + "The sum of lengths of es1 (%d) and es2 (%d) " + "is bigger than allowed file size (%d)\n", + es1->es_len, es2->es_len, EXT_MAX_BLOCKS); + WARN_ON(1); + return 0; + } + + if (((__u64) es1->es_lblk) + es1->es_len != es2->es_lblk) + return 0; + + if ((ext4_es_is_written(es1) || ext4_es_is_unwritten(es1)) && + (ext4_es_pblock(es1) + es1->es_len == ext4_es_pblock(es2))) + return 1; + + if (ext4_es_is_hole(es1)) + return 1; + + /* we need to check delayed extent is without unwritten status */ + if (ext4_es_is_delayed(es1) && !ext4_es_is_unwritten(es1)) + return 1; + + return 0; +} + +static struct extent_status * +ext4_es_try_to_merge_left(struct inode *inode, struct extent_status *es) +{ + struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree; + struct extent_status *es1; + struct rb_node *node; + + node = rb_prev(&es->rb_node); + if (!node) + return es; + + es1 = rb_entry(node, struct extent_status, rb_node); + if (ext4_es_can_be_merged(es1, es)) { + es1->es_len += es->es_len; + rb_erase(&es->rb_node, &tree->root); + ext4_es_free_extent(inode, es); + es = es1; + } + + return es; +} + +static struct extent_status * +ext4_es_try_to_merge_right(struct inode *inode, struct extent_status *es) +{ + struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree; + struct extent_status *es1; + struct rb_node *node; + + node = rb_next(&es->rb_node); + if (!node) + return es; + + es1 = rb_entry(node, struct extent_status, rb_node); + if (ext4_es_can_be_merged(es, es1)) { + es->es_len += es1->es_len; + rb_erase(node, &tree->root); + ext4_es_free_extent(inode, es1); + } + + return es; +} + +#ifdef ES_AGGRESSIVE_TEST +#include "ext4_extents.h" /* Needed when ES_AGGRESSIVE_TEST is defined */ + +static void ext4_es_insert_extent_ext_check(struct inode *inode, + struct extent_status *es) +{ + struct ext4_ext_path *path = NULL; + struct ext4_extent *ex; + ext4_lblk_t ee_block; + ext4_fsblk_t ee_start; + unsigned short ee_len; + int depth, ee_status, es_status; + + path = ext4_ext_find_extent(inode, es->es_lblk, NULL, EXT4_EX_NOCACHE); + if (IS_ERR(path)) + return; + + depth = ext_depth(inode); + ex = path[depth].p_ext; + + if (ex) { + + ee_block = le32_to_cpu(ex->ee_block); + ee_start = ext4_ext_pblock(ex); + ee_len = ext4_ext_get_actual_len(ex); + + ee_status = ext4_ext_is_unwritten(ex) ? 1 : 0; + es_status = ext4_es_is_unwritten(es) ? 1 : 0; + + /* + * Make sure ex and es are not overlap when we try to insert + * a delayed/hole extent. + */ + if (!ext4_es_is_written(es) && !ext4_es_is_unwritten(es)) { + if (in_range(es->es_lblk, ee_block, ee_len)) { + pr_warn("ES insert assertion failed for " + "inode: %lu we can find an extent " + "at block [%d/%d/%llu/%c], but we " + "want to add a delayed/hole extent " + "[%d/%d/%llu/%x]\n", + inode->i_ino, ee_block, ee_len, + ee_start, ee_status ? 'u' : 'w', + es->es_lblk, es->es_len, + ext4_es_pblock(es), ext4_es_status(es)); + } + goto out; + } + + /* + * We don't check ee_block == es->es_lblk, etc. because es + * might be a part of whole extent, vice versa. + */ + if (es->es_lblk < ee_block || + ext4_es_pblock(es) != ee_start + es->es_lblk - ee_block) { + pr_warn("ES insert assertion failed for inode: %lu " + "ex_status [%d/%d/%llu/%c] != " + "es_status [%d/%d/%llu/%c]\n", inode->i_ino, + ee_block, ee_len, ee_start, + ee_status ? 'u' : 'w', es->es_lblk, es->es_len, + ext4_es_pblock(es), es_status ? 'u' : 'w'); + goto out; + } + + if (ee_status ^ es_status) { + pr_warn("ES insert assertion failed for inode: %lu " + "ex_status [%d/%d/%llu/%c] != " + "es_status [%d/%d/%llu/%c]\n", inode->i_ino, + ee_block, ee_len, ee_start, + ee_status ? 'u' : 'w', es->es_lblk, es->es_len, + ext4_es_pblock(es), es_status ? 'u' : 'w'); + } + } else { + /* + * We can't find an extent on disk. So we need to make sure + * that we don't want to add an written/unwritten extent. + */ + if (!ext4_es_is_delayed(es) && !ext4_es_is_hole(es)) { + pr_warn("ES insert assertion failed for inode: %lu " + "can't find an extent at block %d but we want " + "to add a written/unwritten extent " + "[%d/%d/%llu/%x]\n", inode->i_ino, + es->es_lblk, es->es_lblk, es->es_len, + ext4_es_pblock(es), ext4_es_status(es)); + } + } +out: + if (path) { + ext4_ext_drop_refs(path); + kfree(path); + } +} + +static void ext4_es_insert_extent_ind_check(struct inode *inode, + struct extent_status *es) +{ + struct ext4_map_blocks map; + int retval; + + /* + * Here we call ext4_ind_map_blocks to lookup a block mapping because + * 'Indirect' structure is defined in indirect.c. So we couldn't + * access direct/indirect tree from outside. It is too dirty to define + * this function in indirect.c file. + */ + + map.m_lblk = es->es_lblk; + map.m_len = es->es_len; + + retval = ext4_ind_map_blocks(NULL, inode, &map, 0); + if (retval > 0) { + if (ext4_es_is_delayed(es) || ext4_es_is_hole(es)) { + /* + * We want to add a delayed/hole extent but this + * block has been allocated. + */ + pr_warn("ES insert assertion failed for inode: %lu " + "We can find blocks but we want to add a " + "delayed/hole extent [%d/%d/%llu/%x]\n", + inode->i_ino, es->es_lblk, es->es_len, + ext4_es_pblock(es), ext4_es_status(es)); + return; + } else if (ext4_es_is_written(es)) { + if (retval != es->es_len) { + pr_warn("ES insert assertion failed for " + "inode: %lu retval %d != es_len %d\n", + inode->i_ino, retval, es->es_len); + return; + } + if (map.m_pblk != ext4_es_pblock(es)) { + pr_warn("ES insert assertion failed for " + "inode: %lu m_pblk %llu != " + "es_pblk %llu\n", + inode->i_ino, map.m_pblk, + ext4_es_pblock(es)); + return; + } + } else { + /* + * We don't need to check unwritten extent because + * indirect-based file doesn't have it. + */ + BUG_ON(1); + } + } else if (retval == 0) { + if (ext4_es_is_written(es)) { + pr_warn("ES insert assertion failed for inode: %lu " + "We can't find the block but we want to add " + "a written extent [%d/%d/%llu/%x]\n", + inode->i_ino, es->es_lblk, es->es_len, + ext4_es_pblock(es), ext4_es_status(es)); + return; + } + } +} + +static inline void ext4_es_insert_extent_check(struct inode *inode, + struct extent_status *es) +{ + /* + * We don't need to worry about the race condition because + * caller takes i_data_sem locking. + */ + BUG_ON(!rwsem_is_locked(&EXT4_I(inode)->i_data_sem)); + if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) + ext4_es_insert_extent_ext_check(inode, es); + else + ext4_es_insert_extent_ind_check(inode, es); +} +#else +static inline void ext4_es_insert_extent_check(struct inode *inode, + struct extent_status *es) +{ +} +#endif + +static int __es_insert_extent(struct inode *inode, struct extent_status *newes) +{ + struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree; + struct rb_node **p = &tree->root.rb_node; + struct rb_node *parent = NULL; + struct extent_status *es; + + while (*p) { + parent = *p; + es = rb_entry(parent, struct extent_status, rb_node); + + if (newes->es_lblk < es->es_lblk) { + if (ext4_es_can_be_merged(newes, es)) { + /* + * Here we can modify es_lblk directly + * because it isn't overlapped. + */ + es->es_lblk = newes->es_lblk; + es->es_len += newes->es_len; + if (ext4_es_is_written(es) || + ext4_es_is_unwritten(es)) + ext4_es_store_pblock(es, + newes->es_pblk); + es = ext4_es_try_to_merge_left(inode, es); + goto out; + } + p = &(*p)->rb_left; + } else if (newes->es_lblk > ext4_es_end(es)) { + if (ext4_es_can_be_merged(es, newes)) { + es->es_len += newes->es_len; + es = ext4_es_try_to_merge_right(inode, es); + goto out; + } + p = &(*p)->rb_right; + } else { + BUG_ON(1); + return -EINVAL; + } + } + + es = ext4_es_alloc_extent(inode, newes->es_lblk, newes->es_len, + newes->es_pblk); + if (!es) + return -ENOMEM; + rb_link_node(&es->rb_node, parent, p); + rb_insert_color(&es->rb_node, &tree->root); + +out: + tree->cache_es = es; + return 0; +} + +/* + * ext4_es_insert_extent() adds information to an inode's extent + * status tree. + * + * Return 0 on success, error code on failure. + */ +int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk, + ext4_lblk_t len, ext4_fsblk_t pblk, + unsigned int status) +{ + struct extent_status newes; + ext4_lblk_t end = lblk + len - 1; + int err = 0; + + es_debug("add [%u/%u) %llu %x to extent status tree of inode %lu\n", + lblk, len, pblk, status, inode->i_ino); + + if (!len) + return 0; + + BUG_ON(end < lblk); + + newes.es_lblk = lblk; + newes.es_len = len; + ext4_es_store_pblock_status(&newes, pblk, status); + trace_ext4_es_insert_extent(inode, &newes); + + ext4_es_insert_extent_check(inode, &newes); + + write_lock(&EXT4_I(inode)->i_es_lock); + err = __es_remove_extent(inode, lblk, end); + if (err != 0) + goto error; +retry: + err = __es_insert_extent(inode, &newes); + if (err == -ENOMEM && __ext4_es_shrink(EXT4_SB(inode->i_sb), 1, + EXT4_I(inode))) + goto retry; + if (err == -ENOMEM && !ext4_es_is_delayed(&newes)) + err = 0; + +error: + write_unlock(&EXT4_I(inode)->i_es_lock); + + ext4_es_print_tree(inode); + + return err; +} + +/* + * ext4_es_cache_extent() inserts information into the extent status + * tree if and only if there isn't information about the range in + * question already. + */ +void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk, + ext4_lblk_t len, ext4_fsblk_t pblk, + unsigned int status) +{ + struct extent_status *es; + struct extent_status newes; + ext4_lblk_t end = lblk + len - 1; + + newes.es_lblk = lblk; + newes.es_len = len; + ext4_es_store_pblock_status(&newes, pblk, status); + trace_ext4_es_cache_extent(inode, &newes); + + if (!len) + return; + + BUG_ON(end < lblk); + + write_lock(&EXT4_I(inode)->i_es_lock); + + es = __es_tree_search(&EXT4_I(inode)->i_es_tree.root, lblk); + if (!es || es->es_lblk > end) + __es_insert_extent(inode, &newes); + write_unlock(&EXT4_I(inode)->i_es_lock); +} + +/* + * ext4_es_lookup_extent() looks up an extent in extent status tree. + * + * ext4_es_lookup_extent is called by ext4_map_blocks/ext4_da_map_blocks. + * + * Return: 1 on found, 0 on not + */ +int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk, + struct extent_status *es) +{ + struct ext4_es_tree *tree; + struct extent_status *es1 = NULL; + struct rb_node *node; + int found = 0; + + trace_ext4_es_lookup_extent_enter(inode, lblk); + es_debug("lookup extent in block %u\n", lblk); + + tree = &EXT4_I(inode)->i_es_tree; + read_lock(&EXT4_I(inode)->i_es_lock); + + /* find extent in cache firstly */ + es->es_lblk = es->es_len = es->es_pblk = 0; + if (tree->cache_es) { + es1 = tree->cache_es; + if (in_range(lblk, es1->es_lblk, es1->es_len)) { + es_debug("%u cached by [%u/%u)\n", + lblk, es1->es_lblk, es1->es_len); + found = 1; + goto out; + } + } + + node = tree->root.rb_node; + while (node) { + es1 = rb_entry(node, struct extent_status, rb_node); + if (lblk < es1->es_lblk) + node = node->rb_left; + else if (lblk > ext4_es_end(es1)) + node = node->rb_right; + else { + found = 1; + break; + } + } + +out: + if (found) { + BUG_ON(!es1); + es->es_lblk = es1->es_lblk; + es->es_len = es1->es_len; + es->es_pblk = es1->es_pblk; + } + + read_unlock(&EXT4_I(inode)->i_es_lock); + + trace_ext4_es_lookup_extent_exit(inode, es, found); + return found; +} + +static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk, + ext4_lblk_t end) +{ + struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree; + struct rb_node *node; + struct extent_status *es; + struct extent_status orig_es; + ext4_lblk_t len1, len2; + ext4_fsblk_t block; + int err; + +retry: + err = 0; + es = __es_tree_search(&tree->root, lblk); + if (!es) + goto out; + if (es->es_lblk > end) + goto out; + + /* Simply invalidate cache_es. */ + tree->cache_es = NULL; + + orig_es.es_lblk = es->es_lblk; + orig_es.es_len = es->es_len; + orig_es.es_pblk = es->es_pblk; + + len1 = lblk > es->es_lblk ? lblk - es->es_lblk : 0; + len2 = ext4_es_end(es) > end ? ext4_es_end(es) - end : 0; + if (len1 > 0) + es->es_len = len1; + if (len2 > 0) { + if (len1 > 0) { + struct extent_status newes; + + newes.es_lblk = end + 1; + newes.es_len = len2; + block = 0x7FDEADBEEFULL; + if (ext4_es_is_written(&orig_es) || + ext4_es_is_unwritten(&orig_es)) + block = ext4_es_pblock(&orig_es) + + orig_es.es_len - len2; + ext4_es_store_pblock_status(&newes, block, + ext4_es_status(&orig_es)); + err = __es_insert_extent(inode, &newes); + if (err) { + es->es_lblk = orig_es.es_lblk; + es->es_len = orig_es.es_len; + if ((err == -ENOMEM) && + __ext4_es_shrink(EXT4_SB(inode->i_sb), 1, + EXT4_I(inode))) + goto retry; + goto out; + } + } else { + es->es_lblk = end + 1; + es->es_len = len2; + if (ext4_es_is_written(es) || + ext4_es_is_unwritten(es)) { + block = orig_es.es_pblk + orig_es.es_len - len2; + ext4_es_store_pblock(es, block); + } + } + goto out; + } + + if (len1 > 0) { + node = rb_next(&es->rb_node); + if (node) + es = rb_entry(node, struct extent_status, rb_node); + else + es = NULL; + } + + while (es && ext4_es_end(es) <= end) { + node = rb_next(&es->rb_node); + rb_erase(&es->rb_node, &tree->root); + ext4_es_free_extent(inode, es); + if (!node) { + es = NULL; + break; + } + es = rb_entry(node, struct extent_status, rb_node); + } + + if (es && es->es_lblk < end + 1) { + ext4_lblk_t orig_len = es->es_len; + + len1 = ext4_es_end(es) - end; + es->es_lblk = end + 1; + es->es_len = len1; + if (ext4_es_is_written(es) || ext4_es_is_unwritten(es)) { + block = es->es_pblk + orig_len - len1; + ext4_es_store_pblock(es, block); + } + } + +out: + return err; +} + +/* + * ext4_es_remove_extent() removes a space from a extent status tree. + * + * Return 0 on success, error code on failure. + */ +int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk, + ext4_lblk_t len) +{ + ext4_lblk_t end; + int err = 0; + + trace_ext4_es_remove_extent(inode, lblk, len); + es_debug("remove [%u/%u) from extent status tree of inode %lu\n", + lblk, len, inode->i_ino); + + if (!len) + return err; + + end = lblk + len - 1; + BUG_ON(end < lblk); + + write_lock(&EXT4_I(inode)->i_es_lock); + err = __es_remove_extent(inode, lblk, end); + write_unlock(&EXT4_I(inode)->i_es_lock); + ext4_es_print_tree(inode); + return err; +} + +static int ext4_inode_touch_time_cmp(void *priv, struct list_head *a, + struct list_head *b) +{ + struct ext4_inode_info *eia, *eib; + eia = list_entry(a, struct ext4_inode_info, i_es_lru); + eib = list_entry(b, struct ext4_inode_info, i_es_lru); + + if (ext4_test_inode_state(&eia->vfs_inode, EXT4_STATE_EXT_PRECACHED) && + !ext4_test_inode_state(&eib->vfs_inode, EXT4_STATE_EXT_PRECACHED)) + return 1; + if (!ext4_test_inode_state(&eia->vfs_inode, EXT4_STATE_EXT_PRECACHED) && + ext4_test_inode_state(&eib->vfs_inode, EXT4_STATE_EXT_PRECACHED)) + return -1; + if (eia->i_touch_when == eib->i_touch_when) + return 0; + if (time_after(eia->i_touch_when, eib->i_touch_when)) + return 1; + else + return -1; +} + +static int __ext4_es_shrink(struct ext4_sb_info *sbi, int nr_to_scan, + struct ext4_inode_info *locked_ei) +{ + struct ext4_inode_info *ei; + struct list_head *cur, *tmp; + LIST_HEAD(skipped); + int nr_shrunk = 0; + int retried = 0, skip_precached = 1, nr_skipped = 0; + + spin_lock(&sbi->s_es_lru_lock); + +retry: + list_for_each_safe(cur, tmp, &sbi->s_es_lru) { + int shrunk; + + /* + * If we have already reclaimed all extents from extent + * status tree, just stop the loop immediately. + */ + if (percpu_counter_read_positive(&sbi->s_extent_cache_cnt) == 0) + break; + + ei = list_entry(cur, struct ext4_inode_info, i_es_lru); + + /* + * Skip the inode that is newer than the last_sorted + * time. Normally we try hard to avoid shrinking + * precached inodes, but we will as a last resort. + */ + if ((sbi->s_es_last_sorted < ei->i_touch_when) || + (skip_precached && ext4_test_inode_state(&ei->vfs_inode, + EXT4_STATE_EXT_PRECACHED))) { + nr_skipped++; + list_move_tail(cur, &skipped); + continue; + } + + if (ei->i_es_lru_nr == 0 || ei == locked_ei || + !write_trylock(&ei->i_es_lock)) + continue; + + shrunk = __es_try_to_reclaim_extents(ei, nr_to_scan); + if (ei->i_es_lru_nr == 0) + list_del_init(&ei->i_es_lru); + write_unlock(&ei->i_es_lock); + + nr_shrunk += shrunk; + nr_to_scan -= shrunk; + if (nr_to_scan == 0) + break; + } + + /* Move the newer inodes into the tail of the LRU list. */ + list_splice_tail(&skipped, &sbi->s_es_lru); + INIT_LIST_HEAD(&skipped); + + /* + * If we skipped any inodes, and we weren't able to make any + * forward progress, sort the list and try again. + */ + if ((nr_shrunk == 0) && nr_skipped && !retried) { + retried++; + list_sort(NULL, &sbi->s_es_lru, ext4_inode_touch_time_cmp); + sbi->s_es_last_sorted = jiffies; + ei = list_first_entry(&sbi->s_es_lru, struct ext4_inode_info, + i_es_lru); + /* + * If there are no non-precached inodes left on the + * list, start releasing precached extents. + */ + if (ext4_test_inode_state(&ei->vfs_inode, + EXT4_STATE_EXT_PRECACHED)) + skip_precached = 0; + goto retry; + } + + spin_unlock(&sbi->s_es_lru_lock); + + if (locked_ei && nr_shrunk == 0) + nr_shrunk = __es_try_to_reclaim_extents(locked_ei, nr_to_scan); + + return nr_shrunk; +} + +static unsigned long ext4_es_count(struct shrinker *shrink, + struct shrink_control *sc) +{ + unsigned long nr; + struct ext4_sb_info *sbi; + + sbi = container_of(shrink, struct ext4_sb_info, s_es_shrinker); + nr = percpu_counter_read_positive(&sbi->s_extent_cache_cnt); + trace_ext4_es_shrink_enter(sbi->s_sb, sc->nr_to_scan, nr); + return nr; +} + +static unsigned long ext4_es_scan(struct shrinker *shrink, + struct shrink_control *sc) +{ + struct ext4_sb_info *sbi = container_of(shrink, + struct ext4_sb_info, s_es_shrinker); + int nr_to_scan = sc->nr_to_scan; + int ret, nr_shrunk; + + ret = percpu_counter_read_positive(&sbi->s_extent_cache_cnt); + trace_ext4_es_shrink_enter(sbi->s_sb, nr_to_scan, ret); + + if (!nr_to_scan) + return ret; + + nr_shrunk = __ext4_es_shrink(sbi, nr_to_scan, NULL); + + trace_ext4_es_shrink_exit(sbi->s_sb, nr_shrunk, ret); + return nr_shrunk; +} + +void ext4_es_register_shrinker(struct ext4_sb_info *sbi) +{ + INIT_LIST_HEAD(&sbi->s_es_lru); + spin_lock_init(&sbi->s_es_lru_lock); + sbi->s_es_last_sorted = 0; + sbi->s_es_shrinker.scan_objects = ext4_es_scan; + sbi->s_es_shrinker.count_objects = ext4_es_count; + sbi->s_es_shrinker.seeks = DEFAULT_SEEKS; + register_shrinker(&sbi->s_es_shrinker); +} + +void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi) +{ + unregister_shrinker(&sbi->s_es_shrinker); +} + +void ext4_es_lru_add(struct inode *inode) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + + ei->i_touch_when = jiffies; + + if (!list_empty(&ei->i_es_lru)) + return; + + spin_lock(&sbi->s_es_lru_lock); + if (list_empty(&ei->i_es_lru)) + list_add_tail(&ei->i_es_lru, &sbi->s_es_lru); + spin_unlock(&sbi->s_es_lru_lock); +} + +void ext4_es_lru_del(struct inode *inode) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + + spin_lock(&sbi->s_es_lru_lock); + if (!list_empty(&ei->i_es_lru)) + list_del_init(&ei->i_es_lru); + spin_unlock(&sbi->s_es_lru_lock); +} + +static int __es_try_to_reclaim_extents(struct ext4_inode_info *ei, + int nr_to_scan) +{ + struct inode *inode = &ei->vfs_inode; + struct ext4_es_tree *tree = &ei->i_es_tree; + struct rb_node *node; + struct extent_status *es; + unsigned long nr_shrunk = 0; + static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL, + DEFAULT_RATELIMIT_BURST); + + if (ei->i_es_lru_nr == 0) + return 0; + + if (ext4_test_inode_state(inode, EXT4_STATE_EXT_PRECACHED) && + __ratelimit(&_rs)) + ext4_warning(inode->i_sb, "forced shrink of precached extents"); + + node = rb_first(&tree->root); + while (node != NULL) { + es = rb_entry(node, struct extent_status, rb_node); + node = rb_next(&es->rb_node); + /* + * We can't reclaim delayed extent from status tree because + * fiemap, bigallic, and seek_data/hole need to use it. + */ + if (!ext4_es_is_delayed(es)) { + rb_erase(&es->rb_node, &tree->root); + ext4_es_free_extent(inode, es); + nr_shrunk++; + if (--nr_to_scan == 0) + break; + } + } + tree->cache_es = NULL; + return nr_shrunk; +} diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h new file mode 100644 index 00000000000..f1b62a41992 --- /dev/null +++ b/fs/ext4/extents_status.h @@ -0,0 +1,146 @@ +/* + * fs/ext4/extents_status.h + * + * Written by Yongqiang Yang <xiaoqiangnk@gmail.com> + * Modified by + * Allison Henderson <achender@linux.vnet.ibm.com> + * Zheng Liu <wenqing.lz@taobao.com> + * + */ + +#ifndef _EXT4_EXTENTS_STATUS_H +#define _EXT4_EXTENTS_STATUS_H + +/* + * Turn on ES_DEBUG__ to get lots of info about extent status operations. + */ +#ifdef ES_DEBUG__ +#define es_debug(fmt, ...) printk(fmt, ##__VA_ARGS__) +#else +#define es_debug(fmt, ...) no_printk(fmt, ##__VA_ARGS__) +#endif + +/* + * With ES_AGGRESSIVE_TEST defined, the result of es caching will be + * checked with old map_block's result. + */ +#define ES_AGGRESSIVE_TEST__ + +/* + * These flags live in the high bits of extent_status.es_pblk + */ +#define ES_SHIFT 60 + +#define EXTENT_STATUS_WRITTEN (1 << 3) +#define EXTENT_STATUS_UNWRITTEN (1 << 2) +#define EXTENT_STATUS_DELAYED (1 << 1) +#define EXTENT_STATUS_HOLE (1 << 0) + +#define EXTENT_STATUS_FLAGS (EXTENT_STATUS_WRITTEN | \ + EXTENT_STATUS_UNWRITTEN | \ + EXTENT_STATUS_DELAYED | \ + EXTENT_STATUS_HOLE) + +#define ES_WRITTEN (1ULL << 63) +#define ES_UNWRITTEN (1ULL << 62) +#define ES_DELAYED (1ULL << 61) +#define ES_HOLE (1ULL << 60) + +#define ES_MASK (ES_WRITTEN | ES_UNWRITTEN | \ + ES_DELAYED | ES_HOLE) + +struct ext4_sb_info; +struct ext4_extent; + +struct extent_status { + struct rb_node rb_node; + ext4_lblk_t es_lblk; /* first logical block extent covers */ + ext4_lblk_t es_len; /* length of extent in block */ + ext4_fsblk_t es_pblk; /* first physical block */ +}; + +struct ext4_es_tree { + struct rb_root root; + struct extent_status *cache_es; /* recently accessed extent */ +}; + +extern int __init ext4_init_es(void); +extern void ext4_exit_es(void); +extern void ext4_es_init_tree(struct ext4_es_tree *tree); + +extern int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk, + ext4_lblk_t len, ext4_fsblk_t pblk, + unsigned int status); +extern void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk, + ext4_lblk_t len, ext4_fsblk_t pblk, + unsigned int status); +extern int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk, + ext4_lblk_t len); +extern void ext4_es_find_delayed_extent_range(struct inode *inode, + ext4_lblk_t lblk, ext4_lblk_t end, + struct extent_status *es); +extern int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk, + struct extent_status *es); + +static inline int ext4_es_is_written(struct extent_status *es) +{ + return (es->es_pblk & ES_WRITTEN) != 0; +} + +static inline int ext4_es_is_unwritten(struct extent_status *es) +{ + return (es->es_pblk & ES_UNWRITTEN) != 0; +} + +static inline int ext4_es_is_delayed(struct extent_status *es) +{ + return (es->es_pblk & ES_DELAYED) != 0; +} + +static inline int ext4_es_is_hole(struct extent_status *es) +{ + return (es->es_pblk & ES_HOLE) != 0; +} + +static inline unsigned int ext4_es_status(struct extent_status *es) +{ + return es->es_pblk >> ES_SHIFT; +} + +static inline ext4_fsblk_t ext4_es_pblock(struct extent_status *es) +{ + return es->es_pblk & ~ES_MASK; +} + +static inline void ext4_es_store_pblock(struct extent_status *es, + ext4_fsblk_t pb) +{ + ext4_fsblk_t block; + + block = (pb & ~ES_MASK) | (es->es_pblk & ES_MASK); + es->es_pblk = block; +} + +static inline void ext4_es_store_status(struct extent_status *es, + unsigned int status) +{ + es->es_pblk = (((ext4_fsblk_t) + (status & EXTENT_STATUS_FLAGS) << ES_SHIFT) | + (es->es_pblk & ~ES_MASK)); +} + +static inline void ext4_es_store_pblock_status(struct extent_status *es, + ext4_fsblk_t pb, + unsigned int status) +{ + es->es_pblk = (((ext4_fsblk_t) + (status & EXTENT_STATUS_FLAGS) << ES_SHIFT) | + (pb & ~ES_MASK)); +} + +extern void ext4_es_register_shrinker(struct ext4_sb_info *sbi); +extern void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi); +extern void ext4_es_lru_add(struct inode *inode); +extern void ext4_es_lru_del(struct inode *inode); + +#endif /* _EXT4_EXTENTS_STATUS_H */ diff --git a/fs/ext4/file.c b/fs/ext4/file.c index cb70f1812a7..8695f70af1e 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -23,7 +23,9 @@ #include <linux/jbd2.h> #include <linux/mount.h> #include <linux/path.h> +#include <linux/aio.h> #include <linux/quotaops.h> +#include <linux/pagevec.h> #include "ext4.h" #include "ext4_jbd2.h" #include "xattr.h" @@ -55,11 +57,11 @@ static int ext4_release_file(struct inode *inode, struct file *filp) return 0; } -static void ext4_aiodio_wait(struct inode *inode) +static void ext4_unwritten_wait(struct inode *inode) { wait_queue_head_t *wq = ext4_ioend_wq(inode); - wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_aiodio_unwritten) == 0)); + wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_unwritten) == 0)); } /* @@ -72,78 +74,128 @@ static void ext4_aiodio_wait(struct inode *inode) * or one thread will zero the other's data, causing corruption. */ static int -ext4_unaligned_aio(struct inode *inode, const struct iovec *iov, - unsigned long nr_segs, loff_t pos) +ext4_unaligned_aio(struct inode *inode, struct iov_iter *from, loff_t pos) { struct super_block *sb = inode->i_sb; int blockmask = sb->s_blocksize - 1; - size_t count = iov_length(iov, nr_segs); - loff_t final_size = pos + count; - if (pos >= inode->i_size) + if (pos >= i_size_read(inode)) return 0; - if ((pos & blockmask) || (final_size & blockmask)) + if ((pos | iov_iter_alignment(from)) & blockmask) return 1; return 0; } static ssize_t -ext4_file_write(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos) +ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) { - struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode; - int unaligned_aio = 0; - int ret; + struct file *file = iocb->ki_filp; + struct inode *inode = file_inode(iocb->ki_filp); + struct mutex *aio_mutex = NULL; + struct blk_plug plug; + int o_direct = file->f_flags & O_DIRECT; + int overwrite = 0; + size_t length = iov_iter_count(from); + ssize_t ret; + loff_t pos = iocb->ki_pos; + + /* + * Unaligned direct AIO must be serialized; see comment above + * In the case of O_APPEND, assume that we must always serialize + */ + if (o_direct && + ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) && + !is_sync_kiocb(iocb) && + (file->f_flags & O_APPEND || + ext4_unaligned_aio(inode, from, pos))) { + aio_mutex = ext4_aio_mutex(inode); + mutex_lock(aio_mutex); + ext4_unwritten_wait(inode); + } + + mutex_lock(&inode->i_mutex); + if (file->f_flags & O_APPEND) + iocb->ki_pos = pos = i_size_read(inode); /* * If we have encountered a bitmap-format file, the size limit * is smaller than s_maxbytes, which is for extent-mapped files. */ - if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); - size_t length = iov_length(iov, nr_segs); - - if ((pos > sbi->s_bitmap_maxbytes || - (pos == sbi->s_bitmap_maxbytes && length > 0))) - return -EFBIG; - if (pos + length > sbi->s_bitmap_maxbytes) { - nr_segs = iov_shorten((struct iovec *)iov, nr_segs, - sbi->s_bitmap_maxbytes - pos); + if ((pos > sbi->s_bitmap_maxbytes) || + (pos == sbi->s_bitmap_maxbytes && length > 0)) { + mutex_unlock(&inode->i_mutex); + ret = -EFBIG; + goto errout; } - } else if (unlikely((iocb->ki_filp->f_flags & O_DIRECT) && - !is_sync_kiocb(iocb))) { - unaligned_aio = ext4_unaligned_aio(inode, iov, nr_segs, pos); + + if (pos + length > sbi->s_bitmap_maxbytes) + iov_iter_truncate(from, sbi->s_bitmap_maxbytes - pos); } - /* Unaligned direct AIO must be serialized; see comment above */ - if (unaligned_aio) { - static unsigned long unaligned_warn_time; - - /* Warn about this once per day */ - if (printk_timed_ratelimit(&unaligned_warn_time, 60*60*24*HZ)) - ext4_msg(inode->i_sb, KERN_WARNING, - "Unaligned AIO/DIO on inode %ld by %s; " - "performance will be poor.", - inode->i_ino, current->comm); - mutex_lock(ext4_aio_mutex(inode)); - ext4_aiodio_wait(inode); + if (o_direct) { + blk_start_plug(&plug); + + iocb->private = &overwrite; + + /* check whether we do a DIO overwrite or not */ + if (ext4_should_dioread_nolock(inode) && !aio_mutex && + !file->f_mapping->nrpages && pos + length <= i_size_read(inode)) { + struct ext4_map_blocks map; + unsigned int blkbits = inode->i_blkbits; + int err, len; + + map.m_lblk = pos >> blkbits; + map.m_len = (EXT4_BLOCK_ALIGN(pos + length, blkbits) >> blkbits) + - map.m_lblk; + len = map.m_len; + + err = ext4_map_blocks(NULL, inode, &map, 0); + /* + * 'err==len' means that all of blocks has + * been preallocated no matter they are + * initialized or not. For excluding + * unwritten extents, we need to check + * m_flags. There are two conditions that + * indicate for initialized extents. 1) If we + * hit extent cache, EXT4_MAP_MAPPED flag is + * returned; 2) If we do a real lookup, + * non-flags are returned. So we should check + * these two conditions. + */ + if (err == len && (map.m_flags & EXT4_MAP_MAPPED)) + overwrite = 1; + } } - ret = generic_file_aio_write(iocb, iov, nr_segs, pos); + ret = __generic_file_write_iter(iocb, from); + mutex_unlock(&inode->i_mutex); - if (unaligned_aio) - mutex_unlock(ext4_aio_mutex(inode)); + if (ret > 0) { + ssize_t err; + err = generic_write_sync(file, iocb->ki_pos - ret, ret); + if (err < 0) + ret = err; + } + if (o_direct) + blk_finish_plug(&plug); + +errout: + if (aio_mutex) + mutex_unlock(aio_mutex); return ret; } static const struct vm_operations_struct ext4_file_vm_ops = { .fault = filemap_fault, + .map_pages = filemap_map_pages, .page_mkwrite = ext4_page_mkwrite, + .remap_pages = generic_file_remap_pages, }; static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma) @@ -154,7 +206,6 @@ static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma) return -ENOEXEC; file_accessed(file); vma->vm_ops = &ext4_file_vm_ops; - vma->vm_flags |= VM_CAN_NONLINEAR; return 0; } @@ -162,7 +213,6 @@ static int ext4_file_open(struct inode * inode, struct file * filp) { struct super_block *sb = inode->i_sb; struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); - struct ext4_inode_info *ei = EXT4_I(inode); struct vfsmount *mnt = filp->f_path.mnt; struct path path; char buf[64], *cp; @@ -181,41 +231,336 @@ static int ext4_file_open(struct inode * inode, struct file * filp) path.dentry = mnt->mnt_root; cp = d_path(&path, buf, sizeof(buf)); if (!IS_ERR(cp)) { + handle_t *handle; + int err; + + handle = ext4_journal_start_sb(sb, EXT4_HT_MISC, 1); + if (IS_ERR(handle)) + return PTR_ERR(handle); + BUFFER_TRACE(sbi->s_sbh, "get_write_access"); + err = ext4_journal_get_write_access(handle, sbi->s_sbh); + if (err) { + ext4_journal_stop(handle); + return err; + } strlcpy(sbi->s_es->s_last_mounted, cp, sizeof(sbi->s_es->s_last_mounted)); - ext4_mark_super_dirty(sb); + ext4_handle_dirty_super(handle, sb); + ext4_journal_stop(handle); } } /* * Set up the jbd2_inode if we are opening the inode for * writing and the journal is present */ - if (sbi->s_journal && !ei->jinode && (filp->f_mode & FMODE_WRITE)) { - struct jbd2_inode *jinode = jbd2_alloc_inode(GFP_KERNEL); - - spin_lock(&inode->i_lock); - if (!ei->jinode) { - if (!jinode) { - spin_unlock(&inode->i_lock); - return -ENOMEM; + if (filp->f_mode & FMODE_WRITE) { + int ret = ext4_inode_attach_jinode(inode); + if (ret < 0) + return ret; + } + return dquot_file_open(inode, filp); +} + +/* + * Here we use ext4_map_blocks() to get a block mapping for a extent-based + * file rather than ext4_ext_walk_space() because we can introduce + * SEEK_DATA/SEEK_HOLE for block-mapped and extent-mapped file at the same + * function. When extent status tree has been fully implemented, it will + * track all extent status for a file and we can directly use it to + * retrieve the offset for SEEK_DATA/SEEK_HOLE. + */ + +/* + * When we retrieve the offset for SEEK_DATA/SEEK_HOLE, we would need to + * lookup page cache to check whether or not there has some data between + * [startoff, endoff] because, if this range contains an unwritten extent, + * we determine this extent as a data or a hole according to whether the + * page cache has data or not. + */ +static int ext4_find_unwritten_pgoff(struct inode *inode, + int whence, + struct ext4_map_blocks *map, + loff_t *offset) +{ + struct pagevec pvec; + unsigned int blkbits; + pgoff_t index; + pgoff_t end; + loff_t endoff; + loff_t startoff; + loff_t lastoff; + int found = 0; + + blkbits = inode->i_sb->s_blocksize_bits; + startoff = *offset; + lastoff = startoff; + endoff = (loff_t)(map->m_lblk + map->m_len) << blkbits; + + index = startoff >> PAGE_CACHE_SHIFT; + end = endoff >> PAGE_CACHE_SHIFT; + + pagevec_init(&pvec, 0); + do { + int i, num; + unsigned long nr_pages; + + num = min_t(pgoff_t, end - index, PAGEVEC_SIZE); + nr_pages = pagevec_lookup(&pvec, inode->i_mapping, index, + (pgoff_t)num); + if (nr_pages == 0) { + if (whence == SEEK_DATA) + break; + + BUG_ON(whence != SEEK_HOLE); + /* + * If this is the first time to go into the loop and + * offset is not beyond the end offset, it will be a + * hole at this offset + */ + if (lastoff == startoff || lastoff < endoff) + found = 1; + break; + } + + /* + * If this is the first time to go into the loop and + * offset is smaller than the first page offset, it will be a + * hole at this offset. + */ + if (lastoff == startoff && whence == SEEK_HOLE && + lastoff < page_offset(pvec.pages[0])) { + found = 1; + break; + } + + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + struct buffer_head *bh, *head; + + /* + * If the current offset is not beyond the end of given + * range, it will be a hole. + */ + if (lastoff < endoff && whence == SEEK_HOLE && + page->index > end) { + found = 1; + *offset = lastoff; + goto out; + } + + lock_page(page); + + if (unlikely(page->mapping != inode->i_mapping)) { + unlock_page(page); + continue; + } + + if (!page_has_buffers(page)) { + unlock_page(page); + continue; } - ei->jinode = jinode; - jbd2_journal_init_jbd_inode(ei->jinode, inode); - jinode = NULL; + + if (page_has_buffers(page)) { + lastoff = page_offset(page); + bh = head = page_buffers(page); + do { + if (buffer_uptodate(bh) || + buffer_unwritten(bh)) { + if (whence == SEEK_DATA) + found = 1; + } else { + if (whence == SEEK_HOLE) + found = 1; + } + if (found) { + *offset = max_t(loff_t, + startoff, lastoff); + unlock_page(page); + goto out; + } + lastoff += bh->b_size; + bh = bh->b_this_page; + } while (bh != head); + } + + lastoff = page_offset(page) + PAGE_SIZE; + unlock_page(page); } - spin_unlock(&inode->i_lock); - if (unlikely(jinode != NULL)) - jbd2_free_inode(jinode); + + /* + * The no. of pages is less than our desired, that would be a + * hole in there. + */ + if (nr_pages < num && whence == SEEK_HOLE) { + found = 1; + *offset = lastoff; + break; + } + + index = pvec.pages[i - 1]->index + 1; + pagevec_release(&pvec); + } while (index <= end); + +out: + pagevec_release(&pvec); + return found; +} + +/* + * ext4_seek_data() retrieves the offset for SEEK_DATA. + */ +static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize) +{ + struct inode *inode = file->f_mapping->host; + struct ext4_map_blocks map; + struct extent_status es; + ext4_lblk_t start, last, end; + loff_t dataoff, isize; + int blkbits; + int ret = 0; + + mutex_lock(&inode->i_mutex); + + isize = i_size_read(inode); + if (offset >= isize) { + mutex_unlock(&inode->i_mutex); + return -ENXIO; } - return dquot_file_open(inode, filp); + + blkbits = inode->i_sb->s_blocksize_bits; + start = offset >> blkbits; + last = start; + end = isize >> blkbits; + dataoff = offset; + + do { + map.m_lblk = last; + map.m_len = end - last + 1; + ret = ext4_map_blocks(NULL, inode, &map, 0); + if (ret > 0 && !(map.m_flags & EXT4_MAP_UNWRITTEN)) { + if (last != start) + dataoff = (loff_t)last << blkbits; + break; + } + + /* + * If there is a delay extent at this offset, + * it will be as a data. + */ + ext4_es_find_delayed_extent_range(inode, last, last, &es); + if (es.es_len != 0 && in_range(last, es.es_lblk, es.es_len)) { + if (last != start) + dataoff = (loff_t)last << blkbits; + break; + } + + /* + * If there is a unwritten extent at this offset, + * it will be as a data or a hole according to page + * cache that has data or not. + */ + if (map.m_flags & EXT4_MAP_UNWRITTEN) { + int unwritten; + unwritten = ext4_find_unwritten_pgoff(inode, SEEK_DATA, + &map, &dataoff); + if (unwritten) + break; + } + + last++; + dataoff = (loff_t)last << blkbits; + } while (last <= end); + + mutex_unlock(&inode->i_mutex); + + if (dataoff > isize) + return -ENXIO; + + return vfs_setpos(file, dataoff, maxsize); +} + +/* + * ext4_seek_hole() retrieves the offset for SEEK_HOLE. + */ +static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize) +{ + struct inode *inode = file->f_mapping->host; + struct ext4_map_blocks map; + struct extent_status es; + ext4_lblk_t start, last, end; + loff_t holeoff, isize; + int blkbits; + int ret = 0; + + mutex_lock(&inode->i_mutex); + + isize = i_size_read(inode); + if (offset >= isize) { + mutex_unlock(&inode->i_mutex); + return -ENXIO; + } + + blkbits = inode->i_sb->s_blocksize_bits; + start = offset >> blkbits; + last = start; + end = isize >> blkbits; + holeoff = offset; + + do { + map.m_lblk = last; + map.m_len = end - last + 1; + ret = ext4_map_blocks(NULL, inode, &map, 0); + if (ret > 0 && !(map.m_flags & EXT4_MAP_UNWRITTEN)) { + last += ret; + holeoff = (loff_t)last << blkbits; + continue; + } + + /* + * If there is a delay extent at this offset, + * we will skip this extent. + */ + ext4_es_find_delayed_extent_range(inode, last, last, &es); + if (es.es_len != 0 && in_range(last, es.es_lblk, es.es_len)) { + last = es.es_lblk + es.es_len; + holeoff = (loff_t)last << blkbits; + continue; + } + + /* + * If there is a unwritten extent at this offset, + * it will be as a data or a hole according to page + * cache that has data or not. + */ + if (map.m_flags & EXT4_MAP_UNWRITTEN) { + int unwritten; + unwritten = ext4_find_unwritten_pgoff(inode, SEEK_HOLE, + &map, &holeoff); + if (!unwritten) { + last += ret; + holeoff = (loff_t)last << blkbits; + continue; + } + } + + /* find a hole */ + break; + } while (last <= end); + + mutex_unlock(&inode->i_mutex); + + if (holeoff > isize) + holeoff = isize; + + return vfs_setpos(file, holeoff, maxsize); } /* - * ext4_llseek() copied from generic_file_llseek() to handle both - * block-mapped and extent-mapped maxbytes values. This should - * otherwise be identical with generic_file_llseek(). + * ext4_llseek() handles both block-mapped and extent-mapped maxbytes values + * by calling generic_file_llseek_size() with the appropriate maxbytes + * value for each. */ -loff_t ext4_llseek(struct file *file, loff_t offset, int origin) +loff_t ext4_llseek(struct file *file, loff_t offset, int whence) { struct inode *inode = file->f_mapping->host; loff_t maxbytes; @@ -225,15 +570,27 @@ loff_t ext4_llseek(struct file *file, loff_t offset, int origin) else maxbytes = inode->i_sb->s_maxbytes; - return generic_file_llseek_size(file, offset, origin, maxbytes); + switch (whence) { + case SEEK_SET: + case SEEK_CUR: + case SEEK_END: + return generic_file_llseek_size(file, offset, whence, + maxbytes, i_size_read(inode)); + case SEEK_DATA: + return ext4_seek_data(file, offset, maxbytes); + case SEEK_HOLE: + return ext4_seek_hole(file, offset, maxbytes); + } + + return -EINVAL; } const struct file_operations ext4_file_operations = { .llseek = ext4_llseek, - .read = do_sync_read, - .write = do_sync_write, - .aio_read = generic_file_aio_read, - .aio_write = ext4_file_write, + .read = new_sync_read, + .write = new_sync_write, + .read_iter = generic_file_read_iter, + .write_iter = ext4_file_write_iter, .unlocked_ioctl = ext4_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = ext4_compat_ioctl, @@ -243,20 +600,19 @@ const struct file_operations ext4_file_operations = { .release = ext4_release_file, .fsync = ext4_sync_file, .splice_read = generic_file_splice_read, - .splice_write = generic_file_splice_write, + .splice_write = iter_file_splice_write, .fallocate = ext4_fallocate, }; const struct inode_operations ext4_file_inode_operations = { .setattr = ext4_setattr, .getattr = ext4_getattr, -#ifdef CONFIG_EXT4_FS_XATTR .setxattr = generic_setxattr, .getxattr = generic_getxattr, .listxattr = ext4_listxattr, .removexattr = generic_removexattr, -#endif .get_acl = ext4_get_acl, + .set_acl = ext4_set_acl, .fiemap = ext4_fiemap, }; diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c index 00a2cb753ef..a8bc47f75fa 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c @@ -34,85 +34,6 @@ #include <trace/events/ext4.h> -static void dump_completed_IO(struct inode * inode) -{ -#ifdef EXT4FS_DEBUG - struct list_head *cur, *before, *after; - ext4_io_end_t *io, *io0, *io1; - unsigned long flags; - - if (list_empty(&EXT4_I(inode)->i_completed_io_list)){ - ext4_debug("inode %lu completed_io list is empty\n", inode->i_ino); - return; - } - - ext4_debug("Dump inode %lu completed_io list \n", inode->i_ino); - spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags); - list_for_each_entry(io, &EXT4_I(inode)->i_completed_io_list, list){ - cur = &io->list; - before = cur->prev; - io0 = container_of(before, ext4_io_end_t, list); - after = cur->next; - io1 = container_of(after, ext4_io_end_t, list); - - ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n", - io, inode->i_ino, io0, io1); - } - spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags); -#endif -} - -/* - * This function is called from ext4_sync_file(). - * - * When IO is completed, the work to convert unwritten extents to - * written is queued on workqueue but may not get immediately - * scheduled. When fsync is called, we need to ensure the - * conversion is complete before fsync returns. - * The inode keeps track of a list of pending/completed IO that - * might needs to do the conversion. This function walks through - * the list and convert the related unwritten extents for completed IO - * to written. - * The function return the number of pending IOs on success. - */ -int ext4_flush_completed_IO(struct inode *inode) -{ - ext4_io_end_t *io; - struct ext4_inode_info *ei = EXT4_I(inode); - unsigned long flags; - int ret = 0; - int ret2 = 0; - - dump_completed_IO(inode); - spin_lock_irqsave(&ei->i_completed_io_lock, flags); - while (!list_empty(&ei->i_completed_io_list)){ - io = list_entry(ei->i_completed_io_list.next, - ext4_io_end_t, list); - list_del_init(&io->list); - /* - * Calling ext4_end_io_nolock() to convert completed - * IO to written. - * - * When ext4_sync_file() is called, run_queue() may already - * about to flush the work corresponding to this io structure. - * It will be upset if it founds the io structure related - * to the work-to-be schedule is freed. - * - * Thus we need to keep the io structure still valid here after - * conversion finished. The io structure has a flag to - * avoid double converting from both fsync and background work - * queue work. - */ - spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); - ret = ext4_end_io_nolock(io); - if (ret < 0) - ret2 = ret; - spin_lock_irqsave(&ei->i_completed_io_lock, flags); - } - spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); - return (ret2 < 0) ? ret2 : 0; -} - /* * If we're not journaling and this is a just-created file, we have to * sync our parent directory (if it was freshly created) since @@ -123,7 +44,6 @@ int ext4_flush_completed_IO(struct inode *inode) */ static int ext4_sync_parent(struct inode *inode) { - struct writeback_control wbc; struct dentry *dentry = NULL; struct inode *next; int ret = 0; @@ -133,14 +53,7 @@ static int ext4_sync_parent(struct inode *inode) inode = igrab(inode); while (ext4_test_inode_state(inode, EXT4_STATE_NEWENTRY)) { ext4_clear_inode_state(inode, EXT4_STATE_NEWENTRY); - dentry = NULL; - spin_lock(&inode->i_lock); - if (!list_empty(&inode->i_dentry)) { - dentry = list_first_entry(&inode->i_dentry, - struct dentry, d_alias); - dget(dentry); - } - spin_unlock(&inode->i_lock); + dentry = d_find_any_alias(inode); if (!dentry) break; next = igrab(dentry->d_parent->d_inode); @@ -152,10 +65,7 @@ static int ext4_sync_parent(struct inode *inode) ret = sync_mapping_buffers(inode->i_mapping); if (ret) break; - memset(&wbc, 0, sizeof(wbc)); - wbc.sync_mode = WB_SYNC_ALL; - wbc.nr_to_write = 0; /* only write out the inode */ - ret = sync_inode(inode, &wbc); + ret = sync_inode_metadata(inode, 1); if (ret) break; } @@ -163,32 +73,6 @@ static int ext4_sync_parent(struct inode *inode) return ret; } -/** - * __sync_file - generic_file_fsync without the locking and filemap_write - * @inode: inode to sync - * @datasync: only sync essential metadata if true - * - * This is just generic_file_fsync without the locking. This is needed for - * nojournal mode to make sure this inodes data/metadata makes it to disk - * properly. The i_mutex should be held already. - */ -static int __sync_inode(struct inode *inode, int datasync) -{ - int err; - int ret; - - ret = sync_mapping_buffers(inode->i_mapping); - if (!(inode->i_state & I_DIRTY)) - return ret; - if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) - return ret; - - err = sync_inode_metadata(inode, 1); - if (ret == 0) - ret = err; - return ret; -} - /* * akpm: A new design for ext4_sync_file(). * @@ -199,8 +83,6 @@ static int __sync_inode(struct inode *inode, int datasync) * * What we do is just kick off a commit and wait on it. This will snapshot the * inode to disk. - * - * i_mutex lock is held when entering and exiting this function */ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync) @@ -208,7 +90,7 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync) struct inode *inode = file->f_mapping->host; struct ext4_inode_info *ei = EXT4_I(inode); journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; - int ret; + int ret = 0, err; tid_t commit_tid; bool needs_barrier = false; @@ -216,25 +98,24 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync) trace_ext4_sync_file_enter(file, datasync); - ret = filemap_write_and_wait_range(inode->i_mapping, start, end); - if (ret) - return ret; - mutex_lock(&inode->i_mutex); - - if (inode->i_sb->s_flags & MS_RDONLY) - goto out; - - ret = ext4_flush_completed_IO(inode); - if (ret < 0) + if (inode->i_sb->s_flags & MS_RDONLY) { + /* Make sure that we read updated s_mount_flags value */ + smp_rmb(); + if (EXT4_SB(inode->i_sb)->s_mount_flags & EXT4_MF_FS_ABORTED) + ret = -EROFS; goto out; + } if (!journal) { - ret = __sync_inode(inode, datasync); - if (!ret && !list_empty(&inode->i_dentry)) + ret = generic_file_fsync(file, start, end, datasync); + if (!ret && !hlist_empty(&inode->i_dentry)) ret = ext4_sync_parent(inode); goto out; } + ret = filemap_write_and_wait_range(inode->i_mapping, start, end); + if (ret) + return ret; /* * data=writeback,ordered: * The caller's filemap_fdatawrite()/wait will sync the data. @@ -258,12 +139,13 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync) if (journal->j_flags & JBD2_BARRIER && !jbd2_trans_will_send_data_barrier(journal, commit_tid)) needs_barrier = true; - jbd2_log_start_commit(journal, commit_tid); - ret = jbd2_log_wait_commit(journal, commit_tid); - if (needs_barrier) - blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); - out: - mutex_unlock(&inode->i_mutex); + ret = jbd2_complete_transaction(journal, commit_tid); + if (needs_barrier) { + err = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); + if (!ret) + ret = err; + } +out: trace_ext4_sync_file_exit(inode, ret); return ret; } diff --git a/fs/ext4/hash.c b/fs/ext4/hash.c index ac8f168c8ab..3d586f02883 100644 --- a/fs/ext4/hash.c +++ b/fs/ext4/hash.c @@ -155,11 +155,11 @@ int ext4fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo) /* Check to see if the seed is all zero's */ if (hinfo->seed) { for (i = 0; i < 4; i++) { - if (hinfo->seed[i]) + if (hinfo->seed[i]) { + memcpy(buf, hinfo->seed, sizeof(buf)); break; + } } - if (i < 4) - memcpy(buf, hinfo->seed, sizeof(buf)); } switch (hinfo->hash_version) { @@ -200,8 +200,8 @@ int ext4fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo) return -1; } hash = hash & ~1; - if (hash == (EXT4_HTREE_EOF << 1)) - hash = (EXT4_HTREE_EOF-1) << 1; + if (hash == (EXT4_HTREE_EOF_32BIT << 1)) + hash = (EXT4_HTREE_EOF_32BIT - 1) << 1; hinfo->hash = hash; hinfo->minor_hash = minor_hash; return 0; diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 25d8c9781ad..5b87fc36aab 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -70,28 +70,49 @@ static unsigned ext4_init_inode_bitmap(struct super_block *sb, ext4_group_t block_group, struct ext4_group_desc *gdp) { + struct ext4_group_info *grp; struct ext4_sb_info *sbi = EXT4_SB(sb); - J_ASSERT_BH(bh, buffer_locked(bh)); /* If checksum is bad mark all blocks and inodes use to prevent * allocation, essentially implementing a per-group read-only flag. */ - if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) { + if (!ext4_group_desc_csum_verify(sb, block_group, gdp)) { ext4_error(sb, "Checksum bad for group %u", block_group); - ext4_free_group_clusters_set(sb, gdp, 0); - ext4_free_inodes_set(sb, gdp, 0); - ext4_itable_unused_set(sb, gdp, 0); - memset(bh->b_data, 0xff, sb->s_blocksize); + grp = ext4_get_group_info(sb, block_group); + if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) + percpu_counter_sub(&sbi->s_freeclusters_counter, + grp->bb_free); + set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state); + if (!EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) { + int count; + count = ext4_free_inodes_count(sb, gdp); + percpu_counter_sub(&sbi->s_freeinodes_counter, + count); + } + set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state); return 0; } memset(bh->b_data, 0, (EXT4_INODES_PER_GROUP(sb) + 7) / 8); ext4_mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8, bh->b_data); + ext4_inode_bitmap_csum_set(sb, block_group, gdp, bh, + EXT4_INODES_PER_GROUP(sb) / 8); + ext4_group_desc_csum_set(sb, block_group, gdp); return EXT4_INODES_PER_GROUP(sb); } +void ext4_end_bitmap_read(struct buffer_head *bh, int uptodate) +{ + if (uptodate) { + set_buffer_uptodate(bh); + set_bitmap_uptodate(bh); + } + unlock_buffer(bh); + put_bh(bh); +} + /* * Read the inode allocation bitmap for a given block_group, reading * into the specified slot in the superblock's bitmap cache. @@ -104,6 +125,8 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group) struct ext4_group_desc *desc; struct buffer_head *bh = NULL; ext4_fsblk_t bitmap_blk; + struct ext4_group_info *grp; + struct ext4_sb_info *sbi = EXT4_SB(sb); desc = ext4_get_group_desc(sb, block_group, NULL); if (!desc) @@ -118,12 +141,12 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group) return NULL; } if (bitmap_uptodate(bh)) - return bh; + goto verify; lock_buffer(bh); if (bitmap_uptodate(bh)) { unlock_buffer(bh); - return bh; + goto verify; } ext4_lock_group(sb, block_group); @@ -131,6 +154,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group) ext4_init_inode_bitmap(sb, bh, block_group, desc); set_bitmap_uptodate(bh); set_buffer_uptodate(bh); + set_buffer_verified(bh); ext4_unlock_group(sb, block_group); unlock_buffer(bh); return bh; @@ -144,23 +168,45 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group) */ set_bitmap_uptodate(bh); unlock_buffer(bh); - return bh; + goto verify; } /* - * submit the buffer_head for read. We can - * safely mark the bitmap as uptodate now. - * We do it here so the bitmap uptodate bit - * get set with buffer lock held. + * submit the buffer_head for reading */ trace_ext4_load_inode_bitmap(sb, block_group); - set_bitmap_uptodate(bh); - if (bh_submit_read(bh) < 0) { + bh->b_end_io = ext4_end_bitmap_read; + get_bh(bh); + submit_bh(READ | REQ_META | REQ_PRIO, bh); + wait_on_buffer(bh); + if (!buffer_uptodate(bh)) { put_bh(bh); ext4_error(sb, "Cannot read inode bitmap - " - "block_group = %u, inode_bitmap = %llu", - block_group, bitmap_blk); + "block_group = %u, inode_bitmap = %llu", + block_group, bitmap_blk); return NULL; } + +verify: + ext4_lock_group(sb, block_group); + if (!buffer_verified(bh) && + !ext4_inode_bitmap_csum_verify(sb, block_group, desc, bh, + EXT4_INODES_PER_GROUP(sb) / 8)) { + ext4_unlock_group(sb, block_group); + put_bh(bh); + ext4_error(sb, "Corrupt inode bitmap - block_group = %u, " + "inode_bitmap = %llu", block_group, bitmap_blk); + grp = ext4_get_group_info(sb, block_group); + if (!EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) { + int count; + count = ext4_free_inodes_count(sb, desc); + percpu_counter_sub(&sbi->s_freeinodes_counter, + count); + } + set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state); + return NULL; + } + ext4_unlock_group(sb, block_group); + set_buffer_verified(bh); return bh; } @@ -193,20 +239,22 @@ void ext4_free_inode(handle_t *handle, struct inode *inode) struct ext4_super_block *es; struct ext4_sb_info *sbi; int fatal = 0, err, count, cleared; + struct ext4_group_info *grp; - if (atomic_read(&inode->i_count) > 1) { - printk(KERN_ERR "ext4_free_inode: inode has count=%d\n", - atomic_read(&inode->i_count)); + if (!sb) { + printk(KERN_ERR "EXT4-fs: %s:%d: inode on " + "nonexistent device\n", __func__, __LINE__); return; } - if (inode->i_nlink) { - printk(KERN_ERR "ext4_free_inode: inode has nlink=%d\n", - inode->i_nlink); + if (atomic_read(&inode->i_count) > 1) { + ext4_msg(sb, KERN_ERR, "%s:%d: inode #%lu: count=%d", + __func__, __LINE__, inode->i_ino, + atomic_read(&inode->i_count)); return; } - if (!sb) { - printk(KERN_ERR "ext4_free_inode: inode on " - "nonexistent device\n"); + if (inode->i_nlink) { + ext4_msg(sb, KERN_ERR, "%s:%d: inode #%lu: nlink=%d\n", + __func__, __LINE__, inode->i_ino, inode->i_nlink); return; } sbi = EXT4_SB(sb); @@ -237,7 +285,9 @@ void ext4_free_inode(handle_t *handle, struct inode *inode) block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb); bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb); bitmap_bh = ext4_read_inode_bitmap(sb, block_group); - if (!bitmap_bh) + /* Don't bother if the inode bitmap is corrupt. */ + grp = ext4_get_group_info(sb, block_group); + if (unlikely(EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) || !bitmap_bh) goto error_return; BUFFER_TRACE(bitmap_bh, "get_write_access"); @@ -265,7 +315,9 @@ void ext4_free_inode(handle_t *handle, struct inode *inode) ext4_used_dirs_set(sb, gdp, count); percpu_counter_dec(&sbi->s_dirs_counter); } - gdp->bg_checksum = ext4_group_desc_csum(sbi, block_group, gdp); + ext4_inode_bitmap_csum_set(sb, block_group, gdp, bitmap_bh, + EXT4_INODES_PER_GROUP(sb) / 8); + ext4_group_desc_csum_set(sb, block_group, gdp); ext4_unlock_group(sb, block_group); percpu_counter_inc(&sbi->s_freeinodes_counter); @@ -284,9 +336,16 @@ out: err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); if (!fatal) fatal = err; - ext4_mark_super_dirty(sb); - } else + } else { ext4_error(sb, "bit already cleared for inode %lu", ino); + if (gdp && !EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) { + int count; + count = ext4_free_inodes_count(sb, gdp); + percpu_counter_sub(&sbi->s_freeinodes_counter, + count); + } + set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state); + } error_return: brelse(bitmap_bh); @@ -294,8 +353,8 @@ error_return: } struct orlov_stats { + __u64 free_clusters; __u32 free_inodes; - __u32 free_clusters; __u32 used_dirs; }; @@ -312,7 +371,7 @@ static void get_orlov_stats(struct super_block *sb, ext4_group_t g, if (flex_size > 1) { stats->free_inodes = atomic_read(&flex_group[g].free_inodes); - stats->free_clusters = atomic_read(&flex_group[g].free_clusters); + stats->free_clusters = atomic64_read(&flex_group[g].free_clusters); stats->used_dirs = atomic_read(&flex_group[g].used_dirs); return; } @@ -396,7 +455,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent, ext4fs_dirhash(qstr->name, qstr->len, &hinfo); grp = hinfo.hash; } else - get_random_bytes(&grp, sizeof(grp)); + grp = prandom_u32(); parent_group = (unsigned)grp % ngroups; for (i = 0; i < ngroups; i++) { g = (parent_group + i) % ngroups; @@ -477,10 +536,12 @@ fallback_retry: for (i = 0; i < ngroups; i++) { grp = (parent_group + i) % ngroups; desc = ext4_get_group_desc(sb, grp, NULL); - grp_free = ext4_free_inodes_count(sb, desc); - if (desc && grp_free && grp_free >= avefreei) { - *group = grp; - return 0; + if (desc) { + grp_free = ext4_free_inodes_count(sb, desc); + if (grp_free && grp_free >= avefreei) { + *group = grp; + return 0; + } } } @@ -593,91 +654,48 @@ static int find_group_other(struct super_block *sb, struct inode *parent, } /* - * claim the inode from the inode bitmap. If the group - * is uninit we need to take the groups's ext4_group_lock - * and clear the uninit flag. The inode bitmap update - * and group desc uninit flag clear should be done - * after holding ext4_group_lock so that ext4_read_inode_bitmap - * doesn't race with the ext4_claim_inode + * In no journal mode, if an inode has recently been deleted, we want + * to avoid reusing it until we're reasonably sure the inode table + * block has been written back to disk. (Yes, these values are + * somewhat arbitrary...) */ -static int ext4_claim_inode(struct super_block *sb, - struct buffer_head *inode_bitmap_bh, - unsigned long ino, ext4_group_t group, umode_t mode) -{ - int free = 0, retval = 0, count; - struct ext4_sb_info *sbi = EXT4_SB(sb); - struct ext4_group_info *grp = ext4_get_group_info(sb, group); - struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, NULL); - - /* - * We have to be sure that new inode allocation does not race with - * inode table initialization, because otherwise we may end up - * allocating and writing new inode right before sb_issue_zeroout - * takes place and overwriting our new inode with zeroes. So we - * take alloc_sem to prevent it. - */ - down_read(&grp->alloc_sem); - ext4_lock_group(sb, group); - if (ext4_test_and_set_bit(ino, inode_bitmap_bh->b_data)) { - /* not a free inode */ - retval = 1; - goto err_ret; - } - ino++; - if ((group == 0 && ino < EXT4_FIRST_INO(sb)) || - ino > EXT4_INODES_PER_GROUP(sb)) { - ext4_unlock_group(sb, group); - up_read(&grp->alloc_sem); - ext4_error(sb, "reserved inode or inode > inodes count - " - "block_group = %u, inode=%lu", group, - ino + group * EXT4_INODES_PER_GROUP(sb)); - return 1; - } - /* If we didn't allocate from within the initialized part of the inode - * table then we need to initialize up to this inode. */ - if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) { - - if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) { - gdp->bg_flags &= cpu_to_le16(~EXT4_BG_INODE_UNINIT); - /* When marking the block group with - * ~EXT4_BG_INODE_UNINIT we don't want to depend - * on the value of bg_itable_unused even though - * mke2fs could have initialized the same for us. - * Instead we calculated the value below - */ +#define RECENTCY_MIN 5 +#define RECENTCY_DIRTY 30 - free = 0; - } else { - free = EXT4_INODES_PER_GROUP(sb) - - ext4_itable_unused_count(sb, gdp); - } +static int recently_deleted(struct super_block *sb, ext4_group_t group, int ino) +{ + struct ext4_group_desc *gdp; + struct ext4_inode *raw_inode; + struct buffer_head *bh; + unsigned long dtime, now; + int inodes_per_block = EXT4_SB(sb)->s_inodes_per_block; + int offset, ret = 0, recentcy = RECENTCY_MIN; + + gdp = ext4_get_group_desc(sb, group, NULL); + if (unlikely(!gdp)) + return 0; + bh = sb_getblk(sb, ext4_inode_table(sb, gdp) + + (ino / inodes_per_block)); + if (unlikely(!bh) || !buffer_uptodate(bh)) /* - * Check the relative inode number against the last used - * relative inode number in this group. if it is greater - * we need to update the bg_itable_unused count - * + * If the block is not in the buffer cache, then it + * must have been written out. */ - if (ino > free) - ext4_itable_unused_set(sb, gdp, - (EXT4_INODES_PER_GROUP(sb) - ino)); - } - count = ext4_free_inodes_count(sb, gdp) - 1; - ext4_free_inodes_set(sb, gdp, count); - if (S_ISDIR(mode)) { - count = ext4_used_dirs_count(sb, gdp) + 1; - ext4_used_dirs_set(sb, gdp, count); - if (sbi->s_log_groups_per_flex) { - ext4_group_t f = ext4_flex_group(sbi, group); + goto out; - atomic_inc(&sbi->s_flex_groups[f].used_dirs); - } - } - gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp); -err_ret: - ext4_unlock_group(sb, group); - up_read(&grp->alloc_sem); - return retval; + offset = (ino % inodes_per_block) * EXT4_INODE_SIZE(sb); + raw_inode = (struct ext4_inode *) (bh->b_data + offset); + dtime = le32_to_cpu(raw_inode->i_dtime); + now = get_seconds(); + if (buffer_dirty(bh)) + recentcy += RECENTCY_DIRTY; + + if (dtime && (dtime < now) && (now < dtime + recentcy)) + ret = 1; +out: + brelse(bh); + return ret; } /* @@ -690,8 +708,10 @@ err_ret: * For other inodes, search forward from the parent directory's block * group to find a free inode. */ -struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, umode_t mode, - const struct qstr *qstr, __u32 goal, uid_t *owner) +struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir, + umode_t mode, const struct qstr *qstr, + __u32 goal, uid_t *owner, int handle_type, + unsigned int line_no, int nblocks) { struct super_block *sb; struct buffer_head *inode_bitmap_bh = NULL; @@ -706,6 +726,7 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, umode_t mode, struct inode *ret; ext4_group_t i; ext4_group_t flex_group; + struct ext4_group_info *grp; /* Cannot create files in a deleted directory */ if (!dir || !dir->i_nlink) @@ -720,6 +741,23 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, umode_t mode, ei = EXT4_I(inode); sbi = EXT4_SB(sb); + /* + * Initalize owners and quota early so that we don't have to account + * for quota initialization worst case in standard inode creating + * transaction + */ + if (owner) { + inode->i_mode = mode; + i_uid_write(inode, owner[0]); + i_gid_write(inode, owner[1]); + } else if (test_opt(sb, GRPID)) { + inode->i_mode = mode; + inode->i_uid = current_fsuid(); + inode->i_gid = dir->i_gid; + } else + inode_init_owner(inode, dir, mode); + dquot_initialize(inode); + if (!goal) goal = sbi->s_inode_goal; @@ -741,65 +779,87 @@ got_group: if (ret2 == -1) goto out; + /* + * Normally we will only go through one pass of this loop, + * unless we get unlucky and it turns out the group we selected + * had its last inode grabbed by someone else. + */ for (i = 0; i < ngroups; i++, ino = 0) { err = -EIO; gdp = ext4_get_group_desc(sb, group, &group_desc_bh); if (!gdp) - goto fail; + goto out; + + /* + * Check free inodes count before loading bitmap. + */ + if (ext4_free_inodes_count(sb, gdp) == 0) { + if (++group == ngroups) + group = 0; + continue; + } + + grp = ext4_get_group_info(sb, group); + /* Skip groups with already-known suspicious inode tables */ + if (EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) { + if (++group == ngroups) + group = 0; + continue; + } brelse(inode_bitmap_bh); inode_bitmap_bh = ext4_read_inode_bitmap(sb, group); - if (!inode_bitmap_bh) - goto fail; + /* Skip groups with suspicious inode tables */ + if (EXT4_MB_GRP_IBITMAP_CORRUPT(grp) || !inode_bitmap_bh) { + if (++group == ngroups) + group = 0; + continue; + } repeat_in_this_group: ino = ext4_find_next_zero_bit((unsigned long *) inode_bitmap_bh->b_data, EXT4_INODES_PER_GROUP(sb), ino); - - if (ino < EXT4_INODES_PER_GROUP(sb)) { - - BUFFER_TRACE(inode_bitmap_bh, "get_write_access"); - err = ext4_journal_get_write_access(handle, - inode_bitmap_bh); - if (err) - goto fail; - - BUFFER_TRACE(group_desc_bh, "get_write_access"); - err = ext4_journal_get_write_access(handle, - group_desc_bh); - if (err) - goto fail; - if (!ext4_claim_inode(sb, inode_bitmap_bh, - ino, group, mode)) { - /* we won it */ - BUFFER_TRACE(inode_bitmap_bh, - "call ext4_handle_dirty_metadata"); - err = ext4_handle_dirty_metadata(handle, - NULL, - inode_bitmap_bh); - if (err) - goto fail; - /* zero bit is inode number 1*/ - ino++; - goto got; + if (ino >= EXT4_INODES_PER_GROUP(sb)) + goto next_group; + if (group == 0 && (ino+1) < EXT4_FIRST_INO(sb)) { + ext4_error(sb, "reserved inode found cleared - " + "inode=%lu", ino + 1); + continue; + } + if ((EXT4_SB(sb)->s_journal == NULL) && + recently_deleted(sb, group, ino)) { + ino++; + goto next_inode; + } + if (!handle) { + BUG_ON(nblocks <= 0); + handle = __ext4_journal_start_sb(dir->i_sb, line_no, + handle_type, nblocks, + 0); + if (IS_ERR(handle)) { + err = PTR_ERR(handle); + ext4_std_error(sb, err); + goto out; } - /* we lost it */ - ext4_handle_release_buffer(handle, inode_bitmap_bh); - ext4_handle_release_buffer(handle, group_desc_bh); - - if (++ino < EXT4_INODES_PER_GROUP(sb)) - goto repeat_in_this_group; } - - /* - * This case is possible in concurrent environment. It is very - * rare. We cannot repeat the find_group_xxx() call because - * that will simply return the same blockgroup, because the - * group descriptor metadata has not yet been updated. - * So we just go onto the next blockgroup. - */ + BUFFER_TRACE(inode_bitmap_bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, inode_bitmap_bh); + if (err) { + ext4_std_error(sb, err); + goto out; + } + ext4_lock_group(sb, group); + ret2 = ext4_test_and_set_bit(ino, inode_bitmap_bh->b_data); + ext4_unlock_group(sb, group); + ino++; /* the inode bitmap is zero-based */ + if (!ret2) + goto got; /* we grabbed the inode! */ +next_inode: + if (ino < EXT4_INODES_PER_GROUP(sb)) + goto repeat_in_this_group; +next_group: if (++group == ngroups) group = 0; } @@ -807,8 +867,22 @@ repeat_in_this_group: goto out; got: + BUFFER_TRACE(inode_bitmap_bh, "call ext4_handle_dirty_metadata"); + err = ext4_handle_dirty_metadata(handle, NULL, inode_bitmap_bh); + if (err) { + ext4_std_error(sb, err); + goto out; + } + + BUFFER_TRACE(group_desc_bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, group_desc_bh); + if (err) { + ext4_std_error(sb, err); + goto out; + } + /* We may have to initialize the block bitmap if it isn't already */ - if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM) && + if (ext4_has_group_desc_csum(sb) && gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { struct buffer_head *block_bitmap_bh; @@ -817,12 +891,12 @@ got: err = ext4_journal_get_write_access(handle, block_bitmap_bh); if (err) { brelse(block_bitmap_bh); - goto fail; + ext4_std_error(sb, err); + goto out; } BUFFER_TRACE(block_bitmap_bh, "dirty block bitmap"); err = ext4_handle_dirty_metadata(handle, NULL, block_bitmap_bh); - brelse(block_bitmap_bh); /* recheck and clear flag under lock if we still need to */ ext4_lock_group(sb, group); @@ -830,38 +904,76 @@ got: gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT); ext4_free_group_clusters_set(sb, gdp, ext4_free_clusters_after_init(sb, group, gdp)); - gdp->bg_checksum = ext4_group_desc_csum(sbi, group, - gdp); + ext4_block_bitmap_csum_set(sb, group, gdp, + block_bitmap_bh); + ext4_group_desc_csum_set(sb, group, gdp); } ext4_unlock_group(sb, group); + brelse(block_bitmap_bh); - if (err) - goto fail; + if (err) { + ext4_std_error(sb, err); + goto out; + } } + + /* Update the relevant bg descriptor fields */ + if (ext4_has_group_desc_csum(sb)) { + int free; + struct ext4_group_info *grp = ext4_get_group_info(sb, group); + + down_read(&grp->alloc_sem); /* protect vs itable lazyinit */ + ext4_lock_group(sb, group); /* while we modify the bg desc */ + free = EXT4_INODES_PER_GROUP(sb) - + ext4_itable_unused_count(sb, gdp); + if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) { + gdp->bg_flags &= cpu_to_le16(~EXT4_BG_INODE_UNINIT); + free = 0; + } + /* + * Check the relative inode number against the last used + * relative inode number in this group. if it is greater + * we need to update the bg_itable_unused count + */ + if (ino > free) + ext4_itable_unused_set(sb, gdp, + (EXT4_INODES_PER_GROUP(sb) - ino)); + up_read(&grp->alloc_sem); + } else { + ext4_lock_group(sb, group); + } + + ext4_free_inodes_set(sb, gdp, ext4_free_inodes_count(sb, gdp) - 1); + if (S_ISDIR(mode)) { + ext4_used_dirs_set(sb, gdp, ext4_used_dirs_count(sb, gdp) + 1); + if (sbi->s_log_groups_per_flex) { + ext4_group_t f = ext4_flex_group(sbi, group); + + atomic_inc(&sbi->s_flex_groups[f].used_dirs); + } + } + if (ext4_has_group_desc_csum(sb)) { + ext4_inode_bitmap_csum_set(sb, group, gdp, inode_bitmap_bh, + EXT4_INODES_PER_GROUP(sb) / 8); + ext4_group_desc_csum_set(sb, group, gdp); + } + ext4_unlock_group(sb, group); + BUFFER_TRACE(group_desc_bh, "call ext4_handle_dirty_metadata"); err = ext4_handle_dirty_metadata(handle, NULL, group_desc_bh); - if (err) - goto fail; + if (err) { + ext4_std_error(sb, err); + goto out; + } percpu_counter_dec(&sbi->s_freeinodes_counter); if (S_ISDIR(mode)) percpu_counter_inc(&sbi->s_dirs_counter); - ext4_mark_super_dirty(sb); if (sbi->s_log_groups_per_flex) { flex_group = ext4_flex_group(sbi, group); atomic_dec(&sbi->s_flex_groups[flex_group].free_inodes); } - if (owner) { - inode->i_mode = mode; - inode->i_uid = owner[0]; - inode->i_gid = owner[1]; - } else if (test_opt(sb, GRPID)) { - inode->i_mode = mode; - inode->i_uid = current_fsuid(); - inode->i_gid = dir->i_gid; - } else - inode_init_owner(inode, dir, mode); inode->i_ino = ino + group * EXT4_INODES_PER_GROUP(sb); /* This is the optimal IO size (for stat), not the fs block size */ @@ -890,19 +1002,36 @@ got: * twice. */ err = -EIO; - goto fail; + ext4_error(sb, "failed to insert inode %lu: doubly allocated?", + inode->i_ino); + goto out; } spin_lock(&sbi->s_next_gen_lock); inode->i_generation = sbi->s_next_generation++; spin_unlock(&sbi->s_next_gen_lock); + /* Precompute checksum seed for inode metadata */ + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) { + __u32 csum; + __le32 inum = cpu_to_le32(inode->i_ino); + __le32 gen = cpu_to_le32(inode->i_generation); + csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&inum, + sizeof(inum)); + ei->i_csum_seed = ext4_chksum(sbi, csum, (__u8 *)&gen, + sizeof(gen)); + } + ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */ ext4_set_inode_state(inode, EXT4_STATE_NEW); ei->i_extra_isize = EXT4_SB(sb)->s_want_extra_isize; + ei->i_inline_off = 0; + if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_INLINE_DATA)) + ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); + ret = inode; - dquot_initialize(inode); err = dquot_alloc_inode(inode); if (err) goto fail_drop; @@ -936,24 +1065,17 @@ got: ext4_debug("allocating inode %lu\n", inode->i_ino); trace_ext4_allocate_inode(inode, dir, mode); - goto really_out; -fail: - ext4_std_error(sb, err); -out: - iput(inode); - ret = ERR_PTR(err); -really_out: brelse(inode_bitmap_bh); return ret; fail_free_drop: dquot_free_inode(inode); - fail_drop: - dquot_drop(inode); - inode->i_flags |= S_NOQUOTA; clear_nlink(inode); unlock_new_inode(inode); +out: + dquot_drop(inode); + inode->i_flags |= S_NOQUOTA; iput(inode); brelse(inode_bitmap_bh); return ERR_PTR(err); @@ -1012,17 +1134,17 @@ iget_failed: inode = NULL; bad_orphan: ext4_warning(sb, "bad orphan inode %lu! e2fsck was run?", ino); - printk(KERN_NOTICE "ext4_test_bit(bit=%d, block=%llu) = %d\n", + printk(KERN_WARNING "ext4_test_bit(bit=%d, block=%llu) = %d\n", bit, (unsigned long long)bitmap_bh->b_blocknr, ext4_test_bit(bit, bitmap_bh->b_data)); - printk(KERN_NOTICE "inode=%p\n", inode); + printk(KERN_WARNING "inode=%p\n", inode); if (inode) { - printk(KERN_NOTICE "is_bad_inode(inode)=%d\n", + printk(KERN_WARNING "is_bad_inode(inode)=%d\n", is_bad_inode(inode)); - printk(KERN_NOTICE "NEXT_ORPHAN(inode)=%u\n", + printk(KERN_WARNING "NEXT_ORPHAN(inode)=%u\n", NEXT_ORPHAN(inode)); - printk(KERN_NOTICE "max_ino=%lu\n", max_ino); - printk(KERN_NOTICE "i_nlink=%u\n", inode->i_nlink); + printk(KERN_WARNING "max_ino=%lu\n", max_ino); + printk(KERN_WARNING "i_nlink=%u\n", inode->i_nlink); /* Avoid freeing blocks if we got a bad deleted inode */ if (inode->i_nlink == 0) inode->i_blocks = 0; @@ -1057,7 +1179,8 @@ unsigned long ext4_count_free_inodes(struct super_block *sb) if (!bitmap_bh) continue; - x = ext4_count_free(bitmap_bh, EXT4_INODES_PER_GROUP(sb) / 8); + x = ext4_count_free(bitmap_bh->b_data, + EXT4_INODES_PER_GROUP(sb) / 8); printk(KERN_DEBUG "group %lu: stored = %d, counted = %lu\n", (unsigned long) i, ext4_free_inodes_count(sb, gdp), x); bitmap_count += x; @@ -1101,7 +1224,7 @@ unsigned long ext4_count_dirs(struct super_block * sb) * where it is called from on active part of filesystem is ext4lazyinit * thread, so we do not need any special locks, however we have to prevent * inode allocation from the current group, so we take alloc_sem lock, to - * block ext4_claim_inode until we are finished. + * block ext4_new_inode() until we are finished. */ int ext4_init_inode_table(struct super_block *sb, ext4_group_t group, int barrier) @@ -1131,7 +1254,7 @@ int ext4_init_inode_table(struct super_block *sb, ext4_group_t group, if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED)) goto out; - handle = ext4_journal_start_sb(sb, 1); + handle = ext4_journal_start_sb(sb, EXT4_HT_MISC, 1); if (IS_ERR(handle)) { ret = PTR_ERR(handle); goto out; @@ -1149,9 +1272,9 @@ int ext4_init_inode_table(struct super_block *sb, ext4_group_t group, sbi->s_inodes_per_block); if ((used_blks < 0) || (used_blks > sbi->s_itb_per_group)) { - ext4_error(sb, "Something is wrong with group %u\n" - "Used itable blocks: %d" - "itable unused count: %u\n", + ext4_error(sb, "Something is wrong with group %u: " + "used itable blocks: %d; " + "itable unused count: %u", group, used_blks, ext4_itable_unused_count(sb, gdp)); ret = 1; @@ -1186,7 +1309,7 @@ int ext4_init_inode_table(struct super_block *sb, ext4_group_t group, skip_zeroout: ext4_lock_group(sb, group); gdp->bg_flags |= cpu_to_le16(EXT4_BG_INODE_ZEROED); - gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp); + ext4_group_desc_csum_set(sb, group, gdp); ext4_unlock_group(sb, group); BUFFER_TRACE(group_desc_bh, diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index 830e1b2bf14..fd69da19482 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c @@ -20,6 +20,7 @@ * (sct@redhat.com), 1993, 1998 */ +#include <linux/aio.h> #include "ext4_jbd2.h" #include "truncate.h" @@ -145,6 +146,7 @@ static Indirect *ext4_get_branch(struct inode *inode, int depth, struct super_block *sb = inode->i_sb; Indirect *p = chain; struct buffer_head *bh; + int ret = -EIO; *err = 0; /* i_data is not going away, no lock needed */ @@ -153,8 +155,10 @@ static Indirect *ext4_get_branch(struct inode *inode, int depth, goto no_block; while (--depth) { bh = sb_getblk(sb, le32_to_cpu(p->key)); - if (unlikely(!bh)) + if (unlikely(!bh)) { + ret = -ENOMEM; goto failure; + } if (!bh_uptodate_or_lock(bh)) { if (bh_submit_read(bh) < 0) { @@ -176,7 +180,7 @@ static Indirect *ext4_get_branch(struct inode *inode, int depth, return NULL; failure: - *err = -EIO; + *err = ret; no_block: return p; } @@ -288,132 +292,6 @@ static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned int blks, } /** - * ext4_alloc_blocks: multiple allocate blocks needed for a branch - * @handle: handle for this transaction - * @inode: inode which needs allocated blocks - * @iblock: the logical block to start allocated at - * @goal: preferred physical block of allocation - * @indirect_blks: the number of blocks need to allocate for indirect - * blocks - * @blks: number of desired blocks - * @new_blocks: on return it will store the new block numbers for - * the indirect blocks(if needed) and the first direct block, - * @err: on return it will store the error code - * - * This function will return the number of blocks allocated as - * requested by the passed-in parameters. - */ -static int ext4_alloc_blocks(handle_t *handle, struct inode *inode, - ext4_lblk_t iblock, ext4_fsblk_t goal, - int indirect_blks, int blks, - ext4_fsblk_t new_blocks[4], int *err) -{ - struct ext4_allocation_request ar; - int target, i; - unsigned long count = 0, blk_allocated = 0; - int index = 0; - ext4_fsblk_t current_block = 0; - int ret = 0; - - /* - * Here we try to allocate the requested multiple blocks at once, - * on a best-effort basis. - * To build a branch, we should allocate blocks for - * the indirect blocks(if not allocated yet), and at least - * the first direct block of this branch. That's the - * minimum number of blocks need to allocate(required) - */ - /* first we try to allocate the indirect blocks */ - target = indirect_blks; - while (target > 0) { - count = target; - /* allocating blocks for indirect blocks and direct blocks */ - current_block = ext4_new_meta_blocks(handle, inode, goal, - 0, &count, err); - if (*err) - goto failed_out; - - if (unlikely(current_block + count > EXT4_MAX_BLOCK_FILE_PHYS)) { - EXT4_ERROR_INODE(inode, - "current_block %llu + count %lu > %d!", - current_block, count, - EXT4_MAX_BLOCK_FILE_PHYS); - *err = -EIO; - goto failed_out; - } - - target -= count; - /* allocate blocks for indirect blocks */ - while (index < indirect_blks && count) { - new_blocks[index++] = current_block++; - count--; - } - if (count > 0) { - /* - * save the new block number - * for the first direct block - */ - new_blocks[index] = current_block; - printk(KERN_INFO "%s returned more blocks than " - "requested\n", __func__); - WARN_ON(1); - break; - } - } - - target = blks - count ; - blk_allocated = count; - if (!target) - goto allocated; - /* Now allocate data blocks */ - memset(&ar, 0, sizeof(ar)); - ar.inode = inode; - ar.goal = goal; - ar.len = target; - ar.logical = iblock; - if (S_ISREG(inode->i_mode)) - /* enable in-core preallocation only for regular files */ - ar.flags = EXT4_MB_HINT_DATA; - - current_block = ext4_mb_new_blocks(handle, &ar, err); - if (unlikely(current_block + ar.len > EXT4_MAX_BLOCK_FILE_PHYS)) { - EXT4_ERROR_INODE(inode, - "current_block %llu + ar.len %d > %d!", - current_block, ar.len, - EXT4_MAX_BLOCK_FILE_PHYS); - *err = -EIO; - goto failed_out; - } - - if (*err && (target == blks)) { - /* - * if the allocation failed and we didn't allocate - * any blocks before - */ - goto failed_out; - } - if (!*err) { - if (target == blks) { - /* - * save the new block number - * for the first direct block - */ - new_blocks[index] = current_block; - } - blk_allocated += ar.len; - } -allocated: - /* total number of blocks allocated for direct blocks */ - ret = blk_allocated; - *err = 0; - return ret; -failed_out: - for (i = 0; i < index; i++) - ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, 0); - return ret; -} - -/** * ext4_alloc_branch - allocate and set up a chain of blocks. * @handle: handle for this transaction * @inode: owner @@ -445,60 +323,59 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode, int *blks, ext4_fsblk_t goal, ext4_lblk_t *offsets, Indirect *branch) { - int blocksize = inode->i_sb->s_blocksize; - int i, n = 0; - int err = 0; - struct buffer_head *bh; - int num; - ext4_fsblk_t new_blocks[4]; - ext4_fsblk_t current_block; - - num = ext4_alloc_blocks(handle, inode, iblock, goal, indirect_blks, - *blks, new_blocks, &err); - if (err) - return err; + struct ext4_allocation_request ar; + struct buffer_head * bh; + ext4_fsblk_t b, new_blocks[4]; + __le32 *p; + int i, j, err, len = 1; - branch[0].key = cpu_to_le32(new_blocks[0]); /* - * metadata blocks and data blocks are allocated. + * Set up for the direct block allocation */ - for (n = 1; n <= indirect_blks; n++) { - /* - * Get buffer_head for parent block, zero it out - * and set the pointer to new one, then send - * parent to disk. - */ - bh = sb_getblk(inode->i_sb, new_blocks[n-1]); - if (unlikely(!bh)) { - err = -EIO; + memset(&ar, 0, sizeof(ar)); + ar.inode = inode; + ar.len = *blks; + ar.logical = iblock; + if (S_ISREG(inode->i_mode)) + ar.flags = EXT4_MB_HINT_DATA; + + for (i = 0; i <= indirect_blks; i++) { + if (i == indirect_blks) { + ar.goal = goal; + new_blocks[i] = ext4_mb_new_blocks(handle, &ar, &err); + } else + goal = new_blocks[i] = ext4_new_meta_blocks(handle, inode, + goal, 0, NULL, &err); + if (err) { + i--; goto failed; } + branch[i].key = cpu_to_le32(new_blocks[i]); + if (i == 0) + continue; - branch[n].bh = bh; + bh = branch[i].bh = sb_getblk(inode->i_sb, new_blocks[i-1]); + if (unlikely(!bh)) { + err = -ENOMEM; + goto failed; + } lock_buffer(bh); BUFFER_TRACE(bh, "call get_create_access"); err = ext4_journal_get_create_access(handle, bh); if (err) { - /* Don't brelse(bh) here; it's done in - * ext4_journal_forget() below */ unlock_buffer(bh); goto failed; } - memset(bh->b_data, 0, blocksize); - branch[n].p = (__le32 *) bh->b_data + offsets[n]; - branch[n].key = cpu_to_le32(new_blocks[n]); - *branch[n].p = branch[n].key; - if (n == indirect_blks) { - current_block = new_blocks[n]; - /* - * End of chain, update the last new metablock of - * the chain to point to the new allocated - * data blocks numbers - */ - for (i = 1; i < num; i++) - *(branch[n].p + i) = cpu_to_le32(++current_block); - } + memset(bh->b_data, 0, bh->b_size); + p = branch[i].p = (__le32 *) bh->b_data + offsets[i]; + b = new_blocks[i]; + + if (i == indirect_blks) + len = ar.len; + for (j = 0; j < len; j++) + *p++ = cpu_to_le32(b++); + BUFFER_TRACE(bh, "marking uptodate"); set_buffer_uptodate(bh); unlock_buffer(bh); @@ -508,25 +385,22 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode, if (err) goto failed; } - *blks = num; - return err; + *blks = ar.len; + return 0; failed: - /* Allocation failed, free what we already allocated */ - ext4_free_blocks(handle, inode, NULL, new_blocks[0], 1, 0); - for (i = 1; i <= n ; i++) { + for (; i >= 0; i--) { /* - * branch[i].bh is newly allocated, so there is no - * need to revoke the block, which is why we don't - * need to set EXT4_FREE_BLOCKS_METADATA. + * We want to ext4_forget() only freshly allocated indirect + * blocks. Buffer for new_blocks[i-1] is at branch[i].bh and + * buffer at branch[0].bh is indirect block / inode already + * existing before ext4_alloc_branch() was called. */ - ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, - EXT4_FREE_BLOCKS_FORGET); + if (i > 0 && i != indirect_blks && branch[i].bh) + ext4_forget(handle, 1, inode, branch[i].bh, + branch[i].bh->b_blocknr); + ext4_free_blocks(handle, inode, NULL, new_blocks[i], + (i == indirect_blks) ? ar.len : 1, 0); } - for (i = n+1; i < indirect_blks; i++) - ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, 0); - - ext4_free_blocks(handle, inode, NULL, new_blocks[i], num, 0); - return err; } @@ -755,8 +629,7 @@ cleanup: partial--; } out: - trace_ext4_ind_map_blocks_exit(inode, map->m_lblk, - map->m_pblk, map->m_len, err); + trace_ext4_ind_map_blocks_exit(inode, flags, map, err); return err; } @@ -772,8 +645,7 @@ out: * VFS code falls back into buffered path in that case so we are safe. */ ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb, - const struct iovec *iov, loff_t offset, - unsigned long nr_segs) + struct iov_iter *iter, loff_t offset) { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; @@ -781,7 +653,7 @@ ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb, handle_t *handle; ssize_t ret; int orphan = 0; - size_t count = iov_length(iov, nr_segs); + size_t count = iov_iter_count(iter); int retries = 0; if (rw == WRITE) { @@ -789,7 +661,7 @@ ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb, if (final_size > inode->i_size) { /* Credits for sb + inode write */ - handle = ext4_journal_start(inode, 2); + handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); if (IS_ERR(handle)) { ret = PTR_ERR(handle); goto out; @@ -807,22 +679,30 @@ ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb, retry: if (rw == READ && ext4_should_dioread_nolock(inode)) { - if (unlikely(!list_empty(&ei->i_completed_io_list))) { - mutex_lock(&inode->i_mutex); - ext4_flush_completed_IO(inode); - mutex_unlock(&inode->i_mutex); + /* + * Nolock dioread optimization may be dynamically disabled + * via ext4_inode_block_unlocked_dio(). Check inode's state + * while holding extra i_dio_count ref. + */ + atomic_inc(&inode->i_dio_count); + smp_mb(); + if (unlikely(ext4_test_inode_state(inode, + EXT4_STATE_DIOREAD_LOCK))) { + inode_dio_done(inode); + goto locked; } ret = __blockdev_direct_IO(rw, iocb, inode, - inode->i_sb->s_bdev, iov, - offset, nr_segs, + inode->i_sb->s_bdev, iter, offset, ext4_get_block, NULL, NULL, 0); + inode_dio_done(inode); } else { - ret = blockdev_direct_IO(rw, iocb, inode, iov, - offset, nr_segs, ext4_get_block); +locked: + ret = blockdev_direct_IO(rw, iocb, inode, iter, + offset, ext4_get_block); if (unlikely((rw & WRITE) && ret < 0)) { loff_t isize = i_size_read(inode); - loff_t end = offset + iov_length(iov, nr_segs); + loff_t end = offset + count; if (end > isize) ext4_truncate_failed_write(inode); @@ -835,7 +715,7 @@ retry: int err; /* Credits for sb + inode write */ - handle = ext4_journal_start(inode, 2); + handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); if (IS_ERR(handle)) { /* This is really bad luck. We've written the data * but cannot extend i_size. Bail out and pretend @@ -897,27 +777,18 @@ int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock) return (blk_bits / EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb)) + 1; } -int ext4_ind_trans_blocks(struct inode *inode, int nrblocks, int chunk) +/* + * Calculate number of indirect blocks touched by mapping @nrblocks logically + * contiguous blocks + */ +int ext4_ind_trans_blocks(struct inode *inode, int nrblocks) { - int indirects; - - /* if nrblocks are contiguous */ - if (chunk) { - /* - * With N contiguous data blocks, we need at most - * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) + 1 indirect blocks, - * 2 dindirect blocks, and 1 tindirect block - */ - return DIV_ROUND_UP(nrblocks, - EXT4_ADDR_PER_BLOCK(inode->i_sb)) + 4; - } /* - * if nrblocks are not contiguous, worse case, each block touch - * a indirect block, and each indirect block touch a double indirect - * block, plus a triple indirect block + * With N contiguous data blocks, we need at most + * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) + 1 indirect blocks, + * 2 dindirect blocks, and 1 tindirect block */ - indirects = nrblocks * 2 + 1; - return indirects; + return DIV_ROUND_UP(nrblocks, EXT4_ADDR_PER_BLOCK(inode->i_sb)) + 4; } /* @@ -925,25 +796,9 @@ int ext4_ind_trans_blocks(struct inode *inode, int nrblocks, int chunk) * be able to restart the transaction at a conventient checkpoint to make * sure we don't overflow the journal. * - * start_transaction gets us a new handle for a truncate transaction, - * and extend_transaction tries to extend the existing one a bit. If + * Try to extend this transaction for the purposes of truncation. If * extend fails, we need to propagate the failure up and restart the * transaction in the top-level truncate loop. --sct - */ -static handle_t *start_transaction(struct inode *inode) -{ - handle_t *result; - - result = ext4_journal_start(inode, ext4_blocks_for_truncate(inode)); - if (!IS_ERR(result)) - return result; - - ext4_std_error(inode->i_sb, PTR_ERR(result)); - return result; -} - -/* - * Try to extend this transaction for the purposes of truncation. * * Returns 0 if we managed to create more room. If we can't create more * room, and the transaction must be restarted we return 1. @@ -1074,11 +929,13 @@ static int ext4_clear_blocks(handle_t *handle, struct inode *inode, __le32 *last) { __le32 *p; - int flags = EXT4_FREE_BLOCKS_FORGET | EXT4_FREE_BLOCKS_VALIDATED; + int flags = EXT4_FREE_BLOCKS_VALIDATED; int err; if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) - flags |= EXT4_FREE_BLOCKS_METADATA; + flags |= EXT4_FREE_BLOCKS_FORGET | EXT4_FREE_BLOCKS_METADATA; + else if (ext4_should_journal_data(inode)) + flags |= EXT4_FREE_BLOCKS_FORGET; if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), block_to_free, count)) { @@ -1336,68 +1193,31 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode, } } -void ext4_ind_truncate(struct inode *inode) +void ext4_ind_truncate(handle_t *handle, struct inode *inode) { - handle_t *handle; struct ext4_inode_info *ei = EXT4_I(inode); __le32 *i_data = ei->i_data; int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb); - struct address_space *mapping = inode->i_mapping; ext4_lblk_t offsets[4]; Indirect chain[4]; Indirect *partial; __le32 nr = 0; int n = 0; ext4_lblk_t last_block, max_block; - loff_t page_len; unsigned blocksize = inode->i_sb->s_blocksize; - int err; - - handle = start_transaction(inode); - if (IS_ERR(handle)) - return; /* AKPM: return what? */ last_block = (inode->i_size + blocksize-1) >> EXT4_BLOCK_SIZE_BITS(inode->i_sb); max_block = (EXT4_SB(inode->i_sb)->s_bitmap_maxbytes + blocksize-1) >> EXT4_BLOCK_SIZE_BITS(inode->i_sb); - if (inode->i_size % PAGE_CACHE_SIZE != 0) { - page_len = PAGE_CACHE_SIZE - - (inode->i_size & (PAGE_CACHE_SIZE - 1)); - - err = ext4_discard_partial_page_buffers(handle, - mapping, inode->i_size, page_len, 0); - - if (err) - goto out_stop; - } - if (last_block != max_block) { n = ext4_block_to_path(inode, last_block, offsets, NULL); if (n == 0) - goto out_stop; /* error */ + return; } - /* - * OK. This truncate is going to happen. We add the inode to the - * orphan list, so that if this truncate spans multiple transactions, - * and we crash, we will resume the truncate when the filesystem - * recovers. It also marks the inode dirty, to catch the new size. - * - * Implication: the file must always be in a sane, consistent - * truncatable state while each transaction commits. - */ - if (ext4_orphan_add(handle, inode)) - goto out_stop; - - /* - * From here we block out all ext4_get_block() callers who want to - * modify the block allocation tree. - */ - down_write(&ei->i_data_sem); - - ext4_discard_preallocations(inode); + ext4_es_remove_extent(inode, last_block, EXT_MAX_BLOCKS - last_block); /* * The orphan list entry will now protect us from any crash which @@ -1413,7 +1233,7 @@ void ext4_ind_truncate(struct inode *inode) * It is unnecessary to free any data blocks if last_block is * equal to the indirect block limit. */ - goto out_unlock; + return; } else if (n == 1) { /* direct blocks */ ext4_free_data(handle, inode, NULL, i_data+offsets[0], i_data + EXT4_NDIR_BLOCKS); @@ -1473,30 +1293,99 @@ do_indirects: case EXT4_TIND_BLOCK: ; } +} -out_unlock: - up_write(&ei->i_data_sem); - inode->i_mtime = inode->i_ctime = ext4_current_time(inode); - ext4_mark_inode_dirty(handle, inode); +static int free_hole_blocks(handle_t *handle, struct inode *inode, + struct buffer_head *parent_bh, __le32 *i_data, + int level, ext4_lblk_t first, + ext4_lblk_t count, int max) +{ + struct buffer_head *bh = NULL; + int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb); + int ret = 0; + int i, inc; + ext4_lblk_t offset; + __le32 blk; - /* - * In a multi-transaction truncate, we only make the final transaction - * synchronous - */ - if (IS_SYNC(inode)) - ext4_handle_sync(handle); -out_stop: - /* - * If this was a simple ftruncate(), and the file will remain alive - * then we need to clear up the orphan record which we created above. - * However, if this was a real unlink then we were called by - * ext4_delete_inode(), and we allow that function to clean up the - * orphan info for us. - */ - if (inode->i_nlink) - ext4_orphan_del(handle, inode); + inc = 1 << ((EXT4_BLOCK_SIZE_BITS(inode->i_sb) - 2) * level); + for (i = 0, offset = 0; i < max; i++, i_data++, offset += inc) { + if (offset >= count + first) + break; + if (*i_data == 0 || (offset + inc) <= first) + continue; + blk = *i_data; + if (level > 0) { + ext4_lblk_t first2; + ext4_lblk_t count2; + + bh = sb_bread(inode->i_sb, le32_to_cpu(blk)); + if (!bh) { + EXT4_ERROR_INODE_BLOCK(inode, le32_to_cpu(blk), + "Read failure"); + return -EIO; + } + if (first > offset) { + first2 = first - offset; + count2 = count; + } else { + first2 = 0; + count2 = count - (offset - first); + } + ret = free_hole_blocks(handle, inode, bh, + (__le32 *)bh->b_data, level - 1, + first2, count2, + inode->i_sb->s_blocksize >> 2); + if (ret) { + brelse(bh); + goto err; + } + } + if (level == 0 || + (bh && all_zeroes((__le32 *)bh->b_data, + (__le32 *)bh->b_data + addr_per_block))) { + ext4_free_data(handle, inode, parent_bh, + i_data, i_data + 1); + } + brelse(bh); + bh = NULL; + } + +err: + return ret; +} + +int ext4_free_hole_blocks(handle_t *handle, struct inode *inode, + ext4_lblk_t first, ext4_lblk_t stop) +{ + int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb); + int level, ret = 0; + int num = EXT4_NDIR_BLOCKS; + ext4_lblk_t count, max = EXT4_NDIR_BLOCKS; + __le32 *i_data = EXT4_I(inode)->i_data; + + count = stop - first; + for (level = 0; level < 4; level++, max *= addr_per_block) { + if (first < max) { + ret = free_hole_blocks(handle, inode, NULL, i_data, + level, first, count, num); + if (ret) + goto err; + if (count > max - first) + count -= max - first; + else + break; + first = 0; + } else { + first -= max; + } + i_data += num; + if (level == 0) { + num = 1; + max = 1; + } + } - ext4_journal_stop(handle); - trace_ext4_truncate_exit(inode); +err: + return ret; } diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c new file mode 100644 index 00000000000..645205d8ada --- /dev/null +++ b/fs/ext4/inline.c @@ -0,0 +1,2000 @@ +/* + * Copyright (c) 2012 Taobao. + * Written by Tao Ma <boyu.mt@taobao.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2.1 of the GNU Lesser General Public License + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ +#include "ext4_jbd2.h" +#include "ext4.h" +#include "xattr.h" +#include "truncate.h" +#include <linux/fiemap.h> + +#define EXT4_XATTR_SYSTEM_DATA "data" +#define EXT4_MIN_INLINE_DATA_SIZE ((sizeof(__le32) * EXT4_N_BLOCKS)) +#define EXT4_INLINE_DOTDOT_OFFSET 2 +#define EXT4_INLINE_DOTDOT_SIZE 4 + +static int ext4_get_inline_size(struct inode *inode) +{ + if (EXT4_I(inode)->i_inline_off) + return EXT4_I(inode)->i_inline_size; + + return 0; +} + +static int get_max_inline_xattr_value_size(struct inode *inode, + struct ext4_iloc *iloc) +{ + struct ext4_xattr_ibody_header *header; + struct ext4_xattr_entry *entry; + struct ext4_inode *raw_inode; + int free, min_offs; + + min_offs = EXT4_SB(inode->i_sb)->s_inode_size - + EXT4_GOOD_OLD_INODE_SIZE - + EXT4_I(inode)->i_extra_isize - + sizeof(struct ext4_xattr_ibody_header); + + /* + * We need to subtract another sizeof(__u32) since an in-inode xattr + * needs an empty 4 bytes to indicate the gap between the xattr entry + * and the name/value pair. + */ + if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR)) + return EXT4_XATTR_SIZE(min_offs - + EXT4_XATTR_LEN(strlen(EXT4_XATTR_SYSTEM_DATA)) - + EXT4_XATTR_ROUND - sizeof(__u32)); + + raw_inode = ext4_raw_inode(iloc); + header = IHDR(inode, raw_inode); + entry = IFIRST(header); + + /* Compute min_offs. */ + for (; !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry)) { + if (!entry->e_value_block && entry->e_value_size) { + size_t offs = le16_to_cpu(entry->e_value_offs); + if (offs < min_offs) + min_offs = offs; + } + } + free = min_offs - + ((void *)entry - (void *)IFIRST(header)) - sizeof(__u32); + + if (EXT4_I(inode)->i_inline_off) { + entry = (struct ext4_xattr_entry *) + ((void *)raw_inode + EXT4_I(inode)->i_inline_off); + + free += EXT4_XATTR_SIZE(le32_to_cpu(entry->e_value_size)); + goto out; + } + + free -= EXT4_XATTR_LEN(strlen(EXT4_XATTR_SYSTEM_DATA)); + + if (free > EXT4_XATTR_ROUND) + free = EXT4_XATTR_SIZE(free - EXT4_XATTR_ROUND); + else + free = 0; + +out: + return free; +} + +/* + * Get the maximum size we now can store in an inode. + * If we can't find the space for a xattr entry, don't use the space + * of the extents since we have no space to indicate the inline data. + */ +int ext4_get_max_inline_size(struct inode *inode) +{ + int error, max_inline_size; + struct ext4_iloc iloc; + + if (EXT4_I(inode)->i_extra_isize == 0) + return 0; + + error = ext4_get_inode_loc(inode, &iloc); + if (error) { + ext4_error_inode(inode, __func__, __LINE__, 0, + "can't get inode location %lu", + inode->i_ino); + return 0; + } + + down_read(&EXT4_I(inode)->xattr_sem); + max_inline_size = get_max_inline_xattr_value_size(inode, &iloc); + up_read(&EXT4_I(inode)->xattr_sem); + + brelse(iloc.bh); + + if (!max_inline_size) + return 0; + + return max_inline_size + EXT4_MIN_INLINE_DATA_SIZE; +} + +int ext4_has_inline_data(struct inode *inode) +{ + return ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA) && + EXT4_I(inode)->i_inline_off; +} + +/* + * this function does not take xattr_sem, which is OK because it is + * currently only used in a code path coming form ext4_iget, before + * the new inode has been unlocked + */ +int ext4_find_inline_data_nolock(struct inode *inode) +{ + struct ext4_xattr_ibody_find is = { + .s = { .not_found = -ENODATA, }, + }; + struct ext4_xattr_info i = { + .name_index = EXT4_XATTR_INDEX_SYSTEM, + .name = EXT4_XATTR_SYSTEM_DATA, + }; + int error; + + if (EXT4_I(inode)->i_extra_isize == 0) + return 0; + + error = ext4_get_inode_loc(inode, &is.iloc); + if (error) + return error; + + error = ext4_xattr_ibody_find(inode, &i, &is); + if (error) + goto out; + + if (!is.s.not_found) { + EXT4_I(inode)->i_inline_off = (u16)((void *)is.s.here - + (void *)ext4_raw_inode(&is.iloc)); + EXT4_I(inode)->i_inline_size = EXT4_MIN_INLINE_DATA_SIZE + + le32_to_cpu(is.s.here->e_value_size); + ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); + } +out: + brelse(is.iloc.bh); + return error; +} + +static int ext4_read_inline_data(struct inode *inode, void *buffer, + unsigned int len, + struct ext4_iloc *iloc) +{ + struct ext4_xattr_entry *entry; + struct ext4_xattr_ibody_header *header; + int cp_len = 0; + struct ext4_inode *raw_inode; + + if (!len) + return 0; + + BUG_ON(len > EXT4_I(inode)->i_inline_size); + + cp_len = len < EXT4_MIN_INLINE_DATA_SIZE ? + len : EXT4_MIN_INLINE_DATA_SIZE; + + raw_inode = ext4_raw_inode(iloc); + memcpy(buffer, (void *)(raw_inode->i_block), cp_len); + + len -= cp_len; + buffer += cp_len; + + if (!len) + goto out; + + header = IHDR(inode, raw_inode); + entry = (struct ext4_xattr_entry *)((void *)raw_inode + + EXT4_I(inode)->i_inline_off); + len = min_t(unsigned int, len, + (unsigned int)le32_to_cpu(entry->e_value_size)); + + memcpy(buffer, + (void *)IFIRST(header) + le16_to_cpu(entry->e_value_offs), len); + cp_len += len; + +out: + return cp_len; +} + +/* + * write the buffer to the inline inode. + * If 'create' is set, we don't need to do the extra copy in the xattr + * value since it is already handled by ext4_xattr_ibody_inline_set. + * That saves us one memcpy. + */ +static void ext4_write_inline_data(struct inode *inode, struct ext4_iloc *iloc, + void *buffer, loff_t pos, unsigned int len) +{ + struct ext4_xattr_entry *entry; + struct ext4_xattr_ibody_header *header; + struct ext4_inode *raw_inode; + int cp_len = 0; + + BUG_ON(!EXT4_I(inode)->i_inline_off); + BUG_ON(pos + len > EXT4_I(inode)->i_inline_size); + + raw_inode = ext4_raw_inode(iloc); + buffer += pos; + + if (pos < EXT4_MIN_INLINE_DATA_SIZE) { + cp_len = pos + len > EXT4_MIN_INLINE_DATA_SIZE ? + EXT4_MIN_INLINE_DATA_SIZE - pos : len; + memcpy((void *)raw_inode->i_block + pos, buffer, cp_len); + + len -= cp_len; + buffer += cp_len; + pos += cp_len; + } + + if (!len) + return; + + pos -= EXT4_MIN_INLINE_DATA_SIZE; + header = IHDR(inode, raw_inode); + entry = (struct ext4_xattr_entry *)((void *)raw_inode + + EXT4_I(inode)->i_inline_off); + + memcpy((void *)IFIRST(header) + le16_to_cpu(entry->e_value_offs) + pos, + buffer, len); +} + +static int ext4_create_inline_data(handle_t *handle, + struct inode *inode, unsigned len) +{ + int error; + void *value = NULL; + struct ext4_xattr_ibody_find is = { + .s = { .not_found = -ENODATA, }, + }; + struct ext4_xattr_info i = { + .name_index = EXT4_XATTR_INDEX_SYSTEM, + .name = EXT4_XATTR_SYSTEM_DATA, + }; + + error = ext4_get_inode_loc(inode, &is.iloc); + if (error) + return error; + + BUFFER_TRACE(is.iloc.bh, "get_write_access"); + error = ext4_journal_get_write_access(handle, is.iloc.bh); + if (error) + goto out; + + if (len > EXT4_MIN_INLINE_DATA_SIZE) { + value = EXT4_ZERO_XATTR_VALUE; + len -= EXT4_MIN_INLINE_DATA_SIZE; + } else { + value = ""; + len = 0; + } + + /* Insert the the xttr entry. */ + i.value = value; + i.value_len = len; + + error = ext4_xattr_ibody_find(inode, &i, &is); + if (error) + goto out; + + BUG_ON(!is.s.not_found); + + error = ext4_xattr_ibody_inline_set(handle, inode, &i, &is); + if (error) { + if (error == -ENOSPC) + ext4_clear_inode_state(inode, + EXT4_STATE_MAY_INLINE_DATA); + goto out; + } + + memset((void *)ext4_raw_inode(&is.iloc)->i_block, + 0, EXT4_MIN_INLINE_DATA_SIZE); + + EXT4_I(inode)->i_inline_off = (u16)((void *)is.s.here - + (void *)ext4_raw_inode(&is.iloc)); + EXT4_I(inode)->i_inline_size = len + EXT4_MIN_INLINE_DATA_SIZE; + ext4_clear_inode_flag(inode, EXT4_INODE_EXTENTS); + ext4_set_inode_flag(inode, EXT4_INODE_INLINE_DATA); + get_bh(is.iloc.bh); + error = ext4_mark_iloc_dirty(handle, inode, &is.iloc); + +out: + brelse(is.iloc.bh); + return error; +} + +static int ext4_update_inline_data(handle_t *handle, struct inode *inode, + unsigned int len) +{ + int error; + void *value = NULL; + struct ext4_xattr_ibody_find is = { + .s = { .not_found = -ENODATA, }, + }; + struct ext4_xattr_info i = { + .name_index = EXT4_XATTR_INDEX_SYSTEM, + .name = EXT4_XATTR_SYSTEM_DATA, + }; + + /* If the old space is ok, write the data directly. */ + if (len <= EXT4_I(inode)->i_inline_size) + return 0; + + error = ext4_get_inode_loc(inode, &is.iloc); + if (error) + return error; + + error = ext4_xattr_ibody_find(inode, &i, &is); + if (error) + goto out; + + BUG_ON(is.s.not_found); + + len -= EXT4_MIN_INLINE_DATA_SIZE; + value = kzalloc(len, GFP_NOFS); + if (!value) + goto out; + + error = ext4_xattr_ibody_get(inode, i.name_index, i.name, + value, len); + if (error == -ENODATA) + goto out; + + BUFFER_TRACE(is.iloc.bh, "get_write_access"); + error = ext4_journal_get_write_access(handle, is.iloc.bh); + if (error) + goto out; + + /* Update the xttr entry. */ + i.value = value; + i.value_len = len; + + error = ext4_xattr_ibody_inline_set(handle, inode, &i, &is); + if (error) + goto out; + + EXT4_I(inode)->i_inline_off = (u16)((void *)is.s.here - + (void *)ext4_raw_inode(&is.iloc)); + EXT4_I(inode)->i_inline_size = EXT4_MIN_INLINE_DATA_SIZE + + le32_to_cpu(is.s.here->e_value_size); + ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); + get_bh(is.iloc.bh); + error = ext4_mark_iloc_dirty(handle, inode, &is.iloc); + +out: + kfree(value); + brelse(is.iloc.bh); + return error; +} + +static int ext4_prepare_inline_data(handle_t *handle, struct inode *inode, + unsigned int len) +{ + int ret, size; + struct ext4_inode_info *ei = EXT4_I(inode); + + if (!ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) + return -ENOSPC; + + size = ext4_get_max_inline_size(inode); + if (size < len) + return -ENOSPC; + + down_write(&EXT4_I(inode)->xattr_sem); + + if (ei->i_inline_off) + ret = ext4_update_inline_data(handle, inode, len); + else + ret = ext4_create_inline_data(handle, inode, len); + + up_write(&EXT4_I(inode)->xattr_sem); + + return ret; +} + +static int ext4_destroy_inline_data_nolock(handle_t *handle, + struct inode *inode) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + struct ext4_xattr_ibody_find is = { + .s = { .not_found = 0, }, + }; + struct ext4_xattr_info i = { + .name_index = EXT4_XATTR_INDEX_SYSTEM, + .name = EXT4_XATTR_SYSTEM_DATA, + .value = NULL, + .value_len = 0, + }; + int error; + + if (!ei->i_inline_off) + return 0; + + error = ext4_get_inode_loc(inode, &is.iloc); + if (error) + return error; + + error = ext4_xattr_ibody_find(inode, &i, &is); + if (error) + goto out; + + BUFFER_TRACE(is.iloc.bh, "get_write_access"); + error = ext4_journal_get_write_access(handle, is.iloc.bh); + if (error) + goto out; + + error = ext4_xattr_ibody_inline_set(handle, inode, &i, &is); + if (error) + goto out; + + memset((void *)ext4_raw_inode(&is.iloc)->i_block, + 0, EXT4_MIN_INLINE_DATA_SIZE); + + if (EXT4_HAS_INCOMPAT_FEATURE(inode->i_sb, + EXT4_FEATURE_INCOMPAT_EXTENTS)) { + if (S_ISDIR(inode->i_mode) || + S_ISREG(inode->i_mode) || S_ISLNK(inode->i_mode)) { + ext4_set_inode_flag(inode, EXT4_INODE_EXTENTS); + ext4_ext_tree_init(handle, inode); + } + } + ext4_clear_inode_flag(inode, EXT4_INODE_INLINE_DATA); + + get_bh(is.iloc.bh); + error = ext4_mark_iloc_dirty(handle, inode, &is.iloc); + + EXT4_I(inode)->i_inline_off = 0; + EXT4_I(inode)->i_inline_size = 0; + ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); +out: + brelse(is.iloc.bh); + if (error == -ENODATA) + error = 0; + return error; +} + +static int ext4_read_inline_page(struct inode *inode, struct page *page) +{ + void *kaddr; + int ret = 0; + size_t len; + struct ext4_iloc iloc; + + BUG_ON(!PageLocked(page)); + BUG_ON(!ext4_has_inline_data(inode)); + BUG_ON(page->index); + + if (!EXT4_I(inode)->i_inline_off) { + ext4_warning(inode->i_sb, "inode %lu doesn't have inline data.", + inode->i_ino); + goto out; + } + + ret = ext4_get_inode_loc(inode, &iloc); + if (ret) + goto out; + + len = min_t(size_t, ext4_get_inline_size(inode), i_size_read(inode)); + kaddr = kmap_atomic(page); + ret = ext4_read_inline_data(inode, kaddr, len, &iloc); + flush_dcache_page(page); + kunmap_atomic(kaddr); + zero_user_segment(page, len, PAGE_CACHE_SIZE); + SetPageUptodate(page); + brelse(iloc.bh); + +out: + return ret; +} + +int ext4_readpage_inline(struct inode *inode, struct page *page) +{ + int ret = 0; + + down_read(&EXT4_I(inode)->xattr_sem); + if (!ext4_has_inline_data(inode)) { + up_read(&EXT4_I(inode)->xattr_sem); + return -EAGAIN; + } + + /* + * Current inline data can only exist in the 1st page, + * So for all the other pages, just set them uptodate. + */ + if (!page->index) + ret = ext4_read_inline_page(inode, page); + else if (!PageUptodate(page)) { + zero_user_segment(page, 0, PAGE_CACHE_SIZE); + SetPageUptodate(page); + } + + up_read(&EXT4_I(inode)->xattr_sem); + + unlock_page(page); + return ret >= 0 ? 0 : ret; +} + +static int ext4_convert_inline_data_to_extent(struct address_space *mapping, + struct inode *inode, + unsigned flags) +{ + int ret, needed_blocks; + handle_t *handle = NULL; + int retries = 0, sem_held = 0; + struct page *page = NULL; + unsigned from, to; + struct ext4_iloc iloc; + + if (!ext4_has_inline_data(inode)) { + /* + * clear the flag so that no new write + * will trap here again. + */ + ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); + return 0; + } + + needed_blocks = ext4_writepage_trans_blocks(inode); + + ret = ext4_get_inode_loc(inode, &iloc); + if (ret) + return ret; + +retry: + handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, needed_blocks); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + handle = NULL; + goto out; + } + + /* We cannot recurse into the filesystem as the transaction is already + * started */ + flags |= AOP_FLAG_NOFS; + + page = grab_cache_page_write_begin(mapping, 0, flags); + if (!page) { + ret = -ENOMEM; + goto out; + } + + down_write(&EXT4_I(inode)->xattr_sem); + sem_held = 1; + /* If some one has already done this for us, just exit. */ + if (!ext4_has_inline_data(inode)) { + ret = 0; + goto out; + } + + from = 0; + to = ext4_get_inline_size(inode); + if (!PageUptodate(page)) { + ret = ext4_read_inline_page(inode, page); + if (ret < 0) + goto out; + } + + ret = ext4_destroy_inline_data_nolock(handle, inode); + if (ret) + goto out; + + if (ext4_should_dioread_nolock(inode)) + ret = __block_write_begin(page, from, to, ext4_get_block_write); + else + ret = __block_write_begin(page, from, to, ext4_get_block); + + if (!ret && ext4_should_journal_data(inode)) { + ret = ext4_walk_page_buffers(handle, page_buffers(page), + from, to, NULL, + do_journal_get_write_access); + } + + if (ret) { + unlock_page(page); + page_cache_release(page); + ext4_orphan_add(handle, inode); + up_write(&EXT4_I(inode)->xattr_sem); + sem_held = 0; + ext4_journal_stop(handle); + handle = NULL; + ext4_truncate_failed_write(inode); + /* + * If truncate failed early the inode might + * still be on the orphan list; we need to + * make sure the inode is removed from the + * orphan list in that case. + */ + if (inode->i_nlink) + ext4_orphan_del(NULL, inode); + } + + if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) + goto retry; + + block_commit_write(page, from, to); +out: + if (page) { + unlock_page(page); + page_cache_release(page); + } + if (sem_held) + up_write(&EXT4_I(inode)->xattr_sem); + if (handle) + ext4_journal_stop(handle); + brelse(iloc.bh); + return ret; +} + +/* + * Try to write data in the inode. + * If the inode has inline data, check whether the new write can be + * in the inode also. If not, create the page the handle, move the data + * to the page make it update and let the later codes create extent for it. + */ +int ext4_try_to_write_inline_data(struct address_space *mapping, + struct inode *inode, + loff_t pos, unsigned len, + unsigned flags, + struct page **pagep) +{ + int ret; + handle_t *handle; + struct page *page; + struct ext4_iloc iloc; + + if (pos + len > ext4_get_max_inline_size(inode)) + goto convert; + + ret = ext4_get_inode_loc(inode, &iloc); + if (ret) + return ret; + + /* + * The possible write could happen in the inode, + * so try to reserve the space in inode first. + */ + handle = ext4_journal_start(inode, EXT4_HT_INODE, 1); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + handle = NULL; + goto out; + } + + ret = ext4_prepare_inline_data(handle, inode, pos + len); + if (ret && ret != -ENOSPC) + goto out; + + /* We don't have space in inline inode, so convert it to extent. */ + if (ret == -ENOSPC) { + ext4_journal_stop(handle); + brelse(iloc.bh); + goto convert; + } + + flags |= AOP_FLAG_NOFS; + + page = grab_cache_page_write_begin(mapping, 0, flags); + if (!page) { + ret = -ENOMEM; + goto out; + } + + *pagep = page; + down_read(&EXT4_I(inode)->xattr_sem); + if (!ext4_has_inline_data(inode)) { + ret = 0; + unlock_page(page); + page_cache_release(page); + goto out_up_read; + } + + if (!PageUptodate(page)) { + ret = ext4_read_inline_page(inode, page); + if (ret < 0) + goto out_up_read; + } + + ret = 1; + handle = NULL; +out_up_read: + up_read(&EXT4_I(inode)->xattr_sem); +out: + if (handle) + ext4_journal_stop(handle); + brelse(iloc.bh); + return ret; +convert: + return ext4_convert_inline_data_to_extent(mapping, + inode, flags); +} + +int ext4_write_inline_data_end(struct inode *inode, loff_t pos, unsigned len, + unsigned copied, struct page *page) +{ + int ret; + void *kaddr; + struct ext4_iloc iloc; + + if (unlikely(copied < len)) { + if (!PageUptodate(page)) { + copied = 0; + goto out; + } + } + + ret = ext4_get_inode_loc(inode, &iloc); + if (ret) { + ext4_std_error(inode->i_sb, ret); + copied = 0; + goto out; + } + + down_write(&EXT4_I(inode)->xattr_sem); + BUG_ON(!ext4_has_inline_data(inode)); + + kaddr = kmap_atomic(page); + ext4_write_inline_data(inode, &iloc, kaddr, pos, len); + kunmap_atomic(kaddr); + SetPageUptodate(page); + /* clear page dirty so that writepages wouldn't work for us. */ + ClearPageDirty(page); + + up_write(&EXT4_I(inode)->xattr_sem); + brelse(iloc.bh); +out: + return copied; +} + +struct buffer_head * +ext4_journalled_write_inline_data(struct inode *inode, + unsigned len, + struct page *page) +{ + int ret; + void *kaddr; + struct ext4_iloc iloc; + + ret = ext4_get_inode_loc(inode, &iloc); + if (ret) { + ext4_std_error(inode->i_sb, ret); + return NULL; + } + + down_write(&EXT4_I(inode)->xattr_sem); + kaddr = kmap_atomic(page); + ext4_write_inline_data(inode, &iloc, kaddr, 0, len); + kunmap_atomic(kaddr); + up_write(&EXT4_I(inode)->xattr_sem); + + return iloc.bh; +} + +/* + * Try to make the page cache and handle ready for the inline data case. + * We can call this function in 2 cases: + * 1. The inode is created and the first write exceeds inline size. We can + * clear the inode state safely. + * 2. The inode has inline data, then we need to read the data, make it + * update and dirty so that ext4_da_writepages can handle it. We don't + * need to start the journal since the file's metatdata isn't changed now. + */ +static int ext4_da_convert_inline_data_to_extent(struct address_space *mapping, + struct inode *inode, + unsigned flags, + void **fsdata) +{ + int ret = 0, inline_size; + struct page *page; + + page = grab_cache_page_write_begin(mapping, 0, flags); + if (!page) + return -ENOMEM; + + down_read(&EXT4_I(inode)->xattr_sem); + if (!ext4_has_inline_data(inode)) { + ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); + goto out; + } + + inline_size = ext4_get_inline_size(inode); + + if (!PageUptodate(page)) { + ret = ext4_read_inline_page(inode, page); + if (ret < 0) + goto out; + } + + ret = __block_write_begin(page, 0, inline_size, + ext4_da_get_block_prep); + if (ret) { + ext4_truncate_failed_write(inode); + goto out; + } + + SetPageDirty(page); + SetPageUptodate(page); + ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); + *fsdata = (void *)CONVERT_INLINE_DATA; + +out: + up_read(&EXT4_I(inode)->xattr_sem); + if (page) { + unlock_page(page); + page_cache_release(page); + } + return ret; +} + +/* + * Prepare the write for the inline data. + * If the the data can be written into the inode, we just read + * the page and make it uptodate, and start the journal. + * Otherwise read the page, makes it dirty so that it can be + * handle in writepages(the i_disksize update is left to the + * normal ext4_da_write_end). + */ +int ext4_da_write_inline_data_begin(struct address_space *mapping, + struct inode *inode, + loff_t pos, unsigned len, + unsigned flags, + struct page **pagep, + void **fsdata) +{ + int ret, inline_size; + handle_t *handle; + struct page *page; + struct ext4_iloc iloc; + int retries; + + ret = ext4_get_inode_loc(inode, &iloc); + if (ret) + return ret; + +retry_journal: + handle = ext4_journal_start(inode, EXT4_HT_INODE, 1); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + goto out; + } + + inline_size = ext4_get_max_inline_size(inode); + + ret = -ENOSPC; + if (inline_size >= pos + len) { + ret = ext4_prepare_inline_data(handle, inode, pos + len); + if (ret && ret != -ENOSPC) + goto out_journal; + } + + if (ret == -ENOSPC) { + ret = ext4_da_convert_inline_data_to_extent(mapping, + inode, + flags, + fsdata); + ext4_journal_stop(handle); + if (ret == -ENOSPC && + ext4_should_retry_alloc(inode->i_sb, &retries)) + goto retry_journal; + goto out; + } + + /* + * We cannot recurse into the filesystem as the transaction + * is already started. + */ + flags |= AOP_FLAG_NOFS; + + page = grab_cache_page_write_begin(mapping, 0, flags); + if (!page) { + ret = -ENOMEM; + goto out_journal; + } + + down_read(&EXT4_I(inode)->xattr_sem); + if (!ext4_has_inline_data(inode)) { + ret = 0; + goto out_release_page; + } + + if (!PageUptodate(page)) { + ret = ext4_read_inline_page(inode, page); + if (ret < 0) + goto out_release_page; + } + + up_read(&EXT4_I(inode)->xattr_sem); + *pagep = page; + brelse(iloc.bh); + return 1; +out_release_page: + up_read(&EXT4_I(inode)->xattr_sem); + unlock_page(page); + page_cache_release(page); +out_journal: + ext4_journal_stop(handle); +out: + brelse(iloc.bh); + return ret; +} + +int ext4_da_write_inline_data_end(struct inode *inode, loff_t pos, + unsigned len, unsigned copied, + struct page *page) +{ + int i_size_changed = 0; + + copied = ext4_write_inline_data_end(inode, pos, len, copied, page); + + /* + * No need to use i_size_read() here, the i_size + * cannot change under us because we hold i_mutex. + * + * But it's important to update i_size while still holding page lock: + * page writeout could otherwise come in and zero beyond i_size. + */ + if (pos+copied > inode->i_size) { + i_size_write(inode, pos+copied); + i_size_changed = 1; + } + unlock_page(page); + page_cache_release(page); + + /* + * Don't mark the inode dirty under page lock. First, it unnecessarily + * makes the holding time of page lock longer. Second, it forces lock + * ordering of page lock and transaction start for journaling + * filesystems. + */ + if (i_size_changed) + mark_inode_dirty(inode); + + return copied; +} + +#ifdef INLINE_DIR_DEBUG +void ext4_show_inline_dir(struct inode *dir, struct buffer_head *bh, + void *inline_start, int inline_size) +{ + int offset; + unsigned short de_len; + struct ext4_dir_entry_2 *de = inline_start; + void *dlimit = inline_start + inline_size; + + trace_printk("inode %lu\n", dir->i_ino); + offset = 0; + while ((void *)de < dlimit) { + de_len = ext4_rec_len_from_disk(de->rec_len, inline_size); + trace_printk("de: off %u rlen %u name %*.s nlen %u ino %u\n", + offset, de_len, de->name_len, de->name, + de->name_len, le32_to_cpu(de->inode)); + if (ext4_check_dir_entry(dir, NULL, de, bh, + inline_start, inline_size, offset)) + BUG(); + + offset += de_len; + de = (struct ext4_dir_entry_2 *) ((char *) de + de_len); + } +} +#else +#define ext4_show_inline_dir(dir, bh, inline_start, inline_size) +#endif + +/* + * Add a new entry into a inline dir. + * It will return -ENOSPC if no space is available, and -EIO + * and -EEXIST if directory entry already exists. + */ +static int ext4_add_dirent_to_inline(handle_t *handle, + struct dentry *dentry, + struct inode *inode, + struct ext4_iloc *iloc, + void *inline_start, int inline_size) +{ + struct inode *dir = dentry->d_parent->d_inode; + const char *name = dentry->d_name.name; + int namelen = dentry->d_name.len; + int err; + struct ext4_dir_entry_2 *de; + + err = ext4_find_dest_de(dir, inode, iloc->bh, + inline_start, inline_size, + name, namelen, &de); + if (err) + return err; + + BUFFER_TRACE(iloc->bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, iloc->bh); + if (err) + return err; + ext4_insert_dentry(inode, de, inline_size, name, namelen); + + ext4_show_inline_dir(dir, iloc->bh, inline_start, inline_size); + + /* + * XXX shouldn't update any times until successful + * completion of syscall, but too many callers depend + * on this. + * + * XXX similarly, too many callers depend on + * ext4_new_inode() setting the times, but error + * recovery deletes the inode, so the worst that can + * happen is that the times are slightly out of date + * and/or different from the directory change time. + */ + dir->i_mtime = dir->i_ctime = ext4_current_time(dir); + ext4_update_dx_flag(dir); + dir->i_version++; + ext4_mark_inode_dirty(handle, dir); + return 1; +} + +static void *ext4_get_inline_xattr_pos(struct inode *inode, + struct ext4_iloc *iloc) +{ + struct ext4_xattr_entry *entry; + struct ext4_xattr_ibody_header *header; + + BUG_ON(!EXT4_I(inode)->i_inline_off); + + header = IHDR(inode, ext4_raw_inode(iloc)); + entry = (struct ext4_xattr_entry *)((void *)ext4_raw_inode(iloc) + + EXT4_I(inode)->i_inline_off); + + return (void *)IFIRST(header) + le16_to_cpu(entry->e_value_offs); +} + +/* Set the final de to cover the whole block. */ +static void ext4_update_final_de(void *de_buf, int old_size, int new_size) +{ + struct ext4_dir_entry_2 *de, *prev_de; + void *limit; + int de_len; + + de = (struct ext4_dir_entry_2 *)de_buf; + if (old_size) { + limit = de_buf + old_size; + do { + prev_de = de; + de_len = ext4_rec_len_from_disk(de->rec_len, old_size); + de_buf += de_len; + de = (struct ext4_dir_entry_2 *)de_buf; + } while (de_buf < limit); + + prev_de->rec_len = ext4_rec_len_to_disk(de_len + new_size - + old_size, new_size); + } else { + /* this is just created, so create an empty entry. */ + de->inode = 0; + de->rec_len = ext4_rec_len_to_disk(new_size, new_size); + } +} + +static int ext4_update_inline_dir(handle_t *handle, struct inode *dir, + struct ext4_iloc *iloc) +{ + int ret; + int old_size = EXT4_I(dir)->i_inline_size - EXT4_MIN_INLINE_DATA_SIZE; + int new_size = get_max_inline_xattr_value_size(dir, iloc); + + if (new_size - old_size <= EXT4_DIR_REC_LEN(1)) + return -ENOSPC; + + ret = ext4_update_inline_data(handle, dir, + new_size + EXT4_MIN_INLINE_DATA_SIZE); + if (ret) + return ret; + + ext4_update_final_de(ext4_get_inline_xattr_pos(dir, iloc), old_size, + EXT4_I(dir)->i_inline_size - + EXT4_MIN_INLINE_DATA_SIZE); + dir->i_size = EXT4_I(dir)->i_disksize = EXT4_I(dir)->i_inline_size; + return 0; +} + +static void ext4_restore_inline_data(handle_t *handle, struct inode *inode, + struct ext4_iloc *iloc, + void *buf, int inline_size) +{ + ext4_create_inline_data(handle, inode, inline_size); + ext4_write_inline_data(inode, iloc, buf, 0, inline_size); + ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); +} + +static int ext4_finish_convert_inline_dir(handle_t *handle, + struct inode *inode, + struct buffer_head *dir_block, + void *buf, + int inline_size) +{ + int err, csum_size = 0, header_size = 0; + struct ext4_dir_entry_2 *de; + struct ext4_dir_entry_tail *t; + void *target = dir_block->b_data; + + /* + * First create "." and ".." and then copy the dir information + * back to the block. + */ + de = (struct ext4_dir_entry_2 *)target; + de = ext4_init_dot_dotdot(inode, de, + inode->i_sb->s_blocksize, csum_size, + le32_to_cpu(((struct ext4_dir_entry_2 *)buf)->inode), 1); + header_size = (void *)de - target; + + memcpy((void *)de, buf + EXT4_INLINE_DOTDOT_SIZE, + inline_size - EXT4_INLINE_DOTDOT_SIZE); + + if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + csum_size = sizeof(struct ext4_dir_entry_tail); + + inode->i_size = inode->i_sb->s_blocksize; + i_size_write(inode, inode->i_sb->s_blocksize); + EXT4_I(inode)->i_disksize = inode->i_sb->s_blocksize; + ext4_update_final_de(dir_block->b_data, + inline_size - EXT4_INLINE_DOTDOT_SIZE + header_size, + inode->i_sb->s_blocksize - csum_size); + + if (csum_size) { + t = EXT4_DIRENT_TAIL(dir_block->b_data, + inode->i_sb->s_blocksize); + initialize_dirent_tail(t, inode->i_sb->s_blocksize); + } + set_buffer_uptodate(dir_block); + err = ext4_handle_dirty_dirent_node(handle, inode, dir_block); + if (err) + goto out; + set_buffer_verified(dir_block); +out: + return err; +} + +static int ext4_convert_inline_data_nolock(handle_t *handle, + struct inode *inode, + struct ext4_iloc *iloc) +{ + int error; + void *buf = NULL; + struct buffer_head *data_bh = NULL; + struct ext4_map_blocks map; + int inline_size; + + inline_size = ext4_get_inline_size(inode); + buf = kmalloc(inline_size, GFP_NOFS); + if (!buf) { + error = -ENOMEM; + goto out; + } + + error = ext4_read_inline_data(inode, buf, inline_size, iloc); + if (error < 0) + goto out; + + error = ext4_destroy_inline_data_nolock(handle, inode); + if (error) + goto out; + + map.m_lblk = 0; + map.m_len = 1; + map.m_flags = 0; + error = ext4_map_blocks(handle, inode, &map, EXT4_GET_BLOCKS_CREATE); + if (error < 0) + goto out_restore; + if (!(map.m_flags & EXT4_MAP_MAPPED)) { + error = -EIO; + goto out_restore; + } + + data_bh = sb_getblk(inode->i_sb, map.m_pblk); + if (!data_bh) { + error = -ENOMEM; + goto out_restore; + } + + lock_buffer(data_bh); + error = ext4_journal_get_create_access(handle, data_bh); + if (error) { + unlock_buffer(data_bh); + error = -EIO; + goto out_restore; + } + memset(data_bh->b_data, 0, inode->i_sb->s_blocksize); + + if (!S_ISDIR(inode->i_mode)) { + memcpy(data_bh->b_data, buf, inline_size); + set_buffer_uptodate(data_bh); + error = ext4_handle_dirty_metadata(handle, + inode, data_bh); + } else { + error = ext4_finish_convert_inline_dir(handle, inode, data_bh, + buf, inline_size); + } + + unlock_buffer(data_bh); +out_restore: + if (error) + ext4_restore_inline_data(handle, inode, iloc, buf, inline_size); + +out: + brelse(data_bh); + kfree(buf); + return error; +} + +/* + * Try to add the new entry to the inline data. + * If succeeds, return 0. If not, extended the inline dir and copied data to + * the new created block. + */ +int ext4_try_add_inline_entry(handle_t *handle, struct dentry *dentry, + struct inode *inode) +{ + int ret, inline_size; + void *inline_start; + struct ext4_iloc iloc; + struct inode *dir = dentry->d_parent->d_inode; + + ret = ext4_get_inode_loc(dir, &iloc); + if (ret) + return ret; + + down_write(&EXT4_I(dir)->xattr_sem); + if (!ext4_has_inline_data(dir)) + goto out; + + inline_start = (void *)ext4_raw_inode(&iloc)->i_block + + EXT4_INLINE_DOTDOT_SIZE; + inline_size = EXT4_MIN_INLINE_DATA_SIZE - EXT4_INLINE_DOTDOT_SIZE; + + ret = ext4_add_dirent_to_inline(handle, dentry, inode, &iloc, + inline_start, inline_size); + if (ret != -ENOSPC) + goto out; + + /* check whether it can be inserted to inline xattr space. */ + inline_size = EXT4_I(dir)->i_inline_size - + EXT4_MIN_INLINE_DATA_SIZE; + if (!inline_size) { + /* Try to use the xattr space.*/ + ret = ext4_update_inline_dir(handle, dir, &iloc); + if (ret && ret != -ENOSPC) + goto out; + + inline_size = EXT4_I(dir)->i_inline_size - + EXT4_MIN_INLINE_DATA_SIZE; + } + + if (inline_size) { + inline_start = ext4_get_inline_xattr_pos(dir, &iloc); + + ret = ext4_add_dirent_to_inline(handle, dentry, inode, &iloc, + inline_start, inline_size); + + if (ret != -ENOSPC) + goto out; + } + + /* + * The inline space is filled up, so create a new block for it. + * As the extent tree will be created, we have to save the inline + * dir first. + */ + ret = ext4_convert_inline_data_nolock(handle, dir, &iloc); + +out: + ext4_mark_inode_dirty(handle, dir); + up_write(&EXT4_I(dir)->xattr_sem); + brelse(iloc.bh); + return ret; +} + +/* + * This function fills a red-black tree with information from an + * inlined dir. It returns the number directory entries loaded + * into the tree. If there is an error it is returned in err. + */ +int htree_inlinedir_to_tree(struct file *dir_file, + struct inode *dir, ext4_lblk_t block, + struct dx_hash_info *hinfo, + __u32 start_hash, __u32 start_minor_hash, + int *has_inline_data) +{ + int err = 0, count = 0; + unsigned int parent_ino; + int pos; + struct ext4_dir_entry_2 *de; + struct inode *inode = file_inode(dir_file); + int ret, inline_size = 0; + struct ext4_iloc iloc; + void *dir_buf = NULL; + struct ext4_dir_entry_2 fake; + + ret = ext4_get_inode_loc(inode, &iloc); + if (ret) + return ret; + + down_read(&EXT4_I(inode)->xattr_sem); + if (!ext4_has_inline_data(inode)) { + up_read(&EXT4_I(inode)->xattr_sem); + *has_inline_data = 0; + goto out; + } + + inline_size = ext4_get_inline_size(inode); + dir_buf = kmalloc(inline_size, GFP_NOFS); + if (!dir_buf) { + ret = -ENOMEM; + up_read(&EXT4_I(inode)->xattr_sem); + goto out; + } + + ret = ext4_read_inline_data(inode, dir_buf, inline_size, &iloc); + up_read(&EXT4_I(inode)->xattr_sem); + if (ret < 0) + goto out; + + pos = 0; + parent_ino = le32_to_cpu(((struct ext4_dir_entry_2 *)dir_buf)->inode); + while (pos < inline_size) { + /* + * As inlined dir doesn't store any information about '.' and + * only the inode number of '..' is stored, we have to handle + * them differently. + */ + if (pos == 0) { + fake.inode = cpu_to_le32(inode->i_ino); + fake.name_len = 1; + strcpy(fake.name, "."); + fake.rec_len = ext4_rec_len_to_disk( + EXT4_DIR_REC_LEN(fake.name_len), + inline_size); + ext4_set_de_type(inode->i_sb, &fake, S_IFDIR); + de = &fake; + pos = EXT4_INLINE_DOTDOT_OFFSET; + } else if (pos == EXT4_INLINE_DOTDOT_OFFSET) { + fake.inode = cpu_to_le32(parent_ino); + fake.name_len = 2; + strcpy(fake.name, ".."); + fake.rec_len = ext4_rec_len_to_disk( + EXT4_DIR_REC_LEN(fake.name_len), + inline_size); + ext4_set_de_type(inode->i_sb, &fake, S_IFDIR); + de = &fake; + pos = EXT4_INLINE_DOTDOT_SIZE; + } else { + de = (struct ext4_dir_entry_2 *)(dir_buf + pos); + pos += ext4_rec_len_from_disk(de->rec_len, inline_size); + if (ext4_check_dir_entry(inode, dir_file, de, + iloc.bh, dir_buf, + inline_size, pos)) { + ret = count; + goto out; + } + } + + ext4fs_dirhash(de->name, de->name_len, hinfo); + if ((hinfo->hash < start_hash) || + ((hinfo->hash == start_hash) && + (hinfo->minor_hash < start_minor_hash))) + continue; + if (de->inode == 0) + continue; + err = ext4_htree_store_dirent(dir_file, + hinfo->hash, hinfo->minor_hash, de); + if (err) { + count = err; + goto out; + } + count++; + } + ret = count; +out: + kfree(dir_buf); + brelse(iloc.bh); + return ret; +} + +/* + * So this function is called when the volume is mkfsed with + * dir_index disabled. In order to keep f_pos persistent + * after we convert from an inlined dir to a blocked based, + * we just pretend that we are a normal dir and return the + * offset as if '.' and '..' really take place. + * + */ +int ext4_read_inline_dir(struct file *file, + struct dir_context *ctx, + int *has_inline_data) +{ + unsigned int offset, parent_ino; + int i; + struct ext4_dir_entry_2 *de; + struct super_block *sb; + struct inode *inode = file_inode(file); + int ret, inline_size = 0; + struct ext4_iloc iloc; + void *dir_buf = NULL; + int dotdot_offset, dotdot_size, extra_offset, extra_size; + + ret = ext4_get_inode_loc(inode, &iloc); + if (ret) + return ret; + + down_read(&EXT4_I(inode)->xattr_sem); + if (!ext4_has_inline_data(inode)) { + up_read(&EXT4_I(inode)->xattr_sem); + *has_inline_data = 0; + goto out; + } + + inline_size = ext4_get_inline_size(inode); + dir_buf = kmalloc(inline_size, GFP_NOFS); + if (!dir_buf) { + ret = -ENOMEM; + up_read(&EXT4_I(inode)->xattr_sem); + goto out; + } + + ret = ext4_read_inline_data(inode, dir_buf, inline_size, &iloc); + up_read(&EXT4_I(inode)->xattr_sem); + if (ret < 0) + goto out; + + ret = 0; + sb = inode->i_sb; + parent_ino = le32_to_cpu(((struct ext4_dir_entry_2 *)dir_buf)->inode); + offset = ctx->pos; + + /* + * dotdot_offset and dotdot_size is the real offset and + * size for ".." and "." if the dir is block based while + * the real size for them are only EXT4_INLINE_DOTDOT_SIZE. + * So we will use extra_offset and extra_size to indicate them + * during the inline dir iteration. + */ + dotdot_offset = EXT4_DIR_REC_LEN(1); + dotdot_size = dotdot_offset + EXT4_DIR_REC_LEN(2); + extra_offset = dotdot_size - EXT4_INLINE_DOTDOT_SIZE; + extra_size = extra_offset + inline_size; + + /* + * If the version has changed since the last call to + * readdir(2), then we might be pointing to an invalid + * dirent right now. Scan from the start of the inline + * dir to make sure. + */ + if (file->f_version != inode->i_version) { + for (i = 0; i < extra_size && i < offset;) { + /* + * "." is with offset 0 and + * ".." is dotdot_offset. + */ + if (!i) { + i = dotdot_offset; + continue; + } else if (i == dotdot_offset) { + i = dotdot_size; + continue; + } + /* for other entry, the real offset in + * the buf has to be tuned accordingly. + */ + de = (struct ext4_dir_entry_2 *) + (dir_buf + i - extra_offset); + /* It's too expensive to do a full + * dirent test each time round this + * loop, but we do have to test at + * least that it is non-zero. A + * failure will be detected in the + * dirent test below. */ + if (ext4_rec_len_from_disk(de->rec_len, extra_size) + < EXT4_DIR_REC_LEN(1)) + break; + i += ext4_rec_len_from_disk(de->rec_len, + extra_size); + } + offset = i; + ctx->pos = offset; + file->f_version = inode->i_version; + } + + while (ctx->pos < extra_size) { + if (ctx->pos == 0) { + if (!dir_emit(ctx, ".", 1, inode->i_ino, DT_DIR)) + goto out; + ctx->pos = dotdot_offset; + continue; + } + + if (ctx->pos == dotdot_offset) { + if (!dir_emit(ctx, "..", 2, parent_ino, DT_DIR)) + goto out; + ctx->pos = dotdot_size; + continue; + } + + de = (struct ext4_dir_entry_2 *) + (dir_buf + ctx->pos - extra_offset); + if (ext4_check_dir_entry(inode, file, de, iloc.bh, dir_buf, + extra_size, ctx->pos)) + goto out; + if (le32_to_cpu(de->inode)) { + if (!dir_emit(ctx, de->name, de->name_len, + le32_to_cpu(de->inode), + get_dtype(sb, de->file_type))) + goto out; + } + ctx->pos += ext4_rec_len_from_disk(de->rec_len, extra_size); + } +out: + kfree(dir_buf); + brelse(iloc.bh); + return ret; +} + +struct buffer_head *ext4_get_first_inline_block(struct inode *inode, + struct ext4_dir_entry_2 **parent_de, + int *retval) +{ + struct ext4_iloc iloc; + + *retval = ext4_get_inode_loc(inode, &iloc); + if (*retval) + return NULL; + + *parent_de = (struct ext4_dir_entry_2 *)ext4_raw_inode(&iloc)->i_block; + + return iloc.bh; +} + +/* + * Try to create the inline data for the new dir. + * If it succeeds, return 0, otherwise return the error. + * In case of ENOSPC, the caller should create the normal disk layout dir. + */ +int ext4_try_create_inline_dir(handle_t *handle, struct inode *parent, + struct inode *inode) +{ + int ret, inline_size = EXT4_MIN_INLINE_DATA_SIZE; + struct ext4_iloc iloc; + struct ext4_dir_entry_2 *de; + + ret = ext4_get_inode_loc(inode, &iloc); + if (ret) + return ret; + + ret = ext4_prepare_inline_data(handle, inode, inline_size); + if (ret) + goto out; + + /* + * For inline dir, we only save the inode information for the ".." + * and create a fake dentry to cover the left space. + */ + de = (struct ext4_dir_entry_2 *)ext4_raw_inode(&iloc)->i_block; + de->inode = cpu_to_le32(parent->i_ino); + de = (struct ext4_dir_entry_2 *)((void *)de + EXT4_INLINE_DOTDOT_SIZE); + de->inode = 0; + de->rec_len = ext4_rec_len_to_disk( + inline_size - EXT4_INLINE_DOTDOT_SIZE, + inline_size); + set_nlink(inode, 2); + inode->i_size = EXT4_I(inode)->i_disksize = inline_size; +out: + brelse(iloc.bh); + return ret; +} + +struct buffer_head *ext4_find_inline_entry(struct inode *dir, + const struct qstr *d_name, + struct ext4_dir_entry_2 **res_dir, + int *has_inline_data) +{ + int ret; + struct ext4_iloc iloc; + void *inline_start; + int inline_size; + + if (ext4_get_inode_loc(dir, &iloc)) + return NULL; + + down_read(&EXT4_I(dir)->xattr_sem); + if (!ext4_has_inline_data(dir)) { + *has_inline_data = 0; + goto out; + } + + inline_start = (void *)ext4_raw_inode(&iloc)->i_block + + EXT4_INLINE_DOTDOT_SIZE; + inline_size = EXT4_MIN_INLINE_DATA_SIZE - EXT4_INLINE_DOTDOT_SIZE; + ret = search_dir(iloc.bh, inline_start, inline_size, + dir, d_name, 0, res_dir); + if (ret == 1) + goto out_find; + if (ret < 0) + goto out; + + if (ext4_get_inline_size(dir) == EXT4_MIN_INLINE_DATA_SIZE) + goto out; + + inline_start = ext4_get_inline_xattr_pos(dir, &iloc); + inline_size = ext4_get_inline_size(dir) - EXT4_MIN_INLINE_DATA_SIZE; + + ret = search_dir(iloc.bh, inline_start, inline_size, + dir, d_name, 0, res_dir); + if (ret == 1) + goto out_find; + +out: + brelse(iloc.bh); + iloc.bh = NULL; +out_find: + up_read(&EXT4_I(dir)->xattr_sem); + return iloc.bh; +} + +int ext4_delete_inline_entry(handle_t *handle, + struct inode *dir, + struct ext4_dir_entry_2 *de_del, + struct buffer_head *bh, + int *has_inline_data) +{ + int err, inline_size; + struct ext4_iloc iloc; + void *inline_start; + + err = ext4_get_inode_loc(dir, &iloc); + if (err) + return err; + + down_write(&EXT4_I(dir)->xattr_sem); + if (!ext4_has_inline_data(dir)) { + *has_inline_data = 0; + goto out; + } + + if ((void *)de_del - ((void *)ext4_raw_inode(&iloc)->i_block) < + EXT4_MIN_INLINE_DATA_SIZE) { + inline_start = (void *)ext4_raw_inode(&iloc)->i_block + + EXT4_INLINE_DOTDOT_SIZE; + inline_size = EXT4_MIN_INLINE_DATA_SIZE - + EXT4_INLINE_DOTDOT_SIZE; + } else { + inline_start = ext4_get_inline_xattr_pos(dir, &iloc); + inline_size = ext4_get_inline_size(dir) - + EXT4_MIN_INLINE_DATA_SIZE; + } + + BUFFER_TRACE(bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, bh); + if (err) + goto out; + + err = ext4_generic_delete_entry(handle, dir, de_del, bh, + inline_start, inline_size, 0); + if (err) + goto out; + + BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); + err = ext4_mark_inode_dirty(handle, dir); + if (unlikely(err)) + goto out; + + ext4_show_inline_dir(dir, iloc.bh, inline_start, inline_size); +out: + up_write(&EXT4_I(dir)->xattr_sem); + brelse(iloc.bh); + if (err != -ENOENT) + ext4_std_error(dir->i_sb, err); + return err; +} + +/* + * Get the inline dentry at offset. + */ +static inline struct ext4_dir_entry_2 * +ext4_get_inline_entry(struct inode *inode, + struct ext4_iloc *iloc, + unsigned int offset, + void **inline_start, + int *inline_size) +{ + void *inline_pos; + + BUG_ON(offset > ext4_get_inline_size(inode)); + + if (offset < EXT4_MIN_INLINE_DATA_SIZE) { + inline_pos = (void *)ext4_raw_inode(iloc)->i_block; + *inline_size = EXT4_MIN_INLINE_DATA_SIZE; + } else { + inline_pos = ext4_get_inline_xattr_pos(inode, iloc); + offset -= EXT4_MIN_INLINE_DATA_SIZE; + *inline_size = ext4_get_inline_size(inode) - + EXT4_MIN_INLINE_DATA_SIZE; + } + + if (inline_start) + *inline_start = inline_pos; + return (struct ext4_dir_entry_2 *)(inline_pos + offset); +} + +int empty_inline_dir(struct inode *dir, int *has_inline_data) +{ + int err, inline_size; + struct ext4_iloc iloc; + void *inline_pos; + unsigned int offset; + struct ext4_dir_entry_2 *de; + int ret = 1; + + err = ext4_get_inode_loc(dir, &iloc); + if (err) { + EXT4_ERROR_INODE(dir, "error %d getting inode %lu block", + err, dir->i_ino); + return 1; + } + + down_read(&EXT4_I(dir)->xattr_sem); + if (!ext4_has_inline_data(dir)) { + *has_inline_data = 0; + goto out; + } + + de = (struct ext4_dir_entry_2 *)ext4_raw_inode(&iloc)->i_block; + if (!le32_to_cpu(de->inode)) { + ext4_warning(dir->i_sb, + "bad inline directory (dir #%lu) - no `..'", + dir->i_ino); + ret = 1; + goto out; + } + + offset = EXT4_INLINE_DOTDOT_SIZE; + while (offset < dir->i_size) { + de = ext4_get_inline_entry(dir, &iloc, offset, + &inline_pos, &inline_size); + if (ext4_check_dir_entry(dir, NULL, de, + iloc.bh, inline_pos, + inline_size, offset)) { + ext4_warning(dir->i_sb, + "bad inline directory (dir #%lu) - " + "inode %u, rec_len %u, name_len %d" + "inline size %d\n", + dir->i_ino, le32_to_cpu(de->inode), + le16_to_cpu(de->rec_len), de->name_len, + inline_size); + ret = 1; + goto out; + } + if (le32_to_cpu(de->inode)) { + ret = 0; + goto out; + } + offset += ext4_rec_len_from_disk(de->rec_len, inline_size); + } + +out: + up_read(&EXT4_I(dir)->xattr_sem); + brelse(iloc.bh); + return ret; +} + +int ext4_destroy_inline_data(handle_t *handle, struct inode *inode) +{ + int ret; + + down_write(&EXT4_I(inode)->xattr_sem); + ret = ext4_destroy_inline_data_nolock(handle, inode); + up_write(&EXT4_I(inode)->xattr_sem); + + return ret; +} + +int ext4_inline_data_fiemap(struct inode *inode, + struct fiemap_extent_info *fieinfo, + int *has_inline) +{ + __u64 physical = 0; + __u64 length; + __u32 flags = FIEMAP_EXTENT_DATA_INLINE | FIEMAP_EXTENT_LAST; + int error = 0; + struct ext4_iloc iloc; + + down_read(&EXT4_I(inode)->xattr_sem); + if (!ext4_has_inline_data(inode)) { + *has_inline = 0; + goto out; + } + + error = ext4_get_inode_loc(inode, &iloc); + if (error) + goto out; + + physical = (__u64)iloc.bh->b_blocknr << inode->i_sb->s_blocksize_bits; + physical += (char *)ext4_raw_inode(&iloc) - iloc.bh->b_data; + physical += offsetof(struct ext4_inode, i_block); + length = i_size_read(inode); + + if (physical) + error = fiemap_fill_next_extent(fieinfo, 0, physical, + length, flags); + brelse(iloc.bh); +out: + up_read(&EXT4_I(inode)->xattr_sem); + return (error < 0 ? error : 0); +} + +/* + * Called during xattr set, and if we can sparse space 'needed', + * just create the extent tree evict the data to the outer block. + * + * We use jbd2 instead of page cache to move data to the 1st block + * so that the whole transaction can be committed as a whole and + * the data isn't lost because of the delayed page cache write. + */ +int ext4_try_to_evict_inline_data(handle_t *handle, + struct inode *inode, + int needed) +{ + int error; + struct ext4_xattr_entry *entry; + struct ext4_inode *raw_inode; + struct ext4_iloc iloc; + + error = ext4_get_inode_loc(inode, &iloc); + if (error) + return error; + + raw_inode = ext4_raw_inode(&iloc); + entry = (struct ext4_xattr_entry *)((void *)raw_inode + + EXT4_I(inode)->i_inline_off); + if (EXT4_XATTR_LEN(entry->e_name_len) + + EXT4_XATTR_SIZE(le32_to_cpu(entry->e_value_size)) < needed) { + error = -ENOSPC; + goto out; + } + + error = ext4_convert_inline_data_nolock(handle, inode, &iloc); +out: + brelse(iloc.bh); + return error; +} + +void ext4_inline_data_truncate(struct inode *inode, int *has_inline) +{ + handle_t *handle; + int inline_size, value_len, needed_blocks; + size_t i_size; + void *value = NULL; + struct ext4_xattr_ibody_find is = { + .s = { .not_found = -ENODATA, }, + }; + struct ext4_xattr_info i = { + .name_index = EXT4_XATTR_INDEX_SYSTEM, + .name = EXT4_XATTR_SYSTEM_DATA, + }; + + + needed_blocks = ext4_writepage_trans_blocks(inode); + handle = ext4_journal_start(inode, EXT4_HT_INODE, needed_blocks); + if (IS_ERR(handle)) + return; + + down_write(&EXT4_I(inode)->xattr_sem); + if (!ext4_has_inline_data(inode)) { + *has_inline = 0; + ext4_journal_stop(handle); + return; + } + + if (ext4_orphan_add(handle, inode)) + goto out; + + if (ext4_get_inode_loc(inode, &is.iloc)) + goto out; + + down_write(&EXT4_I(inode)->i_data_sem); + i_size = inode->i_size; + inline_size = ext4_get_inline_size(inode); + EXT4_I(inode)->i_disksize = i_size; + + if (i_size < inline_size) { + /* Clear the content in the xattr space. */ + if (inline_size > EXT4_MIN_INLINE_DATA_SIZE) { + if (ext4_xattr_ibody_find(inode, &i, &is)) + goto out_error; + + BUG_ON(is.s.not_found); + + value_len = le32_to_cpu(is.s.here->e_value_size); + value = kmalloc(value_len, GFP_NOFS); + if (!value) + goto out_error; + + if (ext4_xattr_ibody_get(inode, i.name_index, i.name, + value, value_len)) + goto out_error; + + i.value = value; + i.value_len = i_size > EXT4_MIN_INLINE_DATA_SIZE ? + i_size - EXT4_MIN_INLINE_DATA_SIZE : 0; + if (ext4_xattr_ibody_inline_set(handle, inode, &i, &is)) + goto out_error; + } + + /* Clear the content within i_blocks. */ + if (i_size < EXT4_MIN_INLINE_DATA_SIZE) { + void *p = (void *) ext4_raw_inode(&is.iloc)->i_block; + memset(p + i_size, 0, + EXT4_MIN_INLINE_DATA_SIZE - i_size); + } + + EXT4_I(inode)->i_inline_size = i_size < + EXT4_MIN_INLINE_DATA_SIZE ? + EXT4_MIN_INLINE_DATA_SIZE : i_size; + } + +out_error: + up_write(&EXT4_I(inode)->i_data_sem); +out: + brelse(is.iloc.bh); + up_write(&EXT4_I(inode)->xattr_sem); + kfree(value); + if (inode->i_nlink) + ext4_orphan_del(handle, inode); + + inode->i_mtime = inode->i_ctime = ext4_current_time(inode); + ext4_mark_inode_dirty(handle, inode); + if (IS_SYNC(inode)) + ext4_handle_sync(handle); + + ext4_journal_stop(handle); + return; +} + +int ext4_convert_inline_data(struct inode *inode) +{ + int error, needed_blocks; + handle_t *handle; + struct ext4_iloc iloc; + + if (!ext4_has_inline_data(inode)) { + ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); + return 0; + } + + needed_blocks = ext4_writepage_trans_blocks(inode); + + iloc.bh = NULL; + error = ext4_get_inode_loc(inode, &iloc); + if (error) + return error; + + handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, needed_blocks); + if (IS_ERR(handle)) { + error = PTR_ERR(handle); + goto out_free; + } + + down_write(&EXT4_I(inode)->xattr_sem); + if (!ext4_has_inline_data(inode)) { + up_write(&EXT4_I(inode)->xattr_sem); + goto out; + } + + error = ext4_convert_inline_data_nolock(handle, inode, &iloc); + up_write(&EXT4_I(inode)->xattr_sem); +out: + ext4_journal_stop(handle); +out_free: + brelse(iloc.bh); + return error; +} diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index feaa82fe629..8a064734e6e 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -37,6 +37,8 @@ #include <linux/printk.h> #include <linux/slab.h> #include <linux/ratelimit.h> +#include <linux/aio.h> +#include <linux/bitops.h> #include "ext4_jbd2.h" #include "xattr.h" @@ -47,6 +49,73 @@ #define MPAGE_DA_EXTENT_TAIL 0x01 +static __u32 ext4_inode_csum(struct inode *inode, struct ext4_inode *raw, + struct ext4_inode_info *ei) +{ + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + __u16 csum_lo; + __u16 csum_hi = 0; + __u32 csum; + + csum_lo = le16_to_cpu(raw->i_checksum_lo); + raw->i_checksum_lo = 0; + if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE && + EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi)) { + csum_hi = le16_to_cpu(raw->i_checksum_hi); + raw->i_checksum_hi = 0; + } + + csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)raw, + EXT4_INODE_SIZE(inode->i_sb)); + + raw->i_checksum_lo = cpu_to_le16(csum_lo); + if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE && + EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi)) + raw->i_checksum_hi = cpu_to_le16(csum_hi); + + return csum; +} + +static int ext4_inode_csum_verify(struct inode *inode, struct ext4_inode *raw, + struct ext4_inode_info *ei) +{ + __u32 provided, calculated; + + if (EXT4_SB(inode->i_sb)->s_es->s_creator_os != + cpu_to_le32(EXT4_OS_LINUX) || + !EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + return 1; + + provided = le16_to_cpu(raw->i_checksum_lo); + calculated = ext4_inode_csum(inode, raw, ei); + if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE && + EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi)) + provided |= ((__u32)le16_to_cpu(raw->i_checksum_hi)) << 16; + else + calculated &= 0xFFFF; + + return provided == calculated; +} + +static void ext4_inode_csum_set(struct inode *inode, struct ext4_inode *raw, + struct ext4_inode_info *ei) +{ + __u32 csum; + + if (EXT4_SB(inode->i_sb)->s_es->s_creator_os != + cpu_to_le32(EXT4_OS_LINUX) || + !EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + return; + + csum = ext4_inode_csum(inode, raw, ei); + raw->i_checksum_lo = cpu_to_le16(csum & 0xFFFF); + if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE && + EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi)) + raw->i_checksum_hi = cpu_to_le16(csum >> 16); +} + static inline int ext4_begin_ordered_truncate(struct inode *inode, loff_t new_size) { @@ -64,24 +133,23 @@ static inline int ext4_begin_ordered_truncate(struct inode *inode, new_size); } -static void ext4_invalidatepage(struct page *page, unsigned long offset); -static int noalloc_get_block_write(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int create); -static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode); -static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate); +static void ext4_invalidatepage(struct page *page, unsigned int offset, + unsigned int length); static int __ext4_journalled_writepage(struct page *page, unsigned int len); static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh); -static int ext4_discard_partial_page_buffers_no_lock(handle_t *handle, - struct inode *inode, struct page *page, loff_t from, - loff_t length, int flags); +static int ext4_meta_trans_blocks(struct inode *inode, int lblocks, + int pextents); /* * Test whether an inode is a fast symlink. */ static int ext4_inode_is_fast_symlink(struct inode *inode) { - int ea_blocks = EXT4_I(inode)->i_file_acl ? - (inode->i_sb->s_blocksize >> 9) : 0; + int ea_blocks = EXT4_I(inode)->i_file_acl ? + EXT4_CLUSTER_SIZE(inode->i_sb) >> 9 : 0; + + if (ext4_has_inline_data(inode)) + return 0; return (S_ISLNK(inode->i_mode) && inode->i_blocks - ea_blocks == 0); } @@ -122,8 +190,6 @@ void ext4_evict_inode(struct inode *inode) trace_ext4_evict_inode(inode); - ext4_ioend_wait(inode); - if (inode->i_nlink) { /* * When journalling data dirty buffers are tracked only in the @@ -144,15 +210,17 @@ void ext4_evict_inode(struct inode *inode) * don't use page cache. */ if (ext4_should_journal_data(inode) && - (S_ISLNK(inode->i_mode) || S_ISREG(inode->i_mode))) { + (S_ISLNK(inode->i_mode) || S_ISREG(inode->i_mode)) && + inode->i_ino != EXT4_JOURNAL_INO) { journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; tid_t commit_tid = EXT4_I(inode)->i_datasync_tid; - jbd2_log_start_commit(journal, commit_tid); - jbd2_log_wait_commit(journal, commit_tid); + jbd2_complete_transaction(journal, commit_tid); filemap_write_and_wait(&inode->i_data); } - truncate_inode_pages(&inode->i_data, 0); + truncate_inode_pages_final(&inode->i_data); + + WARN_ON(atomic_read(&EXT4_I(inode)->i_ioend_count)); goto no_delete; } @@ -161,12 +229,19 @@ void ext4_evict_inode(struct inode *inode) if (ext4_should_order_data(inode)) ext4_begin_ordered_truncate(inode, 0); - truncate_inode_pages(&inode->i_data, 0); + truncate_inode_pages_final(&inode->i_data); + WARN_ON(atomic_read(&EXT4_I(inode)->i_ioend_count)); if (is_bad_inode(inode)) goto no_delete; - handle = ext4_journal_start(inode, ext4_blocks_for_truncate(inode)+3); + /* + * Protect us against freezing - iput() caller didn't have to have any + * protection against it + */ + sb_start_intwrite(inode->i_sb); + handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, + ext4_blocks_for_truncate(inode)+3); if (IS_ERR(handle)) { ext4_std_error(inode->i_sb, PTR_ERR(handle)); /* @@ -175,6 +250,7 @@ void ext4_evict_inode(struct inode *inode) * cleaned up. */ ext4_orphan_del(NULL, inode); + sb_end_intwrite(inode->i_sb); goto no_delete; } @@ -206,6 +282,7 @@ void ext4_evict_inode(struct inode *inode) stop_handle: ext4_journal_stop(handle); ext4_orphan_del(NULL, inode); + sb_end_intwrite(inode->i_sb); goto no_delete; } } @@ -234,6 +311,7 @@ void ext4_evict_inode(struct inode *inode) else ext4_free_inode(handle, inode); ext4_journal_stop(handle); + sb_end_intwrite(inode->i_sb); return; no_delete: ext4_clear_inode(inode); /* We must guarantee clearing of inode... */ @@ -271,14 +349,25 @@ void ext4_da_update_reserve_space(struct inode *inode, spin_lock(&ei->i_block_reservation_lock); trace_ext4_da_update_reserve_space(inode, used, quota_claim); if (unlikely(used > ei->i_reserved_data_blocks)) { - ext4_msg(inode->i_sb, KERN_NOTICE, "%s: ino %lu, used %d " - "with only %d reserved data blocks\n", + ext4_warning(inode->i_sb, "%s: ino %lu, used %d " + "with only %d reserved data blocks", __func__, inode->i_ino, used, ei->i_reserved_data_blocks); WARN_ON(1); used = ei->i_reserved_data_blocks; } + if (unlikely(ei->i_allocated_meta_blocks > ei->i_reserved_meta_blocks)) { + ext4_warning(inode->i_sb, "ino %lu, allocated %d " + "with only %d reserved metadata blocks " + "(releasing %d blocks with reserved %d data blocks)", + inode->i_ino, ei->i_allocated_meta_blocks, + ei->i_reserved_meta_blocks, used, + ei->i_reserved_data_blocks); + WARN_ON(1); + ei->i_allocated_meta_blocks = ei->i_reserved_meta_blocks; + } + /* Update per-inode reservations */ ei->i_reserved_data_blocks -= used; ei->i_reserved_meta_blocks -= ei->i_allocated_meta_blocks; @@ -339,108 +428,57 @@ static int __check_block_validity(struct inode *inode, const char *func, #define check_block_validity(inode, map) \ __check_block_validity((inode), __func__, __LINE__, (map)) -/* - * Return the number of contiguous dirty pages in a given inode - * starting at page frame idx. - */ -static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx, - unsigned int max_pages) +#ifdef ES_AGGRESSIVE_TEST +static void ext4_map_blocks_es_recheck(handle_t *handle, + struct inode *inode, + struct ext4_map_blocks *es_map, + struct ext4_map_blocks *map, + int flags) { - struct address_space *mapping = inode->i_mapping; - pgoff_t index; - struct pagevec pvec; - pgoff_t num = 0; - int i, nr_pages, done = 0; - - if (max_pages == 0) - return 0; - pagevec_init(&pvec, 0); - while (!done) { - index = idx; - nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, - PAGECACHE_TAG_DIRTY, - (pgoff_t)PAGEVEC_SIZE); - if (nr_pages == 0) - break; - for (i = 0; i < nr_pages; i++) { - struct page *page = pvec.pages[i]; - struct buffer_head *bh, *head; + int retval; - lock_page(page); - if (unlikely(page->mapping != mapping) || - !PageDirty(page) || - PageWriteback(page) || - page->index != idx) { - done = 1; - unlock_page(page); - break; - } - if (page_has_buffers(page)) { - bh = head = page_buffers(page); - do { - if (!buffer_delay(bh) && - !buffer_unwritten(bh)) - done = 1; - bh = bh->b_this_page; - } while (!done && (bh != head)); - } - unlock_page(page); - if (done) - break; - idx++; - num++; - if (num >= max_pages) { - done = 1; - break; - } - } - pagevec_release(&pvec); + map->m_flags = 0; + /* + * There is a race window that the result is not the same. + * e.g. xfstests #223 when dioread_nolock enables. The reason + * is that we lookup a block mapping in extent status tree with + * out taking i_data_sem. So at the time the unwritten extent + * could be converted. + */ + if (!(flags & EXT4_GET_BLOCKS_NO_LOCK)) + down_read(&EXT4_I(inode)->i_data_sem); + if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { + retval = ext4_ext_map_blocks(handle, inode, map, flags & + EXT4_GET_BLOCKS_KEEP_SIZE); + } else { + retval = ext4_ind_map_blocks(handle, inode, map, flags & + EXT4_GET_BLOCKS_KEEP_SIZE); } - return num; -} - -/* - * Sets the BH_Da_Mapped bit on the buffer heads corresponding to the given map. - */ -static void set_buffers_da_mapped(struct inode *inode, - struct ext4_map_blocks *map) -{ - struct address_space *mapping = inode->i_mapping; - struct pagevec pvec; - int i, nr_pages; - pgoff_t index, end; - - index = map->m_lblk >> (PAGE_CACHE_SHIFT - inode->i_blkbits); - end = (map->m_lblk + map->m_len - 1) >> - (PAGE_CACHE_SHIFT - inode->i_blkbits); - - pagevec_init(&pvec, 0); - while (index <= end) { - nr_pages = pagevec_lookup(&pvec, mapping, index, - min(end - index + 1, - (pgoff_t)PAGEVEC_SIZE)); - if (nr_pages == 0) - break; - for (i = 0; i < nr_pages; i++) { - struct page *page = pvec.pages[i]; - struct buffer_head *bh, *head; - - if (unlikely(page->mapping != mapping) || - !PageDirty(page)) - break; + if (!(flags & EXT4_GET_BLOCKS_NO_LOCK)) + up_read((&EXT4_I(inode)->i_data_sem)); + /* + * Clear EXT4_MAP_FROM_CLUSTER and EXT4_MAP_BOUNDARY flag + * because it shouldn't be marked in es_map->m_flags. + */ + map->m_flags &= ~(EXT4_MAP_FROM_CLUSTER | EXT4_MAP_BOUNDARY); - if (page_has_buffers(page)) { - bh = head = page_buffers(page); - do { - set_buffer_da_mapped(bh); - bh = bh->b_this_page; - } while (bh != head); - } - index++; - } - pagevec_release(&pvec); + /* + * We don't check m_len because extent will be collpased in status + * tree. So the m_len might not equal. + */ + if (es_map->m_lblk != map->m_lblk || + es_map->m_flags != map->m_flags || + es_map->m_pblk != map->m_pblk) { + printk("ES cache assertion failed for inode: %lu " + "es_cached ex [%d/%d/%llu/%x] != " + "found ex [%d/%d/%llu/%x] retval %d flags %x\n", + inode->i_ino, es_map->m_lblk, es_map->m_len, + es_map->m_pblk, es_map->m_flags, map->m_lblk, + map->m_len, map->m_pblk, map->m_flags, + retval, flags); } } +#endif /* ES_AGGRESSIVE_TEST */ /* * The ext4_map_blocks() function tries to look up the requested blocks, @@ -454,8 +492,8 @@ static void set_buffers_da_mapped(struct inode *inode, * Otherwise, call with ext4_ind_map_blocks() to handle indirect mapping * based files * - * On success, it returns the number of blocks being mapped or allocate. - * if create==0 and the blocks are pre-allocated and uninitialized block, + * On success, it returns the number of blocks being mapped or allocated. + * if create==0 and the blocks are pre-allocated and unwritten block, * the result buffer head is unmapped. If the create ==1, it will make sure * the buffer head is mapped. * @@ -467,17 +505,60 @@ static void set_buffers_da_mapped(struct inode *inode, int ext4_map_blocks(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, int flags) { + struct extent_status es; int retval; + int ret = 0; +#ifdef ES_AGGRESSIVE_TEST + struct ext4_map_blocks orig_map; + + memcpy(&orig_map, map, sizeof(*map)); +#endif map->m_flags = 0; ext_debug("ext4_map_blocks(): inode %lu, flag %d, max_blocks %u," "logical block %lu\n", inode->i_ino, flags, map->m_len, (unsigned long) map->m_lblk); + + /* + * ext4_map_blocks returns an int, and m_len is an unsigned int + */ + if (unlikely(map->m_len > INT_MAX)) + map->m_len = INT_MAX; + + /* We can handle the block number less than EXT_MAX_BLOCKS */ + if (unlikely(map->m_lblk >= EXT_MAX_BLOCKS)) + return -EIO; + + /* Lookup extent status tree firstly */ + if (ext4_es_lookup_extent(inode, map->m_lblk, &es)) { + ext4_es_lru_add(inode); + if (ext4_es_is_written(&es) || ext4_es_is_unwritten(&es)) { + map->m_pblk = ext4_es_pblock(&es) + + map->m_lblk - es.es_lblk; + map->m_flags |= ext4_es_is_written(&es) ? + EXT4_MAP_MAPPED : EXT4_MAP_UNWRITTEN; + retval = es.es_len - (map->m_lblk - es.es_lblk); + if (retval > map->m_len) + retval = map->m_len; + map->m_len = retval; + } else if (ext4_es_is_delayed(&es) || ext4_es_is_hole(&es)) { + retval = 0; + } else { + BUG_ON(1); + } +#ifdef ES_AGGRESSIVE_TEST + ext4_map_blocks_es_recheck(handle, inode, map, + &orig_map, flags); +#endif + goto found; + } + /* * Try to see if we can get the block without requesting a new * file system block. */ - down_read((&EXT4_I(inode)->i_data_sem)); + if (!(flags & EXT4_GET_BLOCKS_NO_LOCK)) + down_read(&EXT4_I(inode)->i_data_sem); if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { retval = ext4_ext_map_blocks(handle, inode, map, flags & EXT4_GET_BLOCKS_KEEP_SIZE); @@ -485,10 +566,34 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, retval = ext4_ind_map_blocks(handle, inode, map, flags & EXT4_GET_BLOCKS_KEEP_SIZE); } - up_read((&EXT4_I(inode)->i_data_sem)); + if (retval > 0) { + unsigned int status; + if (unlikely(retval != map->m_len)) { + ext4_warning(inode->i_sb, + "ES len assertion failed for inode " + "%lu: retval %d != map->m_len %d", + inode->i_ino, retval, map->m_len); + WARN_ON(1); + } + + status = map->m_flags & EXT4_MAP_UNWRITTEN ? + EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; + if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) && + ext4_find_delalloc_range(inode, map->m_lblk, + map->m_lblk + map->m_len - 1)) + status |= EXTENT_STATUS_DELAYED; + ret = ext4_es_insert_extent(inode, map->m_lblk, + map->m_len, map->m_pblk, status); + if (ret < 0) + retval = ret; + } + if (!(flags & EXT4_GET_BLOCKS_NO_LOCK)) + up_read((&EXT4_I(inode)->i_data_sem)); + +found: if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { - int ret = check_block_validity(inode, map); + ret = check_block_validity(inode, map); if (ret != 0) return ret; } @@ -505,27 +610,27 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, * with buffer head unmapped. */ if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) - return retval; + /* + * If we need to convert extent to unwritten + * we continue and do the actual work in + * ext4_ext_map_blocks() + */ + if (!(flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN)) + return retval; /* - * When we call get_blocks without the create flag, the - * BH_Unwritten flag could have gotten set if the blocks - * requested were part of a uninitialized extent. We need to - * clear this flag now that we are committed to convert all or - * part of the uninitialized extent to be an initialized - * extent. This is because we need to avoid the combination - * of BH_Unwritten and BH_Mapped flags being simultaneously - * set on the buffer_head. + * Here we clear m_flags because after allocating an new extent, + * it will be set again. */ - map->m_flags &= ~EXT4_MAP_UNWRITTEN; + map->m_flags &= ~EXT4_MAP_FLAGS; /* - * New blocks allocate and/or writing to uninitialized extent + * New blocks allocate and/or writing to unwritten extent * will possibly result in updating i_data, so we take * the write lock of i_data_sem, and call get_blocks() * with create == 1 flag. */ - down_write((&EXT4_I(inode)->i_data_sem)); + down_write(&EXT4_I(inode)->i_data_sem); /* * if the caller is from delayed allocation writeout path @@ -563,20 +668,45 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)) ext4_da_update_reserve_space(inode, retval, 1); } - if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) { + if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) ext4_clear_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED); - /* If we have successfully mapped the delayed allocated blocks, - * set the BH_Da_Mapped bit on them. Its important to do this - * under the protection of i_data_sem. + if (retval > 0) { + unsigned int status; + + if (unlikely(retval != map->m_len)) { + ext4_warning(inode->i_sb, + "ES len assertion failed for inode " + "%lu: retval %d != map->m_len %d", + inode->i_ino, retval, map->m_len); + WARN_ON(1); + } + + /* + * If the extent has been zeroed out, we don't need to update + * extent status tree. */ - if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) - set_buffers_da_mapped(inode, map); + if ((flags & EXT4_GET_BLOCKS_PRE_IO) && + ext4_es_lookup_extent(inode, map->m_lblk, &es)) { + if (ext4_es_is_written(&es)) + goto has_zeroout; + } + status = map->m_flags & EXT4_MAP_UNWRITTEN ? + EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; + if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) && + ext4_find_delalloc_range(inode, map->m_lblk, + map->m_lblk + map->m_len - 1)) + status |= EXTENT_STATUS_DELAYED; + ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len, + map->m_pblk, status); + if (ret < 0) + retval = ret; } +has_zeroout: up_write((&EXT4_I(inode)->i_data_sem)); if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { - int ret = check_block_validity(inode, map); + ret = check_block_validity(inode, map); if (ret != 0) return ret; } @@ -594,15 +724,19 @@ static int _ext4_get_block(struct inode *inode, sector_t iblock, int ret = 0, started = 0; int dio_credits; + if (ext4_has_inline_data(inode)) + return -ERANGE; + map.m_lblk = iblock; map.m_len = bh->b_size >> inode->i_blkbits; - if (flags && !handle) { + if (flags && !(flags & EXT4_GET_BLOCKS_NO_LOCK) && !handle) { /* Direct IO write... */ if (map.m_len > DIO_MAX_BLOCKS) map.m_len = DIO_MAX_BLOCKS; dio_credits = ext4_chunk_trans_blocks(inode, map.m_len); - handle = ext4_journal_start(inode, dio_credits); + handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, + dio_credits); if (IS_ERR(handle)) { ret = PTR_ERR(handle); return ret; @@ -612,8 +746,12 @@ static int _ext4_get_block(struct inode *inode, sector_t iblock, ret = ext4_map_blocks(handle, inode, &map, flags); if (ret > 0) { + ext4_io_end_t *io_end = ext4_inode_aio(inode); + map_bh(bh, inode->i_sb, map.m_pblk); bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags; + if (io_end && io_end->flag & EXT4_IO_END_UNWRITTEN) + set_buffer_defer_completion(bh); bh->b_size = inode->i_sb->s_blocksize * map.m_len; ret = 0; } @@ -646,15 +784,19 @@ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode, err = ext4_map_blocks(handle, inode, &map, create ? EXT4_GET_BLOCKS_CREATE : 0); + /* ensure we send some value back into *errp */ + *errp = 0; + + if (create && err == 0) + err = -ENOSPC; /* should never happen */ if (err < 0) *errp = err; if (err <= 0) return NULL; - *errp = 0; bh = sb_getblk(inode->i_sb, map.m_pblk); - if (!bh) { - *errp = -EIO; + if (unlikely(!bh)) { + *errp = -ENOMEM; return NULL; } if (map.m_flags & EXT4_MAP_NEW) { @@ -710,13 +852,13 @@ struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode, return NULL; } -static int walk_page_buffers(handle_t *handle, - struct buffer_head *head, - unsigned from, - unsigned to, - int *partial, - int (*fn)(handle_t *handle, - struct buffer_head *bh)) +int ext4_walk_page_buffers(handle_t *handle, + struct buffer_head *head, + unsigned from, + unsigned to, + int *partial, + int (*fn)(handle_t *handle, + struct buffer_head *bh)) { struct buffer_head *bh; unsigned block_start, block_end; @@ -748,11 +890,10 @@ static int walk_page_buffers(handle_t *handle, * and the commit_write(). So doing the jbd2_journal_start at the start of * prepare_write() is the right place. * - * Also, this function can nest inside ext4_writepage() -> - * block_write_full_page(). In that case, we *know* that ext4_writepage() - * has generated enough buffer credits to do the whole page. So we won't - * block on the journal in that case, which is good, because the caller may - * be PF_MEMALLOC. + * Also, this function can nest inside ext4_writepage(). In that case, we + * *know* that ext4_writepage() has generated enough buffer credits to do the + * whole page. So we won't block on the journal in that case, which is good, + * because the caller may be PF_MEMALLOC. * * By accident, ext4 can be reentered when a transaction is open via * quota file writes. If we were to commit the transaction while thus @@ -766,8 +907,8 @@ static int walk_page_buffers(handle_t *handle, * is elevated. We'll still have enough credits for the tiny quotafile * write. */ -static int do_journal_get_write_access(handle_t *handle, - struct buffer_head *bh) +int do_journal_get_write_access(handle_t *handle, + struct buffer_head *bh) { int dirty = buffer_dirty(bh); int ret; @@ -784,13 +925,14 @@ static int do_journal_get_write_access(handle_t *handle, */ if (dirty) clear_buffer_dirty(bh); + BUFFER_TRACE(bh, "get write access"); ret = ext4_journal_get_write_access(handle, bh); if (!ret && dirty) ret = ext4_handle_dirty_metadata(handle, NULL, bh); return ret; } -static int ext4_get_block_write(struct inode *inode, sector_t iblock, +static int ext4_get_block_write_nolock(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create); static int ext4_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, @@ -814,24 +956,45 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping, from = pos & (PAGE_CACHE_SIZE - 1); to = from + len; -retry: - handle = ext4_journal_start(inode, needed_blocks); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - goto out; + if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) { + ret = ext4_try_to_write_inline_data(mapping, inode, pos, len, + flags, pagep); + if (ret < 0) + return ret; + if (ret == 1) + return 0; } - /* We cannot recurse into the filesystem as the transaction is already - * started */ - flags |= AOP_FLAG_NOFS; - + /* + * grab_cache_page_write_begin() can take a long time if the + * system is thrashing due to memory pressure, or if the page + * is being written back. So grab it first before we start + * the transaction handle. This also allows us to allocate + * the page (if needed) without using GFP_NOFS. + */ +retry_grab: page = grab_cache_page_write_begin(mapping, index, flags); - if (!page) { + if (!page) + return -ENOMEM; + unlock_page(page); + +retry_journal: + handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, needed_blocks); + if (IS_ERR(handle)) { + page_cache_release(page); + return PTR_ERR(handle); + } + + lock_page(page); + if (page->mapping != mapping) { + /* The page got truncated from under us */ + unlock_page(page); + page_cache_release(page); ext4_journal_stop(handle); - ret = -ENOMEM; - goto out; + goto retry_grab; } - *pagep = page; + /* In case writeback began while the page was unlocked */ + wait_for_stable_page(page); if (ext4_should_dioread_nolock(inode)) ret = __block_write_begin(page, pos, len, ext4_get_block_write); @@ -839,13 +1002,13 @@ retry: ret = __block_write_begin(page, pos, len, ext4_get_block); if (!ret && ext4_should_journal_data(inode)) { - ret = walk_page_buffers(handle, page_buffers(page), - from, to, NULL, do_journal_get_write_access); + ret = ext4_walk_page_buffers(handle, page_buffers(page), + from, to, NULL, + do_journal_get_write_access); } if (ret) { unlock_page(page); - page_cache_release(page); /* * __block_write_begin may have instantiated a few blocks * outside i_size. Trim these off again. Don't need @@ -869,37 +1032,70 @@ retry: if (inode->i_nlink) ext4_orphan_del(NULL, inode); } - } - if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) - goto retry; -out: + if (ret == -ENOSPC && + ext4_should_retry_alloc(inode->i_sb, &retries)) + goto retry_journal; + page_cache_release(page); + return ret; + } + *pagep = page; return ret; } /* For write_end() in data=journal mode */ static int write_end_fn(handle_t *handle, struct buffer_head *bh) { + int ret; if (!buffer_mapped(bh) || buffer_freed(bh)) return 0; set_buffer_uptodate(bh); - return ext4_handle_dirty_metadata(handle, NULL, bh); + ret = ext4_handle_dirty_metadata(handle, NULL, bh); + clear_buffer_meta(bh); + clear_buffer_prio(bh); + return ret; } -static int ext4_generic_write_end(struct file *file, - struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) +/* + * We need to pick up the new inode size which generic_commit_write gave us + * `file' can be NULL - eg, when called from page_symlink(). + * + * ext4 never places buffers on inode->i_mapping->private_list. metadata + * buffers are managed internally. + */ +static int ext4_write_end(struct file *file, + struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) { - int i_size_changed = 0; - struct inode *inode = mapping->host; handle_t *handle = ext4_journal_current_handle(); + struct inode *inode = mapping->host; + int ret = 0, ret2; + int i_size_changed = 0; - copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); + trace_ext4_write_end(inode, pos, len, copied); + if (ext4_test_inode_state(inode, EXT4_STATE_ORDERED_MODE)) { + ret = ext4_jbd2_file_inode(handle, inode); + if (ret) { + unlock_page(page); + page_cache_release(page); + goto errout; + } + } + + if (ext4_has_inline_data(inode)) { + ret = ext4_write_inline_data_end(inode, pos, len, + copied, page); + if (ret < 0) + goto errout; + copied = ret; + } else + copied = block_write_end(file, mapping, pos, + len, copied, page, fsdata); /* * No need to use i_size_read() here, the i_size - * cannot change under us because we hold i_mutex. + * cannot change under us because we hole i_mutex. * * But it's important to update i_size while still holding page lock: * page writeout could otherwise come in and zero beyond i_size. @@ -909,10 +1105,10 @@ static int ext4_generic_write_end(struct file *file, i_size_changed = 1; } - if (pos + copied > EXT4_I(inode)->i_disksize) { + if (pos + copied > EXT4_I(inode)->i_disksize) { /* We need to mark inode dirty even if * new_i_size is less that inode->i_size - * bu greater than i_disksize.(hint delalloc) + * but greater than i_disksize. (hint delalloc) */ ext4_update_i_disksize(inode, (pos + copied)); i_size_changed = 1; @@ -929,87 +1125,13 @@ static int ext4_generic_write_end(struct file *file, if (i_size_changed) ext4_mark_inode_dirty(handle, inode); - return copied; -} - -/* - * We need to pick up the new inode size which generic_commit_write gave us - * `file' can be NULL - eg, when called from page_symlink(). - * - * ext4 never places buffers on inode->i_mapping->private_list. metadata - * buffers are managed internally. - */ -static int ext4_ordered_write_end(struct file *file, - struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) -{ - handle_t *handle = ext4_journal_current_handle(); - struct inode *inode = mapping->host; - int ret = 0, ret2; - - trace_ext4_ordered_write_end(inode, pos, len, copied); - ret = ext4_jbd2_file_inode(handle, inode); - - if (ret == 0) { - ret2 = ext4_generic_write_end(file, mapping, pos, len, copied, - page, fsdata); - copied = ret2; - if (pos + len > inode->i_size && ext4_can_truncate(inode)) - /* if we have allocated more blocks and copied - * less. We will have blocks allocated outside - * inode->i_size. So truncate them - */ - ext4_orphan_add(handle, inode); - if (ret2 < 0) - ret = ret2; - } else { - unlock_page(page); - page_cache_release(page); - } - - ret2 = ext4_journal_stop(handle); - if (!ret) - ret = ret2; - - if (pos + len > inode->i_size) { - ext4_truncate_failed_write(inode); - /* - * If truncate failed early the inode might still be - * on the orphan list; we need to make sure the inode - * is removed from the orphan list in that case. - */ - if (inode->i_nlink) - ext4_orphan_del(NULL, inode); - } - - - return ret ? ret : copied; -} - -static int ext4_writeback_write_end(struct file *file, - struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) -{ - handle_t *handle = ext4_journal_current_handle(); - struct inode *inode = mapping->host; - int ret = 0, ret2; - - trace_ext4_writeback_write_end(inode, pos, len, copied); - ret2 = ext4_generic_write_end(file, mapping, pos, len, copied, - page, fsdata); - copied = ret2; if (pos + len > inode->i_size && ext4_can_truncate(inode)) /* if we have allocated more blocks and copied * less. We will have blocks allocated outside * inode->i_size. So truncate them */ ext4_orphan_add(handle, inode); - - if (ret2 < 0) - ret = ret2; - +errout: ret2 = ext4_journal_stop(handle); if (!ret) ret = ret2; @@ -1046,16 +1168,21 @@ static int ext4_journalled_write_end(struct file *file, BUG_ON(!ext4_handle_valid(handle)); - if (copied < len) { - if (!PageUptodate(page)) - copied = 0; - page_zero_new_buffers(page, from+copied, to); - } + if (ext4_has_inline_data(inode)) + copied = ext4_write_inline_data_end(inode, pos, len, + copied, page); + else { + if (copied < len) { + if (!PageUptodate(page)) + copied = 0; + page_zero_new_buffers(page, from+copied, to); + } - ret = walk_page_buffers(handle, page_buffers(page), from, - to, &partial, write_end_fn); - if (!partial) - SetPageUptodate(page); + ret = ext4_walk_page_buffers(handle, page_buffers(page), from, + to, &partial, write_end_fn); + if (!partial) + SetPageUptodate(page); + } new_i_size = pos + copied; if (new_i_size > inode->i_size) i_size_write(inode, pos+copied); @@ -1095,28 +1222,60 @@ static int ext4_journalled_write_end(struct file *file, } /* - * Reserve a single cluster located at lblock + * Reserve a metadata for a single block located at lblock */ -static int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock) +static int ext4_da_reserve_metadata(struct inode *inode, ext4_lblk_t lblock) { - int retries = 0; struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct ext4_inode_info *ei = EXT4_I(inode); unsigned int md_needed; - int ret; + ext4_lblk_t save_last_lblock; + int save_len; /* * recalculate the amount of metadata blocks to reserve * in order to allocate nrblocks * worse case is one extent per block */ -repeat: spin_lock(&ei->i_block_reservation_lock); + /* + * ext4_calc_metadata_amount() has side effects, which we have + * to be prepared undo if we fail to claim space. + */ + save_len = ei->i_da_metadata_calc_len; + save_last_lblock = ei->i_da_metadata_calc_last_lblock; md_needed = EXT4_NUM_B2C(sbi, ext4_calc_metadata_amount(inode, lblock)); trace_ext4_da_reserve_space(inode, md_needed); + + /* + * We do still charge estimated metadata to the sb though; + * we cannot afford to run out of free blocks. + */ + if (ext4_claim_free_clusters(sbi, md_needed, 0)) { + ei->i_da_metadata_calc_len = save_len; + ei->i_da_metadata_calc_last_lblock = save_last_lblock; + spin_unlock(&ei->i_block_reservation_lock); + return -ENOSPC; + } + ei->i_reserved_meta_blocks += md_needed; spin_unlock(&ei->i_block_reservation_lock); + return 0; /* success */ +} + +/* + * Reserve a single cluster located at lblock + */ +static int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock) +{ + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + struct ext4_inode_info *ei = EXT4_I(inode); + unsigned int md_needed; + int ret; + ext4_lblk_t save_last_lblock; + int save_len; + /* * We will charge metadata quota at writeout time; this saves * us from metadata over-estimation, though we may go over by @@ -1125,19 +1284,34 @@ repeat: ret = dquot_reserve_block(inode, EXT4_C2B(sbi, 1)); if (ret) return ret; + + /* + * recalculate the amount of metadata blocks to reserve + * in order to allocate nrblocks + * worse case is one extent per block + */ + spin_lock(&ei->i_block_reservation_lock); + /* + * ext4_calc_metadata_amount() has side effects, which we have + * to be prepared undo if we fail to claim space. + */ + save_len = ei->i_da_metadata_calc_len; + save_last_lblock = ei->i_da_metadata_calc_last_lblock; + md_needed = EXT4_NUM_B2C(sbi, + ext4_calc_metadata_amount(inode, lblock)); + trace_ext4_da_reserve_space(inode, md_needed); + /* * We do still charge estimated metadata to the sb though; * we cannot afford to run out of free blocks. */ if (ext4_claim_free_clusters(sbi, md_needed + 1, 0)) { + ei->i_da_metadata_calc_len = save_len; + ei->i_da_metadata_calc_last_lblock = save_last_lblock; + spin_unlock(&ei->i_block_reservation_lock); dquot_release_reservation_block(inode, EXT4_C2B(sbi, 1)); - if (ext4_should_retry_alloc(inode->i_sb, &retries)) { - yield(); - goto repeat; - } return -ENOSPC; } - spin_lock(&ei->i_block_reservation_lock); ei->i_reserved_data_blocks++; ei->i_reserved_meta_blocks += md_needed; spin_unlock(&ei->i_block_reservation_lock); @@ -1163,9 +1337,9 @@ static void ext4_da_release_space(struct inode *inode, int to_free) * function is called from invalidate page, it's * harmless to return without any action. */ - ext4_msg(inode->i_sb, KERN_NOTICE, "ext4_da_release_space: " + ext4_warning(inode->i_sb, "ext4_da_release_space: " "ino %lu, to_free %d with only %d reserved " - "data blocks\n", inode->i_ino, to_free, + "data blocks", inode->i_ino, to_free, ei->i_reserved_data_blocks); WARN_ON(1); to_free = ei->i_reserved_data_blocks; @@ -1195,37 +1369,48 @@ static void ext4_da_release_space(struct inode *inode, int to_free) } static void ext4_da_page_release_reservation(struct page *page, - unsigned long offset) + unsigned int offset, + unsigned int length) { int to_release = 0; struct buffer_head *head, *bh; unsigned int curr_off = 0; struct inode *inode = page->mapping->host; struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + unsigned int stop = offset + length; int num_clusters; + ext4_fsblk_t lblk; + + BUG_ON(stop > PAGE_CACHE_SIZE || stop < length); head = page_buffers(page); bh = head; do { unsigned int next_off = curr_off + bh->b_size; + if (next_off > stop) + break; + if ((offset <= curr_off) && (buffer_delay(bh))) { to_release++; clear_buffer_delay(bh); - clear_buffer_da_mapped(bh); } curr_off = next_off; } while ((bh = bh->b_this_page) != head); + if (to_release) { + lblk = page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits); + ext4_es_remove_extent(inode, lblk, to_release); + } + /* If we have released all the blocks belonging to a cluster, then we * need to release the reserved space for that cluster. */ num_clusters = EXT4_NUM_B2C(sbi, to_release); while (num_clusters > 0) { - ext4_fsblk_t lblk; lblk = (page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits)) + ((num_clusters - 1) << sbi->s_cluster_bits); if (sbi->s_cluster_ratio == 1 || - !ext4_find_delalloc_cluster(inode, lblk, 1)) + !ext4_find_delalloc_cluster(inode, lblk)) ext4_da_release_space(inode, 1); num_clusters--; @@ -1236,166 +1421,24 @@ static void ext4_da_page_release_reservation(struct page *page, * Delayed allocation stuff */ -/* - * mpage_da_submit_io - walks through extent of pages and try to write - * them with writepage() call back - * - * @mpd->inode: inode - * @mpd->first_page: first page of the extent - * @mpd->next_page: page after the last page of the extent - * - * By the time mpage_da_submit_io() is called we expect all blocks - * to be allocated. this may be wrong if allocation failed. - * - * As pages are already locked by write_cache_pages(), we can't use it - */ -static int mpage_da_submit_io(struct mpage_da_data *mpd, - struct ext4_map_blocks *map) -{ - struct pagevec pvec; - unsigned long index, end; - int ret = 0, err, nr_pages, i; - struct inode *inode = mpd->inode; - struct address_space *mapping = inode->i_mapping; - loff_t size = i_size_read(inode); - unsigned int len, block_start; - struct buffer_head *bh, *page_bufs = NULL; - int journal_data = ext4_should_journal_data(inode); - sector_t pblock = 0, cur_logical = 0; - struct ext4_io_submit io_submit; +struct mpage_da_data { + struct inode *inode; + struct writeback_control *wbc; - BUG_ON(mpd->next_page <= mpd->first_page); - memset(&io_submit, 0, sizeof(io_submit)); + pgoff_t first_page; /* The first page to write */ + pgoff_t next_page; /* Current page to examine */ + pgoff_t last_page; /* Last page to examine */ /* - * We need to start from the first_page to the next_page - 1 - * to make sure we also write the mapped dirty buffer_heads. - * If we look at mpd->b_blocknr we would only be looking - * at the currently mapped buffer_heads. + * Extent to map - this can be after first_page because that can be + * fully mapped. We somewhat abuse m_flags to store whether the extent + * is delalloc or unwritten. */ - index = mpd->first_page; - end = mpd->next_page - 1; - - pagevec_init(&pvec, 0); - while (index <= end) { - nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE); - if (nr_pages == 0) - break; - for (i = 0; i < nr_pages; i++) { - int commit_write = 0, skip_page = 0; - struct page *page = pvec.pages[i]; - - index = page->index; - if (index > end) - break; - - if (index == size >> PAGE_CACHE_SHIFT) - len = size & ~PAGE_CACHE_MASK; - else - len = PAGE_CACHE_SIZE; - if (map) { - cur_logical = index << (PAGE_CACHE_SHIFT - - inode->i_blkbits); - pblock = map->m_pblk + (cur_logical - - map->m_lblk); - } - index++; - - BUG_ON(!PageLocked(page)); - BUG_ON(PageWriteback(page)); - - /* - * If the page does not have buffers (for - * whatever reason), try to create them using - * __block_write_begin. If this fails, - * skip the page and move on. - */ - if (!page_has_buffers(page)) { - if (__block_write_begin(page, 0, len, - noalloc_get_block_write)) { - skip_page: - unlock_page(page); - continue; - } - commit_write = 1; - } - - bh = page_bufs = page_buffers(page); - block_start = 0; - do { - if (!bh) - goto skip_page; - if (map && (cur_logical >= map->m_lblk) && - (cur_logical <= (map->m_lblk + - (map->m_len - 1)))) { - if (buffer_delay(bh)) { - clear_buffer_delay(bh); - bh->b_blocknr = pblock; - } - if (buffer_da_mapped(bh)) - clear_buffer_da_mapped(bh); - if (buffer_unwritten(bh) || - buffer_mapped(bh)) - BUG_ON(bh->b_blocknr != pblock); - if (map->m_flags & EXT4_MAP_UNINIT) - set_buffer_uninit(bh); - clear_buffer_unwritten(bh); - } - - /* - * skip page if block allocation undone and - * block is dirty - */ - if (ext4_bh_delay_or_unwritten(NULL, bh)) - skip_page = 1; - bh = bh->b_this_page; - block_start += bh->b_size; - cur_logical++; - pblock++; - } while (bh != page_bufs); - - if (skip_page) - goto skip_page; - - if (commit_write) - /* mark the buffer_heads as dirty & uptodate */ - block_commit_write(page, 0, len); - - clear_page_dirty_for_io(page); - /* - * Delalloc doesn't support data journalling, - * but eventually maybe we'll lift this - * restriction. - */ - if (unlikely(journal_data && PageChecked(page))) - err = __ext4_journalled_writepage(page, len); - else if (test_opt(inode->i_sb, MBLK_IO_SUBMIT)) - err = ext4_bio_write_page(&io_submit, page, - len, mpd->wbc); - else if (buffer_uninit(page_bufs)) { - ext4_set_bh_endio(page_bufs, inode); - err = block_write_full_page_endio(page, - noalloc_get_block_write, - mpd->wbc, ext4_end_io_buffer_write); - } else - err = block_write_full_page(page, - noalloc_get_block_write, mpd->wbc); - - if (!err) - mpd->pages_written++; - /* - * In error case, we have to continue because - * remaining pages are still locked - */ - if (ret == 0) - ret = err; - } - pagevec_release(&pvec); - } - ext4_io_submit(&io_submit); - return ret; -} + struct ext4_map_blocks map; + struct ext4_io_submit io_submit; /* IO submission data */ +}; -static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd) +static void mpage_release_unused_pages(struct mpage_da_data *mpd, + bool invalidate) { int nr_pages, i; pgoff_t index, end; @@ -1403,8 +1446,20 @@ static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd) struct inode *inode = mpd->inode; struct address_space *mapping = inode->i_mapping; + /* This is necessary when next_page == 0. */ + if (mpd->first_page >= mpd->next_page) + return; + index = mpd->first_page; end = mpd->next_page - 1; + if (invalidate) { + ext4_lblk_t start, last; + start = index << (PAGE_CACHE_SHIFT - inode->i_blkbits); + last = end << (PAGE_CACHE_SHIFT - inode->i_blkbits); + ext4_es_remove_extent(inode, start, last - start + 1); + } + + pagevec_init(&pvec, 0); while (index <= end) { nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE); if (nr_pages == 0) @@ -1415,253 +1470,40 @@ static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd) break; BUG_ON(!PageLocked(page)); BUG_ON(PageWriteback(page)); - block_invalidatepage(page, 0); - ClearPageUptodate(page); + if (invalidate) { + block_invalidatepage(page, 0, PAGE_CACHE_SIZE); + ClearPageUptodate(page); + } unlock_page(page); } index = pvec.pages[nr_pages - 1]->index + 1; pagevec_release(&pvec); } - return; } static void ext4_print_free_blocks(struct inode *inode) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); - printk(KERN_CRIT "Total free blocks count %lld\n", + struct super_block *sb = inode->i_sb; + struct ext4_inode_info *ei = EXT4_I(inode); + + ext4_msg(sb, KERN_CRIT, "Total free blocks count %lld", EXT4_C2B(EXT4_SB(inode->i_sb), - ext4_count_free_clusters(inode->i_sb))); - printk(KERN_CRIT "Free/Dirty block details\n"); - printk(KERN_CRIT "free_blocks=%lld\n", - (long long) EXT4_C2B(EXT4_SB(inode->i_sb), + ext4_count_free_clusters(sb))); + ext4_msg(sb, KERN_CRIT, "Free/Dirty block details"); + ext4_msg(sb, KERN_CRIT, "free_blocks=%lld", + (long long) EXT4_C2B(EXT4_SB(sb), percpu_counter_sum(&sbi->s_freeclusters_counter))); - printk(KERN_CRIT "dirty_blocks=%lld\n", - (long long) EXT4_C2B(EXT4_SB(inode->i_sb), + ext4_msg(sb, KERN_CRIT, "dirty_blocks=%lld", + (long long) EXT4_C2B(EXT4_SB(sb), percpu_counter_sum(&sbi->s_dirtyclusters_counter))); - printk(KERN_CRIT "Block reservation details\n"); - printk(KERN_CRIT "i_reserved_data_blocks=%u\n", - EXT4_I(inode)->i_reserved_data_blocks); - printk(KERN_CRIT "i_reserved_meta_blocks=%u\n", - EXT4_I(inode)->i_reserved_meta_blocks); - return; -} - -/* - * mpage_da_map_and_submit - go through given space, map them - * if necessary, and then submit them for I/O - * - * @mpd - bh describing space - * - * The function skips space we know is already mapped to disk blocks. - * - */ -static void mpage_da_map_and_submit(struct mpage_da_data *mpd) -{ - int err, blks, get_blocks_flags; - struct ext4_map_blocks map, *mapp = NULL; - sector_t next = mpd->b_blocknr; - unsigned max_blocks = mpd->b_size >> mpd->inode->i_blkbits; - loff_t disksize = EXT4_I(mpd->inode)->i_disksize; - handle_t *handle = NULL; - - /* - * If the blocks are mapped already, or we couldn't accumulate - * any blocks, then proceed immediately to the submission stage. - */ - if ((mpd->b_size == 0) || - ((mpd->b_state & (1 << BH_Mapped)) && - !(mpd->b_state & (1 << BH_Delay)) && - !(mpd->b_state & (1 << BH_Unwritten)))) - goto submit_io; - - handle = ext4_journal_current_handle(); - BUG_ON(!handle); - - /* - * Call ext4_map_blocks() to allocate any delayed allocation - * blocks, or to convert an uninitialized extent to be - * initialized (in the case where we have written into - * one or more preallocated blocks). - * - * We pass in the magic EXT4_GET_BLOCKS_DELALLOC_RESERVE to - * indicate that we are on the delayed allocation path. This - * affects functions in many different parts of the allocation - * call path. This flag exists primarily because we don't - * want to change *many* call functions, so ext4_map_blocks() - * will set the EXT4_STATE_DELALLOC_RESERVED flag once the - * inode's allocation semaphore is taken. - * - * If the blocks in questions were delalloc blocks, set - * EXT4_GET_BLOCKS_DELALLOC_RESERVE so the delalloc accounting - * variables are updated after the blocks have been allocated. - */ - map.m_lblk = next; - map.m_len = max_blocks; - get_blocks_flags = EXT4_GET_BLOCKS_CREATE; - if (ext4_should_dioread_nolock(mpd->inode)) - get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT; - if (mpd->b_state & (1 << BH_Delay)) - get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE; - - blks = ext4_map_blocks(handle, mpd->inode, &map, get_blocks_flags); - if (blks < 0) { - struct super_block *sb = mpd->inode->i_sb; - - err = blks; - /* - * If get block returns EAGAIN or ENOSPC and there - * appears to be free blocks we will just let - * mpage_da_submit_io() unlock all of the pages. - */ - if (err == -EAGAIN) - goto submit_io; - - if (err == -ENOSPC && ext4_count_free_clusters(sb)) { - mpd->retval = err; - goto submit_io; - } - - /* - * get block failure will cause us to loop in - * writepages, because a_ops->writepage won't be able - * to make progress. The page will be redirtied by - * writepage and writepages will again try to write - * the same. - */ - if (!(EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)) { - ext4_msg(sb, KERN_CRIT, - "delayed block allocation failed for inode %lu " - "at logical offset %llu with max blocks %zd " - "with error %d", mpd->inode->i_ino, - (unsigned long long) next, - mpd->b_size >> mpd->inode->i_blkbits, err); - ext4_msg(sb, KERN_CRIT, - "This should not happen!! Data will be lost\n"); - if (err == -ENOSPC) - ext4_print_free_blocks(mpd->inode); - } - /* invalidate all the pages */ - ext4_da_block_invalidatepages(mpd); - - /* Mark this page range as having been completed */ - mpd->io_done = 1; - return; - } - BUG_ON(blks == 0); - - mapp = ↦ - if (map.m_flags & EXT4_MAP_NEW) { - struct block_device *bdev = mpd->inode->i_sb->s_bdev; - int i; - - for (i = 0; i < map.m_len; i++) - unmap_underlying_metadata(bdev, map.m_pblk + i); - - if (ext4_should_order_data(mpd->inode)) { - err = ext4_jbd2_file_inode(handle, mpd->inode); - if (err) { - /* Only if the journal is aborted */ - mpd->retval = err; - goto submit_io; - } - } - } - - /* - * Update on-disk size along with block allocation. - */ - disksize = ((loff_t) next + blks) << mpd->inode->i_blkbits; - if (disksize > i_size_read(mpd->inode)) - disksize = i_size_read(mpd->inode); - if (disksize > EXT4_I(mpd->inode)->i_disksize) { - ext4_update_i_disksize(mpd->inode, disksize); - err = ext4_mark_inode_dirty(handle, mpd->inode); - if (err) - ext4_error(mpd->inode->i_sb, - "Failed to mark inode %lu dirty", - mpd->inode->i_ino); - } - -submit_io: - mpage_da_submit_io(mpd, mapp); - mpd->io_done = 1; -} - -#define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | \ - (1 << BH_Delay) | (1 << BH_Unwritten)) - -/* - * mpage_add_bh_to_extent - try to add one more block to extent of blocks - * - * @mpd->lbh - extent of blocks - * @logical - logical number of the block in the file - * @bh - bh of the block (used to access block's state) - * - * the function is used to collect contig. blocks in same state - */ -static void mpage_add_bh_to_extent(struct mpage_da_data *mpd, - sector_t logical, size_t b_size, - unsigned long b_state) -{ - sector_t next; - int nrblocks = mpd->b_size >> mpd->inode->i_blkbits; - - /* - * XXX Don't go larger than mballoc is willing to allocate - * This is a stopgap solution. We eventually need to fold - * mpage_da_submit_io() into this function and then call - * ext4_map_blocks() multiple times in a loop - */ - if (nrblocks >= 8*1024*1024/mpd->inode->i_sb->s_blocksize) - goto flush_it; - - /* check if thereserved journal credits might overflow */ - if (!(ext4_test_inode_flag(mpd->inode, EXT4_INODE_EXTENTS))) { - if (nrblocks >= EXT4_MAX_TRANS_DATA) { - /* - * With non-extent format we are limited by the journal - * credit available. Total credit needed to insert - * nrblocks contiguous blocks is dependent on the - * nrblocks. So limit nrblocks. - */ - goto flush_it; - } else if ((nrblocks + (b_size >> mpd->inode->i_blkbits)) > - EXT4_MAX_TRANS_DATA) { - /* - * Adding the new buffer_head would make it cross the - * allowed limit for which we have journal credit - * reserved. So limit the new bh->b_size - */ - b_size = (EXT4_MAX_TRANS_DATA - nrblocks) << - mpd->inode->i_blkbits; - /* we will do mpage_da_submit_io in the next loop */ - } - } - /* - * First block in the extent - */ - if (mpd->b_size == 0) { - mpd->b_blocknr = logical; - mpd->b_size = b_size; - mpd->b_state = b_state & BH_FLAGS; - return; - } - - next = mpd->b_blocknr + nrblocks; - /* - * Can we merge the block to our big extent? - */ - if (logical == next && (b_state & BH_FLAGS) == mpd->b_state) { - mpd->b_size += b_size; - return; - } - -flush_it: - /* - * We couldn't merge the block to our extent, so we - * need to flush current extent and start new one - */ - mpage_da_map_and_submit(mpd); + ext4_msg(sb, KERN_CRIT, "Block reservation details"); + ext4_msg(sb, KERN_CRIT, "i_reserved_data_blocks=%u", + ei->i_reserved_data_blocks); + ext4_msg(sb, KERN_CRIT, "i_reserved_meta_blocks=%u", + ei->i_reserved_meta_blocks); + ext4_msg(sb, KERN_CRIT, "i_allocated_meta_blocks=%u", + ei->i_allocated_meta_blocks); return; } @@ -1680,8 +1522,14 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock, struct ext4_map_blocks *map, struct buffer_head *bh) { + struct extent_status es; int retval; sector_t invalid_block = ~((sector_t) 0xffff); +#ifdef ES_AGGRESSIVE_TEST + struct ext4_map_blocks orig_map; + + memcpy(&orig_map, map, sizeof(*map)); +#endif if (invalid_block < ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es)) invalid_block = ~0; @@ -1690,28 +1538,102 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock, ext_debug("ext4_da_map_blocks(): inode %lu, max_blocks %u," "logical block %lu\n", inode->i_ino, map->m_len, (unsigned long) map->m_lblk); + + /* Lookup extent status tree firstly */ + if (ext4_es_lookup_extent(inode, iblock, &es)) { + ext4_es_lru_add(inode); + if (ext4_es_is_hole(&es)) { + retval = 0; + down_read(&EXT4_I(inode)->i_data_sem); + goto add_delayed; + } + + /* + * Delayed extent could be allocated by fallocate. + * So we need to check it. + */ + if (ext4_es_is_delayed(&es) && !ext4_es_is_unwritten(&es)) { + map_bh(bh, inode->i_sb, invalid_block); + set_buffer_new(bh); + set_buffer_delay(bh); + return 0; + } + + map->m_pblk = ext4_es_pblock(&es) + iblock - es.es_lblk; + retval = es.es_len - (iblock - es.es_lblk); + if (retval > map->m_len) + retval = map->m_len; + map->m_len = retval; + if (ext4_es_is_written(&es)) + map->m_flags |= EXT4_MAP_MAPPED; + else if (ext4_es_is_unwritten(&es)) + map->m_flags |= EXT4_MAP_UNWRITTEN; + else + BUG_ON(1); + +#ifdef ES_AGGRESSIVE_TEST + ext4_map_blocks_es_recheck(NULL, inode, map, &orig_map, 0); +#endif + return retval; + } + /* * Try to see if we can get the block without requesting a new * file system block. */ - down_read((&EXT4_I(inode)->i_data_sem)); - if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) - retval = ext4_ext_map_blocks(NULL, inode, map, 0); + down_read(&EXT4_I(inode)->i_data_sem); + if (ext4_has_inline_data(inode)) { + /* + * We will soon create blocks for this page, and let + * us pretend as if the blocks aren't allocated yet. + * In case of clusters, we have to handle the work + * of mapping from cluster so that the reserved space + * is calculated properly. + */ + if ((EXT4_SB(inode->i_sb)->s_cluster_ratio > 1) && + ext4_find_delalloc_cluster(inode, map->m_lblk)) + map->m_flags |= EXT4_MAP_FROM_CLUSTER; + retval = 0; + } else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) + retval = ext4_ext_map_blocks(NULL, inode, map, + EXT4_GET_BLOCKS_NO_PUT_HOLE); else - retval = ext4_ind_map_blocks(NULL, inode, map, 0); + retval = ext4_ind_map_blocks(NULL, inode, map, + EXT4_GET_BLOCKS_NO_PUT_HOLE); +add_delayed: if (retval == 0) { + int ret; /* * XXX: __block_prepare_write() unmaps passed block, * is it OK? */ - /* If the block was allocated from previously allocated cluster, - * then we dont need to reserve it again. */ + /* + * If the block was allocated from previously allocated cluster, + * then we don't need to reserve it again. However we still need + * to reserve metadata for every block we're going to write. + */ if (!(map->m_flags & EXT4_MAP_FROM_CLUSTER)) { - retval = ext4_da_reserve_space(inode, iblock); - if (retval) + ret = ext4_da_reserve_space(inode, iblock); + if (ret) { + /* not enough space to reserve */ + retval = ret; + goto out_unlock; + } + } else { + ret = ext4_da_reserve_metadata(inode, iblock); + if (ret) { /* not enough space to reserve */ + retval = ret; goto out_unlock; + } + } + + ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len, + ~0, EXTENT_STATUS_DELAYED); + if (ret) { + retval = ret; + goto out_unlock; } /* Clear EXT4_MAP_FROM_CLUSTER flag since its purpose is served @@ -1722,6 +1644,24 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock, map_bh(bh, inode->i_sb, invalid_block); set_buffer_new(bh); set_buffer_delay(bh); + } else if (retval > 0) { + int ret; + unsigned int status; + + if (unlikely(retval != map->m_len)) { + ext4_warning(inode->i_sb, + "ES len assertion failed for inode " + "%lu: retval %d != map->m_len %d", + inode->i_ino, retval, map->m_len); + WARN_ON(1); + } + + status = map->m_flags & EXT4_MAP_UNWRITTEN ? + EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; + ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len, + map->m_pblk, status); + if (ret != 0) + retval = ret; } out_unlock: @@ -1742,8 +1682,8 @@ out_unlock: * We also have b_blocknr = physicalblock mapping unwritten extent and b_bdev * initialized properly. */ -static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, - struct buffer_head *bh, int create) +int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, + struct buffer_head *bh, int create) { struct ext4_map_blocks map; int ret = 0; @@ -1779,27 +1719,6 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, return 0; } -/* - * This function is used as a standard get_block_t calback function - * when there is no desire to allocate any blocks. It is used as a - * callback function for block_write_begin() and block_write_full_page(). - * These functions should only try to map a single block at a time. - * - * Since this function doesn't do block allocations even if the caller - * requests it by passing in create=1, it is critically important that - * any caller checks to make sure that any buffer heads are returned - * by this function are either all already mapped or marked for - * delayed allocation before calling block_write_full_page(). Otherwise, - * b_blocknr could be left unitialized, and the page write functions will - * be taken by surprise. - */ -static int noalloc_get_block_write(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int create) -{ - BUG_ON(bh_result->b_size != inode->i_sb->s_blocksize); - return _ext4_get_block(inode, iblock, bh_result, 0); -} - static int bget_one(handle_t *handle, struct buffer_head *bh) { get_bh(bh); @@ -1817,20 +1736,35 @@ static int __ext4_journalled_writepage(struct page *page, { struct address_space *mapping = page->mapping; struct inode *inode = mapping->host; - struct buffer_head *page_bufs; + struct buffer_head *page_bufs = NULL; handle_t *handle = NULL; - int ret = 0; - int err; + int ret = 0, err = 0; + int inline_data = ext4_has_inline_data(inode); + struct buffer_head *inode_bh = NULL; ClearPageChecked(page); - page_bufs = page_buffers(page); - BUG_ON(!page_bufs); - walk_page_buffers(handle, page_bufs, 0, len, NULL, bget_one); + + if (inline_data) { + BUG_ON(page->index != 0); + BUG_ON(len > ext4_get_max_inline_size(inode)); + inode_bh = ext4_journalled_write_inline_data(inode, len, page); + if (inode_bh == NULL) + goto out; + } else { + page_bufs = page_buffers(page); + if (!page_bufs) { + BUG(); + goto out; + } + ext4_walk_page_buffers(handle, page_bufs, 0, len, + NULL, bget_one); + } /* As soon as we unlock the page, it can go away, but we have * references to buffers so we are safe */ unlock_page(page); - handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode)); + handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, + ext4_writepage_trans_blocks(inode)); if (IS_ERR(handle)) { ret = PTR_ERR(handle); goto out; @@ -1838,11 +1772,19 @@ static int __ext4_journalled_writepage(struct page *page, BUG_ON(!ext4_handle_valid(handle)); - ret = walk_page_buffers(handle, page_bufs, 0, len, NULL, - do_journal_get_write_access); + if (inline_data) { + BUFFER_TRACE(inode_bh, "get write access"); + ret = ext4_journal_get_write_access(handle, inode_bh); + + err = ext4_handle_dirty_metadata(handle, inode, inode_bh); + + } else { + ret = ext4_walk_page_buffers(handle, page_bufs, 0, len, NULL, + do_journal_get_write_access); - err = walk_page_buffers(handle, page_bufs, 0, len, NULL, - write_end_fn); + err = ext4_walk_page_buffers(handle, page_bufs, 0, len, NULL, + write_end_fn); + } if (ret == 0) ret = err; EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid; @@ -1850,15 +1792,15 @@ static int __ext4_journalled_writepage(struct page *page, if (!ret) ret = err; - walk_page_buffers(handle, page_bufs, 0, len, NULL, bput_one); + if (!ext4_has_inline_data(inode)) + ext4_walk_page_buffers(NULL, page_bufs, 0, len, + NULL, bput_one); ext4_set_inode_state(inode, EXT4_STATE_JDATA); out: + brelse(inode_bh); return ret; } -static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode); -static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate); - /* * Note that we don't need to start a transaction unless we're journaling data * because we should have holes filled from ext4_page_mkwrite(). We even don't @@ -1870,9 +1812,9 @@ static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate); * lock so we have to do some magic. * * This function can get called via... - * - ext4_da_writepages after taking page lock (have journal handle) + * - ext4_writepages after taking page lock (have journal handle) * - journal_submit_inode_data_buffers (no journal handle) - * - shrink_page_list via pdflush (no journal handle) + * - shrink_page_list via the kswapd/direct reclaim (no journal handle) * - grab_page_cache when doing write_begin (have journal handle) * * We don't do any block allocation in this function. If we have page with @@ -1903,11 +1845,13 @@ static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate); static int ext4_writepage(struct page *page, struct writeback_control *wbc) { - int ret = 0, commit_write = 0; + int ret = 0; loff_t size; unsigned int len; struct buffer_head *page_bufs = NULL; struct inode *inode = page->mapping->host; + struct ext4_io_submit io_submit; + bool keep_towrite = false; trace_ext4_writepage(page); size = i_size_read(inode); @@ -1916,39 +1860,30 @@ static int ext4_writepage(struct page *page, else len = PAGE_CACHE_SIZE; + page_bufs = page_buffers(page); /* - * If the page does not have buffers (for whatever reason), - * try to create them using __block_write_begin. If this - * fails, redirty the page and move on. + * We cannot do block allocation or other extent handling in this + * function. If there are buffers needing that, we have to redirty + * the page. But we may reach here when we do a journal commit via + * journal_submit_inode_data_buffers() and in that case we must write + * allocated buffers to achieve data=ordered mode guarantees. */ - if (!page_has_buffers(page)) { - if (__block_write_begin(page, 0, len, - noalloc_get_block_write)) { - redirty_page: - redirty_page_for_writepage(wbc, page); + if (ext4_walk_page_buffers(NULL, page_bufs, 0, len, NULL, + ext4_bh_delay_or_unwritten)) { + redirty_page_for_writepage(wbc, page); + if (current->flags & PF_MEMALLOC) { + /* + * For memory cleaning there's no point in writing only + * some buffers. So just bail out. Warn if we came here + * from direct reclaim. + */ + WARN_ON_ONCE((current->flags & (PF_MEMALLOC|PF_KSWAPD)) + == PF_MEMALLOC); unlock_page(page); return 0; } - commit_write = 1; - } - page_bufs = page_buffers(page); - if (walk_page_buffers(NULL, page_bufs, 0, len, NULL, - ext4_bh_delay_or_unwritten)) { - /* - * We don't want to do block allocation, so redirty - * the page and return. We may reach here when we do - * a journal commit via journal_submit_inode_data_buffers. - * We can also reach here via shrink_page_list but it - * should never be for direct reclaim so warn if that - * happens - */ - WARN_ON_ONCE((current->flags & (PF_MEMALLOC|PF_KSWAPD)) == - PF_MEMALLOC); - goto redirty_page; + keep_towrite = true; } - if (commit_write) - /* now mark the buffer_heads as dirty and uptodate */ - block_commit_write(page, 0, len); if (PageChecked(page) && ext4_should_journal_data(inode)) /* @@ -1957,80 +1892,456 @@ static int ext4_writepage(struct page *page, */ return __ext4_journalled_writepage(page, len); - if (buffer_uninit(page_bufs)) { - ext4_set_bh_endio(page_bufs, inode); - ret = block_write_full_page_endio(page, noalloc_get_block_write, - wbc, ext4_end_io_buffer_write); - } else - ret = block_write_full_page(page, noalloc_get_block_write, - wbc); - + ext4_io_submit_init(&io_submit, wbc); + io_submit.io_end = ext4_init_io_end(inode, GFP_NOFS); + if (!io_submit.io_end) { + redirty_page_for_writepage(wbc, page); + unlock_page(page); + return -ENOMEM; + } + ret = ext4_bio_write_page(&io_submit, page, len, wbc, keep_towrite); + ext4_io_submit(&io_submit); + /* Drop io_end reference we got from init */ + ext4_put_io_end_defer(io_submit.io_end); return ret; } +static int mpage_submit_page(struct mpage_da_data *mpd, struct page *page) +{ + int len; + loff_t size = i_size_read(mpd->inode); + int err; + + BUG_ON(page->index != mpd->first_page); + if (page->index == size >> PAGE_CACHE_SHIFT) + len = size & ~PAGE_CACHE_MASK; + else + len = PAGE_CACHE_SIZE; + clear_page_dirty_for_io(page); + err = ext4_bio_write_page(&mpd->io_submit, page, len, mpd->wbc, false); + if (!err) + mpd->wbc->nr_to_write--; + mpd->first_page++; + + return err; +} + +#define BH_FLAGS ((1 << BH_Unwritten) | (1 << BH_Delay)) + /* - * This is called via ext4_da_writepages() to - * calculate the total number of credits to reserve to fit - * a single extent allocation into a single transaction, - * ext4_da_writpeages() will loop calling this before - * the block allocation. + * mballoc gives us at most this number of blocks... + * XXX: That seems to be only a limitation of ext4_mb_normalize_request(). + * The rest of mballoc seems to handle chunks up to full group size. */ +#define MAX_WRITEPAGES_EXTENT_LEN 2048 -static int ext4_da_writepages_trans_blocks(struct inode *inode) +/* + * mpage_add_bh_to_extent - try to add bh to extent of blocks to map + * + * @mpd - extent of blocks + * @lblk - logical number of the block in the file + * @bh - buffer head we want to add to the extent + * + * The function is used to collect contig. blocks in the same state. If the + * buffer doesn't require mapping for writeback and we haven't started the + * extent of buffers to map yet, the function returns 'true' immediately - the + * caller can write the buffer right away. Otherwise the function returns true + * if the block has been added to the extent, false if the block couldn't be + * added. + */ +static bool mpage_add_bh_to_extent(struct mpage_da_data *mpd, ext4_lblk_t lblk, + struct buffer_head *bh) +{ + struct ext4_map_blocks *map = &mpd->map; + + /* Buffer that doesn't need mapping for writeback? */ + if (!buffer_dirty(bh) || !buffer_mapped(bh) || + (!buffer_delay(bh) && !buffer_unwritten(bh))) { + /* So far no extent to map => we write the buffer right away */ + if (map->m_len == 0) + return true; + return false; + } + + /* First block in the extent? */ + if (map->m_len == 0) { + map->m_lblk = lblk; + map->m_len = 1; + map->m_flags = bh->b_state & BH_FLAGS; + return true; + } + + /* Don't go larger than mballoc is willing to allocate */ + if (map->m_len >= MAX_WRITEPAGES_EXTENT_LEN) + return false; + + /* Can we merge the block to our big extent? */ + if (lblk == map->m_lblk + map->m_len && + (bh->b_state & BH_FLAGS) == map->m_flags) { + map->m_len++; + return true; + } + return false; +} + +/* + * mpage_process_page_bufs - submit page buffers for IO or add them to extent + * + * @mpd - extent of blocks for mapping + * @head - the first buffer in the page + * @bh - buffer we should start processing from + * @lblk - logical number of the block in the file corresponding to @bh + * + * Walk through page buffers from @bh upto @head (exclusive) and either submit + * the page for IO if all buffers in this page were mapped and there's no + * accumulated extent of buffers to map or add buffers in the page to the + * extent of buffers to map. The function returns 1 if the caller can continue + * by processing the next page, 0 if it should stop adding buffers to the + * extent to map because we cannot extend it anymore. It can also return value + * < 0 in case of error during IO submission. + */ +static int mpage_process_page_bufs(struct mpage_da_data *mpd, + struct buffer_head *head, + struct buffer_head *bh, + ext4_lblk_t lblk) +{ + struct inode *inode = mpd->inode; + int err; + ext4_lblk_t blocks = (i_size_read(inode) + (1 << inode->i_blkbits) - 1) + >> inode->i_blkbits; + + do { + BUG_ON(buffer_locked(bh)); + + if (lblk >= blocks || !mpage_add_bh_to_extent(mpd, lblk, bh)) { + /* Found extent to map? */ + if (mpd->map.m_len) + return 0; + /* Everything mapped so far and we hit EOF */ + break; + } + } while (lblk++, (bh = bh->b_this_page) != head); + /* So far everything mapped? Submit the page for IO. */ + if (mpd->map.m_len == 0) { + err = mpage_submit_page(mpd, head->b_page); + if (err < 0) + return err; + } + return lblk < blocks; +} + +/* + * mpage_map_buffers - update buffers corresponding to changed extent and + * submit fully mapped pages for IO + * + * @mpd - description of extent to map, on return next extent to map + * + * Scan buffers corresponding to changed extent (we expect corresponding pages + * to be already locked) and update buffer state according to new extent state. + * We map delalloc buffers to their physical location, clear unwritten bits, + * and mark buffers as uninit when we perform writes to unwritten extents + * and do extent conversion after IO is finished. If the last page is not fully + * mapped, we update @map to the next extent in the last page that needs + * mapping. Otherwise we submit the page for IO. + */ +static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd) { - int max_blocks = EXT4_I(inode)->i_reserved_data_blocks; + struct pagevec pvec; + int nr_pages, i; + struct inode *inode = mpd->inode; + struct buffer_head *head, *bh; + int bpp_bits = PAGE_CACHE_SHIFT - inode->i_blkbits; + pgoff_t start, end; + ext4_lblk_t lblk; + sector_t pblock; + int err; + start = mpd->map.m_lblk >> bpp_bits; + end = (mpd->map.m_lblk + mpd->map.m_len - 1) >> bpp_bits; + lblk = start << bpp_bits; + pblock = mpd->map.m_pblk; + + pagevec_init(&pvec, 0); + while (start <= end) { + nr_pages = pagevec_lookup(&pvec, inode->i_mapping, start, + PAGEVEC_SIZE); + if (nr_pages == 0) + break; + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + + if (page->index > end) + break; + /* Up to 'end' pages must be contiguous */ + BUG_ON(page->index != start); + bh = head = page_buffers(page); + do { + if (lblk < mpd->map.m_lblk) + continue; + if (lblk >= mpd->map.m_lblk + mpd->map.m_len) { + /* + * Buffer after end of mapped extent. + * Find next buffer in the page to map. + */ + mpd->map.m_len = 0; + mpd->map.m_flags = 0; + /* + * FIXME: If dioread_nolock supports + * blocksize < pagesize, we need to make + * sure we add size mapped so far to + * io_end->size as the following call + * can submit the page for IO. + */ + err = mpage_process_page_bufs(mpd, head, + bh, lblk); + pagevec_release(&pvec); + if (err > 0) + err = 0; + return err; + } + if (buffer_delay(bh)) { + clear_buffer_delay(bh); + bh->b_blocknr = pblock++; + } + clear_buffer_unwritten(bh); + } while (lblk++, (bh = bh->b_this_page) != head); + + /* + * FIXME: This is going to break if dioread_nolock + * supports blocksize < pagesize as we will try to + * convert potentially unmapped parts of inode. + */ + mpd->io_submit.io_end->size += PAGE_CACHE_SIZE; + /* Page fully mapped - let IO run! */ + err = mpage_submit_page(mpd, page); + if (err < 0) { + pagevec_release(&pvec); + return err; + } + start++; + } + pagevec_release(&pvec); + } + /* Extent fully mapped and matches with page boundary. We are done. */ + mpd->map.m_len = 0; + mpd->map.m_flags = 0; + return 0; +} + +static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd) +{ + struct inode *inode = mpd->inode; + struct ext4_map_blocks *map = &mpd->map; + int get_blocks_flags; + int err, dioread_nolock; + + trace_ext4_da_write_pages_extent(inode, map); /* - * With non-extent format the journal credit needed to - * insert nrblocks contiguous block is dependent on - * number of contiguous block. So we will limit - * number of contiguous block to a sane value + * Call ext4_map_blocks() to allocate any delayed allocation blocks, or + * to convert an unwritten extent to be initialized (in the case + * where we have written into one or more preallocated blocks). It is + * possible that we're going to need more metadata blocks than + * previously reserved. However we must not fail because we're in + * writeback and there is nothing we can do about it so it might result + * in data loss. So use reserved blocks to allocate metadata if + * possible. + * + * We pass in the magic EXT4_GET_BLOCKS_DELALLOC_RESERVE if the blocks + * in question are delalloc blocks. This affects functions in many + * different parts of the allocation call path. This flag exists + * primarily because we don't want to change *many* call functions, so + * ext4_map_blocks() will set the EXT4_STATE_DELALLOC_RESERVED flag + * once the inode's allocation semaphore is taken. */ - if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) && - (max_blocks > EXT4_MAX_TRANS_DATA)) - max_blocks = EXT4_MAX_TRANS_DATA; + get_blocks_flags = EXT4_GET_BLOCKS_CREATE | + EXT4_GET_BLOCKS_METADATA_NOFAIL; + dioread_nolock = ext4_should_dioread_nolock(inode); + if (dioread_nolock) + get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT; + if (map->m_flags & (1 << BH_Delay)) + get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE; + + err = ext4_map_blocks(handle, inode, map, get_blocks_flags); + if (err < 0) + return err; + if (dioread_nolock && (map->m_flags & EXT4_MAP_UNWRITTEN)) { + if (!mpd->io_submit.io_end->handle && + ext4_handle_valid(handle)) { + mpd->io_submit.io_end->handle = handle->h_rsv_handle; + handle->h_rsv_handle = NULL; + } + ext4_set_io_unwritten_flag(inode, mpd->io_submit.io_end); + } - return ext4_chunk_trans_blocks(inode, max_blocks); + BUG_ON(map->m_len == 0); + if (map->m_flags & EXT4_MAP_NEW) { + struct block_device *bdev = inode->i_sb->s_bdev; + int i; + + for (i = 0; i < map->m_len; i++) + unmap_underlying_metadata(bdev, map->m_pblk + i); + } + return 0; } /* - * write_cache_pages_da - walk the list of dirty pages of the given - * address space and accumulate pages that need writing, and call - * mpage_da_map_and_submit to map a single contiguous memory region - * and then write them. + * mpage_map_and_submit_extent - map extent starting at mpd->lblk of length + * mpd->len and submit pages underlying it for IO + * + * @handle - handle for journal operations + * @mpd - extent to map + * @give_up_on_write - we set this to true iff there is a fatal error and there + * is no hope of writing the data. The caller should discard + * dirty pages to avoid infinite loops. + * + * The function maps extent starting at mpd->lblk of length mpd->len. If it is + * delayed, blocks are allocated, if it is unwritten, we may need to convert + * them to initialized or split the described range from larger unwritten + * extent. Note that we need not map all the described range since allocation + * can return less blocks or the range is covered by more unwritten extents. We + * cannot map more because we are limited by reserved transaction credits. On + * the other hand we always make sure that the last touched page is fully + * mapped so that it can be written out (and thus forward progress is + * guaranteed). After mapping we submit all mapped pages for IO. */ -static int write_cache_pages_da(struct address_space *mapping, - struct writeback_control *wbc, - struct mpage_da_data *mpd, - pgoff_t *done_index) +static int mpage_map_and_submit_extent(handle_t *handle, + struct mpage_da_data *mpd, + bool *give_up_on_write) { - struct buffer_head *bh, *head; - struct inode *inode = mapping->host; - struct pagevec pvec; - unsigned int nr_pages; - sector_t logical; - pgoff_t index, end; - long nr_to_write = wbc->nr_to_write; - int i, tag, ret = 0; - - memset(mpd, 0, sizeof(struct mpage_da_data)); - mpd->wbc = wbc; - mpd->inode = inode; - pagevec_init(&pvec, 0); - index = wbc->range_start >> PAGE_CACHE_SHIFT; - end = wbc->range_end >> PAGE_CACHE_SHIFT; + struct inode *inode = mpd->inode; + struct ext4_map_blocks *map = &mpd->map; + int err; + loff_t disksize; - if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) + mpd->io_submit.io_end->offset = + ((loff_t)map->m_lblk) << inode->i_blkbits; + do { + err = mpage_map_one_extent(handle, mpd); + if (err < 0) { + struct super_block *sb = inode->i_sb; + + if (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED) + goto invalidate_dirty_pages; + /* + * Let the uper layers retry transient errors. + * In the case of ENOSPC, if ext4_count_free_blocks() + * is non-zero, a commit should free up blocks. + */ + if ((err == -ENOMEM) || + (err == -ENOSPC && ext4_count_free_clusters(sb))) + return err; + ext4_msg(sb, KERN_CRIT, + "Delayed block allocation failed for " + "inode %lu at logical offset %llu with" + " max blocks %u with error %d", + inode->i_ino, + (unsigned long long)map->m_lblk, + (unsigned)map->m_len, -err); + ext4_msg(sb, KERN_CRIT, + "This should not happen!! Data will " + "be lost\n"); + if (err == -ENOSPC) + ext4_print_free_blocks(inode); + invalidate_dirty_pages: + *give_up_on_write = true; + return err; + } + /* + * Update buffer state, submit mapped pages, and get us new + * extent to map + */ + err = mpage_map_and_submit_buffers(mpd); + if (err < 0) + return err; + } while (map->m_len); + + /* + * Update on-disk size after IO is submitted. Races with + * truncate are avoided by checking i_size under i_data_sem. + */ + disksize = ((loff_t)mpd->first_page) << PAGE_CACHE_SHIFT; + if (disksize > EXT4_I(inode)->i_disksize) { + int err2; + loff_t i_size; + + down_write(&EXT4_I(inode)->i_data_sem); + i_size = i_size_read(inode); + if (disksize > i_size) + disksize = i_size; + if (disksize > EXT4_I(inode)->i_disksize) + EXT4_I(inode)->i_disksize = disksize; + err2 = ext4_mark_inode_dirty(handle, inode); + up_write(&EXT4_I(inode)->i_data_sem); + if (err2) + ext4_error(inode->i_sb, + "Failed to mark inode %lu dirty", + inode->i_ino); + if (!err) + err = err2; + } + return err; +} + +/* + * Calculate the total number of credits to reserve for one writepages + * iteration. This is called from ext4_writepages(). We map an extent of + * up to MAX_WRITEPAGES_EXTENT_LEN blocks and then we go on and finish mapping + * the last partial page. So in total we can map MAX_WRITEPAGES_EXTENT_LEN + + * bpp - 1 blocks in bpp different extents. + */ +static int ext4_da_writepages_trans_blocks(struct inode *inode) +{ + int bpp = ext4_journal_blocks_per_page(inode); + + return ext4_meta_trans_blocks(inode, + MAX_WRITEPAGES_EXTENT_LEN + bpp - 1, bpp); +} + +/* + * mpage_prepare_extent_to_map - find & lock contiguous range of dirty pages + * and underlying extent to map + * + * @mpd - where to look for pages + * + * Walk dirty pages in the mapping. If they are fully mapped, submit them for + * IO immediately. When we find a page which isn't mapped we start accumulating + * extent of buffers underlying these pages that needs mapping (formed by + * either delayed or unwritten buffers). We also lock the pages containing + * these buffers. The extent found is returned in @mpd structure (starting at + * mpd->lblk with length mpd->len blocks). + * + * Note that this function can attach bios to one io_end structure which are + * neither logically nor physically contiguous. Although it may seem as an + * unnecessary complication, it is actually inevitable in blocksize < pagesize + * case as we need to track IO to all buffers underlying a page in one io_end. + */ +static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) +{ + struct address_space *mapping = mpd->inode->i_mapping; + struct pagevec pvec; + unsigned int nr_pages; + long left = mpd->wbc->nr_to_write; + pgoff_t index = mpd->first_page; + pgoff_t end = mpd->last_page; + int tag; + int i, err = 0; + int blkbits = mpd->inode->i_blkbits; + ext4_lblk_t lblk; + struct buffer_head *head; + + if (mpd->wbc->sync_mode == WB_SYNC_ALL || mpd->wbc->tagged_writepages) tag = PAGECACHE_TAG_TOWRITE; else tag = PAGECACHE_TAG_DIRTY; - *done_index = index; + pagevec_init(&pvec, 0); + mpd->map.m_len = 0; + mpd->next_page = index; while (index <= end) { nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag, min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); if (nr_pages == 0) - return 0; + goto out; for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; @@ -2045,31 +2356,32 @@ static int write_cache_pages_da(struct address_space *mapping, if (page->index > end) goto out; - *done_index = page->index + 1; - /* - * If we can't merge this page, and we have - * accumulated an contiguous region, write it + * Accumulated enough dirty pages? This doesn't apply + * to WB_SYNC_ALL mode. For integrity sync we have to + * keep going because someone may be concurrently + * dirtying pages, and we might have synced a lot of + * newly appeared dirty pages, but have not synced all + * of the old dirty pages. */ - if ((mpd->next_page != page->index) && - (mpd->next_page != mpd->first_page)) { - mpage_da_map_and_submit(mpd); - goto ret_extent_tail; - } + if (mpd->wbc->sync_mode == WB_SYNC_NONE && left <= 0) + goto out; - lock_page(page); + /* If we can't merge this page, we are done. */ + if (mpd->map.m_len > 0 && mpd->next_page != page->index) + goto out; + lock_page(page); /* - * If the page is no longer dirty, or its - * mapping no longer corresponds to inode we - * are writing (which means it has been - * truncated or invalidated), or the page is - * already under writeback and we are not - * doing a data integrity writeback, skip the page + * If the page is no longer dirty, or its mapping no + * longer corresponds to inode we are writing (which + * means it has been truncated or invalidated), or the + * page is already under writeback and we are not doing + * a data integrity writeback, skip the page */ if (!PageDirty(page) || (PageWriteback(page) && - (wbc->sync_mode == WB_SYNC_NONE)) || + (mpd->wbc->sync_mode == WB_SYNC_NONE)) || unlikely(page->mapping != mapping)) { unlock_page(page); continue; @@ -2078,108 +2390,54 @@ static int write_cache_pages_da(struct address_space *mapping, wait_on_page_writeback(page); BUG_ON(PageWriteback(page)); - if (mpd->next_page != page->index) + if (mpd->map.m_len == 0) mpd->first_page = page->index; mpd->next_page = page->index + 1; - logical = (sector_t) page->index << - (PAGE_CACHE_SHIFT - inode->i_blkbits); - - if (!page_has_buffers(page)) { - mpage_add_bh_to_extent(mpd, logical, - PAGE_CACHE_SIZE, - (1 << BH_Dirty) | (1 << BH_Uptodate)); - if (mpd->io_done) - goto ret_extent_tail; - } else { - /* - * Page with regular buffer heads, - * just add all dirty ones - */ - head = page_buffers(page); - bh = head; - do { - BUG_ON(buffer_locked(bh)); - /* - * We need to try to allocate - * unmapped blocks in the same page. - * Otherwise we won't make progress - * with the page in ext4_writepage - */ - if (ext4_bh_delay_or_unwritten(NULL, bh)) { - mpage_add_bh_to_extent(mpd, logical, - bh->b_size, - bh->b_state); - if (mpd->io_done) - goto ret_extent_tail; - } else if (buffer_dirty(bh) && (buffer_mapped(bh))) { - /* - * mapped dirty buffer. We need - * to update the b_state - * because we look at b_state - * in mpage_da_map_blocks. We - * don't update b_size because - * if we find an unmapped - * buffer_head later we need to - * use the b_state flag of that - * buffer_head. - */ - if (mpd->b_size == 0) - mpd->b_state = bh->b_state & BH_FLAGS; - } - logical++; - } while ((bh = bh->b_this_page) != head); - } - - if (nr_to_write > 0) { - nr_to_write--; - if (nr_to_write == 0 && - wbc->sync_mode == WB_SYNC_NONE) - /* - * We stop writing back only if we are - * not doing integrity sync. In case of - * integrity sync we have to keep going - * because someone may be concurrently - * dirtying pages, and we might have - * synced a lot of newly appeared dirty - * pages, but have not synced all of the - * old dirty pages. - */ - goto out; - } + /* Add all dirty buffers to mpd */ + lblk = ((ext4_lblk_t)page->index) << + (PAGE_CACHE_SHIFT - blkbits); + head = page_buffers(page); + err = mpage_process_page_bufs(mpd, head, head, lblk); + if (err <= 0) + goto out; + err = 0; + left--; } pagevec_release(&pvec); cond_resched(); } return 0; -ret_extent_tail: - ret = MPAGE_DA_EXTENT_TAIL; out: pagevec_release(&pvec); - cond_resched(); - return ret; + return err; } +static int __writepage(struct page *page, struct writeback_control *wbc, + void *data) +{ + struct address_space *mapping = data; + int ret = ext4_writepage(page, wbc); + mapping_set_error(mapping, ret); + return ret; +} -static int ext4_da_writepages(struct address_space *mapping, - struct writeback_control *wbc) +static int ext4_writepages(struct address_space *mapping, + struct writeback_control *wbc) { - pgoff_t index; + pgoff_t writeback_index = 0; + long nr_to_write = wbc->nr_to_write; int range_whole = 0; + int cycled = 1; handle_t *handle = NULL; struct mpage_da_data mpd; struct inode *inode = mapping->host; - int pages_written = 0; - unsigned int max_pages; - int range_cyclic, cycled = 1, io_done = 0; - int needed_blocks, ret = 0; - long desired_nr_to_write, nr_to_writebump = 0; - loff_t range_start = wbc->range_start; + int needed_blocks, rsv_blocks = 0, ret = 0; struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); - pgoff_t done_index = 0; - pgoff_t end; + bool done; struct blk_plug plug; + bool give_up_on_write = false; - trace_ext4_da_writepages(inode, wbc); + trace_ext4_writepages(inode, wbc); /* * No pages to write? This is mainly a kludge to avoid starting @@ -2187,7 +2445,16 @@ static int ext4_da_writepages(struct address_space *mapping, * because that could violate lock ordering on umount */ if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) - return 0; + goto out_writepages; + + if (ext4_should_journal_data(inode)) { + struct blk_plug plug; + + blk_start_plug(&plug); + ret = write_cache_pages(mapping, wbc, __writepage, mapping); + blk_finish_plug(&plug); + goto out_writepages; + } /* * If the filesystem has aborted, it is read-only, so return @@ -2195,163 +2462,157 @@ static int ext4_da_writepages(struct address_space *mapping, * will obscure the real source of the problem. We test * EXT4_MF_FS_ABORTED instead of sb->s_flag's MS_RDONLY because * the latter could be true if the filesystem is mounted - * read-only, and in that case, ext4_da_writepages should + * read-only, and in that case, ext4_writepages should * *never* be called, so if that ever happens, we would want * the stack trace. */ - if (unlikely(sbi->s_mount_flags & EXT4_MF_FS_ABORTED)) - return -EROFS; + if (unlikely(sbi->s_mount_flags & EXT4_MF_FS_ABORTED)) { + ret = -EROFS; + goto out_writepages; + } + + if (ext4_should_dioread_nolock(inode)) { + /* + * We may need to convert up to one extent per block in + * the page and we may dirty the inode. + */ + rsv_blocks = 1 + (PAGE_CACHE_SIZE >> inode->i_blkbits); + } + + /* + * If we have inline data and arrive here, it means that + * we will soon create the block for the 1st page, so + * we'd better clear the inline data here. + */ + if (ext4_has_inline_data(inode)) { + /* Just inode will be modified... */ + handle = ext4_journal_start(inode, EXT4_HT_INODE, 1); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + goto out_writepages; + } + BUG_ON(ext4_test_inode_state(inode, + EXT4_STATE_MAY_INLINE_DATA)); + ext4_destroy_inline_data(handle, inode); + ext4_journal_stop(handle); + } if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) range_whole = 1; - range_cyclic = wbc->range_cyclic; if (wbc->range_cyclic) { - index = mapping->writeback_index; - if (index) + writeback_index = mapping->writeback_index; + if (writeback_index) cycled = 0; - wbc->range_start = index << PAGE_CACHE_SHIFT; - wbc->range_end = LLONG_MAX; - wbc->range_cyclic = 0; - end = -1; + mpd.first_page = writeback_index; + mpd.last_page = -1; } else { - index = wbc->range_start >> PAGE_CACHE_SHIFT; - end = wbc->range_end >> PAGE_CACHE_SHIFT; - } - - /* - * This works around two forms of stupidity. The first is in - * the writeback code, which caps the maximum number of pages - * written to be 1024 pages. This is wrong on multiple - * levels; different architectues have a different page size, - * which changes the maximum amount of data which gets - * written. Secondly, 4 megabytes is way too small. XFS - * forces this value to be 16 megabytes by multiplying - * nr_to_write parameter by four, and then relies on its - * allocator to allocate larger extents to make them - * contiguous. Unfortunately this brings us to the second - * stupidity, which is that ext4's mballoc code only allocates - * at most 2048 blocks. So we force contiguous writes up to - * the number of dirty blocks in the inode, or - * sbi->max_writeback_mb_bump whichever is smaller. - */ - max_pages = sbi->s_max_writeback_mb_bump << (20 - PAGE_CACHE_SHIFT); - if (!range_cyclic && range_whole) { - if (wbc->nr_to_write == LONG_MAX) - desired_nr_to_write = wbc->nr_to_write; - else - desired_nr_to_write = wbc->nr_to_write * 8; - } else - desired_nr_to_write = ext4_num_dirty_pages(inode, index, - max_pages); - if (desired_nr_to_write > max_pages) - desired_nr_to_write = max_pages; - - if (wbc->nr_to_write < desired_nr_to_write) { - nr_to_writebump = desired_nr_to_write - wbc->nr_to_write; - wbc->nr_to_write = desired_nr_to_write; + mpd.first_page = wbc->range_start >> PAGE_CACHE_SHIFT; + mpd.last_page = wbc->range_end >> PAGE_CACHE_SHIFT; } + mpd.inode = inode; + mpd.wbc = wbc; + ext4_io_submit_init(&mpd.io_submit, wbc); retry: if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) - tag_pages_for_writeback(mapping, index, end); - + tag_pages_for_writeback(mapping, mpd.first_page, mpd.last_page); + done = false; blk_start_plug(&plug); - while (!ret && wbc->nr_to_write > 0) { + while (!done && mpd.first_page <= mpd.last_page) { + /* For each extent of pages we use new io_end */ + mpd.io_submit.io_end = ext4_init_io_end(inode, GFP_KERNEL); + if (!mpd.io_submit.io_end) { + ret = -ENOMEM; + break; + } /* - * we insert one extent at a time. So we need - * credit needed for single extent allocation. - * journalled mode is currently not supported - * by delalloc + * We have two constraints: We find one extent to map and we + * must always write out whole page (makes a difference when + * blocksize < pagesize) so that we don't block on IO when we + * try to write out the rest of the page. Journalled mode is + * not supported by delalloc. */ BUG_ON(ext4_should_journal_data(inode)); needed_blocks = ext4_da_writepages_trans_blocks(inode); - /* start a new transaction*/ - handle = ext4_journal_start(inode, needed_blocks); + /* start a new transaction */ + handle = ext4_journal_start_with_reserve(inode, + EXT4_HT_WRITE_PAGE, needed_blocks, rsv_blocks); if (IS_ERR(handle)) { ret = PTR_ERR(handle); ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: " "%ld pages, ino %lu; err %d", __func__, wbc->nr_to_write, inode->i_ino, ret); - blk_finish_plug(&plug); - goto out_writepages; + /* Release allocated io_end */ + ext4_put_io_end(mpd.io_submit.io_end); + break; } - /* - * Now call write_cache_pages_da() to find the next - * contiguous region of logical blocks that need - * blocks to be allocated by ext4 and submit them. - */ - ret = write_cache_pages_da(mapping, wbc, &mpd, &done_index); - /* - * If we have a contiguous extent of pages and we - * haven't done the I/O yet, map the blocks and submit - * them for I/O. - */ - if (!mpd.io_done && mpd.next_page != mpd.first_page) { - mpage_da_map_and_submit(&mpd); - ret = MPAGE_DA_EXTENT_TAIL; + trace_ext4_da_write_pages(inode, mpd.first_page, mpd.wbc); + ret = mpage_prepare_extent_to_map(&mpd); + if (!ret) { + if (mpd.map.m_len) + ret = mpage_map_and_submit_extent(handle, &mpd, + &give_up_on_write); + else { + /* + * We scanned the whole range (or exhausted + * nr_to_write), submitted what was mapped and + * didn't find anything needing mapping. We are + * done. + */ + done = true; + } } - trace_ext4_da_write_pages(inode, &mpd); - wbc->nr_to_write -= mpd.pages_written; - ext4_journal_stop(handle); - - if ((mpd.retval == -ENOSPC) && sbi->s_journal) { - /* commit the transaction which would + /* Submit prepared bio */ + ext4_io_submit(&mpd.io_submit); + /* Unlock pages we didn't use */ + mpage_release_unused_pages(&mpd, give_up_on_write); + /* Drop our io_end reference we got from init */ + ext4_put_io_end(mpd.io_submit.io_end); + + if (ret == -ENOSPC && sbi->s_journal) { + /* + * Commit the transaction which would * free blocks released in the transaction * and try again */ jbd2_journal_force_commit_nested(sbi->s_journal); ret = 0; - } else if (ret == MPAGE_DA_EXTENT_TAIL) { - /* - * Got one extent now try with rest of the pages. - * If mpd.retval is set -EIO, journal is aborted. - * So we don't need to write any more. - */ - pages_written += mpd.pages_written; - ret = mpd.retval; - io_done = 1; - } else if (wbc->nr_to_write) - /* - * There is no more writeout needed - * or we requested for a noblocking writeout - * and we found the device congested - */ + continue; + } + /* Fatal error - ENOMEM, EIO... */ + if (ret) break; } blk_finish_plug(&plug); - if (!io_done && !cycled) { + if (!ret && !cycled && wbc->nr_to_write > 0) { cycled = 1; - index = 0; - wbc->range_start = index << PAGE_CACHE_SHIFT; - wbc->range_end = mapping->writeback_index - 1; + mpd.last_page = writeback_index - 1; + mpd.first_page = 0; goto retry; } /* Update index */ - wbc->range_cyclic = range_cyclic; if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) /* - * set the writeback_index so that range_cyclic + * Set the writeback_index so that range_cyclic * mode will write it back later */ - mapping->writeback_index = done_index; + mapping->writeback_index = mpd.first_page; out_writepages: - wbc->nr_to_write -= nr_to_writebump; - wbc->range_start = range_start; - trace_ext4_da_writepages_result(inode, wbc, ret, pages_written); + trace_ext4_writepages_result(inode, wbc, ret, + nr_to_write - wbc->nr_to_write); return ret; } -#define FALL_BACK_TO_NONDELALLOC 1 static int ext4_nonda_switch(struct super_block *sb) { - s64 free_blocks, dirty_blocks; + s64 free_clusters, dirty_clusters; struct ext4_sb_info *sbi = EXT4_SB(sb); /* @@ -2362,24 +2623,24 @@ static int ext4_nonda_switch(struct super_block *sb) * Delalloc need an accurate free block accounting. So switch * to non delalloc when we are near to error range. */ - free_blocks = EXT4_C2B(sbi, - percpu_counter_read_positive(&sbi->s_freeclusters_counter)); - dirty_blocks = percpu_counter_read_positive(&sbi->s_dirtyclusters_counter); - if (2 * free_blocks < 3 * dirty_blocks || - free_blocks < (dirty_blocks + EXT4_FREECLUSTERS_WATERMARK)) { + free_clusters = + percpu_counter_read_positive(&sbi->s_freeclusters_counter); + dirty_clusters = + percpu_counter_read_positive(&sbi->s_dirtyclusters_counter); + /* + * Start pushing delalloc when 1/2 of free blocks are dirty. + */ + if (dirty_clusters && (free_clusters < 2 * dirty_clusters)) + try_to_writeback_inodes_sb(sb, WB_REASON_FS_FREE_SPACE); + + if (2 * free_clusters < 3 * dirty_clusters || + free_clusters < (dirty_clusters + EXT4_FREECLUSTERS_WATERMARK)) { /* * free block count is less than 150% of dirty blocks * or free blocks is less than watermark */ return 1; } - /* - * Even if we don't switch but are nearing capacity, - * start pushing delalloc when 1/2 of free blocks are dirty. - */ - if (free_blocks < 2 * dirty_blocks) - writeback_inodes_sb_if_idle(sb, WB_REASON_FS_FREE_SPACE); - return 0; } @@ -2402,35 +2663,58 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping, } *fsdata = (void *)0; trace_ext4_da_write_begin(inode, pos, len, flags); -retry: + + if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) { + ret = ext4_da_write_inline_data_begin(mapping, inode, + pos, len, flags, + pagep, fsdata); + if (ret < 0) + return ret; + if (ret == 1) + return 0; + } + + /* + * grab_cache_page_write_begin() can take a long time if the + * system is thrashing due to memory pressure, or if the page + * is being written back. So grab it first before we start + * the transaction handle. This also allows us to allocate + * the page (if needed) without using GFP_NOFS. + */ +retry_grab: + page = grab_cache_page_write_begin(mapping, index, flags); + if (!page) + return -ENOMEM; + unlock_page(page); + /* * With delayed allocation, we don't log the i_disksize update * if there is delayed block allocation. But we still need * to journalling the i_disksize update if writes to the end * of file which has an already mapped buffer. */ - handle = ext4_journal_start(inode, 1); +retry_journal: + handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, 1); if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - goto out; + page_cache_release(page); + return PTR_ERR(handle); } - /* We cannot recurse into the filesystem as the transaction is already - * started */ - flags |= AOP_FLAG_NOFS; - page = grab_cache_page_write_begin(mapping, index, flags); - if (!page) { + lock_page(page); + if (page->mapping != mapping) { + /* The page got truncated from under us */ + unlock_page(page); + page_cache_release(page); ext4_journal_stop(handle); - ret = -ENOMEM; - goto out; + goto retry_grab; } - *pagep = page; + /* In case writeback began while the page was unlocked */ + wait_for_stable_page(page); ret = __block_write_begin(page, pos, len, ext4_da_get_block_prep); if (ret < 0) { unlock_page(page); ext4_journal_stop(handle); - page_cache_release(page); /* * block_write_begin may have instantiated a few blocks * outside i_size. Trim these off again. Don't need @@ -2438,11 +2722,16 @@ retry: */ if (pos + len > inode->i_size) ext4_truncate_failed_write(inode); + + if (ret == -ENOSPC && + ext4_should_retry_alloc(inode->i_sb, &retries)) + goto retry_journal; + + page_cache_release(page); + return ret; } - if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) - goto retry; -out: + *pagep = page; return ret; } @@ -2481,17 +2770,9 @@ static int ext4_da_write_end(struct file *file, unsigned long start, end; int write_mode = (int)(unsigned long)fsdata; - if (write_mode == FALL_BACK_TO_NONDELALLOC) { - if (ext4_should_order_data(inode)) { - return ext4_ordered_write_end(file, mapping, pos, - len, copied, page, fsdata); - } else if (ext4_should_writeback_data(inode)) { - return ext4_writeback_write_end(file, mapping, pos, - len, copied, page, fsdata); - } else { - BUG(); - } - } + if (write_mode == FALL_BACK_TO_NONDELALLOC) + return ext4_write_end(file, mapping, pos, + len, copied, page, fsdata); trace_ext4_da_write_end(inode, pos, len, copied); start = pos & (PAGE_CACHE_SIZE - 1); @@ -2502,22 +2783,13 @@ static int ext4_da_write_end(struct file *file, * changes. So let's piggyback the i_disksize mark_inode_dirty * into that. */ - new_i_size = pos + copied; if (copied && new_i_size > EXT4_I(inode)->i_disksize) { - if (ext4_da_should_update_i_disksize(page, end)) { + if (ext4_has_inline_data(inode) || + ext4_da_should_update_i_disksize(page, end)) { down_write(&EXT4_I(inode)->i_data_sem); - if (new_i_size > EXT4_I(inode)->i_disksize) { - /* - * Updating i_disksize when extending file - * without needing block allocation - */ - if (ext4_should_order_data(inode)) - ret = ext4_jbd2_file_inode(handle, - inode); - + if (new_i_size > EXT4_I(inode)->i_disksize) EXT4_I(inode)->i_disksize = new_i_size; - } up_write(&EXT4_I(inode)->i_data_sem); /* We need to mark inode dirty even if * new_i_size is less that inode->i_size @@ -2526,8 +2798,16 @@ static int ext4_da_write_end(struct file *file, ext4_mark_inode_dirty(handle, inode); } } - ret2 = generic_write_end(file, mapping, pos, len, copied, + + if (write_mode != CONVERT_INLINE_DATA && + ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA) && + ext4_has_inline_data(inode)) + ret2 = ext4_da_write_inline_data_end(inode, pos, len, copied, + page); + else + ret2 = generic_write_end(file, mapping, pos, len, copied, page, fsdata); + copied = ret2; if (ret2 < 0) ret = ret2; @@ -2538,7 +2818,8 @@ static int ext4_da_write_end(struct file *file, return ret ? ret : copied; } -static void ext4_da_invalidatepage(struct page *page, unsigned long offset) +static void ext4_da_invalidatepage(struct page *page, unsigned int offset, + unsigned int length) { /* * Drop reserved blocks @@ -2547,10 +2828,10 @@ static void ext4_da_invalidatepage(struct page *page, unsigned long offset) if (!page_has_buffers(page)) goto out; - ext4_da_page_release_reservation(page, offset); + ext4_da_page_release_reservation(page, offset, length); out: - ext4_invalidatepage(page, offset); + ext4_invalidatepage(page, offset, length); return; } @@ -2573,7 +2854,7 @@ int ext4_alloc_da_blocks(struct inode *inode) * laptop_mode, not even desirable). However, to do otherwise * would require replicating code paths in: * - * ext4_da_writepages() -> + * ext4_writepages() -> * write_cache_pages() ---> (via passed in callback function) * __mpage_da_writepage() --> * mpage_add_bh_to_extent() @@ -2620,6 +2901,12 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block) journal_t *journal; int err; + /* + * We can get here for an inline file via the FIBMAP ioctl + */ + if (ext4_has_inline_data(inode)) + return 0; + if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) && test_opt(inode->i_sb, DELALLOC)) { /* @@ -2665,58 +2952,67 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block) static int ext4_readpage(struct file *file, struct page *page) { + int ret = -EAGAIN; + struct inode *inode = page->mapping->host; + trace_ext4_readpage(page); - return mpage_readpage(page, ext4_get_block); + + if (ext4_has_inline_data(inode)) + ret = ext4_readpage_inline(inode, page); + + if (ret == -EAGAIN) + return mpage_readpage(page, ext4_get_block); + + return ret; } static int ext4_readpages(struct file *file, struct address_space *mapping, struct list_head *pages, unsigned nr_pages) { + struct inode *inode = mapping->host; + + /* If the file has inline data, no need to do readpages. */ + if (ext4_has_inline_data(inode)) + return 0; + return mpage_readpages(mapping, pages, nr_pages, ext4_get_block); } -static void ext4_invalidatepage_free_endio(struct page *page, unsigned long offset) +static void ext4_invalidatepage(struct page *page, unsigned int offset, + unsigned int length) { - struct buffer_head *head, *bh; - unsigned int curr_off = 0; + trace_ext4_invalidatepage(page, offset, length); - if (!page_has_buffers(page)) - return; - head = bh = page_buffers(page); - do { - if (offset <= curr_off && test_clear_buffer_uninit(bh) - && bh->b_private) { - ext4_free_io_end(bh->b_private); - bh->b_private = NULL; - bh->b_end_io = NULL; - } - curr_off = curr_off + bh->b_size; - bh = bh->b_this_page; - } while (bh != head); + /* No journalling happens on data buffers when this function is used */ + WARN_ON(page_has_buffers(page) && buffer_jbd(page_buffers(page))); + + block_invalidatepage(page, offset, length); } -static void ext4_invalidatepage(struct page *page, unsigned long offset) +static int __ext4_journalled_invalidatepage(struct page *page, + unsigned int offset, + unsigned int length) { journal_t *journal = EXT4_JOURNAL(page->mapping->host); - trace_ext4_invalidatepage(page, offset); + trace_ext4_journalled_invalidatepage(page, offset, length); /* - * free any io_end structure allocated for buffers to be discarded - */ - if (ext4_should_dioread_nolock(page->mapping->host)) - ext4_invalidatepage_free_endio(page, offset); - /* * If it's a full truncate we just forget about the pending dirtying */ - if (offset == 0) + if (offset == 0 && length == PAGE_CACHE_SIZE) ClearPageChecked(page); - if (journal) - jbd2_journal_invalidatepage(journal, page, offset); - else - block_invalidatepage(page, offset); + return jbd2_journal_invalidatepage(journal, page, offset, length); +} + +/* Wrapper for aops... */ +static void ext4_journalled_invalidatepage(struct page *page, + unsigned int offset, + unsigned int length) +{ + WARN_ON(__ext4_journalled_invalidatepage(page, offset, length) < 0); } static int ext4_releasepage(struct page *page, gfp_t wait) @@ -2725,8 +3021,8 @@ static int ext4_releasepage(struct page *page, gfp_t wait) trace_ext4_releasepage(page); - WARN_ON(PageChecked(page)); - if (!page_has_buffers(page)) + /* Page has dirty journalled data -> cannot release */ + if (PageChecked(page)) return 0; if (journal) return jbd2_journal_try_to_free_buffers(journal, page, wait); @@ -2739,7 +3035,7 @@ static int ext4_releasepage(struct page *page, gfp_t wait) * We allocate an uinitialized extent if blocks haven't been allocated. * The extent will be converted to initialized after the IO is complete. */ -static int ext4_get_block_write(struct inode *inode, sector_t iblock, +int ext4_get_block_write(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) { ext4_debug("ext4_get_block_write: inode %lu, create flag %d\n", @@ -2748,124 +3044,33 @@ static int ext4_get_block_write(struct inode *inode, sector_t iblock, EXT4_GET_BLOCKS_IO_CREATE_EXT); } +static int ext4_get_block_write_nolock(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) +{ + ext4_debug("ext4_get_block_write_nolock: inode %lu, create flag %d\n", + inode->i_ino, create); + return _ext4_get_block(inode, iblock, bh_result, + EXT4_GET_BLOCKS_NO_LOCK); +} + static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, - ssize_t size, void *private, int ret, - bool is_async) + ssize_t size, void *private) { - struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode; ext4_io_end_t *io_end = iocb->private; - struct workqueue_struct *wq; - unsigned long flags; - struct ext4_inode_info *ei; - /* if not async direct IO or dio with 0 bytes write, just return */ - if (!io_end || !size) - goto out; + /* if not async direct IO just return */ + if (!io_end) + return; ext_debug("ext4_end_io_dio(): io_end 0x%p " - "for inode %lu, iocb 0x%p, offset %llu, size %llu\n", + "for inode %lu, iocb 0x%p, offset %llu, size %zd\n", iocb->private, io_end->inode->i_ino, iocb, offset, size); iocb->private = NULL; - - /* if not aio dio with unwritten extents, just free io and return */ - if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) { - ext4_free_io_end(io_end); -out: - if (is_async) - aio_complete(iocb, ret, 0); - inode_dio_done(inode); - return; - } - - io_end->offset = offset; - io_end->size = size; - if (is_async) { - io_end->iocb = iocb; - io_end->result = ret; - } - wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq; - - /* Add the io_end to per-inode completed aio dio list*/ - ei = EXT4_I(io_end->inode); - spin_lock_irqsave(&ei->i_completed_io_lock, flags); - list_add_tail(&io_end->list, &ei->i_completed_io_list); - spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); - - /* queue the work to convert unwritten extents to written */ - queue_work(wq, &io_end->work); - - /* XXX: probably should move into the real I/O completion handler */ - inode_dio_done(inode); -} - -static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate) -{ - ext4_io_end_t *io_end = bh->b_private; - struct workqueue_struct *wq; - struct inode *inode; - unsigned long flags; - - if (!test_clear_buffer_uninit(bh) || !io_end) - goto out; - - if (!(io_end->inode->i_sb->s_flags & MS_ACTIVE)) { - printk("sb umounted, discard end_io request for inode %lu\n", - io_end->inode->i_ino); - ext4_free_io_end(io_end); - goto out; - } - - /* - * It may be over-defensive here to check EXT4_IO_END_UNWRITTEN now, - * but being more careful is always safe for the future change. - */ - inode = io_end->inode; - ext4_set_io_unwritten_flag(inode, io_end); - - /* Add the io_end to per-inode completed io list*/ - spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags); - list_add_tail(&io_end->list, &EXT4_I(inode)->i_completed_io_list); - spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags); - - wq = EXT4_SB(inode->i_sb)->dio_unwritten_wq; - /* queue the work to convert unwritten extents to written */ - queue_work(wq, &io_end->work); -out: - bh->b_private = NULL; - bh->b_end_io = NULL; - clear_buffer_uninit(bh); - end_buffer_async_write(bh, uptodate); -} - -static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode) -{ - ext4_io_end_t *io_end; - struct page *page = bh->b_page; - loff_t offset = (sector_t)page->index << PAGE_CACHE_SHIFT; - size_t size = bh->b_size; - -retry: - io_end = ext4_init_io_end(inode, GFP_ATOMIC); - if (!io_end) { - pr_warn_ratelimited("%s: allocation fail\n", __func__); - schedule(); - goto retry; - } io_end->offset = offset; io_end->size = size; - /* - * We need to hold a reference to the page to make sure it - * doesn't get evicted before ext4_end_io_work() has a chance - * to convert the extent from written to unwritten. - */ - io_end->page = page; - get_page(io_end->page); - - bh->b_private = io_end; - bh->b_end_io = ext4_end_io_buffer_write; - return 0; + ext4_put_io_end(io_end); } /* @@ -2873,11 +3078,11 @@ retry: * preallocated extents, and those write extend the file, no need to * fall back to buffered IO. * - * For holes, we fallocate those blocks, mark them as uninitialized - * If those blocks were preallocated, we mark sure they are splited, but - * still keep the range to write as uninitialized. + * For holes, we fallocate those blocks, mark them as unwritten + * If those blocks were preallocated, we mark sure they are split, but + * still keep the range to write as unwritten. * - * The unwrritten extents will be converted to written when DIO is completed. + * The unwritten extents will be converted to written when DIO is completed. * For async direct IO, since the IO may still pending when return, we * set up an end_io call back function, which will do the conversion * when async direct IO completed. @@ -2888,104 +3093,147 @@ retry: * */ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, - const struct iovec *iov, loff_t offset, - unsigned long nr_segs) + struct iov_iter *iter, loff_t offset) { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; ssize_t ret; - size_t count = iov_length(iov, nr_segs); - + size_t count = iov_iter_count(iter); + int overwrite = 0; + get_block_t *get_block_func = NULL; + int dio_flags = 0; loff_t final_size = offset + count; - if (rw == WRITE && final_size <= inode->i_size) { - /* - * We could direct write to holes and fallocate. - * - * Allocated blocks to fill the hole are marked as uninitialized - * to prevent parallel buffered read to expose the stale data - * before DIO complete the data IO. - * - * As to previously fallocated extents, ext4 get_block - * will just simply mark the buffer mapped but still - * keep the extents uninitialized. - * - * for non AIO case, we will convert those unwritten extents - * to written after return back from blockdev_direct_IO. - * - * for async DIO, the conversion needs to be defered when - * the IO is completed. The ext4 end_io callback function - * will be called to take care of the conversion work. - * Here for async case, we allocate an io_end structure to - * hook to the iocb. - */ - iocb->private = NULL; - EXT4_I(inode)->cur_aio_dio = NULL; - if (!is_sync_kiocb(iocb)) { - iocb->private = ext4_init_io_end(inode, GFP_NOFS); - if (!iocb->private) - return -ENOMEM; - /* - * we save the io structure for current async - * direct IO, so that later ext4_map_blocks() - * could flag the io structure whether there - * is a unwritten extents needs to be converted - * when IO is completed. - */ - EXT4_I(inode)->cur_aio_dio = iocb->private; + ext4_io_end_t *io_end = NULL; + + /* Use the old path for reads and writes beyond i_size. */ + if (rw != WRITE || final_size > inode->i_size) + return ext4_ind_direct_IO(rw, iocb, iter, offset); + + BUG_ON(iocb->private == NULL); + + /* + * Make all waiters for direct IO properly wait also for extent + * conversion. This also disallows race between truncate() and + * overwrite DIO as i_dio_count needs to be incremented under i_mutex. + */ + if (rw == WRITE) + atomic_inc(&inode->i_dio_count); + + /* If we do a overwrite dio, i_mutex locking can be released */ + overwrite = *((int *)iocb->private); + + if (overwrite) { + down_read(&EXT4_I(inode)->i_data_sem); + mutex_unlock(&inode->i_mutex); + } + + /* + * We could direct write to holes and fallocate. + * + * Allocated blocks to fill the hole are marked as + * unwritten to prevent parallel buffered read to expose + * the stale data before DIO complete the data IO. + * + * As to previously fallocated extents, ext4 get_block will + * just simply mark the buffer mapped but still keep the + * extents unwritten. + * + * For non AIO case, we will convert those unwritten extents + * to written after return back from blockdev_direct_IO. + * + * For async DIO, the conversion needs to be deferred when the + * IO is completed. The ext4 end_io callback function will be + * called to take care of the conversion work. Here for async + * case, we allocate an io_end structure to hook to the iocb. + */ + iocb->private = NULL; + ext4_inode_aio_set(inode, NULL); + if (!is_sync_kiocb(iocb)) { + io_end = ext4_init_io_end(inode, GFP_NOFS); + if (!io_end) { + ret = -ENOMEM; + goto retake_lock; } + /* + * Grab reference for DIO. Will be dropped in ext4_end_io_dio() + */ + iocb->private = ext4_get_io_end(io_end); + /* + * we save the io structure for current async direct + * IO, so that later ext4_map_blocks() could flag the + * io structure whether there is a unwritten extents + * needs to be converted when IO is completed. + */ + ext4_inode_aio_set(inode, io_end); + } - ret = __blockdev_direct_IO(rw, iocb, inode, - inode->i_sb->s_bdev, iov, - offset, nr_segs, - ext4_get_block_write, - ext4_end_io_dio, - NULL, - DIO_LOCKING | DIO_SKIP_HOLES); - if (iocb->private) - EXT4_I(inode)->cur_aio_dio = NULL; + if (overwrite) { + get_block_func = ext4_get_block_write_nolock; + } else { + get_block_func = ext4_get_block_write; + dio_flags = DIO_LOCKING; + } + ret = __blockdev_direct_IO(rw, iocb, inode, + inode->i_sb->s_bdev, iter, + offset, + get_block_func, + ext4_end_io_dio, + NULL, + dio_flags); + + /* + * Put our reference to io_end. This can free the io_end structure e.g. + * in sync IO case or in case of error. It can even perform extent + * conversion if all bios we submitted finished before we got here. + * Note that in that case iocb->private can be already set to NULL + * here. + */ + if (io_end) { + ext4_inode_aio_set(inode, NULL); + ext4_put_io_end(io_end); /* - * The io_end structure takes a reference to the inode, - * that structure needs to be destroyed and the - * reference to the inode need to be dropped, when IO is - * complete, even with 0 byte write, or failed. - * - * In the successful AIO DIO case, the io_end structure will be - * desctroyed and the reference to the inode will be dropped - * after the end_io call back function is called. - * - * In the case there is 0 byte write, or error case, since - * VFS direct IO won't invoke the end_io call back function, - * we need to free the end_io structure here. + * When no IO was submitted ext4_end_io_dio() was not + * called so we have to put iocb's reference. */ - if (ret != -EIOCBQUEUED && ret <= 0 && iocb->private) { - ext4_free_io_end(iocb->private); + if (ret <= 0 && ret != -EIOCBQUEUED && iocb->private) { + WARN_ON(iocb->private != io_end); + WARN_ON(io_end->flag & EXT4_IO_END_UNWRITTEN); + ext4_put_io_end(io_end); iocb->private = NULL; - } else if (ret > 0 && ext4_test_inode_state(inode, - EXT4_STATE_DIO_UNWRITTEN)) { - int err; - /* - * for non AIO case, since the IO is already - * completed, we could do the conversion right here - */ - err = ext4_convert_unwritten_extents(inode, - offset, ret); - if (err < 0) - ret = err; - ext4_clear_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN); } - return ret; + } + if (ret > 0 && !overwrite && ext4_test_inode_state(inode, + EXT4_STATE_DIO_UNWRITTEN)) { + int err; + /* + * for non AIO case, since the IO is already + * completed, we could do the conversion right here + */ + err = ext4_convert_unwritten_extents(NULL, inode, + offset, ret); + if (err < 0) + ret = err; + ext4_clear_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN); + } + +retake_lock: + if (rw == WRITE) + inode_dio_done(inode); + /* take i_mutex locking again if we do a ovewrite dio */ + if (overwrite) { + up_read(&EXT4_I(inode)->i_data_sem); + mutex_lock(&inode->i_mutex); } - /* for write the the end of file case, we fall back to old way */ - return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs); + return ret; } static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb, - const struct iovec *iov, loff_t offset, - unsigned long nr_segs) + struct iov_iter *iter, loff_t offset) { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; + size_t count = iov_iter_count(iter); ssize_t ret; /* @@ -2994,13 +3242,16 @@ static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb, if (ext4_should_journal_data(inode)) return 0; - trace_ext4_direct_IO_enter(inode, offset, iov_length(iov, nr_segs), rw); + /* Let buffer I/O handle the inline data case. */ + if (ext4_has_inline_data(inode)) + return 0; + + trace_ext4_direct_IO_enter(inode, offset, count, rw); if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) - ret = ext4_ext_direct_IO(rw, iocb, iov, offset, nr_segs); + ret = ext4_ext_direct_IO(rw, iocb, iter, offset); else - ret = ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs); - trace_ext4_direct_IO_exit(inode, offset, - iov_length(iov, nr_segs), rw, ret); + ret = ext4_ind_direct_IO(rw, iocb, iter, offset); + trace_ext4_direct_IO_exit(inode, offset, count, rw, ret); return ret; } @@ -3023,27 +3274,13 @@ static int ext4_journalled_set_page_dirty(struct page *page) return __set_page_dirty_nobuffers(page); } -static const struct address_space_operations ext4_ordered_aops = { - .readpage = ext4_readpage, - .readpages = ext4_readpages, - .writepage = ext4_writepage, - .write_begin = ext4_write_begin, - .write_end = ext4_ordered_write_end, - .bmap = ext4_bmap, - .invalidatepage = ext4_invalidatepage, - .releasepage = ext4_releasepage, - .direct_IO = ext4_direct_IO, - .migratepage = buffer_migrate_page, - .is_partially_uptodate = block_is_partially_uptodate, - .error_remove_page = generic_error_remove_page, -}; - -static const struct address_space_operations ext4_writeback_aops = { +static const struct address_space_operations ext4_aops = { .readpage = ext4_readpage, .readpages = ext4_readpages, .writepage = ext4_writepage, + .writepages = ext4_writepages, .write_begin = ext4_write_begin, - .write_end = ext4_writeback_write_end, + .write_end = ext4_write_end, .bmap = ext4_bmap, .invalidatepage = ext4_invalidatepage, .releasepage = ext4_releasepage, @@ -3057,11 +3294,12 @@ static const struct address_space_operations ext4_journalled_aops = { .readpage = ext4_readpage, .readpages = ext4_readpages, .writepage = ext4_writepage, + .writepages = ext4_writepages, .write_begin = ext4_write_begin, .write_end = ext4_journalled_write_end, .set_page_dirty = ext4_journalled_set_page_dirty, .bmap = ext4_bmap, - .invalidatepage = ext4_invalidatepage, + .invalidatepage = ext4_journalled_invalidatepage, .releasepage = ext4_releasepage, .direct_IO = ext4_direct_IO, .is_partially_uptodate = block_is_partially_uptodate, @@ -3072,7 +3310,7 @@ static const struct address_space_operations ext4_da_aops = { .readpage = ext4_readpage, .readpages = ext4_readpages, .writepage = ext4_writepage, - .writepages = ext4_da_writepages, + .writepages = ext4_writepages, .write_begin = ext4_da_write_begin, .write_end = ext4_da_write_end, .bmap = ext4_bmap, @@ -3086,34 +3324,41 @@ static const struct address_space_operations ext4_da_aops = { void ext4_set_aops(struct inode *inode) { - if (ext4_should_order_data(inode) && - test_opt(inode->i_sb, DELALLOC)) - inode->i_mapping->a_ops = &ext4_da_aops; - else if (ext4_should_order_data(inode)) - inode->i_mapping->a_ops = &ext4_ordered_aops; - else if (ext4_should_writeback_data(inode) && - test_opt(inode->i_sb, DELALLOC)) + switch (ext4_inode_journal_mode(inode)) { + case EXT4_INODE_ORDERED_DATA_MODE: + ext4_set_inode_state(inode, EXT4_STATE_ORDERED_MODE); + break; + case EXT4_INODE_WRITEBACK_DATA_MODE: + ext4_clear_inode_state(inode, EXT4_STATE_ORDERED_MODE); + break; + case EXT4_INODE_JOURNAL_DATA_MODE: + inode->i_mapping->a_ops = &ext4_journalled_aops; + return; + default: + BUG(); + } + if (test_opt(inode->i_sb, DELALLOC)) inode->i_mapping->a_ops = &ext4_da_aops; - else if (ext4_should_writeback_data(inode)) - inode->i_mapping->a_ops = &ext4_writeback_aops; else - inode->i_mapping->a_ops = &ext4_journalled_aops; + inode->i_mapping->a_ops = &ext4_aops; } - /* - * ext4_discard_partial_page_buffers() - * Wrapper function for ext4_discard_partial_page_buffers_no_lock. - * This function finds and locks the page containing the offset - * "from" and passes it to ext4_discard_partial_page_buffers_no_lock. - * Calling functions that already have the page locked should call - * ext4_discard_partial_page_buffers_no_lock directly. + * ext4_block_zero_page_range() zeros out a mapping of length 'length' + * starting from file offset 'from'. The range to be zero'd must + * be contained with in one block. If the specified range exceeds + * the end of the block it will be shortened to end of the block + * that cooresponds to 'from' */ -int ext4_discard_partial_page_buffers(handle_t *handle, - struct address_space *mapping, loff_t from, - loff_t length, int flags) +static int ext4_block_zero_page_range(handle_t *handle, + struct address_space *mapping, loff_t from, loff_t length) { + ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT; + unsigned offset = from & (PAGE_CACHE_SIZE-1); + unsigned blocksize, max, pos; + ext4_lblk_t iblock; struct inode *inode = mapping->host; + struct buffer_head *bh; struct page *page; int err = 0; @@ -3122,67 +3367,12 @@ int ext4_discard_partial_page_buffers(handle_t *handle, if (!page) return -ENOMEM; - err = ext4_discard_partial_page_buffers_no_lock(handle, inode, page, - from, length, flags); - - unlock_page(page); - page_cache_release(page); - return err; -} - -/* - * ext4_discard_partial_page_buffers_no_lock() - * Zeros a page range of length 'length' starting from offset 'from'. - * Buffer heads that correspond to the block aligned regions of the - * zeroed range will be unmapped. Unblock aligned regions - * will have the corresponding buffer head mapped if needed so that - * that region of the page can be updated with the partial zero out. - * - * This function assumes that the page has already been locked. The - * The range to be discarded must be contained with in the given page. - * If the specified range exceeds the end of the page it will be shortened - * to the end of the page that corresponds to 'from'. This function is - * appropriate for updating a page and it buffer heads to be unmapped and - * zeroed for blocks that have been either released, or are going to be - * released. - * - * handle: The journal handle - * inode: The files inode - * page: A locked page that contains the offset "from" - * from: The starting byte offset (from the begining of the file) - * to begin discarding - * len: The length of bytes to discard - * flags: Optional flags that may be used: - * - * EXT4_DISCARD_PARTIAL_PG_ZERO_UNMAPPED - * Only zero the regions of the page whose buffer heads - * have already been unmapped. This flag is appropriate - * for updateing the contents of a page whose blocks may - * have already been released, and we only want to zero - * out the regions that correspond to those released blocks. - * - * Returns zero on sucess or negative on failure. - */ -static int ext4_discard_partial_page_buffers_no_lock(handle_t *handle, - struct inode *inode, struct page *page, loff_t from, - loff_t length, int flags) -{ - ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT; - unsigned int offset = from & (PAGE_CACHE_SIZE-1); - unsigned int blocksize, max, pos; - ext4_lblk_t iblock; - struct buffer_head *bh; - int err = 0; - blocksize = inode->i_sb->s_blocksize; - max = PAGE_CACHE_SIZE - offset; - - if (index != page->index) - return -EINVAL; + max = blocksize - (offset & (blocksize - 1)); /* * correct length if it does not fall between - * 'from' and the end of the page + * 'from' and the end of the block */ if (length > max || length < 0) length = max; @@ -3200,106 +3390,111 @@ static int ext4_discard_partial_page_buffers_no_lock(handle_t *handle, iblock++; pos += blocksize; } + if (buffer_freed(bh)) { + BUFFER_TRACE(bh, "freed: skip"); + goto unlock; + } + if (!buffer_mapped(bh)) { + BUFFER_TRACE(bh, "unmapped"); + ext4_get_block(inode, iblock, bh, 0); + /* unmapped? It's a hole - nothing to do */ + if (!buffer_mapped(bh)) { + BUFFER_TRACE(bh, "still unmapped"); + goto unlock; + } + } - pos = offset; - while (pos < offset + length) { - unsigned int end_of_block, range_to_discard; - - err = 0; - - /* The length of space left to zero and unmap */ - range_to_discard = offset + length - pos; + /* Ok, it's mapped. Make sure it's up-to-date */ + if (PageUptodate(page)) + set_buffer_uptodate(bh); - /* The length of space until the end of the block */ - end_of_block = blocksize - (pos & (blocksize-1)); + if (!buffer_uptodate(bh)) { + err = -EIO; + ll_rw_block(READ, 1, &bh); + wait_on_buffer(bh); + /* Uhhuh. Read error. Complain and punt. */ + if (!buffer_uptodate(bh)) + goto unlock; + } + if (ext4_should_journal_data(inode)) { + BUFFER_TRACE(bh, "get write access"); + err = ext4_journal_get_write_access(handle, bh); + if (err) + goto unlock; + } + zero_user(page, offset, length); + BUFFER_TRACE(bh, "zeroed end of block"); - /* - * Do not unmap or zero past end of block - * for this buffer head - */ - if (range_to_discard > end_of_block) - range_to_discard = end_of_block; + if (ext4_should_journal_data(inode)) { + err = ext4_handle_dirty_metadata(handle, inode, bh); + } else { + err = 0; + mark_buffer_dirty(bh); + if (ext4_test_inode_state(inode, EXT4_STATE_ORDERED_MODE)) + err = ext4_jbd2_file_inode(handle, inode); + } +unlock: + unlock_page(page); + page_cache_release(page); + return err; +} - /* - * Skip this buffer head if we are only zeroing unampped - * regions of the page - */ - if (flags & EXT4_DISCARD_PARTIAL_PG_ZERO_UNMAPPED && - buffer_mapped(bh)) - goto next; - - /* If the range is block aligned, unmap */ - if (range_to_discard == blocksize) { - clear_buffer_dirty(bh); - bh->b_bdev = NULL; - clear_buffer_mapped(bh); - clear_buffer_req(bh); - clear_buffer_new(bh); - clear_buffer_delay(bh); - clear_buffer_unwritten(bh); - clear_buffer_uptodate(bh); - zero_user(page, pos, range_to_discard); - BUFFER_TRACE(bh, "Buffer discarded"); - goto next; - } +/* + * ext4_block_truncate_page() zeroes out a mapping from file offset `from' + * up to the end of the block which corresponds to `from'. + * This required during truncate. We need to physically zero the tail end + * of that block so it doesn't yield old data if the file is later grown. + */ +static int ext4_block_truncate_page(handle_t *handle, + struct address_space *mapping, loff_t from) +{ + unsigned offset = from & (PAGE_CACHE_SIZE-1); + unsigned length; + unsigned blocksize; + struct inode *inode = mapping->host; - /* - * If this block is not completely contained in the range - * to be discarded, then it is not going to be released. Because - * we need to keep this block, we need to make sure this part - * of the page is uptodate before we modify it by writeing - * partial zeros on it. - */ - if (!buffer_mapped(bh)) { - /* - * Buffer head must be mapped before we can read - * from the block - */ - BUFFER_TRACE(bh, "unmapped"); - ext4_get_block(inode, iblock, bh, 0); - /* unmapped? It's a hole - nothing to do */ - if (!buffer_mapped(bh)) { - BUFFER_TRACE(bh, "still unmapped"); - goto next; - } - } + blocksize = inode->i_sb->s_blocksize; + length = blocksize - (offset & (blocksize - 1)); - /* Ok, it's mapped. Make sure it's up-to-date */ - if (PageUptodate(page)) - set_buffer_uptodate(bh); + return ext4_block_zero_page_range(handle, mapping, from, length); +} - if (!buffer_uptodate(bh)) { - err = -EIO; - ll_rw_block(READ, 1, &bh); - wait_on_buffer(bh); - /* Uhhuh. Read error. Complain and punt.*/ - if (!buffer_uptodate(bh)) - goto next; - } +int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode, + loff_t lstart, loff_t length) +{ + struct super_block *sb = inode->i_sb; + struct address_space *mapping = inode->i_mapping; + unsigned partial_start, partial_end; + ext4_fsblk_t start, end; + loff_t byte_end = (lstart + length - 1); + int err = 0; - if (ext4_should_journal_data(inode)) { - BUFFER_TRACE(bh, "get write access"); - err = ext4_journal_get_write_access(handle, bh); - if (err) - goto next; - } + partial_start = lstart & (sb->s_blocksize - 1); + partial_end = byte_end & (sb->s_blocksize - 1); - zero_user(page, pos, range_to_discard); + start = lstart >> sb->s_blocksize_bits; + end = byte_end >> sb->s_blocksize_bits; - err = 0; - if (ext4_should_journal_data(inode)) { - err = ext4_handle_dirty_metadata(handle, inode, bh); - } else - mark_buffer_dirty(bh); - - BUFFER_TRACE(bh, "Partial buffer zeroed"); -next: - bh = bh->b_this_page; - iblock++; - pos += range_to_discard; + /* Handle partial zero within the single block */ + if (start == end && + (partial_start || (partial_end != sb->s_blocksize - 1))) { + err = ext4_block_zero_page_range(handle, mapping, + lstart, length); + return err; } - + /* Handle partial zero out on the start of the range */ + if (partial_start) { + err = ext4_block_zero_page_range(handle, mapping, + lstart, sb->s_blocksize); + if (err) + return err; + } + /* Handle partial zero out on the end of the range */ + if (partial_end != sb->s_blocksize - 1) + err = ext4_block_zero_page_range(handle, mapping, + byte_end - partial_end, + partial_end + 1); return err; } @@ -3322,26 +3517,159 @@ int ext4_can_truncate(struct inode *inode) * @offset: The offset where the hole will begin * @len: The length of the hole * - * Returns: 0 on sucess or negative on failure + * Returns: 0 on success or negative on failure */ -int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) +int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length) { - struct inode *inode = file->f_path.dentry->d_inode; + struct super_block *sb = inode->i_sb; + ext4_lblk_t first_block, stop_block; + struct address_space *mapping = inode->i_mapping; + loff_t first_block_offset, last_block_offset; + handle_t *handle; + unsigned int credits; + int ret = 0; + if (!S_ISREG(inode->i_mode)) - return -ENOTSUPP; + return -EOPNOTSUPP; + + trace_ext4_punch_hole(inode, offset, length, 0); + + /* + * Write out all dirty pages to avoid race conditions + * Then release them. + */ + if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { + ret = filemap_write_and_wait_range(mapping, offset, + offset + length - 1); + if (ret) + return ret; + } - if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { - /* TODO: Add support for non extent hole punching */ - return -ENOTSUPP; + mutex_lock(&inode->i_mutex); + + /* No need to punch hole beyond i_size */ + if (offset >= inode->i_size) + goto out_mutex; + + /* + * If the hole extends beyond i_size, set the hole + * to end after the page that contains i_size + */ + if (offset + length > inode->i_size) { + length = inode->i_size + + PAGE_CACHE_SIZE - (inode->i_size & (PAGE_CACHE_SIZE - 1)) - + offset; } - if (EXT4_SB(inode->i_sb)->s_cluster_ratio > 1) { - /* TODO: Add support for bigalloc file systems */ - return -ENOTSUPP; + if (offset & (sb->s_blocksize - 1) || + (offset + length) & (sb->s_blocksize - 1)) { + /* + * Attach jinode to inode for jbd2 if we do any zeroing of + * partial block + */ + ret = ext4_inode_attach_jinode(inode); + if (ret < 0) + goto out_mutex; + } - return ext4_ext_punch_hole(file, offset, length); + first_block_offset = round_up(offset, sb->s_blocksize); + last_block_offset = round_down((offset + length), sb->s_blocksize) - 1; + + /* Now release the pages and zero block aligned part of pages*/ + if (last_block_offset > first_block_offset) + truncate_pagecache_range(inode, first_block_offset, + last_block_offset); + + /* Wait all existing dio workers, newcomers will block on i_mutex */ + ext4_inode_block_unlocked_dio(inode); + inode_dio_wait(inode); + + if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) + credits = ext4_writepage_trans_blocks(inode); + else + credits = ext4_blocks_for_truncate(inode); + handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + ext4_std_error(sb, ret); + goto out_dio; + } + + ret = ext4_zero_partial_blocks(handle, inode, offset, + length); + if (ret) + goto out_stop; + + first_block = (offset + sb->s_blocksize - 1) >> + EXT4_BLOCK_SIZE_BITS(sb); + stop_block = (offset + length) >> EXT4_BLOCK_SIZE_BITS(sb); + + /* If there are no blocks to remove, return now */ + if (first_block >= stop_block) + goto out_stop; + + down_write(&EXT4_I(inode)->i_data_sem); + ext4_discard_preallocations(inode); + + ret = ext4_es_remove_extent(inode, first_block, + stop_block - first_block); + if (ret) { + up_write(&EXT4_I(inode)->i_data_sem); + goto out_stop; + } + + if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) + ret = ext4_ext_remove_space(inode, first_block, + stop_block - 1); + else + ret = ext4_free_hole_blocks(handle, inode, first_block, + stop_block); + + up_write(&EXT4_I(inode)->i_data_sem); + if (IS_SYNC(inode)) + ext4_handle_sync(handle); + + /* Now release the pages again to reduce race window */ + if (last_block_offset > first_block_offset) + truncate_pagecache_range(inode, first_block_offset, + last_block_offset); + + inode->i_mtime = inode->i_ctime = ext4_current_time(inode); + ext4_mark_inode_dirty(handle, inode); +out_stop: + ext4_journal_stop(handle); +out_dio: + ext4_inode_resume_unlocked_dio(inode); +out_mutex: + mutex_unlock(&inode->i_mutex); + return ret; +} + +int ext4_inode_attach_jinode(struct inode *inode) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + struct jbd2_inode *jinode; + + if (ei->jinode || !EXT4_SB(inode->i_sb)->s_journal) + return 0; + + jinode = jbd2_alloc_inode(GFP_KERNEL); + spin_lock(&inode->i_lock); + if (!ei->jinode) { + if (!jinode) { + spin_unlock(&inode->i_lock); + return -ENOMEM; + } + ei->jinode = jinode; + jbd2_journal_init_jbd_inode(ei->jinode, inode); + jinode = NULL; + } + spin_unlock(&inode->i_lock); + if (unlikely(jinode != NULL)) + jbd2_free_inode(jinode); + return 0; } /* @@ -3374,6 +3702,18 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) */ void ext4_truncate(struct inode *inode) { + struct ext4_inode_info *ei = EXT4_I(inode); + unsigned int credits; + handle_t *handle; + struct address_space *mapping = inode->i_mapping; + + /* + * There is a possibility that we're either freeing the inode + * or it's a completely new inode. In those cases we might not + * have i_mutex locked because it's not necessary. + */ + if (!(inode->i_state & (I_NEW|I_FREEING))) + WARN_ON(!mutex_is_locked(&inode->i_mutex)); trace_ext4_truncate_enter(inode); if (!ext4_can_truncate(inode)) @@ -3384,10 +3724,74 @@ void ext4_truncate(struct inode *inode) if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC)) ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE); + if (ext4_has_inline_data(inode)) { + int has_inline = 1; + + ext4_inline_data_truncate(inode, &has_inline); + if (has_inline) + return; + } + + /* If we zero-out tail of the page, we have to create jinode for jbd2 */ + if (inode->i_size & (inode->i_sb->s_blocksize - 1)) { + if (ext4_inode_attach_jinode(inode) < 0) + return; + } + + if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) + credits = ext4_writepage_trans_blocks(inode); + else + credits = ext4_blocks_for_truncate(inode); + + handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits); + if (IS_ERR(handle)) { + ext4_std_error(inode->i_sb, PTR_ERR(handle)); + return; + } + + if (inode->i_size & (inode->i_sb->s_blocksize - 1)) + ext4_block_truncate_page(handle, mapping, inode->i_size); + + /* + * We add the inode to the orphan list, so that if this + * truncate spans multiple transactions, and we crash, we will + * resume the truncate when the filesystem recovers. It also + * marks the inode dirty, to catch the new size. + * + * Implication: the file must always be in a sane, consistent + * truncatable state while each transaction commits. + */ + if (ext4_orphan_add(handle, inode)) + goto out_stop; + + down_write(&EXT4_I(inode)->i_data_sem); + + ext4_discard_preallocations(inode); + if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) - ext4_ext_truncate(inode); + ext4_ext_truncate(handle, inode); else - ext4_ind_truncate(inode); + ext4_ind_truncate(handle, inode); + + up_write(&ei->i_data_sem); + + if (IS_SYNC(inode)) + ext4_handle_sync(handle); + +out_stop: + /* + * If this was a simple ftruncate() and the file will remain alive, + * then we need to clear up the orphan record which we created above. + * However, if this was a real unlink then we were called by + * ext4_delete_inode(), and we allow that function to clean up the + * orphan info for us. + */ + if (inode->i_nlink) + ext4_orphan_del(handle, inode); + + inode->i_mtime = inode->i_ctime = ext4_current_time(inode); + ext4_mark_inode_dirty(handle, inode); + ext4_journal_stop(handle); trace_ext4_truncate_exit(inode); } @@ -3426,11 +3830,8 @@ static int __ext4_get_inode_loc(struct inode *inode, iloc->offset = (inode_offset % inodes_per_block) * EXT4_INODE_SIZE(sb); bh = sb_getblk(sb, block); - if (!bh) { - EXT4_ERROR_INODE_BLOCK(inode, block, - "unable to read itable block"); - return -EIO; - } + if (unlikely(!bh)) + return -ENOMEM; if (!buffer_uptodate(bh)) { lock_buffer(bh); @@ -3462,7 +3863,7 @@ static int __ext4_get_inode_loc(struct inode *inode, /* Is the inode bitmap in cache? */ bitmap_bh = sb_getblk(sb, ext4_inode_bitmap(sb, gdp)); - if (!bitmap_bh) + if (unlikely(!bitmap_bh)) goto make_io; /* @@ -3498,16 +3899,16 @@ make_io: if (EXT4_SB(sb)->s_inode_readahead_blks) { ext4_fsblk_t b, end, table; unsigned num; + __u32 ra_blks = EXT4_SB(sb)->s_inode_readahead_blks; table = ext4_inode_table(sb, gdp); /* s_inode_readahead_blks is always a power of 2 */ - b = block & ~(EXT4_SB(sb)->s_inode_readahead_blks-1); + b = block & ~((ext4_fsblk_t) ra_blks - 1); if (table > b) b = table; - end = b + EXT4_SB(sb)->s_inode_readahead_blks; + end = b + ra_blks; num = EXT4_INODES_PER_GROUP(sb); - if (EXT4_HAS_RO_COMPAT_FEATURE(sb, - EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) + if (ext4_has_group_desc_csum(sb)) num -= ext4_itable_unused_count(sb, gdp); table += num / inodes_per_block; if (end > table) @@ -3548,18 +3949,20 @@ int ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc) void ext4_set_inode_flags(struct inode *inode) { unsigned int flags = EXT4_I(inode)->i_flags; + unsigned int new_fl = 0; - inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC); if (flags & EXT4_SYNC_FL) - inode->i_flags |= S_SYNC; + new_fl |= S_SYNC; if (flags & EXT4_APPEND_FL) - inode->i_flags |= S_APPEND; + new_fl |= S_APPEND; if (flags & EXT4_IMMUTABLE_FL) - inode->i_flags |= S_IMMUTABLE; + new_fl |= S_IMMUTABLE; if (flags & EXT4_NOATIME_FL) - inode->i_flags |= S_NOATIME; + new_fl |= S_NOATIME; if (flags & EXT4_DIRSYNC_FL) - inode->i_flags |= S_DIRSYNC; + new_fl |= S_DIRSYNC; + inode_set_flags(inode, new_fl, + S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC); } /* Propagate flags from i_flags to EXT4_I(inode)->i_flags */ @@ -3610,6 +4013,19 @@ static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode, } } +static inline void ext4_iget_extra_inode(struct inode *inode, + struct ext4_inode *raw_inode, + struct ext4_inode_info *ei) +{ + __le32 *magic = (void *)raw_inode + + EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize; + if (*magic == cpu_to_le32(EXT4_XATTR_MAGIC)) { + ext4_set_inode_state(inode, EXT4_STATE_XATTR); + ext4_find_inline_data_nolock(inode); + } else + EXT4_I(inode)->i_inline_off = 0; +} + struct inode *ext4_iget(struct super_block *sb, unsigned long ino) { struct ext4_iloc iloc; @@ -3619,6 +4035,8 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) journal_t *journal = EXT4_SB(sb)->s_journal; long ret; int block; + uid_t i_uid; + gid_t i_gid; inode = iget_locked(sb, ino); if (!inode) @@ -3633,16 +4051,52 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) if (ret < 0) goto bad_inode; raw_inode = ext4_raw_inode(&iloc); + + if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { + ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize); + if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize > + EXT4_INODE_SIZE(inode->i_sb)) { + EXT4_ERROR_INODE(inode, "bad extra_isize (%u != %u)", + EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize, + EXT4_INODE_SIZE(inode->i_sb)); + ret = -EIO; + goto bad_inode; + } + } else + ei->i_extra_isize = 0; + + /* Precompute checksum seed for inode metadata */ + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) { + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + __u32 csum; + __le32 inum = cpu_to_le32(inode->i_ino); + __le32 gen = raw_inode->i_generation; + csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&inum, + sizeof(inum)); + ei->i_csum_seed = ext4_chksum(sbi, csum, (__u8 *)&gen, + sizeof(gen)); + } + + if (!ext4_inode_csum_verify(inode, raw_inode, ei)) { + EXT4_ERROR_INODE(inode, "checksum invalid"); + ret = -EIO; + goto bad_inode; + } + inode->i_mode = le16_to_cpu(raw_inode->i_mode); - inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low); - inode->i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low); + i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low); + i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low); if (!(test_opt(inode->i_sb, NO_UID32))) { - inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16; - inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16; + i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16; + i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16; } + i_uid_write(inode, i_uid); + i_gid_write(inode, i_gid); set_nlink(inode, le16_to_cpu(raw_inode->i_links_count)); ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */ + ei->i_inline_off = 0; ei->i_dir_start_lookup = 0; ei->i_dtime = le32_to_cpu(raw_inode->i_dtime); /* We now have enough fields to check if the inode was active or not. @@ -3651,8 +4105,9 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) * NeilBrown 1999oct15 */ if (inode->i_nlink == 0) { - if (inode->i_mode == 0 || - !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) { + if ((inode->i_mode == 0 || + !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) && + ino != EXT4_BOOT_LOADER_INO) { /* this inode is deleted */ ret = -ESTALE; goto bad_inode; @@ -3660,7 +4115,9 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) /* The only unlinked inodes we let through here have * valid i_mode and are being read by the orphan * recovery code: that's fine, we're about to complete - * the process of deleting those. */ + * the process of deleting those. + * OR it is the EXT4_BOOT_LOADER_INO which is + * not initialized on a new filesystem. */ } ei->i_flags = le32_to_cpu(raw_inode->i_flags); inode->i_blocks = ext4_inode_blocks(raw_inode, ei); @@ -3710,36 +4167,27 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) } if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { - ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize); - if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize > - EXT4_INODE_SIZE(inode->i_sb)) { - ret = -EIO; - goto bad_inode; - } if (ei->i_extra_isize == 0) { /* The extra space is currently unused. Use it. */ ei->i_extra_isize = sizeof(struct ext4_inode) - EXT4_GOOD_OLD_INODE_SIZE; } else { - __le32 *magic = (void *)raw_inode + - EXT4_GOOD_OLD_INODE_SIZE + - ei->i_extra_isize; - if (*magic == cpu_to_le32(EXT4_XATTR_MAGIC)) - ext4_set_inode_state(inode, EXT4_STATE_XATTR); + ext4_iget_extra_inode(inode, raw_inode, ei); } - } else - ei->i_extra_isize = 0; + } EXT4_INODE_GET_XTIME(i_ctime, inode, raw_inode); EXT4_INODE_GET_XTIME(i_mtime, inode, raw_inode); EXT4_INODE_GET_XTIME(i_atime, inode, raw_inode); EXT4_EINODE_GET_XTIME(i_crtime, ei, raw_inode); - inode->i_version = le32_to_cpu(raw_inode->i_disk_version); - if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { - if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi)) - inode->i_version |= - (__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32; + if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) { + inode->i_version = le32_to_cpu(raw_inode->i_disk_version); + if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { + if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi)) + inode->i_version |= + (__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32; + } } ret = 0; @@ -3749,17 +4197,19 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) ei->i_file_acl); ret = -EIO; goto bad_inode; - } else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { - if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || - (S_ISLNK(inode->i_mode) && - !ext4_inode_is_fast_symlink(inode))) - /* Validate extent which is part of inode */ - ret = ext4_ext_check_inode(inode); - } else if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || - (S_ISLNK(inode->i_mode) && - !ext4_inode_is_fast_symlink(inode))) { - /* Validate block references which are part of inode */ - ret = ext4_ind_check_inode(inode); + } else if (!ext4_has_inline_data(inode)) { + if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { + if ((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || + (S_ISLNK(inode->i_mode) && + !ext4_inode_is_fast_symlink(inode)))) + /* Validate extent which is part of inode */ + ret = ext4_ext_check_inode(inode); + } else if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || + (S_ISLNK(inode->i_mode) && + !ext4_inode_is_fast_symlink(inode))) { + /* Validate block references which are part of inode */ + ret = ext4_ind_check_inode(inode); + } } if (ret) goto bad_inode; @@ -3789,6 +4239,8 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) else init_special_inode(inode, inode->i_mode, new_decode_dev(le32_to_cpu(raw_inode->i_block[1]))); + } else if (ino == EXT4_BOOT_LOADER_INO) { + make_bad_inode(inode); } else { ret = -EIO; EXT4_ERROR_INODE(inode, "bogus i_mode (%o)", inode->i_mode); @@ -3815,7 +4267,7 @@ static int ext4_inode_blocks_set(handle_t *handle, if (i_blocks <= ~0U) { /* - * i_blocks can be represnted in a 32 bit variable + * i_blocks can be represented in a 32 bit variable * as multiple of 512 bytes */ raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); @@ -3858,36 +4310,42 @@ static int ext4_do_update_inode(handle_t *handle, struct ext4_inode *raw_inode = ext4_raw_inode(iloc); struct ext4_inode_info *ei = EXT4_I(inode); struct buffer_head *bh = iloc->bh; + struct super_block *sb = inode->i_sb; int err = 0, rc, block; + int need_datasync = 0, set_large_file = 0; + uid_t i_uid; + gid_t i_gid; + + spin_lock(&ei->i_raw_lock); - /* For fields not not tracking in the in-memory inode, + /* For fields not tracked in the in-memory inode, * initialise them to zero for new inodes. */ if (ext4_test_inode_state(inode, EXT4_STATE_NEW)) memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size); ext4_get_inode_flags(ei); raw_inode->i_mode = cpu_to_le16(inode->i_mode); + i_uid = i_uid_read(inode); + i_gid = i_gid_read(inode); if (!(test_opt(inode->i_sb, NO_UID32))) { - raw_inode->i_uid_low = cpu_to_le16(low_16_bits(inode->i_uid)); - raw_inode->i_gid_low = cpu_to_le16(low_16_bits(inode->i_gid)); + raw_inode->i_uid_low = cpu_to_le16(low_16_bits(i_uid)); + raw_inode->i_gid_low = cpu_to_le16(low_16_bits(i_gid)); /* * Fix up interoperability with old kernels. Otherwise, old inodes get * re-used with the upper 16 bits of the uid/gid intact */ if (!ei->i_dtime) { raw_inode->i_uid_high = - cpu_to_le16(high_16_bits(inode->i_uid)); + cpu_to_le16(high_16_bits(i_uid)); raw_inode->i_gid_high = - cpu_to_le16(high_16_bits(inode->i_gid)); + cpu_to_le16(high_16_bits(i_gid)); } else { raw_inode->i_uid_high = 0; raw_inode->i_gid_high = 0; } } else { - raw_inode->i_uid_low = - cpu_to_le16(fs_high2lowuid(inode->i_uid)); - raw_inode->i_gid_low = - cpu_to_le16(fs_high2lowgid(inode->i_gid)); + raw_inode->i_uid_low = cpu_to_le16(fs_high2lowuid(i_uid)); + raw_inode->i_gid_low = cpu_to_le16(fs_high2lowgid(i_gid)); raw_inode->i_uid_high = 0; raw_inode->i_gid_high = 0; } @@ -3898,37 +4356,26 @@ static int ext4_do_update_inode(handle_t *handle, EXT4_INODE_SET_XTIME(i_atime, inode, raw_inode); EXT4_EINODE_SET_XTIME(i_crtime, ei, raw_inode); - if (ext4_inode_blocks_set(handle, raw_inode, ei)) + if (ext4_inode_blocks_set(handle, raw_inode, ei)) { + spin_unlock(&ei->i_raw_lock); goto out_brelse; + } raw_inode->i_dtime = cpu_to_le32(ei->i_dtime); raw_inode->i_flags = cpu_to_le32(ei->i_flags & 0xFFFFFFFF); - if (EXT4_SB(inode->i_sb)->s_es->s_creator_os != - cpu_to_le32(EXT4_OS_HURD)) + if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) raw_inode->i_file_acl_high = cpu_to_le16(ei->i_file_acl >> 32); raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl); - ext4_isize_set(raw_inode, ei->i_disksize); + if (ei->i_disksize != ext4_isize(raw_inode)) { + ext4_isize_set(raw_inode, ei->i_disksize); + need_datasync = 1; + } if (ei->i_disksize > 0x7fffffffULL) { - struct super_block *sb = inode->i_sb; if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_LARGE_FILE) || EXT4_SB(sb)->s_es->s_rev_level == - cpu_to_le32(EXT4_GOOD_OLD_REV)) { - /* If this is the first large file - * created, add a flag to the superblock. - */ - err = ext4_journal_get_write_access(handle, - EXT4_SB(sb)->s_sbh); - if (err) - goto out_brelse; - ext4_update_dynamic_rev(sb); - EXT4_SET_RO_COMPAT_FEATURE(sb, - EXT4_FEATURE_RO_COMPAT_LARGE_FILE); - sb->s_dirt = 1; - ext4_handle_sync(handle); - err = ext4_handle_dirty_metadata(handle, NULL, - EXT4_SB(sb)->s_sbh); - } + cpu_to_le32(EXT4_GOOD_OLD_REV)) + set_large_file = 1; } raw_inode->i_generation = cpu_to_le32(inode->i_generation); if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) { @@ -3942,25 +4389,43 @@ static int ext4_do_update_inode(handle_t *handle, cpu_to_le32(new_encode_dev(inode->i_rdev)); raw_inode->i_block[2] = 0; } - } else + } else if (!ext4_has_inline_data(inode)) { for (block = 0; block < EXT4_N_BLOCKS; block++) raw_inode->i_block[block] = ei->i_data[block]; + } - raw_inode->i_disk_version = cpu_to_le32(inode->i_version); - if (ei->i_extra_isize) { - if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi)) - raw_inode->i_version_hi = - cpu_to_le32(inode->i_version >> 32); - raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize); + if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) { + raw_inode->i_disk_version = cpu_to_le32(inode->i_version); + if (ei->i_extra_isize) { + if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi)) + raw_inode->i_version_hi = + cpu_to_le32(inode->i_version >> 32); + raw_inode->i_extra_isize = + cpu_to_le16(ei->i_extra_isize); + } } + ext4_inode_csum_set(inode, raw_inode, ei); + + spin_unlock(&ei->i_raw_lock); + BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); rc = ext4_handle_dirty_metadata(handle, NULL, bh); if (!err) err = rc; ext4_clear_inode_state(inode, EXT4_STATE_NEW); - - ext4_update_inode_fsync_trans(handle, inode, 0); + if (set_large_file) { + BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get write access"); + err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh); + if (err) + goto out_brelse; + ext4_update_dynamic_rev(sb); + EXT4_SET_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_LARGE_FILE); + ext4_handle_sync(handle); + err = ext4_handle_dirty_super(handle, sb); + } + ext4_update_inode_fsync_trans(handle, inode, need_datasync); out_brelse: brelse(bh); ext4_std_error(inode->i_sb, err); @@ -3972,21 +4437,20 @@ out_brelse: * * We are called from a few places: * - * - Within generic_file_write() for O_SYNC files. + * - Within generic_file_aio_write() -> generic_write_sync() for O_SYNC files. * Here, there will be no transaction running. We wait for any running - * trasnaction to commit. + * transaction to commit. * - * - Within sys_sync(), kupdate and such. - * We wait on commit, if tol to. + * - Within flush work (sys_sync(), kupdate and such). + * We wait on commit, if told to. * - * - Within prune_icache() (PF_MEMALLOC == true) - * Here we simply return. We can't afford to block kswapd on the - * journal commit. + * - Within iput_final() -> write_inode_now() + * We wait on commit, if told to. * * In all cases it is actually safe for us to return without doing anything, * because the inode has been copied into a raw inode buffer in - * ext4_mark_inode_dirty(). This is a correctness thing for O_SYNC and for - * knfsd. + * ext4_mark_inode_dirty(). This is a correctness thing for WB_SYNC_ALL + * writeback. * * Note that we are absolutely dependent upon all inode dirtiers doing the * right thing: they *must* call mark_inode_dirty() after dirtying info in @@ -3998,15 +4462,15 @@ out_brelse: * stuff(); * inode->i_size = expr; * - * is in error because a kswapd-driven write_inode() could occur while - * `stuff()' is running, and the new i_size will be lost. Plus the inode - * will no longer be on the superblock's dirty inode list. + * is in error because write_inode() could occur while `stuff()' is running, + * and the new i_size will be lost. Plus the inode will no longer be on the + * superblock's dirty inode list. */ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc) { int err; - if (current->flags & PF_MEMALLOC) + if (WARN_ON_ONCE(current->flags & PF_MEMALLOC)) return 0; if (EXT4_SB(inode->i_sb)->s_journal) { @@ -4016,7 +4480,12 @@ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc) return -EIO; } - if (wbc->sync_mode != WB_SYNC_ALL) + /* + * No need to force transaction in WB_SYNC_NONE mode. Also + * ext4_sync_fs() will force the commit after everything is + * written. + */ + if (wbc->sync_mode != WB_SYNC_ALL || wbc->for_sync) return 0; err = ext4_force_commit(inode->i_sb); @@ -4026,7 +4495,11 @@ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc) err = __ext4_get_inode_loc(inode, &iloc, 0); if (err) return err; - if (wbc->sync_mode == WB_SYNC_ALL) + /* + * sync(2) will flush the whole buffer cache. No need to do + * it here separately for each inode. + */ + if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync) sync_dirty_buffer(iloc.bh); if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) { EXT4_ERROR_INODE_BLOCK(inode, iloc.bh->b_blocknr, @@ -4039,6 +4512,48 @@ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc) } /* + * In data=journal mode ext4_journalled_invalidatepage() may fail to invalidate + * buffers that are attached to a page stradding i_size and are undergoing + * commit. In that case we have to wait for commit to finish and try again. + */ +static void ext4_wait_for_tail_page_commit(struct inode *inode) +{ + struct page *page; + unsigned offset; + journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; + tid_t commit_tid = 0; + int ret; + + offset = inode->i_size & (PAGE_CACHE_SIZE - 1); + /* + * All buffers in the last page remain valid? Then there's nothing to + * do. We do the check mainly to optimize the common PAGE_CACHE_SIZE == + * blocksize case + */ + if (offset > PAGE_CACHE_SIZE - (1 << inode->i_blkbits)) + return; + while (1) { + page = find_lock_page(inode->i_mapping, + inode->i_size >> PAGE_CACHE_SHIFT); + if (!page) + return; + ret = __ext4_journalled_invalidatepage(page, offset, + PAGE_CACHE_SIZE - offset); + unlock_page(page); + page_cache_release(page); + if (ret != -EBUSY) + return; + commit_tid = 0; + read_lock(&journal->j_state_lock); + if (journal->j_committing_transaction) + commit_tid = journal->j_committing_transaction->t_tid; + read_unlock(&journal->j_state_lock); + if (commit_tid) + jbd2_log_wait_commit(journal, commit_tid); + } +} + +/* * ext4_setattr() * * Called from notify_change. @@ -4075,14 +4590,15 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) if (is_quota_modification(inode, attr)) dquot_initialize(inode); - if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) || - (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) { + if ((ia_valid & ATTR_UID && !uid_eq(attr->ia_uid, inode->i_uid)) || + (ia_valid & ATTR_GID && !gid_eq(attr->ia_gid, inode->i_gid))) { handle_t *handle; /* (user+group)*(old+new) structure, inode write (sb, * inode block, ? - but truncate inode update has it) */ - handle = ext4_journal_start(inode, (EXT4_MAXQUOTAS_INIT_BLOCKS(inode->i_sb)+ - EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb))+3); + handle = ext4_journal_start(inode, EXT4_HT_QUOTA, + (EXT4_MAXQUOTAS_INIT_BLOCKS(inode->i_sb) + + EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb)) + 3); if (IS_ERR(handle)) { error = PTR_ERR(handle); goto err_out; @@ -4102,8 +4618,8 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) ext4_journal_stop(handle); } - if (attr->ia_valid & ATTR_SIZE) { - inode_dio_wait(inode); + if (attr->ia_valid & ATTR_SIZE && attr->ia_size != inode->i_size) { + handle_t *handle; if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); @@ -4111,53 +4627,73 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) if (attr->ia_size > sbi->s_bitmap_maxbytes) return -EFBIG; } - } - - if (S_ISREG(inode->i_mode) && - attr->ia_valid & ATTR_SIZE && - (attr->ia_size < inode->i_size)) { - handle_t *handle; - handle = ext4_journal_start(inode, 3); - if (IS_ERR(handle)) { - error = PTR_ERR(handle); - goto err_out; - } - if (ext4_handle_valid(handle)) { - error = ext4_orphan_add(handle, inode); - orphan = 1; - } - EXT4_I(inode)->i_disksize = attr->ia_size; - rc = ext4_mark_inode_dirty(handle, inode); - if (!error) - error = rc; - ext4_journal_stop(handle); + if (IS_I_VERSION(inode) && attr->ia_size != inode->i_size) + inode_inc_iversion(inode); - if (ext4_should_order_data(inode)) { - error = ext4_begin_ordered_truncate(inode, + if (S_ISREG(inode->i_mode) && + (attr->ia_size < inode->i_size)) { + if (ext4_should_order_data(inode)) { + error = ext4_begin_ordered_truncate(inode, attr->ia_size); - if (error) { - /* Do as much error cleanup as possible */ - handle = ext4_journal_start(inode, 3); - if (IS_ERR(handle)) { - ext4_orphan_del(NULL, inode); + if (error) goto err_out; - } - ext4_orphan_del(handle, inode); - orphan = 0; - ext4_journal_stop(handle); + } + handle = ext4_journal_start(inode, EXT4_HT_INODE, 3); + if (IS_ERR(handle)) { + error = PTR_ERR(handle); goto err_out; } - } - } + if (ext4_handle_valid(handle)) { + error = ext4_orphan_add(handle, inode); + orphan = 1; + } + down_write(&EXT4_I(inode)->i_data_sem); + EXT4_I(inode)->i_disksize = attr->ia_size; + rc = ext4_mark_inode_dirty(handle, inode); + if (!error) + error = rc; + /* + * We have to update i_size under i_data_sem together + * with i_disksize to avoid races with writeback code + * running ext4_wb_update_i_disksize(). + */ + if (!error) + i_size_write(inode, attr->ia_size); + up_write(&EXT4_I(inode)->i_data_sem); + ext4_journal_stop(handle); + if (error) { + ext4_orphan_del(NULL, inode); + goto err_out; + } + } else + i_size_write(inode, attr->ia_size); - if (attr->ia_valid & ATTR_SIZE) { - if (attr->ia_size != i_size_read(inode)) { - truncate_setsize(inode, attr->ia_size); - ext4_truncate(inode); - } else if (ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS)) - ext4_truncate(inode); + /* + * Blocks are going to be removed from the inode. Wait + * for dio in flight. Temporarily disable + * dioread_nolock to prevent livelock. + */ + if (orphan) { + if (!ext4_should_journal_data(inode)) { + ext4_inode_block_unlocked_dio(inode); + inode_dio_wait(inode); + ext4_inode_resume_unlocked_dio(inode); + } else + ext4_wait_for_tail_page_commit(inode); + } + /* + * Truncate pagecache after we've waited for commit + * in data=journal mode to make pages freeable. + */ + truncate_pagecache(inode, inode->i_size); } + /* + * We want to call ext4_truncate() even if attr->ia_size == + * inode->i_size for cases like truncation of fallocated space + */ + if (attr->ia_valid & ATTR_SIZE) + ext4_truncate(inode); if (!rc) { setattr_copy(inode, attr); @@ -4172,7 +4708,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) ext4_orphan_del(NULL, inode); if (!rc && (ia_valid & ATTR_MODE)) - rc = ext4_acl_chmod(inode); + rc = posix_acl_chmod(inode, inode->i_mode); err_out: ext4_std_error(inode->i_sb, error); @@ -4185,12 +4721,21 @@ int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) { struct inode *inode; - unsigned long delalloc_blocks; + unsigned long long delalloc_blocks; inode = dentry->d_inode; generic_fillattr(inode, stat); /* + * If there is inline data in the inode, the inode will normally not + * have data blocks allocated (it may have an external xattr block). + * Report at least one sector for such files, so tools like tar, rsync, + * others doen't incorrectly think the file is completely sparse. + */ + if (unlikely(ext4_has_inline_data(inode))) + stat->blocks += (stat->size + 511) >> 9; + + /* * We can't update i_blocks if the block allocation is delayed * otherwise in the case of system crash before the real block * allocation is done, we will have i_blocks inconsistent with @@ -4200,17 +4745,18 @@ int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry, * will return the blocks that include the delayed allocation * blocks for this file. */ - delalloc_blocks = EXT4_I(inode)->i_reserved_data_blocks; - - stat->blocks += (delalloc_blocks << inode->i_sb->s_blocksize_bits)>>9; + delalloc_blocks = EXT4_C2B(EXT4_SB(inode->i_sb), + EXT4_I(inode)->i_reserved_data_blocks); + stat->blocks += delalloc_blocks << (inode->i_sb->s_blocksize_bits - 9); return 0; } -static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk) +static int ext4_index_trans_blocks(struct inode *inode, int lblocks, + int pextents) { if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) - return ext4_ind_trans_blocks(inode, nrblocks, chunk); - return ext4_ext_index_trans_blocks(inode, nrblocks, chunk); + return ext4_ind_trans_blocks(inode, lblocks); + return ext4_ext_index_trans_blocks(inode, pextents); } /* @@ -4219,12 +4765,13 @@ static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk) * worse case, the indexs blocks spread over different block groups * * If datablocks are discontiguous, they are possible to spread over - * different block groups too. If they are contiuguous, with flexbg, + * different block groups too. If they are contiguous, with flexbg, * they could still across block group boundary. * * Also account for superblock, inode, quota and xattr blocks */ -static int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk) +static int ext4_meta_trans_blocks(struct inode *inode, int lblocks, + int pextents) { ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb); int gdpblocks; @@ -4232,14 +4779,10 @@ static int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk) int ret = 0; /* - * How many index blocks need to touch to modify nrblocks? - * The "Chunk" flag indicating whether the nrblocks is - * physically contiguous on disk - * - * For Direct IO and fallocate, they calls get_block to allocate - * one single extent at a time, so they could set the "Chunk" flag + * How many index blocks need to touch to map @lblocks logical blocks + * to @pextents physical extents? */ - idxblocks = ext4_index_trans_blocks(inode, nrblocks, chunk); + idxblocks = ext4_index_trans_blocks(inode, lblocks, pextents); ret = idxblocks; @@ -4247,12 +4790,7 @@ static int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk) * Now let's see how many group bitmaps and group descriptors need * to account */ - groups = idxblocks; - if (chunk) - groups += 1; - else - groups += nrblocks; - + groups = idxblocks + pextents; gdpblocks = groups; if (groups > ngroups) groups = ngroups; @@ -4283,7 +4821,7 @@ int ext4_writepage_trans_blocks(struct inode *inode) int bpp = ext4_journal_blocks_per_page(inode); int ret; - ret = ext4_meta_trans_blocks(inode, bpp, 0); + ret = ext4_meta_trans_blocks(inode, bpp, bpp); /* Account for data blocks for journalled mode */ if (ext4_should_journal_data(inode)) @@ -4314,7 +4852,7 @@ int ext4_mark_iloc_dirty(handle_t *handle, { int err = 0; - if (test_opt(inode->i_sb, I_VERSION)) + if (IS_I_VERSION(inode)) inode_inc_iversion(inode); /* the do_update_inode consumes one bh->b_count */ @@ -4395,14 +4933,6 @@ static int ext4_expand_extra_isize(struct inode *inode, * inode out, but prune_icache isn't a user-visible syncing function. * Whenever the user wants stuff synced (sys_sync, sys_msync, sys_fsync) * we start and wait on commits. - * - * Is this efficient/effective? Well, we're being nice to the system - * by cleaning up our inodes proactively so they can be reaped - * without I/O. But we are potentially leaving up to five seconds' - * worth of inodes floating about which prune_icache wants us to - * write out. One way to fix that would be to get prune_icache() - * to do a write_super() to free up some memory. It has the desired - * effect. */ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode) { @@ -4467,7 +4997,7 @@ void ext4_dirty_inode(struct inode *inode, int flags) { handle_t *handle; - handle = ext4_journal_start(inode, 2); + handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); if (IS_ERR(handle)) goto out; @@ -4541,6 +5071,10 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val) return err; } + /* Wait for all existing dio workers */ + ext4_inode_block_unlocked_dio(inode); + inode_dio_wait(inode); + jbd2_journal_lock_updates(journal); /* @@ -4560,10 +5094,11 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val) ext4_set_aops(inode); jbd2_journal_unlock_updates(journal); + ext4_inode_resume_unlocked_dio(inode); /* Finally we can mark the inode as dirty. */ - handle = ext4_journal_start(inode, 1); + handle = ext4_journal_start(inode, EXT4_HT_INODE, 1); if (IS_ERR(handle)) return PTR_ERR(handle); @@ -4587,17 +5122,14 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) unsigned long len; int ret; struct file *file = vma->vm_file; - struct inode *inode = file->f_path.dentry->d_inode; + struct inode *inode = file_inode(file); struct address_space *mapping = inode->i_mapping; handle_t *handle; get_block_t *get_block; int retries = 0; - /* - * This check is racy but catches the common case. We rely on - * __block_page_mkwrite() to do a reliable check. - */ - vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); + sb_start_pagefault(inode->i_sb); + file_update_time(vma->vm_file); /* Delalloc case is easy... */ if (test_opt(inode->i_sb, DELALLOC) && !ext4_should_journal_data(inode) && @@ -4628,10 +5160,11 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) * journal_start/journal_stop which can block and take a long time */ if (page_has_buffers(page)) { - if (!walk_page_buffers(NULL, page_buffers(page), 0, len, NULL, - ext4_bh_unmapped)) { + if (!ext4_walk_page_buffers(NULL, page_buffers(page), + 0, len, NULL, + ext4_bh_unmapped)) { /* Wait so that we don't change page under IO */ - wait_on_page_writeback(page); + wait_for_stable_page(page); ret = VM_FAULT_LOCKED; goto out; } @@ -4643,14 +5176,15 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) else get_block = ext4_get_block; retry_alloc: - handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode)); + handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, + ext4_writepage_trans_blocks(inode)); if (IS_ERR(handle)) { ret = VM_FAULT_SIGBUS; goto out; } ret = __block_page_mkwrite(vma, vmf, get_block); if (!ret && ext4_should_journal_data(inode)) { - if (walk_page_buffers(handle, page_buffers(page), 0, + if (ext4_walk_page_buffers(handle, page_buffers(page), 0, PAGE_CACHE_SIZE, NULL, do_journal_get_write_access)) { unlock_page(page); ret = VM_FAULT_SIGBUS; @@ -4665,5 +5199,6 @@ retry_alloc: out_ret: ret = block_page_mkwrite_return(ret); out: + sb_end_pagefault(inode->i_sb); return ret; } diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 6eee25591b8..0f2252ec274 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -20,9 +20,187 @@ #define MAX_32_NUM ((((unsigned long long) 1) << 32) - 1) +/** + * Swap memory between @a and @b for @len bytes. + * + * @a: pointer to first memory area + * @b: pointer to second memory area + * @len: number of bytes to swap + * + */ +static void memswap(void *a, void *b, size_t len) +{ + unsigned char *ap, *bp; + unsigned char tmp; + + ap = (unsigned char *)a; + bp = (unsigned char *)b; + while (len-- > 0) { + tmp = *ap; + *ap = *bp; + *bp = tmp; + ap++; + bp++; + } +} + +/** + * Swap i_data and associated attributes between @inode1 and @inode2. + * This function is used for the primary swap between inode1 and inode2 + * and also to revert this primary swap in case of errors. + * + * Therefore you have to make sure, that calling this method twice + * will revert all changes. + * + * @inode1: pointer to first inode + * @inode2: pointer to second inode + */ +static void swap_inode_data(struct inode *inode1, struct inode *inode2) +{ + loff_t isize; + struct ext4_inode_info *ei1; + struct ext4_inode_info *ei2; + + ei1 = EXT4_I(inode1); + ei2 = EXT4_I(inode2); + + memswap(&inode1->i_flags, &inode2->i_flags, sizeof(inode1->i_flags)); + memswap(&inode1->i_version, &inode2->i_version, + sizeof(inode1->i_version)); + memswap(&inode1->i_blocks, &inode2->i_blocks, + sizeof(inode1->i_blocks)); + memswap(&inode1->i_bytes, &inode2->i_bytes, sizeof(inode1->i_bytes)); + memswap(&inode1->i_atime, &inode2->i_atime, sizeof(inode1->i_atime)); + memswap(&inode1->i_mtime, &inode2->i_mtime, sizeof(inode1->i_mtime)); + + memswap(ei1->i_data, ei2->i_data, sizeof(ei1->i_data)); + memswap(&ei1->i_flags, &ei2->i_flags, sizeof(ei1->i_flags)); + memswap(&ei1->i_disksize, &ei2->i_disksize, sizeof(ei1->i_disksize)); + ext4_es_remove_extent(inode1, 0, EXT_MAX_BLOCKS); + ext4_es_remove_extent(inode2, 0, EXT_MAX_BLOCKS); + ext4_es_lru_del(inode1); + ext4_es_lru_del(inode2); + + isize = i_size_read(inode1); + i_size_write(inode1, i_size_read(inode2)); + i_size_write(inode2, isize); +} + +/** + * Swap the information from the given @inode and the inode + * EXT4_BOOT_LOADER_INO. It will basically swap i_data and all other + * important fields of the inodes. + * + * @sb: the super block of the filesystem + * @inode: the inode to swap with EXT4_BOOT_LOADER_INO + * + */ +static long swap_inode_boot_loader(struct super_block *sb, + struct inode *inode) +{ + handle_t *handle; + int err; + struct inode *inode_bl; + struct ext4_inode_info *ei_bl; + struct ext4_sb_info *sbi = EXT4_SB(sb); + + if (inode->i_nlink != 1 || !S_ISREG(inode->i_mode)) + return -EINVAL; + + if (!inode_owner_or_capable(inode) || !capable(CAP_SYS_ADMIN)) + return -EPERM; + + inode_bl = ext4_iget(sb, EXT4_BOOT_LOADER_INO); + if (IS_ERR(inode_bl)) + return PTR_ERR(inode_bl); + ei_bl = EXT4_I(inode_bl); + + filemap_flush(inode->i_mapping); + filemap_flush(inode_bl->i_mapping); + + /* Protect orig inodes against a truncate and make sure, + * that only 1 swap_inode_boot_loader is running. */ + lock_two_nondirectories(inode, inode_bl); + + truncate_inode_pages(&inode->i_data, 0); + truncate_inode_pages(&inode_bl->i_data, 0); + + /* Wait for all existing dio workers */ + ext4_inode_block_unlocked_dio(inode); + ext4_inode_block_unlocked_dio(inode_bl); + inode_dio_wait(inode); + inode_dio_wait(inode_bl); + + handle = ext4_journal_start(inode_bl, EXT4_HT_MOVE_EXTENTS, 2); + if (IS_ERR(handle)) { + err = -EINVAL; + goto journal_err_out; + } + + /* Protect extent tree against block allocations via delalloc */ + ext4_double_down_write_data_sem(inode, inode_bl); + + if (inode_bl->i_nlink == 0) { + /* this inode has never been used as a BOOT_LOADER */ + set_nlink(inode_bl, 1); + i_uid_write(inode_bl, 0); + i_gid_write(inode_bl, 0); + inode_bl->i_flags = 0; + ei_bl->i_flags = 0; + inode_bl->i_version = 1; + i_size_write(inode_bl, 0); + inode_bl->i_mode = S_IFREG; + if (EXT4_HAS_INCOMPAT_FEATURE(sb, + EXT4_FEATURE_INCOMPAT_EXTENTS)) { + ext4_set_inode_flag(inode_bl, EXT4_INODE_EXTENTS); + ext4_ext_tree_init(handle, inode_bl); + } else + memset(ei_bl->i_data, 0, sizeof(ei_bl->i_data)); + } + + swap_inode_data(inode, inode_bl); + + inode->i_ctime = inode_bl->i_ctime = ext4_current_time(inode); + + spin_lock(&sbi->s_next_gen_lock); + inode->i_generation = sbi->s_next_generation++; + inode_bl->i_generation = sbi->s_next_generation++; + spin_unlock(&sbi->s_next_gen_lock); + + ext4_discard_preallocations(inode); + + err = ext4_mark_inode_dirty(handle, inode); + if (err < 0) { + ext4_warning(inode->i_sb, + "couldn't mark inode #%lu dirty (err %d)", + inode->i_ino, err); + /* Revert all changes: */ + swap_inode_data(inode, inode_bl); + } else { + err = ext4_mark_inode_dirty(handle, inode_bl); + if (err < 0) { + ext4_warning(inode_bl->i_sb, + "couldn't mark inode #%lu dirty (err %d)", + inode_bl->i_ino, err); + /* Revert all changes: */ + swap_inode_data(inode, inode_bl); + ext4_mark_inode_dirty(handle, inode); + } + } + ext4_journal_stop(handle); + ext4_double_up_write_data_sem(inode, inode_bl); + +journal_err_out: + ext4_inode_resume_unlocked_dio(inode); + ext4_inode_resume_unlocked_dio(inode_bl); + unlock_two_nondirectories(inode, inode_bl); + iput(inode_bl); + return err; +} + long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { - struct inode *inode = filp->f_dentry->d_inode; + struct inode *inode = file_inode(filp); struct super_block *sb = inode->i_sb; struct ext4_inode_info *ei = EXT4_I(inode); unsigned int flags; @@ -38,7 +216,7 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) handle_t *handle = NULL; int err, migrate = 0; struct ext4_iloc iloc; - unsigned int oldflags; + unsigned int oldflags, mask, i; unsigned int jflag; if (!inode_owner_or_capable(inode)) @@ -83,17 +261,8 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) if (!capable(CAP_SYS_RESOURCE)) goto flags_out; } - if (oldflags & EXT4_EXTENTS_FL) { - /* We don't support clearning extent flags */ - if (!(flags & EXT4_EXTENTS_FL)) { - err = -EOPNOTSUPP; - goto flags_out; - } - } else if (flags & EXT4_EXTENTS_FL) { - /* migrate the file */ + if ((flags ^ oldflags) & EXT4_EXTENTS_FL) migrate = 1; - flags &= ~EXT4_EXTENTS_FL; - } if (flags & EXT4_EOFBLOCKS_FL) { /* we don't support adding EOFBLOCKS flag */ @@ -104,7 +273,7 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) } else if (oldflags & EXT4_EOFBLOCKS_FL) ext4_truncate(inode); - handle = ext4_journal_start(inode, 1); + handle = ext4_journal_start(inode, EXT4_HT_INODE, 1); if (IS_ERR(handle)) { err = PTR_ERR(handle); goto flags_out; @@ -115,9 +284,14 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) if (err) goto flags_err; - flags = flags & EXT4_FL_USER_MODIFIABLE; - flags |= oldflags & ~EXT4_FL_USER_MODIFIABLE; - ei->i_flags = flags; + for (i = 0, mask = 1; i < 32; i++, mask <<= 1) { + if (!(mask & EXT4_FL_USER_MODIFIABLE)) + continue; + if (mask & flags) + ext4_set_inode_flag(inode, i); + else + ext4_clear_inode_flag(inode, i); + } ext4_set_inode_flags(inode); inode->i_ctime = ext4_current_time(inode); @@ -132,8 +306,13 @@ flags_err: err = ext4_change_inode_journal_flag(inode, jflag); if (err) goto flags_out; - if (migrate) - err = ext4_ext_migrate(inode); + if (migrate) { + if (flags & EXT4_EXTENTS_FL) + err = ext4_ext_migrate(inode); + else + err = ext4_ind_migrate(inode); + } + flags_out: mutex_unlock(&inode->i_mutex); mnt_drop_write_file(filp); @@ -152,6 +331,13 @@ flags_out: if (!inode_owner_or_capable(inode)) return -EPERM; + if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) { + ext4_warning(sb, "Setting inode version is not " + "supported with metadata_csum enabled."); + return -ENOTTY; + } + err = mnt_want_write_file(filp); if (err) return err; @@ -161,7 +347,7 @@ flags_out: } mutex_lock(&inode->i_mutex); - handle = ext4_journal_start(inode, 1); + handle = ext4_journal_start(inode, EXT4_HT_INODE, 1); if (IS_ERR(handle)) { err = PTR_ERR(handle); goto unlock_out; @@ -221,7 +407,7 @@ group_extend_out: case EXT4_IOC_MOVE_EXT: { struct move_extent me; - struct file *donor_filp; + struct fd donor; int err; if (!(filp->f_mode & FMODE_READ) || @@ -233,11 +419,11 @@ group_extend_out: return -EFAULT; me.moved_len = 0; - donor_filp = fget(me.donor_fd); - if (!donor_filp) + donor = fdget(me.donor_fd); + if (!donor.file) return -EBADF; - if (!(donor_filp->f_mode & FMODE_WRITE)) { + if (!(donor.file->f_mode & FMODE_WRITE)) { err = -EBADF; goto mext_out; } @@ -246,23 +432,23 @@ group_extend_out: EXT4_FEATURE_RO_COMPAT_BIGALLOC)) { ext4_msg(sb, KERN_ERR, "Online defrag not supported with bigalloc"); - return -EOPNOTSUPP; + err = -EOPNOTSUPP; + goto mext_out; } err = mnt_want_write_file(filp); if (err) goto mext_out; - err = ext4_move_extents(filp, donor_filp, me.orig_start, + err = ext4_move_extents(filp, donor.file, me.orig_start, me.donor_start, me.len, &me.moved_len); mnt_drop_write_file(filp); - mnt_drop_write(filp->f_path.mnt); if (copy_to_user((struct move_extent __user *)arg, &me, sizeof(me))) err = -EFAULT; mext_out: - fput(donor_filp); + fdput(donor); return err; } @@ -301,6 +487,9 @@ mext_out: if (err == 0) err = err2; mnt_drop_write_file(filp); + if (!err && ext4_has_group_desc_csum(sb) && + test_opt(sb, INIT_INODE_TABLE)) + err = ext4_register_li_request(sb, input.group); group_add_out: ext4_resize_end(sb); return err; @@ -342,10 +531,15 @@ group_add_out: return err; } + case EXT4_IOC_SWAP_BOOT: + if (!(filp->f_mode & FMODE_WRITE)) + return -EBADF; + return swap_inode_boot_loader(sb, inode); + case EXT4_IOC_RESIZE_FS: { ext4_fsblk_t n_blocks_count; - struct super_block *sb = inode->i_sb; int err = 0, err2 = 0; + ext4_group_t o_group = EXT4_SB(sb)->s_groups_count; if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_BIGALLOC)) { @@ -354,31 +548,16 @@ group_add_out: return -EOPNOTSUPP; } - if (EXT4_HAS_INCOMPAT_FEATURE(sb, - EXT4_FEATURE_INCOMPAT_META_BG)) { - ext4_msg(sb, KERN_ERR, - "Online resizing not (yet) supported with meta_bg"); - return -EOPNOTSUPP; - } - if (copy_from_user(&n_blocks_count, (__u64 __user *)arg, sizeof(__u64))) { return -EFAULT; } - if (n_blocks_count > MAX_32_NUM && - !EXT4_HAS_INCOMPAT_FEATURE(sb, - EXT4_FEATURE_INCOMPAT_64BIT)) { - ext4_msg(sb, KERN_ERR, - "File system only supports 32-bit block numbers"); - return -EOPNOTSUPP; - } - err = ext4_resize_begin(sb); if (err) return err; - err = mnt_want_write(filp->f_path.mnt); + err = mnt_want_write_file(filp); if (err) goto resizefs_out; @@ -390,7 +569,12 @@ group_add_out: } if (err == 0) err = err2; - mnt_drop_write(filp->f_path.mnt); + mnt_drop_write_file(filp); + if (!err && (o_group > EXT4_SB(sb)->s_groups_count) && + ext4_has_group_desc_csum(sb) && + test_opt(sb, INIT_INODE_TABLE)) + err = ext4_register_li_request(sb, o_group); + resizefs_out: ext4_resize_end(sb); return err; @@ -408,13 +592,6 @@ resizefs_out: if (!blk_queue_discard(q)) return -EOPNOTSUPP; - if (EXT4_HAS_RO_COMPAT_FEATURE(sb, - EXT4_FEATURE_RO_COMPAT_BIGALLOC)) { - ext4_msg(sb, KERN_ERR, - "FITRIM not supported with bigalloc"); - return -EOPNOTSUPP; - } - if (copy_from_user(&range, (struct fstrim_range __user *)arg, sizeof(range))) return -EFAULT; @@ -431,6 +608,8 @@ resizefs_out: return 0; } + case EXT4_IOC_PRECACHE_EXTENTS: + return ext4_ext_precache(inode); default: return -ENOTTY; @@ -495,6 +674,7 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case EXT4_IOC_MOVE_EXT: case FITRIM: case EXT4_IOC_RESIZE_FS: + case EXT4_IOC_PRECACHE_EXTENTS: break; default: return -ENOIOCTLCMD; diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index cb990b21c69..2dcb936be90 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -21,11 +21,20 @@ * mballoc.c contains the multiblocks allocation routines */ +#include "ext4_jbd2.h" #include "mballoc.h" -#include <linux/debugfs.h> +#include <linux/log2.h> +#include <linux/module.h> #include <linux/slab.h> #include <trace/events/ext4.h> +#ifdef CONFIG_EXT4_DEBUG +ushort ext4_mballoc_debug __read_mostly; + +module_param_named(mballoc_debug, ext4_mballoc_debug, ushort, 0644); +MODULE_PARM_DESC(mballoc_debug, "Debugging level for ext4's mballoc"); +#endif + /* * MUSTDO: * - test ext4_ext_search_left() and ext4_ext_search_right() @@ -339,7 +348,7 @@ */ static struct kmem_cache *ext4_pspace_cachep; static struct kmem_cache *ext4_ac_cachep; -static struct kmem_cache *ext4_free_ext_cachep; +static struct kmem_cache *ext4_free_data_cachep; /* We create slab caches for groupinfo data structures based on the * superblock block size. There will be one per mounted filesystem for @@ -357,7 +366,8 @@ static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, ext4_group_t group); static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, ext4_group_t group); -static void release_blocks_on_commit(journal_t *journal, transaction_t *txn); +static void ext4_free_data_callback(struct super_block *sb, + struct ext4_journal_cb_entry *jce, int rc); static inline void *mb_correct_addr_and_bit(int *bit, void *addr) { @@ -395,6 +405,12 @@ static inline void mb_clear_bit(int bit, void *addr) ext4_clear_bit(bit, addr); } +static inline int mb_test_and_clear_bit(int bit, void *addr) +{ + addr = mb_correct_addr_and_bit(&bit, addr); + return ext4_test_and_clear_bit(bit, addr); +} + static inline int mb_find_next_zero_bit(void *addr, int max, int start) { int fix = 0, ret, tmpmax; @@ -425,7 +441,7 @@ static void *mb_find_buddy(struct ext4_buddy *e4b, int order, int *max) { char *bb; - BUG_ON(EXT4_MB_BITMAP(e4b) == EXT4_MB_BUDDY(e4b)); + BUG_ON(e4b->bd_bitmap == e4b->bd_buddy); BUG_ON(max == NULL); if (order > e4b->bd_blkbits + 1) { @@ -436,10 +452,10 @@ static void *mb_find_buddy(struct ext4_buddy *e4b, int order, int *max) /* at order 0 we see each particular block */ if (order == 0) { *max = 1 << (e4b->bd_blkbits + 3); - return EXT4_MB_BITMAP(e4b); + return e4b->bd_bitmap; } - bb = EXT4_MB_BUDDY(e4b) + EXT4_SB(e4b->bd_sb)->s_mb_offsets[order]; + bb = e4b->bd_buddy + EXT4_SB(e4b->bd_sb)->s_mb_offsets[order]; *max = EXT4_SB(e4b->bd_sb)->s_mb_maxs[order]; return bb; @@ -588,7 +604,7 @@ static int __mb_check_buddy(struct ext4_buddy *e4b, char *file, for (j = 0; j < (1 << order); j++) { k = (i * (1 << order)) + j; MB_CHECK_ASSERT( - !mb_test_bit(k, EXT4_MB_BITMAP(e4b))); + !mb_test_bit(k, e4b->bd_bitmap)); } count++; } @@ -706,6 +722,7 @@ void ext4_mb_generate_buddy(struct super_block *sb, void *buddy, void *bitmap, ext4_group_t group) { struct ext4_group_info *grp = ext4_get_group_info(sb, group); + struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_grpblk_t max = EXT4_CLUSTERS_PER_GROUP(sb); ext4_grpblk_t i = 0; ext4_grpblk_t first; @@ -735,13 +752,18 @@ void ext4_mb_generate_buddy(struct super_block *sb, if (free != grp->bb_free) { ext4_grp_locked_error(sb, group, 0, 0, - "%u clusters in bitmap, %u in gd", + "block bitmap and bg descriptor " + "inconsistent: %u vs %u free clusters", free, grp->bb_free); /* - * If we intent to continue, we consider group descritor + * If we intend to continue, we consider group descriptor * corrupt and update bb_free using bitmap value */ grp->bb_free = free; + if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) + percpu_counter_sub(&sbi->s_freeclusters_counter, + grp->bb_free); + set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state); } mb_set_largest_free_order(sb, grp); @@ -754,6 +776,24 @@ void ext4_mb_generate_buddy(struct super_block *sb, spin_unlock(&EXT4_SB(sb)->s_bal_lock); } +static void mb_regenerate_buddy(struct ext4_buddy *e4b) +{ + int count; + int order = 1; + void *buddy; + + while ((buddy = mb_find_buddy(e4b, order++, &count))) { + ext4_set_bits(buddy, 0, count); + } + e4b->bd_info->bb_fragments = 0; + memset(e4b->bd_info->bb_counters, 0, + sizeof(*e4b->bd_info->bb_counters) * + (e4b->bd_sb->s_blocksize_bits + 2)); + + ext4_mb_generate_buddy(e4b->bd_sb, e4b->bd_buddy, + e4b->bd_bitmap, e4b->bd_group); +} + /* The buddy information is attached the buddy cache inode * for convenience. The information regarding each group * is loaded via ext4_mb_load_buddy. The information involve @@ -782,11 +822,11 @@ static int ext4_mb_init_cache(struct page *page, char *incore) int groups_per_page; int err = 0; int i; - ext4_group_t first_group; + ext4_group_t first_group, group; int first_block; struct super_block *sb; struct buffer_head *bhs; - struct buffer_head **bh; + struct buffer_head **bh = NULL; struct inode *inode; char *data; char *bitmap; @@ -806,24 +846,23 @@ static int ext4_mb_init_cache(struct page *page, char *incore) /* allocate buffer_heads to read bitmaps */ if (groups_per_page > 1) { - err = -ENOMEM; i = sizeof(struct buffer_head *) * groups_per_page; bh = kzalloc(i, GFP_NOFS); - if (bh == NULL) + if (bh == NULL) { + err = -ENOMEM; goto out; + } } else bh = &bhs; first_group = page->index * blocks_per_page / 2; /* read all groups the page covers into the cache */ - for (i = 0; i < groups_per_page; i++) { - struct ext4_group_desc *desc; - - if (first_group + i >= ngroups) + for (i = 0, group = first_group; i < groups_per_page; i++, group++) { + if (group >= ngroups) break; - grinfo = ext4_get_group_info(sb, first_group + i); + grinfo = ext4_get_group_info(sb, group); /* * If page is uptodate then we came here after online resize * which added some new uninitialized group info structs, so @@ -834,73 +873,23 @@ static int ext4_mb_init_cache(struct page *page, char *incore) bh[i] = NULL; continue; } - - err = -EIO; - desc = ext4_get_group_desc(sb, first_group + i, NULL); - if (desc == NULL) - goto out; - - err = -ENOMEM; - bh[i] = sb_getblk(sb, ext4_block_bitmap(sb, desc)); - if (bh[i] == NULL) + if (!(bh[i] = ext4_read_block_bitmap_nowait(sb, group))) { + err = -ENOMEM; goto out; - - if (bitmap_uptodate(bh[i])) - continue; - - lock_buffer(bh[i]); - if (bitmap_uptodate(bh[i])) { - unlock_buffer(bh[i]); - continue; - } - ext4_lock_group(sb, first_group + i); - if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { - ext4_init_block_bitmap(sb, bh[i], - first_group + i, desc); - set_bitmap_uptodate(bh[i]); - set_buffer_uptodate(bh[i]); - ext4_unlock_group(sb, first_group + i); - unlock_buffer(bh[i]); - continue; } - ext4_unlock_group(sb, first_group + i); - if (buffer_uptodate(bh[i])) { - /* - * if not uninit if bh is uptodate, - * bitmap is also uptodate - */ - set_bitmap_uptodate(bh[i]); - unlock_buffer(bh[i]); - continue; - } - get_bh(bh[i]); - /* - * submit the buffer_head for read. We can - * safely mark the bitmap as uptodate now. - * We do it here so the bitmap uptodate bit - * get set with buffer lock held. - */ - set_bitmap_uptodate(bh[i]); - bh[i]->b_end_io = end_buffer_read_sync; - submit_bh(READ, bh[i]); - mb_debug(1, "read bitmap for group %u\n", first_group + i); + mb_debug(1, "read bitmap for group %u\n", group); } /* wait for I/O completion */ - for (i = 0; i < groups_per_page; i++) - if (bh[i]) - wait_on_buffer(bh[i]); - - err = -EIO; - for (i = 0; i < groups_per_page; i++) - if (bh[i] && !buffer_uptodate(bh[i])) + for (i = 0, group = first_group; i < groups_per_page; i++, group++) { + if (bh[i] && ext4_wait_block_bitmap(sb, group, bh[i])) { + err = -EIO; goto out; + } + } - err = 0; first_block = page->index * blocks_per_page; for (i = 0; i < blocks_per_page; i++) { - int group; - group = (first_block + i) >> 1; if (group >= ngroups) break; @@ -1004,7 +993,7 @@ static int ext4_mb_get_buddy_page_lock(struct super_block *sb, poff = block % blocks_per_page; page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); if (!page) - return -EIO; + return -ENOMEM; BUG_ON(page->mapping != inode->i_mapping); e4b->bd_bitmap_page = page; e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize); @@ -1016,10 +1005,9 @@ static int ext4_mb_get_buddy_page_lock(struct super_block *sb, block++; pnum = block / blocks_per_page; - poff = block % blocks_per_page; page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); if (!page) - return -EIO; + return -ENOMEM; BUG_ON(page->mapping != inode->i_mapping); e4b->bd_buddy_page = page; return 0; @@ -1051,6 +1039,7 @@ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group) struct page *page; int ret = 0; + might_sleep(); mb_debug(1, "init group %u\n", group); this_grp = ext4_get_group_info(sb, group); /* @@ -1059,6 +1048,8 @@ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group) * allocating. If we are looking at the buddy cache we would * have taken a reference using ext4_mb_load_buddy and that * would have pinned buddy page to page cache. + * The call to ext4_mb_get_buddy_page_lock will mark the + * page accessed. */ ret = ext4_mb_get_buddy_page_lock(sb, group, &e4b); if (ret || !EXT4_MB_GRP_NEED_INIT(this_grp)) { @@ -1077,7 +1068,6 @@ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group) ret = -EIO; goto err; } - mark_page_accessed(page); if (e4b.bd_buddy_page == NULL) { /* @@ -1097,7 +1087,6 @@ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group) ret = -EIO; goto err; } - mark_page_accessed(page); err: ext4_mb_put_buddy_page_lock(&e4b); return ret; @@ -1122,6 +1111,7 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, struct ext4_sb_info *sbi = EXT4_SB(sb); struct inode *inode = sbi->s_buddy_cache; + might_sleep(); mb_debug(1, "load group %u\n", group); blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; @@ -1155,7 +1145,7 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, /* we could use find_or_create_page(), but it locks page * what we'd like to avoid in fast path ... */ - page = find_get_page(inode->i_mapping, pnum); + page = find_get_page_flags(inode->i_mapping, pnum, FGP_ACCESSED); if (page == NULL || !PageUptodate(page)) { if (page) /* @@ -1182,19 +1172,24 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, unlock_page(page); } } - if (page == NULL || !PageUptodate(page)) { + if (page == NULL) { + ret = -ENOMEM; + goto err; + } + if (!PageUptodate(page)) { ret = -EIO; goto err; } + + /* Pages marked accessed already */ e4b->bd_bitmap_page = page; e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize); - mark_page_accessed(page); block++; pnum = block / blocks_per_page; poff = block % blocks_per_page; - page = find_get_page(inode->i_mapping, pnum); + page = find_get_page_flags(inode->i_mapping, pnum, FGP_ACCESSED); if (page == NULL || !PageUptodate(page)) { if (page) page_cache_release(page); @@ -1211,13 +1206,18 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, unlock_page(page); } } - if (page == NULL || !PageUptodate(page)) { + if (page == NULL) { + ret = -ENOMEM; + goto err; + } + if (!PageUptodate(page)) { ret = -EIO; goto err; } + + /* Pages marked accessed already */ e4b->bd_buddy_page = page; e4b->bd_buddy = page_address(page) + (poff * sb->s_blocksize); - mark_page_accessed(page); BUG_ON(e4b->bd_bitmap_page == NULL); BUG_ON(e4b->bd_buddy_page == NULL); @@ -1250,10 +1250,10 @@ static int mb_find_order_for_block(struct ext4_buddy *e4b, int block) int order = 1; void *bb; - BUG_ON(EXT4_MB_BITMAP(e4b) == EXT4_MB_BUDDY(e4b)); + BUG_ON(e4b->bd_bitmap == e4b->bd_buddy); BUG_ON(block >= (1 << (e4b->bd_blkbits + 3))); - bb = EXT4_MB_BUDDY(e4b); + bb = e4b->bd_buddy; while (order <= e4b->bd_blkbits + 1) { block = block >> 1; if (!mb_test_bit(block, bb)) { @@ -1284,6 +1284,33 @@ static void mb_clear_bits(void *bm, int cur, int len) } } +/* clear bits in given range + * will return first found zero bit if any, -1 otherwise + */ +static int mb_test_and_clear_bits(void *bm, int cur, int len) +{ + __u32 *addr; + int zero_bit = -1; + + len = cur + len; + while (cur < len) { + if ((cur & 31) == 0 && (len - cur) >= 32) { + /* fast path: clear whole word at once */ + addr = bm + (cur >> 3); + if (*addr != (__u32)(-1) && zero_bit == -1) + zero_bit = cur + mb_find_next_zero_bit(addr, 32, 0); + *addr = 0; + cur += 32; + continue; + } + if (!mb_test_and_clear_bit(cur, bm) && zero_bit == -1) + zero_bit = cur; + cur++; + } + + return zero_bit; +} + void ext4_set_bits(void *bm, int cur, int len) { __u32 *addr; @@ -1302,18 +1329,95 @@ void ext4_set_bits(void *bm, int cur, int len) } } +/* + * _________________________________________________________________ */ + +static inline int mb_buddy_adjust_border(int* bit, void* bitmap, int side) +{ + if (mb_test_bit(*bit + side, bitmap)) { + mb_clear_bit(*bit, bitmap); + (*bit) -= side; + return 1; + } + else { + (*bit) += side; + mb_set_bit(*bit, bitmap); + return -1; + } +} + +static void mb_buddy_mark_free(struct ext4_buddy *e4b, int first, int last) +{ + int max; + int order = 1; + void *buddy = mb_find_buddy(e4b, order, &max); + + while (buddy) { + void *buddy2; + + /* Bits in range [first; last] are known to be set since + * corresponding blocks were allocated. Bits in range + * (first; last) will stay set because they form buddies on + * upper layer. We just deal with borders if they don't + * align with upper layer and then go up. + * Releasing entire group is all about clearing + * single bit of highest order buddy. + */ + + /* Example: + * --------------------------------- + * | 1 | 1 | 1 | 1 | + * --------------------------------- + * | 0 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | + * --------------------------------- + * 0 1 2 3 4 5 6 7 + * \_____________________/ + * + * Neither [1] nor [6] is aligned to above layer. + * Left neighbour [0] is free, so mark it busy, + * decrease bb_counters and extend range to + * [0; 6] + * Right neighbour [7] is busy. It can't be coaleasced with [6], so + * mark [6] free, increase bb_counters and shrink range to + * [0; 5]. + * Then shift range to [0; 2], go up and do the same. + */ + + + if (first & 1) + e4b->bd_info->bb_counters[order] += mb_buddy_adjust_border(&first, buddy, -1); + if (!(last & 1)) + e4b->bd_info->bb_counters[order] += mb_buddy_adjust_border(&last, buddy, 1); + if (first > last) + break; + order++; + + if (first == last || !(buddy2 = mb_find_buddy(e4b, order, &max))) { + mb_clear_bits(buddy, first, last - first + 1); + e4b->bd_info->bb_counters[order - 1] += last - first + 1; + break; + } + first >>= 1; + last >>= 1; + buddy = buddy2; + } +} + static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, - int first, int count) + int first, int count) { - int block = 0; - int max = 0; - int order; - void *buddy; - void *buddy2; + int left_is_free = 0; + int right_is_free = 0; + int block; + int last = first + count - 1; struct super_block *sb = e4b->bd_sb; - BUG_ON(first + count > (sb->s_blocksize << 3)); + BUG_ON(last >= (sb->s_blocksize << 3)); assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group)); + /* Don't bother if the block group is corrupt. */ + if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info))) + return; + mb_check_buddy(e4b); mb_free_blocks_double(inode, e4b, first, count); @@ -1321,82 +1425,77 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, if (first < e4b->bd_info->bb_first_free) e4b->bd_info->bb_first_free = first; - /* let's maintain fragments counter */ + /* access memory sequentially: check left neighbour, + * clear range and then check right neighbour + */ if (first != 0) - block = !mb_test_bit(first - 1, EXT4_MB_BITMAP(e4b)); - if (first + count < EXT4_SB(sb)->s_mb_maxs[0]) - max = !mb_test_bit(first + count, EXT4_MB_BITMAP(e4b)); - if (block && max) + left_is_free = !mb_test_bit(first - 1, e4b->bd_bitmap); + block = mb_test_and_clear_bits(e4b->bd_bitmap, first, count); + if (last + 1 < EXT4_SB(sb)->s_mb_maxs[0]) + right_is_free = !mb_test_bit(last + 1, e4b->bd_bitmap); + + if (unlikely(block != -1)) { + struct ext4_sb_info *sbi = EXT4_SB(sb); + ext4_fsblk_t blocknr; + + blocknr = ext4_group_first_block_no(sb, e4b->bd_group); + blocknr += EXT4_C2B(EXT4_SB(sb), block); + ext4_grp_locked_error(sb, e4b->bd_group, + inode ? inode->i_ino : 0, + blocknr, + "freeing already freed block " + "(bit %u); block bitmap corrupt.", + block); + if (!EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info)) + percpu_counter_sub(&sbi->s_freeclusters_counter, + e4b->bd_info->bb_free); + /* Mark the block group as corrupt. */ + set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, + &e4b->bd_info->bb_state); + mb_regenerate_buddy(e4b); + goto done; + } + + /* let's maintain fragments counter */ + if (left_is_free && right_is_free) e4b->bd_info->bb_fragments--; - else if (!block && !max) + else if (!left_is_free && !right_is_free) e4b->bd_info->bb_fragments++; - /* let's maintain buddy itself */ - while (count-- > 0) { - block = first++; - order = 0; - - if (!mb_test_bit(block, EXT4_MB_BITMAP(e4b))) { - ext4_fsblk_t blocknr; - - blocknr = ext4_group_first_block_no(sb, e4b->bd_group); - blocknr += EXT4_C2B(EXT4_SB(sb), block); - ext4_grp_locked_error(sb, e4b->bd_group, - inode ? inode->i_ino : 0, - blocknr, - "freeing already freed block " - "(bit %u)", block); - } - mb_clear_bit(block, EXT4_MB_BITMAP(e4b)); - e4b->bd_info->bb_counters[order]++; - - /* start of the buddy */ - buddy = mb_find_buddy(e4b, order, &max); - - do { - block &= ~1UL; - if (mb_test_bit(block, buddy) || - mb_test_bit(block + 1, buddy)) - break; - - /* both the buddies are free, try to coalesce them */ - buddy2 = mb_find_buddy(e4b, order + 1, &max); - - if (!buddy2) - break; - - if (order > 0) { - /* for special purposes, we don't set - * free bits in bitmap */ - mb_set_bit(block, buddy); - mb_set_bit(block + 1, buddy); - } - e4b->bd_info->bb_counters[order]--; - e4b->bd_info->bb_counters[order]--; + /* buddy[0] == bd_bitmap is a special case, so handle + * it right away and let mb_buddy_mark_free stay free of + * zero order checks. + * Check if neighbours are to be coaleasced, + * adjust bitmap bb_counters and borders appropriately. + */ + if (first & 1) { + first += !left_is_free; + e4b->bd_info->bb_counters[0] += left_is_free ? -1 : 1; + } + if (!(last & 1)) { + last -= !right_is_free; + e4b->bd_info->bb_counters[0] += right_is_free ? -1 : 1; + } - block = block >> 1; - order++; - e4b->bd_info->bb_counters[order]++; + if (first <= last) + mb_buddy_mark_free(e4b, first >> 1, last >> 1); - mb_clear_bit(block, buddy2); - buddy = buddy2; - } while (1); - } +done: mb_set_largest_free_order(sb, e4b->bd_info); mb_check_buddy(e4b); } -static int mb_find_extent(struct ext4_buddy *e4b, int order, int block, +static int mb_find_extent(struct ext4_buddy *e4b, int block, int needed, struct ext4_free_extent *ex) { int next = block; - int max; + int max, order; void *buddy; assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group)); BUG_ON(ex == NULL); - buddy = mb_find_buddy(e4b, order, &max); + buddy = mb_find_buddy(e4b, 0, &max); BUG_ON(buddy == NULL); BUG_ON(block >= max); if (mb_test_bit(block, buddy)) { @@ -1406,12 +1505,9 @@ static int mb_find_extent(struct ext4_buddy *e4b, int order, int block, return 0; } - /* FIXME dorp order completely ? */ - if (likely(order == 0)) { - /* find actual order */ - order = mb_find_order_for_block(e4b, block); - block = block >> order; - } + /* find actual order */ + order = mb_find_order_for_block(e4b, block); + block = block >> order; ex->fe_len = 1 << order; ex->fe_start = block << order; @@ -1423,13 +1519,13 @@ static int mb_find_extent(struct ext4_buddy *e4b, int order, int block, ex->fe_start += next; while (needed > ex->fe_len && - (buddy = mb_find_buddy(e4b, order, &max))) { + mb_find_buddy(e4b, order, &max)) { if (block + 1 >= max) break; next = (block + 1) * (1 << order); - if (mb_test_bit(next, EXT4_MB_BITMAP(e4b))) + if (mb_test_bit(next, e4b->bd_bitmap)) break; order = mb_find_order_for_block(e4b, next); @@ -1466,9 +1562,9 @@ static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex) /* let's maintain fragments counter */ if (start != 0) - mlen = !mb_test_bit(start - 1, EXT4_MB_BITMAP(e4b)); + mlen = !mb_test_bit(start - 1, e4b->bd_bitmap); if (start + len < EXT4_SB(e4b->bd_sb)->s_mb_maxs[0]) - max = !mb_test_bit(start + len, EXT4_MB_BITMAP(e4b)); + max = !mb_test_bit(start + len, e4b->bd_bitmap); if (mlen && max) e4b->bd_info->bb_fragments++; else if (!mlen && !max) @@ -1511,7 +1607,7 @@ static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex) } mb_set_largest_free_order(e4b->bd_sb, e4b->bd_info); - ext4_set_bits(EXT4_MB_BITMAP(e4b), ex->fe_start, len0); + ext4_set_bits(e4b->bd_bitmap, ex->fe_start, len0); mb_check_buddy(e4b); return ret; @@ -1597,7 +1693,7 @@ static void ext4_mb_check_limits(struct ext4_allocation_context *ac, /* recheck chunk's availability - we don't know * when it was found (within this lock-unlock * period or not) */ - max = mb_find_extent(e4b, 0, bex->fe_start, gex->fe_len, &ex); + max = mb_find_extent(e4b, bex->fe_start, gex->fe_len, &ex); if (max >= gex->fe_len) { ext4_mb_use_best_found(ac, e4b); return; @@ -1689,7 +1785,7 @@ int ext4_mb_try_best_found(struct ext4_allocation_context *ac, return err; ext4_lock_group(ac->ac_sb, group); - max = mb_find_extent(e4b, 0, ex.fe_start, ex.fe_len, &ex); + max = mb_find_extent(e4b, ex.fe_start, ex.fe_len, &ex); if (max > 0) { ac->ac_b_ex = ex; @@ -1710,18 +1806,27 @@ int ext4_mb_find_by_goal(struct ext4_allocation_context *ac, int max; int err; struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); + struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group); struct ext4_free_extent ex; if (!(ac->ac_flags & EXT4_MB_HINT_TRY_GOAL)) return 0; + if (grp->bb_free == 0) + return 0; err = ext4_mb_load_buddy(ac->ac_sb, group, e4b); if (err) return err; + if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info))) { + ext4_mb_unload_buddy(e4b); + return 0; + } + ext4_lock_group(ac->ac_sb, group); - max = mb_find_extent(e4b, 0, ac->ac_g_ex.fe_start, + max = mb_find_extent(e4b, ac->ac_g_ex.fe_start, ac->ac_g_ex.fe_len, &ex); + ex.fe_logical = 0xDEADFA11; /* debug value */ if (max >= ac->ac_g_ex.fe_len && ac->ac_g_ex.fe_len == sbi->s_stripe) { ext4_fsblk_t start; @@ -1810,7 +1915,7 @@ void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac, struct ext4_buddy *e4b) { struct super_block *sb = ac->ac_sb; - void *bitmap = EXT4_MB_BITMAP(e4b); + void *bitmap = e4b->bd_bitmap; struct ext4_free_extent ex; int i; int free; @@ -1836,7 +1941,7 @@ void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac, break; } - mb_find_extent(e4b, 0, i, ac->ac_g_ex.fe_len, &ex); + mb_find_extent(e4b, i, ac->ac_g_ex.fe_len, &ex); BUG_ON(ex.fe_len <= 0); if (free < ex.fe_len) { ext4_grp_locked_error(sb, e4b->bd_group, 0, 0, @@ -1850,7 +1955,7 @@ void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac, */ break; } - + ex.fe_logical = 0xDEADC0DE; /* debug value */ ext4_mb_measure_extent(ac, &ex, e4b); i += ex.fe_len; @@ -1870,7 +1975,7 @@ void ext4_mb_scan_aligned(struct ext4_allocation_context *ac, { struct super_block *sb = ac->ac_sb; struct ext4_sb_info *sbi = EXT4_SB(sb); - void *bitmap = EXT4_MB_BITMAP(e4b); + void *bitmap = e4b->bd_bitmap; struct ext4_free_extent ex; ext4_fsblk_t first_group_block; ext4_fsblk_t a; @@ -1888,9 +1993,10 @@ void ext4_mb_scan_aligned(struct ext4_allocation_context *ac, while (i < EXT4_CLUSTERS_PER_GROUP(sb)) { if (!mb_test_bit(i, bitmap)) { - max = mb_find_extent(e4b, 0, i, sbi->s_stripe, &ex); + max = mb_find_extent(e4b, i, sbi->s_stripe, &ex); if (max >= sbi->s_stripe) { ac->ac_found++; + ex.fe_logical = 0xDEADF00D; /* debug value */ ac->ac_b_ex = ex; ext4_mb_use_best_found(ac, e4b); break; @@ -1910,6 +2016,15 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac, BUG_ON(cr < 0 || cr >= 4); + free = grp->bb_free; + if (free == 0) + return 0; + if (cr <= 2 && free < ac->ac_g_ex.fe_len) + return 0; + + if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(grp))) + return 0; + /* We only do this if the grp has never been initialized */ if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) { int ret = ext4_mb_init_group(ac->ac_sb, group); @@ -1917,10 +2032,7 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac, return 0; } - free = grp->bb_free; fragments = grp->bb_fragments; - if (free == 0) - return 0; if (fragments == 0) return 0; @@ -1928,15 +2040,19 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac, case 0: BUG_ON(ac->ac_2order == 0); - if (grp->bb_largest_free_order < ac->ac_2order) - return 0; - /* Avoid using the first bg of a flexgroup for data files */ if ((ac->ac_flags & EXT4_MB_HINT_DATA) && (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) && ((group % flex_size) == 0)) return 0; + if ((ac->ac_2order > ac->ac_sb->s_blocksize_bits+1) || + (free / fragments) >= ac->ac_g_ex.fe_len) + return 1; + + if (grp->bb_largest_free_order < ac->ac_2order) + return 0; + return 1; case 1: if ((free / fragments) >= ac->ac_g_ex.fe_len) @@ -2027,7 +2143,12 @@ repeat: group = ac->ac_g_ex.fe_group; for (i = 0; i < ngroups; group++, i++) { - if (group == ngroups) + cond_resched(); + /* + * Artificially restricted ngroups for non-extent + * files makes group > ngroups possible on first loop. + */ + if (group >= ngroups) group = 0; /* This now checks without needing the buddy page */ @@ -2051,7 +2172,7 @@ repeat: } ac->ac_groups_scanned++; - if (cr == 0) + if (cr == 0 && ac->ac_2order < sb->s_blocksize_bits+2) ext4_mb_simple_scan_group(ac, &e4b); else if (cr == 1 && sbi->s_stripe && !(ac->ac_g_ex.fe_len % sbi->s_stripe)) @@ -2124,8 +2245,9 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v) struct super_block *sb = seq->private; ext4_group_t group = (ext4_group_t) ((unsigned long) v); int i; - int err; + int err, buddy_loaded = 0; struct ext4_buddy e4b; + struct ext4_group_info *grinfo; struct sg { struct ext4_group_info info; ext4_grpblk_t counters[16]; @@ -2142,15 +2264,21 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v) i = (sb->s_blocksize_bits + 2) * sizeof(sg.info.bb_counters[0]) + sizeof(struct ext4_group_info); - err = ext4_mb_load_buddy(sb, group, &e4b); - if (err) { - seq_printf(seq, "#%-5u: I/O error\n", group); - return 0; + grinfo = ext4_get_group_info(sb, group); + /* Load the group info in memory only if not already loaded. */ + if (unlikely(EXT4_MB_GRP_NEED_INIT(grinfo))) { + err = ext4_mb_load_buddy(sb, group, &e4b); + if (err) { + seq_printf(seq, "#%-5u: I/O error\n", group); + return 0; + } + buddy_loaded = 1; } - ext4_lock_group(sb, group); + memcpy(&sg, ext4_get_group_info(sb, group), i); - ext4_unlock_group(sb, group); - ext4_mb_unload_buddy(&e4b); + + if (buddy_loaded) + ext4_mb_unload_buddy(&e4b); seq_printf(seq, "#%-5u: %-5u %-5u %-5u [", group, sg.info.bb_free, sg.info.bb_fragments, sg.info.bb_first_free); @@ -2175,7 +2303,7 @@ static const struct seq_operations ext4_mb_seq_groups_ops = { static int ext4_mb_seq_groups_open(struct inode *inode, struct file *file) { - struct super_block *sb = PDE(inode)->data; + struct super_block *sb = PDE_DATA(inode); int rc; rc = seq_open(file, &ext4_mb_seq_groups_ops); @@ -2204,6 +2332,39 @@ static struct kmem_cache *get_groupinfo_cache(int blocksize_bits) return cachep; } +/* + * Allocate the top-level s_group_info array for the specified number + * of groups + */ +int ext4_mb_alloc_groupinfo(struct super_block *sb, ext4_group_t ngroups) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + unsigned size; + struct ext4_group_info ***new_groupinfo; + + size = (ngroups + EXT4_DESC_PER_BLOCK(sb) - 1) >> + EXT4_DESC_PER_BLOCK_BITS(sb); + if (size <= sbi->s_group_info_size) + return 0; + + size = roundup_pow_of_two(sizeof(*sbi->s_group_info) * size); + new_groupinfo = ext4_kvzalloc(size, GFP_KERNEL); + if (!new_groupinfo) { + ext4_msg(sb, KERN_ERR, "can't allocate buddy meta group"); + return -ENOMEM; + } + if (sbi->s_group_info) { + memcpy(new_groupinfo, sbi->s_group_info, + sbi->s_group_info_size * sizeof(*sbi->s_group_info)); + ext4_kvfree(sbi->s_group_info); + } + sbi->s_group_info = new_groupinfo; + sbi->s_group_info_size = size / sizeof(*sbi->s_group_info); + ext4_debug("allocated s_groupinfo array for %d meta_bg's\n", + sbi->s_group_info_size); + return 0; +} + /* Create and initialize ext4_group_info data for the given group. */ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group, struct ext4_group_desc *desc) @@ -2224,7 +2385,7 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group, EXT4_DESC_PER_BLOCK_BITS(sb); meta_group_info = kmalloc(metalen, GFP_KERNEL); if (meta_group_info == NULL) { - ext4_msg(sb, KERN_ERR, "EXT4-fs: can't allocate mem " + ext4_msg(sb, KERN_ERR, "can't allocate mem " "for a buddy group"); goto exit_meta_group_info; } @@ -2236,12 +2397,11 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group, sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)]; i = group & (EXT4_DESC_PER_BLOCK(sb) - 1); - meta_group_info[i] = kmem_cache_alloc(cachep, GFP_KERNEL); + meta_group_info[i] = kmem_cache_zalloc(cachep, GFP_KERNEL); if (meta_group_info[i] == NULL) { - ext4_msg(sb, KERN_ERR, "EXT4-fs: can't allocate buddy mem"); + ext4_msg(sb, KERN_ERR, "can't allocate buddy mem"); goto exit_group_info; } - memset(meta_group_info[i], 0, kmem_cache_size(cachep)); set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(meta_group_info[i]->bb_state)); @@ -2293,49 +2453,14 @@ static int ext4_mb_init_backend(struct super_block *sb) ext4_group_t ngroups = ext4_get_groups_count(sb); ext4_group_t i; struct ext4_sb_info *sbi = EXT4_SB(sb); - struct ext4_super_block *es = sbi->s_es; - int num_meta_group_infos; - int num_meta_group_infos_max; - int array_size; + int err; struct ext4_group_desc *desc; struct kmem_cache *cachep; - /* This is the number of blocks used by GDT */ - num_meta_group_infos = (ngroups + EXT4_DESC_PER_BLOCK(sb) - - 1) >> EXT4_DESC_PER_BLOCK_BITS(sb); - - /* - * This is the total number of blocks used by GDT including - * the number of reserved blocks for GDT. - * The s_group_info array is allocated with this value - * to allow a clean online resize without a complex - * manipulation of pointer. - * The drawback is the unused memory when no resize - * occurs but it's very low in terms of pages - * (see comments below) - * Need to handle this properly when META_BG resizing is allowed - */ - num_meta_group_infos_max = num_meta_group_infos + - le16_to_cpu(es->s_reserved_gdt_blocks); + err = ext4_mb_alloc_groupinfo(sb, ngroups); + if (err) + return err; - /* - * array_size is the size of s_group_info array. We round it - * to the next power of two because this approximation is done - * internally by kmalloc so we can have some more memory - * for free here (e.g. may be used for META_BG resize). - */ - array_size = 1; - while (array_size < sizeof(*sbi->s_group_info) * - num_meta_group_infos_max) - array_size = array_size << 1; - /* An 8TB filesystem with 64-bit pointers requires a 4096 byte - * kmalloc. A 128kb malloc should suffice for a 256TB filesystem. - * So a two level scheme suffices for now. */ - sbi->s_group_info = ext4_kvzalloc(array_size, GFP_KERNEL); - if (sbi->s_group_info == NULL) { - ext4_msg(sb, KERN_ERR, "can't allocate buddy meta group"); - return -ENOMEM; - } sbi->s_buddy_cache = new_inode(sb); if (sbi->s_buddy_cache == NULL) { ext4_msg(sb, KERN_ERR, "can't get new inode"); @@ -2363,7 +2488,7 @@ err_freebuddy: cachep = get_groupinfo_cache(sb->s_blocksize_bits); while (i-- > 0) kmem_cache_free(cachep, ext4_get_group_info(sb, i)); - i = num_meta_group_infos; + i = sbi->s_group_info_size; while (i-- > 0) kfree(sbi->s_group_info[i]); iput(sbi->s_buddy_cache); @@ -2422,7 +2547,7 @@ static int ext4_groupinfo_create_slab(size_t size) return 0; } -int ext4_mb_init(struct super_block *sb, int needs_recovery) +int ext4_mb_init(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); unsigned i, j; @@ -2502,7 +2627,7 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery) sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group); if (sbi->s_locality_groups == NULL) { ret = -ENOMEM; - goto out_free_groupinfo_slab; + goto out; } for_each_possible_cpu(i) { struct ext4_locality_group *lg; @@ -2522,16 +2647,11 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery) proc_create_data("mb_groups", S_IRUGO, sbi->s_proc, &ext4_mb_seq_groups_fops, sb); - if (sbi->s_journal) - sbi->s_journal->j_commit_callback = release_blocks_on_commit; - return 0; out_free_locality_groups: free_percpu(sbi->s_locality_groups); sbi->s_locality_groups = NULL; -out_free_groupinfo_slab: - ext4_groupinfo_destroy_slabs(); out: kfree(sbi->s_mb_offsets); sbi->s_mb_offsets = NULL; @@ -2567,6 +2687,9 @@ int ext4_mb_release(struct super_block *sb) struct ext4_sb_info *sbi = EXT4_SB(sb); struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits); + if (sbi->s_proc) + remove_proc_entry("mb_groups", sbi->s_proc); + if (sbi->s_group_info) { for (i = 0; i < ngroups; i++) { grinfo = ext4_get_group_info(sb, i); @@ -2614,8 +2737,6 @@ int ext4_mb_release(struct super_block *sb) } free_percpu(sbi->s_locality_groups); - if (sbi->s_proc) - remove_proc_entry("mb_groups", sbi->s_proc); return 0; } @@ -2637,96 +2758,67 @@ static inline int ext4_issue_discard(struct super_block *sb, * This function is called by the jbd2 layer once the commit has finished, * so we know we can free the blocks that were released with that commit. */ -static void release_blocks_on_commit(journal_t *journal, transaction_t *txn) +static void ext4_free_data_callback(struct super_block *sb, + struct ext4_journal_cb_entry *jce, + int rc) { - struct super_block *sb = journal->j_private; + struct ext4_free_data *entry = (struct ext4_free_data *)jce; struct ext4_buddy e4b; struct ext4_group_info *db; int err, count = 0, count2 = 0; - struct ext4_free_data *entry; - struct list_head *l, *ltmp; - list_for_each_safe(l, ltmp, &txn->t_private_list) { - entry = list_entry(l, struct ext4_free_data, list); + mb_debug(1, "gonna free %u blocks in group %u (0x%p):", + entry->efd_count, entry->efd_group, entry); - mb_debug(1, "gonna free %u blocks in group %u (0x%p):", - entry->count, entry->group, entry); + if (test_opt(sb, DISCARD)) { + err = ext4_issue_discard(sb, entry->efd_group, + entry->efd_start_cluster, + entry->efd_count); + if (err && err != -EOPNOTSUPP) + ext4_msg(sb, KERN_WARNING, "discard request in" + " group:%d block:%d count:%d failed" + " with %d", entry->efd_group, + entry->efd_start_cluster, + entry->efd_count, err); + } - if (test_opt(sb, DISCARD)) - ext4_issue_discard(sb, entry->group, - entry->start_cluster, entry->count); + err = ext4_mb_load_buddy(sb, entry->efd_group, &e4b); + /* we expect to find existing buddy because it's pinned */ + BUG_ON(err != 0); - err = ext4_mb_load_buddy(sb, entry->group, &e4b); - /* we expect to find existing buddy because it's pinned */ - BUG_ON(err != 0); - db = e4b.bd_info; - /* there are blocks to put in buddy to make them really free */ - count += entry->count; - count2++; - ext4_lock_group(sb, entry->group); - /* Take it out of per group rb tree */ - rb_erase(&entry->node, &(db->bb_free_root)); - mb_free_blocks(NULL, &e4b, entry->start_cluster, entry->count); + db = e4b.bd_info; + /* there are blocks to put in buddy to make them really free */ + count += entry->efd_count; + count2++; + ext4_lock_group(sb, entry->efd_group); + /* Take it out of per group rb tree */ + rb_erase(&entry->efd_node, &(db->bb_free_root)); + mb_free_blocks(NULL, &e4b, entry->efd_start_cluster, entry->efd_count); - /* - * Clear the trimmed flag for the group so that the next - * ext4_trim_fs can trim it. - * If the volume is mounted with -o discard, online discard - * is supported and the free blocks will be trimmed online. - */ - if (!test_opt(sb, DISCARD)) - EXT4_MB_GRP_CLEAR_TRIMMED(db); + /* + * Clear the trimmed flag for the group so that the next + * ext4_trim_fs can trim it. + * If the volume is mounted with -o discard, online discard + * is supported and the free blocks will be trimmed online. + */ + if (!test_opt(sb, DISCARD)) + EXT4_MB_GRP_CLEAR_TRIMMED(db); - if (!db->bb_free_root.rb_node) { - /* No more items in the per group rb tree - * balance refcounts from ext4_mb_free_metadata() - */ - page_cache_release(e4b.bd_buddy_page); - page_cache_release(e4b.bd_bitmap_page); - } - ext4_unlock_group(sb, entry->group); - kmem_cache_free(ext4_free_ext_cachep, entry); - ext4_mb_unload_buddy(&e4b); + if (!db->bb_free_root.rb_node) { + /* No more items in the per group rb tree + * balance refcounts from ext4_mb_free_metadata() + */ + page_cache_release(e4b.bd_buddy_page); + page_cache_release(e4b.bd_bitmap_page); } + ext4_unlock_group(sb, entry->efd_group); + kmem_cache_free(ext4_free_data_cachep, entry); + ext4_mb_unload_buddy(&e4b); mb_debug(1, "freed %u blocks in %u structures\n", count, count2); } -#ifdef CONFIG_EXT4_DEBUG -u8 mb_enable_debug __read_mostly; - -static struct dentry *debugfs_dir; -static struct dentry *debugfs_debug; - -static void __init ext4_create_debugfs_entry(void) -{ - debugfs_dir = debugfs_create_dir("ext4", NULL); - if (debugfs_dir) - debugfs_debug = debugfs_create_u8("mballoc-debug", - S_IRUGO | S_IWUSR, - debugfs_dir, - &mb_enable_debug); -} - -static void ext4_remove_debugfs_entry(void) -{ - debugfs_remove(debugfs_debug); - debugfs_remove(debugfs_dir); -} - -#else - -static void __init ext4_create_debugfs_entry(void) -{ -} - -static void ext4_remove_debugfs_entry(void) -{ -} - -#endif - int __init ext4_init_mballoc(void) { ext4_pspace_cachep = KMEM_CACHE(ext4_prealloc_space, @@ -2741,14 +2833,13 @@ int __init ext4_init_mballoc(void) return -ENOMEM; } - ext4_free_ext_cachep = KMEM_CACHE(ext4_free_data, - SLAB_RECLAIM_ACCOUNT); - if (ext4_free_ext_cachep == NULL) { + ext4_free_data_cachep = KMEM_CACHE(ext4_free_data, + SLAB_RECLAIM_ACCOUNT); + if (ext4_free_data_cachep == NULL) { kmem_cache_destroy(ext4_pspace_cachep); kmem_cache_destroy(ext4_ac_cachep); return -ENOMEM; } - ext4_create_debugfs_entry(); return 0; } @@ -2761,9 +2852,8 @@ void ext4_exit_mballoc(void) rcu_barrier(); kmem_cache_destroy(ext4_pspace_cachep); kmem_cache_destroy(ext4_ac_cachep); - kmem_cache_destroy(ext4_free_ext_cachep); + kmem_cache_destroy(ext4_free_data_cachep); ext4_groupinfo_destroy_slabs(); - ext4_remove_debugfs_entry(); } @@ -2794,6 +2884,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, if (!bitmap_bh) goto out_err; + BUFFER_TRACE(bitmap_bh, "getting write access"); err = ext4_journal_get_write_access(handle, bitmap_bh); if (err) goto out_err; @@ -2806,6 +2897,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, ext4_debug("using block group %u(%d)\n", ac->ac_b_ex.fe_group, ext4_free_group_clusters(sb, gdp)); + BUFFER_TRACE(gdp_bh, "get_write_access"); err = ext4_journal_get_write_access(handle, gdp_bh); if (err) goto out_err; @@ -2815,7 +2907,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, len = EXT4_C2B(sbi, ac->ac_b_ex.fe_len); if (!ext4_data_block_valid(sbi, block, len)) { ext4_error(sb, "Allocating blocks %llu-%llu which overlap " - "fs metadata\n", block, block+len); + "fs metadata", block, block+len); /* File system mounted not to panic on error * Fix the bitmap and repeat the block allocation * We leak some of the blocks here. @@ -2850,7 +2942,8 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, } len = ext4_free_group_clusters(sb, gdp) - ac->ac_b_ex.fe_len; ext4_free_group_clusters_set(sb, gdp, len); - gdp->bg_checksum = ext4_group_desc_csum(sbi, ac->ac_b_ex.fe_group, gdp); + ext4_block_bitmap_csum_set(sb, ac->ac_b_ex.fe_group, gdp, bitmap_bh); + ext4_group_desc_csum_set(sb, ac->ac_b_ex.fe_group, gdp); ext4_unlock_group(sb, ac->ac_b_ex.fe_group); percpu_counter_sub(&sbi->s_freeclusters_counter, ac->ac_b_ex.fe_len); @@ -2865,8 +2958,8 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, if (sbi->s_log_groups_per_flex) { ext4_group_t flex_group = ext4_flex_group(sbi, ac->ac_b_ex.fe_group); - atomic_sub(ac->ac_b_ex.fe_len, - &sbi->s_flex_groups[flex_group].free_clusters); + atomic64_sub(ac->ac_b_ex.fe_len, + &sbi->s_flex_groups[flex_group].free_clusters); } err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); @@ -2875,7 +2968,6 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, err = ext4_handle_dirty_metadata(handle, NULL, gdp_bh); out_err: - ext4_mark_super_dirty(sb); brelse(bitmap_bh); return err; } @@ -2911,7 +3003,8 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac, struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); int bsbits, max; ext4_lblk_t end; - loff_t size, orig_size, start_off; + loff_t size, start_off; + loff_t orig_size __maybe_unused; ext4_lblk_t start; struct ext4_inode_info *ei = EXT4_I(ac->ac_inode); struct ext4_prealloc_space *pa; @@ -3062,7 +3155,7 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac, } BUG_ON(start + size <= ac->ac_o_ex.fe_logical && start > ac->ac_o_ex.fe_logical); - BUG_ON(size <= 0 || size > EXT4_CLUSTERS_PER_GROUP(ac->ac_sb)); + BUG_ON(size <= 0 || size > EXT4_BLOCKS_PER_GROUP(ac->ac_sb)); /* now prepare goal request */ @@ -3123,13 +3216,9 @@ static void ext4_mb_collect_stats(struct ext4_allocation_context *ac) static void ext4_discard_allocated_blocks(struct ext4_allocation_context *ac) { struct ext4_prealloc_space *pa = ac->ac_pa; - int len; - - if (pa && pa->pa_type == MB_INODE_PA) { - len = ac->ac_b_ex.fe_len; - pa->pa_free += len; - } + if (pa && pa->pa_type == MB_INODE_PA) + pa->pa_free += ac->ac_b_ex.fe_len; } /* @@ -3321,8 +3410,8 @@ static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, n = rb_first(&(grp->bb_free_root)); while (n) { - entry = rb_entry(n, struct ext4_free_data, node); - ext4_set_bits(bitmap, entry->start_cluster, entry->count); + entry = rb_entry(n, struct ext4_free_data, efd_node); + ext4_set_bits(bitmap, entry->efd_start_cluster, entry->efd_count); n = rb_next(n); } return; @@ -3373,6 +3462,9 @@ static void ext4_mb_pa_callback(struct rcu_head *head) { struct ext4_prealloc_space *pa; pa = container_of(head, struct ext4_prealloc_space, u.pa_rcu); + + BUG_ON(atomic_read(&pa->pa_count)); + BUG_ON(pa->pa_deleted == 0); kmem_cache_free(ext4_pspace_cachep, pa); } @@ -3386,11 +3478,13 @@ static void ext4_mb_put_pa(struct ext4_allocation_context *ac, ext4_group_t grp; ext4_fsblk_t grp_blk; - if (!atomic_dec_and_test(&pa->pa_count) || pa->pa_free != 0) - return; - /* in this short window concurrent discard can set pa_deleted */ spin_lock(&pa->pa_lock); + if (!atomic_dec_and_test(&pa->pa_count) || pa->pa_free != 0) { + spin_unlock(&pa->pa_lock); + return; + } + if (pa->pa_deleted == 1) { spin_unlock(&pa->pa_lock); return; @@ -3407,7 +3501,7 @@ static void ext4_mb_put_pa(struct ext4_allocation_context *ac, if (pa->pa_type == MB_GROUP_PA) grp_blk--; - ext4_get_group_no_and_offset(sb, grp_blk, &grp, NULL); + grp = ext4_get_group_number(sb, grp_blk); /* * possible race: @@ -3484,7 +3578,7 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac) win = offs; ac->ac_b_ex.fe_logical = ac->ac_o_ex.fe_logical - - EXT4_B2C(sbi, win); + EXT4_NUM_B2C(sbi, win); BUG_ON(ac->ac_o_ex.fe_logical < ac->ac_b_ex.fe_logical); BUG_ON(ac->ac_o_ex.fe_len > ac->ac_b_ex.fe_len); } @@ -3757,11 +3851,7 @@ repeat: if (free < needed && busy) { busy = 0; ext4_unlock_group(sb, group); - /* - * Yield the CPU here so that we don't get soft lockup - * in non preempt case. - */ - yield(); + cond_resched(); goto repeat; } @@ -3876,7 +3966,7 @@ repeat: list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) { BUG_ON(pa->pa_type != MB_INODE_PA); - ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL); + group = ext4_get_group_number(sb, pa->pa_pstart); err = ext4_mb_load_buddy(sb, group, &e4b); if (err) { @@ -3912,15 +4002,15 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac) struct super_block *sb = ac->ac_sb; ext4_group_t ngroups, i; - if (!mb_enable_debug || + if (!ext4_mballoc_debug || (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)) return; - ext4_msg(ac->ac_sb, KERN_ERR, "EXT4-fs: Can't allocate:" + ext4_msg(ac->ac_sb, KERN_ERR, "Can't allocate:" " Allocation context details:"); - ext4_msg(ac->ac_sb, KERN_ERR, "EXT4-fs: status %d flags %d", + ext4_msg(ac->ac_sb, KERN_ERR, "status %d flags %d", ac->ac_status, ac->ac_flags); - ext4_msg(ac->ac_sb, KERN_ERR, "EXT4-fs: orig %lu/%lu/%lu@%lu, " + ext4_msg(ac->ac_sb, KERN_ERR, "orig %lu/%lu/%lu@%lu, " "goal %lu/%lu/%lu@%lu, " "best %lu/%lu/%lu@%lu cr %d", (unsigned long)ac->ac_o_ex.fe_group, @@ -3936,9 +4026,8 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac) (unsigned long)ac->ac_b_ex.fe_len, (unsigned long)ac->ac_b_ex.fe_logical, (int)ac->ac_criteria); - ext4_msg(ac->ac_sb, KERN_ERR, "EXT4-fs: %lu scanned, %d found", - ac->ac_ex_scanned, ac->ac_found); - ext4_msg(ac->ac_sb, KERN_ERR, "EXT4-fs: groups: "); + ext4_msg(ac->ac_sb, KERN_ERR, "%d found", ac->ac_found); + ext4_msg(ac->ac_sb, KERN_ERR, "groups: "); ngroups = ext4_get_groups_count(sb); for (i = 0; i < ngroups; i++) { struct ext4_group_info *grp = ext4_get_group_info(sb, i); @@ -4045,8 +4134,8 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac, len = ar->len; /* just a dirty hack to filter too big requests */ - if (len >= EXT4_CLUSTERS_PER_GROUP(sb) - 10) - len = EXT4_CLUSTERS_PER_GROUP(sb) - 10; + if (len >= EXT4_CLUSTERS_PER_GROUP(sb)) + len = EXT4_CLUSTERS_PER_GROUP(sb); /* start searching from the goal */ goal = ar->goal; @@ -4056,8 +4145,7 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac, ext4_get_group_no_and_offset(sb, goal, &group, &block); /* set up allocation goals */ - memset(ac, 0, sizeof(struct ext4_allocation_context)); - ac->ac_b_ex.fe_logical = ar->logical & ~(sbi->s_cluster_ratio - 1); + ac->ac_b_ex.fe_logical = EXT4_LBLK_CMASK(sbi, ar->logical); ac->ac_status = AC_STATUS_CONTINUE; ac->ac_sb = sb; ac->ac_inode = ar->inode; @@ -4139,7 +4227,7 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb, list_for_each_entry_safe(pa, tmp, &discard_list, u.pa_tmp_list) { - ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL); + group = ext4_get_group_number(sb, pa->pa_pstart); if (ext4_mb_load_buddy(sb, group, &e4b)) { ext4_error(sb, "Error loading buddy information for %u", group); @@ -4177,7 +4265,7 @@ static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac) /* The max size of hash table is PREALLOC_TB_SIZE */ order = PREALLOC_TB_SIZE - 1; /* Add the prealloc space to lg */ - rcu_read_lock(); + spin_lock(&lg->lg_prealloc_lock); list_for_each_entry_rcu(tmp_pa, &lg->lg_prealloc_list[order], pa_inode_list) { spin_lock(&tmp_pa->pa_lock); @@ -4201,12 +4289,12 @@ static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac) if (!added) list_add_tail_rcu(&pa->pa_inode_list, &lg->lg_prealloc_list[order]); - rcu_read_unlock(); + spin_unlock(&lg->lg_prealloc_lock); /* Now trim the list to be not more than 8 elements */ if (lg_prealloc_count > 8) { ext4_mb_discard_lg_preallocations(sb, lg, - order, lg_prealloc_count); + order, lg_prealloc_count); return; } return ; @@ -4287,6 +4375,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, unsigned int inquota = 0; unsigned int reserv_clstrs = 0; + might_sleep(); sb = ar->inode->i_sb; sbi = EXT4_SB(sb); @@ -4312,7 +4401,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, ext4_claim_free_clusters(sbi, ar->len, ar->flags)) { /* let others to free the space */ - yield(); + cond_resched(); ar->len = ar->len >> 1; } if (!ar->len) { @@ -4339,7 +4428,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, } } - ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS); + ac = kmem_cache_zalloc(ext4_ac_cachep, GFP_NOFS); if (!ac) { ar->len = 0; *errp = -ENOMEM; @@ -4360,14 +4449,19 @@ repeat: /* allocate space in core */ *errp = ext4_mb_regular_allocator(ac); if (*errp) - goto errout; + goto discard_and_exit; /* as we've just preallocated more space than - * user requested orinally, we store allocated + * user requested originally, we store allocated * space in a special descriptor */ if (ac->ac_status == AC_STATUS_FOUND && - ac->ac_o_ex.fe_len < ac->ac_b_ex.fe_len) - ext4_mb_new_preallocation(ac); + ac->ac_o_ex.fe_len < ac->ac_b_ex.fe_len) + *errp = ext4_mb_new_preallocation(ac); + if (*errp) { + discard_and_exit: + ext4_discard_allocated_blocks(ac); + goto errout; + } } if (likely(ac->ac_status == AC_STATUS_FOUND)) { *errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_clstrs); @@ -4382,10 +4476,10 @@ repeat: ac->ac_b_ex.fe_len = 0; ac->ac_status = AC_STATUS_CONTINUE; goto repeat; - } else if (*errp) - errout: + } else if (*errp) { ext4_discard_allocated_blocks(ac); - else { + goto errout; + } else { block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex); ar->len = ac->ac_b_ex.fe_len; } @@ -4396,6 +4490,7 @@ repeat: *errp = -ENOSPC; } +errout: if (*errp) { ac->ac_b_ex.fe_len = 0; ar->len = 0; @@ -4428,9 +4523,9 @@ out: static int can_merge(struct ext4_free_data *entry1, struct ext4_free_data *entry2) { - if ((entry1->t_tid == entry2->t_tid) && - (entry1->group == entry2->group) && - ((entry1->start_cluster + entry1->count) == entry2->start_cluster)) + if ((entry1->efd_tid == entry2->efd_tid) && + (entry1->efd_group == entry2->efd_group) && + ((entry1->efd_start_cluster + entry1->efd_count) == entry2->efd_start_cluster)) return 1; return 0; } @@ -4452,8 +4547,8 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, BUG_ON(e4b->bd_bitmap_page == NULL); BUG_ON(e4b->bd_buddy_page == NULL); - new_node = &new_entry->node; - cluster = new_entry->start_cluster; + new_node = &new_entry->efd_node; + cluster = new_entry->efd_start_cluster; if (!*n) { /* first free block exent. We need to @@ -4466,10 +4561,10 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, } while (*n) { parent = *n; - entry = rb_entry(parent, struct ext4_free_data, node); - if (cluster < entry->start_cluster) + entry = rb_entry(parent, struct ext4_free_data, efd_node); + if (cluster < entry->efd_start_cluster) n = &(*n)->rb_left; - else if (cluster >= (entry->start_cluster + entry->count)) + else if (cluster >= (entry->efd_start_cluster + entry->efd_count)) n = &(*n)->rb_right; else { ext4_grp_locked_error(sb, group, 0, @@ -4486,34 +4581,29 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, /* Now try to see the extent can be merged to left and right */ node = rb_prev(new_node); if (node) { - entry = rb_entry(node, struct ext4_free_data, node); - if (can_merge(entry, new_entry)) { - new_entry->start_cluster = entry->start_cluster; - new_entry->count += entry->count; + entry = rb_entry(node, struct ext4_free_data, efd_node); + if (can_merge(entry, new_entry) && + ext4_journal_callback_try_del(handle, &entry->efd_jce)) { + new_entry->efd_start_cluster = entry->efd_start_cluster; + new_entry->efd_count += entry->efd_count; rb_erase(node, &(db->bb_free_root)); - spin_lock(&sbi->s_md_lock); - list_del(&entry->list); - spin_unlock(&sbi->s_md_lock); - kmem_cache_free(ext4_free_ext_cachep, entry); + kmem_cache_free(ext4_free_data_cachep, entry); } } node = rb_next(new_node); if (node) { - entry = rb_entry(node, struct ext4_free_data, node); - if (can_merge(new_entry, entry)) { - new_entry->count += entry->count; + entry = rb_entry(node, struct ext4_free_data, efd_node); + if (can_merge(new_entry, entry) && + ext4_journal_callback_try_del(handle, &entry->efd_jce)) { + new_entry->efd_count += entry->efd_count; rb_erase(node, &(db->bb_free_root)); - spin_lock(&sbi->s_md_lock); - list_del(&entry->list); - spin_unlock(&sbi->s_md_lock); - kmem_cache_free(ext4_free_ext_cachep, entry); + kmem_cache_free(ext4_free_data_cachep, entry); } } /* Add the extent to transaction's private list */ - spin_lock(&sbi->s_md_lock); - list_add(&new_entry->list, &handle->h_transaction->t_private_list); - spin_unlock(&sbi->s_md_lock); + ext4_journal_callback_add(handle, ext4_free_data_callback, + &new_entry->efd_jce); return 0; } @@ -4532,17 +4622,18 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode, struct buffer_head *bitmap_bh = NULL; struct super_block *sb = inode->i_sb; struct ext4_group_desc *gdp; - unsigned long freed = 0; unsigned int overflow; ext4_grpblk_t bit; struct buffer_head *gd_bh; ext4_group_t block_group; struct ext4_sb_info *sbi; + struct ext4_inode_info *ei = EXT4_I(inode); struct ext4_buddy e4b; unsigned int count_clusters; int err = 0; int ret; + might_sleep(); if (bh) { if (block) BUG_ON(block != bh->b_blocknr); @@ -4568,10 +4659,11 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode, BUG_ON(bh && (count > 1)); for (i = 0; i < count; i++) { + cond_resched(); if (!bh) tbh = sb_find_get_block(inode->i_sb, block + i); - if (unlikely(!tbh)) + if (!tbh) continue; ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA, inode, tbh, block + i); @@ -4595,7 +4687,7 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode, * blocks at the beginning or the end unless we are explicitly * requested to avoid doing so. */ - overflow = block & (sbi->s_cluster_ratio - 1); + overflow = EXT4_PBLK_COFF(sbi, block); if (overflow) { if (flags & EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER) { overflow = sbi->s_cluster_ratio - overflow; @@ -4609,7 +4701,7 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode, count += overflow; } } - overflow = count & (sbi->s_cluster_ratio - 1); + overflow = EXT4_LBLK_COFF(sbi, count); if (overflow) { if (flags & EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER) { if (count > overflow) @@ -4624,6 +4716,10 @@ do_more: overflow = 0; ext4_get_group_no_and_offset(sb, block, &block_group, &bit); + if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT( + ext4_get_group_info(sb, block_group)))) + return; + /* * Check to see if we are freeing blocks across a group * boundary. @@ -4633,7 +4729,7 @@ do_more: EXT4_BLOCKS_PER_GROUP(sb); count -= overflow; } - count_clusters = EXT4_B2C(sbi, count); + count_clusters = EXT4_NUM_B2C(sbi, count); bitmap_bh = ext4_read_block_bitmap(sb, block_group); if (!bitmap_bh) { err = -EIO; @@ -4691,15 +4787,21 @@ do_more: * blocks being freed are metadata. these blocks shouldn't * be used until this transaction is committed */ - new_entry = kmem_cache_alloc(ext4_free_ext_cachep, GFP_NOFS); + retry: + new_entry = kmem_cache_alloc(ext4_free_data_cachep, GFP_NOFS); if (!new_entry) { - err = -ENOMEM; - goto error_return; + /* + * We use a retry loop because + * ext4_free_blocks() is not allowed to fail. + */ + cond_resched(); + congestion_wait(BLK_RW_ASYNC, HZ/50); + goto retry; } - new_entry->start_cluster = bit; - new_entry->group = block_group; - new_entry->count = count_clusters; - new_entry->t_tid = handle->h_transaction->t_tid; + new_entry->efd_start_cluster = bit; + new_entry->efd_group = block_group; + new_entry->efd_count = count_clusters; + new_entry->efd_tid = handle->h_transaction->t_tid; ext4_lock_group(sb, block_group); mb_clear_bits(bitmap_bh->b_data, bit, count_clusters); @@ -4709,6 +4811,16 @@ do_more: * with group lock held. generate_buddy look at * them with group lock_held */ + if (test_opt(sb, DISCARD)) { + err = ext4_issue_discard(sb, block_group, bit, count); + if (err && err != -EOPNOTSUPP) + ext4_msg(sb, KERN_WARNING, "discard request in" + " group:%d block:%d count:%lu failed" + " with %d", block_group, bit, count, + err); + } else + EXT4_MB_GRP_CLEAR_TRIMMED(e4b.bd_info); + ext4_lock_group(sb, block_group); mb_clear_bits(bitmap_bh->b_data, bit, count_clusters); mb_free_blocks(inode, &e4b, bit, count_clusters); @@ -4716,22 +4828,33 @@ do_more: ret = ext4_free_group_clusters(sb, gdp) + count_clusters; ext4_free_group_clusters_set(sb, gdp, ret); - gdp->bg_checksum = ext4_group_desc_csum(sbi, block_group, gdp); + ext4_block_bitmap_csum_set(sb, block_group, gdp, bitmap_bh); + ext4_group_desc_csum_set(sb, block_group, gdp); ext4_unlock_group(sb, block_group); - percpu_counter_add(&sbi->s_freeclusters_counter, count_clusters); if (sbi->s_log_groups_per_flex) { ext4_group_t flex_group = ext4_flex_group(sbi, block_group); - atomic_add(count_clusters, - &sbi->s_flex_groups[flex_group].free_clusters); + atomic64_add(count_clusters, + &sbi->s_flex_groups[flex_group].free_clusters); } - ext4_mb_unload_buddy(&e4b); - - freed += count; - - if (!(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE)) + if (flags & EXT4_FREE_BLOCKS_RESERVE && ei->i_reserved_data_blocks) { + percpu_counter_add(&sbi->s_dirtyclusters_counter, + count_clusters); + spin_lock(&ei->i_block_reservation_lock); + if (flags & EXT4_FREE_BLOCKS_METADATA) + ei->i_reserved_meta_blocks += count_clusters; + else + ei->i_reserved_data_blocks += count_clusters; + spin_unlock(&ei->i_block_reservation_lock); + if (!(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE)) + dquot_reclaim_block(inode, + EXT4_C2B(sbi, count_clusters)); + } else if (!(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE)) dquot_free_block(inode, EXT4_C2B(sbi, count_clusters)); + percpu_counter_add(&sbi->s_freeclusters_counter, count_clusters); + + ext4_mb_unload_buddy(&e4b); /* We dirtied the bitmap block */ BUFFER_TRACE(bitmap_bh, "dirtied bitmap block"); @@ -4749,7 +4872,6 @@ do_more: put_bh(bitmap_bh); goto do_more; } - ext4_mark_super_dirty(sb); error_return: brelse(bitmap_bh); ext4_std_error(sb, err); @@ -4760,7 +4882,7 @@ error_return: * ext4_group_add_blocks() -- Add given blocks to an existing group * @handle: handle to this transaction * @sb: super block - * @block: start physcial block to add to the block group + * @block: start physical block to add to the block group * @count: number of blocks to free * * This marks the blocks as free in the bitmap and buddy. @@ -4860,15 +4982,16 @@ int ext4_group_add_blocks(handle_t *handle, struct super_block *sb, mb_free_blocks(NULL, &e4b, bit, count); blk_free_count = blocks_freed + ext4_free_group_clusters(sb, desc); ext4_free_group_clusters_set(sb, desc, blk_free_count); - desc->bg_checksum = ext4_group_desc_csum(sbi, block_group, desc); + ext4_block_bitmap_csum_set(sb, block_group, desc, bitmap_bh); + ext4_group_desc_csum_set(sb, block_group, desc); ext4_unlock_group(sb, block_group); percpu_counter_add(&sbi->s_freeclusters_counter, - EXT4_B2C(sbi, blocks_freed)); + EXT4_NUM_B2C(sbi, blocks_freed)); if (sbi->s_log_groups_per_flex) { ext4_group_t flex_group = ext4_flex_group(sbi, block_group); - atomic_add(EXT4_B2C(sbi, blocks_freed), - &sbi->s_flex_groups[flex_group].free_clusters); + atomic64_add(EXT4_NUM_B2C(sbi, blocks_freed), + &sbi->s_flex_groups[flex_group].free_clusters); } ext4_mb_unload_buddy(&e4b); @@ -4901,10 +5024,13 @@ error_return: * one will allocate those blocks, mark it as used in buddy bitmap. This must * be called with under the group lock. */ -static void ext4_trim_extent(struct super_block *sb, int start, int count, +static int ext4_trim_extent(struct super_block *sb, int start, int count, ext4_group_t group, struct ext4_buddy *e4b) +__releases(bitlock) +__acquires(bitlock) { struct ext4_free_extent ex; + int ret = 0; trace_ext4_trim_extent(sb, group, start, count); @@ -4920,9 +5046,10 @@ static void ext4_trim_extent(struct super_block *sb, int start, int count, */ mb_mark_used(e4b, &ex); ext4_unlock_group(sb, group); - ext4_issue_discard(sb, group, start, count); + ret = ext4_issue_discard(sb, group, start, count); ext4_lock_group(sb, group); mb_free_blocks(NULL, e4b, start, ex.fe_len); + return ret; } /** @@ -4951,7 +5078,7 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group, void *bitmap; ext4_grpblk_t next, count = 0, free_count = 0; struct ext4_buddy e4b; - int ret; + int ret = 0; trace_ext4_trim_all_free(sb, group, start, max); @@ -4971,15 +5098,18 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group, start = (e4b.bd_info->bb_first_free > start) ? e4b.bd_info->bb_first_free : start; - while (start < max) { - start = mb_find_next_zero_bit(bitmap, max, start); - if (start >= max) + while (start <= max) { + start = mb_find_next_zero_bit(bitmap, max + 1, start); + if (start > max) break; - next = mb_find_next_bit(bitmap, max, start); + next = mb_find_next_bit(bitmap, max + 1, start); if ((next - start) >= minblocks) { - ext4_trim_extent(sb, start, - next - start, group, &e4b); + ret = ext4_trim_extent(sb, start, + next - start, group, &e4b); + if (ret && ret != -EOPNOTSUPP) + break; + ret = 0; count += next - start; } free_count += next - start; @@ -5000,8 +5130,10 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group, break; } - if (!ret) + if (!ret) { + ret = count; EXT4_MB_GRP_SET_TRIMMED(e4b.bd_info); + } out: ext4_unlock_group(sb, group); ext4_mb_unload_buddy(&e4b); @@ -5009,7 +5141,7 @@ out: ext4_debug("trimmed %d blocks in the group %d\n", count, group); - return count; + return ret; } /** @@ -5027,37 +5159,38 @@ out: int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) { struct ext4_group_info *grp; - ext4_group_t first_group, last_group; - ext4_group_t group, ngroups = ext4_get_groups_count(sb); + ext4_group_t group, first_group, last_group; ext4_grpblk_t cnt = 0, first_cluster, last_cluster; - uint64_t start, len, minlen, trimmed = 0; + uint64_t start, end, minlen, trimmed = 0; ext4_fsblk_t first_data_blk = le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); + ext4_fsblk_t max_blks = ext4_blocks_count(EXT4_SB(sb)->s_es); int ret = 0; start = range->start >> sb->s_blocksize_bits; - len = range->len >> sb->s_blocksize_bits; - minlen = range->minlen >> sb->s_blocksize_bits; + end = start + (range->len >> sb->s_blocksize_bits) - 1; + minlen = EXT4_NUM_B2C(EXT4_SB(sb), + range->minlen >> sb->s_blocksize_bits); - if (unlikely(minlen > EXT4_CLUSTERS_PER_GROUP(sb))) + if (minlen > EXT4_CLUSTERS_PER_GROUP(sb) || + start >= max_blks || + range->len < sb->s_blocksize) return -EINVAL; - if (start + len <= first_data_blk) + if (end >= max_blks) + end = max_blks - 1; + if (end <= first_data_blk) goto out; - if (start < first_data_blk) { - len -= first_data_blk - start; + if (start < first_data_blk) start = first_data_blk; - } - /* Determine first and last group to examine based on start and len */ + /* Determine first and last group to examine based on start and end */ ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) start, &first_group, &first_cluster); - ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) (start + len), + ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) end, &last_group, &last_cluster); - last_group = (last_group > ngroups - 1) ? ngroups - 1 : last_group; - last_cluster = EXT4_CLUSTERS_PER_GROUP(sb); - if (first_group > last_group) - return -EINVAL; + /* end now represents the last cluster to discard in this group */ + end = EXT4_CLUSTERS_PER_GROUP(sb) - 1; for (group = first_group; group <= last_group; group++) { grp = ext4_get_group_info(sb, group); @@ -5069,31 +5202,35 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) } /* - * For all the groups except the last one, last block will - * always be EXT4_BLOCKS_PER_GROUP(sb), so we only need to - * change it for the last group in which case start + - * len < EXT4_BLOCKS_PER_GROUP(sb). + * For all the groups except the last one, last cluster will + * always be EXT4_CLUSTERS_PER_GROUP(sb)-1, so we only need to + * change it for the last group, note that last_cluster is + * already computed earlier by ext4_get_group_no_and_offset() */ - if (first_cluster + len < EXT4_CLUSTERS_PER_GROUP(sb)) - last_cluster = first_cluster + len; - len -= last_cluster - first_cluster; + if (group == last_group) + end = last_cluster; if (grp->bb_free >= minlen) { cnt = ext4_trim_all_free(sb, group, first_cluster, - last_cluster, minlen); + end, minlen); if (cnt < 0) { ret = cnt; break; } + trimmed += cnt; } - trimmed += cnt; + + /* + * For every group except the first one, we are sure + * that the first cluster to discard will be cluster #0. + */ first_cluster = 0; } - range->len = trimmed * sb->s_blocksize; if (!ret) atomic_set(&EXT4_SB(sb)->s_last_trim_minblks, minlen); out: + range->len = EXT4_C2B(EXT4_SB(sb), trimmed) << sb->s_blocksize_bits; return ret; } diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h index 47705f3285e..d634e183b4d 100644 --- a/fs/ext4/mballoc.h +++ b/fs/ext4/mballoc.h @@ -37,18 +37,18 @@ /* */ #ifdef CONFIG_EXT4_DEBUG -extern u8 mb_enable_debug; +extern ushort ext4_mballoc_debug; #define mb_debug(n, fmt, a...) \ do { \ - if ((n) <= mb_enable_debug) { \ + if ((n) <= ext4_mballoc_debug) { \ printk(KERN_DEBUG "(%s, %d): %s: ", \ __FILE__, __LINE__, __func__); \ printk(fmt, ## a); \ } \ } while (0) #else -#define mb_debug(n, fmt, a...) +#define mb_debug(n, fmt, a...) no_printk(fmt, ## a) #endif #define EXT4_MB_HISTORY_ALLOC 1 /* allocation */ @@ -65,11 +65,6 @@ extern u8 mb_enable_debug; #define MB_DEFAULT_MIN_TO_SCAN 10 /* - * How many groups mballoc will scan looking for the best chunk - */ -#define MB_DEFAULT_MAX_GROUPS_TO_SCAN 5 - -/* * with 'ext4_mb_stats' allocator will collect stats that will be * shown at umount. The collecting costs though! */ @@ -96,21 +91,23 @@ extern u8 mb_enable_debug; struct ext4_free_data { - /* this links the free block information from group_info */ - struct rb_node node; + /* MUST be the first member */ + struct ext4_journal_cb_entry efd_jce; + + /* ext4_free_data private data starts from here */ - /* this links the free block information from ext4_sb_info */ - struct list_head list; + /* this links the free block information from group_info */ + struct rb_node efd_node; /* group which free block extent belongs */ - ext4_group_t group; + ext4_group_t efd_group; /* free block extent */ - ext4_grpblk_t start_cluster; - ext4_grpblk_t count; + ext4_grpblk_t efd_start_cluster; + ext4_grpblk_t efd_count; /* transaction which freed this extent */ - tid_t t_tid; + tid_t efd_tid; }; struct ext4_prealloc_space { @@ -178,8 +175,6 @@ struct ext4_allocation_context { /* copy of the best found extent taken before preallocation efforts */ struct ext4_free_extent ac_f_ex; - /* number of iterations done. we have to track to limit searching */ - unsigned long ac_ex_scanned; __u16 ac_groups_scanned; __u16 ac_found; __u16 ac_tail; @@ -210,8 +205,6 @@ struct ext4_buddy { __u16 bd_blkbits; ext4_group_t bd_group; }; -#define EXT4_MB_BITMAP(e4b) ((e4b)->bd_bitmap) -#define EXT4_MB_BUDDY(e4b) ((e4b)->bd_buddy) static inline ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb, struct ext4_free_extent *fex) diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index e7d6bb0acfa..ec092437d3e 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c @@ -14,6 +14,7 @@ #include <linux/slab.h> #include "ext4_jbd2.h" +#include "ext4_extents.h" /* * The contiguous blocks details which can be @@ -38,7 +39,7 @@ static int finish_range(handle_t *handle, struct inode *inode, newext.ee_block = cpu_to_le32(lb->first_block); newext.ee_len = cpu_to_le16(lb->last_block - lb->first_block + 1); ext4_ext_store_pblock(&newext, lb->first_pblock); - path = ext4_ext_find_extent(inode, lb->first_block, NULL); + path = ext4_ext_find_extent(inode, lb->first_block, NULL, 0); if (IS_ERR(path)) { retval = PTR_ERR(path); @@ -425,7 +426,6 @@ static int free_ext_block(handle_t *handle, struct inode *inode) return retval; } return retval; - } int ext4_ext_migrate(struct inode *inode) @@ -455,23 +455,26 @@ int ext4_ext_migrate(struct inode *inode) */ return retval; - handle = ext4_journal_start(inode, - EXT4_DATA_TRANS_BLOCKS(inode->i_sb) + - EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + - EXT4_MAXQUOTAS_INIT_BLOCKS(inode->i_sb) - + 1); + /* + * Worst case we can touch the allocation bitmaps, a bgd + * block, and a block to link in the orphan list. We do need + * need to worry about credits for modifying the quota inode. + */ + handle = ext4_journal_start(inode, EXT4_HT_MIGRATE, + 4 + EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb)); + if (IS_ERR(handle)) { retval = PTR_ERR(handle); return retval; } goal = (((inode->i_ino - 1) / EXT4_INODES_PER_GROUP(inode->i_sb)) * EXT4_INODES_PER_GROUP(inode->i_sb)) + 1; - owner[0] = inode->i_uid; - owner[1] = inode->i_gid; + owner[0] = i_uid_read(inode); + owner[1] = i_gid_read(inode); tmp_inode = ext4_new_inode(handle, inode->i_sb->s_root->d_inode, S_IFREG, NULL, goal, owner); if (IS_ERR(tmp_inode)) { - retval = PTR_ERR(inode); + retval = PTR_ERR(tmp_inode); ext4_journal_stop(handle); return retval; } @@ -491,7 +494,7 @@ int ext4_ext_migrate(struct inode *inode) * superblock modification. * * For the tmp_inode we already have committed the - * trascation that created the inode. Later as and + * transaction that created the inode. Later as and * when we add extents we extent the journal */ /* @@ -502,11 +505,11 @@ int ext4_ext_migrate(struct inode *inode) * with i_data_sem held to prevent racing with block * allocation. */ - down_read((&EXT4_I(inode)->i_data_sem)); + down_read(&EXT4_I(inode)->i_data_sem); ext4_set_inode_state(inode, EXT4_STATE_EXT_MIGRATE); up_read((&EXT4_I(inode)->i_data_sem)); - handle = ext4_journal_start(inode, 1); + handle = ext4_journal_start(inode, EXT4_HT_MIGRATE, 1); if (IS_ERR(handle)) { /* * It is impossible to update on-disk structures without @@ -602,3 +605,64 @@ out: return retval; } + +/* + * Migrate a simple extent-based inode to use the i_blocks[] array + */ +int ext4_ind_migrate(struct inode *inode) +{ + struct ext4_extent_header *eh; + struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es; + struct ext4_inode_info *ei = EXT4_I(inode); + struct ext4_extent *ex; + unsigned int i, len; + ext4_fsblk_t blk; + handle_t *handle; + int ret; + + if (!EXT4_HAS_INCOMPAT_FEATURE(inode->i_sb, + EXT4_FEATURE_INCOMPAT_EXTENTS) || + (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) + return -EINVAL; + + if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, + EXT4_FEATURE_RO_COMPAT_BIGALLOC)) + return -EOPNOTSUPP; + + handle = ext4_journal_start(inode, EXT4_HT_MIGRATE, 1); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + down_write(&EXT4_I(inode)->i_data_sem); + ret = ext4_ext_check_inode(inode); + if (ret) + goto errout; + + eh = ext_inode_hdr(inode); + ex = EXT_FIRST_EXTENT(eh); + if (ext4_blocks_count(es) > EXT4_MAX_BLOCK_FILE_PHYS || + eh->eh_depth != 0 || le16_to_cpu(eh->eh_entries) > 1) { + ret = -EOPNOTSUPP; + goto errout; + } + if (eh->eh_entries == 0) + blk = len = 0; + else { + len = le16_to_cpu(ex->ee_len); + blk = ext4_ext_pblock(ex); + if (len > EXT4_NDIR_BLOCKS) { + ret = -EOPNOTSUPP; + goto errout; + } + } + + ext4_clear_inode_flag(inode, EXT4_INODE_EXTENTS); + memset(ei->i_data, 0, sizeof(ei->i_data)); + for (i=0; i < len; i++) + ei->i_data[i] = cpu_to_le32(blk++); + ext4_mark_inode_dirty(handle, inode); +errout: + ext4_journal_stop(handle); + up_write(&EXT4_I(inode)->i_data_sem); + return ret; +} diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c index 7ea4ba4eff2..32bce844c2e 100644 --- a/fs/ext4/mmp.c +++ b/fs/ext4/mmp.c @@ -6,18 +6,57 @@ #include "ext4.h" +/* Checksumming functions */ +static __le32 ext4_mmp_csum(struct super_block *sb, struct mmp_struct *mmp) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + int offset = offsetof(struct mmp_struct, mmp_checksum); + __u32 csum; + + csum = ext4_chksum(sbi, sbi->s_csum_seed, (char *)mmp, offset); + + return cpu_to_le32(csum); +} + +static int ext4_mmp_csum_verify(struct super_block *sb, struct mmp_struct *mmp) +{ + if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + return 1; + + return mmp->mmp_checksum == ext4_mmp_csum(sb, mmp); +} + +static void ext4_mmp_csum_set(struct super_block *sb, struct mmp_struct *mmp) +{ + if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + return; + + mmp->mmp_checksum = ext4_mmp_csum(sb, mmp); +} + /* * Write the MMP block using WRITE_SYNC to try to get the block on-disk * faster. */ -static int write_mmp_block(struct buffer_head *bh) +static int write_mmp_block(struct super_block *sb, struct buffer_head *bh) { + struct mmp_struct *mmp = (struct mmp_struct *)(bh->b_data); + + /* + * We protect against freezing so that we don't create dirty buffers + * on frozen filesystem. + */ + sb_start_write(sb); + ext4_mmp_csum_set(sb, mmp); mark_buffer_dirty(bh); lock_buffer(bh); bh->b_end_io = end_buffer_write_sync; get_bh(bh); - submit_bh(WRITE_SYNC, bh); + submit_bh(WRITE_SYNC | REQ_META | REQ_PRIO, bh); wait_on_buffer(bh); + sb_end_write(sb); if (unlikely(!buffer_uptodate(bh))) return 1; @@ -41,25 +80,28 @@ static int read_mmp_block(struct super_block *sb, struct buffer_head **bh, * is not blocked in the elevator. */ if (!*bh) *bh = sb_getblk(sb, mmp_block); + if (!*bh) + return -ENOMEM; if (*bh) { get_bh(*bh); lock_buffer(*bh); (*bh)->b_end_io = end_buffer_read_sync; - submit_bh(READ_SYNC, *bh); + submit_bh(READ_SYNC | REQ_META | REQ_PRIO, *bh); wait_on_buffer(*bh); if (!buffer_uptodate(*bh)) { brelse(*bh); *bh = NULL; } } - if (!*bh) { + if (unlikely(!*bh)) { ext4_warning(sb, "Error while reading MMP block %llu", mmp_block); return -EIO; } mmp = (struct mmp_struct *)((*bh)->b_data); - if (le32_to_cpu(mmp->mmp_magic) != EXT4_MMP_MAGIC) + if (le32_to_cpu(mmp->mmp_magic) != EXT4_MMP_MAGIC || + !ext4_mmp_csum_verify(sb, mmp)) return -EINVAL; return 0; @@ -120,7 +162,7 @@ static int kmmpd(void *data) mmp->mmp_time = cpu_to_le64(get_seconds()); last_update_time = jiffies; - retval = write_mmp_block(bh); + retval = write_mmp_block(sb, bh); /* * Don't spew too many error messages. Print one every * (s_mmp_update_interval * 60) seconds. @@ -200,7 +242,7 @@ static int kmmpd(void *data) mmp->mmp_seq = cpu_to_le32(EXT4_MMP_SEQ_CLEAN); mmp->mmp_time = cpu_to_le64(get_seconds()); - retval = write_mmp_block(bh); + retval = write_mmp_block(sb, bh); failed: kfree(data); @@ -217,7 +259,7 @@ static unsigned int mmp_new_seq(void) u32 new_seq; do { - get_random_bytes(&new_seq, sizeof(u32)); + new_seq = prandom_u32(); } while (new_seq > EXT4_MMP_SEQ_MAX); return new_seq; @@ -257,8 +299,8 @@ int ext4_multi_mount_protect(struct super_block *sb, * If check_interval in MMP block is larger, use that instead of * update_interval from the superblock. */ - if (mmp->mmp_check_interval > mmp_check_interval) - mmp_check_interval = mmp->mmp_check_interval; + if (le16_to_cpu(mmp->mmp_check_interval) > mmp_check_interval) + mmp_check_interval = le16_to_cpu(mmp->mmp_check_interval); seq = le32_to_cpu(mmp->mmp_seq); if (seq == EXT4_MMP_SEQ_CLEAN) @@ -299,7 +341,7 @@ skip: seq = mmp_new_seq(); mmp->mmp_seq = cpu_to_le32(seq); - retval = write_mmp_block(bh); + retval = write_mmp_block(sb, bh); if (retval) goto failed; diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index c5826c623e7..2484c7ec6a7 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -18,6 +18,7 @@ #include <linux/slab.h> #include "ext4_jbd2.h" #include "ext4.h" +#include "ext4_extents.h" /** * get_ext_path - Find an extent path for designated logical block number. @@ -31,16 +32,18 @@ */ static inline int get_ext_path(struct inode *inode, ext4_lblk_t lblock, - struct ext4_ext_path **path) + struct ext4_ext_path **orig_path) { int ret = 0; + struct ext4_ext_path *path; - *path = ext4_ext_find_extent(inode, lblock, *path); - if (IS_ERR(*path)) { - ret = PTR_ERR(*path); - *path = NULL; - } else if ((*path)[ext_depth(inode)].p_ext == NULL) + path = ext4_ext_find_extent(inode, lblock, *orig_path, EXT4_EX_NOCACHE); + if (IS_ERR(path)) + ret = PTR_ERR(path); + else if (path[ext_depth(inode)].p_ext == NULL) ret = -ENODATA; + else + *orig_path = path; return ret; } @@ -54,8 +57,8 @@ get_ext_path(struct inode *inode, ext4_lblk_t lblock, static void copy_extent_status(struct ext4_extent *src, struct ext4_extent *dest) { - if (ext4_ext_is_uninitialized(src)) - ext4_ext_mark_uninitialized(dest); + if (ext4_ext_is_unwritten(src)) + ext4_ext_mark_unwritten(dest); else dest->ee_len = cpu_to_le16(ext4_ext_get_actual_len(dest)); } @@ -73,7 +76,7 @@ copy_extent_status(struct ext4_extent *src, struct ext4_extent *dest) * ext4_ext_path structure refers to the last extent, or a negative error * value on failure. */ -static int +int mext_next_extent(struct inode *inode, struct ext4_ext_path *path, struct ext4_extent **extent) { @@ -141,66 +144,34 @@ mext_next_extent(struct inode *inode, struct ext4_ext_path *path, } /** - * mext_check_null_inode - NULL check for two inodes + * ext4_double_down_write_data_sem - Acquire two inodes' write lock + * of i_data_sem * - * If inode1 or inode2 is NULL, return -EIO. Otherwise, return 0. + * Acquire write lock of i_data_sem of the two inodes */ -static int -mext_check_null_inode(struct inode *inode1, struct inode *inode2, - const char *function, unsigned int line) +void +ext4_double_down_write_data_sem(struct inode *first, struct inode *second) { - int ret = 0; - - if (inode1 == NULL) { - __ext4_error(inode2->i_sb, function, line, - "Both inodes should not be NULL: " - "inode1 NULL inode2 %lu", inode2->i_ino); - ret = -EIO; - } else if (inode2 == NULL) { - __ext4_error(inode1->i_sb, function, line, - "Both inodes should not be NULL: " - "inode1 %lu inode2 NULL", inode1->i_ino); - ret = -EIO; - } - return ret; -} - -/** - * double_down_write_data_sem - Acquire two inodes' write lock of i_data_sem - * - * @orig_inode: original inode structure - * @donor_inode: donor inode structure - * Acquire write lock of i_data_sem of the two inodes (orig and donor) by - * i_ino order. - */ -static void -double_down_write_data_sem(struct inode *orig_inode, struct inode *donor_inode) -{ - struct inode *first = orig_inode, *second = donor_inode; + if (first < second) { + down_write(&EXT4_I(first)->i_data_sem); + down_write_nested(&EXT4_I(second)->i_data_sem, SINGLE_DEPTH_NESTING); + } else { + down_write(&EXT4_I(second)->i_data_sem); + down_write_nested(&EXT4_I(first)->i_data_sem, SINGLE_DEPTH_NESTING); - /* - * Use the inode number to provide the stable locking order instead - * of its address, because the C language doesn't guarantee you can - * compare pointers that don't come from the same array. - */ - if (donor_inode->i_ino < orig_inode->i_ino) { - first = donor_inode; - second = orig_inode; } - - down_write(&EXT4_I(first)->i_data_sem); - down_write_nested(&EXT4_I(second)->i_data_sem, SINGLE_DEPTH_NESTING); } /** - * double_up_write_data_sem - Release two inodes' write lock of i_data_sem + * ext4_double_up_write_data_sem - Release two inodes' write lock of i_data_sem * * @orig_inode: original inode structure to be released its lock first * @donor_inode: donor inode structure to be released its lock second * Release write lock of i_data_sem of two inodes (orig and donor). */ -static void -double_up_write_data_sem(struct inode *orig_inode, struct inode *donor_inode) +void +ext4_double_up_write_data_sem(struct inode *orig_inode, + struct inode *donor_inode) { up_write(&EXT4_I(orig_inode)->i_data_sem); up_write(&EXT4_I(donor_inode)->i_data_sem); @@ -420,6 +391,7 @@ mext_insert_extents(handle_t *handle, struct inode *orig_inode, if (depth) { /* Register to journal */ + BUFFER_TRACE(orig_path->p_bh, "get_write_access"); ret = ext4_journal_get_write_access(handle, orig_path->p_bh); if (ret) return ret; @@ -438,18 +410,7 @@ mext_insert_extents(handle_t *handle, struct inode *orig_inode, mext_insert_inside_block(o_start, o_end, start_ext, new_ext, end_ext, eh, range_to_move); - if (depth) { - ret = ext4_handle_dirty_metadata(handle, orig_inode, - orig_path->p_bh); - if (ret) - return ret; - } else { - ret = ext4_mark_inode_dirty(handle, orig_inode); - if (ret < 0) - return ret; - } - - return 0; + return ext4_ext_dirty(handle, orig_inode, orig_path); } /** @@ -604,9 +565,8 @@ mext_calc_swap_extents(struct ext4_extent *tmp_dext, diff = donor_off - le32_to_cpu(tmp_dext->ee_block); ext4_ext_store_pblock(tmp_dext, ext4_ext_pblock(tmp_dext) + diff); - tmp_dext->ee_block = - cpu_to_le32(le32_to_cpu(tmp_dext->ee_block) + diff); - tmp_dext->ee_len = cpu_to_le16(le16_to_cpu(tmp_dext->ee_len) - diff); + le32_add_cpu(&tmp_dext->ee_block, diff); + le16_add_cpu(&tmp_dext->ee_len, -diff); if (max_count < ext4_ext_get_actual_len(tmp_dext)) tmp_dext->ee_len = cpu_to_le16(max_count); @@ -629,6 +589,44 @@ mext_calc_swap_extents(struct ext4_extent *tmp_dext, } /** + * mext_check_coverage - Check that all extents in range has the same type + * + * @inode: inode in question + * @from: block offset of inode + * @count: block count to be checked + * @unwritten: extents expected to be unwritten + * @err: pointer to save error value + * + * Return 1 if all extents in range has expected type, and zero otherwise. + */ +static int +mext_check_coverage(struct inode *inode, ext4_lblk_t from, ext4_lblk_t count, + int unwritten, int *err) +{ + struct ext4_ext_path *path = NULL; + struct ext4_extent *ext; + int ret = 0; + ext4_lblk_t last = from + count; + while (from < last) { + *err = get_ext_path(inode, from, &path); + if (*err) + goto out; + ext = path[ext_depth(inode)].p_ext; + if (unwritten != ext4_ext_is_unwritten(ext)) + goto out; + from += ext4_ext_get_actual_len(ext); + ext4_ext_drop_refs(path); + } + ret = 1; +out: + if (path) { + ext4_ext_drop_refs(path); + kfree(path); + } + return ret; +} + +/** * mext_replace_branches - Replace original extents with new extents * * @handle: journal handle @@ -663,8 +661,13 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode, int replaced_count = 0; int dext_alen; - /* Protect extent trees against block allocations via delalloc */ - double_down_write_data_sem(orig_inode, donor_inode); + *err = ext4_es_remove_extent(orig_inode, from, count); + if (*err) + goto out; + + *err = ext4_es_remove_extent(donor_inode, from, count); + if (*err) + goto out; /* Get the original extent for the block "orig_off" */ *err = get_ext_path(orig_inode, orig_off, &orig_path); @@ -681,6 +684,8 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode, depth = ext_depth(donor_inode); dext = donor_path[depth].p_ext; + if (unlikely(!dext)) + goto missing_donor_extent; tmp_dext = *dext; *err = mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off, @@ -691,7 +696,8 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode, /* Loop for the donor extents */ while (1) { /* The extent for donor must be found. */ - if (!dext) { + if (unlikely(!dext)) { + missing_donor_extent: EXT4_ERROR_INODE(donor_inode, "The extent for donor must be found"); *err = -EIO; @@ -723,6 +729,7 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode, donor_off += dext_alen; orig_off += dext_alen; + BUG_ON(replaced_count > count); /* Already moved the expected blocks */ if (replaced_count >= count) break; @@ -761,12 +768,123 @@ out: kfree(donor_path); } - ext4_ext_invalidate_cache(orig_inode); - ext4_ext_invalidate_cache(donor_inode); + return replaced_count; +} - double_up_write_data_sem(orig_inode, donor_inode); +/** + * mext_page_double_lock - Grab and lock pages on both @inode1 and @inode2 + * + * @inode1: the inode structure + * @inode2: the inode structure + * @index: page index + * @page: result page vector + * + * Grab two locked pages for inode's by inode order + */ +static int +mext_page_double_lock(struct inode *inode1, struct inode *inode2, + pgoff_t index, struct page *page[2]) +{ + struct address_space *mapping[2]; + unsigned fl = AOP_FLAG_NOFS; - return replaced_count; + BUG_ON(!inode1 || !inode2); + if (inode1 < inode2) { + mapping[0] = inode1->i_mapping; + mapping[1] = inode2->i_mapping; + } else { + mapping[0] = inode2->i_mapping; + mapping[1] = inode1->i_mapping; + } + + page[0] = grab_cache_page_write_begin(mapping[0], index, fl); + if (!page[0]) + return -ENOMEM; + + page[1] = grab_cache_page_write_begin(mapping[1], index, fl); + if (!page[1]) { + unlock_page(page[0]); + page_cache_release(page[0]); + return -ENOMEM; + } + /* + * grab_cache_page_write_begin() may not wait on page's writeback if + * BDI not demand that. But it is reasonable to be very conservative + * here and explicitly wait on page's writeback + */ + wait_on_page_writeback(page[0]); + wait_on_page_writeback(page[1]); + if (inode1 > inode2) { + struct page *tmp; + tmp = page[0]; + page[0] = page[1]; + page[1] = tmp; + } + return 0; +} + +/* Force page buffers uptodate w/o dropping page's lock */ +static int +mext_page_mkuptodate(struct page *page, unsigned from, unsigned to) +{ + struct inode *inode = page->mapping->host; + sector_t block; + struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE]; + unsigned int blocksize, block_start, block_end; + int i, err, nr = 0, partial = 0; + BUG_ON(!PageLocked(page)); + BUG_ON(PageWriteback(page)); + + if (PageUptodate(page)) + return 0; + + blocksize = 1 << inode->i_blkbits; + if (!page_has_buffers(page)) + create_empty_buffers(page, blocksize, 0); + + head = page_buffers(page); + block = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits); + for (bh = head, block_start = 0; bh != head || !block_start; + block++, block_start = block_end, bh = bh->b_this_page) { + block_end = block_start + blocksize; + if (block_end <= from || block_start >= to) { + if (!buffer_uptodate(bh)) + partial = 1; + continue; + } + if (buffer_uptodate(bh)) + continue; + if (!buffer_mapped(bh)) { + err = ext4_get_block(inode, block, bh, 0); + if (err) { + SetPageError(page); + return err; + } + if (!buffer_mapped(bh)) { + zero_user(page, block_start, blocksize); + set_buffer_uptodate(bh); + continue; + } + } + BUG_ON(nr >= MAX_BUF_PER_PAGE); + arr[nr++] = bh; + } + /* No io required */ + if (!nr) + goto out; + + for (i = 0; i < nr; i++) { + bh = arr[i]; + if (!bh_uptodate_or_lock(bh)) { + err = bh_submit_read(bh); + if (err) + return err; + } + } +out: + if (!partial) + SetPageUptodate(page); + return 0; } /** @@ -777,7 +895,7 @@ out: * @orig_page_offset: page index on original file * @data_offset_in_page: block index where data swapping starts * @block_len_in_page: the number of blocks to be swapped - * @uninit: orig extent is uninitialized or not + * @unwritten: orig extent is unwritten or not * @err: pointer to save return value * * Save the data in original inode blocks and replace original inode extents @@ -788,31 +906,28 @@ out: static int move_extent_per_page(struct file *o_filp, struct inode *donor_inode, pgoff_t orig_page_offset, int data_offset_in_page, - int block_len_in_page, int uninit, int *err) + int block_len_in_page, int unwritten, int *err) { - struct inode *orig_inode = o_filp->f_dentry->d_inode; - struct address_space *mapping = orig_inode->i_mapping; - struct buffer_head *bh; - struct page *page = NULL; - const struct address_space_operations *a_ops = mapping->a_ops; + struct inode *orig_inode = file_inode(o_filp); + struct page *pagep[2] = {NULL, NULL}; handle_t *handle; ext4_lblk_t orig_blk_offset; - long long offs = orig_page_offset << PAGE_CACHE_SHIFT; unsigned long blocksize = orig_inode->i_sb->s_blocksize; unsigned int w_flags = 0; unsigned int tmp_data_size, data_size, replaced_size; - void *fsdata; - int i, jblocks; - int err2 = 0; + int err2, jblocks, retries = 0; int replaced_count = 0; + int from = data_offset_in_page << orig_inode->i_blkbits; int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits; /* * It needs twice the amount of ordinary journal buffers because * inode and donor_inode may change each different metadata blocks. */ +again: + *err = 0; jblocks = ext4_writepage_trans_blocks(orig_inode) * 2; - handle = ext4_journal_start(orig_inode, jblocks); + handle = ext4_journal_start(orig_inode, EXT4_HT_MOVE_EXTENTS, jblocks); if (IS_ERR(handle)) { *err = PTR_ERR(handle); return 0; @@ -824,21 +939,6 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode, orig_blk_offset = orig_page_offset * blocks_per_page + data_offset_in_page; - /* - * If orig extent is uninitialized one, - * it's not necessary force the page into memory - * and then force it to be written out again. - * Just swap data blocks between orig and donor. - */ - if (uninit) { - replaced_count = mext_replace_branches(handle, orig_inode, - donor_inode, orig_blk_offset, - block_len_in_page, err); - goto out2; - } - - offs = (long long)orig_blk_offset << orig_inode->i_blkbits; - /* Calculate data_size */ if ((orig_blk_offset + block_len_in_page - 1) == ((orig_inode->i_size - 1) >> orig_inode->i_blkbits)) { @@ -858,75 +958,120 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode, replaced_size = data_size; - *err = a_ops->write_begin(o_filp, mapping, offs, data_size, w_flags, - &page, &fsdata); + *err = mext_page_double_lock(orig_inode, donor_inode, orig_page_offset, + pagep); if (unlikely(*err < 0)) - goto out; - - if (!PageUptodate(page)) { - mapping->a_ops->readpage(o_filp, page); - lock_page(page); - } - + goto stop_journal; /* - * try_to_release_page() doesn't call releasepage in writeback mode. - * We should care about the order of writing to the same file - * by multiple move extent processes. - * It needs to call wait_on_page_writeback() to wait for the - * writeback of the page. + * If orig extent was unwritten it can become initialized + * at any time after i_data_sem was dropped, in order to + * serialize with delalloc we have recheck extent while we + * hold page's lock, if it is still the case data copy is not + * necessary, just swap data blocks between orig and donor. */ - wait_on_page_writeback(page); + if (unwritten) { + ext4_double_down_write_data_sem(orig_inode, donor_inode); + /* If any of extents in range became initialized we have to + * fallback to data copying */ + unwritten = mext_check_coverage(orig_inode, orig_blk_offset, + block_len_in_page, 1, err); + if (*err) + goto drop_data_sem; - /* Release old bh and drop refs */ - try_to_release_page(page, 0); + unwritten &= mext_check_coverage(donor_inode, orig_blk_offset, + block_len_in_page, 1, err); + if (*err) + goto drop_data_sem; + + if (!unwritten) { + ext4_double_up_write_data_sem(orig_inode, donor_inode); + goto data_copy; + } + if ((page_has_private(pagep[0]) && + !try_to_release_page(pagep[0], 0)) || + (page_has_private(pagep[1]) && + !try_to_release_page(pagep[1], 0))) { + *err = -EBUSY; + goto drop_data_sem; + } + replaced_count = mext_replace_branches(handle, orig_inode, + donor_inode, orig_blk_offset, + block_len_in_page, err); + drop_data_sem: + ext4_double_up_write_data_sem(orig_inode, donor_inode); + goto unlock_pages; + } +data_copy: + *err = mext_page_mkuptodate(pagep[0], from, from + replaced_size); + if (*err) + goto unlock_pages; + + /* At this point all buffers in range are uptodate, old mapping layout + * is no longer required, try to drop it now. */ + if ((page_has_private(pagep[0]) && !try_to_release_page(pagep[0], 0)) || + (page_has_private(pagep[1]) && !try_to_release_page(pagep[1], 0))) { + *err = -EBUSY; + goto unlock_pages; + } replaced_count = mext_replace_branches(handle, orig_inode, donor_inode, - orig_blk_offset, block_len_in_page, - &err2); - if (err2) { + orig_blk_offset, + block_len_in_page, err); + if (*err) { if (replaced_count) { block_len_in_page = replaced_count; replaced_size = block_len_in_page << orig_inode->i_blkbits; } else - goto out; - } - - if (!page_has_buffers(page)) - create_empty_buffers(page, 1 << orig_inode->i_blkbits, 0); - - bh = page_buffers(page); - for (i = 0; i < data_offset_in_page; i++) - bh = bh->b_this_page; - - for (i = 0; i < block_len_in_page; i++) { - *err = ext4_get_block(orig_inode, - (sector_t)(orig_blk_offset + i), bh, 0); - if (*err < 0) - goto out; - - if (bh->b_this_page != NULL) - bh = bh->b_this_page; + goto unlock_pages; } + /* Perform all necessary steps similar write_begin()/write_end() + * but keeping in mind that i_size will not change */ + *err = __block_write_begin(pagep[0], from, replaced_size, + ext4_get_block); + if (!*err) + *err = block_commit_write(pagep[0], from, from + replaced_size); - *err = a_ops->write_end(o_filp, mapping, offs, data_size, replaced_size, - page, fsdata); - page = NULL; - -out: - if (unlikely(page)) { - if (PageLocked(page)) - unlock_page(page); - page_cache_release(page); - ext4_journal_stop(handle); - } -out2: + if (unlikely(*err < 0)) + goto repair_branches; + + /* Even in case of data=writeback it is reasonable to pin + * inode to transaction, to prevent unexpected data loss */ + *err = ext4_jbd2_file_inode(handle, orig_inode); + +unlock_pages: + unlock_page(pagep[0]); + page_cache_release(pagep[0]); + unlock_page(pagep[1]); + page_cache_release(pagep[1]); +stop_journal: ext4_journal_stop(handle); - - if (err2) - *err = err2; - + /* Buffer was busy because probably is pinned to journal transaction, + * force transaction commit may help to free it. */ + if (*err == -EBUSY && ext4_should_retry_alloc(orig_inode->i_sb, + &retries)) + goto again; return replaced_count; + +repair_branches: + /* + * This should never ever happen! + * Extents are swapped already, but we are not able to copy data. + * Try to swap extents to it's original places + */ + ext4_double_down_write_data_sem(orig_inode, donor_inode); + replaced_count = mext_replace_branches(handle, donor_inode, orig_inode, + orig_blk_offset, + block_len_in_page, &err2); + ext4_double_up_write_data_sem(orig_inode, donor_inode); + if (replaced_count != block_len_in_page) { + EXT4_ERROR_INODE_BLOCK(orig_inode, (sector_t)(orig_blk_offset), + "Unable to copy data block," + " data will be lost."); + *err = -EIO; + } + replaced_count = 0; + goto unlock_pages; } /** @@ -969,14 +1114,6 @@ mext_check_arguments(struct inode *orig_inode, return -EINVAL; } - /* Files should be in the same ext4 FS */ - if (orig_inode->i_sb != donor_inode->i_sb) { - ext4_debug("ext4 move extent: The argument files " - "should be in same FS [ino:orig %lu, donor %lu]\n", - orig_inode->i_ino, donor_inode->i_ino); - return -EINVAL; - } - /* Ext4 move extent supports only extent based file */ if (!(ext4_test_inode_flag(orig_inode, EXT4_INODE_EXTENTS))) { ext4_debug("ext4 move extent: orig file is not extents " @@ -1002,7 +1139,6 @@ mext_check_arguments(struct inode *orig_inode, } if ((orig_start >= EXT_MAX_BLOCKS) || - (donor_start >= EXT_MAX_BLOCKS) || (*len > EXT_MAX_BLOCKS) || (orig_start + *len >= EXT_MAX_BLOCKS)) { ext4_debug("ext4 move extent: Can't handle over [%u] blocks " @@ -1067,73 +1203,6 @@ mext_check_arguments(struct inode *orig_inode, } /** - * mext_inode_double_lock - Lock i_mutex on both @inode1 and @inode2 - * - * @inode1: the inode structure - * @inode2: the inode structure - * - * Lock two inodes' i_mutex by i_ino order. - * If inode1 or inode2 is NULL, return -EIO. Otherwise, return 0. - */ -static int -mext_inode_double_lock(struct inode *inode1, struct inode *inode2) -{ - int ret = 0; - - BUG_ON(inode1 == NULL && inode2 == NULL); - - ret = mext_check_null_inode(inode1, inode2, __func__, __LINE__); - if (ret < 0) - goto out; - - if (inode1 == inode2) { - mutex_lock(&inode1->i_mutex); - goto out; - } - - if (inode1->i_ino < inode2->i_ino) { - mutex_lock_nested(&inode1->i_mutex, I_MUTEX_PARENT); - mutex_lock_nested(&inode2->i_mutex, I_MUTEX_CHILD); - } else { - mutex_lock_nested(&inode2->i_mutex, I_MUTEX_PARENT); - mutex_lock_nested(&inode1->i_mutex, I_MUTEX_CHILD); - } - -out: - return ret; -} - -/** - * mext_inode_double_unlock - Release i_mutex on both @inode1 and @inode2 - * - * @inode1: the inode that is released first - * @inode2: the inode that is released second - * - * If inode1 or inode2 is NULL, return -EIO. Otherwise, return 0. - */ - -static int -mext_inode_double_unlock(struct inode *inode1, struct inode *inode2) -{ - int ret = 0; - - BUG_ON(inode1 == NULL && inode2 == NULL); - - ret = mext_check_null_inode(inode1, inode2, __func__, __LINE__); - if (ret < 0) - goto out; - - if (inode1) - mutex_unlock(&inode1->i_mutex); - - if (inode2 && inode2 != inode1) - mutex_unlock(&inode2->i_mutex); - -out: - return ret; -} - -/** * ext4_move_extents - Exchange the specified range of a file * * @o_filp: file structure of the original file @@ -1179,24 +1248,31 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 orig_start, __u64 donor_start, __u64 len, __u64 *moved_len) { - struct inode *orig_inode = o_filp->f_dentry->d_inode; - struct inode *donor_inode = d_filp->f_dentry->d_inode; + struct inode *orig_inode = file_inode(o_filp); + struct inode *donor_inode = file_inode(d_filp); struct ext4_ext_path *orig_path = NULL, *holecheck_path = NULL; struct ext4_extent *ext_prev, *ext_cur, *ext_dummy; ext4_lblk_t block_start = orig_start; ext4_lblk_t block_end, seq_start, add_blocks, file_end, seq_blocks = 0; ext4_lblk_t rest_blocks; pgoff_t orig_page_offset = 0, seq_end_page; - int ret1, ret2, depth, last_extent = 0; + int ret, depth, last_extent = 0; int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits; int data_offset_in_page; int block_len_in_page; - int uninit; + int unwritten; + + if (orig_inode->i_sb != donor_inode->i_sb) { + ext4_debug("ext4 move extent: The argument files " + "should be in same FS [ino:orig %lu, donor %lu]\n", + orig_inode->i_ino, donor_inode->i_ino); + return -EINVAL; + } - /* orig and donor should be different file */ - if (orig_inode->i_ino == donor_inode->i_ino) { + /* orig and donor should be different inodes */ + if (orig_inode == donor_inode) { ext4_debug("ext4 move extent: The argument files should not " - "be same file [ino:orig %lu, donor %lu]\n", + "be same inode [ino:orig %lu, donor %lu]\n", orig_inode->i_ino, donor_inode->i_ino); return -EINVAL; } @@ -1208,18 +1284,27 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, orig_inode->i_ino, donor_inode->i_ino); return -EINVAL; } - + /* TODO: This is non obvious task to swap blocks for inodes with full + jornaling enabled */ + if (ext4_should_journal_data(orig_inode) || + ext4_should_journal_data(donor_inode)) { + return -EINVAL; + } /* Protect orig and donor inodes against a truncate */ - ret1 = mext_inode_double_lock(orig_inode, donor_inode); - if (ret1 < 0) - return ret1; + lock_two_nondirectories(orig_inode, donor_inode); + + /* Wait for all existing dio workers */ + ext4_inode_block_unlocked_dio(orig_inode); + ext4_inode_block_unlocked_dio(donor_inode); + inode_dio_wait(orig_inode); + inode_dio_wait(donor_inode); /* Protect extent tree against block allocations via delalloc */ - double_down_write_data_sem(orig_inode, donor_inode); + ext4_double_down_write_data_sem(orig_inode, donor_inode); /* Check the filesystem environment whether move_extent can be done */ - ret1 = mext_check_arguments(orig_inode, donor_inode, orig_start, + ret = mext_check_arguments(orig_inode, donor_inode, orig_start, donor_start, &len); - if (ret1) + if (ret) goto out; file_end = (i_size_read(orig_inode) - 1) >> orig_inode->i_blkbits; @@ -1227,13 +1312,13 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, if (file_end < block_end) len -= block_end - file_end; - ret1 = get_ext_path(orig_inode, block_start, &orig_path); - if (ret1) + ret = get_ext_path(orig_inode, block_start, &orig_path); + if (ret) goto out; /* Get path structure to check the hole */ - ret1 = get_ext_path(orig_inode, block_start, &holecheck_path); - if (ret1) + ret = get_ext_path(orig_inode, block_start, &holecheck_path); + if (ret) goto out; depth = ext_depth(orig_inode); @@ -1252,13 +1337,13 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, last_extent = mext_next_extent(orig_inode, holecheck_path, &ext_cur); if (last_extent < 0) { - ret1 = last_extent; + ret = last_extent; goto out; } last_extent = mext_next_extent(orig_inode, orig_path, &ext_dummy); if (last_extent < 0) { - ret1 = last_extent; + ret = last_extent; goto out; } seq_start = le32_to_cpu(ext_cur->ee_block); @@ -1272,7 +1357,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, if (le32_to_cpu(ext_cur->ee_block) > block_end) { ext4_debug("ext4 move extent: The specified range of file " "may be the hole\n"); - ret1 = -EINVAL; + ret = -EINVAL; goto out; } @@ -1292,7 +1377,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, last_extent = mext_next_extent(orig_inode, holecheck_path, &ext_cur); if (last_extent < 0) { - ret1 = last_extent; + ret = last_extent; break; } add_blocks = ext4_ext_get_actual_len(ext_cur); @@ -1307,8 +1392,8 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, !last_extent) continue; - /* Is original extent is uninitialized */ - uninit = ext4_ext_is_uninitialized(ext_prev); + /* Is original extent is unwritten */ + unwritten = ext4_ext_is_unwritten(ext_prev); data_offset_in_page = seq_start % blocks_per_page; @@ -1339,7 +1424,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, * b. racing with ->readpage, ->write_begin, and ext4_get_block * in move_extent_per_page */ - double_up_write_data_sem(orig_inode, donor_inode); + ext4_double_up_write_data_sem(orig_inode, donor_inode); while (orig_page_offset <= seq_end_page) { @@ -1348,19 +1433,19 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, o_filp, donor_inode, orig_page_offset, data_offset_in_page, - block_len_in_page, uninit, - &ret1); + block_len_in_page, + unwritten, &ret); /* Count how many blocks we have exchanged */ *moved_len += block_len_in_page; - if (ret1 < 0) + if (ret < 0) break; if (*moved_len > len) { EXT4_ERROR_INODE(orig_inode, "We replaced blocks too much! " "sum of replaced: %llu requested: %llu", *moved_len, len); - ret1 = -EIO; + ret = -EIO; break; } @@ -1373,23 +1458,23 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, block_len_in_page = rest_blocks; } - double_down_write_data_sem(orig_inode, donor_inode); - if (ret1 < 0) + ext4_double_down_write_data_sem(orig_inode, donor_inode); + if (ret < 0) break; /* Decrease buffer counter */ if (holecheck_path) ext4_ext_drop_refs(holecheck_path); - ret1 = get_ext_path(orig_inode, seq_start, &holecheck_path); - if (ret1) + ret = get_ext_path(orig_inode, seq_start, &holecheck_path); + if (ret) break; depth = holecheck_path->p_depth; /* Decrease buffer counter */ if (orig_path) ext4_ext_drop_refs(orig_path); - ret1 = get_ext_path(orig_inode, seq_start, &orig_path); - if (ret1) + ret = get_ext_path(orig_inode, seq_start, &orig_path); + if (ret) break; ext_cur = holecheck_path[depth].p_ext; @@ -1411,13 +1496,10 @@ out: ext4_ext_drop_refs(holecheck_path); kfree(holecheck_path); } - double_up_write_data_sem(orig_inode, donor_inode); - ret2 = mext_inode_double_unlock(orig_inode, donor_inode); + ext4_double_up_write_data_sem(orig_inode, donor_inode); + ext4_inode_resume_unlocked_dio(orig_inode); + ext4_inode_resume_unlocked_dio(donor_inode); + unlock_two_nondirectories(orig_inode, donor_inode); - if (ret1) - return ret1; - else if (ret2) - return ret2; - - return 0; + return ret; } diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 2043f482375..3520ab8a663 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -47,24 +47,111 @@ #define NAMEI_RA_CHUNKS 2 #define NAMEI_RA_BLOCKS 4 #define NAMEI_RA_SIZE (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS) -#define NAMEI_RA_INDEX(c,b) (((c) * NAMEI_RA_BLOCKS) + (b)) static struct buffer_head *ext4_append(handle_t *handle, struct inode *inode, - ext4_lblk_t *block, int *err) + ext4_lblk_t *block) { struct buffer_head *bh; + int err = 0; + + if (unlikely(EXT4_SB(inode->i_sb)->s_max_dir_size_kb && + ((inode->i_size >> 10) >= + EXT4_SB(inode->i_sb)->s_max_dir_size_kb))) + return ERR_PTR(-ENOSPC); *block = inode->i_size >> inode->i_sb->s_blocksize_bits; - bh = ext4_bread(handle, inode, *block, 1, err); - if (bh) { - inode->i_size += inode->i_sb->s_blocksize; - EXT4_I(inode)->i_disksize = inode->i_size; - *err = ext4_journal_get_write_access(handle, bh); - if (*err) { + bh = ext4_bread(handle, inode, *block, 1, &err); + if (!bh) + return ERR_PTR(err); + inode->i_size += inode->i_sb->s_blocksize; + EXT4_I(inode)->i_disksize = inode->i_size; + BUFFER_TRACE(bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, bh); + if (err) { + brelse(bh); + ext4_std_error(inode->i_sb, err); + return ERR_PTR(err); + } + return bh; +} + +static int ext4_dx_csum_verify(struct inode *inode, + struct ext4_dir_entry *dirent); + +typedef enum { + EITHER, INDEX, DIRENT +} dirblock_type_t; + +#define ext4_read_dirblock(inode, block, type) \ + __ext4_read_dirblock((inode), (block), (type), __LINE__) + +static struct buffer_head *__ext4_read_dirblock(struct inode *inode, + ext4_lblk_t block, + dirblock_type_t type, + unsigned int line) +{ + struct buffer_head *bh; + struct ext4_dir_entry *dirent; + int err = 0, is_dx_block = 0; + + bh = ext4_bread(NULL, inode, block, 0, &err); + if (!bh) { + if (err == 0) { + ext4_error_inode(inode, __func__, line, block, + "Directory hole found"); + return ERR_PTR(-EIO); + } + __ext4_warning(inode->i_sb, __func__, line, + "error reading directory block " + "(ino %lu, block %lu)", inode->i_ino, + (unsigned long) block); + return ERR_PTR(err); + } + dirent = (struct ext4_dir_entry *) bh->b_data; + /* Determine whether or not we have an index block */ + if (is_dx(inode)) { + if (block == 0) + is_dx_block = 1; + else if (ext4_rec_len_from_disk(dirent->rec_len, + inode->i_sb->s_blocksize) == + inode->i_sb->s_blocksize) + is_dx_block = 1; + } + if (!is_dx_block && type == INDEX) { + ext4_error_inode(inode, __func__, line, block, + "directory leaf block found instead of index block"); + return ERR_PTR(-EIO); + } + if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM) || + buffer_verified(bh)) + return bh; + + /* + * An empty leaf block can get mistaken for a index block; for + * this reason, we can only check the index checksum when the + * caller is sure it should be an index block. + */ + if (is_dx_block && type == INDEX) { + if (ext4_dx_csum_verify(inode, dirent)) + set_buffer_verified(bh); + else { + ext4_error_inode(inode, __func__, line, block, + "Directory index failed checksum"); brelse(bh); - bh = NULL; + return ERR_PTR(-EIO); + } + } + if (!is_dx_block) { + if (ext4_dirent_csum_verify(inode, dirent)) + set_buffer_verified(bh); + else { + ext4_error_inode(inode, __func__, line, block, + "Directory block failed checksum"); + brelse(bh); + return ERR_PTR(-EIO); } } return bh; @@ -145,6 +232,14 @@ struct dx_map_entry u16 size; }; +/* + * This goes at the end of each htree block. + */ +struct dx_tail { + u32 dt_reserved; + __le32 dt_checksum; /* crc32c(uuid+inum+dirblock) */ +}; + static inline ext4_lblk_t dx_get_block(struct dx_entry *entry); static void dx_set_block(struct dx_entry *entry, ext4_lblk_t value); static inline unsigned dx_get_hash(struct dx_entry *entry); @@ -180,6 +275,228 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, struct inode *inode); +/* checksumming functions */ +void initialize_dirent_tail(struct ext4_dir_entry_tail *t, + unsigned int blocksize) +{ + memset(t, 0, sizeof(struct ext4_dir_entry_tail)); + t->det_rec_len = ext4_rec_len_to_disk( + sizeof(struct ext4_dir_entry_tail), blocksize); + t->det_reserved_ft = EXT4_FT_DIR_CSUM; +} + +/* Walk through a dirent block to find a checksum "dirent" at the tail */ +static struct ext4_dir_entry_tail *get_dirent_tail(struct inode *inode, + struct ext4_dir_entry *de) +{ + struct ext4_dir_entry_tail *t; + +#ifdef PARANOID + struct ext4_dir_entry *d, *top; + + d = de; + top = (struct ext4_dir_entry *)(((void *)de) + + (EXT4_BLOCK_SIZE(inode->i_sb) - + sizeof(struct ext4_dir_entry_tail))); + while (d < top && d->rec_len) + d = (struct ext4_dir_entry *)(((void *)d) + + le16_to_cpu(d->rec_len)); + + if (d != top) + return NULL; + + t = (struct ext4_dir_entry_tail *)d; +#else + t = EXT4_DIRENT_TAIL(de, EXT4_BLOCK_SIZE(inode->i_sb)); +#endif + + if (t->det_reserved_zero1 || + le16_to_cpu(t->det_rec_len) != sizeof(struct ext4_dir_entry_tail) || + t->det_reserved_zero2 || + t->det_reserved_ft != EXT4_FT_DIR_CSUM) + return NULL; + + return t; +} + +static __le32 ext4_dirent_csum(struct inode *inode, + struct ext4_dir_entry *dirent, int size) +{ + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + struct ext4_inode_info *ei = EXT4_I(inode); + __u32 csum; + + csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)dirent, size); + return cpu_to_le32(csum); +} + +static void warn_no_space_for_csum(struct inode *inode) +{ + ext4_warning(inode->i_sb, "no space in directory inode %lu leaf for " + "checksum. Please run e2fsck -D.", inode->i_ino); +} + +int ext4_dirent_csum_verify(struct inode *inode, struct ext4_dir_entry *dirent) +{ + struct ext4_dir_entry_tail *t; + + if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + return 1; + + t = get_dirent_tail(inode, dirent); + if (!t) { + warn_no_space_for_csum(inode); + return 0; + } + + if (t->det_checksum != ext4_dirent_csum(inode, dirent, + (void *)t - (void *)dirent)) + return 0; + + return 1; +} + +static void ext4_dirent_csum_set(struct inode *inode, + struct ext4_dir_entry *dirent) +{ + struct ext4_dir_entry_tail *t; + + if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + return; + + t = get_dirent_tail(inode, dirent); + if (!t) { + warn_no_space_for_csum(inode); + return; + } + + t->det_checksum = ext4_dirent_csum(inode, dirent, + (void *)t - (void *)dirent); +} + +int ext4_handle_dirty_dirent_node(handle_t *handle, + struct inode *inode, + struct buffer_head *bh) +{ + ext4_dirent_csum_set(inode, (struct ext4_dir_entry *)bh->b_data); + return ext4_handle_dirty_metadata(handle, inode, bh); +} + +static struct dx_countlimit *get_dx_countlimit(struct inode *inode, + struct ext4_dir_entry *dirent, + int *offset) +{ + struct ext4_dir_entry *dp; + struct dx_root_info *root; + int count_offset; + + if (le16_to_cpu(dirent->rec_len) == EXT4_BLOCK_SIZE(inode->i_sb)) + count_offset = 8; + else if (le16_to_cpu(dirent->rec_len) == 12) { + dp = (struct ext4_dir_entry *)(((void *)dirent) + 12); + if (le16_to_cpu(dp->rec_len) != + EXT4_BLOCK_SIZE(inode->i_sb) - 12) + return NULL; + root = (struct dx_root_info *)(((void *)dp + 12)); + if (root->reserved_zero || + root->info_length != sizeof(struct dx_root_info)) + return NULL; + count_offset = 32; + } else + return NULL; + + if (offset) + *offset = count_offset; + return (struct dx_countlimit *)(((void *)dirent) + count_offset); +} + +static __le32 ext4_dx_csum(struct inode *inode, struct ext4_dir_entry *dirent, + int count_offset, int count, struct dx_tail *t) +{ + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + struct ext4_inode_info *ei = EXT4_I(inode); + __u32 csum; + __le32 save_csum; + int size; + + size = count_offset + (count * sizeof(struct dx_entry)); + save_csum = t->dt_checksum; + t->dt_checksum = 0; + csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)dirent, size); + csum = ext4_chksum(sbi, csum, (__u8 *)t, sizeof(struct dx_tail)); + t->dt_checksum = save_csum; + + return cpu_to_le32(csum); +} + +static int ext4_dx_csum_verify(struct inode *inode, + struct ext4_dir_entry *dirent) +{ + struct dx_countlimit *c; + struct dx_tail *t; + int count_offset, limit, count; + + if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + return 1; + + c = get_dx_countlimit(inode, dirent, &count_offset); + if (!c) { + EXT4_ERROR_INODE(inode, "dir seems corrupt? Run e2fsck -D."); + return 1; + } + limit = le16_to_cpu(c->limit); + count = le16_to_cpu(c->count); + if (count_offset + (limit * sizeof(struct dx_entry)) > + EXT4_BLOCK_SIZE(inode->i_sb) - sizeof(struct dx_tail)) { + warn_no_space_for_csum(inode); + return 1; + } + t = (struct dx_tail *)(((struct dx_entry *)c) + limit); + + if (t->dt_checksum != ext4_dx_csum(inode, dirent, count_offset, + count, t)) + return 0; + return 1; +} + +static void ext4_dx_csum_set(struct inode *inode, struct ext4_dir_entry *dirent) +{ + struct dx_countlimit *c; + struct dx_tail *t; + int count_offset, limit, count; + + if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + return; + + c = get_dx_countlimit(inode, dirent, &count_offset); + if (!c) { + EXT4_ERROR_INODE(inode, "dir seems corrupt? Run e2fsck -D."); + return; + } + limit = le16_to_cpu(c->limit); + count = le16_to_cpu(c->count); + if (count_offset + (limit * sizeof(struct dx_entry)) > + EXT4_BLOCK_SIZE(inode->i_sb) - sizeof(struct dx_tail)) { + warn_no_space_for_csum(inode); + return; + } + t = (struct dx_tail *)(((struct dx_entry *)c) + limit); + + t->dt_checksum = ext4_dx_csum(inode, dirent, count_offset, count, t); +} + +static inline int ext4_handle_dirty_dx_node(handle_t *handle, + struct inode *inode, + struct buffer_head *bh) +{ + ext4_dx_csum_set(inode, (struct ext4_dir_entry *)bh->b_data); + return ext4_handle_dirty_metadata(handle, inode, bh); +} + /* * p is at least 6 bytes before the end of page */ @@ -239,12 +556,20 @@ static inline unsigned dx_root_limit(struct inode *dir, unsigned infosize) { unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(1) - EXT4_DIR_REC_LEN(2) - infosize; + + if (EXT4_HAS_RO_COMPAT_FEATURE(dir->i_sb, + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + entry_space -= sizeof(struct dx_tail); return entry_space / sizeof(struct dx_entry); } static inline unsigned dx_node_limit(struct inode *dir) { unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(0); + + if (EXT4_HAS_RO_COMPAT_FEATURE(dir->i_sb, + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + entry_space -= sizeof(struct dx_tail); return entry_space / sizeof(struct dx_entry); } @@ -354,8 +679,11 @@ dx_probe(const struct qstr *d_name, struct inode *dir, u32 hash; frame->bh = NULL; - if (!(bh = ext4_bread (NULL,dir, 0, 0, err))) + bh = ext4_read_dirblock(dir, 0, INDEX); + if (IS_ERR(bh)) { + *err = PTR_ERR(bh); goto fail; + } root = (struct dx_root *) bh->b_data; if (root->info.hash_version != DX_HASH_TEA && root->info.hash_version != DX_HASH_HALF_MD4 && @@ -447,9 +775,13 @@ dx_probe(const struct qstr *d_name, struct inode *dir, frame->entries = entries; frame->at = at; if (!indirect--) return frame; - if (!(bh = ext4_bread (NULL,dir, dx_get_block(at), 0, err))) + bh = ext4_read_dirblock(dir, dx_get_block(at), INDEX); + if (IS_ERR(bh)) { + *err = PTR_ERR(bh); goto fail2; - at = entries = ((struct dx_node *) bh->b_data)->entries; + } + entries = ((struct dx_node *) bh->b_data)->entries; + if (dx_get_limit(entries) != dx_node_limit (dir)) { ext4_warning(dir->i_sb, "dx entry: limit != node limit"); @@ -468,7 +800,7 @@ fail2: fail: if (*err == ERR_BAD_DX_DIR) ext4_warning(dir->i_sb, - "Corrupt dir inode %ld, running e2fsck is " + "Corrupt dir inode %lu, running e2fsck is " "recommended.", dir->i_ino); return NULL; } @@ -507,7 +839,7 @@ static int ext4_htree_next_block(struct inode *dir, __u32 hash, { struct dx_frame *p; struct buffer_head *bh; - int err, num_frames = 0; + int num_frames = 0; __u32 bhash; p = frame; @@ -546,9 +878,9 @@ static int ext4_htree_next_block(struct inode *dir, __u32 hash, * block so no check is necessary */ while (num_frames--) { - if (!(bh = ext4_bread(NULL, dir, dx_get_block(p->at), - 0, &err))) - return err; /* Failure */ + bh = ext4_read_dirblock(dir, dx_get_block(p->at), INDEX); + if (IS_ERR(bh)) + return PTR_ERR(bh); p++; brelse(p->bh); p->bh = bh; @@ -570,12 +902,13 @@ static int htree_dirblock_to_tree(struct file *dir_file, { struct buffer_head *bh; struct ext4_dir_entry_2 *de, *top; - int err, count = 0; + int err = 0, count = 0; dxtrace(printk(KERN_INFO "In htree dirblock_to_tree: block %lu\n", (unsigned long)block)); - if (!(bh = ext4_bread (NULL, dir, block, 0, &err))) - return err; + bh = ext4_read_dirblock(dir, block, DIRENT); + if (IS_ERR(bh)) + return PTR_ERR(bh); de = (struct ext4_dir_entry_2 *) bh->b_data; top = (struct ext4_dir_entry_2 *) ((char *) de + @@ -583,13 +916,11 @@ static int htree_dirblock_to_tree(struct file *dir_file, EXT4_DIR_REC_LEN(0)); for (; de < top; de = ext4_next_entry(de, dir->i_sb->s_blocksize)) { if (ext4_check_dir_entry(dir, NULL, de, bh, + bh->b_data, bh->b_size, (block<<EXT4_BLOCK_SIZE_BITS(dir->i_sb)) + ((char *)de - bh->b_data))) { - /* On error, skip the f_pos to the next block. */ - dir_file->f_pos = (dir_file->f_pos | - (dir->i_sb->s_blocksize - 1)) + 1; - brelse(bh); - return count; + /* silently ignore the rest of the block */ + break; } ext4fs_dirhash(de->name, de->name_len, hinfo); if ((hinfo->hash < start_hash) || @@ -632,13 +963,24 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash, dxtrace(printk(KERN_DEBUG "In htree_fill_tree, start hash: %x:%x\n", start_hash, start_minor_hash)); - dir = dir_file->f_path.dentry->d_inode; + dir = file_inode(dir_file); if (!(ext4_test_inode_flag(dir, EXT4_INODE_INDEX))) { hinfo.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version; if (hinfo.hash_version <= DX_HASH_TEA) hinfo.hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned; hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed; + if (ext4_has_inline_data(dir)) { + int has_inline_data = 1; + count = htree_inlinedir_to_tree(dir_file, dir, 0, + &hinfo, start_hash, + start_minor_hash, + &has_inline_data); + if (has_inline_data) { + *next_hash = ~0; + return count; + } + } count = htree_dirblock_to_tree(dir_file, dir, 0, &hinfo, start_hash, start_minor_hash); *next_hash = ~0; @@ -700,6 +1042,15 @@ errout: return (err); } +static inline int search_dirblock(struct buffer_head *bh, + struct inode *dir, + const struct qstr *d_name, + unsigned int offset, + struct ext4_dir_entry_2 **res_dir) +{ + return search_dir(bh, bh->b_data, dir->i_sb->s_blocksize, dir, + d_name, offset, res_dir); +} /* * Directory block splitting, compacting @@ -774,13 +1125,6 @@ static void dx_insert_block(struct dx_frame *frame, u32 hash, ext4_lblk_t block) dx_set_count(entries, count + 1); } -static void ext4_update_dx_flag(struct inode *inode) -{ - if (!EXT4_HAS_COMPAT_FEATURE(inode->i_sb, - EXT4_FEATURE_COMPAT_DIR_INDEX)) - ext4_clear_inode_flag(inode, EXT4_INODE_INDEX); -} - /* * NOTE! unlike strncmp, ext4_match returns 1 for success, 0 for failure. * @@ -800,11 +1144,13 @@ static inline int ext4_match (int len, const char * const name, /* * Returns 0 if not found, -1 on failure, and 1 on success */ -static inline int search_dirblock(struct buffer_head *bh, - struct inode *dir, - const struct qstr *d_name, - unsigned int offset, - struct ext4_dir_entry_2 ** res_dir) +int search_dir(struct buffer_head *bh, + char *search_buf, + int buf_size, + struct inode *dir, + const struct qstr *d_name, + unsigned int offset, + struct ext4_dir_entry_2 **res_dir) { struct ext4_dir_entry_2 * de; char * dlimit; @@ -812,8 +1158,8 @@ static inline int search_dirblock(struct buffer_head *bh, const char *name = d_name->name; int namelen = d_name->len; - de = (struct ext4_dir_entry_2 *) bh->b_data; - dlimit = bh->b_data + dir->i_sb->s_blocksize; + de = (struct ext4_dir_entry_2 *)search_buf; + dlimit = search_buf + buf_size; while ((char *) de < dlimit) { /* this code is executed quadratically often */ /* do minimal checking `by hand' */ @@ -821,7 +1167,8 @@ static inline int search_dirblock(struct buffer_head *bh, if ((char *) de + namelen <= dlimit && ext4_match (namelen, name, de)) { /* found a match - just to be sure, do a full check */ - if (ext4_check_dir_entry(dir, NULL, de, bh, offset)) + if (ext4_check_dir_entry(dir, NULL, de, bh, bh->b_data, + bh->b_size, offset)) return -1; *res_dir = de; return 1; @@ -837,6 +1184,21 @@ static inline int search_dirblock(struct buffer_head *bh, return 0; } +static int is_dx_internal_node(struct inode *dir, ext4_lblk_t block, + struct ext4_dir_entry *de) +{ + struct super_block *sb = dir->i_sb; + + if (!is_dx(dir)) + return 0; + if (block == 0) + return 1; + if (de->inode == 0 && + ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize) == + sb->s_blocksize) + return 1; + return 0; +} /* * ext4_find_entry() @@ -851,7 +1213,8 @@ static inline int search_dirblock(struct buffer_head *bh, */ static struct buffer_head * ext4_find_entry (struct inode *dir, const struct qstr *d_name, - struct ext4_dir_entry_2 ** res_dir) + struct ext4_dir_entry_2 **res_dir, + int *inlined) { struct super_block *sb; struct buffer_head *bh_use[NAMEI_RA_SIZE]; @@ -872,6 +1235,18 @@ static struct buffer_head * ext4_find_entry (struct inode *dir, namelen = d_name->len; if (namelen > EXT4_NAME_LEN) return NULL; + + if (ext4_has_inline_data(dir)) { + int has_inline_data = 1; + ret = ext4_find_inline_entry(dir, d_name, res_dir, + &has_inline_data); + if (has_inline_data) { + if (inlined) + *inlined = 1; + return ret; + } + } + if ((namelen <= 2) && (name[0] == '.') && (name[1] == '.' || name[1] == '\0')) { /* @@ -936,6 +1311,17 @@ restart: brelse(bh); goto next; } + if (!buffer_verified(bh) && + !is_dx_internal_node(dir, block, + (struct ext4_dir_entry *)bh->b_data) && + !ext4_dirent_csum_verify(dir, + (struct ext4_dir_entry *)bh->b_data)) { + EXT4_ERROR_INODE(dir, "checksumming directory " + "block %lu", (unsigned long)block); + brelse(bh); + goto next; + } + set_buffer_verified(bh); i = search_dirblock(bh, dir, d_name, block << EXT4_BLOCK_SIZE_BITS(sb), res_dir); if (i == 1) { @@ -984,9 +1370,11 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct q return NULL; do { block = dx_get_block(frame->at); - if (!(bh = ext4_bread(NULL, dir, block, 0, err))) + bh = ext4_read_dirblock(dir, block, DIRENT); + if (IS_ERR(bh)) { + *err = PTR_ERR(bh); goto errout; - + } retval = search_dirblock(bh, dir, d_name, block << EXT4_BLOCK_SIZE_BITS(sb), res_dir); @@ -1019,7 +1407,7 @@ errout: return NULL; } -static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) +static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { struct inode *inode; struct ext4_dir_entry_2 *de; @@ -1028,7 +1416,7 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, stru if (dentry->d_name.len > EXT4_NAME_LEN) return ERR_PTR(-ENAMETOOLONG); - bh = ext4_find_entry(dir, &dentry->d_name, &de); + bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL); inode = NULL; if (bh) { __u32 ino = le32_to_cpu(de->inode); @@ -1037,6 +1425,11 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, stru EXT4_ERROR_INODE(dir, "bad inode number: %u", ino); return ERR_PTR(-EIO); } + if (unlikely(ino == dir->i_ino)) { + EXT4_ERROR_INODE(dir, "'%pd' linked to parent dir", + dentry); + return ERR_PTR(-EIO); + } inode = ext4_iget(dir->i_sb, ino); if (inode == ERR_PTR(-ESTALE)) { EXT4_ERROR_INODE(dir, @@ -1052,14 +1445,11 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, stru struct dentry *ext4_get_parent(struct dentry *child) { __u32 ino; - static const struct qstr dotdot = { - .name = "..", - .len = 2, - }; + static const struct qstr dotdot = QSTR_INIT("..", 2); struct ext4_dir_entry_2 * de; struct buffer_head *bh; - bh = ext4_find_entry(child->d_inode, &dotdot, &de); + bh = ext4_find_entry(child->d_inode, &dotdot, &de, NULL); if (!bh) return ERR_PTR(-ENOENT); ino = le32_to_cpu(de->inode); @@ -1074,24 +1464,6 @@ struct dentry *ext4_get_parent(struct dentry *child) return d_obtain_alias(ext4_iget(child->d_inode->i_sb, ino)); } -#define S_SHIFT 12 -static unsigned char ext4_type_by_mode[S_IFMT >> S_SHIFT] = { - [S_IFREG >> S_SHIFT] = EXT4_FT_REG_FILE, - [S_IFDIR >> S_SHIFT] = EXT4_FT_DIR, - [S_IFCHR >> S_SHIFT] = EXT4_FT_CHRDEV, - [S_IFBLK >> S_SHIFT] = EXT4_FT_BLKDEV, - [S_IFIFO >> S_SHIFT] = EXT4_FT_FIFO, - [S_IFSOCK >> S_SHIFT] = EXT4_FT_SOCK, - [S_IFLNK >> S_SHIFT] = EXT4_FT_SYMLINK, -}; - -static inline void ext4_set_de_type(struct super_block *sb, - struct ext4_dir_entry_2 *de, - umode_t mode) { - if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FILETYPE)) - de->file_type = ext4_type_by_mode[(mode & S_IFMT)>>S_SHIFT]; -} - /* * Move count entries from end of map between two memory locations. * Returns pointer to last entry moved. @@ -1159,13 +1531,20 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, char *data1 = (*bh)->b_data, *data2; unsigned split, move, size; struct ext4_dir_entry_2 *de = NULL, *de2; + struct ext4_dir_entry_tail *t; + int csum_size = 0; int err = 0, i; - bh2 = ext4_append (handle, dir, &newblock, &err); - if (!(bh2)) { + if (EXT4_HAS_RO_COMPAT_FEATURE(dir->i_sb, + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + csum_size = sizeof(struct ext4_dir_entry_tail); + + bh2 = ext4_append(handle, dir, &newblock); + if (IS_ERR(bh2)) { brelse(*bh); *bh = NULL; - goto errout; + *error = PTR_ERR(bh2); + return NULL; } BUFFER_TRACE(*bh, "get_write_access"); @@ -1207,10 +1586,20 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, /* Fancy dance to stay within two buffers */ de2 = dx_move_dirents(data1, data2, map + split, count - split, blocksize); de = dx_pack_dirents(data1, blocksize); - de->rec_len = ext4_rec_len_to_disk(data1 + blocksize - (char *) de, + de->rec_len = ext4_rec_len_to_disk(data1 + (blocksize - csum_size) - + (char *) de, blocksize); - de2->rec_len = ext4_rec_len_to_disk(data2 + blocksize - (char *) de2, + de2->rec_len = ext4_rec_len_to_disk(data2 + (blocksize - csum_size) - + (char *) de2, blocksize); + if (csum_size) { + t = EXT4_DIRENT_TAIL(data2, blocksize); + initialize_dirent_tail(t, blocksize); + + t = EXT4_DIRENT_TAIL(data1, blocksize); + initialize_dirent_tail(t, blocksize); + } + dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data1, blocksize, 1)); dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data2, blocksize, 1)); @@ -1221,10 +1610,10 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, de = de2; } dx_insert_block(frame, hash2 + continued, newblock); - err = ext4_handle_dirty_metadata(handle, dir, bh2); + err = ext4_handle_dirty_dirent_node(handle, dir, bh2); if (err) goto journal_error; - err = ext4_handle_dirty_metadata(handle, dir, frame->bh); + err = ext4_handle_dirty_dx_node(handle, dir, frame->bh); if (err) goto journal_error; brelse(bh2); @@ -1236,11 +1625,67 @@ journal_error: brelse(bh2); *bh = NULL; ext4_std_error(dir->i_sb, err); -errout: *error = err; return NULL; } +int ext4_find_dest_de(struct inode *dir, struct inode *inode, + struct buffer_head *bh, + void *buf, int buf_size, + const char *name, int namelen, + struct ext4_dir_entry_2 **dest_de) +{ + struct ext4_dir_entry_2 *de; + unsigned short reclen = EXT4_DIR_REC_LEN(namelen); + int nlen, rlen; + unsigned int offset = 0; + char *top; + + de = (struct ext4_dir_entry_2 *)buf; + top = buf + buf_size - reclen; + while ((char *) de <= top) { + if (ext4_check_dir_entry(dir, NULL, de, bh, + buf, buf_size, offset)) + return -EIO; + if (ext4_match(namelen, name, de)) + return -EEXIST; + nlen = EXT4_DIR_REC_LEN(de->name_len); + rlen = ext4_rec_len_from_disk(de->rec_len, buf_size); + if ((de->inode ? rlen - nlen : rlen) >= reclen) + break; + de = (struct ext4_dir_entry_2 *)((char *)de + rlen); + offset += rlen; + } + if ((char *) de > top) + return -ENOSPC; + + *dest_de = de; + return 0; +} + +void ext4_insert_dentry(struct inode *inode, + struct ext4_dir_entry_2 *de, + int buf_size, + const char *name, int namelen) +{ + + int nlen, rlen; + + nlen = EXT4_DIR_REC_LEN(de->name_len); + rlen = ext4_rec_len_from_disk(de->rec_len, buf_size); + if (de->inode) { + struct ext4_dir_entry_2 *de1 = + (struct ext4_dir_entry_2 *)((char *)de + nlen); + de1->rec_len = ext4_rec_len_to_disk(rlen - nlen, buf_size); + de->rec_len = ext4_rec_len_to_disk(nlen, buf_size); + de = de1; + } + de->file_type = EXT4_FT_UNKNOWN; + de->inode = cpu_to_le32(inode->i_ino); + ext4_set_de_type(inode->i_sb, de, inode->i_mode); + de->name_len = namelen; + memcpy(de->name, name, namelen); +} /* * Add a new entry into a directory (leaf) block. If de is non-NULL, * it points to a directory entry which is guaranteed to be large @@ -1256,30 +1701,20 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, struct inode *dir = dentry->d_parent->d_inode; const char *name = dentry->d_name.name; int namelen = dentry->d_name.len; - unsigned int offset = 0; unsigned int blocksize = dir->i_sb->s_blocksize; - unsigned short reclen; - int nlen, rlen, err; - char *top; + int csum_size = 0; + int err; + + if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + csum_size = sizeof(struct ext4_dir_entry_tail); - reclen = EXT4_DIR_REC_LEN(namelen); if (!de) { - de = (struct ext4_dir_entry_2 *)bh->b_data; - top = bh->b_data + blocksize - reclen; - while ((char *) de <= top) { - if (ext4_check_dir_entry(dir, NULL, de, bh, offset)) - return -EIO; - if (ext4_match(namelen, name, de)) - return -EEXIST; - nlen = EXT4_DIR_REC_LEN(de->name_len); - rlen = ext4_rec_len_from_disk(de->rec_len, blocksize); - if ((de->inode? rlen - nlen: rlen) >= reclen) - break; - de = (struct ext4_dir_entry_2 *)((char *)de + rlen); - offset += rlen; - } - if ((char *) de > top) - return -ENOSPC; + err = ext4_find_dest_de(dir, inode, + bh, bh->b_data, blocksize - csum_size, + name, namelen, &de); + if (err) + return err; } BUFFER_TRACE(bh, "get_write_access"); err = ext4_journal_get_write_access(handle, bh); @@ -1289,22 +1724,8 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, } /* By now the buffer is marked for journaling */ - nlen = EXT4_DIR_REC_LEN(de->name_len); - rlen = ext4_rec_len_from_disk(de->rec_len, blocksize); - if (de->inode) { - struct ext4_dir_entry_2 *de1 = (struct ext4_dir_entry_2 *)((char *)de + nlen); - de1->rec_len = ext4_rec_len_to_disk(rlen - nlen, blocksize); - de->rec_len = ext4_rec_len_to_disk(nlen, blocksize); - de = de1; - } - de->file_type = EXT4_FT_UNKNOWN; - if (inode) { - de->inode = cpu_to_le32(inode->i_ino); - ext4_set_de_type(dir->i_sb, de, inode->i_mode); - } else - de->inode = 0; - de->name_len = namelen; - memcpy(de->name, name, namelen); + ext4_insert_dentry(inode, de, blocksize, name, namelen); + /* * XXX shouldn't update any times until successful * completion of syscall, but too many callers depend @@ -1321,7 +1742,7 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, dir->i_version++; ext4_mark_inode_dirty(handle, dir); BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); - err = ext4_handle_dirty_metadata(handle, dir, bh); + err = ext4_handle_dirty_dirent_node(handle, dir, bh); if (err) ext4_std_error(dir->i_sb, err); return 0; @@ -1342,6 +1763,7 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry, struct dx_frame frames[2], *frame; struct dx_entry *entries; struct ext4_dir_entry_2 *de, *de2; + struct ext4_dir_entry_tail *t; char *data1, *top; unsigned len; int retval; @@ -1349,9 +1771,15 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry, struct dx_hash_info hinfo; ext4_lblk_t block; struct fake_dirent *fde; + int csum_size = 0; + + if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + csum_size = sizeof(struct ext4_dir_entry_tail); blocksize = dir->i_sb->s_blocksize; dxtrace(printk(KERN_DEBUG "Creating index: inode %lu\n", dir->i_ino)); + BUFFER_TRACE(bh, "get_write_access"); retval = ext4_journal_get_write_access(handle, bh); if (retval) { ext4_std_error(dir->i_sb, retval); @@ -1369,13 +1797,13 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry, brelse(bh); return -EIO; } - len = ((char *) root) + blocksize - (char *) de; + len = ((char *) root) + (blocksize - csum_size) - (char *) de; /* Allocate new block for the 0th block's dirents */ - bh2 = ext4_append(handle, dir, &block, &retval); - if (!(bh2)) { + bh2 = ext4_append(handle, dir, &block); + if (IS_ERR(bh2)) { brelse(bh); - return retval; + return PTR_ERR(bh2); } ext4_set_inode_flag(dir, EXT4_INODE_INDEX); data1 = bh2->b_data; @@ -1385,8 +1813,15 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry, top = data1 + len; while ((char *)(de2 = ext4_next_entry(de, blocksize)) < top) de = de2; - de->rec_len = ext4_rec_len_to_disk(data1 + blocksize - (char *) de, + de->rec_len = ext4_rec_len_to_disk(data1 + (blocksize - csum_size) - + (char *) de, blocksize); + + if (csum_size) { + t = EXT4_DIRENT_TAIL(data1, blocksize); + initialize_dirent_tail(t, blocksize); + } + /* Initialize the root; the dot dirents already exist */ de = (struct ext4_dir_entry_2 *) (&root->dotdot); de->rec_len = ext4_rec_len_to_disk(blocksize - EXT4_DIR_REC_LEN(2), @@ -1411,8 +1846,8 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry, frame->bh = bh; bh = bh2; - ext4_handle_dirty_metadata(handle, dir, frame->bh); - ext4_handle_dirty_metadata(handle, dir, bh); + ext4_handle_dirty_dx_node(handle, dir, frame->bh); + ext4_handle_dirty_dirent_node(handle, dir, bh); de = do_split(handle,dir, &bh, frame, &hinfo, &retval); if (!de) { @@ -1448,16 +1883,33 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, struct inode *dir = dentry->d_parent->d_inode; struct buffer_head *bh; struct ext4_dir_entry_2 *de; + struct ext4_dir_entry_tail *t; struct super_block *sb; int retval; int dx_fallback=0; unsigned blocksize; ext4_lblk_t block, blocks; + int csum_size = 0; + + if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + csum_size = sizeof(struct ext4_dir_entry_tail); sb = dir->i_sb; blocksize = sb->s_blocksize; if (!dentry->d_name.len) return -EINVAL; + + if (ext4_has_inline_data(dir)) { + retval = ext4_try_add_inline_entry(handle, dentry, inode); + if (retval < 0) + return retval; + if (retval == 1) { + retval = 0; + return retval; + } + } + if (is_dx(dir)) { retval = ext4_dx_add_entry(handle, dentry, inode); if (!retval || (retval != ERR_BAD_DX_DIR)) @@ -1468,9 +1920,10 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, } blocks = dir->i_size >> sb->s_blocksize_bits; for (block = 0; block < blocks; block++) { - bh = ext4_bread(handle, dir, block, 0, &retval); - if(!bh) - return retval; + bh = ext4_read_dirblock(dir, block, DIRENT); + if (IS_ERR(bh)) + return PTR_ERR(bh); + retval = add_dirent_to_buf(handle, dentry, inode, NULL, bh); if (retval != -ENOSPC) { brelse(bh); @@ -1482,12 +1935,18 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, return make_indexed_dir(handle, dentry, inode, bh); brelse(bh); } - bh = ext4_append(handle, dir, &block, &retval); - if (!bh) - return retval; + bh = ext4_append(handle, dir, &block); + if (IS_ERR(bh)) + return PTR_ERR(bh); de = (struct ext4_dir_entry_2 *) bh->b_data; de->inode = 0; - de->rec_len = ext4_rec_len_to_disk(blocksize, blocksize); + de->rec_len = ext4_rec_len_to_disk(blocksize - csum_size, blocksize); + + if (csum_size) { + t = EXT4_DIRENT_TAIL(bh->b_data, blocksize); + initialize_dirent_tail(t, blocksize); + } + retval = add_dirent_to_buf(handle, dentry, inode, de, bh); brelse(bh); if (retval == 0) @@ -1515,9 +1974,12 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, return err; entries = frame->entries; at = frame->at; - - if (!(bh = ext4_bread(handle,dir, dx_get_block(frame->at), 0, &err))) + bh = ext4_read_dirblock(dir, dx_get_block(frame->at), DIRENT); + if (IS_ERR(bh)) { + err = PTR_ERR(bh); + bh = NULL; goto cleanup; + } BUFFER_TRACE(bh, "get_write_access"); err = ext4_journal_get_write_access(handle, bh); @@ -1546,9 +2008,11 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, err = -ENOSPC; goto cleanup; } - bh2 = ext4_append (handle, dir, &newblock, &err); - if (!(bh2)) + bh2 = ext4_append(handle, dir, &newblock); + if (IS_ERR(bh2)) { + err = PTR_ERR(bh2); goto cleanup; + } node2 = (struct dx_node *)(bh2->b_data); entries2 = node2->entries; memset(&node2->fake, 0, sizeof(struct fake_dirent)); @@ -1586,7 +2050,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, dxtrace(dx_show_index("node", frames[1].entries)); dxtrace(dx_show_index("node", ((struct dx_node *) bh2->b_data)->entries)); - err = ext4_handle_dirty_metadata(handle, dir, bh2); + err = ext4_handle_dirty_dx_node(handle, dir, bh2); if (err) goto journal_error; brelse (bh2); @@ -1612,7 +2076,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, if (err) goto journal_error; } - err = ext4_handle_dirty_metadata(handle, dir, frames[0].bh); + err = ext4_handle_dirty_dx_node(handle, dir, frames[0].bh); if (err) { ext4_std_error(inode->i_sb, err); goto cleanup; @@ -1627,38 +2091,35 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, journal_error: ext4_std_error(dir->i_sb, err); cleanup: - if (bh) - brelse(bh); + brelse(bh); dx_release(frames); return err; } /* - * ext4_delete_entry deletes a directory entry by merging it with the - * previous entry + * ext4_generic_delete_entry deletes a directory entry by merging it + * with the previous entry */ -static int ext4_delete_entry(handle_t *handle, - struct inode *dir, - struct ext4_dir_entry_2 *de_del, - struct buffer_head *bh) +int ext4_generic_delete_entry(handle_t *handle, + struct inode *dir, + struct ext4_dir_entry_2 *de_del, + struct buffer_head *bh, + void *entry_buf, + int buf_size, + int csum_size) { struct ext4_dir_entry_2 *de, *pde; unsigned int blocksize = dir->i_sb->s_blocksize; - int i, err; + int i; i = 0; pde = NULL; - de = (struct ext4_dir_entry_2 *) bh->b_data; - while (i < bh->b_size) { - if (ext4_check_dir_entry(dir, NULL, de, bh, i)) + de = (struct ext4_dir_entry_2 *)entry_buf; + while (i < buf_size - csum_size) { + if (ext4_check_dir_entry(dir, NULL, de, bh, + bh->b_data, bh->b_size, i)) return -EIO; if (de == de_del) { - BUFFER_TRACE(bh, "get_write_access"); - err = ext4_journal_get_write_access(handle, bh); - if (unlikely(err)) { - ext4_std_error(dir->i_sb, err); - return err; - } if (pde) pde->rec_len = ext4_rec_len_to_disk( ext4_rec_len_from_disk(pde->rec_len, @@ -1669,12 +2130,6 @@ static int ext4_delete_entry(handle_t *handle, else de->inode = 0; dir->i_version++; - BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); - err = ext4_handle_dirty_metadata(handle, dir, bh); - if (unlikely(err)) { - ext4_std_error(dir->i_sb, err); - return err; - } return 0; } i += ext4_rec_len_from_disk(de->rec_len, blocksize); @@ -1684,6 +2139,48 @@ static int ext4_delete_entry(handle_t *handle, return -ENOENT; } +static int ext4_delete_entry(handle_t *handle, + struct inode *dir, + struct ext4_dir_entry_2 *de_del, + struct buffer_head *bh) +{ + int err, csum_size = 0; + + if (ext4_has_inline_data(dir)) { + int has_inline_data = 1; + err = ext4_delete_inline_entry(handle, dir, de_del, bh, + &has_inline_data); + if (has_inline_data) + return err; + } + + if (EXT4_HAS_RO_COMPAT_FEATURE(dir->i_sb, + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + csum_size = sizeof(struct ext4_dir_entry_tail); + + BUFFER_TRACE(bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, bh); + if (unlikely(err)) + goto out; + + err = ext4_generic_delete_entry(handle, dir, de_del, + bh, bh->b_data, + dir->i_sb->s_blocksize, csum_size); + if (err) + goto out; + + BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); + err = ext4_handle_dirty_dirent_node(handle, dir, bh); + if (unlikely(err)) + goto out; + + return 0; +out: + if (err != -ENOENT) + ext4_std_error(dir->i_sb, err); + return err; +} + /* * DIR_NLINK feature is set if 1) nlinks > EXT4_LINK_MAX or 2) nlinks == 2, * since this indicates that nlinks count was previously 1. @@ -1718,8 +2215,8 @@ static int ext4_add_nondir(handle_t *handle, int err = ext4_add_entry(handle, dentry, inode); if (!err) { ext4_mark_inode_dirty(handle, inode); - d_instantiate(dentry, inode); unlock_new_inode(inode); + d_instantiate(dentry, inode); return 0; } drop_nlink(inode); @@ -1737,33 +2234,31 @@ static int ext4_add_nondir(handle_t *handle, * with d_instantiate(). */ static int ext4_create(struct inode *dir, struct dentry *dentry, umode_t mode, - struct nameidata *nd) + bool excl) { handle_t *handle; struct inode *inode; - int err, retries = 0; + int err, credits, retries = 0; dquot_initialize(dir); + credits = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3); retry: - handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + - EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + - EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb)); - if (IS_ERR(handle)) - return PTR_ERR(handle); - - if (IS_DIRSYNC(dir)) - ext4_handle_sync(handle); - - inode = ext4_new_inode(handle, dir, mode, &dentry->d_name, 0, NULL); + inode = ext4_new_inode_start_handle(dir, mode, &dentry->d_name, 0, + NULL, EXT4_HT_DIR, credits); + handle = ext4_journal_current_handle(); err = PTR_ERR(inode); if (!IS_ERR(inode)) { inode->i_op = &ext4_file_inode_operations; inode->i_fop = &ext4_file_operations; ext4_set_aops(inode); err = ext4_add_nondir(handle, dentry, inode); + if (!err && IS_DIRSYNC(dir)) + ext4_handle_sync(handle); } - ext4_journal_stop(handle); + if (handle) + ext4_journal_stop(handle); if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries)) goto retry; return err; @@ -1774,95 +2269,174 @@ static int ext4_mknod(struct inode *dir, struct dentry *dentry, { handle_t *handle; struct inode *inode; - int err, retries = 0; + int err, credits, retries = 0; if (!new_valid_dev(rdev)) return -EINVAL; dquot_initialize(dir); + credits = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3); retry: - handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + - EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + - EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb)); - if (IS_ERR(handle)) - return PTR_ERR(handle); - - if (IS_DIRSYNC(dir)) - ext4_handle_sync(handle); - - inode = ext4_new_inode(handle, dir, mode, &dentry->d_name, 0, NULL); + inode = ext4_new_inode_start_handle(dir, mode, &dentry->d_name, 0, + NULL, EXT4_HT_DIR, credits); + handle = ext4_journal_current_handle(); err = PTR_ERR(inode); if (!IS_ERR(inode)) { init_special_inode(inode, inode->i_mode, rdev); -#ifdef CONFIG_EXT4_FS_XATTR inode->i_op = &ext4_special_inode_operations; -#endif err = ext4_add_nondir(handle, dentry, inode); + if (!err && IS_DIRSYNC(dir)) + ext4_handle_sync(handle); } - ext4_journal_stop(handle); + if (handle) + ext4_journal_stop(handle); if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries)) goto retry; return err; } -static int ext4_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) +static int ext4_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) { handle_t *handle; struct inode *inode; - struct buffer_head *dir_block = NULL; - struct ext4_dir_entry_2 *de; - unsigned int blocksize = dir->i_sb->s_blocksize; int err, retries = 0; - if (EXT4_DIR_LINK_MAX(dir)) - return -EMLINK; - dquot_initialize(dir); retry: - handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + - EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + - EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb)); - if (IS_ERR(handle)) - return PTR_ERR(handle); - - if (IS_DIRSYNC(dir)) - ext4_handle_sync(handle); - - inode = ext4_new_inode(handle, dir, S_IFDIR | mode, - &dentry->d_name, 0, NULL); + inode = ext4_new_inode_start_handle(dir, mode, + NULL, 0, NULL, + EXT4_HT_DIR, + EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb) + + 4 + EXT4_XATTR_TRANS_BLOCKS); + handle = ext4_journal_current_handle(); err = PTR_ERR(inode); - if (IS_ERR(inode)) - goto out_stop; + if (!IS_ERR(inode)) { + inode->i_op = &ext4_file_inode_operations; + inode->i_fop = &ext4_file_operations; + ext4_set_aops(inode); + d_tmpfile(dentry, inode); + err = ext4_orphan_add(handle, inode); + if (err) + goto err_unlock_inode; + mark_inode_dirty(inode); + unlock_new_inode(inode); + } + if (handle) + ext4_journal_stop(handle); + if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries)) + goto retry; + return err; +err_unlock_inode: + ext4_journal_stop(handle); + unlock_new_inode(inode); + return err; +} - inode->i_op = &ext4_dir_inode_operations; - inode->i_fop = &ext4_dir_operations; - inode->i_size = EXT4_I(inode)->i_disksize = inode->i_sb->s_blocksize; - dir_block = ext4_bread(handle, inode, 0, 1, &err); - if (!dir_block) - goto out_clear_inode; - BUFFER_TRACE(dir_block, "get_write_access"); - err = ext4_journal_get_write_access(handle, dir_block); - if (err) - goto out_clear_inode; - de = (struct ext4_dir_entry_2 *) dir_block->b_data; +struct ext4_dir_entry_2 *ext4_init_dot_dotdot(struct inode *inode, + struct ext4_dir_entry_2 *de, + int blocksize, int csum_size, + unsigned int parent_ino, int dotdot_real_len) +{ de->inode = cpu_to_le32(inode->i_ino); de->name_len = 1; de->rec_len = ext4_rec_len_to_disk(EXT4_DIR_REC_LEN(de->name_len), blocksize); strcpy(de->name, "."); - ext4_set_de_type(dir->i_sb, de, S_IFDIR); + ext4_set_de_type(inode->i_sb, de, S_IFDIR); + de = ext4_next_entry(de, blocksize); - de->inode = cpu_to_le32(dir->i_ino); - de->rec_len = ext4_rec_len_to_disk(blocksize - EXT4_DIR_REC_LEN(1), - blocksize); + de->inode = cpu_to_le32(parent_ino); de->name_len = 2; + if (!dotdot_real_len) + de->rec_len = ext4_rec_len_to_disk(blocksize - + (csum_size + EXT4_DIR_REC_LEN(1)), + blocksize); + else + de->rec_len = ext4_rec_len_to_disk( + EXT4_DIR_REC_LEN(de->name_len), blocksize); strcpy(de->name, ".."); - ext4_set_de_type(dir->i_sb, de, S_IFDIR); + ext4_set_de_type(inode->i_sb, de, S_IFDIR); + + return ext4_next_entry(de, blocksize); +} + +static int ext4_init_new_dir(handle_t *handle, struct inode *dir, + struct inode *inode) +{ + struct buffer_head *dir_block = NULL; + struct ext4_dir_entry_2 *de; + struct ext4_dir_entry_tail *t; + ext4_lblk_t block = 0; + unsigned int blocksize = dir->i_sb->s_blocksize; + int csum_size = 0; + int err; + + if (EXT4_HAS_RO_COMPAT_FEATURE(dir->i_sb, + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + csum_size = sizeof(struct ext4_dir_entry_tail); + + if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) { + err = ext4_try_create_inline_dir(handle, dir, inode); + if (err < 0 && err != -ENOSPC) + goto out; + if (!err) + goto out; + } + + inode->i_size = 0; + dir_block = ext4_append(handle, inode, &block); + if (IS_ERR(dir_block)) + return PTR_ERR(dir_block); + BUFFER_TRACE(dir_block, "get_write_access"); + err = ext4_journal_get_write_access(handle, dir_block); + if (err) + goto out; + de = (struct ext4_dir_entry_2 *)dir_block->b_data; + ext4_init_dot_dotdot(inode, de, blocksize, csum_size, dir->i_ino, 0); set_nlink(inode, 2); + if (csum_size) { + t = EXT4_DIRENT_TAIL(dir_block->b_data, blocksize); + initialize_dirent_tail(t, blocksize); + } + BUFFER_TRACE(dir_block, "call ext4_handle_dirty_metadata"); - err = ext4_handle_dirty_metadata(handle, inode, dir_block); + err = ext4_handle_dirty_dirent_node(handle, inode, dir_block); + if (err) + goto out; + set_buffer_verified(dir_block); +out: + brelse(dir_block); + return err; +} + +static int ext4_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) +{ + handle_t *handle; + struct inode *inode; + int err, credits, retries = 0; + + if (EXT4_DIR_LINK_MAX(dir)) + return -EMLINK; + + dquot_initialize(dir); + + credits = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3); +retry: + inode = ext4_new_inode_start_handle(dir, S_IFDIR | mode, + &dentry->d_name, + 0, NULL, EXT4_HT_DIR, credits); + handle = ext4_journal_current_handle(); + err = PTR_ERR(inode); + if (IS_ERR(inode)) + goto out_stop; + + inode->i_op = &ext4_dir_inode_operations; + inode->i_fop = &ext4_dir_operations; + err = ext4_init_new_dir(handle, dir, inode); if (err) goto out_clear_inode; err = ext4_mark_inode_dirty(handle, inode); @@ -1881,11 +2455,14 @@ out_clear_inode: err = ext4_mark_inode_dirty(handle, dir); if (err) goto out_clear_inode; - d_instantiate(dentry, inode); unlock_new_inode(inode); + d_instantiate(dentry, inode); + if (IS_DIRSYNC(dir)) + ext4_handle_sync(handle); + out_stop: - brelse(dir_block); - ext4_journal_stop(handle); + if (handle) + ext4_journal_stop(handle); if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries)) goto retry; return err; @@ -1902,18 +2479,23 @@ static int empty_dir(struct inode *inode) struct super_block *sb; int err = 0; + if (ext4_has_inline_data(inode)) { + int has_inline_data = 1; + + err = empty_inline_dir(inode, &has_inline_data); + if (has_inline_data) + return err; + } + sb = inode->i_sb; - if (inode->i_size < EXT4_DIR_REC_LEN(1) + EXT4_DIR_REC_LEN(2) || - !(bh = ext4_bread(NULL, inode, 0, 0, &err))) { - if (err) - EXT4_ERROR_INODE(inode, - "error %d reading directory lblock 0", err); - else - ext4_warning(inode->i_sb, - "bad directory (dir #%lu) - no data block", - inode->i_ino); + if (inode->i_size < EXT4_DIR_REC_LEN(1) + EXT4_DIR_REC_LEN(2)) { + EXT4_ERROR_INODE(inode, "invalid size"); return 1; } + bh = ext4_read_dirblock(inode, 0, EITHER); + if (IS_ERR(bh)) + return 1; + de = (struct ext4_dir_entry_2 *) bh->b_data; de1 = ext4_next_entry(de, sb->s_blocksize); if (le32_to_cpu(de->inode) != inode->i_ino || @@ -1930,24 +2512,18 @@ static int empty_dir(struct inode *inode) ext4_rec_len_from_disk(de1->rec_len, sb->s_blocksize); de = ext4_next_entry(de1, sb->s_blocksize); while (offset < inode->i_size) { - if (!bh || - (void *) de >= (void *) (bh->b_data+sb->s_blocksize)) { + if ((void *) de >= (void *) (bh->b_data+sb->s_blocksize)) { unsigned int lblock; err = 0; brelse(bh); lblock = offset >> EXT4_BLOCK_SIZE_BITS(sb); - bh = ext4_bread(NULL, inode, lblock, 0, &err); - if (!bh) { - if (err) - EXT4_ERROR_INODE(inode, - "error %d reading directory " - "lblock %u", err, lblock); - offset += sb->s_blocksize; - continue; - } + bh = ext4_read_dirblock(inode, lblock, EITHER); + if (IS_ERR(bh)) + return 1; de = (struct ext4_dir_entry_2 *) bh->b_data; } - if (ext4_check_dir_entry(inode, NULL, de, bh, offset)) { + if (ext4_check_dir_entry(inode, NULL, de, bh, + bh->b_data, bh->b_size, offset)) { de = (struct ext4_dir_entry_2 *)(bh->b_data + sb->s_blocksize); offset = (offset | (sb->s_blocksize - 1)) + 1; @@ -1964,26 +2540,37 @@ static int empty_dir(struct inode *inode) return 1; } -/* ext4_orphan_add() links an unlinked or truncated inode into a list of +/* + * ext4_orphan_add() links an unlinked or truncated inode into a list of * such inodes, starting at the superblock, in case we crash before the * file is closed/deleted, or in case the inode truncate spans multiple * transactions and the last transaction is not recovered after a crash. * * At filesystem recovery time, we walk this list deleting unlinked * inodes and truncating linked inodes in ext4_orphan_cleanup(). + * + * Orphan list manipulation functions must be called under i_mutex unless + * we are just creating the inode or deleting it. */ int ext4_orphan_add(handle_t *handle, struct inode *inode) { struct super_block *sb = inode->i_sb; + struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_iloc iloc; int err = 0, rc; + bool dirty = false; - if (!ext4_handle_valid(handle)) + if (!sbi->s_journal) return 0; - mutex_lock(&EXT4_SB(sb)->s_orphan_lock); + WARN_ON_ONCE(!(inode->i_state & (I_NEW | I_FREEING)) && + !mutex_is_locked(&inode->i_mutex)); + /* + * Exit early if inode already is on orphan list. This is a big speedup + * since we don't have to contend on the global s_orphan_lock. + */ if (!list_empty(&EXT4_I(inode)->i_orphan)) - goto out_unlock; + return 0; /* * Orphan handling is only valid for files with data blocks @@ -1994,48 +2581,51 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode) J_ASSERT((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) || inode->i_nlink == 0); - BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get_write_access"); - err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh); + BUFFER_TRACE(sbi->s_sbh, "get_write_access"); + err = ext4_journal_get_write_access(handle, sbi->s_sbh); if (err) - goto out_unlock; + goto out; err = ext4_reserve_inode_write(handle, inode, &iloc); if (err) - goto out_unlock; + goto out; + + mutex_lock(&sbi->s_orphan_lock); /* * Due to previous errors inode may be already a part of on-disk * orphan list. If so skip on-disk list modification. */ - if (NEXT_ORPHAN(inode) && NEXT_ORPHAN(inode) <= - (le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count))) - goto mem_insert; - - /* Insert this inode at the head of the on-disk orphan list... */ - NEXT_ORPHAN(inode) = le32_to_cpu(EXT4_SB(sb)->s_es->s_last_orphan); - EXT4_SB(sb)->s_es->s_last_orphan = cpu_to_le32(inode->i_ino); - err = ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh); - rc = ext4_mark_iloc_dirty(handle, inode, &iloc); - if (!err) - err = rc; - - /* Only add to the head of the in-memory list if all the - * previous operations succeeded. If the orphan_add is going to - * fail (possibly taking the journal offline), we can't risk - * leaving the inode on the orphan list: stray orphan-list - * entries can cause panics at unmount time. - * - * This is safe: on error we're going to ignore the orphan list - * anyway on the next recovery. */ -mem_insert: - if (!err) - list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan); - + if (!NEXT_ORPHAN(inode) || NEXT_ORPHAN(inode) > + (le32_to_cpu(sbi->s_es->s_inodes_count))) { + /* Insert this inode at the head of the on-disk orphan list */ + NEXT_ORPHAN(inode) = le32_to_cpu(sbi->s_es->s_last_orphan); + sbi->s_es->s_last_orphan = cpu_to_le32(inode->i_ino); + dirty = true; + } + list_add(&EXT4_I(inode)->i_orphan, &sbi->s_orphan); + mutex_unlock(&sbi->s_orphan_lock); + + if (dirty) { + err = ext4_handle_dirty_super(handle, sb); + rc = ext4_mark_iloc_dirty(handle, inode, &iloc); + if (!err) + err = rc; + if (err) { + /* + * We have to remove inode from in-memory list if + * addition to on disk orphan list failed. Stray orphan + * list entries can cause panics at unmount time. + */ + mutex_lock(&sbi->s_orphan_lock); + list_del(&EXT4_I(inode)->i_orphan); + mutex_unlock(&sbi->s_orphan_lock); + } + } jbd_debug(4, "superblock will point to %lu\n", inode->i_ino); jbd_debug(4, "orphan inode %lu will point to %d\n", inode->i_ino, NEXT_ORPHAN(inode)); -out_unlock: - mutex_unlock(&EXT4_SB(sb)->s_orphan_lock); - ext4_std_error(inode->i_sb, err); +out: + ext4_std_error(sb, err); return err; } @@ -2047,46 +2637,52 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode) { struct list_head *prev; struct ext4_inode_info *ei = EXT4_I(inode); - struct ext4_sb_info *sbi; + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); __u32 ino_next; struct ext4_iloc iloc; int err = 0; - /* ext4_handle_valid() assumes a valid handle_t pointer */ - if (handle && !ext4_handle_valid(handle)) + if (!sbi->s_journal && !(sbi->s_mount_state & EXT4_ORPHAN_FS)) return 0; - mutex_lock(&EXT4_SB(inode->i_sb)->s_orphan_lock); + WARN_ON_ONCE(!(inode->i_state & (I_NEW | I_FREEING)) && + !mutex_is_locked(&inode->i_mutex)); + /* Do this quick check before taking global s_orphan_lock. */ if (list_empty(&ei->i_orphan)) - goto out; + return 0; - ino_next = NEXT_ORPHAN(inode); - prev = ei->i_orphan.prev; - sbi = EXT4_SB(inode->i_sb); + if (handle) { + /* Grab inode buffer early before taking global s_orphan_lock */ + err = ext4_reserve_inode_write(handle, inode, &iloc); + } + mutex_lock(&sbi->s_orphan_lock); jbd_debug(4, "remove inode %lu from orphan list\n", inode->i_ino); + prev = ei->i_orphan.prev; list_del_init(&ei->i_orphan); /* If we're on an error path, we may not have a valid * transaction handle with which to update the orphan list on * disk, but we still need to remove the inode from the linked * list in memory. */ - if (sbi->s_journal && !handle) - goto out; - - err = ext4_reserve_inode_write(handle, inode, &iloc); - if (err) + if (!handle || err) { + mutex_unlock(&sbi->s_orphan_lock); goto out_err; + } + ino_next = NEXT_ORPHAN(inode); if (prev == &sbi->s_orphan) { jbd_debug(4, "superblock will point to %u\n", ino_next); BUFFER_TRACE(sbi->s_sbh, "get_write_access"); err = ext4_journal_get_write_access(handle, sbi->s_sbh); - if (err) + if (err) { + mutex_unlock(&sbi->s_orphan_lock); goto out_brelse; + } sbi->s_es->s_last_orphan = cpu_to_le32(ino_next); - err = ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh); + mutex_unlock(&sbi->s_orphan_lock); + err = ext4_handle_dirty_super(handle, inode->i_sb); } else { struct ext4_iloc iloc2; struct inode *i_prev = @@ -2095,20 +2691,20 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode) jbd_debug(4, "orphan inode %lu will point to %u\n", i_prev->i_ino, ino_next); err = ext4_reserve_inode_write(handle, i_prev, &iloc2); - if (err) + if (err) { + mutex_unlock(&sbi->s_orphan_lock); goto out_brelse; + } NEXT_ORPHAN(i_prev) = ino_next; err = ext4_mark_iloc_dirty(handle, i_prev, &iloc2); + mutex_unlock(&sbi->s_orphan_lock); } if (err) goto out_brelse; NEXT_ORPHAN(inode) = 0; err = ext4_mark_iloc_dirty(handle, inode, &iloc); - out_err: ext4_std_error(inode->i_sb, err); -out: - mutex_unlock(&EXT4_SB(inode->i_sb)->s_orphan_lock); return err; out_brelse: @@ -2122,25 +2718,18 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry) struct inode *inode; struct buffer_head *bh; struct ext4_dir_entry_2 *de; - handle_t *handle; + handle_t *handle = NULL; /* Initialize quotas before so that eventual writes go in * separate transaction */ dquot_initialize(dir); dquot_initialize(dentry->d_inode); - handle = ext4_journal_start(dir, EXT4_DELETE_TRANS_BLOCKS(dir->i_sb)); - if (IS_ERR(handle)) - return PTR_ERR(handle); - retval = -ENOENT; - bh = ext4_find_entry(dir, &dentry->d_name, &de); + bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL); if (!bh) goto end_rmdir; - if (IS_DIRSYNC(dir)) - ext4_handle_sync(handle); - inode = dentry->d_inode; retval = -EIO; @@ -2151,6 +2740,17 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry) if (!empty_dir(inode)) goto end_rmdir; + handle = ext4_journal_start(dir, EXT4_HT_DIR, + EXT4_DATA_TRANS_BLOCKS(dir->i_sb)); + if (IS_ERR(handle)) { + retval = PTR_ERR(handle); + handle = NULL; + goto end_rmdir; + } + + if (IS_DIRSYNC(dir)) + ext4_handle_sync(handle); + retval = ext4_delete_entry(handle, dir, de, bh); if (retval) goto end_rmdir; @@ -2172,8 +2772,9 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry) ext4_mark_inode_dirty(handle, dir); end_rmdir: - ext4_journal_stop(handle); brelse(bh); + if (handle) + ext4_journal_stop(handle); return retval; } @@ -2183,7 +2784,7 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry) struct inode *inode; struct buffer_head *bh; struct ext4_dir_entry_2 *de; - handle_t *handle; + handle_t *handle = NULL; trace_ext4_unlink_enter(dir, dentry); /* Initialize quotas before so that eventual writes go @@ -2191,15 +2792,8 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry) dquot_initialize(dir); dquot_initialize(dentry->d_inode); - handle = ext4_journal_start(dir, EXT4_DELETE_TRANS_BLOCKS(dir->i_sb)); - if (IS_ERR(handle)) - return PTR_ERR(handle); - - if (IS_DIRSYNC(dir)) - ext4_handle_sync(handle); - retval = -ENOENT; - bh = ext4_find_entry(dir, &dentry->d_name, &de); + bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL); if (!bh) goto end_unlink; @@ -2209,6 +2803,17 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry) if (le32_to_cpu(de->inode) != inode->i_ino) goto end_unlink; + handle = ext4_journal_start(dir, EXT4_HT_DIR, + EXT4_DATA_TRANS_BLOCKS(dir->i_sb)); + if (IS_ERR(handle)) { + retval = PTR_ERR(handle); + handle = NULL; + goto end_unlink; + } + + if (IS_DIRSYNC(dir)) + ext4_handle_sync(handle); + if (!inode->i_nlink) { ext4_warning(inode->i_sb, "Deleting nonexistent file (%lu), %d", @@ -2229,8 +2834,9 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry) retval = 0; end_unlink: - ext4_journal_stop(handle); brelse(bh); + if (handle) + ext4_journal_stop(handle); trace_ext4_unlink_exit(dentry, retval); return retval; } @@ -2266,19 +2872,13 @@ static int ext4_symlink(struct inode *dir, * quota blocks, sb is already counted in previous macros). */ credits = EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + - EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + - EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb); + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3; } retry: - handle = ext4_journal_start(dir, credits); - if (IS_ERR(handle)) - return PTR_ERR(handle); - - if (IS_DIRSYNC(dir)) - ext4_handle_sync(handle); - - inode = ext4_new_inode(handle, dir, S_IFLNK|S_IRWXUGO, - &dentry->d_name, 0, NULL); + inode = ext4_new_inode_start_handle(dir, S_IFLNK|S_IRWXUGO, + &dentry->d_name, 0, NULL, + EXT4_HT_DIR, credits); + handle = ext4_journal_current_handle(); err = PTR_ERR(inode); if (IS_ERR(inode)) goto out_stop; @@ -2308,7 +2908,7 @@ retry: * Now inode is being linked into dir (EXT4_DATA_TRANS_BLOCKS * + EXT4_INDEX_EXTRA_TRANS_BLOCKS), inode is also modified */ - handle = ext4_journal_start(dir, + handle = ext4_journal_start(dir, EXT4_HT_DIR, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 1); if (IS_ERR(handle)) { @@ -2331,8 +2931,12 @@ retry: } EXT4_I(inode)->i_disksize = inode->i_size; err = ext4_add_nondir(handle, dentry, inode); + if (!err && IS_DIRSYNC(dir)) + ext4_handle_sync(handle); + out_stop: - ext4_journal_stop(handle); + if (handle) + ext4_journal_stop(handle); if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries)) goto retry; return err; @@ -2355,8 +2959,9 @@ static int ext4_link(struct dentry *old_dentry, dquot_initialize(dir); retry: - handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + - EXT4_INDEX_EXTRA_TRANS_BLOCKS); + handle = ext4_journal_start(dir, EXT4_HT_DIR, + (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + + EXT4_INDEX_EXTRA_TRANS_BLOCKS) + 1); if (IS_ERR(handle)) return PTR_ERR(handle); @@ -2370,6 +2975,11 @@ retry: err = ext4_add_entry(handle, dentry, inode); if (!err) { ext4_mark_inode_dirty(handle, inode); + /* this can happen only for tmpfile being + * linked the first time + */ + if (inode->i_nlink == 1) + ext4_orphan_del(handle, inode); d_instantiate(dentry, inode); } else { drop_nlink(inode); @@ -2381,190 +2991,457 @@ retry: return err; } -#define PARENT_INO(buffer, size) \ - (ext4_next_entry((struct ext4_dir_entry_2 *)(buffer), size)->inode) + +/* + * Try to find buffer head where contains the parent block. + * It should be the inode block if it is inlined or the 1st block + * if it is a normal dir. + */ +static struct buffer_head *ext4_get_first_dir_block(handle_t *handle, + struct inode *inode, + int *retval, + struct ext4_dir_entry_2 **parent_de, + int *inlined) +{ + struct buffer_head *bh; + + if (!ext4_has_inline_data(inode)) { + bh = ext4_read_dirblock(inode, 0, EITHER); + if (IS_ERR(bh)) { + *retval = PTR_ERR(bh); + return NULL; + } + *parent_de = ext4_next_entry( + (struct ext4_dir_entry_2 *)bh->b_data, + inode->i_sb->s_blocksize); + return bh; + } + + *inlined = 1; + return ext4_get_first_inline_block(inode, parent_de, retval); +} + +struct ext4_renament { + struct inode *dir; + struct dentry *dentry; + struct inode *inode; + bool is_dir; + int dir_nlink_delta; + + /* entry for "dentry" */ + struct buffer_head *bh; + struct ext4_dir_entry_2 *de; + int inlined; + + /* entry for ".." in inode if it's a directory */ + struct buffer_head *dir_bh; + struct ext4_dir_entry_2 *parent_de; + int dir_inlined; +}; + +static int ext4_rename_dir_prepare(handle_t *handle, struct ext4_renament *ent) +{ + int retval; + + ent->dir_bh = ext4_get_first_dir_block(handle, ent->inode, + &retval, &ent->parent_de, + &ent->dir_inlined); + if (!ent->dir_bh) + return retval; + if (le32_to_cpu(ent->parent_de->inode) != ent->dir->i_ino) + return -EIO; + BUFFER_TRACE(ent->dir_bh, "get_write_access"); + return ext4_journal_get_write_access(handle, ent->dir_bh); +} + +static int ext4_rename_dir_finish(handle_t *handle, struct ext4_renament *ent, + unsigned dir_ino) +{ + int retval; + + ent->parent_de->inode = cpu_to_le32(dir_ino); + BUFFER_TRACE(ent->dir_bh, "call ext4_handle_dirty_metadata"); + if (!ent->dir_inlined) { + if (is_dx(ent->inode)) { + retval = ext4_handle_dirty_dx_node(handle, + ent->inode, + ent->dir_bh); + } else { + retval = ext4_handle_dirty_dirent_node(handle, + ent->inode, + ent->dir_bh); + } + } else { + retval = ext4_mark_inode_dirty(handle, ent->inode); + } + if (retval) { + ext4_std_error(ent->dir->i_sb, retval); + return retval; + } + return 0; +} + +static int ext4_setent(handle_t *handle, struct ext4_renament *ent, + unsigned ino, unsigned file_type) +{ + int retval; + + BUFFER_TRACE(ent->bh, "get write access"); + retval = ext4_journal_get_write_access(handle, ent->bh); + if (retval) + return retval; + ent->de->inode = cpu_to_le32(ino); + if (EXT4_HAS_INCOMPAT_FEATURE(ent->dir->i_sb, + EXT4_FEATURE_INCOMPAT_FILETYPE)) + ent->de->file_type = file_type; + ent->dir->i_version++; + ent->dir->i_ctime = ent->dir->i_mtime = + ext4_current_time(ent->dir); + ext4_mark_inode_dirty(handle, ent->dir); + BUFFER_TRACE(ent->bh, "call ext4_handle_dirty_metadata"); + if (!ent->inlined) { + retval = ext4_handle_dirty_dirent_node(handle, + ent->dir, ent->bh); + if (unlikely(retval)) { + ext4_std_error(ent->dir->i_sb, retval); + return retval; + } + } + brelse(ent->bh); + ent->bh = NULL; + + return 0; +} + +static int ext4_find_delete_entry(handle_t *handle, struct inode *dir, + const struct qstr *d_name) +{ + int retval = -ENOENT; + struct buffer_head *bh; + struct ext4_dir_entry_2 *de; + + bh = ext4_find_entry(dir, d_name, &de, NULL); + if (bh) { + retval = ext4_delete_entry(handle, dir, de, bh); + brelse(bh); + } + return retval; +} + +static void ext4_rename_delete(handle_t *handle, struct ext4_renament *ent) +{ + int retval; + /* + * ent->de could have moved from under us during htree split, so make + * sure that we are deleting the right entry. We might also be pointing + * to a stale entry in the unused part of ent->bh so just checking inum + * and the name isn't enough. + */ + if (le32_to_cpu(ent->de->inode) != ent->inode->i_ino || + ent->de->name_len != ent->dentry->d_name.len || + strncmp(ent->de->name, ent->dentry->d_name.name, + ent->de->name_len)) { + retval = ext4_find_delete_entry(handle, ent->dir, + &ent->dentry->d_name); + } else { + retval = ext4_delete_entry(handle, ent->dir, ent->de, ent->bh); + if (retval == -ENOENT) { + retval = ext4_find_delete_entry(handle, ent->dir, + &ent->dentry->d_name); + } + } + + if (retval) { + ext4_warning(ent->dir->i_sb, + "Deleting old file (%lu), %d, error=%d", + ent->dir->i_ino, ent->dir->i_nlink, retval); + } +} + +static void ext4_update_dir_count(handle_t *handle, struct ext4_renament *ent) +{ + if (ent->dir_nlink_delta) { + if (ent->dir_nlink_delta == -1) + ext4_dec_count(handle, ent->dir); + else + ext4_inc_count(handle, ent->dir); + ext4_mark_inode_dirty(handle, ent->dir); + } +} /* * Anybody can rename anything with this: the permission checks are left to the * higher-level routines. + * + * n.b. old_{dentry,inode) refers to the source dentry/inode + * while new_{dentry,inode) refers to the destination dentry/inode + * This comes from rename(const char *oldpath, const char *newpath) */ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry) { - handle_t *handle; - struct inode *old_inode, *new_inode; - struct buffer_head *old_bh, *new_bh, *dir_bh; - struct ext4_dir_entry_2 *old_de, *new_de; - int retval, force_da_alloc = 0; - - dquot_initialize(old_dir); - dquot_initialize(new_dir); + handle_t *handle = NULL; + struct ext4_renament old = { + .dir = old_dir, + .dentry = old_dentry, + .inode = old_dentry->d_inode, + }; + struct ext4_renament new = { + .dir = new_dir, + .dentry = new_dentry, + .inode = new_dentry->d_inode, + }; + int retval; - old_bh = new_bh = dir_bh = NULL; + dquot_initialize(old.dir); + dquot_initialize(new.dir); /* Initialize quotas before so that eventual writes go * in separate transaction */ - if (new_dentry->d_inode) - dquot_initialize(new_dentry->d_inode); - handle = ext4_journal_start(old_dir, 2 * - EXT4_DATA_TRANS_BLOCKS(old_dir->i_sb) + - EXT4_INDEX_EXTRA_TRANS_BLOCKS + 2); - if (IS_ERR(handle)) - return PTR_ERR(handle); - - if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir)) - ext4_handle_sync(handle); + if (new.inode) + dquot_initialize(new.inode); - old_bh = ext4_find_entry(old_dir, &old_dentry->d_name, &old_de); + old.bh = ext4_find_entry(old.dir, &old.dentry->d_name, &old.de, NULL); /* * Check for inode number is _not_ due to possible IO errors. * We might rmdir the source, keep it as pwd of some process * and merrily kill the link to whatever was created under the * same name. Goodbye sticky bit ;-< */ - old_inode = old_dentry->d_inode; retval = -ENOENT; - if (!old_bh || le32_to_cpu(old_de->inode) != old_inode->i_ino) + if (!old.bh || le32_to_cpu(old.de->inode) != old.inode->i_ino) goto end_rename; - new_inode = new_dentry->d_inode; - new_bh = ext4_find_entry(new_dir, &new_dentry->d_name, &new_de); - if (new_bh) { - if (!new_inode) { - brelse(new_bh); - new_bh = NULL; + new.bh = ext4_find_entry(new.dir, &new.dentry->d_name, + &new.de, &new.inlined); + if (new.bh) { + if (!new.inode) { + brelse(new.bh); + new.bh = NULL; } } - if (S_ISDIR(old_inode->i_mode)) { - if (new_inode) { + if (new.inode && !test_opt(new.dir->i_sb, NO_AUTO_DA_ALLOC)) + ext4_alloc_da_blocks(old.inode); + + handle = ext4_journal_start(old.dir, EXT4_HT_DIR, + (2 * EXT4_DATA_TRANS_BLOCKS(old.dir->i_sb) + + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 2)); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + if (IS_DIRSYNC(old.dir) || IS_DIRSYNC(new.dir)) + ext4_handle_sync(handle); + + if (S_ISDIR(old.inode->i_mode)) { + if (new.inode) { retval = -ENOTEMPTY; - if (!empty_dir(new_inode)) + if (!empty_dir(new.inode)) + goto end_rename; + } else { + retval = -EMLINK; + if (new.dir != old.dir && EXT4_DIR_LINK_MAX(new.dir)) goto end_rename; } - retval = -EIO; - dir_bh = ext4_bread(handle, old_inode, 0, 0, &retval); - if (!dir_bh) - goto end_rename; - if (le32_to_cpu(PARENT_INO(dir_bh->b_data, - old_dir->i_sb->s_blocksize)) != old_dir->i_ino) - goto end_rename; - retval = -EMLINK; - if (!new_inode && new_dir != old_dir && - EXT4_DIR_LINK_MAX(new_dir)) - goto end_rename; - BUFFER_TRACE(dir_bh, "get_write_access"); - retval = ext4_journal_get_write_access(handle, dir_bh); + retval = ext4_rename_dir_prepare(handle, &old); if (retval) goto end_rename; } - if (!new_bh) { - retval = ext4_add_entry(handle, new_dentry, old_inode); + if (!new.bh) { + retval = ext4_add_entry(handle, new.dentry, old.inode); if (retval) goto end_rename; } else { - BUFFER_TRACE(new_bh, "get write access"); - retval = ext4_journal_get_write_access(handle, new_bh); + retval = ext4_setent(handle, &new, + old.inode->i_ino, old.de->file_type); if (retval) goto end_rename; - new_de->inode = cpu_to_le32(old_inode->i_ino); - if (EXT4_HAS_INCOMPAT_FEATURE(new_dir->i_sb, - EXT4_FEATURE_INCOMPAT_FILETYPE)) - new_de->file_type = old_de->file_type; - new_dir->i_version++; - new_dir->i_ctime = new_dir->i_mtime = - ext4_current_time(new_dir); - ext4_mark_inode_dirty(handle, new_dir); - BUFFER_TRACE(new_bh, "call ext4_handle_dirty_metadata"); - retval = ext4_handle_dirty_metadata(handle, new_dir, new_bh); - if (unlikely(retval)) { - ext4_std_error(new_dir->i_sb, retval); - goto end_rename; - } - brelse(new_bh); - new_bh = NULL; } /* * Like most other Unix systems, set the ctime for inodes on a * rename. */ - old_inode->i_ctime = ext4_current_time(old_inode); - ext4_mark_inode_dirty(handle, old_inode); + old.inode->i_ctime = ext4_current_time(old.inode); + ext4_mark_inode_dirty(handle, old.inode); /* * ok, that's it */ - if (le32_to_cpu(old_de->inode) != old_inode->i_ino || - old_de->name_len != old_dentry->d_name.len || - strncmp(old_de->name, old_dentry->d_name.name, old_de->name_len) || - (retval = ext4_delete_entry(handle, old_dir, - old_de, old_bh)) == -ENOENT) { - /* old_de could have moved from under us during htree split, so - * make sure that we are deleting the right entry. We might - * also be pointing to a stale entry in the unused part of - * old_bh so just checking inum and the name isn't enough. */ - struct buffer_head *old_bh2; - struct ext4_dir_entry_2 *old_de2; - - old_bh2 = ext4_find_entry(old_dir, &old_dentry->d_name, &old_de2); - if (old_bh2) { - retval = ext4_delete_entry(handle, old_dir, - old_de2, old_bh2); - brelse(old_bh2); - } + ext4_rename_delete(handle, &old); + + if (new.inode) { + ext4_dec_count(handle, new.inode); + new.inode->i_ctime = ext4_current_time(new.inode); } - if (retval) { - ext4_warning(old_dir->i_sb, - "Deleting old file (%lu), %d, error=%d", - old_dir->i_ino, old_dir->i_nlink, retval); - } - - if (new_inode) { - ext4_dec_count(handle, new_inode); - new_inode->i_ctime = ext4_current_time(new_inode); - } - old_dir->i_ctime = old_dir->i_mtime = ext4_current_time(old_dir); - ext4_update_dx_flag(old_dir); - if (dir_bh) { - PARENT_INO(dir_bh->b_data, new_dir->i_sb->s_blocksize) = - cpu_to_le32(new_dir->i_ino); - BUFFER_TRACE(dir_bh, "call ext4_handle_dirty_metadata"); - retval = ext4_handle_dirty_metadata(handle, old_inode, dir_bh); - if (retval) { - ext4_std_error(old_dir->i_sb, retval); + old.dir->i_ctime = old.dir->i_mtime = ext4_current_time(old.dir); + ext4_update_dx_flag(old.dir); + if (old.dir_bh) { + retval = ext4_rename_dir_finish(handle, &old, new.dir->i_ino); + if (retval) goto end_rename; - } - ext4_dec_count(handle, old_dir); - if (new_inode) { + + ext4_dec_count(handle, old.dir); + if (new.inode) { /* checked empty_dir above, can't have another parent, * ext4_dec_count() won't work for many-linked dirs */ - clear_nlink(new_inode); + clear_nlink(new.inode); } else { - ext4_inc_count(handle, new_dir); - ext4_update_dx_flag(new_dir); - ext4_mark_inode_dirty(handle, new_dir); + ext4_inc_count(handle, new.dir); + ext4_update_dx_flag(new.dir); + ext4_mark_inode_dirty(handle, new.dir); } } - ext4_mark_inode_dirty(handle, old_dir); - if (new_inode) { - ext4_mark_inode_dirty(handle, new_inode); - if (!new_inode->i_nlink) - ext4_orphan_add(handle, new_inode); - if (!test_opt(new_dir->i_sb, NO_AUTO_DA_ALLOC)) - force_da_alloc = 1; + ext4_mark_inode_dirty(handle, old.dir); + if (new.inode) { + ext4_mark_inode_dirty(handle, new.inode); + if (!new.inode->i_nlink) + ext4_orphan_add(handle, new.inode); } retval = 0; end_rename: - brelse(dir_bh); - brelse(old_bh); - brelse(new_bh); - ext4_journal_stop(handle); - if (retval == 0 && force_da_alloc) - ext4_alloc_da_blocks(old_inode); + brelse(old.dir_bh); + brelse(old.bh); + brelse(new.bh); + if (handle) + ext4_journal_stop(handle); + return retval; +} + +static int ext4_cross_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + handle_t *handle = NULL; + struct ext4_renament old = { + .dir = old_dir, + .dentry = old_dentry, + .inode = old_dentry->d_inode, + }; + struct ext4_renament new = { + .dir = new_dir, + .dentry = new_dentry, + .inode = new_dentry->d_inode, + }; + u8 new_file_type; + int retval; + + dquot_initialize(old.dir); + dquot_initialize(new.dir); + + old.bh = ext4_find_entry(old.dir, &old.dentry->d_name, + &old.de, &old.inlined); + /* + * Check for inode number is _not_ due to possible IO errors. + * We might rmdir the source, keep it as pwd of some process + * and merrily kill the link to whatever was created under the + * same name. Goodbye sticky bit ;-< + */ + retval = -ENOENT; + if (!old.bh || le32_to_cpu(old.de->inode) != old.inode->i_ino) + goto end_rename; + + new.bh = ext4_find_entry(new.dir, &new.dentry->d_name, + &new.de, &new.inlined); + + /* RENAME_EXCHANGE case: old *and* new must both exist */ + if (!new.bh || le32_to_cpu(new.de->inode) != new.inode->i_ino) + goto end_rename; + + handle = ext4_journal_start(old.dir, EXT4_HT_DIR, + (2 * EXT4_DATA_TRANS_BLOCKS(old.dir->i_sb) + + 2 * EXT4_INDEX_EXTRA_TRANS_BLOCKS + 2)); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + if (IS_DIRSYNC(old.dir) || IS_DIRSYNC(new.dir)) + ext4_handle_sync(handle); + + if (S_ISDIR(old.inode->i_mode)) { + old.is_dir = true; + retval = ext4_rename_dir_prepare(handle, &old); + if (retval) + goto end_rename; + } + if (S_ISDIR(new.inode->i_mode)) { + new.is_dir = true; + retval = ext4_rename_dir_prepare(handle, &new); + if (retval) + goto end_rename; + } + + /* + * Other than the special case of overwriting a directory, parents' + * nlink only needs to be modified if this is a cross directory rename. + */ + if (old.dir != new.dir && old.is_dir != new.is_dir) { + old.dir_nlink_delta = old.is_dir ? -1 : 1; + new.dir_nlink_delta = -old.dir_nlink_delta; + retval = -EMLINK; + if ((old.dir_nlink_delta > 0 && EXT4_DIR_LINK_MAX(old.dir)) || + (new.dir_nlink_delta > 0 && EXT4_DIR_LINK_MAX(new.dir))) + goto end_rename; + } + + new_file_type = new.de->file_type; + retval = ext4_setent(handle, &new, old.inode->i_ino, old.de->file_type); + if (retval) + goto end_rename; + + retval = ext4_setent(handle, &old, new.inode->i_ino, new_file_type); + if (retval) + goto end_rename; + + /* + * Like most other Unix systems, set the ctime for inodes on a + * rename. + */ + old.inode->i_ctime = ext4_current_time(old.inode); + new.inode->i_ctime = ext4_current_time(new.inode); + ext4_mark_inode_dirty(handle, old.inode); + ext4_mark_inode_dirty(handle, new.inode); + + if (old.dir_bh) { + retval = ext4_rename_dir_finish(handle, &old, new.dir->i_ino); + if (retval) + goto end_rename; + } + if (new.dir_bh) { + retval = ext4_rename_dir_finish(handle, &new, old.dir->i_ino); + if (retval) + goto end_rename; + } + ext4_update_dir_count(handle, &old); + ext4_update_dir_count(handle, &new); + retval = 0; + +end_rename: + brelse(old.dir_bh); + brelse(new.dir_bh); + brelse(old.bh); + brelse(new.bh); + if (handle) + ext4_journal_stop(handle); return retval; } +static int ext4_rename2(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry, + unsigned int flags) +{ + if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE)) + return -EINVAL; + + if (flags & RENAME_EXCHANGE) { + return ext4_cross_rename(old_dir, old_dentry, + new_dir, new_dentry); + } + /* + * Existence checking was done by the VFS, otherwise "RENAME_NOREPLACE" + * is equivalent to regular rename. + */ + return ext4_rename(old_dir, old_dentry, new_dir, new_dentry); +} + /* * directories can handle most operations... */ @@ -2577,25 +3454,25 @@ const struct inode_operations ext4_dir_inode_operations = { .mkdir = ext4_mkdir, .rmdir = ext4_rmdir, .mknod = ext4_mknod, + .tmpfile = ext4_tmpfile, .rename = ext4_rename, + .rename2 = ext4_rename2, .setattr = ext4_setattr, -#ifdef CONFIG_EXT4_FS_XATTR .setxattr = generic_setxattr, .getxattr = generic_getxattr, .listxattr = ext4_listxattr, .removexattr = generic_removexattr, -#endif .get_acl = ext4_get_acl, + .set_acl = ext4_set_acl, .fiemap = ext4_fiemap, }; const struct inode_operations ext4_special_inode_operations = { .setattr = ext4_setattr, -#ifdef CONFIG_EXT4_FS_XATTR .setxattr = generic_setxattr, .getxattr = generic_getxattr, .listxattr = ext4_listxattr, .removexattr = generic_removexattr, -#endif .get_acl = ext4_get_acl, + .set_acl = ext4_set_acl, }; diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 47585189651..b24a2541a9b 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -18,87 +18,146 @@ #include <linux/pagevec.h> #include <linux/mpage.h> #include <linux/namei.h> +#include <linux/aio.h> #include <linux/uio.h> #include <linux/bio.h> #include <linux/workqueue.h> #include <linux/kernel.h> #include <linux/slab.h> +#include <linux/mm.h> +#include <linux/ratelimit.h> #include "ext4_jbd2.h" #include "xattr.h" #include "acl.h" -#include "ext4_extents.h" -static struct kmem_cache *io_page_cachep, *io_end_cachep; +static struct kmem_cache *io_end_cachep; int __init ext4_init_pageio(void) { - io_page_cachep = KMEM_CACHE(ext4_io_page, SLAB_RECLAIM_ACCOUNT); - if (io_page_cachep == NULL) - return -ENOMEM; io_end_cachep = KMEM_CACHE(ext4_io_end, SLAB_RECLAIM_ACCOUNT); - if (io_end_cachep == NULL) { - kmem_cache_destroy(io_page_cachep); + if (io_end_cachep == NULL) return -ENOMEM; - } return 0; } void ext4_exit_pageio(void) { kmem_cache_destroy(io_end_cachep); - kmem_cache_destroy(io_page_cachep); } -void ext4_ioend_wait(struct inode *inode) +/* + * Print an buffer I/O error compatible with the fs/buffer.c. This + * provides compatibility with dmesg scrapers that look for a specific + * buffer I/O error message. We really need a unified error reporting + * structure to userspace ala Digital Unix's uerf system, but it's + * probably not going to happen in my lifetime, due to LKML politics... + */ +static void buffer_io_error(struct buffer_head *bh) { - wait_queue_head_t *wq = ext4_ioend_wq(inode); + char b[BDEVNAME_SIZE]; + printk_ratelimited(KERN_ERR "Buffer I/O error on device %s, logical block %llu\n", + bdevname(bh->b_bdev, b), + (unsigned long long)bh->b_blocknr); +} + +static void ext4_finish_bio(struct bio *bio) +{ + int i; + int error = !test_bit(BIO_UPTODATE, &bio->bi_flags); + struct bio_vec *bvec; + + bio_for_each_segment_all(bvec, bio, i) { + struct page *page = bvec->bv_page; + struct buffer_head *bh, *head; + unsigned bio_start = bvec->bv_offset; + unsigned bio_end = bio_start + bvec->bv_len; + unsigned under_io = 0; + unsigned long flags; - wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_ioend_count) == 0)); + if (!page) + continue; + + if (error) { + SetPageError(page); + set_bit(AS_EIO, &page->mapping->flags); + } + bh = head = page_buffers(page); + /* + * We check all buffers in the page under BH_Uptodate_Lock + * to avoid races with other end io clearing async_write flags + */ + local_irq_save(flags); + bit_spin_lock(BH_Uptodate_Lock, &head->b_state); + do { + if (bh_offset(bh) < bio_start || + bh_offset(bh) + bh->b_size > bio_end) { + if (buffer_async_write(bh)) + under_io++; + continue; + } + clear_buffer_async_write(bh); + if (error) + buffer_io_error(bh); + } while ((bh = bh->b_this_page) != head); + bit_spin_unlock(BH_Uptodate_Lock, &head->b_state); + local_irq_restore(flags); + if (!under_io) + end_page_writeback(page); + } } -static void put_io_page(struct ext4_io_page *io_page) +static void ext4_release_io_end(ext4_io_end_t *io_end) { - if (atomic_dec_and_test(&io_page->p_count)) { - end_page_writeback(io_page->p_page); - put_page(io_page->p_page); - kmem_cache_free(io_page_cachep, io_page); + struct bio *bio, *next_bio; + + BUG_ON(!list_empty(&io_end->list)); + BUG_ON(io_end->flag & EXT4_IO_END_UNWRITTEN); + WARN_ON(io_end->handle); + + if (atomic_dec_and_test(&EXT4_I(io_end->inode)->i_ioend_count)) + wake_up_all(ext4_ioend_wq(io_end->inode)); + + for (bio = io_end->bio; bio; bio = next_bio) { + next_bio = bio->bi_private; + ext4_finish_bio(bio); + bio_put(bio); } + kmem_cache_free(io_end_cachep, io_end); } -void ext4_free_io_end(ext4_io_end_t *io) +static void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end) { - int i; + struct inode *inode = io_end->inode; - BUG_ON(!io); - if (io->page) - put_page(io->page); - for (i = 0; i < io->num_io_pages; i++) - put_io_page(io->pages[i]); - io->num_io_pages = 0; - if (atomic_dec_and_test(&EXT4_I(io->inode)->i_ioend_count)) - wake_up_all(ext4_ioend_wq(io->inode)); - kmem_cache_free(io_end_cachep, io); + io_end->flag &= ~EXT4_IO_END_UNWRITTEN; + /* Wake up anyone waiting on unwritten extent conversion */ + if (atomic_dec_and_test(&EXT4_I(inode)->i_unwritten)) + wake_up_all(ext4_ioend_wq(inode)); } /* - * check a range of space and convert unwritten extents to written. - * - * Called with inode->i_mutex; we depend on this when we manipulate - * io->flag, since we could otherwise race with ext4_flush_completed_IO() + * Check a range of space and convert unwritten extents to written. Note that + * we are protected from truncate touching same part of extent tree by the + * fact that truncate code waits for all DIO to finish (thus exclusion from + * direct IO is achieved) and also waits for PageWriteback bits. Thus we + * cannot get to ext4_ext_truncate() before all IOs overlapping that range are + * completed (happens from ext4_free_ioend()). */ -int ext4_end_io_nolock(ext4_io_end_t *io) +static int ext4_end_io(ext4_io_end_t *io) { struct inode *inode = io->inode; loff_t offset = io->offset; ssize_t size = io->size; + handle_t *handle = io->handle; int ret = 0; ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu,list->next 0x%p," "list->prev 0x%p\n", io, inode->i_ino, io->list.next, io->list.prev); - ret = ext4_convert_unwritten_extents(inode, offset, size); + io->handle = NULL; /* Following call will use up the handle */ + ret = ext4_convert_unwritten_extents(handle, inode, offset, size); if (ret < 0) { ext4_msg(inode->i_sb, KERN_EMERG, "failed to convert unwritten extents to written " @@ -106,56 +165,87 @@ int ext4_end_io_nolock(ext4_io_end_t *io) "(inode %lu, offset %llu, size %zd, error %d)", inode->i_ino, offset, size, ret); } + ext4_clear_io_unwritten_flag(io); + ext4_release_io_end(io); + return ret; +} - if (io->iocb) - aio_complete(io->iocb, io->result, 0); +static void dump_completed_IO(struct inode *inode, struct list_head *head) +{ +#ifdef EXT4FS_DEBUG + struct list_head *cur, *before, *after; + ext4_io_end_t *io, *io0, *io1; - /* Wake up anyone waiting on unwritten extent conversion */ - if (atomic_dec_and_test(&EXT4_I(inode)->i_aiodio_unwritten)) - wake_up_all(ext4_ioend_wq(io->inode)); - return ret; + if (list_empty(head)) + return; + + ext4_debug("Dump inode %lu completed io list\n", inode->i_ino); + list_for_each_entry(io, head, list) { + cur = &io->list; + before = cur->prev; + io0 = container_of(before, ext4_io_end_t, list); + after = cur->next; + io1 = container_of(after, ext4_io_end_t, list); + + ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n", + io, inode->i_ino, io0, io1); + } +#endif } -/* - * work on completed aio dio IO, to convert unwritten extents to extents - */ -static void ext4_end_io_work(struct work_struct *work) +/* Add the io_end to per-inode completed end_io list. */ +static void ext4_add_complete_io(ext4_io_end_t *io_end) { - ext4_io_end_t *io = container_of(work, ext4_io_end_t, work); - struct inode *inode = io->inode; - struct ext4_inode_info *ei = EXT4_I(inode); - unsigned long flags; + struct ext4_inode_info *ei = EXT4_I(io_end->inode); + struct ext4_sb_info *sbi = EXT4_SB(io_end->inode->i_sb); + struct workqueue_struct *wq; + unsigned long flags; + /* Only reserved conversions from writeback should enter here */ + WARN_ON(!(io_end->flag & EXT4_IO_END_UNWRITTEN)); + WARN_ON(!io_end->handle && sbi->s_journal); spin_lock_irqsave(&ei->i_completed_io_lock, flags); - if (list_empty(&io->list)) { - spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); - goto free; - } + wq = sbi->rsv_conversion_wq; + if (list_empty(&ei->i_rsv_conversion_list)) + queue_work(wq, &ei->i_rsv_conversion_work); + list_add_tail(&io_end->list, &ei->i_rsv_conversion_list); + spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); +} - if (!mutex_trylock(&inode->i_mutex)) { - spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); - /* - * Requeue the work instead of waiting so that the work - * items queued after this can be processed. - */ - queue_work(EXT4_SB(inode->i_sb)->dio_unwritten_wq, &io->work); - /* - * To prevent the ext4-dio-unwritten thread from keeping - * requeueing end_io requests and occupying cpu for too long, - * yield the cpu if it sees an end_io request that has already - * been requeued. - */ - if (io->flag & EXT4_IO_END_QUEUED) - yield(); - io->flag |= EXT4_IO_END_QUEUED; - return; - } - list_del_init(&io->list); +static int ext4_do_flush_completed_IO(struct inode *inode, + struct list_head *head) +{ + ext4_io_end_t *io; + struct list_head unwritten; + unsigned long flags; + struct ext4_inode_info *ei = EXT4_I(inode); + int err, ret = 0; + + spin_lock_irqsave(&ei->i_completed_io_lock, flags); + dump_completed_IO(inode, head); + list_replace_init(head, &unwritten); spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); - (void) ext4_end_io_nolock(io); - mutex_unlock(&inode->i_mutex); -free: - ext4_free_io_end(io); + + while (!list_empty(&unwritten)) { + io = list_entry(unwritten.next, ext4_io_end_t, list); + BUG_ON(!(io->flag & EXT4_IO_END_UNWRITTEN)); + list_del_init(&io->list); + + err = ext4_end_io(io); + if (unlikely(!ret && err)) + ret = err; + } + return ret; +} + +/* + * work on completed IO, to convert unwritten extents to extents + */ +void ext4_end_io_rsv_work(struct work_struct *work) +{ + struct ext4_inode_info *ei = container_of(work, struct ext4_inode_info, + i_rsv_conversion_work); + ext4_do_flush_completed_IO(&ei->vfs_inode, &ei->i_rsv_conversion_list); } ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags) @@ -164,98 +254,87 @@ ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags) if (io) { atomic_inc(&EXT4_I(inode)->i_ioend_count); io->inode = inode; - INIT_WORK(&io->work, ext4_end_io_work); INIT_LIST_HEAD(&io->list); + atomic_set(&io->count, 1); } return io; } -/* - * Print an buffer I/O error compatible with the fs/buffer.c. This - * provides compatibility with dmesg scrapers that look for a specific - * buffer I/O error message. We really need a unified error reporting - * structure to userspace ala Digital Unix's uerf system, but it's - * probably not going to happen in my lifetime, due to LKML politics... - */ -static void buffer_io_error(struct buffer_head *bh) +void ext4_put_io_end_defer(ext4_io_end_t *io_end) { - char b[BDEVNAME_SIZE]; - printk(KERN_ERR "Buffer I/O error on device %s, logical block %llu\n", - bdevname(bh->b_bdev, b), - (unsigned long long)bh->b_blocknr); + if (atomic_dec_and_test(&io_end->count)) { + if (!(io_end->flag & EXT4_IO_END_UNWRITTEN) || !io_end->size) { + ext4_release_io_end(io_end); + return; + } + ext4_add_complete_io(io_end); + } } +int ext4_put_io_end(ext4_io_end_t *io_end) +{ + int err = 0; + + if (atomic_dec_and_test(&io_end->count)) { + if (io_end->flag & EXT4_IO_END_UNWRITTEN) { + err = ext4_convert_unwritten_extents(io_end->handle, + io_end->inode, io_end->offset, + io_end->size); + io_end->handle = NULL; + ext4_clear_io_unwritten_flag(io_end); + } + ext4_release_io_end(io_end); + } + return err; +} + +ext4_io_end_t *ext4_get_io_end(ext4_io_end_t *io_end) +{ + atomic_inc(&io_end->count); + return io_end; +} + +/* BIO completion function for page writeback */ static void ext4_end_bio(struct bio *bio, int error) { ext4_io_end_t *io_end = bio->bi_private; - struct workqueue_struct *wq; - struct inode *inode; - unsigned long flags; - int i; - sector_t bi_sector = bio->bi_sector; + sector_t bi_sector = bio->bi_iter.bi_sector; BUG_ON(!io_end); - bio->bi_private = NULL; bio->bi_end_io = NULL; if (test_bit(BIO_UPTODATE, &bio->bi_flags)) error = 0; - bio_put(bio); - - for (i = 0; i < io_end->num_io_pages; i++) { - struct page *page = io_end->pages[i]->p_page; - struct buffer_head *bh, *head; - loff_t offset; - loff_t io_end_offset; - - if (error) { - SetPageError(page); - set_bit(AS_EIO, &page->mapping->flags); - head = page_buffers(page); - BUG_ON(!head); - - io_end_offset = io_end->offset + io_end->size; - - offset = (sector_t) page->index << PAGE_CACHE_SHIFT; - bh = head; - do { - if ((offset >= io_end->offset) && - (offset+bh->b_size <= io_end_offset)) - buffer_io_error(bh); - - offset += bh->b_size; - bh = bh->b_this_page; - } while (bh != head); - } - - put_io_page(io_end->pages[i]); - } - io_end->num_io_pages = 0; - inode = io_end->inode; if (error) { - io_end->flag |= EXT4_IO_END_ERROR; - ext4_warning(inode->i_sb, "I/O error writing to inode %lu " + struct inode *inode = io_end->inode; + + ext4_warning(inode->i_sb, "I/O error %d writing to inode %lu " "(offset %llu size %ld starting block %llu)", - inode->i_ino, + error, inode->i_ino, (unsigned long long) io_end->offset, (long) io_end->size, (unsigned long long) bi_sector >> (inode->i_blkbits - 9)); + mapping_set_error(inode->i_mapping, error); } - if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) { - ext4_free_io_end(io_end); - return; + if (io_end->flag & EXT4_IO_END_UNWRITTEN) { + /* + * Link bio into list hanging from io_end. We have to do it + * atomically as bio completions can be racing against each + * other. + */ + bio->bi_private = xchg(&io_end->bio, bio); + ext4_put_io_end_defer(io_end); + } else { + /* + * Drop io_end reference early. Inode can get freed once + * we finish the bio. + */ + ext4_put_io_end_defer(io_end); + ext4_finish_bio(bio); + bio_put(bio); } - - /* Add the io_end to per-inode completed io list*/ - spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags); - list_add_tail(&io_end->list, &EXT4_I(inode)->i_completed_io_list); - spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags); - - wq = EXT4_SB(inode->i_sb)->dio_unwritten_wq; - /* queue the work to convert unwritten extents to written */ - queue_work(wq, &io_end->work); } void ext4_io_submit(struct ext4_io_submit *io) @@ -269,158 +348,150 @@ void ext4_io_submit(struct ext4_io_submit *io) bio_put(io->io_bio); } io->io_bio = NULL; - io->io_op = 0; +} + +void ext4_io_submit_init(struct ext4_io_submit *io, + struct writeback_control *wbc) +{ + io->io_op = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE); + io->io_bio = NULL; io->io_end = NULL; } -static int io_submit_init(struct ext4_io_submit *io, - struct inode *inode, - struct writeback_control *wbc, - struct buffer_head *bh) +static int io_submit_init_bio(struct ext4_io_submit *io, + struct buffer_head *bh) { - ext4_io_end_t *io_end; - struct page *page = bh->b_page; int nvecs = bio_get_nr_vecs(bh->b_bdev); struct bio *bio; - io_end = ext4_init_io_end(inode, GFP_NOFS); - if (!io_end) - return -ENOMEM; bio = bio_alloc(GFP_NOIO, min(nvecs, BIO_MAX_PAGES)); - bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9); + if (!bio) + return -ENOMEM; + bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9); bio->bi_bdev = bh->b_bdev; - bio->bi_private = io->io_end = io_end; bio->bi_end_io = ext4_end_bio; - - io_end->offset = (page->index << PAGE_CACHE_SHIFT) + bh_offset(bh); - + bio->bi_private = ext4_get_io_end(io->io_end); io->io_bio = bio; - io->io_op = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE); io->io_next_block = bh->b_blocknr; return 0; } static int io_submit_add_bh(struct ext4_io_submit *io, - struct ext4_io_page *io_page, struct inode *inode, - struct writeback_control *wbc, struct buffer_head *bh) { - ext4_io_end_t *io_end; int ret; - if (buffer_new(bh)) { - clear_buffer_new(bh); - unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr); - } - - if (!buffer_mapped(bh) || buffer_delay(bh)) { - if (!buffer_mapped(bh)) - clear_buffer_dirty(bh); - if (io->io_bio) - ext4_io_submit(io); - return 0; - } - if (io->io_bio && bh->b_blocknr != io->io_next_block) { submit_and_retry: ext4_io_submit(io); } if (io->io_bio == NULL) { - ret = io_submit_init(io, inode, wbc, bh); + ret = io_submit_init_bio(io, bh); if (ret) return ret; } - io_end = io->io_end; - if ((io_end->num_io_pages >= MAX_IO_PAGES) && - (io_end->pages[io_end->num_io_pages-1] != io_page)) - goto submit_and_retry; - if (buffer_uninit(bh)) - ext4_set_io_unwritten_flag(inode, io_end); - io->io_end->size += bh->b_size; - io->io_next_block++; ret = bio_add_page(io->io_bio, bh->b_page, bh->b_size, bh_offset(bh)); if (ret != bh->b_size) goto submit_and_retry; - if ((io_end->num_io_pages == 0) || - (io_end->pages[io_end->num_io_pages-1] != io_page)) { - io_end->pages[io_end->num_io_pages++] = io_page; - atomic_inc(&io_page->p_count); - } + io->io_next_block++; return 0; } int ext4_bio_write_page(struct ext4_io_submit *io, struct page *page, int len, - struct writeback_control *wbc) + struct writeback_control *wbc, + bool keep_towrite) { struct inode *inode = page->mapping->host; - unsigned block_start, block_end, blocksize; - struct ext4_io_page *io_page; + unsigned block_start, blocksize; struct buffer_head *bh, *head; int ret = 0; + int nr_submitted = 0; blocksize = 1 << inode->i_blkbits; BUG_ON(!PageLocked(page)); BUG_ON(PageWriteback(page)); - io_page = kmem_cache_alloc(io_page_cachep, GFP_NOFS); - if (!io_page) { - set_page_dirty(page); - unlock_page(page); - return -ENOMEM; - } - io_page->p_page = page; - atomic_set(&io_page->p_count, 1); - get_page(page); - set_page_writeback(page); + if (keep_towrite) + set_page_writeback_keepwrite(page); + else + set_page_writeback(page); ClearPageError(page); - for (bh = head = page_buffers(page), block_start = 0; - bh != head || !block_start; - block_start = block_end, bh = bh->b_this_page) { - - block_end = block_start + blocksize; + /* + * Comments copied from block_write_full_page: + * + * The page straddles i_size. It must be zeroed out on each and every + * writepage invocation because it may be mmapped. "A file is mapped + * in multiples of the page size. For a file that is not a multiple of + * the page size, the remaining memory is zeroed when mapped, and + * writes to that region are not written out to the file." + */ + if (len < PAGE_CACHE_SIZE) + zero_user_segment(page, len, PAGE_CACHE_SIZE); + /* + * In the first loop we prepare and mark buffers to submit. We have to + * mark all buffers in the page before submitting so that + * end_page_writeback() cannot be called from ext4_bio_end_io() when IO + * on the first buffer finishes and we are still working on submitting + * the second buffer. + */ + bh = head = page_buffers(page); + do { + block_start = bh_offset(bh); if (block_start >= len) { - /* - * Comments copied from block_write_full_page_endio: - * - * The page straddles i_size. It must be zeroed out on - * each and every writepage invocation because it may - * be mmapped. "A file is mapped in multiples of the - * page size. For a file that is not a multiple of - * the page size, the remaining memory is zeroed when - * mapped, and writes to that region are not written - * out to the file." - */ - zero_user_segment(page, block_start, block_end); clear_buffer_dirty(bh); set_buffer_uptodate(bh); continue; } - clear_buffer_dirty(bh); - ret = io_submit_add_bh(io, io_page, inode, wbc, bh); + if (!buffer_dirty(bh) || buffer_delay(bh) || + !buffer_mapped(bh) || buffer_unwritten(bh)) { + /* A hole? We can safely clear the dirty bit */ + if (!buffer_mapped(bh)) + clear_buffer_dirty(bh); + if (io->io_bio) + ext4_io_submit(io); + continue; + } + if (buffer_new(bh)) { + clear_buffer_new(bh); + unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr); + } + set_buffer_async_write(bh); + } while ((bh = bh->b_this_page) != head); + + /* Now submit buffers to write */ + bh = head = page_buffers(page); + do { + if (!buffer_async_write(bh)) + continue; + ret = io_submit_add_bh(io, inode, bh); if (ret) { /* * We only get here on ENOMEM. Not much else * we can do but mark the page as dirty, and * better luck next time. */ - set_page_dirty(page); + redirty_page_for_writepage(wbc, page); break; } + nr_submitted++; + clear_buffer_dirty(bh); + } while ((bh = bh->b_this_page) != head); + + /* Error stopped previous loop? Clean up buffers... */ + if (ret) { + do { + clear_buffer_async_write(bh); + bh = bh->b_this_page; + } while (bh != head); } unlock_page(page); - /* - * If the page was truncated before we could do the writeback, - * or we had a memory allocation error while trying to write - * the first buffer head, we won't have submitted any pages for - * I/O. In that case we need to make sure we've cleared the - * PageWriteback bit from the page to prevent the system from - * wedging later on. - */ - put_io_page(io_page); + /* Nothing submitted - we have to end page writeback */ + if (!nr_submitted) + end_page_writeback(page); return ret; } diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index f9d948f0eb8..bb0e80f03e2 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -42,7 +42,29 @@ int ext4_resize_begin(struct super_block *sb) void ext4_resize_end(struct super_block *sb) { clear_bit_unlock(EXT4_RESIZING, &EXT4_SB(sb)->s_resize_flags); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); +} + +static ext4_group_t ext4_meta_bg_first_group(struct super_block *sb, + ext4_group_t group) { + return (group >> EXT4_DESC_PER_BLOCK_BITS(sb)) << + EXT4_DESC_PER_BLOCK_BITS(sb); +} + +static ext4_fsblk_t ext4_meta_bg_first_block_no(struct super_block *sb, + ext4_group_t group) { + group = ext4_meta_bg_first_group(sb, group); + return ext4_group_first_block_no(sb, group); +} + +static ext4_grpblk_t ext4_group_overhead_blocks(struct super_block *sb, + ext4_group_t group) { + ext4_grpblk_t overhead; + overhead = ext4_bg_num_gdb(sb, group); + if (ext4_bg_has_super(sb, group)) + overhead += 1 + + le16_to_cpu(EXT4_SB(sb)->s_es->s_reserved_gdt_blocks); + return overhead; } #define outside(b, first, last) ((b) < (first) || (b) >= (last)) @@ -57,14 +79,20 @@ static int verify_group_input(struct super_block *sb, ext4_fsblk_t end = start + input->blocks_count; ext4_group_t group = input->group; ext4_fsblk_t itend = input->inode_table + sbi->s_itb_per_group; - unsigned overhead = ext4_bg_has_super(sb, group) ? - (1 + ext4_bg_num_gdb(sb, group) + - le16_to_cpu(es->s_reserved_gdt_blocks)) : 0; - ext4_fsblk_t metaend = start + overhead; + unsigned overhead; + ext4_fsblk_t metaend; struct buffer_head *bh = NULL; ext4_grpblk_t free_blocks_count, offset; int err = -EINVAL; + if (group != sbi->s_groups_count) { + ext4_warning(sb, "Cannot add at group %u (only %u groups)", + input->group, sbi->s_groups_count); + return -EINVAL; + } + + overhead = ext4_group_overhead_blocks(sb, group); + metaend = start + overhead; input->free_blocks_count = free_blocks_count = input->blocks_count - 2 - overhead - sbi->s_itb_per_group; @@ -76,10 +104,7 @@ static int verify_group_input(struct super_block *sb, free_blocks_count, input->reserved_blocks); ext4_get_group_no_and_offset(sb, start, NULL, &offset); - if (group != sbi->s_groups_count) - ext4_warning(sb, "Cannot add at group %u (only %u groups)", - input->group, sbi->s_groups_count); - else if (offset != 0) + if (offset != 0) ext4_warning(sb, "Last group not full"); else if (input->reserved_blocks > input->blocks_count / 5) ext4_warning(sb, "Reserved blocks too high (%u)", @@ -161,6 +186,8 @@ static struct ext4_new_flex_group_data *alloc_flex_gd(unsigned long flexbg_size) if (flex_gd == NULL) goto out3; + if (flexbg_size >= UINT_MAX / sizeof(struct ext4_new_flex_group_data)) + goto out2; flex_gd->count = flexbg_size; flex_gd->groups = kmalloc(sizeof(struct ext4_new_group_data) * @@ -198,13 +225,15 @@ static void free_flex_gd(struct ext4_new_flex_group_data *flex_gd) * be a partial of a flex group. * * @sb: super block of fs to which the groups belongs + * + * Returns 0 on a successful allocation of the metadata blocks in the + * block group. */ -static void ext4_alloc_group_tables(struct super_block *sb, +static int ext4_alloc_group_tables(struct super_block *sb, struct ext4_new_flex_group_data *flex_gd, int flexbg_size) { struct ext4_new_group_data *group_data = flex_gd->groups; - struct ext4_super_block *es = EXT4_SB(sb)->s_es; ext4_fsblk_t start_blk; ext4_fsblk_t last_blk; ext4_group_t src_group; @@ -214,6 +243,7 @@ static void ext4_alloc_group_tables(struct super_block *sb, ext4_group_t group; ext4_group_t last_group; unsigned overhead; + __u16 uninit_mask = (flexbg_size > 1) ? ~EXT4_BG_BLOCK_UNINIT : ~0; BUG_ON(flex_gd->count == 0 || group_data == NULL); @@ -224,34 +254,34 @@ static void ext4_alloc_group_tables(struct super_block *sb, (last_group & ~(flexbg_size - 1)))); next_group: group = group_data[0].group; + if (src_group >= group_data[0].group + flex_gd->count) + return -ENOSPC; start_blk = ext4_group_first_block_no(sb, src_group); last_blk = start_blk + group_data[src_group - group].blocks_count; - overhead = ext4_bg_has_super(sb, src_group) ? - (1 + ext4_bg_num_gdb(sb, src_group) + - le16_to_cpu(es->s_reserved_gdt_blocks)) : 0; + overhead = ext4_group_overhead_blocks(sb, src_group); start_blk += overhead; - BUG_ON(src_group >= group_data[0].group + flex_gd->count); /* We collect contiguous blocks as much as possible. */ src_group++; - for (; src_group <= last_group; src_group++) - if (!ext4_bg_has_super(sb, src_group)) + for (; src_group <= last_group; src_group++) { + overhead = ext4_group_overhead_blocks(sb, src_group); + if (overhead == 0) last_blk += group_data[src_group - group].blocks_count; else break; + } /* Allocate block bitmaps */ for (; bb_index < flex_gd->count; bb_index++) { if (start_blk >= last_blk) goto next_group; group_data[bb_index].block_bitmap = start_blk++; - ext4_get_group_no_and_offset(sb, start_blk - 1, &group, NULL); + group = ext4_get_group_number(sb, start_blk - 1); group -= group_data[0].group; group_data[group].free_blocks_count--; - if (flexbg_size > 1) - flex_gd->bg_flags[group] &= ~EXT4_BG_BLOCK_UNINIT; + flex_gd->bg_flags[group] &= uninit_mask; } /* Allocate inode bitmaps */ @@ -259,25 +289,33 @@ next_group: if (start_blk >= last_blk) goto next_group; group_data[ib_index].inode_bitmap = start_blk++; - ext4_get_group_no_and_offset(sb, start_blk - 1, &group, NULL); + group = ext4_get_group_number(sb, start_blk - 1); group -= group_data[0].group; group_data[group].free_blocks_count--; - if (flexbg_size > 1) - flex_gd->bg_flags[group] &= ~EXT4_BG_BLOCK_UNINIT; + flex_gd->bg_flags[group] &= uninit_mask; } /* Allocate inode tables */ for (; it_index < flex_gd->count; it_index++) { - if (start_blk + EXT4_SB(sb)->s_itb_per_group > last_blk) + unsigned int itb = EXT4_SB(sb)->s_itb_per_group; + ext4_fsblk_t next_group_start; + + if (start_blk + itb > last_blk) goto next_group; group_data[it_index].inode_table = start_blk; - ext4_get_group_no_and_offset(sb, start_blk, &group, NULL); + group = ext4_get_group_number(sb, start_blk); + next_group_start = ext4_group_first_block_no(sb, group + 1); group -= group_data[0].group; - group_data[group].free_blocks_count -= - EXT4_SB(sb)->s_itb_per_group; - if (flexbg_size > 1) - flex_gd->bg_flags[group] &= ~EXT4_BG_BLOCK_UNINIT; + if (start_blk + itb > next_group_start) { + flex_gd->bg_flags[group + 1] &= uninit_mask; + overhead = start_blk + itb - next_group_start; + group_data[group + 1].free_blocks_count -= overhead; + itb -= overhead; + } + + group_data[group].free_blocks_count -= itb; + flex_gd->bg_flags[group] &= uninit_mask; start_blk += EXT4_SB(sb)->s_itb_per_group; } @@ -298,6 +336,7 @@ next_group: group_data[i].free_blocks_count); } } + return 0; } static struct buffer_head *bclean(handle_t *handle, struct super_block *sb, @@ -307,8 +346,9 @@ static struct buffer_head *bclean(handle_t *handle, struct super_block *sb, int err; bh = sb_getblk(sb, blk); - if (!bh) - return ERR_PTR(-EIO); + if (unlikely(!bh)) + return ERR_PTR(-ENOMEM); + BUFFER_TRACE(bh, "get_write_access"); if ((err = ext4_journal_get_write_access(handle, bh))) { brelse(bh); bh = ERR_PTR(err); @@ -366,11 +406,11 @@ static int set_flexbg_block_bitmap(struct super_block *sb, handle_t *handle, ext4_group_t group; int err; - ext4_get_group_no_and_offset(sb, block, &group, NULL); + group = ext4_get_group_number(sb, block); start = ext4_group_first_block_no(sb, group); group -= flex_gd->groups[0].group; - count2 = sb->s_blocksize * 8 - (block - start); + count2 = EXT4_BLOCKS_PER_GROUP(sb) - (block - start); if (count2 > count) count2 = count; @@ -384,9 +424,10 @@ static int set_flexbg_block_bitmap(struct super_block *sb, handle_t *handle, return err; bh = sb_getblk(sb, flex_gd->groups[group].block_bitmap); - if (!bh) - return -EIO; + if (unlikely(!bh)) + return -ENOMEM; + BUFFER_TRACE(bh, "get_write_access"); err = ext4_journal_get_write_access(handle, bh); if (err) return err; @@ -431,26 +472,41 @@ static int setup_new_flex_group_blocks(struct super_block *sb, ext4_group_t group, count; struct buffer_head *bh = NULL; int reserved_gdb, i, j, err = 0, err2; + int meta_bg; BUG_ON(!flex_gd->count || !group_data || group_data[0].group != sbi->s_groups_count); reserved_gdb = le16_to_cpu(es->s_reserved_gdt_blocks); + meta_bg = EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_META_BG); /* This transaction may be extended/restarted along the way */ - handle = ext4_journal_start_sb(sb, EXT4_MAX_TRANS_DATA); + handle = ext4_journal_start_sb(sb, EXT4_HT_RESIZE, EXT4_MAX_TRANS_DATA); if (IS_ERR(handle)) return PTR_ERR(handle); group = group_data[0].group; for (i = 0; i < flex_gd->count; i++, group++) { unsigned long gdblocks; + ext4_grpblk_t overhead; gdblocks = ext4_bg_num_gdb(sb, group); start = ext4_group_first_block_no(sb, group); + if (meta_bg == 0 && !ext4_bg_has_super(sb, group)) + goto handle_itb; + + if (meta_bg == 1) { + ext4_group_t first_group; + first_group = ext4_meta_bg_first_group(sb, group); + if (first_group != group + 1 && + first_group != group + EXT4_DESC_PER_BLOCK(sb) - 1) + goto handle_itb; + } + + block = start + ext4_bg_has_super(sb, group); /* Copy all of the GDT blocks into the backup in this group */ - for (j = 0, block = start + 1; j < gdblocks; j++, block++) { + for (j = 0; j < gdblocks; j++, block++) { struct buffer_head *gdb; ext4_debug("update backup group %#04llx\n", block); @@ -459,11 +515,12 @@ static int setup_new_flex_group_blocks(struct super_block *sb, goto out; gdb = sb_getblk(sb, block); - if (!gdb) { - err = -EIO; + if (unlikely(!gdb)) { + err = -ENOMEM; goto out; } + BUFFER_TRACE(gdb, "get_write_access"); err = ext4_journal_get_write_access(handle, gdb); if (err) { brelse(gdb); @@ -491,6 +548,7 @@ static int setup_new_flex_group_blocks(struct super_block *sb, goto out; } +handle_itb: /* Initialize group tables of the grop @group */ if (!(bg_flags[i] & EXT4_BG_INODE_ZEROED)) goto handle_bb; @@ -519,11 +577,11 @@ handle_bb: err = PTR_ERR(bh); goto out; } - if (ext4_bg_has_super(sb, group)) { + overhead = ext4_group_overhead_blocks(sb, group); + if (overhead != 0) { ext4_debug("mark backup superblock %#04llx (+0)\n", start); - ext4_set_bits(bh->b_data, 0, gdblocks + reserved_gdb + - 1); + ext4_set_bits(bh->b_data, 0, overhead); } ext4_mark_bitmap_end(group_data[i].blocks_count, sb->s_blocksize * 8, bh->b_data); @@ -573,7 +631,7 @@ handle_ib: if (err) goto out; count = group_table_count[j]; - start = group_data[i].block_bitmap; + start = (&group_data[i].block_bitmap)[j]; block = start; } @@ -735,14 +793,17 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, goto exit_dind; } + BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get_write_access"); err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh); if (unlikely(err)) goto exit_dind; + BUFFER_TRACE(gdb_bh, "get_write_access"); err = ext4_journal_get_write_access(handle, gdb_bh); if (unlikely(err)) - goto exit_sbh; + goto exit_dind; + BUFFER_TRACE(dind, "get_write_access"); err = ext4_journal_get_write_access(handle, dind); if (unlikely(err)) ext4_std_error(sb, err); @@ -750,7 +811,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, /* ext4_reserve_inode_write() gets a reference on the iloc */ err = ext4_reserve_inode_write(handle, inode, &iloc); if (unlikely(err)) - goto exit_dindj; + goto exit_dind; n_group_desc = ext4_kvmalloc((gdb_num + 1) * sizeof(struct buffer_head *), @@ -796,7 +857,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, ext4_kvfree(o_group_desc); le16_add_cpu(&es->s_reserved_gdt_blocks, -1); - err = ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh); + err = ext4_handle_dirty_super(handle, sb); if (err) ext4_std_error(sb, err); @@ -804,12 +865,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, exit_inode: ext4_kvfree(n_group_desc); - /* ext4_handle_release_buffer(handle, iloc.bh); */ brelse(iloc.bh); -exit_dindj: - /* ext4_handle_release_buffer(handle, dind); */ -exit_sbh: - /* ext4_handle_release_buffer(handle, EXT4_SB(sb)->s_sbh); */ exit_dind: brelse(dind); exit_bh: @@ -820,6 +876,46 @@ exit_bh: } /* + * add_new_gdb_meta_bg is the sister of add_new_gdb. + */ +static int add_new_gdb_meta_bg(struct super_block *sb, + handle_t *handle, ext4_group_t group) { + ext4_fsblk_t gdblock; + struct buffer_head *gdb_bh; + struct buffer_head **o_group_desc, **n_group_desc; + unsigned long gdb_num = group / EXT4_DESC_PER_BLOCK(sb); + int err; + + gdblock = ext4_meta_bg_first_block_no(sb, group) + + ext4_bg_has_super(sb, group); + gdb_bh = sb_bread(sb, gdblock); + if (!gdb_bh) + return -EIO; + n_group_desc = ext4_kvmalloc((gdb_num + 1) * + sizeof(struct buffer_head *), + GFP_NOFS); + if (!n_group_desc) { + err = -ENOMEM; + ext4_warning(sb, "not enough memory for %lu groups", + gdb_num + 1); + return err; + } + + o_group_desc = EXT4_SB(sb)->s_group_desc; + memcpy(n_group_desc, o_group_desc, + EXT4_SB(sb)->s_gdb_count * sizeof(struct buffer_head *)); + n_group_desc[gdb_num] = gdb_bh; + EXT4_SB(sb)->s_group_desc = n_group_desc; + EXT4_SB(sb)->s_gdb_count++; + ext4_kvfree(o_group_desc); + BUFFER_TRACE(gdb_bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, gdb_bh); + if (unlikely(err)) + brelse(gdb_bh); + return err; +} + +/* * Called when we are adding a new group which has a backup copy of each of * the GDT blocks (i.e. sparse group) and there are reserved GDT blocks. * We need to add these reserved backup GDT blocks to the resize inode, so @@ -888,14 +984,9 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode, } for (i = 0; i < reserved_gdb; i++) { - if ((err = ext4_journal_get_write_access(handle, primary[i]))) { - /* - int j; - for (j = 0; j < i; j++) - ext4_handle_release_buffer(handle, primary[j]); - */ + BUFFER_TRACE(primary[i], "get_write_access"); + if ((err = ext4_journal_get_write_access(handle, primary[i]))) goto exit_bh; - } } if ((err = ext4_reserve_inode_write(handle, inode, &iloc))) @@ -947,29 +1038,38 @@ exit_free: * do not copy the full number of backups at this time. The resize * which changed s_groups_count will backup again. */ -static void update_backups(struct super_block *sb, - int blk_off, char *data, int size) +static void update_backups(struct super_block *sb, int blk_off, char *data, + int size, int meta_bg) { struct ext4_sb_info *sbi = EXT4_SB(sb); - const ext4_group_t last = sbi->s_groups_count; + ext4_group_t last; const int bpg = EXT4_BLOCKS_PER_GROUP(sb); unsigned three = 1; unsigned five = 5; unsigned seven = 7; - ext4_group_t group; + ext4_group_t group = 0; int rest = sb->s_blocksize - size; handle_t *handle; int err = 0, err2; - handle = ext4_journal_start_sb(sb, EXT4_MAX_TRANS_DATA); + handle = ext4_journal_start_sb(sb, EXT4_HT_RESIZE, EXT4_MAX_TRANS_DATA); if (IS_ERR(handle)) { group = 1; err = PTR_ERR(handle); goto exit_err; } - while ((group = ext4_list_backups(sb, &three, &five, &seven)) < last) { + if (meta_bg == 0) { + group = ext4_list_backups(sb, &three, &five, &seven); + last = sbi->s_groups_count; + } else { + group = ext4_meta_bg_first_group(sb, group) + 1; + last = (ext4_group_t)(group + EXT4_DESC_PER_BLOCK(sb) - 2); + } + + while (group < sbi->s_groups_count) { struct buffer_head *bh; + ext4_fsblk_t backup_block; /* Out of journal space, and can't get more - abort - so sad */ if (ext4_handle_valid(handle) && @@ -978,13 +1078,21 @@ static void update_backups(struct super_block *sb, (err = ext4_journal_restart(handle, EXT4_MAX_TRANS_DATA))) break; - bh = sb_getblk(sb, group * bpg + blk_off); - if (!bh) { - err = -EIO; + if (meta_bg == 0) + backup_block = group * bpg + blk_off; + else + backup_block = (ext4_group_first_block_no(sb, group) + + ext4_bg_has_super(sb, group)); + + bh = sb_getblk(sb, backup_block); + if (unlikely(!bh)) { + err = -ENOMEM; break; } - ext4_debug("update metadata backup %#04lx\n", - (unsigned long)bh->b_blocknr); + ext4_debug("update metadata backup %llu(+%llu)\n", + backup_block, backup_block - + ext4_group_first_block_no(sb, group)); + BUFFER_TRACE(bh, "get_write_access"); if ((err = ext4_journal_get_write_access(handle, bh))) break; lock_buffer(bh); @@ -997,6 +1105,13 @@ static void update_backups(struct super_block *sb, if (unlikely(err)) ext4_std_error(sb, err); brelse(bh); + + if (meta_bg == 0) + group = ext4_list_backups(sb, &three, &five, &seven); + else if (group == last) + break; + else + group = last; } if ((err2 = ext4_journal_stop(handle)) && !err) err = err2; @@ -1039,7 +1154,9 @@ static int ext4_add_new_descs(handle_t *handle, struct super_block *sb, struct ext4_super_block *es = sbi->s_es; struct buffer_head *gdb_bh; int i, gdb_off, gdb_num, err = 0; + int meta_bg; + meta_bg = EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_META_BG); for (i = 0; i < count; i++, group++) { int reserved_gdb = ext4_bg_has_super(sb, group) ? le16_to_cpu(es->s_reserved_gdt_blocks) : 0; @@ -1055,18 +1172,64 @@ static int ext4_add_new_descs(handle_t *handle, struct super_block *sb, */ if (gdb_off) { gdb_bh = sbi->s_group_desc[gdb_num]; + BUFFER_TRACE(gdb_bh, "get_write_access"); err = ext4_journal_get_write_access(handle, gdb_bh); if (!err && reserved_gdb && ext4_bg_num_gdb(sb, group)) err = reserve_backup_gdb(handle, resize_inode, group); - } else + } else if (meta_bg != 0) { + err = add_new_gdb_meta_bg(sb, handle, group); + } else { err = add_new_gdb(handle, resize_inode, group); + } if (err) break; } return err; } +static struct buffer_head *ext4_get_bitmap(struct super_block *sb, __u64 block) +{ + struct buffer_head *bh = sb_getblk(sb, block); + if (unlikely(!bh)) + return NULL; + if (!bh_uptodate_or_lock(bh)) { + if (bh_submit_read(bh) < 0) { + brelse(bh); + return NULL; + } + } + + return bh; +} + +static int ext4_set_bitmap_checksums(struct super_block *sb, + ext4_group_t group, + struct ext4_group_desc *gdp, + struct ext4_new_group_data *group_data) +{ + struct buffer_head *bh; + + if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + return 0; + + bh = ext4_get_bitmap(sb, group_data->inode_bitmap); + if (!bh) + return -EIO; + ext4_inode_bitmap_csum_set(sb, group, gdp, bh, + EXT4_INODES_PER_GROUP(sb) / 8); + brelse(bh); + + bh = ext4_get_bitmap(sb, group_data->block_bitmap); + if (!bh) + return -EIO; + ext4_block_bitmap_csum_set(sb, group, gdp, bh); + brelse(bh); + + return 0; +} + /* * ext4_setup_new_descs() will set up the group descriptor descriptors of a flex bg */ @@ -1093,18 +1256,27 @@ static int ext4_setup_new_descs(handle_t *handle, struct super_block *sb, */ gdb_bh = sbi->s_group_desc[gdb_num]; /* Update group descriptor block for new group */ - gdp = (struct ext4_group_desc *)((char *)gdb_bh->b_data + + gdp = (struct ext4_group_desc *)(gdb_bh->b_data + gdb_off * EXT4_DESC_SIZE(sb)); memset(gdp, 0, EXT4_DESC_SIZE(sb)); ext4_block_bitmap_set(sb, gdp, group_data->block_bitmap); ext4_inode_bitmap_set(sb, gdp, group_data->inode_bitmap); + err = ext4_set_bitmap_checksums(sb, group, gdp, group_data); + if (err) { + ext4_std_error(sb, err); + break; + } + ext4_inode_table_set(sb, gdp, group_data->inode_table); ext4_free_group_clusters_set(sb, gdp, - EXT4_B2C(sbi, group_data->free_blocks_count)); + EXT4_NUM_B2C(sbi, group_data->free_blocks_count)); ext4_free_inodes_set(sb, gdp, EXT4_INODES_PER_GROUP(sb)); + if (ext4_has_group_desc_csum(sb)) + ext4_itable_unused_set(sb, gdp, + EXT4_INODES_PER_GROUP(sb)); gdp->bg_flags = cpu_to_le16(*bg_flags); - gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp); + ext4_group_desc_csum_set(sb, group, gdp); err = ext4_handle_dirty_metadata(handle, NULL, gdb_bh); if (unlikely(err)) { @@ -1158,14 +1330,18 @@ static void ext4_update_super(struct super_block *sb, } reserved_blocks = ext4_r_blocks_count(es) * 100; - do_div(reserved_blocks, ext4_blocks_count(es)); + reserved_blocks = div64_u64(reserved_blocks, ext4_blocks_count(es)); reserved_blocks *= blocks_count; do_div(reserved_blocks, 100); ext4_blocks_count_set(es, ext4_blocks_count(es) + blocks_count); + ext4_free_blocks_count_set(es, ext4_free_blocks_count(es) + free_blocks); le32_add_cpu(&es->s_inodes_count, EXT4_INODES_PER_GROUP(sb) * flex_gd->count); + le32_add_cpu(&es->s_free_inodes_count, EXT4_INODES_PER_GROUP(sb) * + flex_gd->count); + ext4_debug("free blocks count %llu", ext4_free_blocks_count(es)); /* * We need to protect s_groups_count against other CPUs seeing * inconsistent state in the superblock. @@ -1188,6 +1364,8 @@ static void ext4_update_super(struct super_block *sb, /* Update the global fs size fields */ sbi->s_groups_count += flex_gd->count; + sbi->s_blockfile_groups = min_t(ext4_group_t, sbi->s_groups_count, + (EXT4_MAX_BLOCK_FILE_PHYS / EXT4_BLOCKS_PER_GROUP(sb))); /* Update the reserved block counts only once the new group is * active. */ @@ -1196,21 +1374,28 @@ static void ext4_update_super(struct super_block *sb, /* Update the free space counts */ percpu_counter_add(&sbi->s_freeclusters_counter, - EXT4_B2C(sbi, free_blocks)); + EXT4_NUM_B2C(sbi, free_blocks)); percpu_counter_add(&sbi->s_freeinodes_counter, EXT4_INODES_PER_GROUP(sb) * flex_gd->count); + ext4_debug("free blocks count %llu", + percpu_counter_read(&sbi->s_freeclusters_counter)); if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG) && sbi->s_log_groups_per_flex) { ext4_group_t flex_group; flex_group = ext4_flex_group(sbi, group_data[0].group); - atomic_add(EXT4_B2C(sbi, free_blocks), - &sbi->s_flex_groups[flex_group].free_clusters); + atomic64_add(EXT4_NUM_B2C(sbi, free_blocks), + &sbi->s_flex_groups[flex_group].free_clusters); atomic_add(EXT4_INODES_PER_GROUP(sb) * flex_gd->count, &sbi->s_flex_groups[flex_group].free_inodes); } + /* + * Update the fs overhead information + */ + ext4_calculate_overhead(sb); + if (test_opt(sb, DEBUG)) printk(KERN_DEBUG "EXT4-fs: added group %u:" "%llu blocks(%llu free %llu reserved)\n", flex_gd->count, @@ -1252,12 +1437,13 @@ static int ext4_flex_group_add(struct super_block *sb, * modify each of the reserved GDT dindirect blocks. */ credit = flex_gd->count * 4 + reserved_gdb; - handle = ext4_journal_start_sb(sb, credit); + handle = ext4_journal_start_sb(sb, EXT4_HT_RESIZE, credit); if (IS_ERR(handle)) { err = PTR_ERR(handle); goto exit; } + BUFFER_TRACE(sbi->s_sbh, "get_write_access"); err = ext4_journal_get_write_access(handle, sbi->s_sbh); if (err) goto exit_journal; @@ -1283,16 +1469,24 @@ exit_journal: err = err2; if (!err) { - int i; + int gdb_num = group / EXT4_DESC_PER_BLOCK(sb); + int gdb_num_end = ((group + flex_gd->count - 1) / + EXT4_DESC_PER_BLOCK(sb)); + int meta_bg = EXT4_HAS_INCOMPAT_FEATURE(sb, + EXT4_FEATURE_INCOMPAT_META_BG); + sector_t old_gdb = 0; + update_backups(sb, sbi->s_sbh->b_blocknr, (char *)es, - sizeof(struct ext4_super_block)); - for (i = 0; i < flex_gd->count; i++, group++) { + sizeof(struct ext4_super_block), 0); + for (; gdb_num <= gdb_num_end; gdb_num++) { struct buffer_head *gdb_bh; - int gdb_num; - gdb_num = group / EXT4_BLOCKS_PER_GROUP(sb); + gdb_bh = sbi->s_group_desc[gdb_num]; + if (old_gdb == gdb_bh->b_blocknr) + continue; update_backups(sb, gdb_bh->b_blocknr, gdb_bh->b_data, - gdb_bh->b_size); + gdb_bh->b_size, meta_bg); + old_gdb = gdb_bh->b_blocknr; } } exit: @@ -1336,21 +1530,18 @@ static int ext4_setup_next_flex_gd(struct super_block *sb, group_data[i].group = group + i; group_data[i].blocks_count = blocks_per_group; - overhead = ext4_bg_has_super(sb, group + i) ? - (1 + ext4_bg_num_gdb(sb, group + i) + - le16_to_cpu(es->s_reserved_gdt_blocks)) : 0; + overhead = ext4_group_overhead_blocks(sb, group + i); group_data[i].free_blocks_count = blocks_per_group - overhead; - if (EXT4_HAS_RO_COMPAT_FEATURE(sb, - EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) + if (ext4_has_group_desc_csum(sb)) { flex_gd->bg_flags[i] = EXT4_BG_BLOCK_UNINIT | EXT4_BG_INODE_UNINIT; - else + if (!test_opt(sb, INIT_INODE_TABLE)) + flex_gd->bg_flags[i] |= EXT4_BG_INODE_ZEROED; + } else flex_gd->bg_flags[i] = EXT4_BG_INODE_ZEROED; } - if (last_group == n_group && - EXT4_HAS_RO_COMPAT_FEATURE(sb, - EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) + if (last_group == n_group && ext4_has_group_desc_csum(sb)) /* We need to initialize block bitmap of last group. */ flex_gd->bg_flags[i - 1] &= ~EXT4_BG_BLOCK_UNINIT; @@ -1384,11 +1575,10 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) int reserved_gdb = ext4_bg_has_super(sb, input->group) ? le16_to_cpu(es->s_reserved_gdt_blocks) : 0; struct inode *inode = NULL; - int gdb_off, gdb_num; + int gdb_off; int err; __u16 bg_flags = 0; - gdb_num = input->group / EXT4_DESC_PER_BLOCK(sb); gdb_off = input->group % EXT4_DESC_PER_BLOCK(sb); if (gdb_off == 0 && !EXT4_HAS_RO_COMPAT_FEATURE(sb, @@ -1429,6 +1619,14 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) if (err) goto out; + err = ext4_alloc_flex_bg_array(sb, input->group + 1); + if (err) + goto out; + + err = ext4_mb_alloc_groupinfo(sb, input->group + 1); + if (err) + goto out; + flex_gd.count = 1; flex_gd.groups = input; flex_gd.bg_flags = &bg_flags; @@ -1451,13 +1649,14 @@ static int ext4_group_extend_no_check(struct super_block *sb, /* We will update the superblock, one block bitmap, and * one group descriptor via ext4_group_add_blocks(). */ - handle = ext4_journal_start_sb(sb, 3); + handle = ext4_journal_start_sb(sb, EXT4_HT_RESIZE, 3); if (IS_ERR(handle)) { err = PTR_ERR(handle); ext4_warning(sb, "error %d on journal start", err); return err; } + BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get_write_access"); err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh); if (err) { ext4_warning(sb, "error %d on journal write access", err); @@ -1465,6 +1664,7 @@ static int ext4_group_extend_no_check(struct super_block *sb, } ext4_blocks_count_set(es, o_blocks_count + add); + ext4_free_blocks_count_set(es, ext4_free_blocks_count(es) + add); ext4_debug("freeing blocks %llu through %llu\n", o_blocks_count, o_blocks_count + add); /* We add the blocks to the bitmap and set the group need init bit */ @@ -1483,8 +1683,8 @@ errout: if (test_opt(sb, DEBUG)) printk(KERN_DEBUG "EXT4-fs: extended group to %llu " "blocks\n", ext4_blocks_count(es)); - update_backups(sb, EXT4_SB(sb)->s_sbh->b_blocknr, (char *)es, - sizeof(struct ext4_super_block)); + update_backups(sb, EXT4_SB(sb)->s_sbh->b_blocknr, + (char *)es, sizeof(struct ext4_super_block), 0); } return err; } @@ -1512,16 +1712,17 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, o_blocks_count = ext4_blocks_count(es); if (test_opt(sb, DEBUG)) - printk(KERN_DEBUG "EXT4-fs: extending last group from %llu to %llu blocks\n", - o_blocks_count, n_blocks_count); + ext4_msg(sb, KERN_DEBUG, + "extending last group from %llu to %llu blocks", + o_blocks_count, n_blocks_count); if (n_blocks_count == 0 || n_blocks_count == o_blocks_count) return 0; if (n_blocks_count > (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) { - printk(KERN_ERR "EXT4-fs: filesystem on %s:" - " too large to resize to %llu blocks safely\n", - sb->s_id, n_blocks_count); + ext4_msg(sb, KERN_ERR, + "filesystem too large to resize to %llu blocks safely", + n_blocks_count); if (sizeof(sector_t) < 8) ext4_warning(sb, "CONFIG_LBDAF not enabled"); return -EINVAL; @@ -1566,6 +1767,95 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, return err; } /* ext4_group_extend */ + +static int num_desc_blocks(struct super_block *sb, ext4_group_t groups) +{ + return (groups + EXT4_DESC_PER_BLOCK(sb) - 1) / EXT4_DESC_PER_BLOCK(sb); +} + +/* + * Release the resize inode and drop the resize_inode feature if there + * are no more reserved gdt blocks, and then convert the file system + * to enable meta_bg + */ +static int ext4_convert_meta_bg(struct super_block *sb, struct inode *inode) +{ + handle_t *handle; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_super_block *es = sbi->s_es; + struct ext4_inode_info *ei = EXT4_I(inode); + ext4_fsblk_t nr; + int i, ret, err = 0; + int credits = 1; + + ext4_msg(sb, KERN_INFO, "Converting file system to meta_bg"); + if (inode) { + if (es->s_reserved_gdt_blocks) { + ext4_error(sb, "Unexpected non-zero " + "s_reserved_gdt_blocks"); + return -EPERM; + } + + /* Do a quick sanity check of the resize inode */ + if (inode->i_blocks != 1 << (inode->i_blkbits - 9)) + goto invalid_resize_inode; + for (i = 0; i < EXT4_N_BLOCKS; i++) { + if (i == EXT4_DIND_BLOCK) { + if (ei->i_data[i]) + continue; + else + goto invalid_resize_inode; + } + if (ei->i_data[i]) + goto invalid_resize_inode; + } + credits += 3; /* block bitmap, bg descriptor, resize inode */ + } + + handle = ext4_journal_start_sb(sb, EXT4_HT_RESIZE, credits); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + BUFFER_TRACE(sbi->s_sbh, "get_write_access"); + err = ext4_journal_get_write_access(handle, sbi->s_sbh); + if (err) + goto errout; + + EXT4_CLEAR_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_RESIZE_INODE); + EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_META_BG); + sbi->s_es->s_first_meta_bg = + cpu_to_le32(num_desc_blocks(sb, sbi->s_groups_count)); + + err = ext4_handle_dirty_super(handle, sb); + if (err) { + ext4_std_error(sb, err); + goto errout; + } + + if (inode) { + nr = le32_to_cpu(ei->i_data[EXT4_DIND_BLOCK]); + ext4_free_blocks(handle, inode, NULL, nr, 1, + EXT4_FREE_BLOCKS_METADATA | + EXT4_FREE_BLOCKS_FORGET); + ei->i_data[EXT4_DIND_BLOCK] = 0; + inode->i_blocks = 0; + + err = ext4_mark_inode_dirty(handle, inode); + if (err) + ext4_std_error(sb, err); + } + +errout: + ret = ext4_journal_stop(handle); + if (!err) + err = ret; + return ret; + +invalid_resize_inode: + ext4_error(sb, "corrupted/inconsistent resize inode"); + return -EINVAL; +} + /* * ext4_resize_fs() resizes a fs to new size specified by @n_blocks_count * @@ -1578,21 +1868,31 @@ int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count) struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = sbi->s_es; struct buffer_head *bh; - struct inode *resize_inode; - ext4_fsblk_t o_blocks_count; - ext4_group_t o_group; - ext4_group_t n_group; - ext4_grpblk_t offset; + struct inode *resize_inode = NULL; + ext4_grpblk_t add, offset; unsigned long n_desc_blocks; unsigned long o_desc_blocks; - unsigned long desc_blocks; - int err = 0, flexbg_size = 1; + ext4_group_t o_group; + ext4_group_t n_group; + ext4_fsblk_t o_blocks_count; + ext4_fsblk_t n_blocks_count_retry = 0; + unsigned long last_update_time = 0; + int err = 0, flexbg_size = 1 << sbi->s_log_groups_per_flex; + int meta_bg; + /* See if the device is actually as big as what was requested */ + bh = sb_bread(sb, n_blocks_count - 1); + if (!bh) { + ext4_warning(sb, "can't read last block, resize aborted"); + return -ENOSPC; + } + brelse(bh); + +retry: o_blocks_count = ext4_blocks_count(es); - if (test_opt(sb, DEBUG)) - printk(KERN_DEBUG "EXT4-fs: resizing filesystem from %llu " - "upto %llu blocks\n", o_blocks_count, n_blocks_count); + ext4_msg(sb, KERN_INFO, "resizing filesystem from %llu " + "to %llu blocks", o_blocks_count, n_blocks_count); if (n_blocks_count < o_blocks_count) { /* On-line shrinking not supported */ @@ -1604,51 +1904,77 @@ int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count) /* Nothing need to do */ return 0; - ext4_get_group_no_and_offset(sb, n_blocks_count - 1, &n_group, &offset); - ext4_get_group_no_and_offset(sb, o_blocks_count, &o_group, &offset); + n_group = ext4_get_group_number(sb, n_blocks_count - 1); + if (n_group > (0xFFFFFFFFUL / EXT4_INODES_PER_GROUP(sb))) { + ext4_warning(sb, "resize would cause inodes_count overflow"); + return -EINVAL; + } + ext4_get_group_no_and_offset(sb, o_blocks_count - 1, &o_group, &offset); - n_desc_blocks = (n_group + EXT4_DESC_PER_BLOCK(sb)) / - EXT4_DESC_PER_BLOCK(sb); - o_desc_blocks = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) / - EXT4_DESC_PER_BLOCK(sb); - desc_blocks = n_desc_blocks - o_desc_blocks; + n_desc_blocks = num_desc_blocks(sb, n_group + 1); + o_desc_blocks = num_desc_blocks(sb, sbi->s_groups_count); - if (desc_blocks && - (!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_RESIZE_INODE) || - le16_to_cpu(es->s_reserved_gdt_blocks) < desc_blocks)) { - ext4_warning(sb, "No reserved GDT blocks, can't resize"); - return -EPERM; - } + meta_bg = EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_META_BG); - resize_inode = ext4_iget(sb, EXT4_RESIZE_INO); - if (IS_ERR(resize_inode)) { - ext4_warning(sb, "Error opening resize inode"); - return PTR_ERR(resize_inode); + if (EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_RESIZE_INODE)) { + if (meta_bg) { + ext4_error(sb, "resize_inode and meta_bg enabled " + "simultaneously"); + return -EINVAL; + } + if (n_desc_blocks > o_desc_blocks + + le16_to_cpu(es->s_reserved_gdt_blocks)) { + n_blocks_count_retry = n_blocks_count; + n_desc_blocks = o_desc_blocks + + le16_to_cpu(es->s_reserved_gdt_blocks); + n_group = n_desc_blocks * EXT4_DESC_PER_BLOCK(sb); + n_blocks_count = n_group * EXT4_BLOCKS_PER_GROUP(sb); + n_group--; /* set to last group number */ + } + + if (!resize_inode) + resize_inode = ext4_iget(sb, EXT4_RESIZE_INO); + if (IS_ERR(resize_inode)) { + ext4_warning(sb, "Error opening resize inode"); + return PTR_ERR(resize_inode); + } } - /* See if the device is actually as big as what was requested */ - bh = sb_bread(sb, n_blocks_count - 1); - if (!bh) { - ext4_warning(sb, "can't read last block, resize aborted"); - return -ENOSPC; + if ((!resize_inode && !meta_bg) || n_blocks_count == o_blocks_count) { + err = ext4_convert_meta_bg(sb, resize_inode); + if (err) + goto out; + if (resize_inode) { + iput(resize_inode); + resize_inode = NULL; + } + if (n_blocks_count_retry) { + n_blocks_count = n_blocks_count_retry; + n_blocks_count_retry = 0; + goto retry; + } } - brelse(bh); - if (offset != 0) { - /* extend the last group */ - ext4_grpblk_t add; - add = EXT4_BLOCKS_PER_GROUP(sb) - offset; + /* extend the last group */ + if (n_group == o_group) + add = n_blocks_count - o_blocks_count; + else + add = EXT4_BLOCKS_PER_GROUP(sb) - (offset + 1); + if (add > 0) { err = ext4_group_extend_no_check(sb, o_blocks_count, add); if (err) goto out; } - if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG) && - es->s_log_groups_per_flex) - flexbg_size = 1 << es->s_log_groups_per_flex; + if (ext4_blocks_count(es) == n_blocks_count) + goto out; + + err = ext4_alloc_flex_bg_array(sb, n_group + 1); + if (err) + return err; - o_blocks_count = ext4_blocks_count(es); - if (o_blocks_count == n_blocks_count) + err = ext4_mb_alloc_groupinfo(sb, n_group + 1); + if (err) goto out; flex_gd = alloc_flex_gd(flexbg_size); @@ -1662,19 +1988,33 @@ int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count) */ while (ext4_setup_next_flex_gd(sb, flex_gd, n_blocks_count, flexbg_size)) { - ext4_alloc_group_tables(sb, flex_gd, flexbg_size); + if (jiffies - last_update_time > HZ * 10) { + if (last_update_time) + ext4_msg(sb, KERN_INFO, + "resized to %llu blocks", + ext4_blocks_count(es)); + last_update_time = jiffies; + } + if (ext4_alloc_group_tables(sb, flex_gd, flexbg_size) != 0) + break; err = ext4_flex_group_add(sb, resize_inode, flex_gd); if (unlikely(err)) break; } + if (!err && n_blocks_count_retry) { + n_blocks_count = n_blocks_count_retry; + n_blocks_count_retry = 0; + free_flex_gd(flex_gd); + flex_gd = NULL; + goto retry; + } + out: if (flex_gd) free_flex_gd(flex_gd); - - iput(resize_inode); - if (test_opt(sb, DEBUG)) - printk(KERN_DEBUG "EXT4-fs: resized filesystem from %llu " - "upto %llu blocks\n", o_blocks_count, n_blocks_count); + if (resize_inode != NULL) + iput(resize_inode); + ext4_msg(sb, KERN_INFO, "resized filesystem to %llu", n_blocks_count); return err; } diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 502c61fd739..6df7bc611db 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -45,7 +45,7 @@ #include <linux/freezer.h> #include "ext4.h" -#include "ext4_extents.h" +#include "ext4_extents.h" /* Needed for trace points definition */ #include "ext4_jbd2.h" #include "xattr.h" #include "acl.h" @@ -59,21 +59,21 @@ static struct kset *ext4_kset; static struct ext4_lazy_init *ext4_li_info; static struct mutex ext4_li_mtx; static struct ext4_features *ext4_feat; +static int ext4_mballoc_ready; static int ext4_load_journal(struct super_block *, struct ext4_super_block *, unsigned long journal_devnum); +static int ext4_show_options(struct seq_file *seq, struct dentry *root); static int ext4_commit_super(struct super_block *sb, int sync); static void ext4_mark_recovery_complete(struct super_block *sb, struct ext4_super_block *es); static void ext4_clear_journal_err(struct super_block *sb, struct ext4_super_block *es); static int ext4_sync_fs(struct super_block *sb, int wait); -static const char *ext4_decode_error(struct super_block *sb, int errno, - char nbuf[16]); +static int ext4_sync_fs_nojournal(struct super_block *sb, int wait); static int ext4_remount(struct super_block *sb, int *flags, char *data); static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf); static int ext4_unfreeze(struct super_block *sb); -static void ext4_write_super(struct super_block *sb); static int ext4_freeze(struct super_block *sb); static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data); @@ -83,6 +83,7 @@ static int ext4_feature_set_ok(struct super_block *sb, int readonly); static void ext4_destroy_lazyinit_thread(void); static void ext4_unregister_li_request(struct super_block *sb); static void ext4_clear_request_list(void); +static int ext4_reserve_clusters(struct ext4_sb_info *, ext4_fsblk_t); #if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23) static struct file_system_type ext2_fs_type = { @@ -92,6 +93,8 @@ static struct file_system_type ext2_fs_type = { .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; +MODULE_ALIAS_FS("ext2"); +MODULE_ALIAS("ext2"); #define IS_EXT2_SB(sb) ((sb)->s_bdev->bd_holder == &ext2_fs_type) #else #define IS_EXT2_SB(sb) (0) @@ -106,16 +109,61 @@ static struct file_system_type ext3_fs_type = { .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; +MODULE_ALIAS_FS("ext3"); +MODULE_ALIAS("ext3"); #define IS_EXT3_SB(sb) ((sb)->s_bdev->bd_holder == &ext3_fs_type) #else #define IS_EXT3_SB(sb) (0) #endif +static int ext4_verify_csum_type(struct super_block *sb, + struct ext4_super_block *es) +{ + if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + return 1; + + return es->s_checksum_type == EXT4_CRC32C_CHKSUM; +} + +static __le32 ext4_superblock_csum(struct super_block *sb, + struct ext4_super_block *es) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + int offset = offsetof(struct ext4_super_block, s_checksum); + __u32 csum; + + csum = ext4_chksum(sbi, ~0, (char *)es, offset); + + return cpu_to_le32(csum); +} + +static int ext4_superblock_csum_verify(struct super_block *sb, + struct ext4_super_block *es) +{ + if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + return 1; + + return es->s_checksum == ext4_superblock_csum(sb, es); +} + +void ext4_superblock_csum_set(struct super_block *sb) +{ + struct ext4_super_block *es = EXT4_SB(sb)->s_es; + + if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + return; + + es->s_checksum = ext4_superblock_csum(sb, es); +} + void *ext4_kvmalloc(size_t size, gfp_t flags) { void *ret; - ret = kmalloc(size, flags); + ret = kmalloc(size, flags | __GFP_NOWARN); if (!ret) ret = __vmalloc(size, flags, PAGE_KERNEL); return ret; @@ -125,7 +173,7 @@ void *ext4_kvzalloc(size_t size, gfp_t flags) { void *ret; - ret = kzalloc(size, flags); + ret = kzalloc(size, flags | __GFP_NOWARN); if (!ret) ret = __vmalloc(size, flags | __GFP_ZERO, PAGE_KERNEL); return ret; @@ -253,134 +301,6 @@ void ext4_itable_unused_set(struct super_block *sb, } -/* Just increment the non-pointer handle value */ -static handle_t *ext4_get_nojournal(void) -{ - handle_t *handle = current->journal_info; - unsigned long ref_cnt = (unsigned long)handle; - - BUG_ON(ref_cnt >= EXT4_NOJOURNAL_MAX_REF_COUNT); - - ref_cnt++; - handle = (handle_t *)ref_cnt; - - current->journal_info = handle; - return handle; -} - - -/* Decrement the non-pointer handle value */ -static void ext4_put_nojournal(handle_t *handle) -{ - unsigned long ref_cnt = (unsigned long)handle; - - BUG_ON(ref_cnt == 0); - - ref_cnt--; - handle = (handle_t *)ref_cnt; - - current->journal_info = handle; -} - -/* - * Wrappers for jbd2_journal_start/end. - * - * The only special thing we need to do here is to make sure that all - * journal_end calls result in the superblock being marked dirty, so - * that sync() will call the filesystem's write_super callback if - * appropriate. - * - * To avoid j_barrier hold in userspace when a user calls freeze(), - * ext4 prevents a new handle from being started by s_frozen, which - * is in an upper layer. - */ -handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks) -{ - journal_t *journal; - handle_t *handle; - - trace_ext4_journal_start(sb, nblocks, _RET_IP_); - if (sb->s_flags & MS_RDONLY) - return ERR_PTR(-EROFS); - - journal = EXT4_SB(sb)->s_journal; - handle = ext4_journal_current_handle(); - - /* - * If a handle has been started, it should be allowed to - * finish, otherwise deadlock could happen between freeze - * and others(e.g. truncate) due to the restart of the - * journal handle if the filesystem is forzen and active - * handles are not stopped. - */ - if (!handle) - vfs_check_frozen(sb, SB_FREEZE_TRANS); - - if (!journal) - return ext4_get_nojournal(); - /* - * Special case here: if the journal has aborted behind our - * backs (eg. EIO in the commit thread), then we still need to - * take the FS itself readonly cleanly. - */ - if (is_journal_aborted(journal)) { - ext4_abort(sb, "Detected aborted journal"); - return ERR_PTR(-EROFS); - } - return jbd2_journal_start(journal, nblocks); -} - -/* - * The only special thing we need to do here is to make sure that all - * jbd2_journal_stop calls result in the superblock being marked dirty, so - * that sync() will call the filesystem's write_super callback if - * appropriate. - */ -int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle) -{ - struct super_block *sb; - int err; - int rc; - - if (!ext4_handle_valid(handle)) { - ext4_put_nojournal(handle); - return 0; - } - sb = handle->h_transaction->t_journal->j_private; - err = handle->h_err; - rc = jbd2_journal_stop(handle); - - if (!err) - err = rc; - if (err) - __ext4_std_error(sb, where, line, err); - return err; -} - -void ext4_journal_abort_handle(const char *caller, unsigned int line, - const char *err_fn, struct buffer_head *bh, - handle_t *handle, int err) -{ - char nbuf[16]; - const char *errstr = ext4_decode_error(NULL, err, nbuf); - - BUG_ON(!ext4_handle_valid(handle)); - - if (bh) - BUFFER_TRACE(bh, "abort"); - - if (!handle->h_err) - handle->h_err = err; - - if (is_handle_aborted(handle)) - return; - - printk(KERN_ERR "%s:%d: aborting transaction: %s in %s\n", - caller, line, errstr, err_fn); - - jbd2_journal_abort_handle(handle); -} - static void __save_error_info(struct super_block *sb, const char *func, unsigned int line) { @@ -405,7 +325,7 @@ static void __save_error_info(struct super_block *sb, const char *func, */ if (!es->s_error_count) mod_timer(&EXT4_SB(sb)->s_err_report, jiffies + 24*60*60*HZ); - es->s_error_count = cpu_to_le32(le32_to_cpu(es->s_error_count) + 1); + le32_add_cpu(&es->s_error_count, 1); } static void save_error_info(struct super_block *sb, const char *func, @@ -431,6 +351,25 @@ static int block_device_ejected(struct super_block *sb) return bdi->dev == NULL; } +static void ext4_journal_commit_callback(journal_t *journal, transaction_t *txn) +{ + struct super_block *sb = journal->j_private; + struct ext4_sb_info *sbi = EXT4_SB(sb); + int error = is_journal_aborted(journal); + struct ext4_journal_cb_entry *jce; + + BUG_ON(txn->t_state == T_FINISHED); + spin_lock(&sbi->s_md_lock); + while (!list_empty(&txn->t_private_list)) { + jce = list_entry(txn->t_private_list.next, + struct ext4_journal_cb_entry, jce_list); + list_del_init(&jce->jce_list); + spin_unlock(&sbi->s_md_lock); + jce->jce_func(sb, jce, error); + spin_lock(&sbi->s_md_lock); + } + spin_unlock(&sbi->s_md_lock); +} /* Deal with the reporting of failure conditions on a filesystem such as * inconsistencies detected or read IO failures. @@ -461,6 +400,11 @@ static void ext4_handle_error(struct super_block *sb) } if (test_opt(sb, ERRORS_RO)) { ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only"); + /* + * Make sure updated value of ->s_mount_flags will be visible + * before ->s_flags update + */ + smp_wmb(); sb->s_flags |= MS_RDONLY; } if (test_opt(sb, ERRORS_PANIC)) @@ -468,25 +412,32 @@ static void ext4_handle_error(struct super_block *sb) sb->s_id); } +#define ext4_error_ratelimit(sb) \ + ___ratelimit(&(EXT4_SB(sb)->s_err_ratelimit_state), \ + "EXT4-fs error") + void __ext4_error(struct super_block *sb, const char *function, unsigned int line, const char *fmt, ...) { struct va_format vaf; va_list args; - va_start(args, fmt); - vaf.fmt = fmt; - vaf.va = &args; - printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: comm %s: %pV\n", - sb->s_id, function, line, current->comm, &vaf); - va_end(args); - + if (ext4_error_ratelimit(sb)) { + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + printk(KERN_CRIT + "EXT4-fs error (device %s): %s:%d: comm %s: %pV\n", + sb->s_id, function, line, current->comm, &vaf); + va_end(args); + } + save_error_info(sb, function, line); ext4_handle_error(sb); } -void ext4_error_inode(struct inode *inode, const char *function, - unsigned int line, ext4_fsblk_t block, - const char *fmt, ...) +void __ext4_error_inode(struct inode *inode, const char *function, + unsigned int line, ext4_fsblk_t block, + const char *fmt, ...) { va_list args; struct va_format vaf; @@ -494,52 +445,65 @@ void ext4_error_inode(struct inode *inode, const char *function, es->s_last_error_ino = cpu_to_le32(inode->i_ino); es->s_last_error_block = cpu_to_le64(block); + if (ext4_error_ratelimit(inode->i_sb)) { + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + if (block) + printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: " + "inode #%lu: block %llu: comm %s: %pV\n", + inode->i_sb->s_id, function, line, inode->i_ino, + block, current->comm, &vaf); + else + printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: " + "inode #%lu: comm %s: %pV\n", + inode->i_sb->s_id, function, line, inode->i_ino, + current->comm, &vaf); + va_end(args); + } save_error_info(inode->i_sb, function, line); - va_start(args, fmt); - vaf.fmt = fmt; - vaf.va = &args; - printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: inode #%lu: ", - inode->i_sb->s_id, function, line, inode->i_ino); - if (block) - printk(KERN_CONT "block %llu: ", block); - printk(KERN_CONT "comm %s: %pV\n", current->comm, &vaf); - va_end(args); - ext4_handle_error(inode->i_sb); } -void ext4_error_file(struct file *file, const char *function, - unsigned int line, ext4_fsblk_t block, - const char *fmt, ...) +void __ext4_error_file(struct file *file, const char *function, + unsigned int line, ext4_fsblk_t block, + const char *fmt, ...) { va_list args; struct va_format vaf; struct ext4_super_block *es; - struct inode *inode = file->f_dentry->d_inode; + struct inode *inode = file_inode(file); char pathname[80], *path; es = EXT4_SB(inode->i_sb)->s_es; es->s_last_error_ino = cpu_to_le32(inode->i_ino); + if (ext4_error_ratelimit(inode->i_sb)) { + path = d_path(&(file->f_path), pathname, sizeof(pathname)); + if (IS_ERR(path)) + path = "(unknown)"; + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + if (block) + printk(KERN_CRIT + "EXT4-fs error (device %s): %s:%d: inode #%lu: " + "block %llu: comm %s: path %s: %pV\n", + inode->i_sb->s_id, function, line, inode->i_ino, + block, current->comm, path, &vaf); + else + printk(KERN_CRIT + "EXT4-fs error (device %s): %s:%d: inode #%lu: " + "comm %s: path %s: %pV\n", + inode->i_sb->s_id, function, line, inode->i_ino, + current->comm, path, &vaf); + va_end(args); + } save_error_info(inode->i_sb, function, line); - path = d_path(&(file->f_path), pathname, sizeof(pathname)); - if (IS_ERR(path)) - path = "(unknown)"; - printk(KERN_CRIT - "EXT4-fs error (device %s): %s:%d: inode #%lu: ", - inode->i_sb->s_id, function, line, inode->i_ino); - if (block) - printk(KERN_CONT "block %llu: ", block); - va_start(args, fmt); - vaf.fmt = fmt; - vaf.va = &args; - printk(KERN_CONT "comm %s: path %s: %pV\n", current->comm, path, &vaf); - va_end(args); - ext4_handle_error(inode->i_sb); } -static const char *ext4_decode_error(struct super_block *sb, int errno, - char nbuf[16]) +const char *ext4_decode_error(struct super_block *sb, int errno, + char nbuf[16]) { char *errstr = NULL; @@ -588,11 +552,13 @@ void __ext4_std_error(struct super_block *sb, const char *function, (sb->s_flags & MS_RDONLY)) return; - errstr = ext4_decode_error(sb, errno, nbuf); - printk(KERN_CRIT "EXT4-fs error (device %s) in %s:%d: %s\n", - sb->s_id, function, line, errstr); - save_error_info(sb, function, line); + if (ext4_error_ratelimit(sb)) { + errstr = ext4_decode_error(sb, errno, nbuf); + printk(KERN_CRIT "EXT4-fs error (device %s) in %s:%d: %s\n", + sb->s_id, function, line, errstr); + } + save_error_info(sb, function, line); ext4_handle_error(sb); } @@ -621,8 +587,13 @@ void __ext4_abort(struct super_block *sb, const char *function, if ((sb->s_flags & MS_RDONLY) == 0) { ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only"); - sb->s_flags |= MS_RDONLY; EXT4_SB(sb)->s_mount_flags |= EXT4_MF_FS_ABORTED; + /* + * Make sure updated value of ->s_mount_flags will be visible + * before ->s_flags update + */ + smp_wmb(); + sb->s_flags |= MS_RDONLY; if (EXT4_SB(sb)->s_journal) jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO); save_error_info(sb, function, line); @@ -631,11 +602,15 @@ void __ext4_abort(struct super_block *sb, const char *function, panic("EXT4-fs panic from previous error\n"); } -void ext4_msg(struct super_block *sb, const char *prefix, const char *fmt, ...) +void __ext4_msg(struct super_block *sb, + const char *prefix, const char *fmt, ...) { struct va_format vaf; va_list args; + if (!___ratelimit(&(EXT4_SB(sb)->s_msg_ratelimit_state), "EXT4-fs")) + return; + va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; @@ -649,6 +624,10 @@ void __ext4_warning(struct super_block *sb, const char *function, struct va_format vaf; va_list args; + if (!___ratelimit(&(EXT4_SB(sb)->s_warning_ratelimit_state), + "EXT4-fs warning")) + return; + va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; @@ -672,18 +651,20 @@ __acquires(bitlock) es->s_last_error_block = cpu_to_le64(block); __save_error_info(sb, function, line); - va_start(args, fmt); - - vaf.fmt = fmt; - vaf.va = &args; - printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: group %u, ", - sb->s_id, function, line, grp); - if (ino) - printk(KERN_CONT "inode %lu: ", ino); - if (block) - printk(KERN_CONT "block %llu:", (unsigned long long) block); - printk(KERN_CONT "%pV\n", &vaf); - va_end(args); + if (ext4_error_ratelimit(sb)) { + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: group %u, ", + sb->s_id, function, line, grp); + if (ino) + printk(KERN_CONT "inode %lu: ", ino); + if (block) + printk(KERN_CONT "block %llu:", + (unsigned long long) block); + printk(KERN_CONT "%pV\n", &vaf); + va_end(args); + } if (test_opt(sb, ERRORS_CONT)) { ext4_commit_super(sb, 0); @@ -754,22 +735,19 @@ fail: /* * Release the journal device */ -static int ext4_blkdev_put(struct block_device *bdev) +static void ext4_blkdev_put(struct block_device *bdev) { - return blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); + blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); } -static int ext4_blkdev_remove(struct ext4_sb_info *sbi) +static void ext4_blkdev_remove(struct ext4_sb_info *sbi) { struct block_device *bdev; - int ret = -ENODEV; - bdev = sbi->journal_bdev; if (bdev) { - ret = ext4_blkdev_put(bdev); + ext4_blkdev_put(bdev); sbi->journal_bdev = NULL; } - return ret; } static inline struct inode *orphan_list_entry(struct list_head *l) @@ -804,12 +782,8 @@ static void ext4_put_super(struct super_block *sb) ext4_unregister_li_request(sb); dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED); - flush_workqueue(sbi->dio_unwritten_wq); - destroy_workqueue(sbi->dio_unwritten_wq); - - lock_super(sb); - if (sb->s_dirt) - ext4_commit_super(sb, 1); + flush_workqueue(sbi->rsv_conversion_wq); + destroy_workqueue(sbi->rsv_conversion_wq); if (sbi->s_journal) { err = jbd2_journal_destroy(sbi->s_journal); @@ -818,7 +792,8 @@ static void ext4_put_super(struct super_block *sb) ext4_abort(sb, "Couldn't clean up the journal"); } - del_timer(&sbi->s_err_report); + ext4_es_unregister_shrinker(sbi); + del_timer_sync(&sbi->s_err_report); ext4_release_system_zone(sb); ext4_mb_release(sb); ext4_ext_release(sb); @@ -827,9 +802,12 @@ static void ext4_put_super(struct super_block *sb) if (!(sb->s_flags & MS_RDONLY)) { EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); es->s_state = cpu_to_le16(sbi->s_mount_state); - ext4_commit_super(sb, 1); } + if (!(sb->s_flags & MS_RDONLY)) + ext4_commit_super(sb, 1); + if (sbi->s_proc) { + remove_proc_entry("options", sbi->s_proc); remove_proc_entry(sb->s_id, ext4_proc_root); } kobject_del(&sbi->s_kobj); @@ -842,6 +820,7 @@ static void ext4_put_super(struct super_block *sb) percpu_counter_destroy(&sbi->s_freeinodes_counter); percpu_counter_destroy(&sbi->s_dirs_counter); percpu_counter_destroy(&sbi->s_dirtyclusters_counter); + percpu_counter_destroy(&sbi->s_extent_cache_cnt); brelse(sbi->s_sbh); #ifdef CONFIG_QUOTA for (i = 0; i < MAXQUOTAS; i++) @@ -867,6 +846,10 @@ static void ext4_put_super(struct super_block *sb) invalidate_bdev(sbi->journal_bdev); ext4_blkdev_remove(sbi); } + if (sbi->s_mb_cache) { + ext4_xattr_destroy_cache(sbi->s_mb_cache); + sbi->s_mb_cache = NULL; + } if (sbi->s_mmp_tsk) kthread_stop(sbi->s_mmp_tsk); sb->s_fs_info = NULL; @@ -874,9 +857,10 @@ static void ext4_put_super(struct super_block *sb) * Now that we are completely done shutting down the * superblock, we need to actually destroy the kobject. */ - unlock_super(sb); kobject_put(&sbi->s_kobj); wait_for_completion(&sbi->s_kobj_unregister); + if (sbi->s_chksum_driver) + crypto_free_shash(sbi->s_chksum_driver); kfree(sbi->s_blockgroup_lock); kfree(sbi); } @@ -895,26 +879,31 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) return NULL; ei->vfs_inode.i_version = 1; - ei->vfs_inode.i_data.writeback_index = 0; - memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache)); + spin_lock_init(&ei->i_raw_lock); INIT_LIST_HEAD(&ei->i_prealloc_list); spin_lock_init(&ei->i_prealloc_lock); + ext4_es_init_tree(&ei->i_es_tree); + rwlock_init(&ei->i_es_lock); + INIT_LIST_HEAD(&ei->i_es_lru); + ei->i_es_lru_nr = 0; + ei->i_touch_when = 0; ei->i_reserved_data_blocks = 0; ei->i_reserved_meta_blocks = 0; ei->i_allocated_meta_blocks = 0; ei->i_da_metadata_calc_len = 0; + ei->i_da_metadata_calc_last_lblock = 0; spin_lock_init(&(ei->i_block_reservation_lock)); #ifdef CONFIG_QUOTA ei->i_reserved_quota = 0; #endif ei->jinode = NULL; - INIT_LIST_HEAD(&ei->i_completed_io_list); + INIT_LIST_HEAD(&ei->i_rsv_conversion_list); spin_lock_init(&ei->i_completed_io_lock); - ei->cur_aio_dio = NULL; ei->i_sync_tid = 0; ei->i_datasync_tid = 0; atomic_set(&ei->i_ioend_count, 0); - atomic_set(&ei->i_aiodio_unwritten, 0); + atomic_set(&ei->i_unwritten, 0); + INIT_WORK(&ei->i_rsv_conversion_work, ext4_end_io_rsv_work); return &ei->vfs_inode; } @@ -952,14 +941,12 @@ static void init_once(void *foo) struct ext4_inode_info *ei = (struct ext4_inode_info *) foo; INIT_LIST_HEAD(&ei->i_orphan); -#ifdef CONFIG_EXT4_FS_XATTR init_rwsem(&ei->xattr_sem); -#endif init_rwsem(&ei->i_data_sem); inode_init_once(&ei->vfs_inode); } -static int init_inodecache(void) +static int __init init_inodecache(void) { ext4_inode_cachep = kmem_cache_create("ext4_inode_cache", sizeof(struct ext4_inode_info), @@ -973,15 +960,22 @@ static int init_inodecache(void) static void destroy_inodecache(void) { + /* + * Make sure all delayed rcu free inodes are flushed before we + * destroy cache. + */ + rcu_barrier(); kmem_cache_destroy(ext4_inode_cachep); } void ext4_clear_inode(struct inode *inode) { invalidate_inode_buffers(inode); - end_writeback(inode); + clear_inode(inode); dquot_drop(inode); ext4_discard_preallocations(inode); + ext4_es_remove_extent(inode, 0, EXT_MAX_BLOCKS); + ext4_es_lru_del(inode); if (EXT4_I(inode)->jinode) { jbd2_journal_release_jbd_inode(EXT4_JOURNAL(inode), EXT4_I(inode)->jinode); @@ -990,180 +984,6 @@ void ext4_clear_inode(struct inode *inode) } } -static inline void ext4_show_quota_options(struct seq_file *seq, - struct super_block *sb) -{ -#if defined(CONFIG_QUOTA) - struct ext4_sb_info *sbi = EXT4_SB(sb); - - if (sbi->s_jquota_fmt) { - char *fmtname = ""; - - switch (sbi->s_jquota_fmt) { - case QFMT_VFS_OLD: - fmtname = "vfsold"; - break; - case QFMT_VFS_V0: - fmtname = "vfsv0"; - break; - case QFMT_VFS_V1: - fmtname = "vfsv1"; - break; - } - seq_printf(seq, ",jqfmt=%s", fmtname); - } - - if (sbi->s_qf_names[USRQUOTA]) - seq_printf(seq, ",usrjquota=%s", sbi->s_qf_names[USRQUOTA]); - - if (sbi->s_qf_names[GRPQUOTA]) - seq_printf(seq, ",grpjquota=%s", sbi->s_qf_names[GRPQUOTA]); - - if (test_opt(sb, USRQUOTA)) - seq_puts(seq, ",usrquota"); - - if (test_opt(sb, GRPQUOTA)) - seq_puts(seq, ",grpquota"); -#endif -} - -/* - * Show an option if - * - it's set to a non-default value OR - * - if the per-sb default is different from the global default - */ -static int ext4_show_options(struct seq_file *seq, struct dentry *root) -{ - int def_errors; - unsigned long def_mount_opts; - struct super_block *sb = root->d_sb; - struct ext4_sb_info *sbi = EXT4_SB(sb); - struct ext4_super_block *es = sbi->s_es; - - def_mount_opts = le32_to_cpu(es->s_default_mount_opts); - def_errors = le16_to_cpu(es->s_errors); - - if (sbi->s_sb_block != 1) - seq_printf(seq, ",sb=%llu", sbi->s_sb_block); - if (test_opt(sb, MINIX_DF)) - seq_puts(seq, ",minixdf"); - if (test_opt(sb, GRPID) && !(def_mount_opts & EXT4_DEFM_BSDGROUPS)) - seq_puts(seq, ",grpid"); - if (!test_opt(sb, GRPID) && (def_mount_opts & EXT4_DEFM_BSDGROUPS)) - seq_puts(seq, ",nogrpid"); - if (sbi->s_resuid != EXT4_DEF_RESUID || - le16_to_cpu(es->s_def_resuid) != EXT4_DEF_RESUID) { - seq_printf(seq, ",resuid=%u", sbi->s_resuid); - } - if (sbi->s_resgid != EXT4_DEF_RESGID || - le16_to_cpu(es->s_def_resgid) != EXT4_DEF_RESGID) { - seq_printf(seq, ",resgid=%u", sbi->s_resgid); - } - if (test_opt(sb, ERRORS_RO)) { - if (def_errors == EXT4_ERRORS_PANIC || - def_errors == EXT4_ERRORS_CONTINUE) { - seq_puts(seq, ",errors=remount-ro"); - } - } - if (test_opt(sb, ERRORS_CONT) && def_errors != EXT4_ERRORS_CONTINUE) - seq_puts(seq, ",errors=continue"); - if (test_opt(sb, ERRORS_PANIC) && def_errors != EXT4_ERRORS_PANIC) - seq_puts(seq, ",errors=panic"); - if (test_opt(sb, NO_UID32) && !(def_mount_opts & EXT4_DEFM_UID16)) - seq_puts(seq, ",nouid32"); - if (test_opt(sb, DEBUG) && !(def_mount_opts & EXT4_DEFM_DEBUG)) - seq_puts(seq, ",debug"); -#ifdef CONFIG_EXT4_FS_XATTR - if (test_opt(sb, XATTR_USER)) - seq_puts(seq, ",user_xattr"); - if (!test_opt(sb, XATTR_USER)) - seq_puts(seq, ",nouser_xattr"); -#endif -#ifdef CONFIG_EXT4_FS_POSIX_ACL - if (test_opt(sb, POSIX_ACL) && !(def_mount_opts & EXT4_DEFM_ACL)) - seq_puts(seq, ",acl"); - if (!test_opt(sb, POSIX_ACL) && (def_mount_opts & EXT4_DEFM_ACL)) - seq_puts(seq, ",noacl"); -#endif - if (sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ) { - seq_printf(seq, ",commit=%u", - (unsigned) (sbi->s_commit_interval / HZ)); - } - if (sbi->s_min_batch_time != EXT4_DEF_MIN_BATCH_TIME) { - seq_printf(seq, ",min_batch_time=%u", - (unsigned) sbi->s_min_batch_time); - } - if (sbi->s_max_batch_time != EXT4_DEF_MAX_BATCH_TIME) { - seq_printf(seq, ",max_batch_time=%u", - (unsigned) sbi->s_max_batch_time); - } - - /* - * We're changing the default of barrier mount option, so - * let's always display its mount state so it's clear what its - * status is. - */ - seq_puts(seq, ",barrier="); - seq_puts(seq, test_opt(sb, BARRIER) ? "1" : "0"); - if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) - seq_puts(seq, ",journal_async_commit"); - else if (test_opt(sb, JOURNAL_CHECKSUM)) - seq_puts(seq, ",journal_checksum"); - if (test_opt(sb, I_VERSION)) - seq_puts(seq, ",i_version"); - if (!test_opt(sb, DELALLOC) && - !(def_mount_opts & EXT4_DEFM_NODELALLOC)) - seq_puts(seq, ",nodelalloc"); - - if (!test_opt(sb, MBLK_IO_SUBMIT)) - seq_puts(seq, ",nomblk_io_submit"); - if (sbi->s_stripe) - seq_printf(seq, ",stripe=%lu", sbi->s_stripe); - /* - * journal mode get enabled in different ways - * So just print the value even if we didn't specify it - */ - if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) - seq_puts(seq, ",data=journal"); - else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA) - seq_puts(seq, ",data=ordered"); - else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA) - seq_puts(seq, ",data=writeback"); - - if (sbi->s_inode_readahead_blks != EXT4_DEF_INODE_READAHEAD_BLKS) - seq_printf(seq, ",inode_readahead_blks=%u", - sbi->s_inode_readahead_blks); - - if (test_opt(sb, DATA_ERR_ABORT)) - seq_puts(seq, ",data_err=abort"); - - if (test_opt(sb, NO_AUTO_DA_ALLOC)) - seq_puts(seq, ",noauto_da_alloc"); - - if (test_opt(sb, DISCARD) && !(def_mount_opts & EXT4_DEFM_DISCARD)) - seq_puts(seq, ",discard"); - - if (test_opt(sb, NOLOAD)) - seq_puts(seq, ",norecovery"); - - if (test_opt(sb, DIOREAD_NOLOCK)) - seq_puts(seq, ",dioread_nolock"); - - if (test_opt(sb, BLOCK_VALIDITY) && - !(def_mount_opts & EXT4_DEFM_BLOCK_VALIDITY)) - seq_puts(seq, ",block_validity"); - - if (!test_opt(sb, INIT_INODE_TABLE)) - seq_puts(seq, ",noinit_itable"); - else if (sbi->s_li_wait_mult != EXT4_DEF_LI_WAIT_MULT) - seq_printf(seq, ",init_itable=%u", - (unsigned) sbi->s_li_wait_mult); - - ext4_show_quota_options(seq, sb); - - return 0; -} - static struct inode *ext4_nfs_get_inode(struct super_block *sb, u64 ino, u32 generation) { @@ -1238,12 +1058,18 @@ static int ext4_mark_dquot_dirty(struct dquot *dquot); static int ext4_write_info(struct super_block *sb, int type); static int ext4_quota_on(struct super_block *sb, int type, int format_id, struct path *path); +static int ext4_quota_on_sysfile(struct super_block *sb, int type, + int format_id); static int ext4_quota_off(struct super_block *sb, int type); +static int ext4_quota_off_sysfile(struct super_block *sb, int type); static int ext4_quota_on_mount(struct super_block *sb, int type); static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data, size_t len, loff_t off); static ssize_t ext4_quota_write(struct super_block *sb, int type, const char *data, size_t len, loff_t off); +static int ext4_quota_enable(struct super_block *sb, int type, int format_id, + unsigned int flags); +static int ext4_enable_quotas(struct super_block *sb); static const struct dquot_operations ext4_quota_operations = { .get_reserved_space = ext4_get_reserved_space, @@ -1265,6 +1091,16 @@ static const struct quotactl_ops ext4_qctl_operations = { .get_dqblk = dquot_get_dqblk, .set_dqblk = dquot_set_dqblk }; + +static const struct quotactl_ops ext4_qctl_sysfile_operations = { + .quota_on_meta = ext4_quota_on_sysfile, + .quota_off = ext4_quota_off_sysfile, + .quota_sync = dquot_quota_sync, + .get_info = dquot_get_dqinfo, + .set_info = dquot_set_dqinfo, + .get_dqblk = dquot_get_dqblk, + .set_dqblk = dquot_set_dqblk +}; #endif static const struct super_operations ext4_sops = { @@ -1295,7 +1131,7 @@ static const struct super_operations ext4_nojournal_sops = { .dirty_inode = ext4_dirty_inode, .drop_inode = ext4_drop_inode, .evict_inode = ext4_evict_inode, - .write_super = ext4_write_super, + .sync_fs = ext4_sync_fs_nojournal, .put_super = ext4_put_super, .statfs = ext4_statfs, .remount_fs = ext4_remount, @@ -1316,23 +1152,23 @@ static const struct export_operations ext4_export_ops = { enum { Opt_bsd_df, Opt_minix_df, Opt_grpid, Opt_nogrpid, Opt_resgid, Opt_resuid, Opt_sb, Opt_err_cont, Opt_err_panic, Opt_err_ro, - Opt_nouid32, Opt_debug, Opt_oldalloc, Opt_orlov, + Opt_nouid32, Opt_debug, Opt_removed, Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl, - Opt_auto_da_alloc, Opt_noauto_da_alloc, Opt_noload, Opt_nobh, Opt_bh, - Opt_commit, Opt_min_batch_time, Opt_max_batch_time, - Opt_journal_update, Opt_journal_dev, - Opt_journal_checksum, Opt_journal_async_commit, + Opt_auto_da_alloc, Opt_noauto_da_alloc, Opt_noload, + Opt_commit, Opt_min_batch_time, Opt_max_batch_time, Opt_journal_dev, + Opt_journal_path, Opt_journal_checksum, Opt_journal_async_commit, Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, Opt_data_err_abort, Opt_data_err_ignore, Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota, - Opt_noquota, Opt_ignore, Opt_barrier, Opt_nobarrier, Opt_err, - Opt_resize, Opt_usrquota, Opt_grpquota, Opt_i_version, + Opt_noquota, Opt_barrier, Opt_nobarrier, Opt_err, + Opt_usrquota, Opt_grpquota, Opt_i_version, Opt_stripe, Opt_delalloc, Opt_nodelalloc, Opt_mblk_io_submit, Opt_nomblk_io_submit, Opt_block_validity, Opt_noblock_validity, Opt_inode_readahead_blks, Opt_journal_ioprio, Opt_dioread_nolock, Opt_dioread_lock, Opt_discard, Opt_nodiscard, Opt_init_itable, Opt_noinit_itable, + Opt_max_dir_size_kb, }; static const match_table_t tokens = { @@ -1350,21 +1186,21 @@ static const match_table_t tokens = { {Opt_err_ro, "errors=remount-ro"}, {Opt_nouid32, "nouid32"}, {Opt_debug, "debug"}, - {Opt_oldalloc, "oldalloc"}, - {Opt_orlov, "orlov"}, + {Opt_removed, "oldalloc"}, + {Opt_removed, "orlov"}, {Opt_user_xattr, "user_xattr"}, {Opt_nouser_xattr, "nouser_xattr"}, {Opt_acl, "acl"}, {Opt_noacl, "noacl"}, - {Opt_noload, "noload"}, {Opt_noload, "norecovery"}, - {Opt_nobh, "nobh"}, - {Opt_bh, "bh"}, + {Opt_noload, "noload"}, + {Opt_removed, "nobh"}, + {Opt_removed, "bh"}, {Opt_commit, "commit=%u"}, {Opt_min_batch_time, "min_batch_time=%u"}, {Opt_max_batch_time, "max_batch_time=%u"}, - {Opt_journal_update, "journal=update"}, {Opt_journal_dev, "journal_dev=%u"}, + {Opt_journal_path, "journal_path=%s"}, {Opt_journal_checksum, "journal_checksum"}, {Opt_journal_async_commit, "journal_async_commit"}, {Opt_abort, "abort"}, @@ -1389,11 +1225,10 @@ static const match_table_t tokens = { {Opt_nobarrier, "nobarrier"}, {Opt_i_version, "i_version"}, {Opt_stripe, "stripe=%u"}, - {Opt_resize, "resize"}, {Opt_delalloc, "delalloc"}, {Opt_nodelalloc, "nodelalloc"}, - {Opt_mblk_io_submit, "mblk_io_submit"}, - {Opt_nomblk_io_submit, "nomblk_io_submit"}, + {Opt_removed, "mblk_io_submit"}, + {Opt_removed, "nomblk_io_submit"}, {Opt_block_validity, "block_validity"}, {Opt_noblock_validity, "noblock_validity"}, {Opt_inode_readahead_blks, "inode_readahead_blks=%u"}, @@ -1408,6 +1243,12 @@ static const match_table_t tokens = { {Opt_init_itable, "init_itable=%u"}, {Opt_init_itable, "init_itable"}, {Opt_noinit_itable, "noinit_itable"}, + {Opt_max_dir_size_kb, "max_dir_size_kb=%u"}, + {Opt_removed, "check=none"}, /* mount option from ext2/3 */ + {Opt_removed, "nocheck"}, /* mount option from ext2/3 */ + {Opt_removed, "reservation"}, /* mount option from ext2/3 */ + {Opt_removed, "noreservation"}, /* mount option from ext2/3 */ + {Opt_removed, "journal=%u"}, /* mount option from ext2/3 */ {Opt_err, NULL}, }; @@ -1443,37 +1284,46 @@ static int set_qf_name(struct super_block *sb, int qtype, substring_t *args) { struct ext4_sb_info *sbi = EXT4_SB(sb); char *qname; + int ret = -1; if (sb_any_quota_loaded(sb) && !sbi->s_qf_names[qtype]) { ext4_msg(sb, KERN_ERR, "Cannot change journaled " "quota options when quota turned on"); - return 0; + return -1; + } + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA)) { + ext4_msg(sb, KERN_ERR, "Cannot set journaled quota options " + "when QUOTA feature is enabled"); + return -1; } qname = match_strdup(args); if (!qname) { ext4_msg(sb, KERN_ERR, "Not enough memory for storing quotafile name"); - return 0; + return -1; } - if (sbi->s_qf_names[qtype] && - strcmp(sbi->s_qf_names[qtype], qname)) { - ext4_msg(sb, KERN_ERR, - "%s quota file already specified", QTYPE2NAME(qtype)); - kfree(qname); - return 0; + if (sbi->s_qf_names[qtype]) { + if (strcmp(sbi->s_qf_names[qtype], qname) == 0) + ret = 1; + else + ext4_msg(sb, KERN_ERR, + "%s quota file already specified", + QTYPE2NAME(qtype)); + goto errout; } - sbi->s_qf_names[qtype] = qname; - if (strchr(sbi->s_qf_names[qtype], '/')) { + if (strchr(qname, '/')) { ext4_msg(sb, KERN_ERR, "quotafile must be on filesystem root"); - kfree(sbi->s_qf_names[qtype]); - sbi->s_qf_names[qtype] = NULL; - return 0; + goto errout; } + sbi->s_qf_names[qtype] = qname; set_opt(sb, QUOTA); return 1; +errout: + kfree(qname); + return ret; } static int clear_qf_name(struct super_block *sb, int qtype) @@ -1485,433 +1335,365 @@ static int clear_qf_name(struct super_block *sb, int qtype) sbi->s_qf_names[qtype]) { ext4_msg(sb, KERN_ERR, "Cannot change journaled quota options" " when quota turned on"); - return 0; + return -1; } - /* - * The space will be released later when all options are confirmed - * to be correct - */ + kfree(sbi->s_qf_names[qtype]); sbi->s_qf_names[qtype] = NULL; return 1; } #endif +#define MOPT_SET 0x0001 +#define MOPT_CLEAR 0x0002 +#define MOPT_NOSUPPORT 0x0004 +#define MOPT_EXPLICIT 0x0008 +#define MOPT_CLEAR_ERR 0x0010 +#define MOPT_GTE0 0x0020 +#ifdef CONFIG_QUOTA +#define MOPT_Q 0 +#define MOPT_QFMT 0x0040 +#else +#define MOPT_Q MOPT_NOSUPPORT +#define MOPT_QFMT MOPT_NOSUPPORT +#endif +#define MOPT_DATAJ 0x0080 +#define MOPT_NO_EXT2 0x0100 +#define MOPT_NO_EXT3 0x0200 +#define MOPT_EXT4_ONLY (MOPT_NO_EXT2 | MOPT_NO_EXT3) +#define MOPT_STRING 0x0400 + +static const struct mount_opts { + int token; + int mount_opt; + int flags; +} ext4_mount_opts[] = { + {Opt_minix_df, EXT4_MOUNT_MINIX_DF, MOPT_SET}, + {Opt_bsd_df, EXT4_MOUNT_MINIX_DF, MOPT_CLEAR}, + {Opt_grpid, EXT4_MOUNT_GRPID, MOPT_SET}, + {Opt_nogrpid, EXT4_MOUNT_GRPID, MOPT_CLEAR}, + {Opt_block_validity, EXT4_MOUNT_BLOCK_VALIDITY, MOPT_SET}, + {Opt_noblock_validity, EXT4_MOUNT_BLOCK_VALIDITY, MOPT_CLEAR}, + {Opt_dioread_nolock, EXT4_MOUNT_DIOREAD_NOLOCK, + MOPT_EXT4_ONLY | MOPT_SET}, + {Opt_dioread_lock, EXT4_MOUNT_DIOREAD_NOLOCK, + MOPT_EXT4_ONLY | MOPT_CLEAR}, + {Opt_discard, EXT4_MOUNT_DISCARD, MOPT_SET}, + {Opt_nodiscard, EXT4_MOUNT_DISCARD, MOPT_CLEAR}, + {Opt_delalloc, EXT4_MOUNT_DELALLOC, + MOPT_EXT4_ONLY | MOPT_SET | MOPT_EXPLICIT}, + {Opt_nodelalloc, EXT4_MOUNT_DELALLOC, + MOPT_EXT4_ONLY | MOPT_CLEAR}, + {Opt_journal_checksum, EXT4_MOUNT_JOURNAL_CHECKSUM, + MOPT_EXT4_ONLY | MOPT_SET}, + {Opt_journal_async_commit, (EXT4_MOUNT_JOURNAL_ASYNC_COMMIT | + EXT4_MOUNT_JOURNAL_CHECKSUM), + MOPT_EXT4_ONLY | MOPT_SET}, + {Opt_noload, EXT4_MOUNT_NOLOAD, MOPT_NO_EXT2 | MOPT_SET}, + {Opt_err_panic, EXT4_MOUNT_ERRORS_PANIC, MOPT_SET | MOPT_CLEAR_ERR}, + {Opt_err_ro, EXT4_MOUNT_ERRORS_RO, MOPT_SET | MOPT_CLEAR_ERR}, + {Opt_err_cont, EXT4_MOUNT_ERRORS_CONT, MOPT_SET | MOPT_CLEAR_ERR}, + {Opt_data_err_abort, EXT4_MOUNT_DATA_ERR_ABORT, + MOPT_NO_EXT2 | MOPT_SET}, + {Opt_data_err_ignore, EXT4_MOUNT_DATA_ERR_ABORT, + MOPT_NO_EXT2 | MOPT_CLEAR}, + {Opt_barrier, EXT4_MOUNT_BARRIER, MOPT_SET}, + {Opt_nobarrier, EXT4_MOUNT_BARRIER, MOPT_CLEAR}, + {Opt_noauto_da_alloc, EXT4_MOUNT_NO_AUTO_DA_ALLOC, MOPT_SET}, + {Opt_auto_da_alloc, EXT4_MOUNT_NO_AUTO_DA_ALLOC, MOPT_CLEAR}, + {Opt_noinit_itable, EXT4_MOUNT_INIT_INODE_TABLE, MOPT_CLEAR}, + {Opt_commit, 0, MOPT_GTE0}, + {Opt_max_batch_time, 0, MOPT_GTE0}, + {Opt_min_batch_time, 0, MOPT_GTE0}, + {Opt_inode_readahead_blks, 0, MOPT_GTE0}, + {Opt_init_itable, 0, MOPT_GTE0}, + {Opt_stripe, 0, MOPT_GTE0}, + {Opt_resuid, 0, MOPT_GTE0}, + {Opt_resgid, 0, MOPT_GTE0}, + {Opt_journal_dev, 0, MOPT_GTE0}, + {Opt_journal_path, 0, MOPT_STRING}, + {Opt_journal_ioprio, 0, MOPT_GTE0}, + {Opt_data_journal, EXT4_MOUNT_JOURNAL_DATA, MOPT_NO_EXT2 | MOPT_DATAJ}, + {Opt_data_ordered, EXT4_MOUNT_ORDERED_DATA, MOPT_NO_EXT2 | MOPT_DATAJ}, + {Opt_data_writeback, EXT4_MOUNT_WRITEBACK_DATA, + MOPT_NO_EXT2 | MOPT_DATAJ}, + {Opt_user_xattr, EXT4_MOUNT_XATTR_USER, MOPT_SET}, + {Opt_nouser_xattr, EXT4_MOUNT_XATTR_USER, MOPT_CLEAR}, +#ifdef CONFIG_EXT4_FS_POSIX_ACL + {Opt_acl, EXT4_MOUNT_POSIX_ACL, MOPT_SET}, + {Opt_noacl, EXT4_MOUNT_POSIX_ACL, MOPT_CLEAR}, +#else + {Opt_acl, 0, MOPT_NOSUPPORT}, + {Opt_noacl, 0, MOPT_NOSUPPORT}, +#endif + {Opt_nouid32, EXT4_MOUNT_NO_UID32, MOPT_SET}, + {Opt_debug, EXT4_MOUNT_DEBUG, MOPT_SET}, + {Opt_quota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA, MOPT_SET | MOPT_Q}, + {Opt_usrquota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA, + MOPT_SET | MOPT_Q}, + {Opt_grpquota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_GRPQUOTA, + MOPT_SET | MOPT_Q}, + {Opt_noquota, (EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA | + EXT4_MOUNT_GRPQUOTA), MOPT_CLEAR | MOPT_Q}, + {Opt_usrjquota, 0, MOPT_Q}, + {Opt_grpjquota, 0, MOPT_Q}, + {Opt_offusrjquota, 0, MOPT_Q}, + {Opt_offgrpjquota, 0, MOPT_Q}, + {Opt_jqfmt_vfsold, QFMT_VFS_OLD, MOPT_QFMT}, + {Opt_jqfmt_vfsv0, QFMT_VFS_V0, MOPT_QFMT}, + {Opt_jqfmt_vfsv1, QFMT_VFS_V1, MOPT_QFMT}, + {Opt_max_dir_size_kb, 0, MOPT_GTE0}, + {Opt_err, 0, 0} +}; + +static int handle_mount_opt(struct super_block *sb, char *opt, int token, + substring_t *args, unsigned long *journal_devnum, + unsigned int *journal_ioprio, int is_remount) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + const struct mount_opts *m; + kuid_t uid; + kgid_t gid; + int arg = 0; + +#ifdef CONFIG_QUOTA + if (token == Opt_usrjquota) + return set_qf_name(sb, USRQUOTA, &args[0]); + else if (token == Opt_grpjquota) + return set_qf_name(sb, GRPQUOTA, &args[0]); + else if (token == Opt_offusrjquota) + return clear_qf_name(sb, USRQUOTA); + else if (token == Opt_offgrpjquota) + return clear_qf_name(sb, GRPQUOTA); +#endif + switch (token) { + case Opt_noacl: + case Opt_nouser_xattr: + ext4_msg(sb, KERN_WARNING, deprecated_msg, opt, "3.5"); + break; + case Opt_sb: + return 1; /* handled by get_sb_block() */ + case Opt_removed: + ext4_msg(sb, KERN_WARNING, "Ignoring removed %s option", opt); + return 1; + case Opt_abort: + sbi->s_mount_flags |= EXT4_MF_FS_ABORTED; + return 1; + case Opt_i_version: + sb->s_flags |= MS_I_VERSION; + return 1; + } + + for (m = ext4_mount_opts; m->token != Opt_err; m++) + if (token == m->token) + break; + + if (m->token == Opt_err) { + ext4_msg(sb, KERN_ERR, "Unrecognized mount option \"%s\" " + "or missing value", opt); + return -1; + } + + if ((m->flags & MOPT_NO_EXT2) && IS_EXT2_SB(sb)) { + ext4_msg(sb, KERN_ERR, + "Mount option \"%s\" incompatible with ext2", opt); + return -1; + } + if ((m->flags & MOPT_NO_EXT3) && IS_EXT3_SB(sb)) { + ext4_msg(sb, KERN_ERR, + "Mount option \"%s\" incompatible with ext3", opt); + return -1; + } + + if (args->from && !(m->flags & MOPT_STRING) && match_int(args, &arg)) + return -1; + if (args->from && (m->flags & MOPT_GTE0) && (arg < 0)) + return -1; + if (m->flags & MOPT_EXPLICIT) + set_opt2(sb, EXPLICIT_DELALLOC); + if (m->flags & MOPT_CLEAR_ERR) + clear_opt(sb, ERRORS_MASK); + if (token == Opt_noquota && sb_any_quota_loaded(sb)) { + ext4_msg(sb, KERN_ERR, "Cannot change quota " + "options when quota turned on"); + return -1; + } + + if (m->flags & MOPT_NOSUPPORT) { + ext4_msg(sb, KERN_ERR, "%s option not supported", opt); + } else if (token == Opt_commit) { + if (arg == 0) + arg = JBD2_DEFAULT_MAX_COMMIT_AGE; + sbi->s_commit_interval = HZ * arg; + } else if (token == Opt_max_batch_time) { + sbi->s_max_batch_time = arg; + } else if (token == Opt_min_batch_time) { + sbi->s_min_batch_time = arg; + } else if (token == Opt_inode_readahead_blks) { + if (arg && (arg > (1 << 30) || !is_power_of_2(arg))) { + ext4_msg(sb, KERN_ERR, + "EXT4-fs: inode_readahead_blks must be " + "0 or a power of 2 smaller than 2^31"); + return -1; + } + sbi->s_inode_readahead_blks = arg; + } else if (token == Opt_init_itable) { + set_opt(sb, INIT_INODE_TABLE); + if (!args->from) + arg = EXT4_DEF_LI_WAIT_MULT; + sbi->s_li_wait_mult = arg; + } else if (token == Opt_max_dir_size_kb) { + sbi->s_max_dir_size_kb = arg; + } else if (token == Opt_stripe) { + sbi->s_stripe = arg; + } else if (token == Opt_resuid) { + uid = make_kuid(current_user_ns(), arg); + if (!uid_valid(uid)) { + ext4_msg(sb, KERN_ERR, "Invalid uid value %d", arg); + return -1; + } + sbi->s_resuid = uid; + } else if (token == Opt_resgid) { + gid = make_kgid(current_user_ns(), arg); + if (!gid_valid(gid)) { + ext4_msg(sb, KERN_ERR, "Invalid gid value %d", arg); + return -1; + } + sbi->s_resgid = gid; + } else if (token == Opt_journal_dev) { + if (is_remount) { + ext4_msg(sb, KERN_ERR, + "Cannot specify journal on remount"); + return -1; + } + *journal_devnum = arg; + } else if (token == Opt_journal_path) { + char *journal_path; + struct inode *journal_inode; + struct path path; + int error; + + if (is_remount) { + ext4_msg(sb, KERN_ERR, + "Cannot specify journal on remount"); + return -1; + } + journal_path = match_strdup(&args[0]); + if (!journal_path) { + ext4_msg(sb, KERN_ERR, "error: could not dup " + "journal device string"); + return -1; + } + + error = kern_path(journal_path, LOOKUP_FOLLOW, &path); + if (error) { + ext4_msg(sb, KERN_ERR, "error: could not find " + "journal device path: error %d", error); + kfree(journal_path); + return -1; + } + + journal_inode = path.dentry->d_inode; + if (!S_ISBLK(journal_inode->i_mode)) { + ext4_msg(sb, KERN_ERR, "error: journal path %s " + "is not a block device", journal_path); + path_put(&path); + kfree(journal_path); + return -1; + } + + *journal_devnum = new_encode_dev(journal_inode->i_rdev); + path_put(&path); + kfree(journal_path); + } else if (token == Opt_journal_ioprio) { + if (arg > 7) { + ext4_msg(sb, KERN_ERR, "Invalid journal IO priority" + " (must be 0-7)"); + return -1; + } + *journal_ioprio = + IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, arg); + } else if (m->flags & MOPT_DATAJ) { + if (is_remount) { + if (!sbi->s_journal) + ext4_msg(sb, KERN_WARNING, "Remounting file system with no journal so ignoring journalled data option"); + else if (test_opt(sb, DATA_FLAGS) != m->mount_opt) { + ext4_msg(sb, KERN_ERR, + "Cannot change data mode on remount"); + return -1; + } + } else { + clear_opt(sb, DATA_FLAGS); + sbi->s_mount_opt |= m->mount_opt; + } +#ifdef CONFIG_QUOTA + } else if (m->flags & MOPT_QFMT) { + if (sb_any_quota_loaded(sb) && + sbi->s_jquota_fmt != m->mount_opt) { + ext4_msg(sb, KERN_ERR, "Cannot change journaled " + "quota options when quota turned on"); + return -1; + } + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_QUOTA)) { + ext4_msg(sb, KERN_ERR, + "Cannot set journaled quota options " + "when QUOTA feature is enabled"); + return -1; + } + sbi->s_jquota_fmt = m->mount_opt; +#endif + } else { + if (!args->from) + arg = 1; + if (m->flags & MOPT_CLEAR) + arg = !arg; + else if (unlikely(!(m->flags & MOPT_SET))) { + ext4_msg(sb, KERN_WARNING, + "buggy handling of option %s", opt); + WARN_ON(1); + return -1; + } + if (arg != 0) + sbi->s_mount_opt |= m->mount_opt; + else + sbi->s_mount_opt &= ~m->mount_opt; + } + return 1; +} + static int parse_options(char *options, struct super_block *sb, unsigned long *journal_devnum, unsigned int *journal_ioprio, - ext4_fsblk_t *n_blocks_count, int is_remount) + int is_remount) { struct ext4_sb_info *sbi = EXT4_SB(sb); char *p; substring_t args[MAX_OPT_ARGS]; - int data_opt = 0; - int option; -#ifdef CONFIG_QUOTA - int qfmt; -#endif + int token; if (!options) return 1; while ((p = strsep(&options, ",")) != NULL) { - int token; if (!*p) continue; - /* * Initialize args struct so we know whether arg was * found; some options take optional arguments. */ args[0].to = args[0].from = NULL; token = match_token(p, tokens, args); - switch (token) { - case Opt_bsd_df: - ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38"); - clear_opt(sb, MINIX_DF); - break; - case Opt_minix_df: - ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38"); - set_opt(sb, MINIX_DF); - - break; - case Opt_grpid: - ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38"); - set_opt(sb, GRPID); - - break; - case Opt_nogrpid: - ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38"); - clear_opt(sb, GRPID); - - break; - case Opt_resuid: - if (match_int(&args[0], &option)) - return 0; - sbi->s_resuid = option; - break; - case Opt_resgid: - if (match_int(&args[0], &option)) - return 0; - sbi->s_resgid = option; - break; - case Opt_sb: - /* handled by get_sb_block() instead of here */ - /* *sb_block = match_int(&args[0]); */ - break; - case Opt_err_panic: - clear_opt(sb, ERRORS_CONT); - clear_opt(sb, ERRORS_RO); - set_opt(sb, ERRORS_PANIC); - break; - case Opt_err_ro: - clear_opt(sb, ERRORS_CONT); - clear_opt(sb, ERRORS_PANIC); - set_opt(sb, ERRORS_RO); - break; - case Opt_err_cont: - clear_opt(sb, ERRORS_RO); - clear_opt(sb, ERRORS_PANIC); - set_opt(sb, ERRORS_CONT); - break; - case Opt_nouid32: - set_opt(sb, NO_UID32); - break; - case Opt_debug: - set_opt(sb, DEBUG); - break; - case Opt_oldalloc: - ext4_msg(sb, KERN_WARNING, - "Ignoring deprecated oldalloc option"); - break; - case Opt_orlov: - ext4_msg(sb, KERN_WARNING, - "Ignoring deprecated orlov option"); - break; -#ifdef CONFIG_EXT4_FS_XATTR - case Opt_user_xattr: - set_opt(sb, XATTR_USER); - break; - case Opt_nouser_xattr: - clear_opt(sb, XATTR_USER); - break; -#else - case Opt_user_xattr: - case Opt_nouser_xattr: - ext4_msg(sb, KERN_ERR, "(no)user_xattr options not supported"); - break; -#endif -#ifdef CONFIG_EXT4_FS_POSIX_ACL - case Opt_acl: - set_opt(sb, POSIX_ACL); - break; - case Opt_noacl: - clear_opt(sb, POSIX_ACL); - break; -#else - case Opt_acl: - case Opt_noacl: - ext4_msg(sb, KERN_ERR, "(no)acl options not supported"); - break; -#endif - case Opt_journal_update: - /* @@@ FIXME */ - /* Eventually we will want to be able to create - a journal file here. For now, only allow the - user to specify an existing inode to be the - journal file. */ - if (is_remount) { - ext4_msg(sb, KERN_ERR, - "Cannot specify journal on remount"); - return 0; - } - set_opt(sb, UPDATE_JOURNAL); - break; - case Opt_journal_dev: - if (is_remount) { - ext4_msg(sb, KERN_ERR, - "Cannot specify journal on remount"); - return 0; - } - if (match_int(&args[0], &option)) - return 0; - *journal_devnum = option; - break; - case Opt_journal_checksum: - set_opt(sb, JOURNAL_CHECKSUM); - break; - case Opt_journal_async_commit: - set_opt(sb, JOURNAL_ASYNC_COMMIT); - set_opt(sb, JOURNAL_CHECKSUM); - break; - case Opt_noload: - set_opt(sb, NOLOAD); - break; - case Opt_commit: - if (match_int(&args[0], &option)) - return 0; - if (option < 0) - return 0; - if (option == 0) - option = JBD2_DEFAULT_MAX_COMMIT_AGE; - sbi->s_commit_interval = HZ * option; - break; - case Opt_max_batch_time: - if (match_int(&args[0], &option)) - return 0; - if (option < 0) - return 0; - if (option == 0) - option = EXT4_DEF_MAX_BATCH_TIME; - sbi->s_max_batch_time = option; - break; - case Opt_min_batch_time: - if (match_int(&args[0], &option)) - return 0; - if (option < 0) - return 0; - sbi->s_min_batch_time = option; - break; - case Opt_data_journal: - data_opt = EXT4_MOUNT_JOURNAL_DATA; - goto datacheck; - case Opt_data_ordered: - data_opt = EXT4_MOUNT_ORDERED_DATA; - goto datacheck; - case Opt_data_writeback: - data_opt = EXT4_MOUNT_WRITEBACK_DATA; - datacheck: - if (is_remount) { - if (!sbi->s_journal) - ext4_msg(sb, KERN_WARNING, "Remounting file system with no journal so ignoring journalled data option"); - else if (test_opt(sb, DATA_FLAGS) != data_opt) { - ext4_msg(sb, KERN_ERR, - "Cannot change data mode on remount"); - return 0; - } - } else { - clear_opt(sb, DATA_FLAGS); - sbi->s_mount_opt |= data_opt; - } - break; - case Opt_data_err_abort: - set_opt(sb, DATA_ERR_ABORT); - break; - case Opt_data_err_ignore: - clear_opt(sb, DATA_ERR_ABORT); - break; -#ifdef CONFIG_QUOTA - case Opt_usrjquota: - if (!set_qf_name(sb, USRQUOTA, &args[0])) - return 0; - break; - case Opt_grpjquota: - if (!set_qf_name(sb, GRPQUOTA, &args[0])) - return 0; - break; - case Opt_offusrjquota: - if (!clear_qf_name(sb, USRQUOTA)) - return 0; - break; - case Opt_offgrpjquota: - if (!clear_qf_name(sb, GRPQUOTA)) - return 0; - break; - - case Opt_jqfmt_vfsold: - qfmt = QFMT_VFS_OLD; - goto set_qf_format; - case Opt_jqfmt_vfsv0: - qfmt = QFMT_VFS_V0; - goto set_qf_format; - case Opt_jqfmt_vfsv1: - qfmt = QFMT_VFS_V1; -set_qf_format: - if (sb_any_quota_loaded(sb) && - sbi->s_jquota_fmt != qfmt) { - ext4_msg(sb, KERN_ERR, "Cannot change " - "journaled quota options when " - "quota turned on"); - return 0; - } - sbi->s_jquota_fmt = qfmt; - break; - case Opt_quota: - case Opt_usrquota: - set_opt(sb, QUOTA); - set_opt(sb, USRQUOTA); - break; - case Opt_grpquota: - set_opt(sb, QUOTA); - set_opt(sb, GRPQUOTA); - break; - case Opt_noquota: - if (sb_any_quota_loaded(sb)) { - ext4_msg(sb, KERN_ERR, "Cannot change quota " - "options when quota turned on"); - return 0; - } - clear_opt(sb, QUOTA); - clear_opt(sb, USRQUOTA); - clear_opt(sb, GRPQUOTA); - break; -#else - case Opt_quota: - case Opt_usrquota: - case Opt_grpquota: - ext4_msg(sb, KERN_ERR, - "quota options not supported"); - break; - case Opt_usrjquota: - case Opt_grpjquota: - case Opt_offusrjquota: - case Opt_offgrpjquota: - case Opt_jqfmt_vfsold: - case Opt_jqfmt_vfsv0: - case Opt_jqfmt_vfsv1: - ext4_msg(sb, KERN_ERR, - "journaled quota options not supported"); - break; - case Opt_noquota: - break; -#endif - case Opt_abort: - sbi->s_mount_flags |= EXT4_MF_FS_ABORTED; - break; - case Opt_nobarrier: - clear_opt(sb, BARRIER); - break; - case Opt_barrier: - if (args[0].from) { - if (match_int(&args[0], &option)) - return 0; - } else - option = 1; /* No argument, default to 1 */ - if (option) - set_opt(sb, BARRIER); - else - clear_opt(sb, BARRIER); - break; - case Opt_ignore: - break; - case Opt_resize: - if (!is_remount) { - ext4_msg(sb, KERN_ERR, - "resize option only available " - "for remount"); - return 0; - } - if (match_int(&args[0], &option) != 0) - return 0; - *n_blocks_count = option; - break; - case Opt_nobh: - ext4_msg(sb, KERN_WARNING, - "Ignoring deprecated nobh option"); - break; - case Opt_bh: - ext4_msg(sb, KERN_WARNING, - "Ignoring deprecated bh option"); - break; - case Opt_i_version: - set_opt(sb, I_VERSION); - sb->s_flags |= MS_I_VERSION; - break; - case Opt_nodelalloc: - clear_opt(sb, DELALLOC); - clear_opt2(sb, EXPLICIT_DELALLOC); - break; - case Opt_mblk_io_submit: - set_opt(sb, MBLK_IO_SUBMIT); - break; - case Opt_nomblk_io_submit: - clear_opt(sb, MBLK_IO_SUBMIT); - break; - case Opt_stripe: - if (match_int(&args[0], &option)) - return 0; - if (option < 0) - return 0; - sbi->s_stripe = option; - break; - case Opt_delalloc: - set_opt(sb, DELALLOC); - set_opt2(sb, EXPLICIT_DELALLOC); - break; - case Opt_block_validity: - set_opt(sb, BLOCK_VALIDITY); - break; - case Opt_noblock_validity: - clear_opt(sb, BLOCK_VALIDITY); - break; - case Opt_inode_readahead_blks: - if (match_int(&args[0], &option)) - return 0; - if (option < 0 || option > (1 << 30)) - return 0; - if (option && !is_power_of_2(option)) { - ext4_msg(sb, KERN_ERR, - "EXT4-fs: inode_readahead_blks" - " must be a power of 2"); - return 0; - } - sbi->s_inode_readahead_blks = option; - break; - case Opt_journal_ioprio: - if (match_int(&args[0], &option)) - return 0; - if (option < 0 || option > 7) - break; - *journal_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, - option); - break; - case Opt_noauto_da_alloc: - set_opt(sb, NO_AUTO_DA_ALLOC); - break; - case Opt_auto_da_alloc: - if (args[0].from) { - if (match_int(&args[0], &option)) - return 0; - } else - option = 1; /* No argument, default to 1 */ - if (option) - clear_opt(sb, NO_AUTO_DA_ALLOC); - else - set_opt(sb,NO_AUTO_DA_ALLOC); - break; - case Opt_discard: - set_opt(sb, DISCARD); - break; - case Opt_nodiscard: - clear_opt(sb, DISCARD); - break; - case Opt_dioread_nolock: - set_opt(sb, DIOREAD_NOLOCK); - break; - case Opt_dioread_lock: - clear_opt(sb, DIOREAD_NOLOCK); - break; - case Opt_init_itable: - set_opt(sb, INIT_INODE_TABLE); - if (args[0].from) { - if (match_int(&args[0], &option)) - return 0; - } else - option = EXT4_DEF_LI_WAIT_MULT; - if (option < 0) - return 0; - sbi->s_li_wait_mult = option; - break; - case Opt_noinit_itable: - clear_opt(sb, INIT_INODE_TABLE); - break; - default: - ext4_msg(sb, KERN_ERR, - "Unrecognized mount option \"%s\" " - "or missing value", p); + if (handle_mount_opt(sb, p, token, args, journal_devnum, + journal_ioprio, is_remount) < 0) return 0; - } } #ifdef CONFIG_QUOTA + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA) && + (test_opt(sb, USRQUOTA) || test_opt(sb, GRPQUOTA))) { + ext4_msg(sb, KERN_ERR, "Cannot set quota options when QUOTA " + "feature is enabled"); + return 0; + } if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) { if (test_opt(sb, USRQUOTA) && sbi->s_qf_names[USRQUOTA]) clear_opt(sb, USRQUOTA); @@ -1939,9 +1721,171 @@ set_qf_format: } } #endif + if (test_opt(sb, DIOREAD_NOLOCK)) { + int blocksize = + BLOCK_SIZE << le32_to_cpu(sbi->s_es->s_log_block_size); + + if (blocksize < PAGE_CACHE_SIZE) { + ext4_msg(sb, KERN_ERR, "can't mount with " + "dioread_nolock if block size != PAGE_SIZE"); + return 0; + } + } return 1; } +static inline void ext4_show_quota_options(struct seq_file *seq, + struct super_block *sb) +{ +#if defined(CONFIG_QUOTA) + struct ext4_sb_info *sbi = EXT4_SB(sb); + + if (sbi->s_jquota_fmt) { + char *fmtname = ""; + + switch (sbi->s_jquota_fmt) { + case QFMT_VFS_OLD: + fmtname = "vfsold"; + break; + case QFMT_VFS_V0: + fmtname = "vfsv0"; + break; + case QFMT_VFS_V1: + fmtname = "vfsv1"; + break; + } + seq_printf(seq, ",jqfmt=%s", fmtname); + } + + if (sbi->s_qf_names[USRQUOTA]) + seq_printf(seq, ",usrjquota=%s", sbi->s_qf_names[USRQUOTA]); + + if (sbi->s_qf_names[GRPQUOTA]) + seq_printf(seq, ",grpjquota=%s", sbi->s_qf_names[GRPQUOTA]); +#endif +} + +static const char *token2str(int token) +{ + const struct match_token *t; + + for (t = tokens; t->token != Opt_err; t++) + if (t->token == token && !strchr(t->pattern, '=')) + break; + return t->pattern; +} + +/* + * Show an option if + * - it's set to a non-default value OR + * - if the per-sb default is different from the global default + */ +static int _ext4_show_options(struct seq_file *seq, struct super_block *sb, + int nodefs) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_super_block *es = sbi->s_es; + int def_errors, def_mount_opt = nodefs ? 0 : sbi->s_def_mount_opt; + const struct mount_opts *m; + char sep = nodefs ? '\n' : ','; + +#define SEQ_OPTS_PUTS(str) seq_printf(seq, "%c" str, sep) +#define SEQ_OPTS_PRINT(str, arg) seq_printf(seq, "%c" str, sep, arg) + + if (sbi->s_sb_block != 1) + SEQ_OPTS_PRINT("sb=%llu", sbi->s_sb_block); + + for (m = ext4_mount_opts; m->token != Opt_err; m++) { + int want_set = m->flags & MOPT_SET; + if (((m->flags & (MOPT_SET|MOPT_CLEAR)) == 0) || + (m->flags & MOPT_CLEAR_ERR)) + continue; + if (!(m->mount_opt & (sbi->s_mount_opt ^ def_mount_opt))) + continue; /* skip if same as the default */ + if ((want_set && + (sbi->s_mount_opt & m->mount_opt) != m->mount_opt) || + (!want_set && (sbi->s_mount_opt & m->mount_opt))) + continue; /* select Opt_noFoo vs Opt_Foo */ + SEQ_OPTS_PRINT("%s", token2str(m->token)); + } + + if (nodefs || !uid_eq(sbi->s_resuid, make_kuid(&init_user_ns, EXT4_DEF_RESUID)) || + le16_to_cpu(es->s_def_resuid) != EXT4_DEF_RESUID) + SEQ_OPTS_PRINT("resuid=%u", + from_kuid_munged(&init_user_ns, sbi->s_resuid)); + if (nodefs || !gid_eq(sbi->s_resgid, make_kgid(&init_user_ns, EXT4_DEF_RESGID)) || + le16_to_cpu(es->s_def_resgid) != EXT4_DEF_RESGID) + SEQ_OPTS_PRINT("resgid=%u", + from_kgid_munged(&init_user_ns, sbi->s_resgid)); + def_errors = nodefs ? -1 : le16_to_cpu(es->s_errors); + if (test_opt(sb, ERRORS_RO) && def_errors != EXT4_ERRORS_RO) + SEQ_OPTS_PUTS("errors=remount-ro"); + if (test_opt(sb, ERRORS_CONT) && def_errors != EXT4_ERRORS_CONTINUE) + SEQ_OPTS_PUTS("errors=continue"); + if (test_opt(sb, ERRORS_PANIC) && def_errors != EXT4_ERRORS_PANIC) + SEQ_OPTS_PUTS("errors=panic"); + if (nodefs || sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ) + SEQ_OPTS_PRINT("commit=%lu", sbi->s_commit_interval / HZ); + if (nodefs || sbi->s_min_batch_time != EXT4_DEF_MIN_BATCH_TIME) + SEQ_OPTS_PRINT("min_batch_time=%u", sbi->s_min_batch_time); + if (nodefs || sbi->s_max_batch_time != EXT4_DEF_MAX_BATCH_TIME) + SEQ_OPTS_PRINT("max_batch_time=%u", sbi->s_max_batch_time); + if (sb->s_flags & MS_I_VERSION) + SEQ_OPTS_PUTS("i_version"); + if (nodefs || sbi->s_stripe) + SEQ_OPTS_PRINT("stripe=%lu", sbi->s_stripe); + if (EXT4_MOUNT_DATA_FLAGS & (sbi->s_mount_opt ^ def_mount_opt)) { + if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) + SEQ_OPTS_PUTS("data=journal"); + else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA) + SEQ_OPTS_PUTS("data=ordered"); + else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA) + SEQ_OPTS_PUTS("data=writeback"); + } + if (nodefs || + sbi->s_inode_readahead_blks != EXT4_DEF_INODE_READAHEAD_BLKS) + SEQ_OPTS_PRINT("inode_readahead_blks=%u", + sbi->s_inode_readahead_blks); + + if (nodefs || (test_opt(sb, INIT_INODE_TABLE) && + (sbi->s_li_wait_mult != EXT4_DEF_LI_WAIT_MULT))) + SEQ_OPTS_PRINT("init_itable=%u", sbi->s_li_wait_mult); + if (nodefs || sbi->s_max_dir_size_kb) + SEQ_OPTS_PRINT("max_dir_size_kb=%u", sbi->s_max_dir_size_kb); + + ext4_show_quota_options(seq, sb); + return 0; +} + +static int ext4_show_options(struct seq_file *seq, struct dentry *root) +{ + return _ext4_show_options(seq, root->d_sb, 0); +} + +static int options_seq_show(struct seq_file *seq, void *offset) +{ + struct super_block *sb = seq->private; + int rc; + + seq_puts(seq, (sb->s_flags & MS_RDONLY) ? "ro" : "rw"); + rc = _ext4_show_options(seq, sb, 1); + seq_puts(seq, "\n"); + return rc; +} + +static int options_open_fs(struct inode *inode, struct file *file) +{ + return single_open(file, options_seq_show, PDE_DATA(inode)); +} + +static const struct file_operations ext4_seq_options_fops = { + .owner = THIS_MODULE, + .open = options_open_fs, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es, int read_only) { @@ -1958,7 +1902,7 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es, if (!(sbi->s_mount_state & EXT4_VALID_FS)) ext4_msg(sb, KERN_WARNING, "warning: mounting unchecked fs, " "running e2fsck is recommended"); - else if ((sbi->s_mount_state & EXT4_ERROR_FS)) + else if (sbi->s_mount_state & EXT4_ERROR_FS) ext4_msg(sb, KERN_WARNING, "warning: mounting fs with errors, " "running e2fsck is recommended"); @@ -1999,34 +1943,54 @@ done: return res; } +int ext4_alloc_flex_bg_array(struct super_block *sb, ext4_group_t ngroup) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct flex_groups *new_groups; + int size; + + if (!sbi->s_log_groups_per_flex) + return 0; + + size = ext4_flex_group(sbi, ngroup - 1) + 1; + if (size <= sbi->s_flex_groups_allocated) + return 0; + + size = roundup_pow_of_two(size * sizeof(struct flex_groups)); + new_groups = ext4_kvzalloc(size, GFP_KERNEL); + if (!new_groups) { + ext4_msg(sb, KERN_ERR, "not enough memory for %d flex groups", + size / (int) sizeof(struct flex_groups)); + return -ENOMEM; + } + + if (sbi->s_flex_groups) { + memcpy(new_groups, sbi->s_flex_groups, + (sbi->s_flex_groups_allocated * + sizeof(struct flex_groups))); + ext4_kvfree(sbi->s_flex_groups); + } + sbi->s_flex_groups = new_groups; + sbi->s_flex_groups_allocated = size / sizeof(struct flex_groups); + return 0; +} + static int ext4_fill_flex_info(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_group_desc *gdp = NULL; - ext4_group_t flex_group_count; ext4_group_t flex_group; - unsigned int groups_per_flex = 0; - size_t size; - int i; + int i, err; sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex; if (sbi->s_log_groups_per_flex < 1 || sbi->s_log_groups_per_flex > 31) { sbi->s_log_groups_per_flex = 0; return 1; } - groups_per_flex = 1 << sbi->s_log_groups_per_flex; - - /* We allocate both existing and potentially added groups */ - flex_group_count = ((sbi->s_groups_count + groups_per_flex - 1) + - ((le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) + 1) << - EXT4_DESC_PER_BLOCK_BITS(sb))) / groups_per_flex; - size = flex_group_count * sizeof(struct flex_groups); - sbi->s_flex_groups = ext4_kvzalloc(size, GFP_KERNEL); - if (sbi->s_flex_groups == NULL) { - ext4_msg(sb, KERN_ERR, "not enough memory for %u flex groups", - flex_group_count); + + err = ext4_alloc_flex_bg_array(sb, sbi->s_groups_count); + if (err) goto failed; - } for (i = 0; i < sbi->s_groups_count; i++) { gdp = ext4_get_group_desc(sb, i, NULL); @@ -2034,8 +1998,8 @@ static int ext4_fill_flex_info(struct super_block *sb) flex_group = ext4_flex_group(sbi, i); atomic_add(ext4_free_inodes_count(sb, gdp), &sbi->s_flex_groups[flex_group].free_inodes); - atomic_add(ext4_free_group_clusters(sb, gdp), - &sbi->s_flex_groups[flex_group].free_clusters); + atomic64_add(ext4_free_group_clusters(sb, gdp), + &sbi->s_flex_groups[flex_group].free_clusters); atomic_add(ext4_used_dirs_count(sb, gdp), &sbi->s_flex_groups[flex_group].used_dirs); } @@ -2045,43 +2009,69 @@ failed: return 0; } -__le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 block_group, - struct ext4_group_desc *gdp) +static __le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 block_group, + struct ext4_group_desc *gdp) { + int offset; __u16 crc = 0; + __le32 le_group = cpu_to_le32(block_group); - if (sbi->s_es->s_feature_ro_compat & - cpu_to_le32(EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) { - int offset = offsetof(struct ext4_group_desc, bg_checksum); - __le32 le_group = cpu_to_le32(block_group); - - crc = crc16(~0, sbi->s_es->s_uuid, sizeof(sbi->s_es->s_uuid)); - crc = crc16(crc, (__u8 *)&le_group, sizeof(le_group)); - crc = crc16(crc, (__u8 *)gdp, offset); - offset += sizeof(gdp->bg_checksum); /* skip checksum */ - /* for checksum of struct ext4_group_desc do the rest...*/ - if ((sbi->s_es->s_feature_incompat & - cpu_to_le32(EXT4_FEATURE_INCOMPAT_64BIT)) && - offset < le16_to_cpu(sbi->s_es->s_desc_size)) - crc = crc16(crc, (__u8 *)gdp + offset, - le16_to_cpu(sbi->s_es->s_desc_size) - - offset); + if ((sbi->s_es->s_feature_ro_compat & + cpu_to_le32(EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))) { + /* Use new metadata_csum algorithm */ + __le16 save_csum; + __u32 csum32; + + save_csum = gdp->bg_checksum; + gdp->bg_checksum = 0; + csum32 = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&le_group, + sizeof(le_group)); + csum32 = ext4_chksum(sbi, csum32, (__u8 *)gdp, + sbi->s_desc_size); + gdp->bg_checksum = save_csum; + + crc = csum32 & 0xFFFF; + goto out; } + /* old crc16 code */ + offset = offsetof(struct ext4_group_desc, bg_checksum); + + crc = crc16(~0, sbi->s_es->s_uuid, sizeof(sbi->s_es->s_uuid)); + crc = crc16(crc, (__u8 *)&le_group, sizeof(le_group)); + crc = crc16(crc, (__u8 *)gdp, offset); + offset += sizeof(gdp->bg_checksum); /* skip checksum */ + /* for checksum of struct ext4_group_desc do the rest...*/ + if ((sbi->s_es->s_feature_incompat & + cpu_to_le32(EXT4_FEATURE_INCOMPAT_64BIT)) && + offset < le16_to_cpu(sbi->s_es->s_desc_size)) + crc = crc16(crc, (__u8 *)gdp + offset, + le16_to_cpu(sbi->s_es->s_desc_size) - + offset); + +out: return cpu_to_le16(crc); } -int ext4_group_desc_csum_verify(struct ext4_sb_info *sbi, __u32 block_group, +int ext4_group_desc_csum_verify(struct super_block *sb, __u32 block_group, struct ext4_group_desc *gdp) { - if ((sbi->s_es->s_feature_ro_compat & - cpu_to_le32(EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) && - (gdp->bg_checksum != ext4_group_desc_csum(sbi, block_group, gdp))) + if (ext4_has_group_desc_csum(sb) && + (gdp->bg_checksum != ext4_group_desc_csum(EXT4_SB(sb), + block_group, gdp))) return 0; return 1; } +void ext4_group_desc_csum_set(struct super_block *sb, __u32 block_group, + struct ext4_group_desc *gdp) +{ + if (!ext4_has_group_desc_csum(sb)) + return; + gdp->bg_checksum = ext4_group_desc_csum(EXT4_SB(sb), block_group, gdp); +} + /* Called at mount-time, super-block is locked */ static int ext4_check_descriptors(struct super_block *sb, ext4_group_t *first_not_zeroed) @@ -2136,7 +2126,7 @@ static int ext4_check_descriptors(struct super_block *sb, return 0; } ext4_lock_group(sb, i); - if (!ext4_group_desc_csum_verify(sbi, i, gdp)) { + if (!ext4_group_desc_csum_verify(sb, i, gdp)) { ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " "Checksum for group %u failed (%u!=%u)", i, le16_to_cpu(ext4_group_desc_csum(sbi, i, @@ -2203,10 +2193,12 @@ static void ext4_orphan_cleanup(struct super_block *sb, } if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) { - if (es->s_last_orphan) + /* don't clear list on RO mount w/ errors */ + if (es->s_last_orphan && !(s_flags & MS_RDONLY)) { jbd_debug(1, "Errors on filesystem, " "clearing orphan list.\n"); - es->s_last_orphan = 0; + es->s_last_orphan = 0; + } jbd_debug(1, "Skipping orphan recovery on fs with errors.\n"); return; } @@ -2242,17 +2234,22 @@ static void ext4_orphan_cleanup(struct super_block *sb, list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan); dquot_initialize(inode); if (inode->i_nlink) { - ext4_msg(sb, KERN_DEBUG, - "%s: truncating inode %lu to %lld bytes", - __func__, inode->i_ino, inode->i_size); + if (test_opt(sb, DEBUG)) + ext4_msg(sb, KERN_DEBUG, + "%s: truncating inode %lu to %lld bytes", + __func__, inode->i_ino, inode->i_size); jbd_debug(2, "truncating inode %lu to %lld bytes\n", inode->i_ino, inode->i_size); + mutex_lock(&inode->i_mutex); + truncate_inode_pages(inode->i_mapping, inode->i_size); ext4_truncate(inode); + mutex_unlock(&inode->i_mutex); nr_truncates++; } else { - ext4_msg(sb, KERN_DEBUG, - "%s: deleting unreferenced inode %lu", - __func__, inode->i_ino); + if (test_opt(sb, DEBUG)) + ext4_msg(sb, KERN_DEBUG, + "%s: deleting unreferenced inode %lu", + __func__, inode->i_ino); jbd_debug(2, "deleting unreferenced inode %lu\n", inode->i_ino); nr_orphans++; @@ -2406,6 +2403,16 @@ static ext4_fsblk_t descriptor_loc(struct super_block *sb, if (ext4_bg_has_super(sb, bg)) has_super = 1; + /* + * If we have a meta_bg fs with 1k blocks, group 0's GDT is at + * block 2, not 1. If s_first_data_block == 0 (bigalloc is enabled + * on modern mke2fs or blksize > 1k on older mke2fs) then we must + * compensate. + */ + if (sb->s_blocksize == 1024 && nr == 0 && + le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block) == 0) + has_super++; + return (has_super + ext4_group_first_block_no(sb, bg)); } @@ -2453,20 +2460,21 @@ struct ext4_attr { ssize_t (*show)(struct ext4_attr *, struct ext4_sb_info *, char *); ssize_t (*store)(struct ext4_attr *, struct ext4_sb_info *, const char *, size_t); - int offset; + union { + int offset; + int deprecated_val; + } u; }; -static int parse_strtoul(const char *buf, - unsigned long max, unsigned long *value) +static int parse_strtoull(const char *buf, + unsigned long long max, unsigned long long *value) { - char *endp; - - *value = simple_strtoul(skip_spaces(buf), &endp, 0); - endp = skip_spaces(endp); - if (*endp || *value > max) - return -EINVAL; + int ret; - return 0; + ret = kstrtoull(skip_spaces(buf), 0, value); + if (!ret && *value > max) + ret = -EINVAL; + return ret; } static ssize_t delayed_allocation_blocks_show(struct ext4_attr *a, @@ -2503,28 +2511,18 @@ static ssize_t lifetime_write_kbytes_show(struct ext4_attr *a, EXT4_SB(sb)->s_sectors_written_start) >> 1))); } -static ssize_t extent_cache_hits_show(struct ext4_attr *a, - struct ext4_sb_info *sbi, char *buf) -{ - return snprintf(buf, PAGE_SIZE, "%lu\n", sbi->extent_cache_hits); -} - -static ssize_t extent_cache_misses_show(struct ext4_attr *a, - struct ext4_sb_info *sbi, char *buf) -{ - return snprintf(buf, PAGE_SIZE, "%lu\n", sbi->extent_cache_misses); -} - static ssize_t inode_readahead_blks_store(struct ext4_attr *a, struct ext4_sb_info *sbi, const char *buf, size_t count) { unsigned long t; + int ret; - if (parse_strtoul(buf, 0x40000000, &t)) - return -EINVAL; + ret = kstrtoul(skip_spaces(buf), 0, &t); + if (ret) + return ret; - if (t && !is_power_of_2(t)) + if (t && (!is_power_of_2(t) || t > 0x40000000)) return -EINVAL; sbi->s_inode_readahead_blks = t; @@ -2534,7 +2532,7 @@ static ssize_t inode_readahead_blks_store(struct ext4_attr *a, static ssize_t sbi_ui_show(struct ext4_attr *a, struct ext4_sb_info *sbi, char *buf) { - unsigned int *ui = (unsigned int *) (((char *) sbi) + a->offset); + unsigned int *ui = (unsigned int *) (((char *) sbi) + a->u.offset); return snprintf(buf, PAGE_SIZE, "%u\n", *ui); } @@ -2543,21 +2541,69 @@ static ssize_t sbi_ui_store(struct ext4_attr *a, struct ext4_sb_info *sbi, const char *buf, size_t count) { - unsigned int *ui = (unsigned int *) (((char *) sbi) + a->offset); + unsigned int *ui = (unsigned int *) (((char *) sbi) + a->u.offset); unsigned long t; + int ret; - if (parse_strtoul(buf, 0xffffffff, &t)) - return -EINVAL; + ret = kstrtoul(skip_spaces(buf), 0, &t); + if (ret) + return ret; *ui = t; return count; } +static ssize_t reserved_clusters_show(struct ext4_attr *a, + struct ext4_sb_info *sbi, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%llu\n", + (unsigned long long) atomic64_read(&sbi->s_resv_clusters)); +} + +static ssize_t reserved_clusters_store(struct ext4_attr *a, + struct ext4_sb_info *sbi, + const char *buf, size_t count) +{ + unsigned long long val; + int ret; + + if (parse_strtoull(buf, -1ULL, &val)) + return -EINVAL; + ret = ext4_reserve_clusters(sbi, val); + + return ret ? ret : count; +} + +static ssize_t trigger_test_error(struct ext4_attr *a, + struct ext4_sb_info *sbi, + const char *buf, size_t count) +{ + int len = count; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (len && buf[len-1] == '\n') + len--; + + if (len) + ext4_error(sbi->s_sb, "%.*s", len, buf); + return count; +} + +static ssize_t sbi_deprecated_show(struct ext4_attr *a, + struct ext4_sb_info *sbi, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%d\n", a->u.deprecated_val); +} + #define EXT4_ATTR_OFFSET(_name,_mode,_show,_store,_elname) \ static struct ext4_attr ext4_attr_##_name = { \ .attr = {.name = __stringify(_name), .mode = _mode }, \ .show = _show, \ .store = _store, \ - .offset = offsetof(struct ext4_sb_info, _elname), \ + .u = { \ + .offset = offsetof(struct ext4_sb_info, _elname),\ + }, \ } #define EXT4_ATTR(name, mode, show, store) \ static struct ext4_attr ext4_attr_##name = __ATTR(name, mode, show, store) @@ -2568,12 +2614,19 @@ static struct ext4_attr ext4_attr_##name = __ATTR(name, mode, show, store) #define EXT4_RW_ATTR_SBI_UI(name, elname) \ EXT4_ATTR_OFFSET(name, 0644, sbi_ui_show, sbi_ui_store, elname) #define ATTR_LIST(name) &ext4_attr_##name.attr +#define EXT4_DEPRECATED_ATTR(_name, _val) \ +static struct ext4_attr ext4_attr_##_name = { \ + .attr = {.name = __stringify(_name), .mode = 0444 }, \ + .show = sbi_deprecated_show, \ + .u = { \ + .deprecated_val = _val, \ + }, \ +} EXT4_RO_ATTR(delayed_allocation_blocks); EXT4_RO_ATTR(session_write_kbytes); EXT4_RO_ATTR(lifetime_write_kbytes); -EXT4_RO_ATTR(extent_cache_hits); -EXT4_RO_ATTR(extent_cache_misses); +EXT4_RW_ATTR(reserved_clusters); EXT4_ATTR_OFFSET(inode_readahead_blks, 0644, sbi_ui_show, inode_readahead_blks_store, s_inode_readahead_blks); EXT4_RW_ATTR_SBI_UI(inode_goal, s_inode_goal); @@ -2583,14 +2636,21 @@ EXT4_RW_ATTR_SBI_UI(mb_min_to_scan, s_mb_min_to_scan); EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs); EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request); EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc); -EXT4_RW_ATTR_SBI_UI(max_writeback_mb_bump, s_max_writeback_mb_bump); +EXT4_DEPRECATED_ATTR(max_writeback_mb_bump, 128); +EXT4_RW_ATTR_SBI_UI(extent_max_zeroout_kb, s_extent_max_zeroout_kb); +EXT4_ATTR(trigger_fs_error, 0200, NULL, trigger_test_error); +EXT4_RW_ATTR_SBI_UI(err_ratelimit_interval_ms, s_err_ratelimit_state.interval); +EXT4_RW_ATTR_SBI_UI(err_ratelimit_burst, s_err_ratelimit_state.burst); +EXT4_RW_ATTR_SBI_UI(warning_ratelimit_interval_ms, s_warning_ratelimit_state.interval); +EXT4_RW_ATTR_SBI_UI(warning_ratelimit_burst, s_warning_ratelimit_state.burst); +EXT4_RW_ATTR_SBI_UI(msg_ratelimit_interval_ms, s_msg_ratelimit_state.interval); +EXT4_RW_ATTR_SBI_UI(msg_ratelimit_burst, s_msg_ratelimit_state.burst); static struct attribute *ext4_attrs[] = { ATTR_LIST(delayed_allocation_blocks), ATTR_LIST(session_write_kbytes), ATTR_LIST(lifetime_write_kbytes), - ATTR_LIST(extent_cache_hits), - ATTR_LIST(extent_cache_misses), + ATTR_LIST(reserved_clusters), ATTR_LIST(inode_readahead_blks), ATTR_LIST(inode_goal), ATTR_LIST(mb_stats), @@ -2600,16 +2660,26 @@ static struct attribute *ext4_attrs[] = { ATTR_LIST(mb_stream_req), ATTR_LIST(mb_group_prealloc), ATTR_LIST(max_writeback_mb_bump), + ATTR_LIST(extent_max_zeroout_kb), + ATTR_LIST(trigger_fs_error), + ATTR_LIST(err_ratelimit_interval_ms), + ATTR_LIST(err_ratelimit_burst), + ATTR_LIST(warning_ratelimit_interval_ms), + ATTR_LIST(warning_ratelimit_burst), + ATTR_LIST(msg_ratelimit_interval_ms), + ATTR_LIST(msg_ratelimit_burst), NULL, }; /* Features this copy of ext4 supports */ EXT4_INFO_ATTR(lazy_itable_init); EXT4_INFO_ATTR(batched_discard); +EXT4_INFO_ATTR(meta_bg_resize); static struct attribute *ext4_feat_attrs[] = { ATTR_LIST(lazy_itable_init), ATTR_LIST(batched_discard), + ATTR_LIST(meta_bg_resize), NULL, }; @@ -2710,6 +2780,16 @@ static int ext4_feature_set_ok(struct super_block *sb, int readonly) "extents feature\n"); return 0; } + +#ifndef CONFIG_QUOTA + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA) && + !readonly) { + ext4_msg(sb, KERN_ERR, + "Filesystem with quota feature cannot be mounted RDWR " + "without CONFIG_QUOTA"); + return 0; + } +#endif /* CONFIG_QUOTA */ return 1; } @@ -2727,10 +2807,11 @@ static void print_daily_error_info(unsigned long arg) es = sbi->s_es; if (es->s_error_count) - ext4_msg(sb, KERN_NOTICE, "error count: %u", + /* fsck newer than v1.41.13 is needed to clean this condition. */ + ext4_msg(sb, KERN_NOTICE, "error count since last fsck: %u", le32_to_cpu(es->s_error_count)); if (es->s_first_error_time) { - printk(KERN_NOTICE "EXT4-fs (%s): initial error at %u: %.*s:%d", + printk(KERN_NOTICE "EXT4-fs (%s): initial error at time %u: %.*s:%d", sb->s_id, le32_to_cpu(es->s_first_error_time), (int) sizeof(es->s_first_error_func), es->s_first_error_func, @@ -2744,7 +2825,7 @@ static void print_daily_error_info(unsigned long arg) printk("\n"); } if (es->s_last_error_time) { - printk(KERN_NOTICE "EXT4-fs (%s): last error at %u: %.*s:%d", + printk(KERN_NOTICE "EXT4-fs (%s): last error at time %u: %.*s:%d", sb->s_id, le32_to_cpu(es->s_last_error_time), (int) sizeof(es->s_last_error_func), es->s_last_error_func, @@ -2772,6 +2853,7 @@ static int ext4_run_li_request(struct ext4_li_request *elr) sb = elr->lr_super; ngroups = EXT4_SB(sb)->s_groups_count; + sb_start_write(sb); for (group = elr->lr_next_group; group < ngroups; group++) { gdp = ext4_get_group_desc(sb, group, NULL); if (!gdp) { @@ -2783,7 +2865,7 @@ static int ext4_run_li_request(struct ext4_li_request *elr) break; } - if (group == ngroups) + if (group >= ngroups) ret = 1; if (!ret) { @@ -2798,6 +2880,7 @@ static int ext4_run_li_request(struct ext4_li_request *elr) elr->lr_next_sched = jiffies + elr->lr_timeout; elr->lr_next_group = group + 1; } + sb_end_write(sb); return ret; } @@ -2945,7 +3028,7 @@ static int ext4_run_lazyinit_thread(void) ext4_clear_request_list(); kfree(ext4_li_info); ext4_li_info = NULL; - printk(KERN_CRIT "EXT4: error %d creating inode table " + printk(KERN_CRIT "EXT4-fs: error %d creating inode table " "initialization thread\n", err); return err; @@ -3000,7 +3083,6 @@ static struct ext4_li_request *ext4_li_request_new(struct super_block *sb, { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_li_request *elr; - unsigned long rnd; elr = kzalloc(sizeof(*elr), GFP_KERNEL); if (!elr) @@ -3015,40 +3097,39 @@ static struct ext4_li_request *ext4_li_request_new(struct super_block *sb, * spread the inode table initialization requests * better. */ - get_random_bytes(&rnd, sizeof(rnd)); - elr->lr_next_sched = jiffies + (unsigned long)rnd % - (EXT4_DEF_LI_MAX_START_DELAY * HZ); - + elr->lr_next_sched = jiffies + (prandom_u32() % + (EXT4_DEF_LI_MAX_START_DELAY * HZ)); return elr; } -static int ext4_register_li_request(struct super_block *sb, - ext4_group_t first_not_zeroed) +int ext4_register_li_request(struct super_block *sb, + ext4_group_t first_not_zeroed) { struct ext4_sb_info *sbi = EXT4_SB(sb); - struct ext4_li_request *elr; + struct ext4_li_request *elr = NULL; ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count; int ret = 0; + mutex_lock(&ext4_li_mtx); if (sbi->s_li_request != NULL) { /* * Reset timeout so it can be computed again, because * s_li_wait_mult might have changed. */ sbi->s_li_request->lr_timeout = 0; - return 0; + goto out; } if (first_not_zeroed == ngroups || (sb->s_flags & MS_RDONLY) || !test_opt(sb, INIT_INODE_TABLE)) - return 0; + goto out; elr = ext4_li_request_new(sb, first_not_zeroed); - if (!elr) - return -ENOMEM; - - mutex_lock(&ext4_li_mtx); + if (!elr) { + ret = -ENOMEM; + goto out; + } if (NULL == ext4_li_info) { ret = ext4_li_info_new(); @@ -3096,6 +3177,202 @@ static void ext4_destroy_lazyinit_thread(void) kthread_stop(ext4_lazyinit_task); } +static int set_journal_csum_feature_set(struct super_block *sb) +{ + int ret = 1; + int compat, incompat; + struct ext4_sb_info *sbi = EXT4_SB(sb); + + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) { + /* journal checksum v2 */ + compat = 0; + incompat = JBD2_FEATURE_INCOMPAT_CSUM_V2; + } else { + /* journal checksum v1 */ + compat = JBD2_FEATURE_COMPAT_CHECKSUM; + incompat = 0; + } + + if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) { + ret = jbd2_journal_set_features(sbi->s_journal, + compat, 0, + JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT | + incompat); + } else if (test_opt(sb, JOURNAL_CHECKSUM)) { + ret = jbd2_journal_set_features(sbi->s_journal, + compat, 0, + incompat); + jbd2_journal_clear_features(sbi->s_journal, 0, 0, + JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT); + } else { + jbd2_journal_clear_features(sbi->s_journal, + JBD2_FEATURE_COMPAT_CHECKSUM, 0, + JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT | + JBD2_FEATURE_INCOMPAT_CSUM_V2); + } + + return ret; +} + +/* + * Note: calculating the overhead so we can be compatible with + * historical BSD practice is quite difficult in the face of + * clusters/bigalloc. This is because multiple metadata blocks from + * different block group can end up in the same allocation cluster. + * Calculating the exact overhead in the face of clustered allocation + * requires either O(all block bitmaps) in memory or O(number of block + * groups**2) in time. We will still calculate the superblock for + * older file systems --- and if we come across with a bigalloc file + * system with zero in s_overhead_clusters the estimate will be close to + * correct especially for very large cluster sizes --- but for newer + * file systems, it's better to calculate this figure once at mkfs + * time, and store it in the superblock. If the superblock value is + * present (even for non-bigalloc file systems), we will use it. + */ +static int count_overhead(struct super_block *sb, ext4_group_t grp, + char *buf) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_group_desc *gdp; + ext4_fsblk_t first_block, last_block, b; + ext4_group_t i, ngroups = ext4_get_groups_count(sb); + int s, j, count = 0; + + if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_BIGALLOC)) + return (ext4_bg_has_super(sb, grp) + ext4_bg_num_gdb(sb, grp) + + sbi->s_itb_per_group + 2); + + first_block = le32_to_cpu(sbi->s_es->s_first_data_block) + + (grp * EXT4_BLOCKS_PER_GROUP(sb)); + last_block = first_block + EXT4_BLOCKS_PER_GROUP(sb) - 1; + for (i = 0; i < ngroups; i++) { + gdp = ext4_get_group_desc(sb, i, NULL); + b = ext4_block_bitmap(sb, gdp); + if (b >= first_block && b <= last_block) { + ext4_set_bit(EXT4_B2C(sbi, b - first_block), buf); + count++; + } + b = ext4_inode_bitmap(sb, gdp); + if (b >= first_block && b <= last_block) { + ext4_set_bit(EXT4_B2C(sbi, b - first_block), buf); + count++; + } + b = ext4_inode_table(sb, gdp); + if (b >= first_block && b + sbi->s_itb_per_group <= last_block) + for (j = 0; j < sbi->s_itb_per_group; j++, b++) { + int c = EXT4_B2C(sbi, b - first_block); + ext4_set_bit(c, buf); + count++; + } + if (i != grp) + continue; + s = 0; + if (ext4_bg_has_super(sb, grp)) { + ext4_set_bit(s++, buf); + count++; + } + for (j = ext4_bg_num_gdb(sb, grp); j > 0; j--) { + ext4_set_bit(EXT4_B2C(sbi, s++), buf); + count++; + } + } + if (!count) + return 0; + return EXT4_CLUSTERS_PER_GROUP(sb) - + ext4_count_free(buf, EXT4_CLUSTERS_PER_GROUP(sb) / 8); +} + +/* + * Compute the overhead and stash it in sbi->s_overhead + */ +int ext4_calculate_overhead(struct super_block *sb) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_super_block *es = sbi->s_es; + ext4_group_t i, ngroups = ext4_get_groups_count(sb); + ext4_fsblk_t overhead = 0; + char *buf = (char *) get_zeroed_page(GFP_KERNEL); + + if (!buf) + return -ENOMEM; + + /* + * Compute the overhead (FS structures). This is constant + * for a given filesystem unless the number of block groups + * changes so we cache the previous value until it does. + */ + + /* + * All of the blocks before first_data_block are overhead + */ + overhead = EXT4_B2C(sbi, le32_to_cpu(es->s_first_data_block)); + + /* + * Add the overhead found in each block group + */ + for (i = 0; i < ngroups; i++) { + int blks; + + blks = count_overhead(sb, i, buf); + overhead += blks; + if (blks) + memset(buf, 0, PAGE_SIZE); + cond_resched(); + } + /* Add the journal blocks as well */ + if (sbi->s_journal) + overhead += EXT4_NUM_B2C(sbi, sbi->s_journal->j_maxlen); + + sbi->s_overhead = overhead; + smp_wmb(); + free_page((unsigned long) buf); + return 0; +} + + +static ext4_fsblk_t ext4_calculate_resv_clusters(struct super_block *sb) +{ + ext4_fsblk_t resv_clusters; + + /* + * There's no need to reserve anything when we aren't using extents. + * The space estimates are exact, there are no unwritten extents, + * hole punching doesn't need new metadata... This is needed especially + * to keep ext2/3 backward compatibility. + */ + if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) + return 0; + /* + * By default we reserve 2% or 4096 clusters, whichever is smaller. + * This should cover the situations where we can not afford to run + * out of space like for example punch hole, or converting + * unwritten extents in delalloc path. In most cases such + * allocation would require 1, or 2 blocks, higher numbers are + * very rare. + */ + resv_clusters = ext4_blocks_count(EXT4_SB(sb)->s_es) >> + EXT4_SB(sb)->s_cluster_bits; + + do_div(resv_clusters, 50); + resv_clusters = min_t(ext4_fsblk_t, resv_clusters, 4096); + + return resv_clusters; +} + + +static int ext4_reserve_clusters(struct ext4_sb_info *sbi, ext4_fsblk_t count) +{ + ext4_fsblk_t clusters = ext4_blocks_count(sbi->s_es) >> + sbi->s_cluster_bits; + + if (count >= clusters) + return -EINVAL; + + atomic64_set(&sbi->s_resv_clusters, count); + return 0; +} + static int ext4_fill_super(struct super_block *sb, void *data, int silent) { char *orig_data = kstrdup(data, GFP_KERNEL); @@ -3117,7 +3394,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) unsigned int i; int needs_recovery, has_huge_files, has_bigalloc; __u64 blocks_count; - int err; + int err = 0; unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO; ext4_group_t first_not_zeroed; @@ -3132,9 +3409,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) goto out_free_orig; } sb->s_fs_info = sbi; - sbi->s_mount_opt = 0; - sbi->s_resuid = EXT4_DEF_RESUID; - sbi->s_resgid = EXT4_DEF_RESGID; + sbi->s_sb = sb; sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS; sbi->s_sb_block = sb_block; if (sb->s_bdev->bd_part) @@ -3145,6 +3420,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) for (cp = sb->s_id; (cp = strchr(cp, '/'));) *cp = '!'; + /* -EINVAL is default */ ret = -EINVAL; blocksize = sb_min_blocksize(sb, EXT4_MIN_BLOCK_SIZE); if (!blocksize) { @@ -3171,33 +3447,68 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) * Note: s_es must be initialized as soon as possible because * some ext4 macro-instructions depend on its value */ - es = (struct ext4_super_block *) (((char *)bh->b_data) + offset); + es = (struct ext4_super_block *) (bh->b_data + offset); sbi->s_es = es; sb->s_magic = le16_to_cpu(es->s_magic); if (sb->s_magic != EXT4_SUPER_MAGIC) goto cantfind_ext4; sbi->s_kbytes_written = le64_to_cpu(es->s_kbytes_written); + /* Warn if metadata_csum and gdt_csum are both set. */ + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM) && + EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) + ext4_warning(sb, KERN_INFO "metadata_csum and uninit_bg are " + "redundant flags; please run fsck."); + + /* Check for a known checksum algorithm */ + if (!ext4_verify_csum_type(sb, es)) { + ext4_msg(sb, KERN_ERR, "VFS: Found ext4 filesystem with " + "unknown checksum algorithm."); + silent = 1; + goto cantfind_ext4; + } + + /* Load the checksum driver */ + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) { + sbi->s_chksum_driver = crypto_alloc_shash("crc32c", 0, 0); + if (IS_ERR(sbi->s_chksum_driver)) { + ext4_msg(sb, KERN_ERR, "Cannot load crc32c driver."); + ret = PTR_ERR(sbi->s_chksum_driver); + sbi->s_chksum_driver = NULL; + goto failed_mount; + } + } + + /* Check superblock checksum */ + if (!ext4_superblock_csum_verify(sb, es)) { + ext4_msg(sb, KERN_ERR, "VFS: Found ext4 filesystem with " + "invalid superblock checksum. Run e2fsck?"); + silent = 1; + goto cantfind_ext4; + } + + /* Precompute checksum seed for all metadata */ + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + sbi->s_csum_seed = ext4_chksum(sbi, ~0, es->s_uuid, + sizeof(es->s_uuid)); + /* Set defaults before we parse the mount options */ def_mount_opts = le32_to_cpu(es->s_default_mount_opts); set_opt(sb, INIT_INODE_TABLE); if (def_mount_opts & EXT4_DEFM_DEBUG) set_opt(sb, DEBUG); - if (def_mount_opts & EXT4_DEFM_BSDGROUPS) { - ext4_msg(sb, KERN_WARNING, deprecated_msg, "bsdgroups", - "2.6.38"); + if (def_mount_opts & EXT4_DEFM_BSDGROUPS) set_opt(sb, GRPID); - } if (def_mount_opts & EXT4_DEFM_UID16) set_opt(sb, NO_UID32); /* xattr user namespace & acls are now defaulted on */ -#ifdef CONFIG_EXT4_FS_XATTR set_opt(sb, XATTR_USER); -#endif #ifdef CONFIG_EXT4_FS_POSIX_ACL set_opt(sb, POSIX_ACL); #endif - set_opt(sb, MBLK_IO_SUBMIT); if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_DATA) set_opt(sb, JOURNAL_DATA); else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_ORDERED) @@ -3216,8 +3527,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) if (def_mount_opts & EXT4_DEFM_DISCARD) set_opt(sb, DISCARD); - sbi->s_resuid = le16_to_cpu(es->s_def_resuid); - sbi->s_resgid = le16_to_cpu(es->s_def_resgid); + sbi->s_resuid = make_kuid(&init_user_ns, le16_to_cpu(es->s_def_resuid)); + sbi->s_resgid = make_kgid(&init_user_ns, le16_to_cpu(es->s_def_resgid)); sbi->s_commit_interval = JBD2_DEFAULT_MAX_COMMIT_AGE * HZ; sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME; sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME; @@ -3229,7 +3540,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) * enable delayed allocation by default * Use -o nodelalloc to turn it off */ - if (!IS_EXT3_SB(sb) && + if (!IS_EXT3_SB(sb) && !IS_EXT2_SB(sb) && ((def_mount_opts & EXT4_DEFM_NODELALLOC) == 0)) set_opt(sb, DELALLOC); @@ -3240,13 +3551,14 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sbi->s_li_wait_mult = EXT4_DEF_LI_WAIT_MULT; if (!parse_options((char *) sbi->s_es->s_mount_opts, sb, - &journal_devnum, &journal_ioprio, NULL, 0)) { + &journal_devnum, &journal_ioprio, 0)) { ext4_msg(sb, KERN_WARNING, "failed to parse options in superblock: %s", sbi->s_es->s_mount_opts); } + sbi->s_def_mount_opt = sbi->s_mount_opt; if (!parse_options((char *) data, sb, &journal_devnum, - &journal_ioprio, NULL, 0)) + &journal_ioprio, 0)) goto failed_mount; if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) { @@ -3260,22 +3572,13 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) } if (test_opt(sb, DIOREAD_NOLOCK)) { ext4_msg(sb, KERN_ERR, "can't mount with " - "both data=journal and delalloc"); + "both data=journal and dioread_nolock"); goto failed_mount; } if (test_opt(sb, DELALLOC)) clear_opt(sb, DELALLOC); } - blocksize = BLOCK_SIZE << le32_to_cpu(es->s_log_block_size); - if (test_opt(sb, DIOREAD_NOLOCK)) { - if (blocksize < PAGE_SIZE) { - ext4_msg(sb, KERN_ERR, "can't mount with " - "dioread_nolock if block size != PAGE_SIZE"); - goto failed_mount; - } - } - sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | (test_opt(sb, POSIX_ACL) ? MS_POSIXACL : 0); @@ -3287,6 +3590,16 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) "feature flags set on rev 0 fs, " "running e2fsck is recommended"); + if (es->s_creator_os == cpu_to_le32(EXT4_OS_HURD)) { + set_opt2(sb, HURD_COMPAT); + if (EXT4_HAS_INCOMPAT_FEATURE(sb, + EXT4_FEATURE_INCOMPAT_64BIT)) { + ext4_msg(sb, KERN_ERR, + "The Hurd can't support 64-bit file systems"); + goto failed_mount; + } + } + if (IS_EXT2_SB(sb)) { if (ext2_feature_set_ok(sb)) ext4_msg(sb, KERN_INFO, "mounting ext2 file system " @@ -3317,6 +3630,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) if (!ext4_feature_set_ok(sb, (sb->s_flags & MS_RDONLY))) goto failed_mount; + blocksize = BLOCK_SIZE << le32_to_cpu(es->s_log_block_size); if (blocksize < EXT4_MIN_BLOCK_SIZE || blocksize > EXT4_MAX_BLOCK_SIZE) { ext4_msg(sb, KERN_ERR, @@ -3341,7 +3655,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) "Can't read superblock on 2nd try"); goto failed_mount; } - es = (struct ext4_super_block *)(((char *)bh->b_data) + offset); + es = (struct ext4_super_block *)(bh->b_data + offset); sbi->s_es = es; if (es->s_magic != cpu_to_le16(EXT4_SUPER_MAGIC)) { ext4_msg(sb, KERN_ERR, @@ -3406,17 +3720,22 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) for (i = 0; i < 4; i++) sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]); sbi->s_def_hash_version = es->s_def_hash_version; - i = le32_to_cpu(es->s_flags); - if (i & EXT2_FLAGS_UNSIGNED_HASH) - sbi->s_hash_unsigned = 3; - else if ((i & EXT2_FLAGS_SIGNED_HASH) == 0) { + if (EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_DIR_INDEX)) { + i = le32_to_cpu(es->s_flags); + if (i & EXT2_FLAGS_UNSIGNED_HASH) + sbi->s_hash_unsigned = 3; + else if ((i & EXT2_FLAGS_SIGNED_HASH) == 0) { #ifdef __CHAR_UNSIGNED__ - es->s_flags |= cpu_to_le32(EXT2_FLAGS_UNSIGNED_HASH); - sbi->s_hash_unsigned = 3; + if (!(sb->s_flags & MS_RDONLY)) + es->s_flags |= + cpu_to_le32(EXT2_FLAGS_UNSIGNED_HASH); + sbi->s_hash_unsigned = 3; #else - es->s_flags |= cpu_to_le32(EXT2_FLAGS_SIGNED_HASH); + if (!(sb->s_flags & MS_RDONLY)) + es->s_flags |= + cpu_to_le32(EXT2_FLAGS_SIGNED_HASH); #endif - sb->s_dirt = 1; + } } /* Handle clustersize */ @@ -3473,6 +3792,10 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) goto failed_mount; } + /* Do we have standard group size of clustersize * 8 blocks ? */ + if (sbi->s_blocks_per_group == clustersize << 3) + set_opt2(sb, STD_GROUP_SIZE); + /* * Test whether we have more sectors than will fit in sector_t, * and whether the max offset is addressable by the page cache. @@ -3484,7 +3807,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) " too large to mount safely on this system"); if (sizeof(sector_t) < 8) ext4_msg(sb, KERN_WARNING, "CONFIG_LBDAF not enabled"); - ret = err; goto failed_mount; } @@ -3534,12 +3856,17 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) GFP_KERNEL); if (sbi->s_group_desc == NULL) { ext4_msg(sb, KERN_ERR, "not enough memory"); + ret = -ENOMEM; goto failed_mount; } if (ext4_proc_root) sbi->s_proc = proc_mkdir(sb->s_id, ext4_proc_root); + if (sbi->s_proc) + proc_create_data("options", S_IRUGO, sbi->s_proc, + &ext4_seq_options_fops, sb); + bgl_lock_init(sbi->s_blockgroup_lock); for (i = 0; i < db_count; i++) { @@ -3572,6 +3899,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sbi->s_err_report.function = print_daily_error_info; sbi->s_err_report.data = (unsigned long) sb; + /* Register extent status tree shrinker */ + ext4_es_register_shrinker(sbi); + err = percpu_counter_init(&sbi->s_freeclusters_counter, ext4_count_free_clusters(sb)); if (!err) { @@ -3585,13 +3915,16 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) if (!err) { err = percpu_counter_init(&sbi->s_dirtyclusters_counter, 0); } + if (!err) { + err = percpu_counter_init(&sbi->s_extent_cache_cnt, 0); + } if (err) { ext4_msg(sb, KERN_ERR, "insufficient memory"); goto failed_mount3; } sbi->s_stripe = ext4_get_stripe_size(sbi); - sbi->s_max_writeback_mb_bump = 128; + sbi->s_extent_max_zeroout_kb = 32; /* * set up enough so that it can read an inode @@ -3604,14 +3937,16 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sb->s_export_op = &ext4_export_ops; sb->s_xattr = ext4_xattr_handlers; #ifdef CONFIG_QUOTA - sb->s_qcop = &ext4_qctl_operations; sb->dq_op = &ext4_quota_operations; + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA)) + sb->s_qcop = &ext4_qctl_sysfile_operations; + else + sb->s_qcop = &ext4_qctl_operations; #endif memcpy(sb->s_uuid, es->s_uuid, sizeof(es->s_uuid)); INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */ mutex_init(&sbi->s_orphan_lock); - sbi->s_resize_flags = 0; sb->s_root = NULL; @@ -3644,26 +3979,17 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) goto no_journal; } - if (ext4_blocks_count(es) > 0xffffffffULL && + if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_64BIT) && !jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_64BIT)) { ext4_msg(sb, KERN_ERR, "Failed to set 64-bit journal feature"); goto failed_mount_wq; } - if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) { - jbd2_journal_set_features(sbi->s_journal, - JBD2_FEATURE_COMPAT_CHECKSUM, 0, - JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT); - } else if (test_opt(sb, JOURNAL_CHECKSUM)) { - jbd2_journal_set_features(sbi->s_journal, - JBD2_FEATURE_COMPAT_CHECKSUM, 0, 0); - jbd2_journal_clear_features(sbi->s_journal, 0, 0, - JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT); - } else { - jbd2_journal_clear_features(sbi->s_journal, - JBD2_FEATURE_COMPAT_CHECKSUM, 0, - JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT); + if (!set_journal_csum_feature_set(sb)) { + ext4_msg(sb, KERN_ERR, "Failed to set journal checksum " + "feature set"); + goto failed_mount_wq; } /* We have now updated the journal if required, so we can @@ -3694,6 +4020,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) } set_task_ioprio(sbi->s_journal->j_task, journal_ioprio); + sbi->s_journal->j_commit_callback = ext4_journal_commit_callback; + /* * The journal may have updated the bg summary counts, so we * need to update the global counters. @@ -3707,15 +4035,36 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) percpu_counter_set(&sbi->s_dirtyclusters_counter, 0); no_journal: + if (ext4_mballoc_ready) { + sbi->s_mb_cache = ext4_xattr_create_cache(sb->s_id); + if (!sbi->s_mb_cache) { + ext4_msg(sb, KERN_ERR, "Failed to create an mb_cache"); + goto failed_mount_wq; + } + } + + /* + * Get the # of file system overhead blocks from the + * superblock if present. + */ + if (es->s_overhead_clusters) + sbi->s_overhead = le32_to_cpu(es->s_overhead_clusters); + else { + err = ext4_calculate_overhead(sb); + if (err) + goto failed_mount_wq; + } + /* * The maximum number of concurrent works can be high and * concurrency isn't really necessary. Limit it to 1. */ - EXT4_SB(sb)->dio_unwritten_wq = - alloc_workqueue("ext4-dio-unwritten", WQ_MEM_RECLAIM | WQ_UNBOUND, 1); - if (!EXT4_SB(sb)->dio_unwritten_wq) { - printk(KERN_ERR "EXT4-fs: failed to create DIO workqueue\n"); - goto failed_mount_wq; + EXT4_SB(sb)->rsv_conversion_wq = + alloc_workqueue("ext4-rsv-conversion", WQ_MEM_RECLAIM | WQ_UNBOUND, 1); + if (!EXT4_SB(sb)->rsv_conversion_wq) { + printk(KERN_ERR "EXT4-fs: failed to create workqueue\n"); + ret = -ENOMEM; + goto failed_mount4; } /* @@ -3735,15 +4084,15 @@ no_journal: iput(root); goto failed_mount4; } - sb->s_root = d_alloc_root(root); + sb->s_root = d_make_root(root); if (!sb->s_root) { - iput(root); ext4_msg(sb, KERN_ERR, "get root dentry failed"); ret = -ENOMEM; goto failed_mount4; } - ext4_setup_super(sb, es, sb->s_flags & MS_RDONLY); + if (ext4_setup_super(sb, es, sb->s_flags & MS_RDONLY)) + sb->s_flags |= MS_RDONLY; /* determine the minimum size of new large inodes, if present */ if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE) { @@ -3770,6 +4119,13 @@ no_journal: "available"); } + err = ext4_reserve_clusters(sbi, ext4_calculate_resv_clusters(sb)); + if (err) { + ext4_msg(sb, KERN_ERR, "failed to reserve %llu clusters for " + "reserved pool", ext4_calculate_resv_clusters(sb)); + goto failed_mount4a; + } + err = ext4_setup_system_zone(sb); if (err) { ext4_msg(sb, KERN_ERR, "failed to initialize system " @@ -3778,7 +4134,7 @@ no_journal: } ext4_ext_init(sb); - err = ext4_mb_init(sb, needs_recovery); + err = ext4_mb_init(sb); if (err) { ext4_msg(sb, KERN_ERR, "failed to initialize mballoc (%d)", err); @@ -3796,6 +4152,16 @@ no_journal: if (err) goto failed_mount7; +#ifdef CONFIG_QUOTA + /* Enable quota usage during mount. */ + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA) && + !(sb->s_flags & MS_RDONLY)) { + err = ext4_enable_quotas(sb); + if (err) + goto failed_mount8; + } +#endif /* CONFIG_QUOTA */ + EXT4_SB(sb)->s_mount_state |= EXT4_ORPHAN_FS; ext4_orphan_cleanup(sb, es); EXT4_SB(sb)->s_mount_state &= ~EXT4_ORPHAN_FS; @@ -3813,6 +4179,14 @@ no_journal: } else descr = "out journal"; + if (test_opt(sb, DISCARD)) { + struct request_queue *q = bdev_get_queue(sb->s_bdev); + if (!blk_queue_discard(q)) + ext4_msg(sb, KERN_WARNING, + "mounting with \"discard\" option, but " + "the device does not support discard"); + } + ext4_msg(sb, KERN_INFO, "mounted filesystem with%s. " "Opts: %s%s%s", descr, sbi->s_es->s_mount_opts, *sbi->s_es->s_mount_opts ? "; " : "", orig_data); @@ -3820,6 +4194,11 @@ no_journal: if (es->s_error_count) mod_timer(&sbi->s_err_report, jiffies + 300*HZ); /* 5 minutes */ + /* Enable message ratelimiting. Default is 10 messages per 5 secs. */ + ratelimit_state_init(&sbi->s_err_ratelimit_state, 5 * HZ, 10); + ratelimit_state_init(&sbi->s_warning_ratelimit_state, 5 * HZ, 10); + ratelimit_state_init(&sbi->s_msg_ratelimit_state, 5 * HZ, 10); + kfree(orig_data); return 0; @@ -3828,6 +4207,10 @@ cantfind_ext4: ext4_msg(sb, KERN_ERR, "VFS: Can't find ext4 filesystem"); goto failed_mount; +#ifdef CONFIG_QUOTA +failed_mount8: + kobject_del(&sbi->s_kobj); +#endif failed_mount7: ext4_unregister_li_request(sb); failed_mount6: @@ -3840,20 +4223,23 @@ failed_mount4a: sb->s_root = NULL; failed_mount4: ext4_msg(sb, KERN_ERR, "mount failed"); - destroy_workqueue(EXT4_SB(sb)->dio_unwritten_wq); + if (EXT4_SB(sb)->rsv_conversion_wq) + destroy_workqueue(EXT4_SB(sb)->rsv_conversion_wq); failed_mount_wq: if (sbi->s_journal) { jbd2_journal_destroy(sbi->s_journal); sbi->s_journal = NULL; } failed_mount3: - del_timer(&sbi->s_err_report); + ext4_es_unregister_shrinker(sbi); + del_timer_sync(&sbi->s_err_report); if (sbi->s_flex_groups) ext4_kvfree(sbi->s_flex_groups); percpu_counter_destroy(&sbi->s_freeclusters_counter); percpu_counter_destroy(&sbi->s_freeinodes_counter); percpu_counter_destroy(&sbi->s_dirs_counter); percpu_counter_destroy(&sbi->s_dirtyclusters_counter); + percpu_counter_destroy(&sbi->s_extent_cache_cnt); if (sbi->s_mmp_tsk) kthread_stop(sbi->s_mmp_tsk); failed_mount2: @@ -3861,7 +4247,10 @@ failed_mount2: brelse(sbi->s_group_desc[i]); ext4_kvfree(sbi->s_group_desc); failed_mount: + if (sbi->s_chksum_driver) + crypto_free_shash(sbi->s_chksum_driver); if (sbi->s_proc) { + remove_proc_entry("options", sbi->s_proc); remove_proc_entry(sb->s_id, ext4_proc_root); } #ifdef CONFIG_QUOTA @@ -3876,7 +4265,7 @@ out_fail: kfree(sbi); out_free_orig: kfree(orig_data); - return ret; + return err ? err : ret; } /* @@ -3983,7 +4372,7 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb, goto out_bdev; } - es = (struct ext4_super_block *) (((char *)bh->b_data) + offset); + es = (struct ext4_super_block *) (bh->b_data + offset); if ((le16_to_cpu(es->s_magic) != EXT4_SUPER_MAGIC) || !(le32_to_cpu(es->s_feature_incompat) & EXT4_FEATURE_INCOMPAT_JOURNAL_DEV)) { @@ -4010,7 +4399,7 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb, goto out_bdev; } journal->j_private = sb; - ll_rw_block(READ, 1, &journal->j_sb_buffer); + ll_rw_block(READ | REQ_META | REQ_PRIO, 1, &journal->j_sb_buffer); wait_on_buffer(journal->j_sb_buffer); if (!buffer_uptodate(journal->j_sb_buffer)) { ext4_msg(sb, KERN_ERR, "I/O error on journal device"); @@ -4091,15 +4480,6 @@ static int ext4_load_journal(struct super_block *sb, if (!(journal->j_flags & JBD2_BARRIER)) ext4_msg(sb, KERN_INFO, "barriers disabled"); - if (!really_read_only && test_opt(sb, UPDATE_JOURNAL)) { - err = jbd2_journal_update_format(journal); - if (err) { - ext4_msg(sb, KERN_ERR, "error updating journal"); - jbd2_journal_destroy(journal); - return err; - } - } - if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) err = jbd2_journal_wipe(journal, !really_read_only); if (!err) { @@ -4182,8 +4562,8 @@ static int ext4_commit_super(struct super_block *sb, int sync) es->s_free_inodes_count = cpu_to_le32(percpu_counter_sum_positive( &EXT4_SB(sb)->s_freeinodes_counter)); - sb->s_dirt = 0; BUFFER_TRACE(sbh, "marking dirty"); + ext4_superblock_csum_set(sb); mark_buffer_dirty(sbh); if (sync) { error = sync_dirty_buffer(sbh); @@ -4264,6 +4644,7 @@ static void ext4_clear_journal_err(struct super_block *sb, ext4_commit_super(sb, 1); jbd2_journal_clear_err(journal); + jbd2_journal_update_sb_errno(journal); } } @@ -4274,39 +4655,62 @@ static void ext4_clear_journal_err(struct super_block *sb, int ext4_force_commit(struct super_block *sb) { journal_t *journal; - int ret = 0; if (sb->s_flags & MS_RDONLY) return 0; journal = EXT4_SB(sb)->s_journal; - if (journal) { - vfs_check_frozen(sb, SB_FREEZE_TRANS); - ret = ext4_journal_force_commit(journal); - } - - return ret; -} - -static void ext4_write_super(struct super_block *sb) -{ - lock_super(sb); - ext4_commit_super(sb, 1); - unlock_super(sb); + return ext4_journal_force_commit(journal); } static int ext4_sync_fs(struct super_block *sb, int wait) { int ret = 0; tid_t target; + bool needs_barrier = false; struct ext4_sb_info *sbi = EXT4_SB(sb); trace_ext4_sync_fs(sb, wait); - flush_workqueue(sbi->dio_unwritten_wq); + flush_workqueue(sbi->rsv_conversion_wq); + /* + * Writeback quota in non-journalled quota case - journalled quota has + * no dirty dquots + */ + dquot_writeback_dquots(sb, -1); + /* + * Data writeback is possible w/o journal transaction, so barrier must + * being sent at the end of the function. But we can skip it if + * transaction_commit will do it for us. + */ + target = jbd2_get_latest_transaction(sbi->s_journal); + if (wait && sbi->s_journal->j_flags & JBD2_BARRIER && + !jbd2_trans_will_send_data_barrier(sbi->s_journal, target)) + needs_barrier = true; + if (jbd2_journal_start_commit(sbi->s_journal, &target)) { if (wait) - jbd2_log_wait_commit(sbi->s_journal, target); + ret = jbd2_log_wait_commit(sbi->s_journal, target); + } + if (needs_barrier) { + int err; + err = blkdev_issue_flush(sb->s_bdev, GFP_KERNEL, NULL); + if (!ret) + ret = err; } + + return ret; +} + +static int ext4_sync_fs_nojournal(struct super_block *sb, int wait) +{ + int ret = 0; + + trace_ext4_sync_fs(sb, wait); + flush_workqueue(EXT4_SB(sb)->rsv_conversion_wq); + dquot_writeback_dquots(sb, -1); + if (wait && test_opt(sb, BARRIER)) + ret = blkdev_issue_flush(sb->s_bdev, GFP_KERNEL, NULL); + return ret; } @@ -4315,9 +4719,8 @@ static int ext4_sync_fs(struct super_block *sb, int wait) * gives us a chance to flush the journal completely and mark the fs clean. * * Note that only this function cannot bring a filesystem to be in a clean - * state independently, because ext4 prevents a new handle from being started - * by @sb->s_frozen, which stays in an upper layer. It thus needs help from - * the upper layer. + * state independently. It relies on upper layer to stop all data & metadata + * modifications. */ static int ext4_freeze(struct super_block *sb) { @@ -4344,7 +4747,7 @@ static int ext4_freeze(struct super_block *sb) EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); error = ext4_commit_super(sb, 1); out: - /* we rely on s_frozen to stop further updates */ + /* we rely on upper layer to stop further updates */ jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); return error; } @@ -4358,11 +4761,9 @@ static int ext4_unfreeze(struct super_block *sb) if (sb->s_flags & MS_RDONLY) return 0; - lock_super(sb); /* Reset the needs_recovery flag before the fs is unlocked. */ EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); ext4_commit_super(sb, 1); - unlock_super(sb); return 0; } @@ -4372,8 +4773,8 @@ static int ext4_unfreeze(struct super_block *sb) struct ext4_mount_options { unsigned long s_mount_opt; unsigned long s_mount_opt2; - uid_t s_resuid; - gid_t s_resgid; + kuid_t s_resuid; + kgid_t s_resgid; unsigned long s_commit_interval; u32 s_min_batch_time, s_max_batch_time; #ifdef CONFIG_QUOTA @@ -4386,7 +4787,6 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) { struct ext4_super_block *es; struct ext4_sb_info *sbi = EXT4_SB(sb); - ext4_fsblk_t n_blocks_count = 0; unsigned long old_sb_flags; struct ext4_mount_options old_opts; int enable_quota = 0; @@ -4394,12 +4794,11 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO; int err = 0; #ifdef CONFIG_QUOTA - int i; + int i, j; #endif char *orig_data = kstrdup(data, GFP_KERNEL); /* Store the original options */ - lock_super(sb); old_sb_flags = sb->s_flags; old_opts.s_mount_opt = sbi->s_mount_opt; old_opts.s_mount_opt2 = sbi->s_mount_opt2; @@ -4411,7 +4810,17 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) #ifdef CONFIG_QUOTA old_opts.s_jquota_fmt = sbi->s_jquota_fmt; for (i = 0; i < MAXQUOTAS; i++) - old_opts.s_qf_names[i] = sbi->s_qf_names[i]; + if (sbi->s_qf_names[i]) { + old_opts.s_qf_names[i] = kstrdup(sbi->s_qf_names[i], + GFP_KERNEL); + if (!old_opts.s_qf_names[i]) { + for (j = 0; j < i; j++) + kfree(old_opts.s_qf_names[j]); + kfree(orig_data); + return -ENOMEM; + } + } else + old_opts.s_qf_names[i] = NULL; #endif if (sbi->s_journal && sbi->s_journal->j_task->io_context) journal_ioprio = sbi->s_journal->j_task->io_context->ioprio; @@ -4419,12 +4828,26 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) /* * Allow the "check" option to be passed as a remount option. */ - if (!parse_options(data, sb, NULL, &journal_ioprio, - &n_blocks_count, 1)) { + if (!parse_options(data, sb, NULL, &journal_ioprio, 1)) { err = -EINVAL; goto restore_opts; } + if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) { + if (test_opt2(sb, EXPLICIT_DELALLOC)) { + ext4_msg(sb, KERN_ERR, "can't mount with " + "both data=journal and delalloc"); + err = -EINVAL; + goto restore_opts; + } + if (test_opt(sb, DIOREAD_NOLOCK)) { + ext4_msg(sb, KERN_ERR, "can't mount with " + "both data=journal and dioread_nolock"); + err = -EINVAL; + goto restore_opts; + } + } + if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED) ext4_abort(sb, "Abort forced by user"); @@ -4438,14 +4861,16 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) set_task_ioprio(sbi->s_journal->j_task, journal_ioprio); } - if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY) || - n_blocks_count > ext4_blocks_count(es)) { + if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY)) { if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED) { err = -EROFS; goto restore_opts; } if (*flags & MS_RDONLY) { + err = sync_filesystem(sb); + if (err < 0) + goto restore_opts; err = dquot_suspend(sb, -1); if (err < 0) goto restore_opts; @@ -4481,7 +4906,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) struct ext4_group_desc *gdp = ext4_get_group_desc(sb, g, NULL); - if (!ext4_group_desc_csum_verify(sbi, g, gdp)) { + if (!ext4_group_desc_csum_verify(sb, g, gdp)) { ext4_msg(sb, KERN_ERR, "ext4_remount: Checksum for group %u failed (%u!=%u)", g, le16_to_cpu(ext4_group_desc_csum(sbi, g, gdp)), @@ -4514,8 +4939,6 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) if (sbi->s_journal) ext4_clear_journal_err(sb, es); sbi->s_mount_state = le16_to_cpu(es->s_state); - if ((err = ext4_group_extend(sb, es, n_blocks_count))) - goto restore_opts; if (!ext4_setup_super(sb, es, 0)) sb->s_flags &= ~MS_RDONLY; if (EXT4_HAS_INCOMPAT_FEATURE(sb, @@ -4542,19 +4965,24 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) } ext4_setup_system_zone(sb); - if (sbi->s_journal == NULL) + if (sbi->s_journal == NULL && !(old_sb_flags & MS_RDONLY)) ext4_commit_super(sb, 1); #ifdef CONFIG_QUOTA /* Release old quota file names */ for (i = 0; i < MAXQUOTAS; i++) - if (old_opts.s_qf_names[i] && - old_opts.s_qf_names[i] != sbi->s_qf_names[i]) - kfree(old_opts.s_qf_names[i]); + kfree(old_opts.s_qf_names[i]); + if (enable_quota) { + if (sb_any_quota_suspended(sb)) + dquot_resume(sb, -1); + else if (EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_QUOTA)) { + err = ext4_enable_quotas(sb); + if (err) + goto restore_opts; + } + } #endif - unlock_super(sb); - if (enable_quota) - dquot_resume(sb, -1); ext4_msg(sb, KERN_INFO, "re-mounted. Opts: %s", orig_data); kfree(orig_data); @@ -4572,84 +5000,37 @@ restore_opts: #ifdef CONFIG_QUOTA sbi->s_jquota_fmt = old_opts.s_jquota_fmt; for (i = 0; i < MAXQUOTAS; i++) { - if (sbi->s_qf_names[i] && - old_opts.s_qf_names[i] != sbi->s_qf_names[i]) - kfree(sbi->s_qf_names[i]); + kfree(sbi->s_qf_names[i]); sbi->s_qf_names[i] = old_opts.s_qf_names[i]; } #endif - unlock_super(sb); kfree(orig_data); return err; } -/* - * Note: calculating the overhead so we can be compatible with - * historical BSD practice is quite difficult in the face of - * clusters/bigalloc. This is because multiple metadata blocks from - * different block group can end up in the same allocation cluster. - * Calculating the exact overhead in the face of clustered allocation - * requires either O(all block bitmaps) in memory or O(number of block - * groups**2) in time. We will still calculate the superblock for - * older file systems --- and if we come across with a bigalloc file - * system with zero in s_overhead_clusters the estimate will be close to - * correct especially for very large cluster sizes --- but for newer - * file systems, it's better to calculate this figure once at mkfs - * time, and store it in the superblock. If the superblock value is - * present (even for non-bigalloc file systems), we will use it. - */ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf) { struct super_block *sb = dentry->d_sb; struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = sbi->s_es; - struct ext4_group_desc *gdp; + ext4_fsblk_t overhead = 0, resv_blocks; u64 fsid; s64 bfree; + resv_blocks = EXT4_C2B(sbi, atomic64_read(&sbi->s_resv_clusters)); - if (test_opt(sb, MINIX_DF)) { - sbi->s_overhead_last = 0; - } else if (es->s_overhead_clusters) { - sbi->s_overhead_last = le32_to_cpu(es->s_overhead_clusters); - } else if (sbi->s_blocks_last != ext4_blocks_count(es)) { - ext4_group_t i, ngroups = ext4_get_groups_count(sb); - ext4_fsblk_t overhead = 0; - - /* - * Compute the overhead (FS structures). This is constant - * for a given filesystem unless the number of block groups - * changes so we cache the previous value until it does. - */ - - /* - * All of the blocks before first_data_block are - * overhead - */ - overhead = EXT4_B2C(sbi, le32_to_cpu(es->s_first_data_block)); - - /* - * Add the overhead found in each block group - */ - for (i = 0; i < ngroups; i++) { - gdp = ext4_get_group_desc(sb, i, NULL); - overhead += ext4_num_overhead_clusters(sb, i, gdp); - cond_resched(); - } - sbi->s_overhead_last = overhead; - smp_wmb(); - sbi->s_blocks_last = ext4_blocks_count(es); - } + if (!test_opt(sb, MINIX_DF)) + overhead = sbi->s_overhead; buf->f_type = EXT4_SUPER_MAGIC; buf->f_bsize = sb->s_blocksize; - buf->f_blocks = (ext4_blocks_count(es) - - EXT4_C2B(sbi, sbi->s_overhead_last)); + buf->f_blocks = ext4_blocks_count(es) - EXT4_C2B(sbi, overhead); bfree = percpu_counter_sum_positive(&sbi->s_freeclusters_counter) - percpu_counter_sum_positive(&sbi->s_dirtyclusters_counter); /* prevent underflow in case that few free space is available */ buf->f_bfree = EXT4_C2B(sbi, max_t(s64, bfree, 0)); - buf->f_bavail = buf->f_bfree - ext4_r_blocks_count(es); - if (buf->f_bfree < ext4_r_blocks_count(es)) + buf->f_bavail = buf->f_bfree - + (ext4_r_blocks_count(es) + resv_blocks); + if (buf->f_bfree < (ext4_r_blocks_count(es) + resv_blocks)) buf->f_bavail = 0; buf->f_files = le32_to_cpu(es->s_inodes_count); buf->f_ffree = percpu_counter_sum_positive(&sbi->s_freeinodes_counter); @@ -4676,7 +5057,7 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf) static inline struct inode *dquot_to_inode(struct dquot *dquot) { - return sb_dqopt(dquot->dq_sb)->files[dquot->dq_type]; + return sb_dqopt(dquot->dq_sb)->files[dquot->dq_id.type]; } static int ext4_write_dquot(struct dquot *dquot) @@ -4686,7 +5067,7 @@ static int ext4_write_dquot(struct dquot *dquot) struct inode *inode; inode = dquot_to_inode(dquot); - handle = ext4_journal_start(inode, + handle = ext4_journal_start(inode, EXT4_HT_QUOTA, EXT4_QUOTA_TRANS_BLOCKS(dquot->dq_sb)); if (IS_ERR(handle)) return PTR_ERR(handle); @@ -4702,7 +5083,7 @@ static int ext4_acquire_dquot(struct dquot *dquot) int ret, err; handle_t *handle; - handle = ext4_journal_start(dquot_to_inode(dquot), + handle = ext4_journal_start(dquot_to_inode(dquot), EXT4_HT_QUOTA, EXT4_QUOTA_INIT_BLOCKS(dquot->dq_sb)); if (IS_ERR(handle)) return PTR_ERR(handle); @@ -4718,7 +5099,7 @@ static int ext4_release_dquot(struct dquot *dquot) int ret, err; handle_t *handle; - handle = ext4_journal_start(dquot_to_inode(dquot), + handle = ext4_journal_start(dquot_to_inode(dquot), EXT4_HT_QUOTA, EXT4_QUOTA_DEL_BLOCKS(dquot->dq_sb)); if (IS_ERR(handle)) { /* Release dquot anyway to avoid endless cycle in dqput() */ @@ -4734,9 +5115,12 @@ static int ext4_release_dquot(struct dquot *dquot) static int ext4_mark_dquot_dirty(struct dquot *dquot) { + struct super_block *sb = dquot->dq_sb; + struct ext4_sb_info *sbi = EXT4_SB(sb); + /* Are we journaling quotas? */ - if (EXT4_SB(dquot->dq_sb)->s_qf_names[USRQUOTA] || - EXT4_SB(dquot->dq_sb)->s_qf_names[GRPQUOTA]) { + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA) || + sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) { dquot_mark_dquot_dirty(dquot); return ext4_write_dquot(dquot); } else { @@ -4750,7 +5134,7 @@ static int ext4_write_info(struct super_block *sb, int type) handle_t *handle; /* Data block + inode block */ - handle = ext4_journal_start(sb->s_root->d_inode, 2); + handle = ext4_journal_start(sb->s_root->d_inode, EXT4_HT_QUOTA, 2); if (IS_ERR(handle)) return PTR_ERR(handle); ret = dquot_commit_info(sb, type); @@ -4813,6 +5197,76 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id, return dquot_quota_on(sb, type, format_id, path); } +static int ext4_quota_enable(struct super_block *sb, int type, int format_id, + unsigned int flags) +{ + int err; + struct inode *qf_inode; + unsigned long qf_inums[MAXQUOTAS] = { + le32_to_cpu(EXT4_SB(sb)->s_es->s_usr_quota_inum), + le32_to_cpu(EXT4_SB(sb)->s_es->s_grp_quota_inum) + }; + + BUG_ON(!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA)); + + if (!qf_inums[type]) + return -EPERM; + + qf_inode = ext4_iget(sb, qf_inums[type]); + if (IS_ERR(qf_inode)) { + ext4_error(sb, "Bad quota inode # %lu", qf_inums[type]); + return PTR_ERR(qf_inode); + } + + /* Don't account quota for quota files to avoid recursion */ + qf_inode->i_flags |= S_NOQUOTA; + err = dquot_enable(qf_inode, type, format_id, flags); + iput(qf_inode); + + return err; +} + +/* Enable usage tracking for all quota types. */ +static int ext4_enable_quotas(struct super_block *sb) +{ + int type, err = 0; + unsigned long qf_inums[MAXQUOTAS] = { + le32_to_cpu(EXT4_SB(sb)->s_es->s_usr_quota_inum), + le32_to_cpu(EXT4_SB(sb)->s_es->s_grp_quota_inum) + }; + + sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE; + for (type = 0; type < MAXQUOTAS; type++) { + if (qf_inums[type]) { + err = ext4_quota_enable(sb, type, QFMT_VFS_V1, + DQUOT_USAGE_ENABLED); + if (err) { + ext4_warning(sb, + "Failed to enable quota tracking " + "(type=%d, err=%d). Please run " + "e2fsck to fix.", type, err); + return err; + } + } + } + return 0; +} + +/* + * quota_on function that is used when QUOTA feature is set. + */ +static int ext4_quota_on_sysfile(struct super_block *sb, int type, + int format_id) +{ + if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA)) + return -EINVAL; + + /* + * USAGE was enabled at mount time. Only need to enable LIMITS now. + */ + return ext4_quota_enable(sb, type, format_id, DQUOT_LIMITS_ENABLED); +} + static int ext4_quota_off(struct super_block *sb, int type) { struct inode *inode = sb_dqopt(sb)->files[type]; @@ -4828,7 +5282,7 @@ static int ext4_quota_off(struct super_block *sb, int type) /* Update modification times of quota files when userspace can * start looking at them */ - handle = ext4_journal_start(inode, 1); + handle = ext4_journal_start(inode, EXT4_HT_QUOTA, 1); if (IS_ERR(handle)) goto out; inode->i_mtime = inode->i_ctime = CURRENT_TIME; @@ -4839,6 +5293,18 @@ out: return dquot_quota_off(sb, type); } +/* + * quota_off function that is used when QUOTA feature is set. + */ +static int ext4_quota_off_sysfile(struct super_block *sb, int type) +{ + if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA)) + return -EINVAL; + + /* Disable only the limits. */ + return dquot_disable(sb, type, DQUOT_LIMITS_ENABLED); +} + /* Read data from quotafile - avoid pagecache and such because we cannot afford * acquiring the locks... As quota files are never truncated and quota code * itself serializes the operations (and no one else should touch the files) @@ -4908,10 +5374,10 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type, return -EIO; } - mutex_lock_nested(&inode->i_mutex, I_MUTEX_QUOTA); bh = ext4_bread(handle, inode, blk, 1, &err); if (!bh) goto out; + BUFFER_TRACE(bh, "get write access"); err = ext4_journal_get_write_access(handle, bh); if (err) { brelse(bh); @@ -4924,16 +5390,13 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type, err = ext4_handle_dirty_metadata(handle, NULL, bh); brelse(bh); out: - if (err) { - mutex_unlock(&inode->i_mutex); + if (err) return err; - } if (inode->i_size < off + len) { i_size_write(inode, off + len); EXT4_I(inode)->i_disksize = inode->i_size; ext4_mark_inode_dirty(handle, inode); } - mutex_unlock(&inode->i_mutex); return len; } @@ -4969,7 +5432,6 @@ static inline int ext2_feature_set_ok(struct super_block *sb) return 0; return 1; } -MODULE_ALIAS("ext2"); #else static inline void register_as_ext2(void) { } static inline void unregister_as_ext2(void) { } @@ -5002,7 +5464,6 @@ static inline int ext3_feature_set_ok(struct super_block *sb) return 0; return 1; } -MODULE_ALIAS("ext3"); #else static inline void register_as_ext3(void) { } static inline void unregister_as_ext3(void) { } @@ -5016,6 +5477,7 @@ static struct file_system_type ext4_fs_type = { .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; +MODULE_ALIAS_FS("ext4"); static int __init ext4_init_feat_adverts(void) { @@ -5056,6 +5518,10 @@ static int __init ext4_init_fs(void) { int i, err; + ext4_li_info = NULL; + mutex_init(&ext4_li_mtx); + + /* Build-time check for flags consistency */ ext4_check_flag_values(); for (i = 0; i < EXT4_WQ_HASH_SZ; i++) { @@ -5063,15 +5529,22 @@ static int __init ext4_init_fs(void) init_waitqueue_head(&ext4__ioend_wq[i]); } - err = ext4_init_pageio(); + err = ext4_init_es(); if (err) return err; + + err = ext4_init_pageio(); + if (err) + goto out7; + err = ext4_init_system_zone(); if (err) goto out6; ext4_kset = kset_create_and_add("ext4", NULL, fs_kobj); - if (!ext4_kset) + if (!ext4_kset) { + err = -ENOMEM; goto out5; + } ext4_proc_root = proc_mkdir("fs/ext4", NULL); err = ext4_init_feat_adverts(); @@ -5080,11 +5553,9 @@ static int __init ext4_init_fs(void) err = ext4_init_mballoc(); if (err) - goto out3; - - err = ext4_init_xattr(); - if (err) goto out2; + else + ext4_mballoc_ready = 1; err = init_inodecache(); if (err) goto out1; @@ -5094,18 +5565,15 @@ static int __init ext4_init_fs(void) if (err) goto out; - ext4_li_info = NULL; - mutex_init(&ext4_li_mtx); return 0; out: unregister_as_ext2(); unregister_as_ext3(); destroy_inodecache(); out1: - ext4_exit_xattr(); -out2: + ext4_mballoc_ready = 0; ext4_exit_mballoc(); -out3: +out2: ext4_exit_feat_adverts(); out4: if (ext4_proc_root) @@ -5115,6 +5583,9 @@ out5: ext4_exit_system_zone(); out6: ext4_exit_pageio(); +out7: + ext4_exit_es(); + return err; } @@ -5125,13 +5596,13 @@ static void __exit ext4_exit_fs(void) unregister_as_ext3(); unregister_filesystem(&ext4_fs_type); destroy_inodecache(); - ext4_exit_xattr(); ext4_exit_mballoc(); ext4_exit_feat_adverts(); remove_proc_entry("fs/ext4", NULL); kset_unregister(ext4_kset); ext4_exit_system_zone(); ext4_exit_pageio(); + ext4_exit_es(); } MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others"); diff --git a/fs/ext4/symlink.c b/fs/ext4/symlink.c index ed9354aff27..ff371193201 100644 --- a/fs/ext4/symlink.c +++ b/fs/ext4/symlink.c @@ -35,22 +35,18 @@ const struct inode_operations ext4_symlink_inode_operations = { .follow_link = page_follow_link_light, .put_link = page_put_link, .setattr = ext4_setattr, -#ifdef CONFIG_EXT4_FS_XATTR .setxattr = generic_setxattr, .getxattr = generic_getxattr, .listxattr = ext4_listxattr, .removexattr = generic_removexattr, -#endif }; const struct inode_operations ext4_fast_symlink_inode_operations = { .readlink = generic_readlink, .follow_link = ext4_follow_link, .setattr = ext4_setattr, -#ifdef CONFIG_EXT4_FS_XATTR .setxattr = generic_setxattr, .getxattr = generic_getxattr, .listxattr = ext4_listxattr, .removexattr = generic_removexattr, -#endif }; diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 93a00d89a22..e7387337060 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -61,11 +61,6 @@ #include "xattr.h" #include "acl.h" -#define BHDR(bh) ((struct ext4_xattr_header *)((bh)->b_data)) -#define ENTRY(ptr) ((struct ext4_xattr_entry *)(ptr)) -#define BFIRST(bh) ENTRY(BHDR(bh)+1) -#define IS_LAST_ENTRY(entry) (*(__u32 *)(entry) == 0) - #ifdef EXT4_XATTR_DEBUG # define ea_idebug(inode, f...) do { \ printk(KERN_DEBUG "inode %s:%lu: ", \ @@ -82,11 +77,11 @@ printk("\n"); \ } while (0) #else -# define ea_idebug(f...) -# define ea_bdebug(f...) +# define ea_idebug(inode, fmt, ...) no_printk(fmt, ##__VA_ARGS__) +# define ea_bdebug(bh, fmt, ...) no_printk(fmt, ##__VA_ARGS__) #endif -static void ext4_xattr_cache_insert(struct buffer_head *); +static void ext4_xattr_cache_insert(struct mb_cache *, struct buffer_head *); static struct buffer_head *ext4_xattr_cache_find(struct inode *, struct ext4_xattr_header *, struct mb_cache_entry **); @@ -95,13 +90,11 @@ static void ext4_xattr_rehash(struct ext4_xattr_header *, static int ext4_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size); -static struct mb_cache *ext4_xattr_cache; - static const struct xattr_handler *ext4_xattr_handler_map[] = { [EXT4_XATTR_INDEX_USER] = &ext4_xattr_user_handler, #ifdef CONFIG_EXT4_FS_POSIX_ACL - [EXT4_XATTR_INDEX_POSIX_ACL_ACCESS] = &ext4_xattr_acl_access_handler, - [EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT] = &ext4_xattr_acl_default_handler, + [EXT4_XATTR_INDEX_POSIX_ACL_ACCESS] = &posix_acl_access_xattr_handler, + [EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT] = &posix_acl_default_xattr_handler, #endif [EXT4_XATTR_INDEX_TRUSTED] = &ext4_xattr_trusted_handler, #ifdef CONFIG_EXT4_FS_SECURITY @@ -113,8 +106,8 @@ const struct xattr_handler *ext4_xattr_handlers[] = { &ext4_xattr_user_handler, &ext4_xattr_trusted_handler, #ifdef CONFIG_EXT4_FS_POSIX_ACL - &ext4_xattr_acl_access_handler, - &ext4_xattr_acl_default_handler, + &posix_acl_access_xattr_handler, + &posix_acl_default_xattr_handler, #endif #ifdef CONFIG_EXT4_FS_SECURITY &ext4_xattr_security_handler, @@ -122,6 +115,59 @@ const struct xattr_handler *ext4_xattr_handlers[] = { NULL }; +#define EXT4_GET_MB_CACHE(inode) (((struct ext4_sb_info *) \ + inode->i_sb->s_fs_info)->s_mb_cache) + +static __le32 ext4_xattr_block_csum(struct inode *inode, + sector_t block_nr, + struct ext4_xattr_header *hdr) +{ + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + __u32 csum; + __le32 save_csum; + __le64 dsk_block_nr = cpu_to_le64(block_nr); + + save_csum = hdr->h_checksum; + hdr->h_checksum = 0; + csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&dsk_block_nr, + sizeof(dsk_block_nr)); + csum = ext4_chksum(sbi, csum, (__u8 *)hdr, + EXT4_BLOCK_SIZE(inode->i_sb)); + + hdr->h_checksum = save_csum; + return cpu_to_le32(csum); +} + +static int ext4_xattr_block_csum_verify(struct inode *inode, + sector_t block_nr, + struct ext4_xattr_header *hdr) +{ + if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM) && + (hdr->h_checksum != ext4_xattr_block_csum(inode, block_nr, hdr))) + return 0; + return 1; +} + +static void ext4_xattr_block_csum_set(struct inode *inode, + sector_t block_nr, + struct ext4_xattr_header *hdr) +{ + if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + return; + + hdr->h_checksum = ext4_xattr_block_csum(inode, block_nr, hdr); +} + +static inline int ext4_handle_dirty_xattr_block(handle_t *handle, + struct inode *inode, + struct buffer_head *bh) +{ + ext4_xattr_block_csum_set(inode, bh->b_blocknr, BHDR(bh)); + return ext4_handle_dirty_metadata(handle, inode, bh); +} + static inline const struct xattr_handler * ext4_xattr_handler(int name_index) { @@ -156,14 +202,21 @@ ext4_xattr_check_names(struct ext4_xattr_entry *entry, void *end) } static inline int -ext4_xattr_check_block(struct buffer_head *bh) +ext4_xattr_check_block(struct inode *inode, struct buffer_head *bh) { int error; + if (buffer_verified(bh)) + return 0; + if (BHDR(bh)->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC) || BHDR(bh)->h_blocks != cpu_to_le32(1)) return -EIO; + if (!ext4_xattr_block_csum_verify(inode, bh->b_blocknr, BHDR(bh))) + return -EIO; error = ext4_xattr_check_names(BFIRST(bh), bh->b_data + bh->b_size); + if (!error) + set_buffer_verified(bh); return error; } @@ -213,6 +266,7 @@ ext4_xattr_block_get(struct inode *inode, int name_index, const char *name, struct ext4_xattr_entry *entry; size_t size; int error; + struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode); ea_idebug(inode, "name=%d.%s, buffer=%p, buffer_size=%ld", name_index, name, buffer, (long)buffer_size); @@ -220,20 +274,21 @@ ext4_xattr_block_get(struct inode *inode, int name_index, const char *name, error = -ENODATA; if (!EXT4_I(inode)->i_file_acl) goto cleanup; - ea_idebug(inode, "reading block %u", EXT4_I(inode)->i_file_acl); + ea_idebug(inode, "reading block %llu", + (unsigned long long)EXT4_I(inode)->i_file_acl); bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl); if (!bh) goto cleanup; ea_bdebug(bh, "b_count=%d, refcount=%d", atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount)); - if (ext4_xattr_check_block(bh)) { + if (ext4_xattr_check_block(inode, bh)) { bad_block: EXT4_ERROR_INODE(inode, "bad block %llu", EXT4_I(inode)->i_file_acl); error = -EIO; goto cleanup; } - ext4_xattr_cache_insert(bh); + ext4_xattr_cache_insert(ext4_mb_cache, bh); entry = BFIRST(bh); error = ext4_xattr_find_entry(&entry, name_index, name, bh->b_size, 1); if (error == -EIO) @@ -255,7 +310,7 @@ cleanup: return error; } -static int +int ext4_xattr_ibody_get(struct inode *inode, int name_index, const char *name, void *buffer, size_t buffer_size) { @@ -314,6 +369,9 @@ ext4_xattr_get(struct inode *inode, int name_index, const char *name, { int error; + if (strlen(name) > 255) + return -ERANGE; + down_read(&EXT4_I(inode)->xattr_sem); error = ext4_xattr_ibody_get(inode, name_index, name, buffer, buffer_size); @@ -356,6 +414,7 @@ ext4_xattr_block_list(struct dentry *dentry, char *buffer, size_t buffer_size) struct inode *inode = dentry->d_inode; struct buffer_head *bh = NULL; int error; + struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode); ea_idebug(inode, "buffer=%p, buffer_size=%ld", buffer, (long)buffer_size); @@ -363,20 +422,21 @@ ext4_xattr_block_list(struct dentry *dentry, char *buffer, size_t buffer_size) error = 0; if (!EXT4_I(inode)->i_file_acl) goto cleanup; - ea_idebug(inode, "reading block %u", EXT4_I(inode)->i_file_acl); + ea_idebug(inode, "reading block %llu", + (unsigned long long)EXT4_I(inode)->i_file_acl); bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl); error = -EIO; if (!bh) goto cleanup; ea_bdebug(bh, "b_count=%d, refcount=%d", atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount)); - if (ext4_xattr_check_block(bh)) { + if (ext4_xattr_check_block(inode, bh)) { EXT4_ERROR_INODE(inode, "bad block %llu", EXT4_I(inode)->i_file_acl); error = -EIO; goto cleanup; } - ext4_xattr_cache_insert(bh); + ext4_xattr_cache_insert(ext4_mb_cache, bh); error = ext4_xattr_list_entries(dentry, BFIRST(bh), buffer, buffer_size); cleanup: @@ -456,6 +516,7 @@ static void ext4_xattr_update_super_block(handle_t *handle, if (EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_EXT_ATTR)) return; + BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get_write_access"); if (ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh) == 0) { EXT4_SET_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_EXT_ATTR); ext4_handle_dirty_super(handle, sb); @@ -463,8 +524,8 @@ static void ext4_xattr_update_super_block(handle_t *handle, } /* - * Release the xattr block BH: If the reference count is > 1, decrement - * it; otherwise free the block. + * Release the xattr block BH: If the reference count is > 1, decrement it; + * otherwise free the block. */ static void ext4_xattr_release_block(handle_t *handle, struct inode *inode, @@ -472,8 +533,10 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode, { struct mb_cache_entry *ce = NULL; int error = 0; + struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode); - ce = mb_cache_entry_get(ext4_xattr_cache, bh->b_bdev, bh->b_blocknr); + ce = mb_cache_entry_get(ext4_mb_cache, bh->b_bdev, bh->b_blocknr); + BUFFER_TRACE(bh, "get_write_access"); error = ext4_journal_get_write_access(handle, bh); if (error) goto out; @@ -484,21 +547,37 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode, if (ce) mb_cache_entry_free(ce); get_bh(bh); + unlock_buffer(bh); ext4_free_blocks(handle, inode, bh, 0, 1, EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); } else { le32_add_cpu(&BHDR(bh)->h_refcount, -1); - error = ext4_handle_dirty_metadata(handle, inode, bh); + if (ce) + mb_cache_entry_release(ce); + /* + * Beware of this ugliness: Releasing of xattr block references + * from different inodes can race and so we have to protect + * from a race where someone else frees the block (and releases + * its journal_head) before we are done dirtying the buffer. In + * nojournal mode this race is harmless and we actually cannot + * call ext4_handle_dirty_xattr_block() with locked buffer as + * that function can call sync_dirty_buffer() so for that case + * we handle the dirtying after unlocking the buffer. + */ + if (ext4_handle_valid(handle)) + error = ext4_handle_dirty_xattr_block(handle, inode, + bh); + unlock_buffer(bh); + if (!ext4_handle_valid(handle)) + error = ext4_handle_dirty_xattr_block(handle, inode, + bh); if (IS_SYNC(inode)) ext4_handle_sync(handle); - dquot_free_block(inode, 1); + dquot_free_block(inode, EXT4_C2B(EXT4_SB(inode->i_sb), 1)); ea_bdebug(bh, "refcount now=%d; releasing", le32_to_cpu(BHDR(bh)->h_refcount)); - if (ce) - mb_cache_entry_release(ce); } - unlock_buffer(bh); out: ext4_std_error(inode->i_sb, error); return; @@ -512,31 +591,17 @@ static size_t ext4_xattr_free_space(struct ext4_xattr_entry *last, size_t *min_offs, void *base, int *total) { for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) { - *total += EXT4_XATTR_LEN(last->e_name_len); if (!last->e_value_block && last->e_value_size) { size_t offs = le16_to_cpu(last->e_value_offs); if (offs < *min_offs) *min_offs = offs; } + if (total) + *total += EXT4_XATTR_LEN(last->e_name_len); } return (*min_offs - ((void *)last - base) - sizeof(__u32)); } -struct ext4_xattr_info { - int name_index; - const char *name; - const void *value; - size_t value_len; -}; - -struct ext4_xattr_search { - struct ext4_xattr_entry *first; - void *base; - void *end; - struct ext4_xattr_entry *here; - int not_found; -}; - static int ext4_xattr_set_entry(struct ext4_xattr_info *i, struct ext4_xattr_search *s) { @@ -589,9 +654,14 @@ ext4_xattr_set_entry(struct ext4_xattr_info *i, struct ext4_xattr_search *s) size. Just replace. */ s->here->e_value_size = cpu_to_le32(i->value_len); - memset(val + size - EXT4_XATTR_PAD, 0, - EXT4_XATTR_PAD); /* Clear pad bytes. */ - memcpy(val, i->value, i->value_len); + if (i->value == EXT4_ZERO_XATTR_VALUE) { + memset(val, 0, size); + } else { + /* Clear pad bytes first. */ + memset(val + size - EXT4_XATTR_PAD, 0, + EXT4_XATTR_PAD); + memcpy(val, i->value, i->value_len); + } return 0; } @@ -630,9 +700,14 @@ ext4_xattr_set_entry(struct ext4_xattr_info *i, struct ext4_xattr_search *s) size_t size = EXT4_XATTR_SIZE(i->value_len); void *val = s->base + min_offs - size; s->here->e_value_offs = cpu_to_le16(min_offs - size); - memset(val + size - EXT4_XATTR_PAD, 0, - EXT4_XATTR_PAD); /* Clear the pad bytes. */ - memcpy(val, i->value, i->value_len); + if (i->value == EXT4_ZERO_XATTR_VALUE) { + memset(val, 0, size); + } else { + /* Clear the pad bytes first. */ + memset(val + size - EXT4_XATTR_PAD, 0, + EXT4_XATTR_PAD); + memcpy(val, i->value, i->value_len); + } } } return 0; @@ -662,7 +737,7 @@ ext4_xattr_block_find(struct inode *inode, struct ext4_xattr_info *i, ea_bdebug(bs->bh, "b_count=%d, refcount=%d", atomic_read(&(bs->bh->b_count)), le32_to_cpu(BHDR(bs->bh)->h_refcount)); - if (ext4_xattr_check_block(bs->bh)) { + if (ext4_xattr_check_block(inode, bs->bh)) { EXT4_ERROR_INODE(inode, "bad block %llu", EXT4_I(inode)->i_file_acl); error = -EIO; @@ -695,14 +770,16 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, struct ext4_xattr_search *s = &bs->s; struct mb_cache_entry *ce = NULL; int error = 0; + struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode); #define header(x) ((struct ext4_xattr_header *)(x)) if (i->value && i->value_len > sb->s_blocksize) return -ENOSPC; if (s->base) { - ce = mb_cache_entry_get(ext4_xattr_cache, bs->bh->b_bdev, + ce = mb_cache_entry_get(ext4_mb_cache, bs->bh->b_bdev, bs->bh->b_blocknr); + BUFFER_TRACE(bs->bh, "get_write_access"); error = ext4_journal_get_write_access(handle, bs->bh); if (error) goto cleanup; @@ -719,15 +796,16 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, if (!IS_LAST_ENTRY(s->first)) ext4_xattr_rehash(header(s->base), s->here); - ext4_xattr_cache_insert(bs->bh); + ext4_xattr_cache_insert(ext4_mb_cache, + bs->bh); } unlock_buffer(bs->bh); if (error == -EIO) goto bad_block; if (!error) - error = ext4_handle_dirty_metadata(handle, - inode, - bs->bh); + error = ext4_handle_dirty_xattr_block(handle, + inode, + bs->bh); if (error) goto cleanup; goto inserted; @@ -735,7 +813,6 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, int offset = (char *)s->here - bs->bh->b_data; unlock_buffer(bs->bh); - ext4_handle_release_buffer(handle, bs->bh); if (ce) { mb_cache_entry_release(ce); ce = NULL; @@ -784,9 +861,11 @@ inserted: else { /* The old block is released after updating the inode. */ - error = dquot_alloc_block(inode, 1); + error = dquot_alloc_block(inode, + EXT4_C2B(EXT4_SB(sb), 1)); if (error) goto cleanup; + BUFFER_TRACE(new_bh, "get_write_access"); error = ext4_journal_get_write_access(handle, new_bh); if (error) @@ -796,9 +875,9 @@ inserted: ea_bdebug(new_bh, "reusing; refcount now=%d", le32_to_cpu(BHDR(new_bh)->h_refcount)); unlock_buffer(new_bh); - error = ext4_handle_dirty_metadata(handle, - inode, - new_bh); + error = ext4_handle_dirty_xattr_block(handle, + inode, + new_bh); if (error) goto cleanup_dquot; } @@ -824,7 +903,7 @@ inserted: * take i_data_sem because we will test * i_delalloc_reserved_flag in ext4_mb_new_blocks */ - down_read((&EXT4_I(inode)->i_data_sem)); + down_read(&EXT4_I(inode)->i_data_sem); block = ext4_new_meta_blocks(handle, inode, goal, 0, NULL, &error); up_read((&EXT4_I(inode)->i_data_sem)); @@ -834,28 +913,30 @@ inserted: if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) BUG_ON(block > EXT4_MAX_BLOCK_FILE_PHYS); - ea_idebug(inode, "creating block %d", block); + ea_idebug(inode, "creating block %llu", + (unsigned long long)block); new_bh = sb_getblk(sb, block); - if (!new_bh) { + if (unlikely(!new_bh)) { + error = -ENOMEM; getblk_failed: ext4_free_blocks(handle, inode, NULL, block, 1, EXT4_FREE_BLOCKS_METADATA); - error = -EIO; goto cleanup; } lock_buffer(new_bh); error = ext4_journal_get_create_access(handle, new_bh); if (error) { unlock_buffer(new_bh); + error = -EIO; goto getblk_failed; } memcpy(new_bh->b_data, s->base, new_bh->b_size); set_buffer_uptodate(new_bh); unlock_buffer(new_bh); - ext4_xattr_cache_insert(new_bh); - error = ext4_handle_dirty_metadata(handle, - inode, new_bh); + ext4_xattr_cache_insert(ext4_mb_cache, new_bh); + error = ext4_handle_dirty_xattr_block(handle, + inode, new_bh); if (error) goto cleanup; } @@ -879,7 +960,7 @@ cleanup: return error; cleanup_dquot: - dquot_free_block(inode, 1); + dquot_free_block(inode, EXT4_C2B(EXT4_SB(sb), 1)); goto cleanup; bad_block: @@ -890,14 +971,8 @@ bad_block: #undef header } -struct ext4_xattr_ibody_find { - struct ext4_xattr_search s; - struct ext4_iloc iloc; -}; - -static int -ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i, - struct ext4_xattr_ibody_find *is) +int ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i, + struct ext4_xattr_ibody_find *is) { struct ext4_xattr_ibody_header *header; struct ext4_inode *raw_inode; @@ -925,10 +1000,47 @@ ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i, return 0; } -static int -ext4_xattr_ibody_set(handle_t *handle, struct inode *inode, - struct ext4_xattr_info *i, - struct ext4_xattr_ibody_find *is) +int ext4_xattr_ibody_inline_set(handle_t *handle, struct inode *inode, + struct ext4_xattr_info *i, + struct ext4_xattr_ibody_find *is) +{ + struct ext4_xattr_ibody_header *header; + struct ext4_xattr_search *s = &is->s; + int error; + + if (EXT4_I(inode)->i_extra_isize == 0) + return -ENOSPC; + error = ext4_xattr_set_entry(i, s); + if (error) { + if (error == -ENOSPC && + ext4_has_inline_data(inode)) { + error = ext4_try_to_evict_inline_data(handle, inode, + EXT4_XATTR_LEN(strlen(i->name) + + EXT4_XATTR_SIZE(i->value_len))); + if (error) + return error; + error = ext4_xattr_ibody_find(inode, i, is); + if (error) + return error; + error = ext4_xattr_set_entry(i, s); + } + if (error) + return error; + } + header = IHDR(inode, ext4_raw_inode(&is->iloc)); + if (!IS_LAST_ENTRY(s->first)) { + header->h_magic = cpu_to_le32(EXT4_XATTR_MAGIC); + ext4_set_inode_state(inode, EXT4_STATE_XATTR); + } else { + header->h_magic = cpu_to_le32(0); + ext4_clear_inode_state(inode, EXT4_STATE_XATTR); + } + return 0; +} + +static int ext4_xattr_ibody_set(handle_t *handle, struct inode *inode, + struct ext4_xattr_info *i, + struct ext4_xattr_ibody_find *is) { struct ext4_xattr_ibody_header *header; struct ext4_xattr_search *s = &is->s; @@ -1084,9 +1196,10 @@ ext4_xattr_set(struct inode *inode, int name_index, const char *name, { handle_t *handle; int error, retries = 0; + int credits = ext4_jbd2_credits_xattr(inode); retry: - handle = ext4_journal_start(inode, EXT4_DATA_TRANS_BLOCKS(inode->i_sb)); + handle = ext4_journal_start(inode, EXT4_HT_XATTR, credits); if (IS_ERR(handle)) { error = PTR_ERR(handle); } else { @@ -1144,7 +1257,7 @@ int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize, struct ext4_xattr_block_find *bs = NULL; char *buffer = NULL, *b_entry_name = NULL; size_t min_offs, free; - int total_ino, total_blk; + int total_ino; void *base, *start, *end; int extra_isize = 0, error = 0, tried_min_extra_isize = 0; int s_min_extra_isize = le16_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_min_extra_isize); @@ -1192,7 +1305,7 @@ retry: error = -EIO; if (!bh) goto cleanup; - if (ext4_xattr_check_block(bh)) { + if (ext4_xattr_check_block(inode, bh)) { EXT4_ERROR_INODE(inode, "bad block %llu", EXT4_I(inode)->i_file_acl); error = -EIO; @@ -1202,8 +1315,7 @@ retry: first = BFIRST(bh); end = bh->b_data + bh->b_size; min_offs = end - base; - free = ext4_xattr_free_space(first, &min_offs, base, - &total_blk); + free = ext4_xattr_free_space(first, &min_offs, base, NULL); if (free < new_extra_isize) { if (!tried_min_extra_isize && s_min_extra_isize) { tried_min_extra_isize++; @@ -1266,6 +1378,9 @@ retry: s_min_extra_isize) { tried_min_extra_isize++; new_extra_isize = s_min_extra_isize; + kfree(is); is = NULL; + kfree(bs); bs = NULL; + brelse(bh); goto retry; } error = -1; @@ -1408,13 +1523,13 @@ ext4_xattr_put_super(struct super_block *sb) * Returns 0, or a negative error number on failure. */ static void -ext4_xattr_cache_insert(struct buffer_head *bh) +ext4_xattr_cache_insert(struct mb_cache *ext4_mb_cache, struct buffer_head *bh) { __u32 hash = le32_to_cpu(BHDR(bh)->h_hash); struct mb_cache_entry *ce; int error; - ce = mb_cache_entry_alloc(ext4_xattr_cache, GFP_NOFS); + ce = mb_cache_entry_alloc(ext4_mb_cache, GFP_NOFS); if (!ce) { ea_bdebug(bh, "out of memory"); return; @@ -1486,12 +1601,13 @@ ext4_xattr_cache_find(struct inode *inode, struct ext4_xattr_header *header, { __u32 hash = le32_to_cpu(header->h_hash); struct mb_cache_entry *ce; + struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode); if (!header->h_hash) return NULL; /* never share */ ea_idebug(inode, "looking for cached blocks [%x]", (int)hash); again: - ce = mb_cache_entry_find_first(ext4_xattr_cache, inode->i_sb->s_bdev, + ce = mb_cache_entry_find_first(ext4_mb_cache, inode->i_sb->s_bdev, hash); while (ce) { struct buffer_head *bh; @@ -1589,19 +1705,17 @@ static void ext4_xattr_rehash(struct ext4_xattr_header *header, #undef BLOCK_HASH_SHIFT -int __init -ext4_init_xattr(void) +#define HASH_BUCKET_BITS 10 + +struct mb_cache * +ext4_xattr_create_cache(char *name) { - ext4_xattr_cache = mb_cache_create("ext4_xattr", 6); - if (!ext4_xattr_cache) - return -ENOMEM; - return 0; + return mb_cache_create(name, HASH_BUCKET_BITS); } -void -ext4_exit_xattr(void) +void ext4_xattr_destroy_cache(struct mb_cache *cache) { - if (ext4_xattr_cache) - mb_cache_destroy(ext4_xattr_cache); - ext4_xattr_cache = NULL; + if (cache) + mb_cache_destroy(cache); } + diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h index 25b7387ff18..29bedf5589f 100644 --- a/fs/ext4/xattr.h +++ b/fs/ext4/xattr.h @@ -21,13 +21,17 @@ #define EXT4_XATTR_INDEX_TRUSTED 4 #define EXT4_XATTR_INDEX_LUSTRE 5 #define EXT4_XATTR_INDEX_SECURITY 6 +#define EXT4_XATTR_INDEX_SYSTEM 7 +#define EXT4_XATTR_INDEX_RICHACL 8 struct ext4_xattr_header { __le32 h_magic; /* magic number for identification */ __le32 h_refcount; /* reference count */ __le32 h_blocks; /* number of disk blocks used */ __le32 h_hash; /* hash value of all attributes */ - __u32 h_reserved[4]; /* zero right now */ + __le32 h_checksum; /* crc32c(uuid+id+xattrblock) */ + /* id = inum if refcount=1, blknum otherwise */ + __u32 h_reserved[3]; /* zero right now */ }; struct ext4_xattr_ibody_header { @@ -63,12 +67,35 @@ struct ext4_xattr_entry { EXT4_I(inode)->i_extra_isize)) #define IFIRST(hdr) ((struct ext4_xattr_entry *)((hdr)+1)) -# ifdef CONFIG_EXT4_FS_XATTR +#define BHDR(bh) ((struct ext4_xattr_header *)((bh)->b_data)) +#define ENTRY(ptr) ((struct ext4_xattr_entry *)(ptr)) +#define BFIRST(bh) ENTRY(BHDR(bh)+1) +#define IS_LAST_ENTRY(entry) (*(__u32 *)(entry) == 0) + +#define EXT4_ZERO_XATTR_VALUE ((void *)-1) + +struct ext4_xattr_info { + int name_index; + const char *name; + const void *value; + size_t value_len; +}; + +struct ext4_xattr_search { + struct ext4_xattr_entry *first; + void *base; + void *end; + struct ext4_xattr_entry *here; + int not_found; +}; + +struct ext4_xattr_ibody_find { + struct ext4_xattr_search s; + struct ext4_iloc iloc; +}; extern const struct xattr_handler ext4_xattr_user_handler; extern const struct xattr_handler ext4_xattr_trusted_handler; -extern const struct xattr_handler ext4_xattr_acl_access_handler; -extern const struct xattr_handler ext4_xattr_acl_default_handler; extern const struct xattr_handler ext4_xattr_security_handler; extern ssize_t ext4_listxattr(struct dentry *, char *, size_t); @@ -83,65 +110,19 @@ extern void ext4_xattr_put_super(struct super_block *); extern int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize, struct ext4_inode *raw_inode, handle_t *handle); -extern int __init ext4_init_xattr(void); -extern void ext4_exit_xattr(void); - extern const struct xattr_handler *ext4_xattr_handlers[]; -# else /* CONFIG_EXT4_FS_XATTR */ - -static inline int -ext4_xattr_get(struct inode *inode, int name_index, const char *name, - void *buffer, size_t size, int flags) -{ - return -EOPNOTSUPP; -} - -static inline int -ext4_xattr_set(struct inode *inode, int name_index, const char *name, - const void *value, size_t size, int flags) -{ - return -EOPNOTSUPP; -} - -static inline int -ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, - const char *name, const void *value, size_t size, int flags) -{ - return -EOPNOTSUPP; -} - -static inline void -ext4_xattr_delete_inode(handle_t *handle, struct inode *inode) -{ -} - -static inline void -ext4_xattr_put_super(struct super_block *sb) -{ -} - -static __init inline int -ext4_init_xattr(void) -{ - return 0; -} - -static inline void -ext4_exit_xattr(void) -{ -} - -static inline int -ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize, - struct ext4_inode *raw_inode, handle_t *handle) -{ - return -EOPNOTSUPP; -} - -#define ext4_xattr_handlers NULL - -# endif /* CONFIG_EXT4_FS_XATTR */ +extern int ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i, + struct ext4_xattr_ibody_find *is); +extern int ext4_xattr_ibody_get(struct inode *inode, int name_index, + const char *name, + void *buffer, size_t buffer_size); +extern int ext4_xattr_ibody_inline_set(handle_t *handle, struct inode *inode, + struct ext4_xattr_info *i, + struct ext4_xattr_ibody_find *is); + +extern struct mb_cache *ext4_xattr_create_cache(char *name); +extern void ext4_xattr_destroy_cache(struct mb_cache *); #ifdef CONFIG_EXT4_FS_SECURITY extern int ext4_init_security(handle_t *handle, struct inode *inode, |
