aboutsummaryrefslogtreecommitdiff
path: root/fs/ext4
diff options
context:
space:
mode:
Diffstat (limited to 'fs/ext4')
-rw-r--r--fs/ext4/Kconfig22
-rw-r--r--fs/ext4/Makefile5
-rw-r--r--fs/ext4/acl.c284
-rw-r--r--fs/ext4/acl.h13
-rw-r--r--fs/ext4/balloc.c817
-rw-r--r--fs/ext4/bitmap.c95
-rw-r--r--fs/ext4/block_validity.c55
-rw-r--r--fs/ext4/dir.c430
-rw-r--r--fs/ext4/ext4.h1228
-rw-r--r--fs/ext4/ext4_extents.h102
-rw-r--r--fs/ext4/ext4_jbd2.c179
-rw-r--r--fs/ext4/ext4_jbd2.h249
-rw-r--r--fs/ext4/extents.c4207
-rw-r--r--fs/ext4/extents_status.c1127
-rw-r--r--fs/ext4/extents_status.h146
-rw-r--r--fs/ext4/file.c517
-rw-r--r--fs/ext4/fsync.c180
-rw-r--r--fs/ext4/hash.c10
-rw-r--r--fs/ext4/ialloc.c745
-rw-r--r--fs/ext4/indirect.c1391
-rw-r--r--fs/ext4/inline.c2000
-rw-r--r--fs/ext4/inode.c5427
-rw-r--r--fs/ext4/ioctl.c456
-rw-r--r--fs/ext4/mballoc.c1992
-rw-r--r--fs/ext4/mballoc.h51
-rw-r--r--fs/ext4/migrate.c208
-rw-r--r--fs/ext4/mmp.c395
-rw-r--r--fs/ext4/move_extent.c648
-rw-r--r--fs/ext4/namei.c1953
-rw-r--r--fs/ext4/page-io.c560
-rw-r--r--fs/ext4/resize.c1721
-rw-r--r--fs/ext4/super.c3397
-rw-r--r--fs/ext4/symlink.c4
-rw-r--r--fs/ext4/truncate.h43
-rw-r--r--fs/ext4/xattr.c362
-rw-r--r--fs/ext4/xattr.h105
-rw-r--r--fs/ext4/xattr_security.c37
-rw-r--r--fs/ext4/xattr_trusted.c1
-rw-r--r--fs/ext4/xattr_user.c1
39 files changed, 21081 insertions, 10082 deletions
diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig
index 9ed1bb1f319..efea5d5c44c 100644
--- a/fs/ext4/Kconfig
+++ b/fs/ext4/Kconfig
@@ -2,6 +2,8 @@ config EXT4_FS
tristate "The Extended 4 (ext4) filesystem"
select JBD2
select CRC16
+ select CRYPTO
+ select CRYPTO_CRC32C
help
This is the next generation of the ext3 filesystem.
@@ -37,22 +39,9 @@ config EXT4_USE_FOR_EXT23
compiled kernel size by using one file system driver for
ext2, ext3, and ext4 file systems.
-config EXT4_FS_XATTR
- bool "Ext4 extended attributes"
- depends on EXT4_FS
- default y
- help
- Extended attributes are name:value pairs associated with inodes by
- the kernel or by users (see the attr(5) manual page, or visit
- <http://acl.bestbits.at/> for details).
-
- If unsure, say N.
-
- You need this for POSIX ACL support on ext4.
-
config EXT4_FS_POSIX_ACL
bool "Ext4 POSIX Access Control Lists"
- depends on EXT4_FS_XATTR
+ depends on EXT4_FS
select FS_POSIX_ACL
help
POSIX Access Control Lists (ACLs) support permissions for users and
@@ -65,7 +54,7 @@ config EXT4_FS_POSIX_ACL
config EXT4_FS_SECURITY
bool "Ext4 Security Labels"
- depends on EXT4_FS_XATTR
+ depends on EXT4_FS
help
Security labels support alternative access control models
implemented by security modules like SELinux. This option
@@ -82,4 +71,5 @@ config EXT4_DEBUG
Enables run-time debugging support for the ext4 filesystem.
If you select Y here, then you will be able to turn on debugging
- with a command such as "echo 1 > /sys/kernel/debug/ext4/mballoc-debug"
+ with a command such as:
+ echo 1 > /sys/module/ext4/parameters/mballoc_debug
diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile
index c947e36eda6..0310fec2ee3 100644
--- a/fs/ext4/Makefile
+++ b/fs/ext4/Makefile
@@ -6,8 +6,9 @@ obj-$(CONFIG_EXT4_FS) += ext4.o
ext4-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o page-io.o \
ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \
- ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o
+ ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o \
+ mmp.o indirect.o extents_status.o xattr.o xattr_user.o \
+ xattr_trusted.o inline.o
-ext4-$(CONFIG_EXT4_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o
ext4-$(CONFIG_EXT4_FS_POSIX_ACL) += acl.o
ext4-$(CONFIG_EXT4_FS_SECURITY) += xattr_security.o
diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c
index 5e2ed4504ea..d40c8dbbb0d 100644
--- a/fs/ext4/acl.c
+++ b/fs/ext4/acl.c
@@ -55,16 +55,23 @@ ext4_acl_from_disk(const void *value, size_t size)
case ACL_OTHER:
value = (char *)value +
sizeof(ext4_acl_entry_short);
- acl->a_entries[n].e_id = ACL_UNDEFINED_ID;
break;
case ACL_USER:
+ value = (char *)value + sizeof(ext4_acl_entry);
+ if ((char *)value > end)
+ goto fail;
+ acl->a_entries[n].e_uid =
+ make_kuid(&init_user_ns,
+ le32_to_cpu(entry->e_id));
+ break;
case ACL_GROUP:
value = (char *)value + sizeof(ext4_acl_entry);
if ((char *)value > end)
goto fail;
- acl->a_entries[n].e_id =
- le32_to_cpu(entry->e_id);
+ acl->a_entries[n].e_gid =
+ make_kgid(&init_user_ns,
+ le32_to_cpu(entry->e_id));
break;
default:
@@ -98,13 +105,19 @@ ext4_acl_to_disk(const struct posix_acl *acl, size_t *size)
ext_acl->a_version = cpu_to_le32(EXT4_ACL_VERSION);
e = (char *)ext_acl + sizeof(ext4_acl_header);
for (n = 0; n < acl->a_count; n++) {
+ const struct posix_acl_entry *acl_e = &acl->a_entries[n];
ext4_acl_entry *entry = (ext4_acl_entry *)e;
- entry->e_tag = cpu_to_le16(acl->a_entries[n].e_tag);
- entry->e_perm = cpu_to_le16(acl->a_entries[n].e_perm);
- switch (acl->a_entries[n].e_tag) {
+ entry->e_tag = cpu_to_le16(acl_e->e_tag);
+ entry->e_perm = cpu_to_le16(acl_e->e_perm);
+ switch (acl_e->e_tag) {
case ACL_USER:
+ entry->e_id = cpu_to_le32(
+ from_kuid(&init_user_ns, acl_e->e_uid));
+ e += sizeof(ext4_acl_entry);
+ break;
case ACL_GROUP:
- entry->e_id = cpu_to_le32(acl->a_entries[n].e_id);
+ entry->e_id = cpu_to_le32(
+ from_kgid(&init_user_ns, acl_e->e_gid));
e += sizeof(ext4_acl_entry);
break;
@@ -131,7 +144,7 @@ fail:
*
* inode->i_mutex: don't care
*/
-static struct posix_acl *
+struct posix_acl *
ext4_get_acl(struct inode *inode, int type)
{
int name_index;
@@ -139,13 +152,6 @@ ext4_get_acl(struct inode *inode, int type)
struct posix_acl *acl;
int retval;
- if (!test_opt(inode->i_sb, POSIX_ACL))
- return NULL;
-
- acl = get_cached_acl(inode, type);
- if (acl != ACL_NOT_CACHED)
- return acl;
-
switch (type) {
case ACL_TYPE_ACCESS:
name_index = EXT4_XATTR_INDEX_POSIX_ACL_ACCESS;
@@ -183,7 +189,7 @@ ext4_get_acl(struct inode *inode, int type)
* inode->i_mutex: down unless called from ext4_new_inode
*/
static int
-ext4_set_acl(handle_t *handle, struct inode *inode, int type,
+__ext4_set_acl(handle_t *handle, struct inode *inode, int type,
struct posix_acl *acl)
{
int name_index;
@@ -191,19 +197,14 @@ ext4_set_acl(handle_t *handle, struct inode *inode, int type,
size_t size = 0;
int error;
- if (S_ISLNK(inode->i_mode))
- return -EOPNOTSUPP;
-
switch (type) {
case ACL_TYPE_ACCESS:
name_index = EXT4_XATTR_INDEX_POSIX_ACL_ACCESS;
if (acl) {
- mode_t mode = inode->i_mode;
- error = posix_acl_equiv_mode(acl, &mode);
+ error = posix_acl_equiv_mode(acl, &inode->i_mode);
if (error < 0)
return error;
else {
- inode->i_mode = mode;
inode->i_ctime = ext4_current_time(inode);
ext4_mark_inode_dirty(handle, inode);
if (error == 0)
@@ -238,19 +239,22 @@ ext4_set_acl(handle_t *handle, struct inode *inode, int type,
}
int
-ext4_check_acl(struct inode *inode, int mask)
+ext4_set_acl(struct inode *inode, struct posix_acl *acl, int type)
{
- struct posix_acl *acl = ext4_get_acl(inode, ACL_TYPE_ACCESS);
+ handle_t *handle;
+ int error, retries = 0;
- if (IS_ERR(acl))
- return PTR_ERR(acl);
- if (acl) {
- int error = posix_acl_permission(inode, acl, mask);
- posix_acl_release(acl);
- return error;
- }
+retry:
+ handle = ext4_journal_start(inode, EXT4_HT_XATTR,
+ ext4_jbd2_credits_xattr(inode));
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
- return -EAGAIN;
+ error = __ext4_set_acl(handle, inode, type, acl);
+ ext4_journal_stop(handle);
+ if (error == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
+ goto retry;
+ return error;
}
/*
@@ -262,211 +266,23 @@ ext4_check_acl(struct inode *inode, int mask)
int
ext4_init_acl(handle_t *handle, struct inode *inode, struct inode *dir)
{
- struct posix_acl *acl = NULL;
- int error = 0;
-
- if (!S_ISLNK(inode->i_mode)) {
- if (test_opt(dir->i_sb, POSIX_ACL)) {
- acl = ext4_get_acl(dir, ACL_TYPE_DEFAULT);
- if (IS_ERR(acl))
- return PTR_ERR(acl);
- }
- if (!acl)
- inode->i_mode &= ~current_umask();
- }
- if (test_opt(inode->i_sb, POSIX_ACL) && acl) {
- struct posix_acl *clone;
- mode_t mode;
-
- if (S_ISDIR(inode->i_mode)) {
- error = ext4_set_acl(handle, inode,
- ACL_TYPE_DEFAULT, acl);
- if (error)
- goto cleanup;
- }
- clone = posix_acl_clone(acl, GFP_NOFS);
- error = -ENOMEM;
- if (!clone)
- goto cleanup;
-
- mode = inode->i_mode;
- error = posix_acl_create_masq(clone, &mode);
- if (error >= 0) {
- inode->i_mode = mode;
- if (error > 0) {
- /* This is an extended ACL */
- error = ext4_set_acl(handle, inode,
- ACL_TYPE_ACCESS, clone);
- }
- }
- posix_acl_release(clone);
- }
-cleanup:
- posix_acl_release(acl);
- return error;
-}
-
-/*
- * Does chmod for an inode that may have an Access Control List. The
- * inode->i_mode field must be updated to the desired value by the caller
- * before calling this function.
- * Returns 0 on success, or a negative error number.
- *
- * We change the ACL rather than storing some ACL entries in the file
- * mode permission bits (which would be more efficient), because that
- * would break once additional permissions (like ACL_APPEND, ACL_DELETE
- * for directories) are added. There are no more bits available in the
- * file mode.
- *
- * inode->i_mutex: down
- */
-int
-ext4_acl_chmod(struct inode *inode)
-{
- struct posix_acl *acl, *clone;
+ struct posix_acl *default_acl, *acl;
int error;
- if (S_ISLNK(inode->i_mode))
- return -EOPNOTSUPP;
- if (!test_opt(inode->i_sb, POSIX_ACL))
- return 0;
- acl = ext4_get_acl(inode, ACL_TYPE_ACCESS);
- if (IS_ERR(acl) || !acl)
- return PTR_ERR(acl);
- clone = posix_acl_clone(acl, GFP_KERNEL);
- posix_acl_release(acl);
- if (!clone)
- return -ENOMEM;
- error = posix_acl_chmod_masq(clone, inode->i_mode);
- if (!error) {
- handle_t *handle;
- int retries = 0;
+ error = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl);
+ if (error)
+ return error;
- retry:
- handle = ext4_journal_start(inode,
- EXT4_DATA_TRANS_BLOCKS(inode->i_sb));
- if (IS_ERR(handle)) {
- error = PTR_ERR(handle);
- ext4_std_error(inode->i_sb, error);
- goto out;
- }
- error = ext4_set_acl(handle, inode, ACL_TYPE_ACCESS, clone);
- ext4_journal_stop(handle);
- if (error == -ENOSPC &&
- ext4_should_retry_alloc(inode->i_sb, &retries))
- goto retry;
+ if (default_acl) {
+ error = __ext4_set_acl(handle, inode, ACL_TYPE_DEFAULT,
+ default_acl);
+ posix_acl_release(default_acl);
+ }
+ if (acl) {
+ if (!error)
+ error = __ext4_set_acl(handle, inode, ACL_TYPE_ACCESS,
+ acl);
+ posix_acl_release(acl);
}
-out:
- posix_acl_release(clone);
- return error;
-}
-
-/*
- * Extended attribute handlers
- */
-static size_t
-ext4_xattr_list_acl_access(struct dentry *dentry, char *list, size_t list_len,
- const char *name, size_t name_len, int type)
-{
- const size_t size = sizeof(POSIX_ACL_XATTR_ACCESS);
-
- if (!test_opt(dentry->d_sb, POSIX_ACL))
- return 0;
- if (list && size <= list_len)
- memcpy(list, POSIX_ACL_XATTR_ACCESS, size);
- return size;
-}
-
-static size_t
-ext4_xattr_list_acl_default(struct dentry *dentry, char *list, size_t list_len,
- const char *name, size_t name_len, int type)
-{
- const size_t size = sizeof(POSIX_ACL_XATTR_DEFAULT);
-
- if (!test_opt(dentry->d_sb, POSIX_ACL))
- return 0;
- if (list && size <= list_len)
- memcpy(list, POSIX_ACL_XATTR_DEFAULT, size);
- return size;
-}
-
-static int
-ext4_xattr_get_acl(struct dentry *dentry, const char *name, void *buffer,
- size_t size, int type)
-{
- struct posix_acl *acl;
- int error;
-
- if (strcmp(name, "") != 0)
- return -EINVAL;
- if (!test_opt(dentry->d_sb, POSIX_ACL))
- return -EOPNOTSUPP;
-
- acl = ext4_get_acl(dentry->d_inode, type);
- if (IS_ERR(acl))
- return PTR_ERR(acl);
- if (acl == NULL)
- return -ENODATA;
- error = posix_acl_to_xattr(acl, buffer, size);
- posix_acl_release(acl);
-
- return error;
-}
-
-static int
-ext4_xattr_set_acl(struct dentry *dentry, const char *name, const void *value,
- size_t size, int flags, int type)
-{
- struct inode *inode = dentry->d_inode;
- handle_t *handle;
- struct posix_acl *acl;
- int error, retries = 0;
-
- if (strcmp(name, "") != 0)
- return -EINVAL;
- if (!test_opt(inode->i_sb, POSIX_ACL))
- return -EOPNOTSUPP;
- if (!is_owner_or_cap(inode))
- return -EPERM;
-
- if (value) {
- acl = posix_acl_from_xattr(value, size);
- if (IS_ERR(acl))
- return PTR_ERR(acl);
- else if (acl) {
- error = posix_acl_valid(acl);
- if (error)
- goto release_and_out;
- }
- } else
- acl = NULL;
-
-retry:
- handle = ext4_journal_start(inode, EXT4_DATA_TRANS_BLOCKS(inode->i_sb));
- if (IS_ERR(handle))
- return PTR_ERR(handle);
- error = ext4_set_acl(handle, inode, type, acl);
- ext4_journal_stop(handle);
- if (error == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
- goto retry;
-
-release_and_out:
- posix_acl_release(acl);
return error;
}
-
-const struct xattr_handler ext4_xattr_acl_access_handler = {
- .prefix = POSIX_ACL_XATTR_ACCESS,
- .flags = ACL_TYPE_ACCESS,
- .list = ext4_xattr_list_acl_access,
- .get = ext4_xattr_get_acl,
- .set = ext4_xattr_set_acl,
-};
-
-const struct xattr_handler ext4_xattr_acl_default_handler = {
- .prefix = POSIX_ACL_XATTR_DEFAULT,
- .flags = ACL_TYPE_DEFAULT,
- .list = ext4_xattr_list_acl_default,
- .get = ext4_xattr_get_acl,
- .set = ext4_xattr_set_acl,
-};
diff --git a/fs/ext4/acl.h b/fs/ext4/acl.h
index 9d843d5deac..da2c79577d7 100644
--- a/fs/ext4/acl.h
+++ b/fs/ext4/acl.h
@@ -54,19 +54,14 @@ static inline int ext4_acl_count(size_t size)
#ifdef CONFIG_EXT4_FS_POSIX_ACL
/* acl.c */
-extern int ext4_check_acl(struct inode *, int);
-extern int ext4_acl_chmod(struct inode *);
+struct posix_acl *ext4_get_acl(struct inode *inode, int type);
+int ext4_set_acl(struct inode *inode, struct posix_acl *acl, int type);
extern int ext4_init_acl(handle_t *, struct inode *, struct inode *);
#else /* CONFIG_EXT4_FS_POSIX_ACL */
#include <linux/sched.h>
-#define ext4_check_acl NULL
-
-static inline int
-ext4_acl_chmod(struct inode *inode)
-{
- return 0;
-}
+#define ext4_get_acl NULL
+#define ext4_set_acl NULL
static inline int
ext4_init_acl(handle_t *handle, struct inode *inode, struct inode *dir)
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 14c3af26c67..fca382037dd 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -21,12 +21,34 @@
#include "ext4_jbd2.h"
#include "mballoc.h"
+#include <trace/events/ext4.h>
+
+static unsigned ext4_num_base_meta_clusters(struct super_block *sb,
+ ext4_group_t block_group);
/*
* balloc.c contains the blocks allocation and deallocation routines
*/
/*
- * Calculate the block group number and offset, given a block number
+ * Calculate block group number for a given block number
+ */
+ext4_group_t ext4_get_group_number(struct super_block *sb,
+ ext4_fsblk_t block)
+{
+ ext4_group_t group;
+
+ if (test_opt2(sb, STD_GROUP_SIZE))
+ group = (block -
+ le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block)) >>
+ (EXT4_BLOCK_SIZE_BITS(sb) + EXT4_CLUSTER_BITS(sb) + 3);
+ else
+ ext4_get_group_no_and_offset(sb, block, &group, NULL);
+ return group;
+}
+
+/*
+ * Calculate the block group number and offset into the block/cluster
+ * allocation bitmap, given a block number
*/
void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr,
ext4_group_t *blockgrpp, ext4_grpblk_t *offsetp)
@@ -35,7 +57,8 @@ void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr,
ext4_grpblk_t offset;
blocknr = blocknr - le32_to_cpu(es->s_first_data_block);
- offset = do_div(blocknr, EXT4_BLOCKS_PER_GROUP(sb));
+ offset = do_div(blocknr, EXT4_BLOCKS_PER_GROUP(sb)) >>
+ EXT4_SB(sb)->s_cluster_bits;
if (offsetp)
*offsetp = offset;
if (blockgrpp)
@@ -43,140 +66,195 @@ void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr,
}
-static int ext4_block_in_group(struct super_block *sb, ext4_fsblk_t block,
- ext4_group_t block_group)
+/*
+ * Check whether the 'block' lives within the 'block_group'. Returns 1 if so
+ * and 0 otherwise.
+ */
+static inline int ext4_block_in_group(struct super_block *sb,
+ ext4_fsblk_t block,
+ ext4_group_t block_group)
{
ext4_group_t actual_group;
- ext4_get_group_no_and_offset(sb, block, &actual_group, NULL);
- if (actual_group == block_group)
- return 1;
- return 0;
+
+ actual_group = ext4_get_group_number(sb, block);
+ return (actual_group == block_group) ? 1 : 0;
}
-static int ext4_group_used_meta_blocks(struct super_block *sb,
- ext4_group_t block_group,
- struct ext4_group_desc *gdp)
+/* Return the number of clusters used for file system metadata; this
+ * represents the overhead needed by the file system.
+ */
+static unsigned ext4_num_overhead_clusters(struct super_block *sb,
+ ext4_group_t block_group,
+ struct ext4_group_desc *gdp)
{
- ext4_fsblk_t tmp;
+ unsigned num_clusters;
+ int block_cluster = -1, inode_cluster = -1, itbl_cluster = -1, i, c;
+ ext4_fsblk_t start = ext4_group_first_block_no(sb, block_group);
+ ext4_fsblk_t itbl_blk;
struct ext4_sb_info *sbi = EXT4_SB(sb);
- /* block bitmap, inode bitmap, and inode table blocks */
- int used_blocks = sbi->s_itb_per_group + 2;
- if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) {
- if (!ext4_block_in_group(sb, ext4_block_bitmap(sb, gdp),
- block_group))
- used_blocks--;
-
- if (!ext4_block_in_group(sb, ext4_inode_bitmap(sb, gdp),
- block_group))
- used_blocks--;
-
- tmp = ext4_inode_table(sb, gdp);
- for (; tmp < ext4_inode_table(sb, gdp) +
- sbi->s_itb_per_group; tmp++) {
- if (!ext4_block_in_group(sb, tmp, block_group))
- used_blocks -= 1;
+ /* This is the number of clusters used by the superblock,
+ * block group descriptors, and reserved block group
+ * descriptor blocks */
+ num_clusters = ext4_num_base_meta_clusters(sb, block_group);
+
+ /*
+ * For the allocation bitmaps and inode table, we first need
+ * to check to see if the block is in the block group. If it
+ * is, then check to see if the cluster is already accounted
+ * for in the clusters used for the base metadata cluster, or
+ * if we can increment the base metadata cluster to include
+ * that block. Otherwise, we will have to track the cluster
+ * used for the allocation bitmap or inode table explicitly.
+ * Normally all of these blocks are contiguous, so the special
+ * case handling shouldn't be necessary except for *very*
+ * unusual file system layouts.
+ */
+ if (ext4_block_in_group(sb, ext4_block_bitmap(sb, gdp), block_group)) {
+ block_cluster = EXT4_B2C(sbi,
+ ext4_block_bitmap(sb, gdp) - start);
+ if (block_cluster < num_clusters)
+ block_cluster = -1;
+ else if (block_cluster == num_clusters) {
+ num_clusters++;
+ block_cluster = -1;
}
}
- return used_blocks;
-}
-
-/* Initializes an uninitialized block bitmap if given, and returns the
- * number of blocks free in the group. */
-unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
- ext4_group_t block_group, struct ext4_group_desc *gdp)
-{
- int bit, bit_max;
- ext4_group_t ngroups = ext4_get_groups_count(sb);
- unsigned free_blocks, group_blocks;
- struct ext4_sb_info *sbi = EXT4_SB(sb);
- if (bh) {
- J_ASSERT_BH(bh, buffer_locked(bh));
-
- /* If checksum is bad mark all blocks used to prevent allocation
- * essentially implementing a per-group read-only flag. */
- if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) {
- ext4_error(sb, "Checksum bad for group %u",
- block_group);
- ext4_free_blks_set(sb, gdp, 0);
- ext4_free_inodes_set(sb, gdp, 0);
- ext4_itable_unused_set(sb, gdp, 0);
- memset(bh->b_data, 0xff, sb->s_blocksize);
- return 0;
+ if (ext4_block_in_group(sb, ext4_inode_bitmap(sb, gdp), block_group)) {
+ inode_cluster = EXT4_B2C(sbi,
+ ext4_inode_bitmap(sb, gdp) - start);
+ if (inode_cluster < num_clusters)
+ inode_cluster = -1;
+ else if (inode_cluster == num_clusters) {
+ num_clusters++;
+ inode_cluster = -1;
}
- memset(bh->b_data, 0, sb->s_blocksize);
}
- /* Check for superblock and gdt backups in this group */
- bit_max = ext4_bg_has_super(sb, block_group);
-
- if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_META_BG) ||
- block_group < le32_to_cpu(sbi->s_es->s_first_meta_bg) *
- sbi->s_desc_per_block) {
- if (bit_max) {
- bit_max += ext4_bg_num_gdb(sb, block_group);
- bit_max +=
- le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks);
+ itbl_blk = ext4_inode_table(sb, gdp);
+ for (i = 0; i < sbi->s_itb_per_group; i++) {
+ if (ext4_block_in_group(sb, itbl_blk + i, block_group)) {
+ c = EXT4_B2C(sbi, itbl_blk + i - start);
+ if ((c < num_clusters) || (c == inode_cluster) ||
+ (c == block_cluster) || (c == itbl_cluster))
+ continue;
+ if (c == num_clusters) {
+ num_clusters++;
+ continue;
+ }
+ num_clusters++;
+ itbl_cluster = c;
}
- } else { /* For META_BG_BLOCK_GROUPS */
- bit_max += ext4_bg_num_gdb(sb, block_group);
}
- if (block_group == ngroups - 1) {
+ if (block_cluster != -1)
+ num_clusters++;
+ if (inode_cluster != -1)
+ num_clusters++;
+
+ return num_clusters;
+}
+
+static unsigned int num_clusters_in_group(struct super_block *sb,
+ ext4_group_t block_group)
+{
+ unsigned int blocks;
+
+ if (block_group == ext4_get_groups_count(sb) - 1) {
/*
- * Even though mke2fs always initialize first and last group
- * if some other tool enabled the EXT4_BG_BLOCK_UNINIT we need
- * to make sure we calculate the right free blocks
+ * Even though mke2fs always initializes the first and
+ * last group, just in case some other tool was used,
+ * we need to make sure we calculate the right free
+ * blocks.
*/
- group_blocks = ext4_blocks_count(sbi->s_es) -
- ext4_group_first_block_no(sb, ngroups - 1);
- } else {
- group_blocks = EXT4_BLOCKS_PER_GROUP(sb);
- }
+ blocks = ext4_blocks_count(EXT4_SB(sb)->s_es) -
+ ext4_group_first_block_no(sb, block_group);
+ } else
+ blocks = EXT4_BLOCKS_PER_GROUP(sb);
+ return EXT4_NUM_B2C(EXT4_SB(sb), blocks);
+}
+
+/* Initializes an uninitialized block bitmap */
+static void ext4_init_block_bitmap(struct super_block *sb,
+ struct buffer_head *bh,
+ ext4_group_t block_group,
+ struct ext4_group_desc *gdp)
+{
+ unsigned int bit, bit_max;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ ext4_fsblk_t start, tmp;
+ int flex_bg = 0;
+ struct ext4_group_info *grp;
- free_blocks = group_blocks - bit_max;
+ J_ASSERT_BH(bh, buffer_locked(bh));
+
+ /* If checksum is bad mark all blocks used to prevent allocation
+ * essentially implementing a per-group read-only flag. */
+ if (!ext4_group_desc_csum_verify(sb, block_group, gdp)) {
+ ext4_error(sb, "Checksum bad for group %u", block_group);
+ grp = ext4_get_group_info(sb, block_group);
+ if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp))
+ percpu_counter_sub(&sbi->s_freeclusters_counter,
+ grp->bb_free);
+ set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state);
+ if (!EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) {
+ int count;
+ count = ext4_free_inodes_count(sb, gdp);
+ percpu_counter_sub(&sbi->s_freeinodes_counter,
+ count);
+ }
+ set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state);
+ return;
+ }
+ memset(bh->b_data, 0, sb->s_blocksize);
- if (bh) {
- ext4_fsblk_t start, tmp;
- int flex_bg = 0;
+ bit_max = ext4_num_base_meta_clusters(sb, block_group);
+ for (bit = 0; bit < bit_max; bit++)
+ ext4_set_bit(bit, bh->b_data);
- for (bit = 0; bit < bit_max; bit++)
- ext4_set_bit(bit, bh->b_data);
+ start = ext4_group_first_block_no(sb, block_group);
- start = ext4_group_first_block_no(sb, block_group);
+ if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG))
+ flex_bg = 1;
- if (EXT4_HAS_INCOMPAT_FEATURE(sb,
- EXT4_FEATURE_INCOMPAT_FLEX_BG))
- flex_bg = 1;
+ /* Set bits for block and inode bitmaps, and inode table */
+ tmp = ext4_block_bitmap(sb, gdp);
+ if (!flex_bg || ext4_block_in_group(sb, tmp, block_group))
+ ext4_set_bit(EXT4_B2C(sbi, tmp - start), bh->b_data);
- /* Set bits for block and inode bitmaps, and inode table */
- tmp = ext4_block_bitmap(sb, gdp);
- if (!flex_bg || ext4_block_in_group(sb, tmp, block_group))
- ext4_set_bit(tmp - start, bh->b_data);
+ tmp = ext4_inode_bitmap(sb, gdp);
+ if (!flex_bg || ext4_block_in_group(sb, tmp, block_group))
+ ext4_set_bit(EXT4_B2C(sbi, tmp - start), bh->b_data);
- tmp = ext4_inode_bitmap(sb, gdp);
+ tmp = ext4_inode_table(sb, gdp);
+ for (; tmp < ext4_inode_table(sb, gdp) +
+ sbi->s_itb_per_group; tmp++) {
if (!flex_bg || ext4_block_in_group(sb, tmp, block_group))
- ext4_set_bit(tmp - start, bh->b_data);
-
- tmp = ext4_inode_table(sb, gdp);
- for (; tmp < ext4_inode_table(sb, gdp) +
- sbi->s_itb_per_group; tmp++) {
- if (!flex_bg ||
- ext4_block_in_group(sb, tmp, block_group))
- ext4_set_bit(tmp - start, bh->b_data);
- }
- /*
- * Also if the number of blocks within the group is
- * less than the blocksize * 8 ( which is the size
- * of bitmap ), set rest of the block bitmap to 1
- */
- ext4_mark_bitmap_end(group_blocks, sb->s_blocksize * 8,
- bh->b_data);
+ ext4_set_bit(EXT4_B2C(sbi, tmp - start), bh->b_data);
}
- return free_blocks - ext4_group_used_meta_blocks(sb, block_group, gdp);
+
+ /*
+ * Also if the number of blocks within the group is less than
+ * the blocksize * 8 ( which is the size of bitmap ), set rest
+ * of the block bitmap to 1
+ */
+ ext4_mark_bitmap_end(num_clusters_in_group(sb, block_group),
+ sb->s_blocksize * 8, bh->b_data);
+ ext4_block_bitmap_csum_set(sb, block_group, gdp, bh);
+ ext4_group_desc_csum_set(sb, block_group, gdp);
}
+/* Return the number of free blocks in a block group. It is used when
+ * the block bitmap is uninitialized, so we can't just count the bits
+ * in the bitmap. */
+unsigned ext4_free_clusters_after_init(struct super_block *sb,
+ ext4_group_t block_group,
+ struct ext4_group_desc *gdp)
+{
+ return num_clusters_in_group(sb, block_group) -
+ ext4_num_overhead_clusters(sb, block_group, gdp);
+}
/*
* The free blocks are managed by bitmaps. A file system contains several
@@ -230,14 +308,19 @@ struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb,
return desc;
}
-static int ext4_valid_block_bitmap(struct super_block *sb,
- struct ext4_group_desc *desc,
- unsigned int block_group,
- struct buffer_head *bh)
+/*
+ * Return the block number which was discovered to be invalid, or 0 if
+ * the block bitmap is valid.
+ */
+static ext4_fsblk_t ext4_valid_block_bitmap(struct super_block *sb,
+ struct ext4_group_desc *desc,
+ ext4_group_t block_group,
+ struct buffer_head *bh)
{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
ext4_grpblk_t offset;
ext4_grpblk_t next_zero_bit;
- ext4_fsblk_t bitmap_blk;
+ ext4_fsblk_t blk;
ext4_fsblk_t group_first_block;
if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) {
@@ -247,41 +330,77 @@ static int ext4_valid_block_bitmap(struct super_block *sb,
* or it has to also read the block group where the bitmaps
* are located to verify they are set.
*/
- return 1;
+ return 0;
}
group_first_block = ext4_group_first_block_no(sb, block_group);
/* check whether block bitmap block number is set */
- bitmap_blk = ext4_block_bitmap(sb, desc);
- offset = bitmap_blk - group_first_block;
- if (!ext4_test_bit(offset, bh->b_data))
+ blk = ext4_block_bitmap(sb, desc);
+ offset = blk - group_first_block;
+ if (!ext4_test_bit(EXT4_B2C(sbi, offset), bh->b_data))
/* bad block bitmap */
- goto err_out;
+ return blk;
/* check whether the inode bitmap block number is set */
- bitmap_blk = ext4_inode_bitmap(sb, desc);
- offset = bitmap_blk - group_first_block;
- if (!ext4_test_bit(offset, bh->b_data))
+ blk = ext4_inode_bitmap(sb, desc);
+ offset = blk - group_first_block;
+ if (!ext4_test_bit(EXT4_B2C(sbi, offset), bh->b_data))
/* bad block bitmap */
- goto err_out;
+ return blk;
/* check whether the inode table block number is set */
- bitmap_blk = ext4_inode_table(sb, desc);
- offset = bitmap_blk - group_first_block;
+ blk = ext4_inode_table(sb, desc);
+ offset = blk - group_first_block;
next_zero_bit = ext4_find_next_zero_bit(bh->b_data,
- offset + EXT4_SB(sb)->s_itb_per_group,
- offset);
- if (next_zero_bit >= offset + EXT4_SB(sb)->s_itb_per_group)
- /* good bitmap for inode tables */
- return 1;
-
-err_out:
- ext4_error(sb, "Invalid block bitmap - block_group = %d, block = %llu",
- block_group, bitmap_blk);
+ EXT4_B2C(sbi, offset + EXT4_SB(sb)->s_itb_per_group),
+ EXT4_B2C(sbi, offset));
+ if (next_zero_bit <
+ EXT4_B2C(sbi, offset + EXT4_SB(sb)->s_itb_per_group))
+ /* bad bitmap for inode tables */
+ return blk;
return 0;
}
+
+static void ext4_validate_block_bitmap(struct super_block *sb,
+ struct ext4_group_desc *desc,
+ ext4_group_t block_group,
+ struct buffer_head *bh)
+{
+ ext4_fsblk_t blk;
+ struct ext4_group_info *grp = ext4_get_group_info(sb, block_group);
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+ if (buffer_verified(bh))
+ return;
+
+ ext4_lock_group(sb, block_group);
+ blk = ext4_valid_block_bitmap(sb, desc, block_group, bh);
+ if (unlikely(blk != 0)) {
+ ext4_unlock_group(sb, block_group);
+ ext4_error(sb, "bg %u: block %llu: invalid block bitmap",
+ block_group, blk);
+ if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp))
+ percpu_counter_sub(&sbi->s_freeclusters_counter,
+ grp->bb_free);
+ set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state);
+ return;
+ }
+ if (unlikely(!ext4_block_bitmap_csum_verify(sb, block_group,
+ desc, bh))) {
+ ext4_unlock_group(sb, block_group);
+ ext4_error(sb, "bg %u: bad block bitmap checksum", block_group);
+ if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp))
+ percpu_counter_sub(&sbi->s_freeclusters_counter,
+ grp->bb_free);
+ set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state);
+ return;
+ }
+ set_buffer_verified(bh);
+ ext4_unlock_group(sb, block_group);
+}
+
/**
- * ext4_read_block_bitmap()
+ * ext4_read_block_bitmap_nowait()
* @sb: super block
* @block_group: given block group
*
@@ -291,10 +410,10 @@ err_out:
* Return buffer_head on success or NULL in case of failure.
*/
struct buffer_head *
-ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
+ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group)
{
struct ext4_group_desc *desc;
- struct buffer_head *bh = NULL;
+ struct buffer_head *bh;
ext4_fsblk_t bitmap_blk;
desc = ext4_get_group_desc(sb, block_group, NULL);
@@ -303,19 +422,19 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
bitmap_blk = ext4_block_bitmap(sb, desc);
bh = sb_getblk(sb, bitmap_blk);
if (unlikely(!bh)) {
- ext4_error(sb, "Cannot read block bitmap - "
- "block_group = %u, block_bitmap = %llu",
- block_group, bitmap_blk);
+ ext4_error(sb, "Cannot get buffer for block bitmap - "
+ "block_group = %u, block_bitmap = %llu",
+ block_group, bitmap_blk);
return NULL;
}
if (bitmap_uptodate(bh))
- return bh;
+ goto verify;
lock_buffer(bh);
if (bitmap_uptodate(bh)) {
unlock_buffer(bh);
- return bh;
+ goto verify;
}
ext4_lock_group(sb, block_group);
if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
@@ -334,204 +453,127 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
*/
set_bitmap_uptodate(bh);
unlock_buffer(bh);
- return bh;
- }
- /*
- * submit the buffer_head for read. We can
- * safely mark the bitmap as uptodate now.
- * We do it here so the bitmap uptodate bit
- * get set with buffer lock held.
- */
- set_bitmap_uptodate(bh);
- if (bh_submit_read(bh) < 0) {
- put_bh(bh);
- ext4_error(sb, "Cannot read block bitmap - "
- "block_group = %u, block_bitmap = %llu",
- block_group, bitmap_blk);
- return NULL;
+ goto verify;
}
- ext4_valid_block_bitmap(sb, desc, block_group, bh);
/*
- * file system mounted not to panic on error,
- * continue with corrupt bitmap
+ * submit the buffer_head for reading
*/
+ set_buffer_new(bh);
+ trace_ext4_read_block_bitmap_load(sb, block_group);
+ bh->b_end_io = ext4_end_bitmap_read;
+ get_bh(bh);
+ submit_bh(READ | REQ_META | REQ_PRIO, bh);
return bh;
+verify:
+ ext4_validate_block_bitmap(sb, desc, block_group, bh);
+ if (buffer_verified(bh))
+ return bh;
+ put_bh(bh);
+ return NULL;
}
-/**
- * ext4_add_groupblocks() -- Add given blocks to an existing group
- * @handle: handle to this transaction
- * @sb: super block
- * @block: start physcial block to add to the block group
- * @count: number of blocks to free
- *
- * This marks the blocks as free in the bitmap. We ask the
- * mballoc to reload the buddy after this by setting group
- * EXT4_GROUP_INFO_NEED_INIT_BIT flag
- */
-void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
- ext4_fsblk_t block, unsigned long count)
+/* Returns 0 on success, 1 on error */
+int ext4_wait_block_bitmap(struct super_block *sb, ext4_group_t block_group,
+ struct buffer_head *bh)
{
- struct buffer_head *bitmap_bh = NULL;
- struct buffer_head *gd_bh;
- ext4_group_t block_group;
- ext4_grpblk_t bit;
- unsigned int i;
struct ext4_group_desc *desc;
- struct ext4_sb_info *sbi = EXT4_SB(sb);
- int err = 0, ret, blk_free_count;
- ext4_grpblk_t blocks_freed;
- struct ext4_group_info *grp;
-
- ext4_debug("Adding block(s) %llu-%llu\n", block, block + count - 1);
- ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
- grp = ext4_get_group_info(sb, block_group);
- /*
- * Check to see if we are freeing blocks across a group
- * boundary.
- */
- if (bit + count > EXT4_BLOCKS_PER_GROUP(sb)) {
- goto error_return;
- }
- bitmap_bh = ext4_read_block_bitmap(sb, block_group);
- if (!bitmap_bh)
- goto error_return;
- desc = ext4_get_group_desc(sb, block_group, &gd_bh);
+ if (!buffer_new(bh))
+ return 0;
+ desc = ext4_get_group_desc(sb, block_group, NULL);
if (!desc)
- goto error_return;
-
- if (in_range(ext4_block_bitmap(sb, desc), block, count) ||
- in_range(ext4_inode_bitmap(sb, desc), block, count) ||
- in_range(block, ext4_inode_table(sb, desc), sbi->s_itb_per_group) ||
- in_range(block + count - 1, ext4_inode_table(sb, desc),
- sbi->s_itb_per_group)) {
- ext4_error(sb, "Adding blocks in system zones - "
- "Block = %llu, count = %lu",
- block, count);
- goto error_return;
+ return 1;
+ wait_on_buffer(bh);
+ if (!buffer_uptodate(bh)) {
+ ext4_error(sb, "Cannot read block bitmap - "
+ "block_group = %u, block_bitmap = %llu",
+ block_group, (unsigned long long) bh->b_blocknr);
+ return 1;
}
+ clear_buffer_new(bh);
+ /* Panic or remount fs read-only if block bitmap is invalid */
+ ext4_validate_block_bitmap(sb, desc, block_group, bh);
+ /* ...but check for error just in case errors=continue. */
+ return !buffer_verified(bh);
+}
- /*
- * We are about to add blocks to the bitmap,
- * so we need undo access.
- */
- BUFFER_TRACE(bitmap_bh, "getting undo access");
- err = ext4_journal_get_undo_access(handle, bitmap_bh);
- if (err)
- goto error_return;
-
- /*
- * We are about to modify some metadata. Call the journal APIs
- * to unshare ->b_data if a currently-committing transaction is
- * using it
- */
- BUFFER_TRACE(gd_bh, "get_write_access");
- err = ext4_journal_get_write_access(handle, gd_bh);
- if (err)
- goto error_return;
- /*
- * make sure we don't allow a parallel init on other groups in the
- * same buddy cache
- */
- down_write(&grp->alloc_sem);
- for (i = 0, blocks_freed = 0; i < count; i++) {
- BUFFER_TRACE(bitmap_bh, "clear bit");
- if (!ext4_clear_bit_atomic(ext4_group_lock_ptr(sb, block_group),
- bit + i, bitmap_bh->b_data)) {
- ext4_error(sb, "bit already cleared for block %llu",
- (ext4_fsblk_t)(block + i));
- BUFFER_TRACE(bitmap_bh, "bit already cleared");
- } else {
- blocks_freed++;
- }
- }
- ext4_lock_group(sb, block_group);
- blk_free_count = blocks_freed + ext4_free_blks_count(sb, desc);
- ext4_free_blks_set(sb, desc, blk_free_count);
- desc->bg_checksum = ext4_group_desc_csum(sbi, block_group, desc);
- ext4_unlock_group(sb, block_group);
- percpu_counter_add(&sbi->s_freeblocks_counter, blocks_freed);
+struct buffer_head *
+ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
+{
+ struct buffer_head *bh;
- if (sbi->s_log_groups_per_flex) {
- ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
- atomic_add(blocks_freed,
- &sbi->s_flex_groups[flex_group].free_blocks);
+ bh = ext4_read_block_bitmap_nowait(sb, block_group);
+ if (!bh)
+ return NULL;
+ if (ext4_wait_block_bitmap(sb, block_group, bh)) {
+ put_bh(bh);
+ return NULL;
}
- /*
- * request to reload the buddy with the
- * new bitmap information
- */
- set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state));
- grp->bb_free += blocks_freed;
- up_write(&grp->alloc_sem);
-
- /* We dirtied the bitmap block */
- BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
- err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
-
- /* And the group descriptor block */
- BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
- ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh);
- if (!err)
- err = ret;
-
-error_return:
- brelse(bitmap_bh);
- ext4_std_error(sb, err);
- return;
+ return bh;
}
/**
- * ext4_has_free_blocks()
+ * ext4_has_free_clusters()
* @sbi: in-core super block structure.
- * @nblocks: number of needed blocks
+ * @nclusters: number of needed blocks
+ * @flags: flags from ext4_mb_new_blocks()
*
- * Check if filesystem has nblocks free & available for allocation.
+ * Check if filesystem has nclusters free & available for allocation.
* On success return 1, return 0 on failure.
*/
-static int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks)
+static int ext4_has_free_clusters(struct ext4_sb_info *sbi,
+ s64 nclusters, unsigned int flags)
{
- s64 free_blocks, dirty_blocks, root_blocks;
- struct percpu_counter *fbc = &sbi->s_freeblocks_counter;
- struct percpu_counter *dbc = &sbi->s_dirtyblocks_counter;
-
- free_blocks = percpu_counter_read_positive(fbc);
- dirty_blocks = percpu_counter_read_positive(dbc);
- root_blocks = ext4_r_blocks_count(sbi->s_es);
-
- if (free_blocks - (nblocks + root_blocks + dirty_blocks) <
- EXT4_FREEBLOCKS_WATERMARK) {
- free_blocks = percpu_counter_sum_positive(fbc);
- dirty_blocks = percpu_counter_sum_positive(dbc);
- if (dirty_blocks < 0) {
- printk(KERN_CRIT "Dirty block accounting "
- "went wrong %lld\n",
- (long long)dirty_blocks);
- }
+ s64 free_clusters, dirty_clusters, rsv, resv_clusters;
+ struct percpu_counter *fcc = &sbi->s_freeclusters_counter;
+ struct percpu_counter *dcc = &sbi->s_dirtyclusters_counter;
+
+ free_clusters = percpu_counter_read_positive(fcc);
+ dirty_clusters = percpu_counter_read_positive(dcc);
+ resv_clusters = atomic64_read(&sbi->s_resv_clusters);
+
+ /*
+ * r_blocks_count should always be multiple of the cluster ratio so
+ * we are safe to do a plane bit shift only.
+ */
+ rsv = (ext4_r_blocks_count(sbi->s_es) >> sbi->s_cluster_bits) +
+ resv_clusters;
+
+ if (free_clusters - (nclusters + rsv + dirty_clusters) <
+ EXT4_FREECLUSTERS_WATERMARK) {
+ free_clusters = percpu_counter_sum_positive(fcc);
+ dirty_clusters = percpu_counter_sum_positive(dcc);
}
- /* Check whether we have space after
- * accounting for current dirty blocks & root reserved blocks.
+ /* Check whether we have space after accounting for current
+ * dirty clusters & root reserved clusters.
*/
- if (free_blocks >= ((root_blocks + nblocks) + dirty_blocks))
+ if (free_clusters >= (rsv + nclusters + dirty_clusters))
return 1;
- /* Hm, nope. Are (enough) root reserved blocks available? */
- if (sbi->s_resuid == current_fsuid() ||
- ((sbi->s_resgid != 0) && in_group_p(sbi->s_resgid)) ||
- capable(CAP_SYS_RESOURCE)) {
- if (free_blocks >= (nblocks + dirty_blocks))
+ /* Hm, nope. Are (enough) root reserved clusters available? */
+ if (uid_eq(sbi->s_resuid, current_fsuid()) ||
+ (!gid_eq(sbi->s_resgid, GLOBAL_ROOT_GID) && in_group_p(sbi->s_resgid)) ||
+ capable(CAP_SYS_RESOURCE) ||
+ (flags & EXT4_MB_USE_ROOT_BLOCKS)) {
+
+ if (free_clusters >= (nclusters + dirty_clusters +
+ resv_clusters))
+ return 1;
+ }
+ /* No free blocks. Let's see if we can dip into reserved pool */
+ if (flags & EXT4_MB_USE_RESERVED) {
+ if (free_clusters >= (nclusters + dirty_clusters))
return 1;
}
return 0;
}
-int ext4_claim_free_blocks(struct ext4_sb_info *sbi,
- s64 nblocks)
+int ext4_claim_free_clusters(struct ext4_sb_info *sbi,
+ s64 nclusters, unsigned int flags)
{
- if (ext4_has_free_blocks(sbi, nblocks)) {
- percpu_counter_add(&sbi->s_dirtyblocks_counter, nblocks);
+ if (ext4_has_free_clusters(sbi, nclusters, flags)) {
+ percpu_counter_add(&sbi->s_dirtyclusters_counter, nclusters);
return 0;
} else
return -ENOSPC;
@@ -544,14 +586,14 @@ int ext4_claim_free_blocks(struct ext4_sb_info *sbi,
*
* ext4_should_retry_alloc() is called when ENOSPC is returned, and if
* it is profitable to retry the operation, this function will wait
- * for the current or commiting transaction to complete, and then
+ * for the current or committing transaction to complete, and then
* return TRUE.
*
* if the total number of retries exceed three times, return FALSE.
*/
int ext4_should_retry_alloc(struct super_block *sb, int *retries)
{
- if (!ext4_has_free_blocks(EXT4_SB(sb), 1) ||
+ if (!ext4_has_free_clusters(EXT4_SB(sb), 1, 0) ||
(*retries)++ > 3 ||
!EXT4_SB(sb)->s_journal)
return 0;
@@ -567,14 +609,15 @@ int ext4_should_retry_alloc(struct super_block *sb, int *retries)
* @handle: handle to this transaction
* @inode: file inode
* @goal: given target block(filesystem wide)
- * @count: pointer to total number of blocks needed
+ * @count: pointer to total number of clusters needed
* @errp: error code
*
* Return 1st allocated block number on success, *count stores total account
* error stores in errp pointer
*/
ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
- ext4_fsblk_t goal, unsigned long *count, int *errp)
+ ext4_fsblk_t goal, unsigned int flags,
+ unsigned long *count, int *errp)
{
struct ext4_allocation_request ar;
ext4_fsblk_t ret;
@@ -584,6 +627,7 @@ ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
ar.inode = inode;
ar.goal = goal;
ar.len = count ? *count : 1;
+ ar.flags = flags;
ret = ext4_mb_new_blocks(handle, &ar, errp);
if (count)
@@ -592,27 +636,30 @@ ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
* Account for the allocated meta blocks. We will never
* fail EDQUOT for metdata, but we do account for it.
*/
- if (!(*errp) && EXT4_I(inode)->i_delalloc_reserved_flag) {
+ if (!(*errp) &&
+ ext4_test_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED)) {
spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
EXT4_I(inode)->i_allocated_meta_blocks += ar.len;
spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
- dquot_alloc_block_nofail(inode, ar.len);
+ dquot_alloc_block_nofail(inode,
+ EXT4_C2B(EXT4_SB(inode->i_sb), ar.len));
}
return ret;
}
/**
- * ext4_count_free_blocks() -- count filesystem free blocks
+ * ext4_count_free_clusters() -- count filesystem free clusters
* @sb: superblock
*
- * Adds up the number of free blocks from each block group.
+ * Adds up the number of free clusters from each block group.
*/
-ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb)
+ext4_fsblk_t ext4_count_free_clusters(struct super_block *sb)
{
ext4_fsblk_t desc_count;
struct ext4_group_desc *gdp;
ext4_group_t i;
ext4_group_t ngroups = ext4_get_groups_count(sb);
+ struct ext4_group_info *grp;
#ifdef EXT4FS_DEBUG
struct ext4_super_block *es;
ext4_fsblk_t bitmap_count;
@@ -628,20 +675,26 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb)
gdp = ext4_get_group_desc(sb, i, NULL);
if (!gdp)
continue;
- desc_count += ext4_free_blks_count(sb, gdp);
+ grp = NULL;
+ if (EXT4_SB(sb)->s_group_info)
+ grp = ext4_get_group_info(sb, i);
+ if (!grp || !EXT4_MB_GRP_BBITMAP_CORRUPT(grp))
+ desc_count += ext4_free_group_clusters(sb, gdp);
brelse(bitmap_bh);
bitmap_bh = ext4_read_block_bitmap(sb, i);
if (bitmap_bh == NULL)
continue;
- x = ext4_count_free(bitmap_bh, sb->s_blocksize);
+ x = ext4_count_free(bitmap_bh->b_data,
+ EXT4_CLUSTERS_PER_GROUP(sb) / 8);
printk(KERN_DEBUG "group %u: stored = %d, counted = %u\n",
- i, ext4_free_blks_count(sb, gdp), x);
+ i, ext4_free_group_clusters(sb, gdp), x);
bitmap_count += x;
}
brelse(bitmap_bh);
- printk(KERN_DEBUG "ext4_count_free_blocks: stored = %llu"
- ", computed = %llu, %llu\n", ext4_free_blocks_count(es),
+ printk(KERN_DEBUG "ext4_count_free_clusters: stored = %llu"
+ ", computed = %llu, %llu\n",
+ EXT4_NUM_B2C(EXT4_SB(sb), ext4_free_blocks_count(es)),
desc_count, bitmap_count);
return bitmap_count;
#else
@@ -650,7 +703,11 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb)
gdp = ext4_get_group_desc(sb, i, NULL);
if (!gdp)
continue;
- desc_count += ext4_free_blks_count(sb, gdp);
+ grp = NULL;
+ if (EXT4_SB(sb)->s_group_info)
+ grp = ext4_get_group_info(sb, i);
+ if (!grp || !EXT4_MB_GRP_BBITMAP_CORRUPT(grp))
+ desc_count += ext4_free_group_clusters(sb, gdp);
}
return desc_count;
@@ -659,21 +716,15 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb)
static inline int test_root(ext4_group_t a, int b)
{
- int num = b;
-
- while (a > num)
- num *= b;
- return num == a;
-}
-
-static int ext4_group_sparse(ext4_group_t group)
-{
- if (group <= 1)
- return 1;
- if (!(group & 1))
- return 0;
- return (test_root(group, 7) || test_root(group, 5) ||
- test_root(group, 3));
+ while (1) {
+ if (a < b)
+ return 0;
+ if (a == b)
+ return 1;
+ if ((a % b) != 0)
+ return 0;
+ a = a / b;
+ }
}
/**
@@ -686,11 +737,26 @@ static int ext4_group_sparse(ext4_group_t group)
*/
int ext4_bg_has_super(struct super_block *sb, ext4_group_t group)
{
- if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
- EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER) &&
- !ext4_group_sparse(group))
+ struct ext4_super_block *es = EXT4_SB(sb)->s_es;
+
+ if (group == 0)
+ return 1;
+ if (EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_SPARSE_SUPER2)) {
+ if (group == le32_to_cpu(es->s_backup_bgs[0]) ||
+ group == le32_to_cpu(es->s_backup_bgs[1]))
+ return 1;
+ return 0;
+ }
+ if ((group <= 1) || !EXT4_HAS_RO_COMPAT_FEATURE(sb,
+ EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER))
+ return 1;
+ if (!(group & 1))
return 0;
- return 1;
+ if (test_root(group, 3) || (test_root(group, 5)) ||
+ test_root(group, 7))
+ return 1;
+
+ return 0;
}
static unsigned long ext4_bg_num_gdb_meta(struct super_block *sb,
@@ -740,3 +806,76 @@ unsigned long ext4_bg_num_gdb(struct super_block *sb, ext4_group_t group)
}
+/*
+ * This function returns the number of file system metadata clusters at
+ * the beginning of a block group, including the reserved gdt blocks.
+ */
+static unsigned ext4_num_base_meta_clusters(struct super_block *sb,
+ ext4_group_t block_group)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ unsigned num;
+
+ /* Check for superblock and gdt backups in this group */
+ num = ext4_bg_has_super(sb, block_group);
+
+ if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_META_BG) ||
+ block_group < le32_to_cpu(sbi->s_es->s_first_meta_bg) *
+ sbi->s_desc_per_block) {
+ if (num) {
+ num += ext4_bg_num_gdb(sb, block_group);
+ num += le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks);
+ }
+ } else { /* For META_BG_BLOCK_GROUPS */
+ num += ext4_bg_num_gdb(sb, block_group);
+ }
+ return EXT4_NUM_B2C(sbi, num);
+}
+/**
+ * ext4_inode_to_goal_block - return a hint for block allocation
+ * @inode: inode for block allocation
+ *
+ * Return the ideal location to start allocating blocks for a
+ * newly created inode.
+ */
+ext4_fsblk_t ext4_inode_to_goal_block(struct inode *inode)
+{
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ ext4_group_t block_group;
+ ext4_grpblk_t colour;
+ int flex_size = ext4_flex_bg_size(EXT4_SB(inode->i_sb));
+ ext4_fsblk_t bg_start;
+ ext4_fsblk_t last_block;
+
+ block_group = ei->i_block_group;
+ if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) {
+ /*
+ * If there are at least EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME
+ * block groups per flexgroup, reserve the first block
+ * group for directories and special files. Regular
+ * files will start at the second block group. This
+ * tends to speed up directory access and improves
+ * fsck times.
+ */
+ block_group &= ~(flex_size-1);
+ if (S_ISREG(inode->i_mode))
+ block_group++;
+ }
+ bg_start = ext4_group_first_block_no(inode->i_sb, block_group);
+ last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1;
+
+ /*
+ * If we are doing delayed allocation, we don't need take
+ * colour into account.
+ */
+ if (test_opt(inode->i_sb, DELALLOC))
+ return bg_start;
+
+ if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block)
+ colour = (current->pid % 16) *
+ (EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16);
+ else
+ colour = (current->pid % 16) * ((last_block - bg_start) / 16);
+ return bg_start + colour;
+}
+
diff --git a/fs/ext4/bitmap.c b/fs/ext4/bitmap.c
index fa3af81ac56..3285aa5a706 100644
--- a/fs/ext4/bitmap.c
+++ b/fs/ext4/bitmap.c
@@ -11,21 +11,92 @@
#include <linux/jbd2.h>
#include "ext4.h"
-#ifdef EXT4FS_DEBUG
+unsigned int ext4_count_free(char *bitmap, unsigned int numchars)
+{
+ return numchars * BITS_PER_BYTE - memweight(bitmap, numchars);
+}
+
+int ext4_inode_bitmap_csum_verify(struct super_block *sb, ext4_group_t group,
+ struct ext4_group_desc *gdp,
+ struct buffer_head *bh, int sz)
+{
+ __u32 hi;
+ __u32 provided, calculated;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+ if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
+ EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+ return 1;
-static const int nibblemap[] = {4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0};
+ provided = le16_to_cpu(gdp->bg_inode_bitmap_csum_lo);
+ calculated = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)bh->b_data, sz);
+ if (sbi->s_desc_size >= EXT4_BG_INODE_BITMAP_CSUM_HI_END) {
+ hi = le16_to_cpu(gdp->bg_inode_bitmap_csum_hi);
+ provided |= (hi << 16);
+ } else
+ calculated &= 0xFFFF;
-unsigned int ext4_count_free(struct buffer_head *map, unsigned int numchars)
+ return provided == calculated;
+}
+
+void ext4_inode_bitmap_csum_set(struct super_block *sb, ext4_group_t group,
+ struct ext4_group_desc *gdp,
+ struct buffer_head *bh, int sz)
{
- unsigned int i, sum = 0;
-
- if (!map)
- return 0;
- for (i = 0; i < numchars; i++)
- sum += nibblemap[map->b_data[i] & 0xf] +
- nibblemap[(map->b_data[i] >> 4) & 0xf];
- return sum;
+ __u32 csum;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+ if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
+ EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+ return;
+
+ csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)bh->b_data, sz);
+ gdp->bg_inode_bitmap_csum_lo = cpu_to_le16(csum & 0xFFFF);
+ if (sbi->s_desc_size >= EXT4_BG_INODE_BITMAP_CSUM_HI_END)
+ gdp->bg_inode_bitmap_csum_hi = cpu_to_le16(csum >> 16);
}
-#endif /* EXT4FS_DEBUG */
+int ext4_block_bitmap_csum_verify(struct super_block *sb, ext4_group_t group,
+ struct ext4_group_desc *gdp,
+ struct buffer_head *bh)
+{
+ __u32 hi;
+ __u32 provided, calculated;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ int sz = EXT4_CLUSTERS_PER_GROUP(sb) / 8;
+ if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
+ EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+ return 1;
+
+ provided = le16_to_cpu(gdp->bg_block_bitmap_csum_lo);
+ calculated = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)bh->b_data, sz);
+ if (sbi->s_desc_size >= EXT4_BG_BLOCK_BITMAP_CSUM_HI_END) {
+ hi = le16_to_cpu(gdp->bg_block_bitmap_csum_hi);
+ provided |= (hi << 16);
+ } else
+ calculated &= 0xFFFF;
+
+ if (provided == calculated)
+ return 1;
+
+ return 0;
+}
+
+void ext4_block_bitmap_csum_set(struct super_block *sb, ext4_group_t group,
+ struct ext4_group_desc *gdp,
+ struct buffer_head *bh)
+{
+ int sz = EXT4_CLUSTERS_PER_GROUP(sb) / 8;
+ __u32 csum;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+ if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
+ EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+ return;
+
+ csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)bh->b_data, sz);
+ gdp->bg_block_bitmap_csum_lo = cpu_to_le16(csum & 0xFFFF);
+ if (sbi->s_desc_size >= EXT4_BG_BLOCK_BITMAP_CSUM_HI_END)
+ gdp->bg_block_bitmap_csum_hi = cpu_to_le16(csum >> 16);
+}
diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c
index fac90f3fba8..41eb9dcfac7 100644
--- a/fs/ext4/block_validity.c
+++ b/fs/ext4/block_validity.c
@@ -13,7 +13,6 @@
#include <linux/namei.h>
#include <linux/quotaops.h>
#include <linux/buffer_head.h>
-#include <linux/module.h>
#include <linux/swap.h>
#include <linux/pagemap.h>
#include <linux/blkdev.h>
@@ -181,37 +180,12 @@ int ext4_setup_system_zone(struct super_block *sb)
/* Called when the filesystem is unmounted */
void ext4_release_system_zone(struct super_block *sb)
{
- struct rb_node *n = EXT4_SB(sb)->system_blks.rb_node;
- struct rb_node *parent;
- struct ext4_system_zone *entry;
+ struct ext4_system_zone *entry, *n;
- while (n) {
- /* Do the node's children first */
- if (n->rb_left) {
- n = n->rb_left;
- continue;
- }
- if (n->rb_right) {
- n = n->rb_right;
- continue;
- }
- /*
- * The node has no children; free it, and then zero
- * out parent's link to it. Finally go to the
- * beginning of the loop and try to free the parent
- * node.
- */
- parent = rb_parent(n);
- entry = rb_entry(n, struct ext4_system_zone, node);
+ rbtree_postorder_for_each_entry_safe(entry, n,
+ &EXT4_SB(sb)->system_blks, node)
kmem_cache_free(ext4_system_zone_cachep, entry);
- if (!parent)
- EXT4_SB(sb)->system_blks = RB_ROOT;
- else if (parent->rb_left == n)
- parent->rb_left = NULL;
- else if (parent->rb_right == n)
- parent->rb_right = NULL;
- n = parent;
- }
+
EXT4_SB(sb)->system_blks = RB_ROOT;
}
@@ -246,3 +220,24 @@ int ext4_data_block_valid(struct ext4_sb_info *sbi, ext4_fsblk_t start_blk,
return 1;
}
+int ext4_check_blockref(const char *function, unsigned int line,
+ struct inode *inode, __le32 *p, unsigned int max)
+{
+ struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
+ __le32 *bref = p;
+ unsigned int blk;
+
+ while (bref < p+max) {
+ blk = le32_to_cpu(*bref++);
+ if (blk &&
+ unlikely(!ext4_data_block_valid(EXT4_SB(inode->i_sb),
+ blk, 1))) {
+ es->s_last_error_block = cpu_to_le64(blk);
+ ext4_error_inode(inode, function, line, blk,
+ "invalid block");
+ return -EIO;
+ }
+ }
+ return 0;
+}
+
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index ece76fb6a40..ef1bed66c14 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -27,122 +27,130 @@
#include <linux/slab.h>
#include <linux/rbtree.h>
#include "ext4.h"
+#include "xattr.h"
-static unsigned char ext4_filetype_table[] = {
- DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
-};
-
-static int ext4_readdir(struct file *, void *, filldir_t);
-static int ext4_dx_readdir(struct file *filp,
- void *dirent, filldir_t filldir);
-static int ext4_release_dir(struct inode *inode,
- struct file *filp);
-
-const struct file_operations ext4_dir_operations = {
- .llseek = ext4_llseek,
- .read = generic_read_dir,
- .readdir = ext4_readdir, /* we take BKL. needed?*/
- .unlocked_ioctl = ext4_ioctl,
-#ifdef CONFIG_COMPAT
- .compat_ioctl = ext4_compat_ioctl,
-#endif
- .fsync = ext4_sync_file,
- .release = ext4_release_dir,
-};
-
+static int ext4_dx_readdir(struct file *, struct dir_context *);
-static unsigned char get_dtype(struct super_block *sb, int filetype)
+/**
+ * Check if the given dir-inode refers to an htree-indexed directory
+ * (or a directory which could potentially get converted to use htree
+ * indexing).
+ *
+ * Return 1 if it is a dx dir, 0 if not
+ */
+static int is_dx_dir(struct inode *inode)
{
- if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FILETYPE) ||
- (filetype >= EXT4_FT_MAX))
- return DT_UNKNOWN;
+ struct super_block *sb = inode->i_sb;
- return (ext4_filetype_table[filetype]);
-}
+ if (EXT4_HAS_COMPAT_FEATURE(inode->i_sb,
+ EXT4_FEATURE_COMPAT_DIR_INDEX) &&
+ ((ext4_test_inode_flag(inode, EXT4_INODE_INDEX)) ||
+ ((inode->i_size >> sb->s_blocksize_bits) == 1) ||
+ ext4_has_inline_data(inode)))
+ return 1;
+ return 0;
+}
+/*
+ * Return 0 if the directory entry is OK, and 1 if there is a problem
+ *
+ * Note: this is the opposite of what ext2 and ext3 historically returned...
+ *
+ * bh passed here can be an inode block or a dir data block, depending
+ * on the inode inline data flag.
+ */
int __ext4_check_dir_entry(const char *function, unsigned int line,
- struct inode *dir,
+ struct inode *dir, struct file *filp,
struct ext4_dir_entry_2 *de,
- struct buffer_head *bh,
+ struct buffer_head *bh, char *buf, int size,
unsigned int offset)
{
const char *error_msg = NULL;
const int rlen = ext4_rec_len_from_disk(de->rec_len,
dir->i_sb->s_blocksize);
- if (rlen < EXT4_DIR_REC_LEN(1))
+ if (unlikely(rlen < EXT4_DIR_REC_LEN(1)))
error_msg = "rec_len is smaller than minimal";
- else if (rlen % 4 != 0)
+ else if (unlikely(rlen % 4 != 0))
error_msg = "rec_len % 4 != 0";
- else if (rlen < EXT4_DIR_REC_LEN(de->name_len))
+ else if (unlikely(rlen < EXT4_DIR_REC_LEN(de->name_len)))
error_msg = "rec_len is too small for name_len";
- else if (((char *) de - bh->b_data) + rlen > dir->i_sb->s_blocksize)
- error_msg = "directory entry across blocks";
- else if (le32_to_cpu(de->inode) >
- le32_to_cpu(EXT4_SB(dir->i_sb)->s_es->s_inodes_count))
+ else if (unlikely(((char *) de - buf) + rlen > size))
+ error_msg = "directory entry across range";
+ else if (unlikely(le32_to_cpu(de->inode) >
+ le32_to_cpu(EXT4_SB(dir->i_sb)->s_es->s_inodes_count)))
error_msg = "inode out of bounds";
+ else
+ return 0;
- if (error_msg != NULL)
+ if (filp)
+ ext4_error_file(filp, function, line, bh->b_blocknr,
+ "bad entry in directory: %s - offset=%u(%u), "
+ "inode=%u, rec_len=%d, name_len=%d",
+ error_msg, (unsigned) (offset % size),
+ offset, le32_to_cpu(de->inode),
+ rlen, de->name_len);
+ else
ext4_error_inode(dir, function, line, bh->b_blocknr,
- "bad entry in directory: %s - "
- "offset=%u(%u), inode=%u, rec_len=%d, name_len=%d",
- error_msg, (unsigned) (offset%bh->b_size), offset,
- le32_to_cpu(de->inode),
- rlen, de->name_len);
- return error_msg == NULL ? 1 : 0;
+ "bad entry in directory: %s - offset=%u(%u), "
+ "inode=%u, rec_len=%d, name_len=%d",
+ error_msg, (unsigned) (offset % size),
+ offset, le32_to_cpu(de->inode),
+ rlen, de->name_len);
+
+ return 1;
}
-static int ext4_readdir(struct file *filp,
- void *dirent, filldir_t filldir)
+static int ext4_readdir(struct file *file, struct dir_context *ctx)
{
- int error = 0;
unsigned int offset;
- int i, stored;
+ int i;
struct ext4_dir_entry_2 *de;
- struct super_block *sb;
int err;
- struct inode *inode = filp->f_path.dentry->d_inode;
- int ret = 0;
+ struct inode *inode = file_inode(file);
+ struct super_block *sb = inode->i_sb;
int dir_has_error = 0;
- sb = inode->i_sb;
-
- if (EXT4_HAS_COMPAT_FEATURE(inode->i_sb,
- EXT4_FEATURE_COMPAT_DIR_INDEX) &&
- ((ext4_test_inode_flag(inode, EXT4_INODE_INDEX)) ||
- ((inode->i_size >> sb->s_blocksize_bits) == 1))) {
- err = ext4_dx_readdir(filp, dirent, filldir);
+ if (is_dx_dir(inode)) {
+ err = ext4_dx_readdir(file, ctx);
if (err != ERR_BAD_DX_DIR) {
- ret = err;
- goto out;
+ return err;
}
/*
* We don't set the inode dirty flag since it's not
* critical that it get flushed back to the disk.
*/
- ext4_clear_inode_flag(filp->f_path.dentry->d_inode,
+ ext4_clear_inode_flag(file_inode(file),
EXT4_INODE_INDEX);
}
- stored = 0;
- offset = filp->f_pos & (sb->s_blocksize - 1);
- while (!error && !stored && filp->f_pos < inode->i_size) {
+ if (ext4_has_inline_data(inode)) {
+ int has_inline_data = 1;
+ int ret = ext4_read_inline_dir(file, ctx,
+ &has_inline_data);
+ if (has_inline_data)
+ return ret;
+ }
+
+ offset = ctx->pos & (sb->s_blocksize - 1);
+
+ while (ctx->pos < inode->i_size) {
struct ext4_map_blocks map;
struct buffer_head *bh = NULL;
- map.m_lblk = filp->f_pos >> EXT4_BLOCK_SIZE_BITS(sb);
+ map.m_lblk = ctx->pos >> EXT4_BLOCK_SIZE_BITS(sb);
map.m_len = 1;
err = ext4_map_blocks(NULL, inode, &map, 0);
if (err > 0) {
pgoff_t index = map.m_pblk >>
(PAGE_CACHE_SHIFT - inode->i_blkbits);
- if (!ra_has_index(&filp->f_ra, index))
+ if (!ra_has_index(&file->f_ra, index))
page_cache_sync_readahead(
sb->s_bdev->bd_inode->i_mapping,
- &filp->f_ra, filp,
+ &file->f_ra, file,
index, 1);
- filp->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT;
+ file->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT;
bh = ext4_bread(NULL, inode, map.m_lblk, 0, &err);
}
@@ -152,24 +160,37 @@ static int ext4_readdir(struct file *filp,
*/
if (!bh) {
if (!dir_has_error) {
- EXT4_ERROR_INODE(inode, "directory "
- "contains a hole at offset %Lu",
- (unsigned long long) filp->f_pos);
+ EXT4_ERROR_FILE(file, 0,
+ "directory contains a "
+ "hole at offset %llu",
+ (unsigned long long) ctx->pos);
dir_has_error = 1;
}
/* corrupt size? Maybe no more blocks to read */
- if (filp->f_pos > inode->i_blocks << 9)
+ if (ctx->pos > inode->i_blocks << 9)
break;
- filp->f_pos += sb->s_blocksize - offset;
+ ctx->pos += sb->s_blocksize - offset;
+ continue;
+ }
+
+ /* Check the checksum */
+ if (!buffer_verified(bh) &&
+ !ext4_dirent_csum_verify(inode,
+ (struct ext4_dir_entry *)bh->b_data)) {
+ EXT4_ERROR_FILE(file, 0, "directory fails checksum "
+ "at offset %llu",
+ (unsigned long long)ctx->pos);
+ ctx->pos += sb->s_blocksize - offset;
+ brelse(bh);
continue;
}
+ set_buffer_verified(bh);
-revalidate:
/* If the dir block has changed since the last call to
* readdir(2), then we might be pointing to an invalid
* dirent right now. Scan from the start of the block
* to make sure. */
- if (filp->f_version != inode->i_version) {
+ if (file->f_version != inode->i_version) {
for (i = 0; i < sb->s_blocksize && i < offset; ) {
de = (struct ext4_dir_entry_2 *)
(bh->b_data + i);
@@ -186,74 +207,129 @@ revalidate:
sb->s_blocksize);
}
offset = i;
- filp->f_pos = (filp->f_pos & ~(sb->s_blocksize - 1))
+ ctx->pos = (ctx->pos & ~(sb->s_blocksize - 1))
| offset;
- filp->f_version = inode->i_version;
+ file->f_version = inode->i_version;
}
- while (!error && filp->f_pos < inode->i_size
+ while (ctx->pos < inode->i_size
&& offset < sb->s_blocksize) {
de = (struct ext4_dir_entry_2 *) (bh->b_data + offset);
- if (!ext4_check_dir_entry(inode, de,
- bh, offset)) {
+ if (ext4_check_dir_entry(inode, file, de, bh,
+ bh->b_data, bh->b_size,
+ offset)) {
/*
- * On error, skip the f_pos to the next block
+ * On error, skip to the next block
*/
- filp->f_pos = (filp->f_pos |
+ ctx->pos = (ctx->pos |
(sb->s_blocksize - 1)) + 1;
- brelse(bh);
- ret = stored;
- goto out;
+ break;
}
offset += ext4_rec_len_from_disk(de->rec_len,
sb->s_blocksize);
if (le32_to_cpu(de->inode)) {
- /* We might block in the next section
- * if the data destination is
- * currently swapped out. So, use a
- * version stamp to detect whether or
- * not the directory has been modified
- * during the copy operation.
- */
- u64 version = filp->f_version;
-
- error = filldir(dirent, de->name,
+ if (!dir_emit(ctx, de->name,
de->name_len,
- filp->f_pos,
le32_to_cpu(de->inode),
- get_dtype(sb, de->file_type));
- if (error)
- break;
- if (version != filp->f_version)
- goto revalidate;
- stored++;
+ get_dtype(sb, de->file_type))) {
+ brelse(bh);
+ return 0;
+ }
}
- filp->f_pos += ext4_rec_len_from_disk(de->rec_len,
+ ctx->pos += ext4_rec_len_from_disk(de->rec_len,
sb->s_blocksize);
}
offset = 0;
brelse(bh);
+ if (ctx->pos < inode->i_size) {
+ if (!dir_relax(inode))
+ return 0;
+ }
}
-out:
- return ret;
+ return 0;
+}
+
+static inline int is_32bit_api(void)
+{
+#ifdef CONFIG_COMPAT
+ return is_compat_task();
+#else
+ return (BITS_PER_LONG == 32);
+#endif
}
/*
* These functions convert from the major/minor hash to an f_pos
- * value.
+ * value for dx directories
+ *
+ * Upper layer (for example NFS) should specify FMODE_32BITHASH or
+ * FMODE_64BITHASH explicitly. On the other hand, we allow ext4 to be mounted
+ * directly on both 32-bit and 64-bit nodes, under such case, neither
+ * FMODE_32BITHASH nor FMODE_64BITHASH is specified.
+ */
+static inline loff_t hash2pos(struct file *filp, __u32 major, __u32 minor)
+{
+ if ((filp->f_mode & FMODE_32BITHASH) ||
+ (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api()))
+ return major >> 1;
+ else
+ return ((__u64)(major >> 1) << 32) | (__u64)minor;
+}
+
+static inline __u32 pos2maj_hash(struct file *filp, loff_t pos)
+{
+ if ((filp->f_mode & FMODE_32BITHASH) ||
+ (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api()))
+ return (pos << 1) & 0xffffffff;
+ else
+ return ((pos >> 32) << 1) & 0xffffffff;
+}
+
+static inline __u32 pos2min_hash(struct file *filp, loff_t pos)
+{
+ if ((filp->f_mode & FMODE_32BITHASH) ||
+ (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api()))
+ return 0;
+ else
+ return pos & 0xffffffff;
+}
+
+/*
+ * Return 32- or 64-bit end-of-file for dx directories
+ */
+static inline loff_t ext4_get_htree_eof(struct file *filp)
+{
+ if ((filp->f_mode & FMODE_32BITHASH) ||
+ (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api()))
+ return EXT4_HTREE_EOF_32BIT;
+ else
+ return EXT4_HTREE_EOF_64BIT;
+}
+
+
+/*
+ * ext4_dir_llseek() calls generic_file_llseek_size to handle htree
+ * directories, where the "offset" is in terms of the filename hash
+ * value instead of the byte offset.
+ *
+ * Because we may return a 64-bit hash that is well beyond offset limits,
+ * we need to pass the max hash as the maximum allowable offset in
+ * the htree directory case.
*
- * Currently we only use major hash numer. This is unfortunate, but
- * on 32-bit machines, the same VFS interface is used for lseek and
- * llseek, so if we use the 64 bit offset, then the 32-bit versions of
- * lseek/telldir/seekdir will blow out spectacularly, and from within
- * the ext2 low-level routine, we don't know if we're being called by
- * a 64-bit version of the system call or the 32-bit version of the
- * system call. Worse yet, NFSv2 only allows for a 32-bit readdir
- * cookie. Sigh.
+ * For non-htree, ext4_llseek already chooses the proper max offset.
*/
-#define hash2pos(major, minor) (major >> 1)
-#define pos2maj_hash(pos) ((pos << 1) & 0xffffffff)
-#define pos2min_hash(pos) (0)
+static loff_t ext4_dir_llseek(struct file *file, loff_t offset, int whence)
+{
+ struct inode *inode = file->f_mapping->host;
+ int dx_dir = is_dx_dir(inode);
+ loff_t htree_max = ext4_get_htree_eof(file);
+
+ if (likely(dx_dir))
+ return generic_file_llseek_size(file, offset, whence,
+ htree_max, htree_max);
+ else
+ return ext4_llseek(file, offset, whence);
+}
/*
* This structure holds the nodes of the red-black tree used to store
@@ -276,53 +352,29 @@ struct fname {
*/
static void free_rb_tree_fname(struct rb_root *root)
{
- struct rb_node *n = root->rb_node;
- struct rb_node *parent;
- struct fname *fname;
-
- while (n) {
- /* Do the node's children first */
- if (n->rb_left) {
- n = n->rb_left;
- continue;
- }
- if (n->rb_right) {
- n = n->rb_right;
- continue;
- }
- /*
- * The node has no children; free it, and then zero
- * out parent's link to it. Finally go to the
- * beginning of the loop and try to free the parent
- * node.
- */
- parent = rb_parent(n);
- fname = rb_entry(n, struct fname, rb_hash);
+ struct fname *fname, *next;
+
+ rbtree_postorder_for_each_entry_safe(fname, next, root, rb_hash)
while (fname) {
struct fname *old = fname;
fname = fname->next;
kfree(old);
}
- if (!parent)
- *root = RB_ROOT;
- else if (parent->rb_left == n)
- parent->rb_left = NULL;
- else if (parent->rb_right == n)
- parent->rb_right = NULL;
- n = parent;
- }
+
+ *root = RB_ROOT;
}
-static struct dir_private_info *ext4_htree_create_dir_info(loff_t pos)
+static struct dir_private_info *ext4_htree_create_dir_info(struct file *filp,
+ loff_t pos)
{
struct dir_private_info *p;
p = kzalloc(sizeof(struct dir_private_info), GFP_KERNEL);
if (!p)
return NULL;
- p->curr_hash = pos2maj_hash(pos);
- p->curr_minor_hash = pos2min_hash(pos);
+ p->curr_hash = pos2maj_hash(filp, pos);
+ p->curr_minor_hash = pos2min_hash(filp, pos);
return p;
}
@@ -397,63 +449,57 @@ int ext4_htree_store_dirent(struct file *dir_file, __u32 hash,
* for all entres on the fname linked list. (Normally there is only
* one entry on the linked list, unless there are 62 bit hash collisions.)
*/
-static int call_filldir(struct file *filp, void *dirent,
- filldir_t filldir, struct fname *fname)
+static int call_filldir(struct file *file, struct dir_context *ctx,
+ struct fname *fname)
{
- struct dir_private_info *info = filp->private_data;
- loff_t curr_pos;
- struct inode *inode = filp->f_path.dentry->d_inode;
- struct super_block *sb;
- int error;
-
- sb = inode->i_sb;
+ struct dir_private_info *info = file->private_data;
+ struct inode *inode = file_inode(file);
+ struct super_block *sb = inode->i_sb;
if (!fname) {
- printk(KERN_ERR "EXT4-fs: call_filldir: called with "
- "null fname?!?\n");
+ ext4_msg(sb, KERN_ERR, "%s:%d: inode #%lu: comm %s: "
+ "called with null fname?!?", __func__, __LINE__,
+ inode->i_ino, current->comm);
return 0;
}
- curr_pos = hash2pos(fname->hash, fname->minor_hash);
+ ctx->pos = hash2pos(file, fname->hash, fname->minor_hash);
while (fname) {
- error = filldir(dirent, fname->name,
- fname->name_len, curr_pos,
+ if (!dir_emit(ctx, fname->name,
+ fname->name_len,
fname->inode,
- get_dtype(sb, fname->file_type));
- if (error) {
- filp->f_pos = curr_pos;
+ get_dtype(sb, fname->file_type))) {
info->extra_fname = fname;
- return error;
+ return 1;
}
fname = fname->next;
}
return 0;
}
-static int ext4_dx_readdir(struct file *filp,
- void *dirent, filldir_t filldir)
+static int ext4_dx_readdir(struct file *file, struct dir_context *ctx)
{
- struct dir_private_info *info = filp->private_data;
- struct inode *inode = filp->f_path.dentry->d_inode;
+ struct dir_private_info *info = file->private_data;
+ struct inode *inode = file_inode(file);
struct fname *fname;
int ret;
if (!info) {
- info = ext4_htree_create_dir_info(filp->f_pos);
+ info = ext4_htree_create_dir_info(file, ctx->pos);
if (!info)
return -ENOMEM;
- filp->private_data = info;
+ file->private_data = info;
}
- if (filp->f_pos == EXT4_HTREE_EOF)
+ if (ctx->pos == ext4_get_htree_eof(file))
return 0; /* EOF */
/* Some one has messed with f_pos; reset the world */
- if (info->last_pos != filp->f_pos) {
+ if (info->last_pos != ctx->pos) {
free_rb_tree_fname(&info->root);
info->curr_node = NULL;
info->extra_fname = NULL;
- info->curr_hash = pos2maj_hash(filp->f_pos);
- info->curr_minor_hash = pos2min_hash(filp->f_pos);
+ info->curr_hash = pos2maj_hash(file, ctx->pos);
+ info->curr_minor_hash = pos2min_hash(file, ctx->pos);
}
/*
@@ -461,7 +507,7 @@ static int ext4_dx_readdir(struct file *filp,
* chain, return them first.
*/
if (info->extra_fname) {
- if (call_filldir(filp, dirent, filldir, info->extra_fname))
+ if (call_filldir(file, ctx, info->extra_fname))
goto finished;
info->extra_fname = NULL;
goto next_node;
@@ -475,17 +521,17 @@ static int ext4_dx_readdir(struct file *filp,
* cached entries.
*/
if ((!info->curr_node) ||
- (filp->f_version != inode->i_version)) {
+ (file->f_version != inode->i_version)) {
info->curr_node = NULL;
free_rb_tree_fname(&info->root);
- filp->f_version = inode->i_version;
- ret = ext4_htree_fill_tree(filp, info->curr_hash,
+ file->f_version = inode->i_version;
+ ret = ext4_htree_fill_tree(file, info->curr_hash,
info->curr_minor_hash,
&info->next_hash);
if (ret < 0)
return ret;
if (ret == 0) {
- filp->f_pos = EXT4_HTREE_EOF;
+ ctx->pos = ext4_get_htree_eof(file);
break;
}
info->curr_node = rb_first(&info->root);
@@ -494,7 +540,7 @@ static int ext4_dx_readdir(struct file *filp,
fname = rb_entry(info->curr_node, struct fname, rb_hash);
info->curr_hash = fname->hash;
info->curr_minor_hash = fname->minor_hash;
- if (call_filldir(filp, dirent, filldir, fname))
+ if (call_filldir(file, ctx, fname))
break;
next_node:
info->curr_node = rb_next(info->curr_node);
@@ -505,7 +551,7 @@ static int ext4_dx_readdir(struct file *filp,
info->curr_minor_hash = fname->minor_hash;
} else {
if (info->next_hash == ~0) {
- filp->f_pos = EXT4_HTREE_EOF;
+ ctx->pos = ext4_get_htree_eof(file);
break;
}
info->curr_hash = info->next_hash;
@@ -513,7 +559,7 @@ static int ext4_dx_readdir(struct file *filp,
}
}
finished:
- info->last_pos = filp->f_pos;
+ info->last_pos = ctx->pos;
return 0;
}
@@ -524,3 +570,15 @@ static int ext4_release_dir(struct inode *inode, struct file *filp)
return 0;
}
+
+const struct file_operations ext4_dir_operations = {
+ .llseek = ext4_dir_llseek,
+ .read = generic_read_dir,
+ .iterate = ext4_readdir,
+ .unlocked_ioctl = ext4_ioctl,
+#ifdef CONFIG_COMPAT
+ .compat_ioctl = ext4_compat_ioctl,
+#endif
+ .fsync = ext4_sync_file,
+ .release = ext4_release_dir,
+};
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 6a5edea2d70..7cc5a0e2368 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -29,6 +29,9 @@
#include <linux/wait.h>
#include <linux/blockgroup_lock.h>
#include <linux/percpu_counter.h>
+#include <linux/ratelimit.h>
+#include <crypto/hash.h>
+#include <linux/falloc.h>
#ifdef __KERNEL__
#include <linux/compat.h>
#endif
@@ -53,7 +56,17 @@
printk(KERN_DEBUG f, ## a); \
} while (0)
#else
-#define ext4_debug(f, a...) do {} while (0)
+#define ext4_debug(fmt, ...) no_printk(fmt, ##__VA_ARGS__)
+#endif
+
+/*
+ * Turn on EXT_DEBUG to get lots of info about extents operations.
+ */
+#define EXT_DEBUG__
+#ifdef EXT_DEBUG
+#define ext_debug(fmt, ...) printk(fmt, ##__VA_ARGS__)
+#else
+#define ext_debug(fmt, ...) no_printk(fmt, ##__VA_ARGS__)
#endif
#define EXT4_ERROR_INODE(inode, fmt, a...) \
@@ -62,8 +75,8 @@
#define EXT4_ERROR_INODE_BLOCK(inode, block, fmt, a...) \
ext4_error_inode((inode), __func__, __LINE__, (block), (fmt), ## a)
-#define EXT4_ERROR_FILE(file, fmt, a...) \
- ext4_error_file(__func__, __LINE__, (file), (fmt), ## a)
+#define EXT4_ERROR_FILE(file, block, fmt, a...) \
+ ext4_error_file((file), __func__, __LINE__, (block), (fmt), ## a)
/* data type for block offset of block group */
typedef int ext4_grpblk_t;
@@ -108,7 +121,10 @@ typedef unsigned int ext4_group_t;
#define EXT4_MB_DELALLOC_RESERVED 0x0400
/* We are doing stream allocation */
#define EXT4_MB_STREAM_ALLOC 0x0800
-
+/* Use reserved root blocks if needed */
+#define EXT4_MB_USE_ROOT_BLOCKS 0x1000
+/* Use blocks from reserved pool */
+#define EXT4_MB_USE_RESERVED 0x2000
struct ext4_allocation_request {
/* target inode for block we're allocating */
@@ -142,10 +158,17 @@ struct ext4_allocation_request {
#define EXT4_MAP_MAPPED (1 << BH_Mapped)
#define EXT4_MAP_UNWRITTEN (1 << BH_Unwritten)
#define EXT4_MAP_BOUNDARY (1 << BH_Boundary)
-#define EXT4_MAP_UNINIT (1 << BH_Uninit)
+/* Sometimes (in the bigalloc case, from ext4_da_get_block_prep) the caller of
+ * ext4_map_blocks wants to know whether or not the underlying cluster has
+ * already been accounted for. EXT4_MAP_FROM_CLUSTER conveys to the caller that
+ * the requested mapping was from previously mapped (or delayed allocated)
+ * cluster. We use BH_AllocFromCluster only for this flag. BH_AllocFromCluster
+ * should never appear on buffer_head's state flags.
+ */
+#define EXT4_MAP_FROM_CLUSTER (1 << BH_AllocFromCluster)
#define EXT4_MAP_FLAGS (EXT4_MAP_NEW | EXT4_MAP_MAPPED |\
EXT4_MAP_UNWRITTEN | EXT4_MAP_BOUNDARY |\
- EXT4_MAP_UNINIT)
+ EXT4_MAP_FROM_CLUSTER)
struct ext4_map_blocks {
ext4_fsblk_t m_pblk;
@@ -155,52 +178,31 @@ struct ext4_map_blocks {
};
/*
- * For delayed allocation tracking
- */
-struct mpage_da_data {
- struct inode *inode;
- sector_t b_blocknr; /* start block number of extent */
- size_t b_size; /* size of extent */
- unsigned long b_state; /* state of the extent */
- unsigned long first_page, next_page; /* extent of pages */
- struct writeback_control *wbc;
- int io_done;
- int pages_written;
- int retval;
-};
-
-/*
* Flags for ext4_io_end->flags
*/
#define EXT4_IO_END_UNWRITTEN 0x0001
-#define EXT4_IO_END_ERROR 0x0002
-
-struct ext4_io_page {
- struct page *p_page;
- atomic_t p_count;
-};
-
-#define MAX_IO_PAGES 128
+/*
+ * For converting unwritten extents on a work queue. 'handle' is used for
+ * buffered writeback.
+ */
typedef struct ext4_io_end {
struct list_head list; /* per-file finished IO list */
+ handle_t *handle; /* handle reserved for extent
+ * conversion */
struct inode *inode; /* file being written to */
+ struct bio *bio; /* Linked list of completed
+ * bios covering the extent */
unsigned int flag; /* unwritten or not */
- struct page *page; /* page struct for buffer write */
loff_t offset; /* offset in the file */
ssize_t size; /* size of the extent */
- struct work_struct work; /* data work queue */
- struct kiocb *iocb; /* iocb struct for AIO */
- int result; /* error value for AIO */
- int num_io_pages;
- struct ext4_io_page *pages[MAX_IO_PAGES];
+ atomic_t count; /* reference counter */
} ext4_io_end_t;
struct ext4_io_submit {
int io_op;
struct bio *io_bio;
ext4_io_end_t *io_end;
- struct ext4_io_page *io_page;
sector_t io_next_block;
};
@@ -209,6 +211,8 @@ struct ext4_io_submit {
*/
#define EXT4_BAD_INO 1 /* Bad blocks inode */
#define EXT4_ROOT_INO 2 /* Root inode */
+#define EXT4_USR_QUOTA_INO 3 /* User quota inode */
+#define EXT4_GRP_QUOTA_INO 4 /* Group quota inode */
#define EXT4_BOOT_LOADER_INO 5 /* Boot loader inode */
#define EXT4_UNDEL_DIR_INO 6 /* Undelete directory inode */
#define EXT4_RESIZE_INO 7 /* Reserved group descriptors inode */
@@ -235,8 +239,11 @@ struct ext4_io_submit {
# define EXT4_BLOCK_SIZE(s) (EXT4_MIN_BLOCK_SIZE << (s)->s_log_block_size)
#endif
#define EXT4_ADDR_PER_BLOCK(s) (EXT4_BLOCK_SIZE(s) / sizeof(__u32))
+#define EXT4_CLUSTER_SIZE(s) (EXT4_BLOCK_SIZE(s) << \
+ EXT4_SB(s)->s_cluster_bits)
#ifdef __KERNEL__
# define EXT4_BLOCK_SIZE_BITS(s) ((s)->s_blocksize_bits)
+# define EXT4_CLUSTER_BITS(s) (EXT4_SB(s)->s_cluster_bits)
#else
# define EXT4_BLOCK_SIZE_BITS(s) ((s)->s_log_block_size + 10)
#endif
@@ -254,6 +261,24 @@ struct ext4_io_submit {
#endif
#define EXT4_BLOCK_ALIGN(size, blkbits) ALIGN((size), (1 << (blkbits)))
+/* Translate a block number to a cluster number */
+#define EXT4_B2C(sbi, blk) ((blk) >> (sbi)->s_cluster_bits)
+/* Translate a cluster number to a block number */
+#define EXT4_C2B(sbi, cluster) ((cluster) << (sbi)->s_cluster_bits)
+/* Translate # of blks to # of clusters */
+#define EXT4_NUM_B2C(sbi, blks) (((blks) + (sbi)->s_cluster_ratio - 1) >> \
+ (sbi)->s_cluster_bits)
+/* Mask out the low bits to get the starting block of the cluster */
+#define EXT4_PBLK_CMASK(s, pblk) ((pblk) & \
+ ~((ext4_fsblk_t) (s)->s_cluster_ratio - 1))
+#define EXT4_LBLK_CMASK(s, lblk) ((lblk) & \
+ ~((ext4_lblk_t) (s)->s_cluster_ratio - 1))
+/* Get the cluster offset */
+#define EXT4_PBLK_COFF(s, pblk) ((pblk) & \
+ ((ext4_fsblk_t) (s)->s_cluster_ratio - 1))
+#define EXT4_LBLK_COFF(s, lblk) ((lblk) & \
+ ((ext4_lblk_t) (s)->s_cluster_ratio - 1))
+
/*
* Structure of a blocks group descriptor
*/
@@ -266,7 +291,9 @@ struct ext4_group_desc
__le16 bg_free_inodes_count_lo;/* Free inodes count */
__le16 bg_used_dirs_count_lo; /* Directories count */
__le16 bg_flags; /* EXT4_BG_flags (INODE_UNINIT, etc) */
- __u32 bg_reserved[2]; /* Likely block/inode bitmap checksum */
+ __le32 bg_exclude_bitmap_lo; /* Exclude bitmap for snapshots */
+ __le16 bg_block_bitmap_csum_lo;/* crc32c(s_uuid+grp_num+bbitmap) LE */
+ __le16 bg_inode_bitmap_csum_lo;/* crc32c(s_uuid+grp_num+ibitmap) LE */
__le16 bg_itable_unused_lo; /* Unused inodes count */
__le16 bg_checksum; /* crc16(sb_uuid+group+desc) */
__le32 bg_block_bitmap_hi; /* Blocks bitmap block MSB */
@@ -276,17 +303,27 @@ struct ext4_group_desc
__le16 bg_free_inodes_count_hi;/* Free inodes count MSB */
__le16 bg_used_dirs_count_hi; /* Directories count MSB */
__le16 bg_itable_unused_hi; /* Unused inodes count MSB */
- __u32 bg_reserved2[3];
+ __le32 bg_exclude_bitmap_hi; /* Exclude bitmap block MSB */
+ __le16 bg_block_bitmap_csum_hi;/* crc32c(s_uuid+grp_num+bbitmap) BE */
+ __le16 bg_inode_bitmap_csum_hi;/* crc32c(s_uuid+grp_num+ibitmap) BE */
+ __u32 bg_reserved;
};
+#define EXT4_BG_INODE_BITMAP_CSUM_HI_END \
+ (offsetof(struct ext4_group_desc, bg_inode_bitmap_csum_hi) + \
+ sizeof(__le16))
+#define EXT4_BG_BLOCK_BITMAP_CSUM_HI_END \
+ (offsetof(struct ext4_group_desc, bg_block_bitmap_csum_hi) + \
+ sizeof(__le16))
+
/*
* Structure of a flex block group info
*/
struct flex_groups {
- atomic_t free_inodes;
- atomic_t free_blocks;
- atomic_t used_dirs;
+ atomic64_t free_clusters;
+ atomic_t free_inodes;
+ atomic_t used_dirs;
};
#define EXT4_BG_INODE_UNINIT 0x0001 /* Inode table/bitmap not in use */
@@ -302,6 +339,7 @@ struct flex_groups {
#define EXT4_DESC_SIZE(s) (EXT4_SB(s)->s_desc_size)
#ifdef __KERNEL__
# define EXT4_BLOCKS_PER_GROUP(s) (EXT4_SB(s)->s_blocks_per_group)
+# define EXT4_CLUSTERS_PER_GROUP(s) (EXT4_SB(s)->s_clusters_per_group)
# define EXT4_DESC_PER_BLOCK(s) (EXT4_SB(s)->s_desc_per_block)
# define EXT4_INODES_PER_GROUP(s) (EXT4_SB(s)->s_inodes_per_group)
# define EXT4_DESC_PER_BLOCK_BITS(s) (EXT4_SB(s)->s_desc_per_block_bits)
@@ -347,15 +385,15 @@ struct flex_groups {
#define EXT4_EXTENTS_FL 0x00080000 /* Inode uses extents */
#define EXT4_EA_INODE_FL 0x00200000 /* Inode used for large EA */
#define EXT4_EOFBLOCKS_FL 0x00400000 /* Blocks allocated beyond EOF */
+#define EXT4_INLINE_DATA_FL 0x10000000 /* Inode has inline data. */
#define EXT4_RESERVED_FL 0x80000000 /* reserved for ext4 lib */
#define EXT4_FL_USER_VISIBLE 0x004BDFFF /* User visible flags */
-#define EXT4_FL_USER_MODIFIABLE 0x004B80FF /* User modifiable flags */
+#define EXT4_FL_USER_MODIFIABLE 0x004380FF /* User modifiable flags */
/* Flags that should be inherited by new inodes from their parent. */
#define EXT4_FL_INHERITED (EXT4_SECRM_FL | EXT4_UNRM_FL | EXT4_COMPR_FL |\
- EXT4_SYNC_FL | EXT4_IMMUTABLE_FL | EXT4_APPEND_FL |\
- EXT4_NODUMP_FL | EXT4_NOATIME_FL |\
+ EXT4_SYNC_FL | EXT4_NODUMP_FL | EXT4_NOATIME_FL |\
EXT4_NOCOMPR_FL | EXT4_JOURNAL_DATA_FL |\
EXT4_NOTAIL_FL | EXT4_DIRSYNC_FL)
@@ -404,28 +442,26 @@ enum {
EXT4_INODE_EXTENTS = 19, /* Inode uses extents */
EXT4_INODE_EA_INODE = 21, /* Inode used for large EA */
EXT4_INODE_EOFBLOCKS = 22, /* Blocks allocated beyond EOF */
+ EXT4_INODE_INLINE_DATA = 28, /* Data in inode. */
EXT4_INODE_RESERVED = 31, /* reserved for ext4 lib */
};
-#define TEST_FLAG_VALUE(FLAG) (EXT4_##FLAG##_FL == (1 << EXT4_INODE_##FLAG))
-#define CHECK_FLAG_VALUE(FLAG) if (!TEST_FLAG_VALUE(FLAG)) { \
- printk(KERN_EMERG "EXT4 flag fail: " #FLAG ": %d %d\n", \
- EXT4_##FLAG##_FL, EXT4_INODE_##FLAG); BUG_ON(1); }
-
-/*
- * Since it's pretty easy to mix up bit numbers and hex values, and we
- * can't do a compile-time test for ENUM values, we use a run-time
- * test to make sure that EXT4_XXX_FL is consistent with respect to
- * EXT4_INODE_XXX. If all is well the printk and BUG_ON will all drop
- * out so it won't cost any extra space in the compiled kernel image.
- * But it's important that these values are the same, since we are
- * using EXT4_INODE_XXX to test for the flag values, but EXT4_XX_FL
- * must be consistent with the values of FS_XXX_FL defined in
- * include/linux/fs.h and the on-disk values found in ext2, ext3, and
- * ext4 filesystems, and of course the values defined in e2fsprogs.
+/*
+ * Since it's pretty easy to mix up bit numbers and hex values, we use a
+ * build-time check to make sure that EXT4_XXX_FL is consistent with respect to
+ * EXT4_INODE_XXX. If all is well, the macros will be dropped, so, it won't cost
+ * any extra space in the compiled kernel image, otherwise, the build will fail.
+ * It's important that these values are the same, since we are using
+ * EXT4_INODE_XXX to test for flag values, but EXT4_XXX_FL must be consistent
+ * with the values of FS_XXX_FL defined in include/linux/fs.h and the on-disk
+ * values found in ext2, ext3 and ext4 filesystems, and of course the values
+ * defined in e2fsprogs.
*
* It's not paranoia if the Murphy's Law really *is* out to get you. :-)
*/
+#define TEST_FLAG_VALUE(FLAG) (EXT4_##FLAG##_FL == (1 << EXT4_INODE_##FLAG))
+#define CHECK_FLAG_VALUE(FLAG) BUILD_BUG_ON(!TEST_FLAG_VALUE(FLAG))
+
static inline void ext4_check_flag_values(void)
{
CHECK_FLAG_VALUE(SECRM);
@@ -450,6 +486,7 @@ static inline void ext4_check_flag_values(void)
CHECK_FLAG_VALUE(EXTENTS);
CHECK_FLAG_VALUE(EA_INODE);
CHECK_FLAG_VALUE(EOFBLOCKS);
+ CHECK_FLAG_VALUE(INLINE_DATA);
CHECK_FLAG_VALUE(RESERVED);
}
@@ -488,30 +525,62 @@ struct ext4_new_group_data {
__u32 free_blocks_count;
};
+/* Indexes used to index group tables in ext4_new_group_data */
+enum {
+ BLOCK_BITMAP = 0, /* block bitmap */
+ INODE_BITMAP, /* inode bitmap */
+ INODE_TABLE, /* inode tables */
+ GROUP_TABLE_COUNT,
+};
+
/*
* Flags used by ext4_map_blocks()
*/
- /* Allocate any needed blocks and/or convert an unitialized
+ /* Allocate any needed blocks and/or convert an unwritten
extent to be an initialized ext4 */
#define EXT4_GET_BLOCKS_CREATE 0x0001
- /* Request the creation of an unitialized extent */
-#define EXT4_GET_BLOCKS_UNINIT_EXT 0x0002
-#define EXT4_GET_BLOCKS_CREATE_UNINIT_EXT (EXT4_GET_BLOCKS_UNINIT_EXT|\
+ /* Request the creation of an unwritten extent */
+#define EXT4_GET_BLOCKS_UNWRIT_EXT 0x0002
+#define EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT (EXT4_GET_BLOCKS_UNWRIT_EXT|\
EXT4_GET_BLOCKS_CREATE)
- /* Caller is from the delayed allocation writeout path,
- so set the magic i_delalloc_reserve_flag after taking the
- inode allocation semaphore for */
+ /* Caller is from the delayed allocation writeout path
+ * finally doing the actual allocation of delayed blocks */
#define EXT4_GET_BLOCKS_DELALLOC_RESERVE 0x0004
/* caller is from the direct IO path, request to creation of an
- unitialized extents if not allocated, split the uninitialized
+ unwritten extents if not allocated, split the unwritten
extent if blocks has been preallocated already*/
#define EXT4_GET_BLOCKS_PRE_IO 0x0008
#define EXT4_GET_BLOCKS_CONVERT 0x0010
#define EXT4_GET_BLOCKS_IO_CREATE_EXT (EXT4_GET_BLOCKS_PRE_IO|\
- EXT4_GET_BLOCKS_CREATE_UNINIT_EXT)
+ EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT)
/* Convert extent to initialized after IO complete */
#define EXT4_GET_BLOCKS_IO_CONVERT_EXT (EXT4_GET_BLOCKS_CONVERT|\
- EXT4_GET_BLOCKS_CREATE_UNINIT_EXT)
+ EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT)
+ /* Eventual metadata allocation (due to growing extent tree)
+ * should not fail, so try to use reserved blocks for that.*/
+#define EXT4_GET_BLOCKS_METADATA_NOFAIL 0x0020
+ /* Don't normalize allocation size (used for fallocate) */
+#define EXT4_GET_BLOCKS_NO_NORMALIZE 0x0040
+ /* Request will not result in inode size update (user for fallocate) */
+#define EXT4_GET_BLOCKS_KEEP_SIZE 0x0080
+ /* Do not take i_data_sem locking in ext4_map_blocks */
+#define EXT4_GET_BLOCKS_NO_LOCK 0x0100
+ /* Do not put hole in extent cache */
+#define EXT4_GET_BLOCKS_NO_PUT_HOLE 0x0200
+ /* Convert written extents to unwritten */
+#define EXT4_GET_BLOCKS_CONVERT_UNWRITTEN 0x0400
+
+/*
+ * The bit position of these flags must not overlap with any of the
+ * EXT4_GET_BLOCKS_*. They are used by ext4_ext_find_extent(),
+ * read_extent_tree_block(), ext4_split_extent_at(),
+ * ext4_ext_insert_extent(), and ext4_ext_create_new_leaf().
+ * EXT4_EX_NOCACHE is used to indicate that the we shouldn't be
+ * caching the extents when reading from the extent tree while a
+ * truncate or punch hole operation is in progress.
+ */
+#define EXT4_EX_NOCACHE 0x0400
+#define EXT4_EX_FORCE_CACHE 0x0800
/*
* Flags used by ext4_free_blocks
@@ -519,6 +588,10 @@ struct ext4_new_group_data {
#define EXT4_FREE_BLOCKS_METADATA 0x0001
#define EXT4_FREE_BLOCKS_FORGET 0x0002
#define EXT4_FREE_BLOCKS_VALIDATED 0x0004
+#define EXT4_FREE_BLOCKS_NO_QUOT_UPDATE 0x0008
+#define EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER 0x0010
+#define EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER 0x0020
+#define EXT4_FREE_BLOCKS_RESERVE 0x0040
/*
* ioctl commands
@@ -529,9 +602,6 @@ struct ext4_new_group_data {
#define EXT4_IOC_SETVERSION _IOW('f', 4, long)
#define EXT4_IOC_GETVERSION_OLD FS_IOC_GETVERSION
#define EXT4_IOC_SETVERSION_OLD FS_IOC_SETVERSION
-#ifdef CONFIG_JBD2_DEBUG
-#define EXT4_IOC_WAIT_FOR_READONLY _IOR('f', 99, long)
-#endif
#define EXT4_IOC_GETRSVSZ _IOR('f', 5, long)
#define EXT4_IOC_SETRSVSZ _IOW('f', 6, long)
#define EXT4_IOC_GROUP_EXTEND _IOW('f', 7, unsigned long)
@@ -541,6 +611,9 @@ struct ext4_new_group_data {
/* note ioctl 11 reserved for filesystem-independent FIEMAP ioctl */
#define EXT4_IOC_ALLOC_DA_BLKS _IO('f', 12)
#define EXT4_IOC_MOVE_EXT _IOWR('f', 15, struct move_extent)
+#define EXT4_IOC_RESIZE_FS _IOW('f', 16, __u64)
+#define EXT4_IOC_SWAP_BOOT _IO('f', 17)
+#define EXT4_IOC_PRECACHE_EXTENTS _IO('f', 18)
#if defined(__KERNEL__) && defined(CONFIG_COMPAT)
/*
@@ -554,30 +627,11 @@ struct ext4_new_group_data {
#define EXT4_IOC32_SETRSVSZ _IOW('f', 6, int)
#define EXT4_IOC32_GROUP_EXTEND _IOW('f', 7, unsigned int)
#define EXT4_IOC32_GROUP_ADD _IOW('f', 8, struct compat_ext4_new_group_input)
-#ifdef CONFIG_JBD2_DEBUG
-#define EXT4_IOC32_WAIT_FOR_READONLY _IOR('f', 99, int)
-#endif
#define EXT4_IOC32_GETVERSION_OLD FS_IOC32_GETVERSION
#define EXT4_IOC32_SETVERSION_OLD FS_IOC32_SETVERSION
#endif
-
-/*
- * Mount options
- */
-struct ext4_mount_options {
- unsigned long s_mount_opt;
- uid_t s_resuid;
- gid_t s_resgid;
- unsigned long s_commit_interval;
- u32 s_min_batch_time, s_max_batch_time;
-#ifdef CONFIG_QUOTA
- int s_jquota_fmt;
- char *s_qf_names[MAXQUOTAS];
-#endif
-};
-
-/* Max physical block we can addres w/o extents */
+/* Max physical block we can address w/o extents */
#define EXT4_MAX_BLOCK_FILE_PHYS 0xFFFFFFFF
/*
@@ -617,7 +671,8 @@ struct ext4_inode {
__le16 l_i_file_acl_high;
__le16 l_i_uid_high; /* these 2 fields */
__le16 l_i_gid_high; /* were reserved2[0] */
- __u32 l_i_reserved2;
+ __le16 l_i_checksum_lo;/* crc32c(uuid+inum+inode) LE */
+ __le16 l_i_reserved;
} linux2;
struct {
__le16 h_i_reserved1; /* Obsoleted fragment number/size which are removed in ext4 */
@@ -633,7 +688,7 @@ struct ext4_inode {
} masix2;
} osd2; /* OS dependent 2 */
__le16 i_extra_isize;
- __le16 i_pad1;
+ __le16 i_checksum_hi; /* crc32c(uuid+inum+inode) BE */
__le32 i_ctime_extra; /* extra Change time (nsec << 2 | epoch) */
__le32 i_mtime_extra; /* extra Modification time(nsec << 2 | epoch) */
__le32 i_atime_extra; /* extra Access time (nsec << 2 | epoch) */
@@ -709,6 +764,8 @@ do { \
if (EXT4_FITS_IN_INODE(raw_inode, EXT4_I(inode), xtime ## _extra)) \
ext4_decode_extra_time(&(inode)->xtime, \
raw_inode->xtime ## _extra); \
+ else \
+ (inode)->xtime.tv_nsec = 0; \
} while (0)
#define EXT4_EINODE_GET_XTIME(xtime, einode, raw_inode) \
@@ -716,9 +773,13 @@ do { \
if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime)) \
(einode)->xtime.tv_sec = \
(signed)le32_to_cpu((raw_inode)->xtime); \
+ else \
+ (einode)->xtime.tv_sec = 0; \
if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime ## _extra)) \
ext4_decode_extra_time(&(einode)->xtime, \
raw_inode->xtime ## _extra); \
+ else \
+ (einode)->xtime.tv_nsec = 0; \
} while (0)
#define i_disk_version osd1.linux1.l_i_version
@@ -731,7 +792,7 @@ do { \
#define i_gid_low i_gid
#define i_uid_high osd2.linux2.l_i_uid_high
#define i_gid_high osd2.linux2.l_i_gid_high
-#define i_reserved2 osd2.linux2.l_i_reserved2
+#define i_checksum_lo osd2.linux2.l_i_checksum_lo
#elif defined(__GNU__)
@@ -748,15 +809,7 @@ do { \
#endif /* defined(__KERNEL__) || defined(__linux__) */
-/*
- * storage for cached extent
- */
-struct ext4_ext_cache {
- ext4_fsblk_t ec_start;
- ext4_lblk_t ec_block;
- __u32 ec_len; /* must be 32bit to return holes */
- __u32 ec_type;
-};
+#include "extents_status.h"
/*
* fourth extended file system inode data in memory
@@ -774,11 +827,12 @@ struct ext4_inode_info {
* near to their parent directory's inode.
*/
ext4_group_t i_block_group;
+ ext4_lblk_t i_dir_start_lookup;
+#if (BITS_PER_LONG < 64)
unsigned long i_state_flags; /* Dynamic state flags */
+#endif
unsigned long i_flags;
- ext4_lblk_t i_dir_start_lookup;
-#ifdef CONFIG_EXT4_FS_XATTR
/*
* Extended attributes can be read independently of the main file
* data. Taking i_mutex even when reading would cause contention
@@ -787,7 +841,6 @@ struct ext4_inode_info {
* EAs.
*/
struct rw_semaphore xattr_sem;
-#endif
struct list_head i_orphan; /* unlinked but open inodes */
@@ -820,9 +873,10 @@ struct ext4_inode_info {
*/
struct rw_semaphore i_data_sem;
struct inode vfs_inode;
- struct jbd2_inode jinode;
+ struct jbd2_inode *jinode;
+
+ spinlock_t i_raw_lock; /* protects updates to the raw inode */
- struct ext4_ext_cache i_cached_extent;
/*
* File creation time. Its function is same as that of
* struct timespec i_{a,c,m}time in the generic inode.
@@ -833,32 +887,52 @@ struct ext4_inode_info {
struct list_head i_prealloc_list;
spinlock_t i_prealloc_lock;
+ /* extents status tree */
+ struct ext4_es_tree i_es_tree;
+ rwlock_t i_es_lock;
+ struct list_head i_es_lru;
+ unsigned int i_es_lru_nr; /* protected by i_es_lock */
+ unsigned long i_touch_when; /* jiffies of last accessing */
+
/* ialloc */
ext4_group_t i_last_alloc_group;
/* allocation reservation info for delalloc */
+ /* In case of bigalloc, these refer to clusters rather than blocks */
unsigned int i_reserved_data_blocks;
unsigned int i_reserved_meta_blocks;
unsigned int i_allocated_meta_blocks;
- unsigned short i_delalloc_reserved_flag;
- sector_t i_da_metadata_calc_last_lblock;
+ ext4_lblk_t i_da_metadata_calc_last_lblock;
int i_da_metadata_calc_len;
/* on-disk additional length */
__u16 i_extra_isize;
- spinlock_t i_block_reservation_lock;
+ /* Indicate the inline data space. */
+ u16 i_inline_off;
+ u16 i_inline_size;
+
#ifdef CONFIG_QUOTA
/* quota space reservation, managed internally by quota code */
qsize_t i_reserved_quota;
#endif
- /* completed IOs that might need unwritten extents handling */
- struct list_head i_completed_io_list;
+ /* Lock protecting lists below */
spinlock_t i_completed_io_lock;
- /* current io_end structure for async DIO write*/
- ext4_io_end_t *cur_aio_dio;
+ /*
+ * Completed IOs that need unwritten extents handling and have
+ * transaction reserved
+ */
+ struct list_head i_rsv_conversion_list;
+ /*
+ * Completed IOs that need unwritten extents handling and don't have
+ * transaction reserved
+ */
atomic_t i_ioend_count; /* Number of outstanding io_end structs */
+ atomic_t i_unwritten; /* Nr. of inflight conversions pending */
+ struct work_struct i_rsv_conversion_work;
+
+ spinlock_t i_block_reservation_lock;
/*
* Transactions that contain inode's metadata needed to complete
@@ -866,6 +940,9 @@ struct ext4_inode_info {
*/
tid_t i_sync_tid;
tid_t i_datasync_tid;
+
+ /* Precomputed uuid+inum+igen checksum for seeding inode checksums */
+ __u32 i_csum_seed;
};
/*
@@ -883,14 +960,14 @@ struct ext4_inode_info {
#define EXT2_FLAGS_TEST_FILESYS 0x0004 /* to test development code */
/*
- * Mount flags
+ * Mount flags set via mount options or defaults
*/
-#define EXT4_MOUNT_OLDALLOC 0x00002 /* Don't use the new Orlov allocator */
#define EXT4_MOUNT_GRPID 0x00004 /* Create files with directory's group */
#define EXT4_MOUNT_DEBUG 0x00008 /* Some debugging messages */
#define EXT4_MOUNT_ERRORS_CONT 0x00010 /* Continue on errors */
#define EXT4_MOUNT_ERRORS_RO 0x00020 /* Remount fs ro on errors */
#define EXT4_MOUNT_ERRORS_PANIC 0x00040 /* Panic on errors */
+#define EXT4_MOUNT_ERRORS_MASK 0x00070
#define EXT4_MOUNT_MINIX_DF 0x00080 /* Mimics the Minix statfs */
#define EXT4_MOUNT_NOLOAD 0x00100 /* Don't use existing journal*/
#define EXT4_MOUNT_DATA_FLAGS 0x00C00 /* Mode for data writes: */
@@ -909,26 +986,50 @@ struct ext4_inode_info {
#define EXT4_MOUNT_DIOREAD_NOLOCK 0x400000 /* Enable support for dio read nolocking */
#define EXT4_MOUNT_JOURNAL_CHECKSUM 0x800000 /* Journal checksums */
#define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */
-#define EXT4_MOUNT_I_VERSION 0x2000000 /* i_version support */
#define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */
#define EXT4_MOUNT_DATA_ERR_ABORT 0x10000000 /* Abort on file data write */
#define EXT4_MOUNT_BLOCK_VALIDITY 0x20000000 /* Block validity checking */
#define EXT4_MOUNT_DISCARD 0x40000000 /* Issue DISCARD requests */
#define EXT4_MOUNT_INIT_INODE_TABLE 0x80000000 /* Initialize uninitialized itables */
-#define clear_opt(o, opt) o &= ~EXT4_MOUNT_##opt
-#define set_opt(o, opt) o |= EXT4_MOUNT_##opt
+/*
+ * Mount flags set either automatically (could not be set by mount option)
+ * based on per file system feature or property or in special cases such as
+ * distinguishing between explicit mount option definition and default.
+ */
+#define EXT4_MOUNT2_EXPLICIT_DELALLOC 0x00000001 /* User explicitly
+ specified delalloc */
+#define EXT4_MOUNT2_STD_GROUP_SIZE 0x00000002 /* We have standard group
+ size of blocksize * 8
+ blocks */
+#define EXT4_MOUNT2_HURD_COMPAT 0x00000004 /* Support HURD-castrated
+ file systems */
+
+#define clear_opt(sb, opt) EXT4_SB(sb)->s_mount_opt &= \
+ ~EXT4_MOUNT_##opt
+#define set_opt(sb, opt) EXT4_SB(sb)->s_mount_opt |= \
+ EXT4_MOUNT_##opt
#define test_opt(sb, opt) (EXT4_SB(sb)->s_mount_opt & \
EXT4_MOUNT_##opt)
-#define ext4_set_bit ext2_set_bit
+#define clear_opt2(sb, opt) EXT4_SB(sb)->s_mount_opt2 &= \
+ ~EXT4_MOUNT2_##opt
+#define set_opt2(sb, opt) EXT4_SB(sb)->s_mount_opt2 |= \
+ EXT4_MOUNT2_##opt
+#define test_opt2(sb, opt) (EXT4_SB(sb)->s_mount_opt2 & \
+ EXT4_MOUNT2_##opt)
+
+#define ext4_test_and_set_bit __test_and_set_bit_le
+#define ext4_set_bit __set_bit_le
#define ext4_set_bit_atomic ext2_set_bit_atomic
-#define ext4_clear_bit ext2_clear_bit
+#define ext4_test_and_clear_bit __test_and_clear_bit_le
+#define ext4_clear_bit __clear_bit_le
#define ext4_clear_bit_atomic ext2_clear_bit_atomic
-#define ext4_test_bit ext2_test_bit
-#define ext4_find_first_zero_bit ext2_find_first_zero_bit
-#define ext4_find_next_zero_bit ext2_find_next_zero_bit
-#define ext4_find_next_bit ext2_find_next_bit
+#define ext4_test_bit test_bit_le
+#define ext4_find_next_zero_bit find_next_zero_bit_le
+#define ext4_find_next_bit find_next_bit_le
+
+extern void ext4_set_bits(void *bm, int cur, int len);
/*
* Maximal mount counts between two filesystem checks
@@ -944,6 +1045,9 @@ struct ext4_inode_info {
#define EXT4_ERRORS_PANIC 3 /* Panic */
#define EXT4_ERRORS_DEFAULT EXT4_ERRORS_CONTINUE
+/* Metadata checksum algorithm codes */
+#define EXT4_CRC32C_CHKSUM 1
+
/*
* Structure of the super block
*/
@@ -955,9 +1059,9 @@ struct ext4_super_block {
/*10*/ __le32 s_free_inodes_count; /* Free inodes count */
__le32 s_first_data_block; /* First Data Block */
__le32 s_log_block_size; /* Block size */
- __le32 s_obso_log_frag_size; /* Obsoleted fragment size */
+ __le32 s_log_cluster_size; /* Allocation cluster size */
/*20*/ __le32 s_blocks_per_group; /* # Blocks per group */
- __le32 s_obso_frags_per_group; /* Obsoleted fragments per group */
+ __le32 s_clusters_per_group; /* # Clusters per group */
__le32 s_inodes_per_group; /* # Inodes per group */
__le32 s_mtime; /* Mount time */
/*30*/ __le32 s_wtime; /* Write time */
@@ -1026,11 +1130,11 @@ struct ext4_super_block {
__le16 s_want_extra_isize; /* New inodes should reserve # bytes */
__le32 s_flags; /* Miscellaneous flags */
__le16 s_raid_stride; /* RAID stride */
- __le16 s_mmp_interval; /* # seconds to wait in MMP checking */
+ __le16 s_mmp_update_interval; /* # seconds to wait in MMP checking */
__le64 s_mmp_block; /* Block for multi-mount protection */
__le32 s_raid_stripe_width; /* blocks on all data disks (N*stride)*/
__u8 s_log_groups_per_flex; /* FLEX_BG group size */
- __u8 s_reserved_char_pad;
+ __u8 s_checksum_type; /* metadata checksum algorithm used */
__le16 s_reserved_pad;
__le64 s_kbytes_written; /* nr of lifetime kilobytes written */
__le32 s_snapshot_inum; /* Inode number of active snapshot */
@@ -1053,7 +1157,12 @@ struct ext4_super_block {
__u8 s_last_error_func[32]; /* function where the error happened */
#define EXT4_S_ERR_END offsetof(struct ext4_super_block, s_mount_opts)
__u8 s_mount_opts[64];
- __le32 s_reserved[112]; /* Padding to the end of the block */
+ __le32 s_usr_quota_inum; /* inode for tracking user quota */
+ __le32 s_grp_quota_inum; /* inode for tracking group quota */
+ __le32 s_overhead_clusters; /* overhead blocks/clusters in fs */
+ __le32 s_backup_bgs[2]; /* groups with sparse_super2 SBs */
+ __le32 s_reserved[106]; /* Padding to the end of the block */
+ __le32 s_checksum; /* crc32c(superblock) */
};
#define EXT4_S_ERR_LEN (EXT4_S_ERR_END - EXT4_S_ERR_START)
@@ -1073,23 +1182,28 @@ struct ext4_sb_info {
unsigned long s_desc_size; /* Size of a group descriptor in bytes */
unsigned long s_inodes_per_block;/* Number of inodes per block */
unsigned long s_blocks_per_group;/* Number of blocks in a group */
+ unsigned long s_clusters_per_group; /* Number of clusters in a group */
unsigned long s_inodes_per_group;/* Number of inodes in a group */
unsigned long s_itb_per_group; /* Number of inode table blocks per group */
unsigned long s_gdb_count; /* Number of group descriptor blocks */
unsigned long s_desc_per_block; /* Number of group descriptors per block */
ext4_group_t s_groups_count; /* Number of groups in the fs */
ext4_group_t s_blockfile_groups;/* Groups acceptable for non-extent files */
- unsigned long s_overhead_last; /* Last calculated overhead */
- unsigned long s_blocks_last; /* Last seen block count */
+ unsigned long s_overhead; /* # of fs overhead clusters */
+ unsigned int s_cluster_ratio; /* Number of blocks per cluster */
+ unsigned int s_cluster_bits; /* log2 of s_cluster_ratio */
loff_t s_bitmap_maxbytes; /* max bytes for bitmap files */
struct buffer_head * s_sbh; /* Buffer containing the super block */
struct ext4_super_block *s_es; /* Pointer to the super block in the buffer */
struct buffer_head **s_group_desc;
unsigned int s_mount_opt;
+ unsigned int s_mount_opt2;
unsigned int s_mount_flags;
+ unsigned int s_def_mount_opt;
ext4_fsblk_t s_sb_block;
- uid_t s_resuid;
- gid_t s_resgid;
+ atomic64_t s_resv_clusters;
+ kuid_t s_resuid;
+ kgid_t s_resgid;
unsigned short s_mount_state;
unsigned short s_pad;
int s_addr_per_block_bits;
@@ -1103,28 +1217,26 @@ struct ext4_sb_info {
u32 s_hash_seed[4];
int s_def_hash_version;
int s_hash_unsigned; /* 3 if hash should be signed, 0 if not */
- struct percpu_counter s_freeblocks_counter;
+ struct percpu_counter s_freeclusters_counter;
struct percpu_counter s_freeinodes_counter;
struct percpu_counter s_dirs_counter;
- struct percpu_counter s_dirtyblocks_counter;
+ struct percpu_counter s_dirtyclusters_counter;
struct blockgroup_lock *s_blockgroup_lock;
struct proc_dir_entry *s_proc;
struct kobject s_kobj;
struct completion s_kobj_unregister;
+ struct super_block *s_sb;
/* Journaling */
struct journal_s *s_journal;
struct list_head s_orphan;
struct mutex s_orphan_lock;
- struct mutex s_resize_lock;
+ unsigned long s_resize_flags; /* Flags indicating if there
+ is a resizer */
unsigned long s_commit_interval;
u32 s_max_batch_time;
u32 s_min_batch_time;
struct block_device *journal_bdev;
-#ifdef CONFIG_JBD2_DEBUG
- struct timer_list turn_ro_timer; /* For turning read-only (crash simulation) */
- wait_queue_head_t ro_wait_queue; /* For people waiting for the fs to go read-only */
-#endif
#ifdef CONFIG_QUOTA
char *s_qf_names[MAXQUOTAS]; /* Names of quota files with journalled quota */
int s_jquota_fmt; /* Format of quota to use */
@@ -1148,6 +1260,7 @@ struct ext4_sb_info {
spinlock_t s_md_lock;
unsigned short *s_mb_offsets;
unsigned int *s_mb_maxs;
+ unsigned int s_group_info_size;
/* tunables */
unsigned long s_stripe;
@@ -1157,7 +1270,7 @@ struct ext4_sb_info {
unsigned int s_mb_stats;
unsigned int s_mb_order2_reqs;
unsigned int s_mb_group_prealloc;
- unsigned int s_max_writeback_mb_bump;
+ unsigned int s_max_dir_size_kb;
/* where last allocation was done - for stream allocation */
unsigned long s_mb_last_group;
unsigned long s_mb_last_start;
@@ -1185,11 +1298,15 @@ struct ext4_sb_info {
unsigned long s_sectors_written_start;
u64 s_kbytes_written;
+ /* the size of zero-out chunk */
+ unsigned int s_extent_max_zeroout_kb;
+
unsigned int s_log_groups_per_flex;
struct flex_groups *s_flex_groups;
+ ext4_group_t s_flex_groups_allocated;
- /* workqueue for dio unwritten */
- struct workqueue_struct *dio_unwritten_wq;
+ /* workqueue for reserved extent conversions (buffered io) */
+ struct workqueue_struct *rsv_conversion_wq;
/* timer for periodic error stats printing */
struct timer_list s_err_report;
@@ -1198,6 +1315,31 @@ struct ext4_sb_info {
struct ext4_li_request *s_li_request;
/* Wait multiplier for lazy initialization thread */
unsigned int s_li_wait_mult;
+
+ /* Kernel thread for multiple mount protection */
+ struct task_struct *s_mmp_tsk;
+
+ /* record the last minlen when FITRIM is called. */
+ atomic_t s_last_trim_minblks;
+
+ /* Reference to checksum algorithm driver via cryptoapi */
+ struct crypto_shash *s_chksum_driver;
+
+ /* Precomputed FS UUID checksum for seeding other checksums */
+ __u32 s_csum_seed;
+
+ /* Reclaim extents from extent status tree */
+ struct shrinker s_es_shrinker;
+ struct list_head s_es_lru;
+ unsigned long s_es_last_sorted;
+ struct percpu_counter s_extent_cache_cnt;
+ struct mb_cache *s_mb_cache;
+ spinlock_t s_es_lru_lock ____cacheline_aligned_in_smp;
+
+ /* Ratelimit ext4 messages. */
+ struct ratelimit_state s_err_ratelimit_state;
+ struct ratelimit_state s_warning_ratelimit_state;
+ struct ratelimit_state s_msg_ratelimit_state;
};
static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb)
@@ -1218,12 +1360,34 @@ static inline struct timespec ext4_current_time(struct inode *inode)
static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
{
return ino == EXT4_ROOT_INO ||
+ ino == EXT4_USR_QUOTA_INO ||
+ ino == EXT4_GRP_QUOTA_INO ||
+ ino == EXT4_BOOT_LOADER_INO ||
ino == EXT4_JOURNAL_INO ||
ino == EXT4_RESIZE_INO ||
(ino >= EXT4_FIRST_INO(sb) &&
ino <= le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count));
}
+static inline void ext4_set_io_unwritten_flag(struct inode *inode,
+ struct ext4_io_end *io_end)
+{
+ if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
+ io_end->flag |= EXT4_IO_END_UNWRITTEN;
+ atomic_inc(&EXT4_I(inode)->i_unwritten);
+ }
+}
+
+static inline ext4_io_end_t *ext4_inode_aio(struct inode *inode)
+{
+ return inode->i_private;
+}
+
+static inline void ext4_inode_aio_set(struct inode *inode, ext4_io_end_t *io)
+{
+ inode->i_private = io;
+}
+
/*
* Inode dynamic state flags
*/
@@ -1236,24 +1400,55 @@ enum {
EXT4_STATE_EXT_MIGRATE, /* Inode is migrating */
EXT4_STATE_DIO_UNWRITTEN, /* need convert on dio done*/
EXT4_STATE_NEWENTRY, /* File just added to dir */
+ EXT4_STATE_DELALLOC_RESERVED, /* blks already reserved for delalloc */
+ EXT4_STATE_DIOREAD_LOCK, /* Disable support for dio read
+ nolocking */
+ EXT4_STATE_MAY_INLINE_DATA, /* may have in-inode data */
+ EXT4_STATE_ORDERED_MODE, /* data=ordered mode */
+ EXT4_STATE_EXT_PRECACHED, /* extents have been precached */
};
-#define EXT4_INODE_BIT_FNS(name, field) \
+#define EXT4_INODE_BIT_FNS(name, field, offset) \
static inline int ext4_test_inode_##name(struct inode *inode, int bit) \
{ \
- return test_bit(bit, &EXT4_I(inode)->i_##field); \
+ return test_bit(bit + (offset), &EXT4_I(inode)->i_##field); \
} \
static inline void ext4_set_inode_##name(struct inode *inode, int bit) \
{ \
- set_bit(bit, &EXT4_I(inode)->i_##field); \
+ set_bit(bit + (offset), &EXT4_I(inode)->i_##field); \
} \
static inline void ext4_clear_inode_##name(struct inode *inode, int bit) \
{ \
- clear_bit(bit, &EXT4_I(inode)->i_##field); \
+ clear_bit(bit + (offset), &EXT4_I(inode)->i_##field); \
+}
+
+/* Add these declarations here only so that these functions can be
+ * found by name. Otherwise, they are very hard to locate. */
+static inline int ext4_test_inode_flag(struct inode *inode, int bit);
+static inline void ext4_set_inode_flag(struct inode *inode, int bit);
+static inline void ext4_clear_inode_flag(struct inode *inode, int bit);
+EXT4_INODE_BIT_FNS(flag, flags, 0)
+
+/* Add these declarations here only so that these functions can be
+ * found by name. Otherwise, they are very hard to locate. */
+static inline int ext4_test_inode_state(struct inode *inode, int bit);
+static inline void ext4_set_inode_state(struct inode *inode, int bit);
+static inline void ext4_clear_inode_state(struct inode *inode, int bit);
+#if (BITS_PER_LONG < 64)
+EXT4_INODE_BIT_FNS(state, state_flags, 0)
+
+static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
+{
+ (ei)->i_state_flags = 0;
}
+#else
+EXT4_INODE_BIT_FNS(state, flags, 32)
-EXT4_INODE_BIT_FNS(flag, flags)
-EXT4_INODE_BIT_FNS(state, state_flags)
+static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
+{
+ /* We depend on the fact that callers will set i_flags */
+}
+#endif
#else
/* Assume that user mode programs are passing in an ext4fs superblock, not
* a kernel struct super_block. This will allow us to call the feature-test
@@ -1312,6 +1507,7 @@ EXT4_INODE_BIT_FNS(state, state_flags)
#define EXT4_FEATURE_COMPAT_EXT_ATTR 0x0008
#define EXT4_FEATURE_COMPAT_RESIZE_INODE 0x0010
#define EXT4_FEATURE_COMPAT_DIR_INDEX 0x0020
+#define EXT4_FEATURE_COMPAT_SPARSE_SUPER2 0x0200
#define EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER 0x0001
#define EXT4_FEATURE_RO_COMPAT_LARGE_FILE 0x0002
@@ -1320,6 +1516,15 @@ EXT4_INODE_BIT_FNS(state, state_flags)
#define EXT4_FEATURE_RO_COMPAT_GDT_CSUM 0x0010
#define EXT4_FEATURE_RO_COMPAT_DIR_NLINK 0x0020
#define EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE 0x0040
+#define EXT4_FEATURE_RO_COMPAT_QUOTA 0x0100
+#define EXT4_FEATURE_RO_COMPAT_BIGALLOC 0x0200
+/*
+ * METADATA_CSUM also enables group descriptor checksums (GDT_CSUM). When
+ * METADATA_CSUM is set, group descriptor checksums use the same algorithm as
+ * all other data structures' checksums. However, the METADATA_CSUM and
+ * GDT_CSUM bits are mutually exclusive.
+ */
+#define EXT4_FEATURE_RO_COMPAT_METADATA_CSUM 0x0400
#define EXT4_FEATURE_INCOMPAT_COMPRESSION 0x0001
#define EXT4_FEATURE_INCOMPAT_FILETYPE 0x0002
@@ -1332,6 +1537,24 @@ EXT4_INODE_BIT_FNS(state, state_flags)
#define EXT4_FEATURE_INCOMPAT_FLEX_BG 0x0200
#define EXT4_FEATURE_INCOMPAT_EA_INODE 0x0400 /* EA in inode */
#define EXT4_FEATURE_INCOMPAT_DIRDATA 0x1000 /* data in dirent */
+#define EXT4_FEATURE_INCOMPAT_BG_USE_META_CSUM 0x2000 /* use crc32c for bg */
+#define EXT4_FEATURE_INCOMPAT_LARGEDIR 0x4000 /* >2GB or 3-lvl htree */
+#define EXT4_FEATURE_INCOMPAT_INLINE_DATA 0x8000 /* data in inode */
+
+#define EXT2_FEATURE_COMPAT_SUPP EXT4_FEATURE_COMPAT_EXT_ATTR
+#define EXT2_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \
+ EXT4_FEATURE_INCOMPAT_META_BG)
+#define EXT2_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \
+ EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \
+ EXT4_FEATURE_RO_COMPAT_BTREE_DIR)
+
+#define EXT3_FEATURE_COMPAT_SUPP EXT4_FEATURE_COMPAT_EXT_ATTR
+#define EXT3_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \
+ EXT4_FEATURE_INCOMPAT_RECOVER| \
+ EXT4_FEATURE_INCOMPAT_META_BG)
+#define EXT3_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \
+ EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \
+ EXT4_FEATURE_RO_COMPAT_BTREE_DIR)
#define EXT4_FEATURE_COMPAT_SUPP EXT2_FEATURE_COMPAT_EXT_ATTR
#define EXT4_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \
@@ -1339,14 +1562,19 @@ EXT4_INODE_BIT_FNS(state, state_flags)
EXT4_FEATURE_INCOMPAT_META_BG| \
EXT4_FEATURE_INCOMPAT_EXTENTS| \
EXT4_FEATURE_INCOMPAT_64BIT| \
- EXT4_FEATURE_INCOMPAT_FLEX_BG)
+ EXT4_FEATURE_INCOMPAT_FLEX_BG| \
+ EXT4_FEATURE_INCOMPAT_MMP | \
+ EXT4_FEATURE_INCOMPAT_INLINE_DATA)
#define EXT4_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \
EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \
EXT4_FEATURE_RO_COMPAT_GDT_CSUM| \
EXT4_FEATURE_RO_COMPAT_DIR_NLINK | \
EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE | \
EXT4_FEATURE_RO_COMPAT_BTREE_DIR |\
- EXT4_FEATURE_RO_COMPAT_HUGE_FILE)
+ EXT4_FEATURE_RO_COMPAT_HUGE_FILE |\
+ EXT4_FEATURE_RO_COMPAT_BIGALLOC |\
+ EXT4_FEATURE_RO_COMPAT_METADATA_CSUM|\
+ EXT4_FEATURE_RO_COMPAT_QUOTA)
/*
* Default values for user and/or group using reserved blocks
@@ -1412,6 +1640,23 @@ struct ext4_dir_entry_2 {
};
/*
+ * This is a bogus directory entry at the end of each leaf block that
+ * records checksums.
+ */
+struct ext4_dir_entry_tail {
+ __le32 det_reserved_zero1; /* Pretend to be unused */
+ __le16 det_rec_len; /* 12 */
+ __u8 det_reserved_zero2; /* Zero name length */
+ __u8 det_reserved_ft; /* 0xDE, fake file type */
+ __le32 det_checksum; /* crc32c(uuid+inum+dirblock) */
+};
+
+#define EXT4_DIRENT_TAIL(block, blocksize) \
+ ((struct ext4_dir_entry_tail *)(((void *)(block)) + \
+ ((blocksize) - \
+ sizeof(struct ext4_dir_entry_tail))))
+
+/*
* Ext4 directory file types. Only the low 3 bits are used. The
* other bits are reserved for now.
*/
@@ -1426,6 +1671,8 @@ struct ext4_dir_entry_2 {
#define EXT4_FT_MAX 8
+#define EXT4_FT_DIR_CSUM 0xDE
+
/*
* EXT4_DIR_PAD defines the directory entries boundaries
*
@@ -1494,6 +1741,27 @@ static inline __le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize)
#define DX_HASH_HALF_MD4_UNSIGNED 4
#define DX_HASH_TEA_UNSIGNED 5
+static inline u32 ext4_chksum(struct ext4_sb_info *sbi, u32 crc,
+ const void *address, unsigned int length)
+{
+ struct {
+ struct shash_desc shash;
+ char ctx[4];
+ } desc;
+ int err;
+
+ BUG_ON(crypto_shash_descsize(sbi->s_chksum_driver)!=sizeof(desc.ctx));
+
+ desc.shash.tfm = sbi->s_chksum_driver;
+ desc.shash.flags = 0;
+ *(u32 *)desc.ctx = crc;
+
+ err = crypto_shash_update(&desc.shash, address, length);
+ BUG_ON(err);
+
+ return *(u32 *)desc.ctx;
+}
+
#ifdef __KERNEL__
/* hash info structure used by the directory hash */
@@ -1505,7 +1773,11 @@ struct dx_hash_info
u32 *seed;
};
-#define EXT4_HTREE_EOF 0x7fffffff
+
+/* 32 and 64 bit signed EOF for dx directories */
+#define EXT4_HTREE_EOF_32BIT ((1UL << (32 - 1)) - 1)
+#define EXT4_HTREE_EOF_64BIT ((1ULL << (64 - 1)) - 1)
+
/*
* Control parameters used by ext4_htree_next_block
@@ -1556,9 +1828,6 @@ ext4_group_first_block_no(struct super_block *sb, ext4_group_t group_no)
*/
#define ERR_BAD_DX_DIR -75000
-void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr,
- ext4_group_t *blockgrpp, ext4_grpblk_t *offsetp);
-
/*
* Timeout and state flag for lazy initialization inode thread.
*/
@@ -1572,12 +1841,6 @@ void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr,
*/
struct ext4_lazy_init {
unsigned long li_state;
-
- wait_queue_head_t li_wait_daemon;
- wait_queue_head_t li_wait_task;
- struct timer_list li_timer;
- struct task_struct *li_task;
-
struct list_head li_request_list;
struct mutex li_list_mtx;
};
@@ -1597,6 +1860,68 @@ struct ext4_features {
};
/*
+ * This structure will be used for multiple mount protection. It will be
+ * written into the block number saved in the s_mmp_block field in the
+ * superblock. Programs that check MMP should assume that if
+ * SEQ_FSCK (or any unknown code above SEQ_MAX) is present then it is NOT safe
+ * to use the filesystem, regardless of how old the timestamp is.
+ */
+#define EXT4_MMP_MAGIC 0x004D4D50U /* ASCII for MMP */
+#define EXT4_MMP_SEQ_CLEAN 0xFF4D4D50U /* mmp_seq value for clean unmount */
+#define EXT4_MMP_SEQ_FSCK 0xE24D4D50U /* mmp_seq value when being fscked */
+#define EXT4_MMP_SEQ_MAX 0xE24D4D4FU /* maximum valid mmp_seq value */
+
+struct mmp_struct {
+ __le32 mmp_magic; /* Magic number for MMP */
+ __le32 mmp_seq; /* Sequence no. updated periodically */
+
+ /*
+ * mmp_time, mmp_nodename & mmp_bdevname are only used for information
+ * purposes and do not affect the correctness of the algorithm
+ */
+ __le64 mmp_time; /* Time last updated */
+ char mmp_nodename[64]; /* Node which last updated MMP block */
+ char mmp_bdevname[32]; /* Bdev which last updated MMP block */
+
+ /*
+ * mmp_check_interval is used to verify if the MMP block has been
+ * updated on the block device. The value is updated based on the
+ * maximum time to write the MMP block during an update cycle.
+ */
+ __le16 mmp_check_interval;
+
+ __le16 mmp_pad1;
+ __le32 mmp_pad2[226];
+ __le32 mmp_checksum; /* crc32c(uuid+mmp_block) */
+};
+
+/* arguments passed to the mmp thread */
+struct mmpd_data {
+ struct buffer_head *bh; /* bh from initial read_mmp_block() */
+ struct super_block *sb; /* super block of the fs */
+};
+
+/*
+ * Check interval multiplier
+ * The MMP block is written every update interval and initially checked every
+ * update interval x the multiplier (the value is then adapted based on the
+ * write latency). The reason is that writes can be delayed under load and we
+ * don't want readers to incorrectly assume that the filesystem is no longer
+ * in use.
+ */
+#define EXT4_MMP_CHECK_MULT 2UL
+
+/*
+ * Minimum interval for MMP checking in seconds.
+ */
+#define EXT4_MMP_MIN_CHECK_INTERVAL 5UL
+
+/*
+ * Maximum interval for MMP checking in seconds.
+ */
+#define EXT4_MMP_MAX_CHECK_INTERVAL 300UL
+
+/*
* Function prototypes
*/
@@ -1609,9 +1934,28 @@ struct ext4_features {
# define NORET_AND noreturn,
/* bitmap.c */
-extern unsigned int ext4_count_free(struct buffer_head *, unsigned);
+extern unsigned int ext4_count_free(char *bitmap, unsigned numchars);
+void ext4_inode_bitmap_csum_set(struct super_block *sb, ext4_group_t group,
+ struct ext4_group_desc *gdp,
+ struct buffer_head *bh, int sz);
+int ext4_inode_bitmap_csum_verify(struct super_block *sb, ext4_group_t group,
+ struct ext4_group_desc *gdp,
+ struct buffer_head *bh, int sz);
+void ext4_block_bitmap_csum_set(struct super_block *sb, ext4_group_t group,
+ struct ext4_group_desc *gdp,
+ struct buffer_head *bh);
+int ext4_block_bitmap_csum_verify(struct super_block *sb, ext4_group_t group,
+ struct ext4_group_desc *gdp,
+ struct buffer_head *bh);
/* balloc.c */
+extern void ext4_get_group_no_and_offset(struct super_block *sb,
+ ext4_fsblk_t blocknr,
+ ext4_group_t *blockgrpp,
+ ext4_grpblk_t *offsetp);
+extern ext4_group_t ext4_get_group_number(struct super_block *sb,
+ ext4_fsblk_t block);
+
extern unsigned int ext4_block_group(struct super_block *sb,
ext4_fsblk_t blocknr);
extern ext4_grpblk_t ext4_block_group_offset(struct super_block *sb,
@@ -1620,46 +1964,94 @@ extern int ext4_bg_has_super(struct super_block *sb, ext4_group_t group);
extern unsigned long ext4_bg_num_gdb(struct super_block *sb,
ext4_group_t group);
extern ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
- ext4_fsblk_t goal, unsigned long *count, int *errp);
-extern int ext4_claim_free_blocks(struct ext4_sb_info *sbi, s64 nblocks);
-extern void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
- ext4_fsblk_t block, unsigned long count);
-extern ext4_fsblk_t ext4_count_free_blocks(struct super_block *);
+ ext4_fsblk_t goal,
+ unsigned int flags,
+ unsigned long *count,
+ int *errp);
+extern int ext4_claim_free_clusters(struct ext4_sb_info *sbi,
+ s64 nclusters, unsigned int flags);
+extern ext4_fsblk_t ext4_count_free_clusters(struct super_block *);
extern void ext4_check_blocks_bitmap(struct super_block *);
extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb,
ext4_group_t block_group,
struct buffer_head ** bh);
extern int ext4_should_retry_alloc(struct super_block *sb, int *retries);
-struct buffer_head *ext4_read_block_bitmap(struct super_block *sb,
- ext4_group_t block_group);
-extern unsigned ext4_init_block_bitmap(struct super_block *sb,
- struct buffer_head *bh,
- ext4_group_t group,
- struct ext4_group_desc *desc);
-#define ext4_free_blocks_after_init(sb, group, desc) \
- ext4_init_block_bitmap(sb, NULL, group, desc)
+
+extern struct buffer_head *ext4_read_block_bitmap_nowait(struct super_block *sb,
+ ext4_group_t block_group);
+extern int ext4_wait_block_bitmap(struct super_block *sb,
+ ext4_group_t block_group,
+ struct buffer_head *bh);
+extern struct buffer_head *ext4_read_block_bitmap(struct super_block *sb,
+ ext4_group_t block_group);
+extern unsigned ext4_free_clusters_after_init(struct super_block *sb,
+ ext4_group_t block_group,
+ struct ext4_group_desc *gdp);
+ext4_fsblk_t ext4_inode_to_goal_block(struct inode *);
/* dir.c */
extern int __ext4_check_dir_entry(const char *, unsigned int, struct inode *,
+ struct file *,
struct ext4_dir_entry_2 *,
- struct buffer_head *, unsigned int);
-#define ext4_check_dir_entry(dir, de, bh, offset) \
- __ext4_check_dir_entry(__func__, __LINE__, (dir), (de), (bh), (offset))
+ struct buffer_head *, char *, int,
+ unsigned int);
+#define ext4_check_dir_entry(dir, filp, de, bh, buf, size, offset) \
+ unlikely(__ext4_check_dir_entry(__func__, __LINE__, (dir), (filp), \
+ (de), (bh), (buf), (size), (offset)))
extern int ext4_htree_store_dirent(struct file *dir_file, __u32 hash,
__u32 minor_hash,
struct ext4_dir_entry_2 *dirent);
extern void ext4_htree_free_dir_info(struct dir_private_info *p);
+extern int ext4_find_dest_de(struct inode *dir, struct inode *inode,
+ struct buffer_head *bh,
+ void *buf, int buf_size,
+ const char *name, int namelen,
+ struct ext4_dir_entry_2 **dest_de);
+void ext4_insert_dentry(struct inode *inode,
+ struct ext4_dir_entry_2 *de,
+ int buf_size,
+ const char *name, int namelen);
+static inline void ext4_update_dx_flag(struct inode *inode)
+{
+ if (!EXT4_HAS_COMPAT_FEATURE(inode->i_sb,
+ EXT4_FEATURE_COMPAT_DIR_INDEX))
+ ext4_clear_inode_flag(inode, EXT4_INODE_INDEX);
+}
+static unsigned char ext4_filetype_table[] = {
+ DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
+};
+
+static inline unsigned char get_dtype(struct super_block *sb, int filetype)
+{
+ if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FILETYPE) ||
+ (filetype >= EXT4_FT_MAX))
+ return DT_UNKNOWN;
+
+ return ext4_filetype_table[filetype];
+}
/* fsync.c */
-extern int ext4_sync_file(struct file *, int);
+extern int ext4_sync_file(struct file *, loff_t, loff_t, int);
/* hash.c */
extern int ext4fs_dirhash(const char *name, int len, struct
dx_hash_info *hinfo);
/* ialloc.c */
-extern struct inode *ext4_new_inode(handle_t *, struct inode *, int,
- const struct qstr *qstr, __u32 goal);
+extern struct inode *__ext4_new_inode(handle_t *, struct inode *, umode_t,
+ const struct qstr *qstr, __u32 goal,
+ uid_t *owner, int handle_type,
+ unsigned int line_no, int nblocks);
+
+#define ext4_new_inode(handle, dir, mode, qstr, goal, owner) \
+ __ext4_new_inode((handle), (dir), (mode), (qstr), (goal), (owner), \
+ 0, 0, 0)
+#define ext4_new_inode_start_handle(dir, mode, qstr, goal, owner, \
+ type, nblocks) \
+ __ext4_new_inode(NULL, (dir), (mode), (qstr), (goal), (owner), \
+ (type), __LINE__, (nblocks))
+
+
extern void ext4_free_inode(handle_t *, struct inode *);
extern struct inode * ext4_orphan_get(struct super_block *, unsigned long);
extern unsigned long ext4_count_free_inodes(struct super_block *);
@@ -1668,11 +2060,12 @@ extern void ext4_check_inodes_bitmap(struct super_block *);
extern void ext4_mark_bitmap_end(int start_bit, int end_bit, char *bitmap);
extern int ext4_init_inode_table(struct super_block *sb,
ext4_group_t group, int barrier);
+extern void ext4_end_bitmap_read(struct buffer_head *bh, int uptodate);
/* mballoc.c */
extern long ext4_mb_stats;
extern long ext4_mb_max_to_scan;
-extern int ext4_mb_init(struct super_block *, int);
+extern int ext4_mb_init(struct super_block *);
extern int ext4_mb_release(struct super_block *);
extern ext4_fsblk_t ext4_mb_new_blocks(handle_t *,
struct ext4_allocation_request *, int *);
@@ -1683,8 +2076,12 @@ extern void ext4_exit_mballoc(void);
extern void ext4_free_blocks(handle_t *handle, struct inode *inode,
struct buffer_head *bh, ext4_fsblk_t block,
unsigned long count, int flags);
+extern int ext4_mb_alloc_groupinfo(struct super_block *sb,
+ ext4_group_t ngroups);
extern int ext4_mb_add_groupinfo(struct super_block *sb,
ext4_group_t i, struct ext4_group_desc *desc);
+extern int ext4_group_add_blocks(handle_t *handle, struct super_block *sb,
+ ext4_fsblk_t block, unsigned long count);
extern int ext4_trim_fs(struct super_block *, struct fstrim_range *);
/* inode.c */
@@ -1692,8 +2089,23 @@ struct buffer_head *ext4_getblk(handle_t *, struct inode *,
ext4_lblk_t, int, int *);
struct buffer_head *ext4_bread(handle_t *, struct inode *,
ext4_lblk_t, int, int *);
+int ext4_get_block_write(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh_result, int create);
int ext4_get_block(struct inode *inode, sector_t iblock,
struct buffer_head *bh_result, int create);
+int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh, int create);
+int ext4_walk_page_buffers(handle_t *handle,
+ struct buffer_head *head,
+ unsigned from,
+ unsigned to,
+ int *partial,
+ int (*fn)(handle_t *handle,
+ struct buffer_head *bh));
+int do_journal_get_write_access(handle_t *handle,
+ struct buffer_head *bh);
+#define FALL_BACK_TO_NONDELALLOC 1
+#define CONVERT_INLINE_DATA 2
extern struct inode *ext4_iget(struct super_block *, unsigned long);
extern int ext4_write_inode(struct inode *, struct writeback_control *);
@@ -1703,11 +2115,13 @@ extern int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,
extern void ext4_evict_inode(struct inode *);
extern void ext4_clear_inode(struct inode *);
extern int ext4_sync_inode(handle_t *, struct inode *);
-extern void ext4_dirty_inode(struct inode *);
+extern void ext4_dirty_inode(struct inode *, int);
extern int ext4_change_inode_journal_flag(struct inode *, int);
extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *);
+extern int ext4_inode_attach_jinode(struct inode *inode);
extern int ext4_can_truncate(struct inode *inode);
extern void ext4_truncate(struct inode *);
+extern int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length);
extern int ext4_truncate_restart_trans(handle_t *, struct inode *, int nblocks);
extern void ext4_set_inode_flags(struct inode *);
extern void ext4_get_inode_flags(struct ext4_inode_info *);
@@ -1715,24 +2129,53 @@ extern int ext4_alloc_da_blocks(struct inode *inode);
extern void ext4_set_aops(struct inode *inode);
extern int ext4_writepage_trans_blocks(struct inode *);
extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks);
-extern int ext4_block_truncate_page(handle_t *handle,
- struct address_space *mapping, loff_t from);
+extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode,
+ loff_t lstart, loff_t lend);
extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
extern qsize_t *ext4_get_reserved_space(struct inode *inode);
extern void ext4_da_update_reserve_space(struct inode *inode,
int used, int quota_claim);
+
+/* indirect.c */
+extern int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
+ struct ext4_map_blocks *map, int flags);
+extern ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,
+ struct iov_iter *iter, loff_t offset);
+extern int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock);
+extern int ext4_ind_trans_blocks(struct inode *inode, int nrblocks);
+extern void ext4_ind_truncate(handle_t *, struct inode *inode);
+extern int ext4_free_hole_blocks(handle_t *handle, struct inode *inode,
+ ext4_lblk_t first, ext4_lblk_t stop);
+
/* ioctl.c */
extern long ext4_ioctl(struct file *, unsigned int, unsigned long);
extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long);
/* migrate.c */
extern int ext4_ext_migrate(struct inode *);
+extern int ext4_ind_migrate(struct inode *inode);
/* namei.c */
+extern int ext4_dirent_csum_verify(struct inode *inode,
+ struct ext4_dir_entry *dirent);
extern int ext4_orphan_add(handle_t *, struct inode *);
extern int ext4_orphan_del(handle_t *, struct inode *);
extern int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
__u32 start_minor_hash, __u32 *next_hash);
+extern int search_dir(struct buffer_head *bh,
+ char *search_buf,
+ int buf_size,
+ struct inode *dir,
+ const struct qstr *d_name,
+ unsigned int offset,
+ struct ext4_dir_entry_2 **res_dir);
+extern int ext4_generic_delete_entry(handle_t *handle,
+ struct inode *dir,
+ struct ext4_dir_entry_2 *de_del,
+ struct buffer_head *bh,
+ void *entry_buf,
+ int buf_size,
+ int csum_size);
/* resize.c */
extern int ext4_group_add(struct super_block *sb,
@@ -1740,40 +2183,108 @@ extern int ext4_group_add(struct super_block *sb,
extern int ext4_group_extend(struct super_block *sb,
struct ext4_super_block *es,
ext4_fsblk_t n_blocks_count);
+extern int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count);
/* super.c */
-extern void __ext4_error(struct super_block *, const char *, unsigned int,
- const char *, ...)
- __attribute__ ((format (printf, 4, 5)));
-#define ext4_error(sb, message...) __ext4_error(sb, __func__, \
- __LINE__, ## message)
-extern void ext4_error_inode(struct inode *, const char *, unsigned int,
- ext4_fsblk_t, const char *, ...)
- __attribute__ ((format (printf, 5, 6)));
-extern void ext4_error_file(struct file *, const char *, unsigned int,
- const char *, ...)
- __attribute__ ((format (printf, 4, 5)));
+extern int ext4_calculate_overhead(struct super_block *sb);
+extern void ext4_superblock_csum_set(struct super_block *sb);
+extern void *ext4_kvmalloc(size_t size, gfp_t flags);
+extern void *ext4_kvzalloc(size_t size, gfp_t flags);
+extern void ext4_kvfree(void *ptr);
+extern int ext4_alloc_flex_bg_array(struct super_block *sb,
+ ext4_group_t ngroup);
+extern const char *ext4_decode_error(struct super_block *sb, int errno,
+ char nbuf[16]);
+
+extern __printf(4, 5)
+void __ext4_error(struct super_block *, const char *, unsigned int,
+ const char *, ...);
+extern __printf(5, 6)
+void __ext4_error_inode(struct inode *, const char *, unsigned int, ext4_fsblk_t,
+ const char *, ...);
+extern __printf(5, 6)
+void __ext4_error_file(struct file *, const char *, unsigned int, ext4_fsblk_t,
+ const char *, ...);
extern void __ext4_std_error(struct super_block *, const char *,
unsigned int, int);
-extern void __ext4_abort(struct super_block *, const char *, unsigned int,
- const char *, ...)
- __attribute__ ((format (printf, 4, 5)));
-#define ext4_abort(sb, message...) __ext4_abort(sb, __func__, \
- __LINE__, ## message)
-extern void __ext4_warning(struct super_block *, const char *, unsigned int,
- const char *, ...)
- __attribute__ ((format (printf, 4, 5)));
-#define ext4_warning(sb, message...) __ext4_warning(sb, __func__, \
- __LINE__, ## message)
-extern void ext4_msg(struct super_block *, const char *, const char *, ...)
- __attribute__ ((format (printf, 3, 4)));
-extern void __ext4_grp_locked_error(const char *, unsigned int, \
- struct super_block *, ext4_group_t, \
- unsigned long, ext4_fsblk_t, \
- const char *, ...)
- __attribute__ ((format (printf, 7, 8)));
-#define ext4_grp_locked_error(sb, grp, message...) \
- __ext4_grp_locked_error(__func__, __LINE__, (sb), (grp), ## message)
+extern __printf(4, 5)
+void __ext4_abort(struct super_block *, const char *, unsigned int,
+ const char *, ...);
+extern __printf(4, 5)
+void __ext4_warning(struct super_block *, const char *, unsigned int,
+ const char *, ...);
+extern __printf(3, 4)
+void __ext4_msg(struct super_block *, const char *, const char *, ...);
+extern void __dump_mmp_msg(struct super_block *, struct mmp_struct *mmp,
+ const char *, unsigned int, const char *);
+extern __printf(7, 8)
+void __ext4_grp_locked_error(const char *, unsigned int,
+ struct super_block *, ext4_group_t,
+ unsigned long, ext4_fsblk_t,
+ const char *, ...);
+
+#ifdef CONFIG_PRINTK
+
+#define ext4_error_inode(inode, func, line, block, fmt, ...) \
+ __ext4_error_inode(inode, func, line, block, fmt, ##__VA_ARGS__)
+#define ext4_error_file(file, func, line, block, fmt, ...) \
+ __ext4_error_file(file, func, line, block, fmt, ##__VA_ARGS__)
+#define ext4_error(sb, fmt, ...) \
+ __ext4_error(sb, __func__, __LINE__, fmt, ##__VA_ARGS__)
+#define ext4_abort(sb, fmt, ...) \
+ __ext4_abort(sb, __func__, __LINE__, fmt, ##__VA_ARGS__)
+#define ext4_warning(sb, fmt, ...) \
+ __ext4_warning(sb, __func__, __LINE__, fmt, ##__VA_ARGS__)
+#define ext4_msg(sb, level, fmt, ...) \
+ __ext4_msg(sb, level, fmt, ##__VA_ARGS__)
+#define dump_mmp_msg(sb, mmp, msg) \
+ __dump_mmp_msg(sb, mmp, __func__, __LINE__, msg)
+#define ext4_grp_locked_error(sb, grp, ino, block, fmt, ...) \
+ __ext4_grp_locked_error(__func__, __LINE__, sb, grp, ino, block, \
+ fmt, ##__VA_ARGS__)
+
+#else
+
+#define ext4_error_inode(inode, func, line, block, fmt, ...) \
+do { \
+ no_printk(fmt, ##__VA_ARGS__); \
+ __ext4_error_inode(inode, "", 0, block, " "); \
+} while (0)
+#define ext4_error_file(file, func, line, block, fmt, ...) \
+do { \
+ no_printk(fmt, ##__VA_ARGS__); \
+ __ext4_error_file(file, "", 0, block, " "); \
+} while (0)
+#define ext4_error(sb, fmt, ...) \
+do { \
+ no_printk(fmt, ##__VA_ARGS__); \
+ __ext4_error(sb, "", 0, " "); \
+} while (0)
+#define ext4_abort(sb, fmt, ...) \
+do { \
+ no_printk(fmt, ##__VA_ARGS__); \
+ __ext4_abort(sb, "", 0, " "); \
+} while (0)
+#define ext4_warning(sb, fmt, ...) \
+do { \
+ no_printk(fmt, ##__VA_ARGS__); \
+ __ext4_warning(sb, "", 0, " "); \
+} while (0)
+#define ext4_msg(sb, level, fmt, ...) \
+do { \
+ no_printk(fmt, ##__VA_ARGS__); \
+ __ext4_msg(sb, "", " "); \
+} while (0)
+#define dump_mmp_msg(sb, mmp, msg) \
+ __dump_mmp_msg(sb, mmp, "", 0, "")
+#define ext4_grp_locked_error(sb, grp, ino, block, fmt, ...) \
+do { \
+ no_printk(fmt, ##__VA_ARGS__); \
+ __ext4_grp_locked_error("", 0, sb, grp, ino, block, " "); \
+} while (0)
+
+#endif
+
extern void ext4_update_dynamic_rev(struct super_block *sb);
extern int ext4_update_compat_feature(handle_t *handle, struct super_block *sb,
__u32 compat);
@@ -1787,8 +2298,8 @@ extern ext4_fsblk_t ext4_inode_bitmap(struct super_block *sb,
struct ext4_group_desc *bg);
extern ext4_fsblk_t ext4_inode_table(struct super_block *sb,
struct ext4_group_desc *bg);
-extern __u32 ext4_free_blks_count(struct super_block *sb,
- struct ext4_group_desc *bg);
+extern __u32 ext4_free_group_clusters(struct super_block *sb,
+ struct ext4_group_desc *bg);
extern __u32 ext4_free_inodes_count(struct super_block *sb,
struct ext4_group_desc *bg);
extern __u32 ext4_used_dirs_count(struct super_block *sb,
@@ -1801,18 +2312,28 @@ extern void ext4_inode_bitmap_set(struct super_block *sb,
struct ext4_group_desc *bg, ext4_fsblk_t blk);
extern void ext4_inode_table_set(struct super_block *sb,
struct ext4_group_desc *bg, ext4_fsblk_t blk);
-extern void ext4_free_blks_set(struct super_block *sb,
- struct ext4_group_desc *bg, __u32 count);
+extern void ext4_free_group_clusters_set(struct super_block *sb,
+ struct ext4_group_desc *bg,
+ __u32 count);
extern void ext4_free_inodes_set(struct super_block *sb,
struct ext4_group_desc *bg, __u32 count);
extern void ext4_used_dirs_set(struct super_block *sb,
struct ext4_group_desc *bg, __u32 count);
extern void ext4_itable_unused_set(struct super_block *sb,
struct ext4_group_desc *bg, __u32 count);
-extern __le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 group,
- struct ext4_group_desc *gdp);
-extern int ext4_group_desc_csum_verify(struct ext4_sb_info *sbi, __u32 group,
+extern int ext4_group_desc_csum_verify(struct super_block *sb, __u32 group,
struct ext4_group_desc *gdp);
+extern void ext4_group_desc_csum_set(struct super_block *sb, __u32 group,
+ struct ext4_group_desc *gdp);
+extern int ext4_register_li_request(struct super_block *sb,
+ ext4_group_t first_not_zeroed);
+
+static inline int ext4_has_group_desc_csum(struct super_block *sb)
+{
+ return EXT4_HAS_RO_COMPAT_FEATURE(sb,
+ EXT4_FEATURE_RO_COMPAT_GDT_CSUM |
+ EXT4_FEATURE_RO_COMPAT_METADATA_CSUM);
+}
static inline ext4_fsblk_t ext4_blocks_count(struct ext4_super_block *es)
{
@@ -1874,6 +2395,7 @@ struct ext4_group_info *ext4_get_group_info(struct super_block *sb,
{
struct ext4_group_info ***grp_info;
long indexv, indexh;
+ BUG_ON(group >= EXT4_SB(sb)->s_groups_count);
grp_info = EXT4_SB(sb)->s_group_info;
indexv = group >> (EXT4_DESC_PER_BLOCK_BITS(sb));
indexh = group & ((EXT4_DESC_PER_BLOCK(sb)) - 1);
@@ -1911,25 +2433,24 @@ do { \
} while (0)
#ifdef CONFIG_SMP
-/* Each CPU can accumulate percpu_counter_batch blocks in their local
- * counters. So we need to make sure we have free blocks more
+/* Each CPU can accumulate percpu_counter_batch clusters in their local
+ * counters. So we need to make sure we have free clusters more
* than percpu_counter_batch * nr_cpu_ids. Also add a window of 4 times.
*/
-#define EXT4_FREEBLOCKS_WATERMARK (4 * (percpu_counter_batch * nr_cpu_ids))
+#define EXT4_FREECLUSTERS_WATERMARK (4 * (percpu_counter_batch * nr_cpu_ids))
#else
-#define EXT4_FREEBLOCKS_WATERMARK 0
+#define EXT4_FREECLUSTERS_WATERMARK 0
#endif
+/* Update i_disksize. Requires i_mutex to avoid races with truncate */
static inline void ext4_update_i_disksize(struct inode *inode, loff_t newsize)
{
- /*
- * XXX: replace with spinlock if seen contended -bzzz
- */
+ WARN_ON_ONCE(S_ISREG(inode->i_mode) &&
+ !mutex_is_locked(&inode->i_mutex));
down_write(&EXT4_I(inode)->i_data_sem);
if (newsize > EXT4_I(inode)->i_disksize)
EXT4_I(inode)->i_disksize = newsize;
up_write(&EXT4_I(inode)->i_data_sem);
- return ;
}
struct ext4_group_info {
@@ -1950,10 +2471,24 @@ struct ext4_group_info {
* 5 free 8-block regions. */
};
-#define EXT4_GROUP_INFO_NEED_INIT_BIT 0
+#define EXT4_GROUP_INFO_NEED_INIT_BIT 0
+#define EXT4_GROUP_INFO_WAS_TRIMMED_BIT 1
+#define EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT 2
+#define EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT 3
#define EXT4_MB_GRP_NEED_INIT(grp) \
(test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state)))
+#define EXT4_MB_GRP_BBITMAP_CORRUPT(grp) \
+ (test_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &((grp)->bb_state)))
+#define EXT4_MB_GRP_IBITMAP_CORRUPT(grp) \
+ (test_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &((grp)->bb_state)))
+
+#define EXT4_MB_GRP_WAS_TRIMMED(grp) \
+ (test_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state)))
+#define EXT4_MB_GRP_SET_TRIMMED(grp) \
+ (set_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state)))
+#define EXT4_MB_GRP_CLEAR_TRIMMED(grp) \
+ (clear_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state)))
#define EXT4_MAX_CONTENTION 8
#define EXT4_CONTENTION_THRESHOLD 2
@@ -1999,11 +2534,18 @@ static inline void ext4_unlock_group(struct super_block *sb,
spin_unlock(ext4_group_lock_ptr(sb, group));
}
-static inline void ext4_mark_super_dirty(struct super_block *sb)
-{
- if (EXT4_SB(sb)->s_journal == NULL)
- sb->s_dirt =1;
-}
+/*
+ * Block validity checking
+ */
+#define ext4_check_indirect_blockref(inode, bh) \
+ ext4_check_blockref(__func__, __LINE__, inode, \
+ (__le32 *)(bh)->b_data, \
+ EXT4_ADDR_PER_BLOCK((inode)->i_sb))
+
+#define ext4_ind_check_inode(inode) \
+ ext4_check_blockref(__func__, __LINE__, inode, \
+ EXT4_I(inode)->i_data, \
+ EXT4_NDIR_BLOCKS)
/*
* Inodes and files operations
@@ -2017,10 +2559,104 @@ extern const struct inode_operations ext4_file_inode_operations;
extern const struct file_operations ext4_file_operations;
extern loff_t ext4_llseek(struct file *file, loff_t offset, int origin);
+/* inline.c */
+extern int ext4_has_inline_data(struct inode *inode);
+extern int ext4_get_max_inline_size(struct inode *inode);
+extern int ext4_find_inline_data_nolock(struct inode *inode);
+extern int ext4_init_inline_data(handle_t *handle, struct inode *inode,
+ unsigned int len);
+extern int ext4_destroy_inline_data(handle_t *handle, struct inode *inode);
+
+extern int ext4_readpage_inline(struct inode *inode, struct page *page);
+extern int ext4_try_to_write_inline_data(struct address_space *mapping,
+ struct inode *inode,
+ loff_t pos, unsigned len,
+ unsigned flags,
+ struct page **pagep);
+extern int ext4_write_inline_data_end(struct inode *inode,
+ loff_t pos, unsigned len,
+ unsigned copied,
+ struct page *page);
+extern struct buffer_head *
+ext4_journalled_write_inline_data(struct inode *inode,
+ unsigned len,
+ struct page *page);
+extern int ext4_da_write_inline_data_begin(struct address_space *mapping,
+ struct inode *inode,
+ loff_t pos, unsigned len,
+ unsigned flags,
+ struct page **pagep,
+ void **fsdata);
+extern int ext4_da_write_inline_data_end(struct inode *inode, loff_t pos,
+ unsigned len, unsigned copied,
+ struct page *page);
+extern int ext4_try_add_inline_entry(handle_t *handle, struct dentry *dentry,
+ struct inode *inode);
+extern int ext4_try_create_inline_dir(handle_t *handle,
+ struct inode *parent,
+ struct inode *inode);
+extern int ext4_read_inline_dir(struct file *filp,
+ struct dir_context *ctx,
+ int *has_inline_data);
+extern int htree_inlinedir_to_tree(struct file *dir_file,
+ struct inode *dir, ext4_lblk_t block,
+ struct dx_hash_info *hinfo,
+ __u32 start_hash, __u32 start_minor_hash,
+ int *has_inline_data);
+extern struct buffer_head *ext4_find_inline_entry(struct inode *dir,
+ const struct qstr *d_name,
+ struct ext4_dir_entry_2 **res_dir,
+ int *has_inline_data);
+extern int ext4_delete_inline_entry(handle_t *handle,
+ struct inode *dir,
+ struct ext4_dir_entry_2 *de_del,
+ struct buffer_head *bh,
+ int *has_inline_data);
+extern int empty_inline_dir(struct inode *dir, int *has_inline_data);
+extern struct buffer_head *ext4_get_first_inline_block(struct inode *inode,
+ struct ext4_dir_entry_2 **parent_de,
+ int *retval);
+extern int ext4_inline_data_fiemap(struct inode *inode,
+ struct fiemap_extent_info *fieinfo,
+ int *has_inline);
+extern int ext4_try_to_evict_inline_data(handle_t *handle,
+ struct inode *inode,
+ int needed);
+extern void ext4_inline_data_truncate(struct inode *inode, int *has_inline);
+
+extern int ext4_convert_inline_data(struct inode *inode);
+
/* namei.c */
extern const struct inode_operations ext4_dir_inode_operations;
extern const struct inode_operations ext4_special_inode_operations;
extern struct dentry *ext4_get_parent(struct dentry *child);
+extern struct ext4_dir_entry_2 *ext4_init_dot_dotdot(struct inode *inode,
+ struct ext4_dir_entry_2 *de,
+ int blocksize, int csum_size,
+ unsigned int parent_ino, int dotdot_real_len);
+extern void initialize_dirent_tail(struct ext4_dir_entry_tail *t,
+ unsigned int blocksize);
+extern int ext4_handle_dirty_dirent_node(handle_t *handle,
+ struct inode *inode,
+ struct buffer_head *bh);
+#define S_SHIFT 12
+static unsigned char ext4_type_by_mode[S_IFMT >> S_SHIFT] = {
+ [S_IFREG >> S_SHIFT] = EXT4_FT_REG_FILE,
+ [S_IFDIR >> S_SHIFT] = EXT4_FT_DIR,
+ [S_IFCHR >> S_SHIFT] = EXT4_FT_CHRDEV,
+ [S_IFBLK >> S_SHIFT] = EXT4_FT_BLKDEV,
+ [S_IFIFO >> S_SHIFT] = EXT4_FT_FIFO,
+ [S_IFSOCK >> S_SHIFT] = EXT4_FT_SOCK,
+ [S_IFLNK >> S_SHIFT] = EXT4_FT_SYMLINK,
+};
+
+static inline void ext4_set_de_type(struct super_block *sb,
+ struct ext4_dir_entry_2 *de,
+ umode_t mode) {
+ if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FILETYPE))
+ de->file_type = ext4_type_by_mode[(mode & S_IFMT)>>S_SHIFT];
+}
+
/* symlink.c */
extern const struct inode_operations ext4_symlink_inode_operations;
@@ -2034,54 +2670,104 @@ extern void ext4_exit_system_zone(void);
extern int ext4_data_block_valid(struct ext4_sb_info *sbi,
ext4_fsblk_t start_blk,
unsigned int count);
+extern int ext4_check_blockref(const char *, unsigned int,
+ struct inode *, __le32 *, unsigned int);
/* extents.c */
+struct ext4_ext_path;
+struct ext4_extent;
+
+/*
+ * Maximum number of logical blocks in a file; ext4_extent's ee_block is
+ * __le32.
+ */
+#define EXT_MAX_BLOCKS 0xffffffff
+
extern int ext4_ext_tree_init(handle_t *handle, struct inode *);
extern int ext4_ext_writepage_trans_blocks(struct inode *, int);
-extern int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks,
- int chunk);
+extern int ext4_ext_index_trans_blocks(struct inode *inode, int extents);
extern int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
struct ext4_map_blocks *map, int flags);
-extern void ext4_ext_truncate(struct inode *);
+extern void ext4_ext_truncate(handle_t *, struct inode *);
+extern int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
+ ext4_lblk_t end);
extern void ext4_ext_init(struct super_block *);
extern void ext4_ext_release(struct super_block *);
-extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset,
+extern long ext4_fallocate(struct file *file, int mode, loff_t offset,
loff_t len);
-extern int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
- ssize_t len);
+extern int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode,
+ loff_t offset, ssize_t len);
extern int ext4_map_blocks(handle_t *handle, struct inode *inode,
struct ext4_map_blocks *map, int flags);
+extern int ext4_ext_calc_metadata_amount(struct inode *inode,
+ ext4_lblk_t lblocks);
+extern int ext4_extent_tree_init(handle_t *, struct inode *);
+extern int ext4_ext_calc_credits_for_single_extent(struct inode *inode,
+ int num,
+ struct ext4_ext_path *path);
+extern int ext4_can_extents_be_merged(struct inode *inode,
+ struct ext4_extent *ex1,
+ struct ext4_extent *ex2);
+extern int ext4_ext_insert_extent(handle_t *, struct inode *,
+ struct ext4_ext_path *,
+ struct ext4_extent *, int);
+extern struct ext4_ext_path *ext4_ext_find_extent(struct inode *, ext4_lblk_t,
+ struct ext4_ext_path *,
+ int flags);
+extern void ext4_ext_drop_refs(struct ext4_ext_path *);
+extern int ext4_ext_check_inode(struct inode *inode);
+extern int ext4_find_delalloc_range(struct inode *inode,
+ ext4_lblk_t lblk_start,
+ ext4_lblk_t lblk_end);
+extern int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk);
extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
__u64 start, __u64 len);
+extern int ext4_ext_precache(struct inode *inode);
+extern int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len);
+
/* move_extent.c */
+extern void ext4_double_down_write_data_sem(struct inode *first,
+ struct inode *second);
+extern void ext4_double_up_write_data_sem(struct inode *orig_inode,
+ struct inode *donor_inode);
extern int ext4_move_extents(struct file *o_filp, struct file *d_filp,
__u64 start_orig, __u64 start_donor,
__u64 len, __u64 *moved_len);
+extern int mext_next_extent(struct inode *inode, struct ext4_ext_path *path,
+ struct ext4_extent **extent);
/* page-io.c */
extern int __init ext4_init_pageio(void);
extern void ext4_exit_pageio(void);
-extern void ext4_ioend_wait(struct inode *);
-extern void ext4_free_io_end(ext4_io_end_t *io);
extern ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags);
-extern int ext4_end_io_nolock(ext4_io_end_t *io);
+extern ext4_io_end_t *ext4_get_io_end(ext4_io_end_t *io_end);
+extern int ext4_put_io_end(ext4_io_end_t *io_end);
+extern void ext4_put_io_end_defer(ext4_io_end_t *io_end);
+extern void ext4_io_submit_init(struct ext4_io_submit *io,
+ struct writeback_control *wbc);
+extern void ext4_end_io_rsv_work(struct work_struct *work);
extern void ext4_io_submit(struct ext4_io_submit *io);
extern int ext4_bio_write_page(struct ext4_io_submit *io,
struct page *page,
int len,
- struct writeback_control *wbc);
+ struct writeback_control *wbc,
+ bool keep_towrite);
-/* BH_Uninit flag: blocks are allocated but uninitialized on disk */
+/* mmp.c */
+extern int ext4_multi_mount_protect(struct super_block *, ext4_fsblk_t);
+
+/*
+ * Note that these flags will never ever appear in a buffer_head's state flag.
+ * See EXT4_MAP_... to see where this is used.
+ */
enum ext4_state_bits {
- BH_Uninit /* blocks are allocated but uninitialized on disk */
- = BH_JBDPrivateStart,
+ BH_AllocFromCluster /* allocated blocks were part of already
+ * allocated cluster. */
+ = BH_JBDPrivateStart
};
-BUFFER_FNS(Uninit, uninit)
-TAS_BUFFER_FNS(Uninit, uninit)
-
/*
- * Add new method to test wether block and inode bitmaps are properly
+ * Add new method to test whether block and inode bitmaps are properly
* initialized. With uninit_bg reading the block from disk is not enough
* to mark the bitmap uptodate. We need to also zero-out the bitmap
*/
@@ -2097,8 +2783,36 @@ static inline void set_bitmap_uptodate(struct buffer_head *bh)
set_bit(BH_BITMAP_UPTODATE, &(bh)->b_state);
}
+/*
+ * Disable DIO read nolock optimization, so new dioreaders will be forced
+ * to grab i_mutex
+ */
+static inline void ext4_inode_block_unlocked_dio(struct inode *inode)
+{
+ ext4_set_inode_state(inode, EXT4_STATE_DIOREAD_LOCK);
+ smp_mb();
+}
+static inline void ext4_inode_resume_unlocked_dio(struct inode *inode)
+{
+ smp_mb();
+ ext4_clear_inode_state(inode, EXT4_STATE_DIOREAD_LOCK);
+}
+
#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1)
+/* For ioend & aio unwritten conversion wait queues */
+#define EXT4_WQ_HASH_SZ 37
+#define ext4_ioend_wq(v) (&ext4__ioend_wq[((unsigned long)(v)) %\
+ EXT4_WQ_HASH_SZ])
+#define ext4_aio_mutex(v) (&ext4__aio_mutex[((unsigned long)(v)) %\
+ EXT4_WQ_HASH_SZ])
+extern wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ];
+extern struct mutex ext4__aio_mutex[EXT4_WQ_HASH_SZ];
+
+#define EXT4_RESIZING 0
+extern int ext4_resize_begin(struct super_block *sb);
+extern void ext4_resize_end(struct super_block *sb);
+
#endif /* __KERNEL__ */
#endif /* _EXT4_H */
diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
index 28ce70fd9cd..a867f5ca999 100644
--- a/fs/ext4/ext4_extents.h
+++ b/fs/ext4/ext4_extents.h
@@ -43,16 +43,6 @@
#define CHECK_BINSEARCH__
/*
- * Turn on EXT_DEBUG to get lots of info about extents operations.
- */
-#define EXT_DEBUG__
-#ifdef EXT_DEBUG
-#define ext_debug(a...) printk(a)
-#else
-#define ext_debug(a...)
-#endif
-
-/*
* If EXT_STATS is defined then stats numbers are collected.
* These number will be displayed at umount time.
*/
@@ -63,9 +53,22 @@
* ext4_inode has i_block array (60 bytes total).
* The first 12 bytes store ext4_extent_header;
* the remainder stores an array of ext4_extent.
+ * For non-inode extent blocks, ext4_extent_tail
+ * follows the array.
*/
/*
+ * This is the extent tail on-disk structure.
+ * All other extent structures are 12 bytes long. It turns out that
+ * block_size % 12 >= 4 for at least all powers of 2 greater than 512, which
+ * covers all valid ext4 block sizes. Therefore, this tail structure can be
+ * crammed into the end of the block without having to rebalance the tree.
+ */
+struct ext4_extent_tail {
+ __le32 et_checksum; /* crc32c(uuid+inum+extent_block) */
+};
+
+/*
* This is the extent on-disk structure.
* It's used at the bottom of the tree.
*/
@@ -101,6 +104,17 @@ struct ext4_extent_header {
#define EXT4_EXT_MAGIC cpu_to_le16(0xf30a)
+#define EXT4_EXTENT_TAIL_OFFSET(hdr) \
+ (sizeof(struct ext4_extent_header) + \
+ (sizeof(struct ext4_extent) * le16_to_cpu((hdr)->eh_max)))
+
+static inline struct ext4_extent_tail *
+find_ext4_extent_tail(struct ext4_extent_header *eh)
+{
+ return (struct ext4_extent_tail *)(((void *)eh) +
+ EXT4_EXTENT_TAIL_OFFSET(eh));
+}
+
/*
* Array of ext4_ext_path contains path to some extent.
* Creation/lookup routines use it for traversal/splitting/etc.
@@ -119,46 +133,25 @@ struct ext4_ext_path {
* structure for external API
*/
-#define EXT4_EXT_CACHE_NO 0
-#define EXT4_EXT_CACHE_GAP 1
-#define EXT4_EXT_CACHE_EXTENT 2
-
-/*
- * to be called by ext4_ext_walk_space()
- * negative retcode - error
- * positive retcode - signal for ext4_ext_walk_space(), see below
- * callback must return valid extent (passed or newly created)
- */
-typedef int (*ext_prepare_callback)(struct inode *, struct ext4_ext_path *,
- struct ext4_ext_cache *,
- struct ext4_extent *, void *);
-
-#define EXT_CONTINUE 0
-#define EXT_BREAK 1
-#define EXT_REPEAT 2
-
-/* Maximum logical block in a file; ext4_extent's ee_block is __le32 */
-#define EXT_MAX_BLOCK 0xffffffff
-
/*
* EXT_INIT_MAX_LEN is the maximum number of blocks we can have in an
* initialized extent. This is 2^15 and not (2^16 - 1), since we use the
* MSB of ee_len field in the extent datastructure to signify if this
- * particular extent is an initialized extent or an uninitialized (i.e.
+ * particular extent is an initialized extent or an unwritten (i.e.
* preallocated).
- * EXT_UNINIT_MAX_LEN is the maximum number of blocks we can have in an
- * uninitialized extent.
+ * EXT_UNWRITTEN_MAX_LEN is the maximum number of blocks we can have in an
+ * unwritten extent.
* If ee_len is <= 0x8000, it is an initialized extent. Otherwise, it is an
- * uninitialized one. In other words, if MSB of ee_len is set, it is an
- * uninitialized extent with only one special scenario when ee_len = 0x8000.
- * In this case we can not have an uninitialized extent of zero length and
+ * unwritten one. In other words, if MSB of ee_len is set, it is an
+ * unwritten extent with only one special scenario when ee_len = 0x8000.
+ * In this case we can not have an unwritten extent of zero length and
* thus we make it as a special case of initialized extent with 0x8000 length.
* This way we get better extent-to-group alignment for initialized extents.
* Hence, the maximum number of blocks we can have in an *initialized*
- * extent is 2^15 (32768) and in an *uninitialized* extent is 2^15-1 (32767).
+ * extent is 2^15 (32768) and in an *unwritten* extent is 2^15-1 (32767).
*/
#define EXT_INIT_MAX_LEN (1UL << 15)
-#define EXT_UNINIT_MAX_LEN (EXT_INIT_MAX_LEN - 1)
+#define EXT_UNWRITTEN_MAX_LEN (EXT_INIT_MAX_LEN - 1)
#define EXT_FIRST_EXTENT(__hdr__) \
@@ -194,20 +187,14 @@ static inline unsigned short ext_depth(struct inode *inode)
return le16_to_cpu(ext_inode_hdr(inode)->eh_depth);
}
-static inline void
-ext4_ext_invalidate_cache(struct inode *inode)
+static inline void ext4_ext_mark_unwritten(struct ext4_extent *ext)
{
- EXT4_I(inode)->i_cached_extent.ec_type = EXT4_EXT_CACHE_NO;
-}
-
-static inline void ext4_ext_mark_uninitialized(struct ext4_extent *ext)
-{
- /* We can not have an uninitialized extent of zero length! */
+ /* We can not have an unwritten extent of zero length! */
BUG_ON((le16_to_cpu(ext->ee_len) & ~EXT_INIT_MAX_LEN) == 0);
ext->ee_len |= cpu_to_le16(EXT_INIT_MAX_LEN);
}
-static inline int ext4_ext_is_uninitialized(struct ext4_extent *ext)
+static inline int ext4_ext_is_unwritten(struct ext4_extent *ext)
{
/* Extent with ee_len of 0x8000 is treated as an initialized extent */
return (le16_to_cpu(ext->ee_len) > EXT_INIT_MAX_LEN);
@@ -277,19 +264,10 @@ static inline void ext4_idx_store_pblock(struct ext4_extent_idx *ix,
0xffff);
}
-extern int ext4_ext_calc_metadata_amount(struct inode *inode,
- sector_t lblocks);
-extern int ext4_extent_tree_init(handle_t *, struct inode *);
-extern int ext4_ext_calc_credits_for_single_extent(struct inode *inode,
- int num,
- struct ext4_ext_path *path);
-extern int ext4_can_extents_be_merged(struct inode *inode,
- struct ext4_extent *ex1,
- struct ext4_extent *ex2);
-extern int ext4_ext_insert_extent(handle_t *, struct inode *, struct ext4_ext_path *, struct ext4_extent *, int);
-extern struct ext4_ext_path *ext4_ext_find_extent(struct inode *, ext4_lblk_t,
- struct ext4_ext_path *);
-extern void ext4_ext_drop_refs(struct ext4_ext_path *);
-extern int ext4_ext_check_inode(struct inode *inode);
+#define ext4_ext_dirty(handle, inode, path) \
+ __ext4_ext_dirty(__func__, __LINE__, (handle), (inode), (path))
+int __ext4_ext_dirty(const char *where, unsigned int line, handle_t *handle,
+ struct inode *inode, struct ext4_ext_path *path);
+
#endif /* _EXT4_EXTENTS */
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index 6e272ef6ba9..0074e0d23d6 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -6,25 +6,154 @@
#include <trace/events/ext4.h>
-int __ext4_journal_get_undo_access(const char *where, unsigned int line,
- handle_t *handle, struct buffer_head *bh)
+/* Just increment the non-pointer handle value */
+static handle_t *ext4_get_nojournal(void)
{
- int err = 0;
+ handle_t *handle = current->journal_info;
+ unsigned long ref_cnt = (unsigned long)handle;
- if (ext4_handle_valid(handle)) {
- err = jbd2_journal_get_undo_access(handle, bh);
- if (err)
- ext4_journal_abort_handle(where, line, __func__, bh,
- handle, err);
+ BUG_ON(ref_cnt >= EXT4_NOJOURNAL_MAX_REF_COUNT);
+
+ ref_cnt++;
+ handle = (handle_t *)ref_cnt;
+
+ current->journal_info = handle;
+ return handle;
+}
+
+
+/* Decrement the non-pointer handle value */
+static void ext4_put_nojournal(handle_t *handle)
+{
+ unsigned long ref_cnt = (unsigned long)handle;
+
+ BUG_ON(ref_cnt == 0);
+
+ ref_cnt--;
+ handle = (handle_t *)ref_cnt;
+
+ current->journal_info = handle;
+}
+
+/*
+ * Wrappers for jbd2_journal_start/end.
+ */
+static int ext4_journal_check_start(struct super_block *sb)
+{
+ journal_t *journal;
+
+ might_sleep();
+ if (sb->s_flags & MS_RDONLY)
+ return -EROFS;
+ WARN_ON(sb->s_writers.frozen == SB_FREEZE_COMPLETE);
+ journal = EXT4_SB(sb)->s_journal;
+ /*
+ * Special case here: if the journal has aborted behind our
+ * backs (eg. EIO in the commit thread), then we still need to
+ * take the FS itself readonly cleanly.
+ */
+ if (journal && is_journal_aborted(journal)) {
+ ext4_abort(sb, "Detected aborted journal");
+ return -EROFS;
+ }
+ return 0;
+}
+
+handle_t *__ext4_journal_start_sb(struct super_block *sb, unsigned int line,
+ int type, int blocks, int rsv_blocks)
+{
+ journal_t *journal;
+ int err;
+
+ trace_ext4_journal_start(sb, blocks, rsv_blocks, _RET_IP_);
+ err = ext4_journal_check_start(sb);
+ if (err < 0)
+ return ERR_PTR(err);
+
+ journal = EXT4_SB(sb)->s_journal;
+ if (!journal)
+ return ext4_get_nojournal();
+ return jbd2__journal_start(journal, blocks, rsv_blocks, GFP_NOFS,
+ type, line);
+}
+
+int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle)
+{
+ struct super_block *sb;
+ int err;
+ int rc;
+
+ if (!ext4_handle_valid(handle)) {
+ ext4_put_nojournal(handle);
+ return 0;
}
+ sb = handle->h_transaction->t_journal->j_private;
+ err = handle->h_err;
+ rc = jbd2_journal_stop(handle);
+
+ if (!err)
+ err = rc;
+ if (err)
+ __ext4_std_error(sb, where, line, err);
return err;
}
+handle_t *__ext4_journal_start_reserved(handle_t *handle, unsigned int line,
+ int type)
+{
+ struct super_block *sb;
+ int err;
+
+ if (!ext4_handle_valid(handle))
+ return ext4_get_nojournal();
+
+ sb = handle->h_journal->j_private;
+ trace_ext4_journal_start_reserved(sb, handle->h_buffer_credits,
+ _RET_IP_);
+ err = ext4_journal_check_start(sb);
+ if (err < 0) {
+ jbd2_journal_free_reserved(handle);
+ return ERR_PTR(err);
+ }
+
+ err = jbd2_journal_start_reserved(handle, type, line);
+ if (err < 0)
+ return ERR_PTR(err);
+ return handle;
+}
+
+static void ext4_journal_abort_handle(const char *caller, unsigned int line,
+ const char *err_fn,
+ struct buffer_head *bh,
+ handle_t *handle, int err)
+{
+ char nbuf[16];
+ const char *errstr = ext4_decode_error(NULL, err, nbuf);
+
+ BUG_ON(!ext4_handle_valid(handle));
+
+ if (bh)
+ BUFFER_TRACE(bh, "abort");
+
+ if (!handle->h_err)
+ handle->h_err = err;
+
+ if (is_handle_aborted(handle))
+ return;
+
+ printk(KERN_ERR "EXT4-fs: %s:%d: aborting transaction: %s in %s\n",
+ caller, line, errstr, err_fn);
+
+ jbd2_journal_abort_handle(handle);
+}
+
int __ext4_journal_get_write_access(const char *where, unsigned int line,
handle_t *handle, struct buffer_head *bh)
{
int err = 0;
+ might_sleep();
+
if (ext4_handle_valid(handle)) {
err = jbd2_journal_get_write_access(handle, bh);
if (err)
@@ -121,11 +250,36 @@ int __ext4_handle_dirty_metadata(const char *where, unsigned int line,
{
int err = 0;
+ might_sleep();
+
+ set_buffer_meta(bh);
+ set_buffer_prio(bh);
if (ext4_handle_valid(handle)) {
err = jbd2_journal_dirty_metadata(handle, bh);
- if (err)
- ext4_journal_abort_handle(where, line, __func__,
- bh, handle, err);
+ /* Errors can only happen if there is a bug */
+ if (WARN_ON_ONCE(err)) {
+ ext4_journal_abort_handle(where, line, __func__, bh,
+ handle, err);
+ if (inode == NULL) {
+ pr_err("EXT4: jbd2_journal_dirty_metadata "
+ "failed: handle type %u started at "
+ "line %u, credits %u/%u, errcode %d",
+ handle->h_type,
+ handle->h_line_no,
+ handle->h_requested_credits,
+ handle->h_buffer_credits, err);
+ return err;
+ }
+ ext4_error_inode(inode, where, line,
+ bh->b_blocknr,
+ "journal_dirty_metadata failed: "
+ "handle type %u started at line %u, "
+ "credits %u/%u, errcode %d",
+ handle->h_type,
+ handle->h_line_no,
+ handle->h_requested_credits,
+ handle->h_buffer_credits, err);
+ }
} else {
if (inode)
mark_buffer_dirty_inode(bh, inode);
@@ -155,12 +309,13 @@ int __ext4_handle_dirty_super(const char *where, unsigned int line,
struct buffer_head *bh = EXT4_SB(sb)->s_sbh;
int err = 0;
+ ext4_superblock_csum_set(sb);
if (ext4_handle_valid(handle)) {
err = jbd2_journal_dirty_metadata(handle, bh);
if (err)
ext4_journal_abort_handle(where, line, __func__,
bh, handle, err);
} else
- sb->s_dirt = 1;
+ mark_buffer_dirty(bh);
return err;
}
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index b0bd792c58c..17c00ff202f 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -29,11 +29,13 @@
* block to complete the transaction.
*
* For extents-enabled fs we may have to allocate and modify up to
- * 5 levels of tree + root which are stored in the inode. */
+ * 5 levels of tree, data block (for each of these we need bitmap + group
+ * summaries), root which is stored in the inode, sb
+ */
#define EXT4_SINGLEDATA_TRANS_BLOCKS(sb) \
(EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS) \
- ? 27U : 8U)
+ ? 20U : 8U)
/* Extended attribute operations touch at most two data buffers,
* two bitmap buffers, and two group summaries, in addition to the inode
@@ -59,12 +61,6 @@
#define EXT4_META_TRANS_BLOCKS(sb) (EXT4_XATTR_TRANS_BLOCKS + \
EXT4_MAXQUOTAS_TRANS_BLOCKS(sb))
-/* Delete operations potentially hit one directory's namespace plus an
- * entire inode, plus arbitrary amounts of bitmap/indirection data. Be
- * generous. We can grow the delete transaction later if necessary. */
-
-#define EXT4_DELETE_TRANS_BLOCKS(sb) (2 * EXT4_DATA_TRANS_BLOCKS(sb) + 64)
-
/* Define an arbitrary limit for the amount of data we will anticipate
* writing to any given transaction. For unbounded transactions such as
* write(2) and truncate(2) we can write more than this, but we always
@@ -86,15 +82,21 @@
#ifdef CONFIG_QUOTA
/* Amount of blocks needed for quota update - we know that the structure was
- * allocated so we need to update only inode+data */
-#define EXT4_QUOTA_TRANS_BLOCKS(sb) (test_opt(sb, QUOTA) ? 2 : 0)
+ * allocated so we need to update only data block */
+#define EXT4_QUOTA_TRANS_BLOCKS(sb) ((test_opt(sb, QUOTA) ||\
+ EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA)) ?\
+ 1 : 0)
/* Amount of blocks needed for quota insert/delete - we do some block writes
* but inode, sb and group updates are done only once */
-#define EXT4_QUOTA_INIT_BLOCKS(sb) (test_opt(sb, QUOTA) ? (DQUOT_INIT_ALLOC*\
- (EXT4_SINGLEDATA_TRANS_BLOCKS(sb)-3)+3+DQUOT_INIT_REWRITE) : 0)
-
-#define EXT4_QUOTA_DEL_BLOCKS(sb) (test_opt(sb, QUOTA) ? (DQUOT_DEL_ALLOC*\
- (EXT4_SINGLEDATA_TRANS_BLOCKS(sb)-3)+3+DQUOT_DEL_REWRITE) : 0)
+#define EXT4_QUOTA_INIT_BLOCKS(sb) ((test_opt(sb, QUOTA) ||\
+ EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA)) ?\
+ (DQUOT_INIT_ALLOC*(EXT4_SINGLEDATA_TRANS_BLOCKS(sb)-3)\
+ +3+DQUOT_INIT_REWRITE) : 0)
+
+#define EXT4_QUOTA_DEL_BLOCKS(sb) ((test_opt(sb, QUOTA) ||\
+ EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA)) ?\
+ (DQUOT_DEL_ALLOC*(EXT4_SINGLEDATA_TRANS_BLOCKS(sb)-3)\
+ +3+DQUOT_DEL_REWRITE) : 0)
#else
#define EXT4_QUOTA_TRANS_BLOCKS(sb) 0
#define EXT4_QUOTA_INIT_BLOCKS(sb) 0
@@ -104,6 +106,113 @@
#define EXT4_MAXQUOTAS_INIT_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_INIT_BLOCKS(sb))
#define EXT4_MAXQUOTAS_DEL_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_DEL_BLOCKS(sb))
+static inline int ext4_jbd2_credits_xattr(struct inode *inode)
+{
+ int credits = EXT4_DATA_TRANS_BLOCKS(inode->i_sb);
+
+ /*
+ * In case of inline data, we may push out the data to a block,
+ * so we need to reserve credits for this eventuality
+ */
+ if (ext4_has_inline_data(inode))
+ credits += ext4_writepage_trans_blocks(inode) + 1;
+ return credits;
+}
+
+
+/*
+ * Ext4 handle operation types -- for logging purposes
+ */
+#define EXT4_HT_MISC 0
+#define EXT4_HT_INODE 1
+#define EXT4_HT_WRITE_PAGE 2
+#define EXT4_HT_MAP_BLOCKS 3
+#define EXT4_HT_DIR 4
+#define EXT4_HT_TRUNCATE 5
+#define EXT4_HT_QUOTA 6
+#define EXT4_HT_RESIZE 7
+#define EXT4_HT_MIGRATE 8
+#define EXT4_HT_MOVE_EXTENTS 9
+#define EXT4_HT_XATTR 10
+#define EXT4_HT_EXT_CONVERT 11
+#define EXT4_HT_MAX 12
+
+/**
+ * struct ext4_journal_cb_entry - Base structure for callback information.
+ *
+ * This struct is a 'seed' structure for a using with your own callback
+ * structs. If you are using callbacks you must allocate one of these
+ * or another struct of your own definition which has this struct
+ * as it's first element and pass it to ext4_journal_callback_add().
+ */
+struct ext4_journal_cb_entry {
+ /* list information for other callbacks attached to the same handle */
+ struct list_head jce_list;
+
+ /* Function to call with this callback structure */
+ void (*jce_func)(struct super_block *sb,
+ struct ext4_journal_cb_entry *jce, int error);
+
+ /* user data goes here */
+};
+
+/**
+ * ext4_journal_callback_add: add a function to call after transaction commit
+ * @handle: active journal transaction handle to register callback on
+ * @func: callback function to call after the transaction has committed:
+ * @sb: superblock of current filesystem for transaction
+ * @jce: returned journal callback data
+ * @rc: journal state at commit (0 = transaction committed properly)
+ * @jce: journal callback data (internal and function private data struct)
+ *
+ * The registered function will be called in the context of the journal thread
+ * after the transaction for which the handle was created has completed.
+ *
+ * No locks are held when the callback function is called, so it is safe to
+ * call blocking functions from within the callback, but the callback should
+ * not block or run for too long, or the filesystem will be blocked waiting for
+ * the next transaction to commit. No journaling functions can be used, or
+ * there is a risk of deadlock.
+ *
+ * There is no guaranteed calling order of multiple registered callbacks on
+ * the same transaction.
+ */
+static inline void ext4_journal_callback_add(handle_t *handle,
+ void (*func)(struct super_block *sb,
+ struct ext4_journal_cb_entry *jce,
+ int rc),
+ struct ext4_journal_cb_entry *jce)
+{
+ struct ext4_sb_info *sbi =
+ EXT4_SB(handle->h_transaction->t_journal->j_private);
+
+ /* Add the jce to transaction's private list */
+ jce->jce_func = func;
+ spin_lock(&sbi->s_md_lock);
+ list_add_tail(&jce->jce_list, &handle->h_transaction->t_private_list);
+ spin_unlock(&sbi->s_md_lock);
+}
+
+/**
+ * ext4_journal_callback_del: delete a registered callback
+ * @handle: active journal transaction handle on which callback was registered
+ * @jce: registered journal callback entry to unregister
+ * Return true if object was successfully removed
+ */
+static inline bool ext4_journal_callback_try_del(handle_t *handle,
+ struct ext4_journal_cb_entry *jce)
+{
+ bool deleted;
+ struct ext4_sb_info *sbi =
+ EXT4_SB(handle->h_transaction->t_journal->j_private);
+
+ spin_lock(&sbi->s_md_lock);
+ deleted = !list_empty(&jce->jce_list);
+ list_del_init(&jce->jce_list);
+ spin_unlock(&sbi->s_md_lock);
+ return deleted;
+}
+
int
ext4_mark_iloc_dirty(handle_t *handle,
struct inode *inode,
@@ -122,13 +231,6 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode);
/*
* Wrapper functions with which ext4 calls into JBD.
*/
-void ext4_journal_abort_handle(const char *caller, unsigned int line,
- const char *err_fn,
- struct buffer_head *bh, handle_t *handle, int err);
-
-int __ext4_journal_get_undo_access(const char *where, unsigned int line,
- handle_t *handle, struct buffer_head *bh);
-
int __ext4_journal_get_write_access(const char *where, unsigned int line,
handle_t *handle, struct buffer_head *bh);
@@ -146,8 +248,6 @@ int __ext4_handle_dirty_metadata(const char *where, unsigned int line,
int __ext4_handle_dirty_super(const char *where, unsigned int line,
handle_t *handle, struct super_block *sb);
-#define ext4_journal_get_undo_access(handle, bh) \
- __ext4_journal_get_undo_access(__func__, __LINE__, (handle), (bh))
#define ext4_journal_get_write_access(handle, bh) \
__ext4_journal_get_write_access(__func__, __LINE__, (handle), (bh))
#define ext4_forget(handle, is_metadata, inode, bh, block_nr) \
@@ -161,7 +261,8 @@ int __ext4_handle_dirty_super(const char *where, unsigned int line,
#define ext4_handle_dirty_super(handle, sb) \
__ext4_handle_dirty_super(__func__, __LINE__, (handle), (sb))
-handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks);
+handle_t *__ext4_journal_start_sb(struct super_block *sb, unsigned int line,
+ int type, int blocks, int rsv_blocks);
int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle);
#define EXT4_NOJOURNAL_MAX_REF_COUNT ((unsigned long) 4096)
@@ -181,13 +282,6 @@ static inline void ext4_handle_sync(handle_t *handle)
handle->h_sync = 1;
}
-static inline void ext4_handle_release_buffer(handle_t *handle,
- struct buffer_head *bh)
-{
- if (ext4_handle_valid(handle))
- jbd2_journal_release_buffer(handle, bh);
-}
-
static inline int ext4_handle_is_aborted(handle_t *handle)
{
if (ext4_handle_valid(handle))
@@ -202,21 +296,38 @@ static inline int ext4_handle_has_enough_credits(handle_t *handle, int needed)
return 1;
}
-static inline void ext4_journal_release_buffer(handle_t *handle,
- struct buffer_head *bh)
-{
- if (ext4_handle_valid(handle))
- jbd2_journal_release_buffer(handle, bh);
-}
+#define ext4_journal_start_sb(sb, type, nblocks) \
+ __ext4_journal_start_sb((sb), __LINE__, (type), (nblocks), 0)
+
+#define ext4_journal_start(inode, type, nblocks) \
+ __ext4_journal_start((inode), __LINE__, (type), (nblocks), 0)
-static inline handle_t *ext4_journal_start(struct inode *inode, int nblocks)
+#define ext4_journal_start_with_reserve(inode, type, blocks, rsv_blocks) \
+ __ext4_journal_start((inode), __LINE__, (type), (blocks), (rsv_blocks))
+
+static inline handle_t *__ext4_journal_start(struct inode *inode,
+ unsigned int line, int type,
+ int blocks, int rsv_blocks)
{
- return ext4_journal_start_sb(inode->i_sb, nblocks);
+ return __ext4_journal_start_sb(inode->i_sb, line, type, blocks,
+ rsv_blocks);
}
#define ext4_journal_stop(handle) \
__ext4_journal_stop(__func__, __LINE__, (handle))
+#define ext4_journal_start_reserved(handle, type) \
+ __ext4_journal_start_reserved((handle), __LINE__, (type))
+
+handle_t *__ext4_journal_start_reserved(handle_t *handle, unsigned int line,
+ int type);
+
+static inline void ext4_journal_free_reserved(handle_t *handle)
+{
+ if (ext4_handle_valid(handle))
+ jbd2_journal_free_reserved(handle);
+}
+
static inline handle_t *ext4_journal_current_handle(void)
{
return journal_current_handle();
@@ -253,7 +364,7 @@ static inline int ext4_journal_force_commit(journal_t *journal)
static inline int ext4_jbd2_file_inode(handle_t *handle, struct inode *inode)
{
if (ext4_handle_valid(handle))
- return jbd2_journal_file_inode(handle, &EXT4_I(inode)->jinode);
+ return jbd2_journal_file_inode(handle, EXT4_I(inode)->jinode);
return 0;
}
@@ -273,43 +384,45 @@ static inline void ext4_update_inode_fsync_trans(handle_t *handle,
/* super.c */
int ext4_force_commit(struct super_block *sb);
-static inline int ext4_should_journal_data(struct inode *inode)
+/*
+ * Ext4 inode journal modes
+ */
+#define EXT4_INODE_JOURNAL_DATA_MODE 0x01 /* journal data mode */
+#define EXT4_INODE_ORDERED_DATA_MODE 0x02 /* ordered data mode */
+#define EXT4_INODE_WRITEBACK_DATA_MODE 0x04 /* writeback data mode */
+
+static inline int ext4_inode_journal_mode(struct inode *inode)
{
if (EXT4_JOURNAL(inode) == NULL)
- return 0;
- if (!S_ISREG(inode->i_mode))
- return 1;
- if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)
- return 1;
- if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA))
- return 1;
- return 0;
+ return EXT4_INODE_WRITEBACK_DATA_MODE; /* writeback */
+ /* We do not support data journalling with delayed allocation */
+ if (!S_ISREG(inode->i_mode) ||
+ test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)
+ return EXT4_INODE_JOURNAL_DATA_MODE; /* journal data */
+ if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA) &&
+ !test_opt(inode->i_sb, DELALLOC))
+ return EXT4_INODE_JOURNAL_DATA_MODE; /* journal data */
+ if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA)
+ return EXT4_INODE_ORDERED_DATA_MODE; /* ordered */
+ if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)
+ return EXT4_INODE_WRITEBACK_DATA_MODE; /* writeback */
+ else
+ BUG();
+}
+
+static inline int ext4_should_journal_data(struct inode *inode)
+{
+ return ext4_inode_journal_mode(inode) & EXT4_INODE_JOURNAL_DATA_MODE;
}
static inline int ext4_should_order_data(struct inode *inode)
{
- if (EXT4_JOURNAL(inode) == NULL)
- return 0;
- if (!S_ISREG(inode->i_mode))
- return 0;
- if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA))
- return 0;
- if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA)
- return 1;
- return 0;
+ return ext4_inode_journal_mode(inode) & EXT4_INODE_ORDERED_DATA_MODE;
}
static inline int ext4_should_writeback_data(struct inode *inode)
{
- if (!S_ISREG(inode->i_mode))
- return 0;
- if (EXT4_JOURNAL(inode) == NULL)
- return 1;
- if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA))
- return 0;
- if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)
- return 1;
- return 0;
+ return ext4_inode_journal_mode(inode) & EXT4_INODE_WRITEBACK_DATA_MODE;
}
/*
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 0554c48cb1f..4da228a0e6d 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -29,7 +29,6 @@
* - smart tree reduction
*/
-#include <linux/module.h>
#include <linux/fs.h>
#include <linux/time.h>
#include <linux/jbd2.h>
@@ -38,11 +37,81 @@
#include <linux/quotaops.h>
#include <linux/string.h>
#include <linux/slab.h>
-#include <linux/falloc.h>
#include <asm/uaccess.h>
#include <linux/fiemap.h>
#include "ext4_jbd2.h"
#include "ext4_extents.h"
+#include "xattr.h"
+
+#include <trace/events/ext4.h>
+
+/*
+ * used by extent splitting.
+ */
+#define EXT4_EXT_MAY_ZEROOUT 0x1 /* safe to zeroout if split fails \
+ due to ENOSPC */
+#define EXT4_EXT_MARK_UNWRIT1 0x2 /* mark first half unwritten */
+#define EXT4_EXT_MARK_UNWRIT2 0x4 /* mark second half unwritten */
+
+#define EXT4_EXT_DATA_VALID1 0x8 /* first half contains valid data */
+#define EXT4_EXT_DATA_VALID2 0x10 /* second half contains valid data */
+
+static __le32 ext4_extent_block_csum(struct inode *inode,
+ struct ext4_extent_header *eh)
+{
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ __u32 csum;
+
+ csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)eh,
+ EXT4_EXTENT_TAIL_OFFSET(eh));
+ return cpu_to_le32(csum);
+}
+
+static int ext4_extent_block_csum_verify(struct inode *inode,
+ struct ext4_extent_header *eh)
+{
+ struct ext4_extent_tail *et;
+
+ if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
+ EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+ return 1;
+
+ et = find_ext4_extent_tail(eh);
+ if (et->et_checksum != ext4_extent_block_csum(inode, eh))
+ return 0;
+ return 1;
+}
+
+static void ext4_extent_block_csum_set(struct inode *inode,
+ struct ext4_extent_header *eh)
+{
+ struct ext4_extent_tail *et;
+
+ if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
+ EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+ return;
+
+ et = find_ext4_extent_tail(eh);
+ et->et_checksum = ext4_extent_block_csum(inode, eh);
+}
+
+static int ext4_split_extent(handle_t *handle,
+ struct inode *inode,
+ struct ext4_ext_path *path,
+ struct ext4_map_blocks *map,
+ int split_flag,
+ int flags);
+
+static int ext4_split_extent_at(handle_t *handle,
+ struct inode *inode,
+ struct ext4_ext_path *path,
+ ext4_lblk_t split,
+ int split_flag,
+ int flags);
+
+static int ext4_find_delayed_extent(struct inode *inode,
+ struct extent_status *newes);
static int ext4_ext_truncate_extend_restart(handle_t *handle,
struct inode *inode,
@@ -74,6 +143,7 @@ static int ext4_ext_get_access(handle_t *handle, struct inode *inode,
{
if (path->p_bh) {
/* path points to block */
+ BUFFER_TRACE(path->p_bh, "get_write_access");
return ext4_journal_get_write_access(handle, path->p_bh);
}
/* path points to leaf/index in inode body */
@@ -87,13 +157,15 @@ static int ext4_ext_get_access(handle_t *handle, struct inode *inode,
* - ENOMEM
* - EIO
*/
-static int ext4_ext_dirty(handle_t *handle, struct inode *inode,
- struct ext4_ext_path *path)
+int __ext4_ext_dirty(const char *where, unsigned int line, handle_t *handle,
+ struct inode *inode, struct ext4_ext_path *path)
{
int err;
if (path->p_bh) {
+ ext4_extent_block_csum_set(inode, ext_block_hdr(path->p_bh));
/* path points to block */
- err = ext4_handle_dirty_metadata(handle, inode, path->p_bh);
+ err = __ext4_handle_dirty_metadata(where, line, handle,
+ inode, path->p_bh);
} else {
/* path points to leaf/index in inode body */
err = ext4_mark_inode_dirty(handle, inode);
@@ -105,23 +177,37 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
struct ext4_ext_path *path,
ext4_lblk_t block)
{
- struct ext4_inode_info *ei = EXT4_I(inode);
- ext4_fsblk_t bg_start;
- ext4_fsblk_t last_block;
- ext4_grpblk_t colour;
- ext4_group_t block_group;
- int flex_size = ext4_flex_bg_size(EXT4_SB(inode->i_sb));
- int depth;
-
if (path) {
+ int depth = path->p_depth;
struct ext4_extent *ex;
- depth = path->p_depth;
- /* try to predict block placement */
+ /*
+ * Try to predict block placement assuming that we are
+ * filling in a file which will eventually be
+ * non-sparse --- i.e., in the case of libbfd writing
+ * an ELF object sections out-of-order but in a way
+ * the eventually results in a contiguous object or
+ * executable file, or some database extending a table
+ * space file. However, this is actually somewhat
+ * non-ideal if we are writing a sparse file such as
+ * qemu or KVM writing a raw image file that is going
+ * to stay fairly sparse, since it will end up
+ * fragmenting the file system's free space. Maybe we
+ * should have some hueristics or some way to allow
+ * userspace to pass a hint to file system,
+ * especially if the latter case turns out to be
+ * common.
+ */
ex = path[depth].p_ext;
- if (ex)
- return (ext4_ext_pblock(ex) +
- (block - le32_to_cpu(ex->ee_block)));
+ if (ex) {
+ ext4_fsblk_t ext_pblk = ext4_ext_pblock(ex);
+ ext4_lblk_t ext_block = le32_to_cpu(ex->ee_block);
+
+ if (block > ext_block)
+ return ext_pblk + (block - ext_block);
+ else
+ return ext_pblk - (ext_block - block);
+ }
/* it looks like index is empty;
* try to find starting block from index itself */
@@ -130,36 +216,7 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
}
/* OK. use inode's group */
- block_group = ei->i_block_group;
- if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) {
- /*
- * If there are at least EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME
- * block groups per flexgroup, reserve the first block
- * group for directories and special files. Regular
- * files will start at the second block group. This
- * tends to speed up directory access and improves
- * fsck times.
- */
- block_group &= ~(flex_size-1);
- if (S_ISREG(inode->i_mode))
- block_group++;
- }
- bg_start = ext4_group_first_block_no(inode->i_sb, block_group);
- last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1;
-
- /*
- * If we are doing delayed allocation, we don't need take
- * colour into account.
- */
- if (test_opt(inode->i_sb, DELALLOC))
- return bg_start;
-
- if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block)
- colour = (current->pid % 16) *
- (EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16);
- else
- colour = (current->pid % 16) * ((last_block - bg_start) / 16);
- return bg_start + colour + block;
+ return ext4_inode_to_goal_block(inode);
}
/*
@@ -168,12 +225,13 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
static ext4_fsblk_t
ext4_ext_new_meta_block(handle_t *handle, struct inode *inode,
struct ext4_ext_path *path,
- struct ext4_extent *ex, int *err)
+ struct ext4_extent *ex, int *err, unsigned int flags)
{
ext4_fsblk_t goal, newblock;
goal = ext4_ext_find_goal(inode, path, le32_to_cpu(ex->ee_block));
- newblock = ext4_new_meta_blocks(handle, inode, goal, NULL, err);
+ newblock = ext4_new_meta_blocks(handle, inode, goal, flags,
+ NULL, err);
return newblock;
}
@@ -183,12 +241,10 @@ static inline int ext4_ext_space_block(struct inode *inode, int check)
size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
/ sizeof(struct ext4_extent);
- if (!check) {
#ifdef AGGRESSIVE_TEST
- if (size > 6)
- size = 6;
+ if (!check && size > 6)
+ size = 6;
#endif
- }
return size;
}
@@ -198,12 +254,10 @@ static inline int ext4_ext_space_block_idx(struct inode *inode, int check)
size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
/ sizeof(struct ext4_extent_idx);
- if (!check) {
#ifdef AGGRESSIVE_TEST
- if (size > 5)
- size = 5;
+ if (!check && size > 5)
+ size = 5;
#endif
- }
return size;
}
@@ -214,12 +268,10 @@ static inline int ext4_ext_space_root(struct inode *inode, int check)
size = sizeof(EXT4_I(inode)->i_data);
size -= sizeof(struct ext4_extent_header);
size /= sizeof(struct ext4_extent);
- if (!check) {
#ifdef AGGRESSIVE_TEST
- if (size > 3)
- size = 3;
+ if (!check && size > 3)
+ size = 3;
#endif
- }
return size;
}
@@ -230,12 +282,10 @@ static inline int ext4_ext_space_root_idx(struct inode *inode, int check)
size = sizeof(EXT4_I(inode)->i_data);
size -= sizeof(struct ext4_extent_header);
size /= sizeof(struct ext4_extent_idx);
- if (!check) {
#ifdef AGGRESSIVE_TEST
- if (size > 4)
- size = 4;
+ if (!check && size > 4)
+ size = 4;
#endif
- }
return size;
}
@@ -244,10 +294,10 @@ static inline int ext4_ext_space_root_idx(struct inode *inode, int check)
* to allocate @blocks
* Worse case is one block per extent
*/
-int ext4_ext_calc_metadata_amount(struct inode *inode, sector_t lblock)
+int ext4_ext_calc_metadata_amount(struct inode *inode, ext4_lblk_t lblock)
{
struct ext4_inode_info *ei = EXT4_I(inode);
- int idxs, num = 0;
+ int idxs;
idxs = ((inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
/ sizeof(struct ext4_extent_idx));
@@ -262,6 +312,8 @@ int ext4_ext_calc_metadata_amount(struct inode *inode, sector_t lblock)
*/
if (ei->i_da_metadata_calc_len &&
ei->i_da_metadata_calc_last_lblock+1 == lblock) {
+ int num = 0;
+
if ((ei->i_da_metadata_calc_len % idxs) == 0)
num++;
if ((ei->i_da_metadata_calc_len % (idxs*idxs)) == 0)
@@ -308,7 +360,11 @@ static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext)
{
ext4_fsblk_t block = ext4_ext_pblock(ext);
int len = ext4_ext_get_actual_len(ext);
+ ext4_lblk_t lblock = le32_to_cpu(ext->ee_block);
+ ext4_lblk_t last = lblock + len - 1;
+ if (lblock > last)
+ return 0;
return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, len);
}
@@ -324,8 +380,6 @@ static int ext4_valid_extent_entries(struct inode *inode,
struct ext4_extent_header *eh,
int depth)
{
- struct ext4_extent *ext;
- struct ext4_extent_idx *ext_idx;
unsigned short entries;
if (eh->eh_entries == 0)
return 1;
@@ -334,15 +388,30 @@ static int ext4_valid_extent_entries(struct inode *inode,
if (depth == 0) {
/* leaf entries */
- ext = EXT_FIRST_EXTENT(eh);
+ struct ext4_extent *ext = EXT_FIRST_EXTENT(eh);
+ struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
+ ext4_fsblk_t pblock = 0;
+ ext4_lblk_t lblock = 0;
+ ext4_lblk_t prev = 0;
+ int len = 0;
while (entries) {
if (!ext4_valid_extent(inode, ext))
return 0;
+
+ /* Check for overlapping extents */
+ lblock = le32_to_cpu(ext->ee_block);
+ len = ext4_ext_get_actual_len(ext);
+ if ((lblock <= prev) && prev) {
+ pblock = ext4_ext_pblock(ext);
+ es->s_last_error_block = cpu_to_le64(pblock);
+ return 0;
+ }
ext++;
entries--;
+ prev = lblock + len - 1;
}
} else {
- ext_idx = EXT_FIRST_INDEX(eh);
+ struct ext4_extent_idx *ext_idx = EXT_FIRST_INDEX(eh);
while (entries) {
if (!ext4_valid_extent_idx(inode, ext_idx))
return 0;
@@ -355,7 +424,7 @@ static int ext4_valid_extent_entries(struct inode *inode,
static int __ext4_ext_check(const char *function, unsigned int line,
struct inode *inode, struct ext4_extent_header *eh,
- int depth)
+ int depth, ext4_fsblk_t pblk)
{
const char *error_msg;
int max = 0;
@@ -385,25 +454,158 @@ static int __ext4_ext_check(const char *function, unsigned int line,
error_msg = "invalid extent entries";
goto corrupted;
}
+ /* Verify checksum on non-root extent tree nodes */
+ if (ext_depth(inode) != depth &&
+ !ext4_extent_block_csum_verify(inode, eh)) {
+ error_msg = "extent tree corrupted";
+ goto corrupted;
+ }
return 0;
corrupted:
ext4_error_inode(inode, function, line, 0,
- "bad header/extent: %s - magic %x, "
- "entries %u, max %u(%u), depth %u(%u)",
- error_msg, le16_to_cpu(eh->eh_magic),
- le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max),
- max, le16_to_cpu(eh->eh_depth), depth);
-
+ "pblk %llu bad header/extent: %s - magic %x, "
+ "entries %u, max %u(%u), depth %u(%u)",
+ (unsigned long long) pblk, error_msg,
+ le16_to_cpu(eh->eh_magic),
+ le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max),
+ max, le16_to_cpu(eh->eh_depth), depth);
return -EIO;
}
-#define ext4_ext_check(inode, eh, depth) \
- __ext4_ext_check(__func__, __LINE__, inode, eh, depth)
+#define ext4_ext_check(inode, eh, depth, pblk) \
+ __ext4_ext_check(__func__, __LINE__, (inode), (eh), (depth), (pblk))
int ext4_ext_check_inode(struct inode *inode)
{
- return ext4_ext_check(inode, ext_inode_hdr(inode), ext_depth(inode));
+ return ext4_ext_check(inode, ext_inode_hdr(inode), ext_depth(inode), 0);
+}
+
+static struct buffer_head *
+__read_extent_tree_block(const char *function, unsigned int line,
+ struct inode *inode, ext4_fsblk_t pblk, int depth,
+ int flags)
+{
+ struct buffer_head *bh;
+ int err;
+
+ bh = sb_getblk(inode->i_sb, pblk);
+ if (unlikely(!bh))
+ return ERR_PTR(-ENOMEM);
+
+ if (!bh_uptodate_or_lock(bh)) {
+ trace_ext4_ext_load_extent(inode, pblk, _RET_IP_);
+ err = bh_submit_read(bh);
+ if (err < 0)
+ goto errout;
+ }
+ if (buffer_verified(bh) && !(flags & EXT4_EX_FORCE_CACHE))
+ return bh;
+ err = __ext4_ext_check(function, line, inode,
+ ext_block_hdr(bh), depth, pblk);
+ if (err)
+ goto errout;
+ set_buffer_verified(bh);
+ /*
+ * If this is a leaf block, cache all of its entries
+ */
+ if (!(flags & EXT4_EX_NOCACHE) && depth == 0) {
+ struct ext4_extent_header *eh = ext_block_hdr(bh);
+ struct ext4_extent *ex = EXT_FIRST_EXTENT(eh);
+ ext4_lblk_t prev = 0;
+ int i;
+
+ for (i = le16_to_cpu(eh->eh_entries); i > 0; i--, ex++) {
+ unsigned int status = EXTENT_STATUS_WRITTEN;
+ ext4_lblk_t lblk = le32_to_cpu(ex->ee_block);
+ int len = ext4_ext_get_actual_len(ex);
+
+ if (prev && (prev != lblk))
+ ext4_es_cache_extent(inode, prev,
+ lblk - prev, ~0,
+ EXTENT_STATUS_HOLE);
+
+ if (ext4_ext_is_unwritten(ex))
+ status = EXTENT_STATUS_UNWRITTEN;
+ ext4_es_cache_extent(inode, lblk, len,
+ ext4_ext_pblock(ex), status);
+ prev = lblk + len;
+ }
+ }
+ return bh;
+errout:
+ put_bh(bh);
+ return ERR_PTR(err);
+
+}
+
+#define read_extent_tree_block(inode, pblk, depth, flags) \
+ __read_extent_tree_block(__func__, __LINE__, (inode), (pblk), \
+ (depth), (flags))
+
+/*
+ * This function is called to cache a file's extent information in the
+ * extent status tree
+ */
+int ext4_ext_precache(struct inode *inode)
+{
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ struct ext4_ext_path *path = NULL;
+ struct buffer_head *bh;
+ int i = 0, depth, ret = 0;
+
+ if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
+ return 0; /* not an extent-mapped inode */
+
+ down_read(&ei->i_data_sem);
+ depth = ext_depth(inode);
+
+ path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 1),
+ GFP_NOFS);
+ if (path == NULL) {
+ up_read(&ei->i_data_sem);
+ return -ENOMEM;
+ }
+
+ /* Don't cache anything if there are no external extent blocks */
+ if (depth == 0)
+ goto out;
+ path[0].p_hdr = ext_inode_hdr(inode);
+ ret = ext4_ext_check(inode, path[0].p_hdr, depth, 0);
+ if (ret)
+ goto out;
+ path[0].p_idx = EXT_FIRST_INDEX(path[0].p_hdr);
+ while (i >= 0) {
+ /*
+ * If this is a leaf block or we've reached the end of
+ * the index block, go up
+ */
+ if ((i == depth) ||
+ path[i].p_idx > EXT_LAST_INDEX(path[i].p_hdr)) {
+ brelse(path[i].p_bh);
+ path[i].p_bh = NULL;
+ i--;
+ continue;
+ }
+ bh = read_extent_tree_block(inode,
+ ext4_idx_pblock(path[i].p_idx++),
+ depth - i - 1,
+ EXT4_EX_FORCE_CACHE);
+ if (IS_ERR(bh)) {
+ ret = PTR_ERR(bh);
+ break;
+ }
+ i++;
+ path[i].p_bh = bh;
+ path[i].p_hdr = ext_block_hdr(bh);
+ path[i].p_idx = EXT_FIRST_INDEX(path[i].p_hdr);
+ }
+ ext4_set_inode_state(inode, EXT4_STATE_EXT_PRECACHED);
+out:
+ up_read(&ei->i_data_sem);
+ ext4_ext_drop_refs(path);
+ kfree(path);
+ return ret;
}
#ifdef EXT_DEBUG
@@ -419,7 +621,7 @@ static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path)
} else if (path->p_ext) {
ext_debug(" %d:[%d]%d:%llu ",
le32_to_cpu(path->p_ext->ee_block),
- ext4_ext_is_uninitialized(path->p_ext),
+ ext4_ext_is_unwritten(path->p_ext),
ext4_ext_get_actual_len(path->p_ext),
ext4_ext_pblock(path->p_ext));
} else
@@ -445,14 +647,48 @@ static void ext4_ext_show_leaf(struct inode *inode, struct ext4_ext_path *path)
for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ex++) {
ext_debug("%d:[%d]%d:%llu ", le32_to_cpu(ex->ee_block),
- ext4_ext_is_uninitialized(ex),
+ ext4_ext_is_unwritten(ex),
ext4_ext_get_actual_len(ex), ext4_ext_pblock(ex));
}
ext_debug("\n");
}
+
+static void ext4_ext_show_move(struct inode *inode, struct ext4_ext_path *path,
+ ext4_fsblk_t newblock, int level)
+{
+ int depth = ext_depth(inode);
+ struct ext4_extent *ex;
+
+ if (depth != level) {
+ struct ext4_extent_idx *idx;
+ idx = path[level].p_idx;
+ while (idx <= EXT_MAX_INDEX(path[level].p_hdr)) {
+ ext_debug("%d: move %d:%llu in new index %llu\n", level,
+ le32_to_cpu(idx->ei_block),
+ ext4_idx_pblock(idx),
+ newblock);
+ idx++;
+ }
+
+ return;
+ }
+
+ ex = path[depth].p_ext;
+ while (ex <= EXT_MAX_EXTENT(path[depth].p_hdr)) {
+ ext_debug("move %d:%llu:[%d]%d in new leaf %llu\n",
+ le32_to_cpu(ex->ee_block),
+ ext4_ext_pblock(ex),
+ ext4_ext_is_unwritten(ex),
+ ext4_ext_get_actual_len(ex),
+ newblock);
+ ex++;
+ }
+}
+
#else
#define ext4_ext_show_path(inode, path)
#define ext4_ext_show_leaf(inode, path)
+#define ext4_ext_show_move(inode, path, newblock, level)
#endif
void ext4_ext_drop_refs(struct ext4_ext_path *path)
@@ -496,7 +732,7 @@ ext4_ext_binsearch_idx(struct inode *inode,
}
path->p_idx = l - 1;
- ext_debug(" -> %d->%lld ", le32_to_cpu(path->p_idx->ei_block),
+ ext_debug(" -> %u->%lld ", le32_to_cpu(path->p_idx->ei_block),
ext4_idx_pblock(path->p_idx));
#ifdef CHECK_BINSEARCH
@@ -567,7 +803,7 @@ ext4_ext_binsearch(struct inode *inode,
ext_debug(" -> %d:%llu:[%d]%d ",
le32_to_cpu(path->p_ext->ee_block),
ext4_ext_pblock(path->p_ext),
- ext4_ext_is_uninitialized(path->p_ext),
+ ext4_ext_is_unwritten(path->p_ext),
ext4_ext_get_actual_len(path->p_ext));
#ifdef CHECK_BINSEARCH
@@ -599,17 +835,17 @@ int ext4_ext_tree_init(handle_t *handle, struct inode *inode)
eh->eh_magic = EXT4_EXT_MAGIC;
eh->eh_max = cpu_to_le16(ext4_ext_space_root(inode, 0));
ext4_mark_inode_dirty(handle, inode);
- ext4_ext_invalidate_cache(inode);
return 0;
}
struct ext4_ext_path *
ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
- struct ext4_ext_path *path)
+ struct ext4_ext_path *path, int flags)
{
struct ext4_extent_header *eh;
struct buffer_head *bh;
short int depth, i, ppos = 0, alloc = 0;
+ int ret;
eh = ext_inode_hdr(inode);
depth = ext_depth(inode);
@@ -628,8 +864,6 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
i = depth;
/* walk through the tree */
while (i) {
- int need_to_validate = 0;
-
ext_debug("depth %d: num %d, max %d\n",
ppos, le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max));
@@ -638,31 +872,24 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
path[ppos].p_depth = i;
path[ppos].p_ext = NULL;
- bh = sb_getblk(inode->i_sb, path[ppos].p_block);
- if (unlikely(!bh))
+ bh = read_extent_tree_block(inode, path[ppos].p_block, --i,
+ flags);
+ if (IS_ERR(bh)) {
+ ret = PTR_ERR(bh);
goto err;
- if (!bh_uptodate_or_lock(bh)) {
- if (bh_submit_read(bh) < 0) {
- put_bh(bh);
- goto err;
- }
- /* validate the extent entries */
- need_to_validate = 1;
}
+
eh = ext_block_hdr(bh);
ppos++;
if (unlikely(ppos > depth)) {
put_bh(bh);
EXT4_ERROR_INODE(inode,
"ppos %d > depth %d", ppos, depth);
+ ret = -EIO;
goto err;
}
path[ppos].p_bh = bh;
path[ppos].p_hdr = eh;
- i--;
-
- if (need_to_validate && ext4_ext_check(inode, eh, i))
- goto err;
}
path[ppos].p_depth = i;
@@ -683,7 +910,7 @@ err:
ext4_ext_drop_refs(path);
if (alloc)
kfree(path);
- return ERR_PTR(-EIO);
+ return ERR_PTR(ret);
}
/*
@@ -708,42 +935,44 @@ static int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
logical, le32_to_cpu(curp->p_idx->ei_block));
return -EIO;
}
- len = EXT_MAX_INDEX(curp->p_hdr) - curp->p_idx;
+
+ if (unlikely(le16_to_cpu(curp->p_hdr->eh_entries)
+ >= le16_to_cpu(curp->p_hdr->eh_max))) {
+ EXT4_ERROR_INODE(inode,
+ "eh_entries %d >= eh_max %d!",
+ le16_to_cpu(curp->p_hdr->eh_entries),
+ le16_to_cpu(curp->p_hdr->eh_max));
+ return -EIO;
+ }
+
if (logical > le32_to_cpu(curp->p_idx->ei_block)) {
/* insert after */
- if (curp->p_idx != EXT_LAST_INDEX(curp->p_hdr)) {
- len = (len - 1) * sizeof(struct ext4_extent_idx);
- len = len < 0 ? 0 : len;
- ext_debug("insert new index %d after: %llu. "
- "move %d from 0x%p to 0x%p\n",
- logical, ptr, len,
- (curp->p_idx + 1), (curp->p_idx + 2));
- memmove(curp->p_idx + 2, curp->p_idx + 1, len);
- }
+ ext_debug("insert new index %d after: %llu\n", logical, ptr);
ix = curp->p_idx + 1;
} else {
/* insert before */
- len = len * sizeof(struct ext4_extent_idx);
- len = len < 0 ? 0 : len;
- ext_debug("insert new index %d before: %llu. "
- "move %d from 0x%p to 0x%p\n",
- logical, ptr, len,
- curp->p_idx, (curp->p_idx + 1));
- memmove(curp->p_idx + 1, curp->p_idx, len);
+ ext_debug("insert new index %d before: %llu\n", logical, ptr);
ix = curp->p_idx;
}
+ len = EXT_LAST_INDEX(curp->p_hdr) - ix + 1;
+ BUG_ON(len < 0);
+ if (len > 0) {
+ ext_debug("insert new index %d: "
+ "move %d indices from 0x%p to 0x%p\n",
+ logical, len, ix, ix + 1);
+ memmove(ix + 1, ix, len * sizeof(struct ext4_extent_idx));
+ }
+
+ if (unlikely(ix > EXT_MAX_INDEX(curp->p_hdr))) {
+ EXT4_ERROR_INODE(inode, "ix > EXT_MAX_INDEX!");
+ return -EIO;
+ }
+
ix->ei_block = cpu_to_le32(logical);
ext4_idx_store_pblock(ix, ptr);
le16_add_cpu(&curp->p_hdr->eh_entries, 1);
- if (unlikely(le16_to_cpu(curp->p_hdr->eh_entries)
- > le16_to_cpu(curp->p_hdr->eh_max))) {
- EXT4_ERROR_INODE(inode,
- "logical %d == ei_block %d!",
- logical, le32_to_cpu(curp->p_idx->ei_block));
- return -EIO;
- }
if (unlikely(ix > EXT_LAST_INDEX(curp->p_hdr))) {
EXT4_ERROR_INODE(inode, "ix > EXT_LAST_INDEX!");
return -EIO;
@@ -766,14 +995,14 @@ static int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
* - initializes subtree
*/
static int ext4_ext_split(handle_t *handle, struct inode *inode,
- struct ext4_ext_path *path,
- struct ext4_extent *newext, int at)
+ unsigned int flags,
+ struct ext4_ext_path *path,
+ struct ext4_extent *newext, int at)
{
struct buffer_head *bh = NULL;
int depth = ext_depth(inode);
struct ext4_extent_header *neh;
struct ext4_extent_idx *fidx;
- struct ext4_extent *ex;
int i = at, k, m, a;
ext4_fsblk_t newblock, oldblock;
__le32 border;
@@ -821,7 +1050,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
ext_debug("allocate %d blocks for indexes/leaf\n", depth - at);
for (a = 0; a < depth - at; a++) {
newblock = ext4_ext_new_meta_block(handle, inode, path,
- newext, &err);
+ newext, &err, flags);
if (newblock == 0)
goto cleanup;
ablocks[a] = newblock;
@@ -835,8 +1064,8 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
goto cleanup;
}
bh = sb_getblk(inode->i_sb, newblock);
- if (!bh) {
- err = -EIO;
+ if (unlikely(!bh)) {
+ err = -ENOMEM;
goto cleanup;
}
lock_buffer(bh);
@@ -850,7 +1079,6 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0));
neh->eh_magic = EXT4_EXT_MAGIC;
neh->eh_depth = 0;
- ex = EXT_FIRST_EXTENT(neh);
/* move remainder of path[depth] to the new leaf */
if (unlikely(path[depth].p_hdr->eh_entries !=
@@ -862,28 +1090,16 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
goto cleanup;
}
/* start copy from next extent */
- /* TODO: we could do it by single memmove */
- m = 0;
- path[depth].p_ext++;
- while (path[depth].p_ext <=
- EXT_MAX_EXTENT(path[depth].p_hdr)) {
- ext_debug("move %d:%llu:[%d]%d in new leaf %llu\n",
- le32_to_cpu(path[depth].p_ext->ee_block),
- ext4_ext_pblock(path[depth].p_ext),
- ext4_ext_is_uninitialized(path[depth].p_ext),
- ext4_ext_get_actual_len(path[depth].p_ext),
- newblock);
- /*memmove(ex++, path[depth].p_ext++,
- sizeof(struct ext4_extent));
- neh->eh_entries++;*/
- path[depth].p_ext++;
- m++;
- }
+ m = EXT_MAX_EXTENT(path[depth].p_hdr) - path[depth].p_ext++;
+ ext4_ext_show_move(inode, path, newblock, depth);
if (m) {
- memmove(ex, path[depth].p_ext-m, sizeof(struct ext4_extent)*m);
+ struct ext4_extent *ex;
+ ex = EXT_FIRST_EXTENT(neh);
+ memmove(ex, path[depth].p_ext, sizeof(struct ext4_extent) * m);
le16_add_cpu(&neh->eh_entries, m);
}
+ ext4_extent_block_csum_set(inode, neh);
set_buffer_uptodate(bh);
unlock_buffer(bh);
@@ -921,8 +1137,8 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
oldblock = newblock;
newblock = ablocks[--a];
bh = sb_getblk(inode->i_sb, newblock);
- if (!bh) {
- err = -EIO;
+ if (unlikely(!bh)) {
+ err = -ENOMEM;
goto cleanup;
}
lock_buffer(bh);
@@ -942,12 +1158,8 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
ext_debug("int.index at %d (block %llu): %u -> %llu\n",
i, newblock, le32_to_cpu(border), oldblock);
- /* copy indexes */
- m = 0;
- path[i].p_idx++;
- ext_debug("cur 0x%p, last 0x%p\n", path[i].p_idx,
- EXT_MAX_INDEX(path[i].p_hdr));
+ /* move remainder of path[i] to the new index block */
if (unlikely(EXT_MAX_INDEX(path[i].p_hdr) !=
EXT_LAST_INDEX(path[i].p_hdr))) {
EXT4_ERROR_INODE(inode,
@@ -956,23 +1168,17 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
err = -EIO;
goto cleanup;
}
- while (path[i].p_idx <= EXT_MAX_INDEX(path[i].p_hdr)) {
- ext_debug("%d: move %d:%llu in new index %llu\n", i,
- le32_to_cpu(path[i].p_idx->ei_block),
- ext4_idx_pblock(path[i].p_idx),
- newblock);
- /*memmove(++fidx, path[i].p_idx++,
- sizeof(struct ext4_extent_idx));
- neh->eh_entries++;
- BUG_ON(neh->eh_entries > neh->eh_max);*/
- path[i].p_idx++;
- m++;
- }
+ /* start copy indexes */
+ m = EXT_MAX_INDEX(path[i].p_hdr) - path[i].p_idx++;
+ ext_debug("cur 0x%p, last 0x%p\n", path[i].p_idx,
+ EXT_MAX_INDEX(path[i].p_hdr));
+ ext4_ext_show_move(inode, path, newblock, i);
if (m) {
- memmove(++fidx, path[i].p_idx - m,
+ memmove(++fidx, path[i].p_idx,
sizeof(struct ext4_extent_idx) * m);
le16_add_cpu(&neh->eh_entries, m);
}
+ ext4_extent_block_csum_set(inode, neh);
set_buffer_uptodate(bh);
unlock_buffer(bh);
@@ -1012,7 +1218,7 @@ cleanup:
for (i = 0; i < depth; i++) {
if (!ablocks[i])
continue;
- ext4_free_blocks(handle, inode, 0, ablocks[i], 1,
+ ext4_free_blocks(handle, inode, NULL, ablocks[i], 1,
EXT4_FREE_BLOCKS_METADATA);
}
}
@@ -1030,25 +1236,22 @@ cleanup:
* just created block
*/
static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
- struct ext4_ext_path *path,
- struct ext4_extent *newext)
+ unsigned int flags,
+ struct ext4_extent *newext)
{
- struct ext4_ext_path *curp = path;
struct ext4_extent_header *neh;
struct buffer_head *bh;
ext4_fsblk_t newblock;
int err = 0;
- newblock = ext4_ext_new_meta_block(handle, inode, path, newext, &err);
+ newblock = ext4_ext_new_meta_block(handle, inode, NULL,
+ newext, &err, flags);
if (newblock == 0)
return err;
bh = sb_getblk(inode->i_sb, newblock);
- if (!bh) {
- err = -EIO;
- ext4_std_error(inode->i_sb, err);
- return err;
- }
+ if (unlikely(!bh))
+ return -ENOMEM;
lock_buffer(bh);
err = ext4_journal_get_create_access(handle, bh);
@@ -1058,7 +1261,8 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
}
/* move top-level index/leaf into new block */
- memmove(bh->b_data, curp->p_hdr, sizeof(EXT4_I(inode)->i_data));
+ memmove(bh->b_data, EXT4_I(inode)->i_data,
+ sizeof(EXT4_I(inode)->i_data));
/* set size of new block */
neh = ext_block_hdr(bh);
@@ -1069,6 +1273,7 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
else
neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0));
neh->eh_magic = EXT4_EXT_MAGIC;
+ ext4_extent_block_csum_set(inode, neh);
set_buffer_uptodate(bh);
unlock_buffer(bh);
@@ -1076,32 +1281,23 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
if (err)
goto out;
- /* create index in new top-level index: num,max,pointer */
- err = ext4_ext_get_access(handle, inode, curp);
- if (err)
- goto out;
-
- curp->p_hdr->eh_magic = EXT4_EXT_MAGIC;
- curp->p_hdr->eh_max = cpu_to_le16(ext4_ext_space_root_idx(inode, 0));
- curp->p_hdr->eh_entries = cpu_to_le16(1);
- curp->p_idx = EXT_FIRST_INDEX(curp->p_hdr);
-
- if (path[0].p_hdr->eh_depth)
- curp->p_idx->ei_block =
- EXT_FIRST_INDEX(path[0].p_hdr)->ei_block;
- else
- curp->p_idx->ei_block =
- EXT_FIRST_EXTENT(path[0].p_hdr)->ee_block;
- ext4_idx_store_pblock(curp->p_idx, newblock);
-
+ /* Update top-level index: num,max,pointer */
neh = ext_inode_hdr(inode);
+ neh->eh_entries = cpu_to_le16(1);
+ ext4_idx_store_pblock(EXT_FIRST_INDEX(neh), newblock);
+ if (neh->eh_depth == 0) {
+ /* Root extent block becomes index block */
+ neh->eh_max = cpu_to_le16(ext4_ext_space_root_idx(inode, 0));
+ EXT_FIRST_INDEX(neh)->ei_block =
+ EXT_FIRST_EXTENT(neh)->ee_block;
+ }
ext_debug("new root: num %d(%d), lblock %d, ptr %llu\n",
le16_to_cpu(neh->eh_entries), le16_to_cpu(neh->eh_max),
le32_to_cpu(EXT_FIRST_INDEX(neh)->ei_block),
ext4_idx_pblock(EXT_FIRST_INDEX(neh)));
- neh->eh_depth = cpu_to_le16(path->p_depth + 1);
- err = ext4_ext_dirty(handle, inode, curp);
+ le16_add_cpu(&neh->eh_depth, 1);
+ ext4_mark_inode_dirty(handle, inode);
out:
brelse(bh);
@@ -1114,8 +1310,10 @@ out:
* if no free index is found, then it requests in-depth growing.
*/
static int ext4_ext_create_new_leaf(handle_t *handle, struct inode *inode,
- struct ext4_ext_path *path,
- struct ext4_extent *newext)
+ unsigned int mb_flags,
+ unsigned int gb_flags,
+ struct ext4_ext_path *path,
+ struct ext4_extent *newext)
{
struct ext4_ext_path *curp;
int depth, i, err = 0;
@@ -1135,7 +1333,7 @@ repeat:
if (EXT_HAS_FREE_INDEX(curp)) {
/* if we found index with free entry, then use that
* entry: create all needed subtree and add new leaf */
- err = ext4_ext_split(handle, inode, path, newext, i);
+ err = ext4_ext_split(handle, inode, mb_flags, path, newext, i);
if (err)
goto out;
@@ -1143,12 +1341,12 @@ repeat:
ext4_ext_drop_refs(path);
path = ext4_ext_find_extent(inode,
(ext4_lblk_t)le32_to_cpu(newext->ee_block),
- path);
+ path, gb_flags);
if (IS_ERR(path))
err = PTR_ERR(path);
} else {
/* tree is full, time to grow in depth */
- err = ext4_ext_grow_indepth(handle, inode, path, newext);
+ err = ext4_ext_grow_indepth(handle, inode, mb_flags, newext);
if (err)
goto out;
@@ -1156,7 +1354,7 @@ repeat:
ext4_ext_drop_refs(path);
path = ext4_ext_find_extent(inode,
(ext4_lblk_t)le32_to_cpu(newext->ee_block),
- path);
+ path, gb_flags);
if (IS_ERR(path)) {
err = PTR_ERR(path);
goto out;
@@ -1220,9 +1418,9 @@ static int ext4_ext_search_left(struct inode *inode,
if (unlikely(ix != EXT_FIRST_INDEX(path[depth].p_hdr))) {
EXT4_ERROR_INODE(inode,
"ix (%d) != EXT_FIRST_INDEX (%d) (depth %d)!",
- ix != NULL ? ix->ei_block : 0,
+ ix != NULL ? le32_to_cpu(ix->ei_block) : 0,
EXT_FIRST_INDEX(path[depth].p_hdr) != NULL ?
- EXT_FIRST_INDEX(path[depth].p_hdr)->ei_block : 0,
+ le32_to_cpu(EXT_FIRST_INDEX(path[depth].p_hdr)->ei_block) : 0,
depth);
return -EIO;
}
@@ -1245,13 +1443,14 @@ static int ext4_ext_search_left(struct inode *inode,
/*
* search the closest allocated block to the right for *logical
* and returns it at @logical + it's physical address at @phys
- * if *logical is the smallest allocated block, the function
+ * if *logical is the largest allocated block, the function
* returns 0 at @phys
* return value contains 0 (success) or error code
*/
static int ext4_ext_search_right(struct inode *inode,
struct ext4_ext_path *path,
- ext4_lblk_t *logical, ext4_fsblk_t *phys)
+ ext4_lblk_t *logical, ext4_fsblk_t *phys,
+ struct ext4_extent **ret_ex)
{
struct buffer_head *bh = NULL;
struct ext4_extent_header *eh;
@@ -1293,9 +1492,7 @@ static int ext4_ext_search_right(struct inode *inode,
return -EIO;
}
}
- *logical = le32_to_cpu(ex->ee_block);
- *phys = ext4_ext_pblock(ex);
- return 0;
+ goto found_extent;
}
if (unlikely(*logical < (le32_to_cpu(ex->ee_block) + ee_len))) {
@@ -1308,9 +1505,7 @@ static int ext4_ext_search_right(struct inode *inode,
if (ex != EXT_LAST_EXTENT(path[depth].p_hdr)) {
/* next allocated block in this leaf */
ex++;
- *logical = le32_to_cpu(ex->ee_block);
- *phys = ext4_ext_pblock(ex);
- return 0;
+ goto found_extent;
}
/* go up and search for index to the right */
@@ -1330,38 +1525,34 @@ got_index:
ix++;
block = ext4_idx_pblock(ix);
while (++depth < path->p_depth) {
- bh = sb_bread(inode->i_sb, block);
- if (bh == NULL)
- return -EIO;
- eh = ext_block_hdr(bh);
/* subtract from p_depth to get proper eh_depth */
- if (ext4_ext_check(inode, eh, path->p_depth - depth)) {
- put_bh(bh);
- return -EIO;
- }
+ bh = read_extent_tree_block(inode, block,
+ path->p_depth - depth, 0);
+ if (IS_ERR(bh))
+ return PTR_ERR(bh);
+ eh = ext_block_hdr(bh);
ix = EXT_FIRST_INDEX(eh);
block = ext4_idx_pblock(ix);
put_bh(bh);
}
- bh = sb_bread(inode->i_sb, block);
- if (bh == NULL)
- return -EIO;
+ bh = read_extent_tree_block(inode, block, path->p_depth - depth, 0);
+ if (IS_ERR(bh))
+ return PTR_ERR(bh);
eh = ext_block_hdr(bh);
- if (ext4_ext_check(inode, eh, path->p_depth - depth)) {
- put_bh(bh);
- return -EIO;
- }
ex = EXT_FIRST_EXTENT(eh);
+found_extent:
*logical = le32_to_cpu(ex->ee_block);
*phys = ext4_ext_pblock(ex);
- put_bh(bh);
+ *ret_ex = ex;
+ if (bh)
+ put_bh(bh);
return 0;
}
/*
* ext4_ext_next_allocated_block:
- * returns allocated block in subsequent extent or EXT_MAX_BLOCK.
+ * returns allocated block in subsequent extent or EXT_MAX_BLOCKS.
* NOTE: it considers block number from index entry as
* allocated block. Thus, index entries have to be consistent
* with leaves.
@@ -1375,12 +1566,13 @@ ext4_ext_next_allocated_block(struct ext4_ext_path *path)
depth = path->p_depth;
if (depth == 0 && path->p_ext == NULL)
- return EXT_MAX_BLOCK;
+ return EXT_MAX_BLOCKS;
while (depth >= 0) {
if (depth == path->p_depth) {
/* leaf */
- if (path[depth].p_ext !=
+ if (path[depth].p_ext &&
+ path[depth].p_ext !=
EXT_LAST_EXTENT(path[depth].p_hdr))
return le32_to_cpu(path[depth].p_ext[1].ee_block);
} else {
@@ -1392,15 +1584,14 @@ ext4_ext_next_allocated_block(struct ext4_ext_path *path)
depth--;
}
- return EXT_MAX_BLOCK;
+ return EXT_MAX_BLOCKS;
}
/*
* ext4_ext_next_leaf_block:
- * returns first allocated block from next leaf or EXT_MAX_BLOCK
+ * returns first allocated block from next leaf or EXT_MAX_BLOCKS
*/
-static ext4_lblk_t ext4_ext_next_leaf_block(struct inode *inode,
- struct ext4_ext_path *path)
+static ext4_lblk_t ext4_ext_next_leaf_block(struct ext4_ext_path *path)
{
int depth;
@@ -1409,7 +1600,7 @@ static ext4_lblk_t ext4_ext_next_leaf_block(struct inode *inode,
/* zero-tree has no leaf blocks at all */
if (depth == 0)
- return EXT_MAX_BLOCK;
+ return EXT_MAX_BLOCKS;
/* go to index block */
depth--;
@@ -1422,7 +1613,7 @@ static ext4_lblk_t ext4_ext_next_leaf_block(struct inode *inode,
depth--;
}
- return EXT_MAX_BLOCK;
+ return EXT_MAX_BLOCKS;
}
/*
@@ -1492,20 +1683,17 @@ int
ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
struct ext4_extent *ex2)
{
- unsigned short ext1_ee_len, ext2_ee_len, max_len;
+ unsigned short ext1_ee_len, ext2_ee_len;
/*
- * Make sure that either both extents are uninitialized, or
- * both are _not_.
+ * Make sure that both extents are initialized. We don't merge
+ * unwritten extents so that we can be sure that end_io code has
+ * the extent that was written properly split out and conversion to
+ * initialized is trivial.
*/
- if (ext4_ext_is_uninitialized(ex1) ^ ext4_ext_is_uninitialized(ex2))
+ if (ext4_ext_is_unwritten(ex1) != ext4_ext_is_unwritten(ex2))
return 0;
- if (ext4_ext_is_uninitialized(ex1))
- max_len = EXT_UNINIT_MAX_LEN;
- else
- max_len = EXT_INIT_MAX_LEN;
-
ext1_ee_len = ext4_ext_get_actual_len(ex1);
ext2_ee_len = ext4_ext_get_actual_len(ex2);
@@ -1518,7 +1706,12 @@ ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
* as an RO_COMPAT feature, refuse to merge to extents if
* this can result in the top bit of ee_len being set.
*/
- if (ext1_ee_len + ext2_ee_len > max_len)
+ if (ext1_ee_len + ext2_ee_len > EXT_INIT_MAX_LEN)
+ return 0;
+ if (ext4_ext_is_unwritten(ex1) &&
+ (ext4_test_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN) ||
+ atomic_read(&EXT4_I(inode)->i_unwritten) ||
+ (ext1_ee_len + ext2_ee_len > EXT_UNWRITTEN_MAX_LEN)))
return 0;
#ifdef AGGRESSIVE_TEST
if (ext1_ee_len >= 4)
@@ -1537,14 +1730,13 @@ ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
* Returns 0 if the extents (ex and ex+1) were _not_ merged and returns
* 1 if they got merged.
*/
-static int ext4_ext_try_to_merge(struct inode *inode,
+static int ext4_ext_try_to_merge_right(struct inode *inode,
struct ext4_ext_path *path,
struct ext4_extent *ex)
{
struct ext4_extent_header *eh;
unsigned int depth, len;
- int merge_done = 0;
- int uninitialized = 0;
+ int merge_done = 0, unwritten;
depth = ext_depth(inode);
BUG_ON(path[depth].p_hdr == NULL);
@@ -1554,12 +1746,11 @@ static int ext4_ext_try_to_merge(struct inode *inode,
if (!ext4_can_extents_be_merged(inode, ex, ex + 1))
break;
/* merge with next extent! */
- if (ext4_ext_is_uninitialized(ex))
- uninitialized = 1;
+ unwritten = ext4_ext_is_unwritten(ex);
ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
+ ext4_ext_get_actual_len(ex + 1));
- if (uninitialized)
- ext4_ext_mark_uninitialized(ex);
+ if (unwritten)
+ ext4_ext_mark_unwritten(ex);
if (ex + 1 < EXT_LAST_EXTENT(eh)) {
len = (EXT_LAST_EXTENT(eh) - ex - 1)
@@ -1577,6 +1768,76 @@ static int ext4_ext_try_to_merge(struct inode *inode,
}
/*
+ * This function does a very simple check to see if we can collapse
+ * an extent tree with a single extent tree leaf block into the inode.
+ */
+static void ext4_ext_try_to_merge_up(handle_t *handle,
+ struct inode *inode,
+ struct ext4_ext_path *path)
+{
+ size_t s;
+ unsigned max_root = ext4_ext_space_root(inode, 0);
+ ext4_fsblk_t blk;
+
+ if ((path[0].p_depth != 1) ||
+ (le16_to_cpu(path[0].p_hdr->eh_entries) != 1) ||
+ (le16_to_cpu(path[1].p_hdr->eh_entries) > max_root))
+ return;
+
+ /*
+ * We need to modify the block allocation bitmap and the block
+ * group descriptor to release the extent tree block. If we
+ * can't get the journal credits, give up.
+ */
+ if (ext4_journal_extend(handle, 2))
+ return;
+
+ /*
+ * Copy the extent data up to the inode
+ */
+ blk = ext4_idx_pblock(path[0].p_idx);
+ s = le16_to_cpu(path[1].p_hdr->eh_entries) *
+ sizeof(struct ext4_extent_idx);
+ s += sizeof(struct ext4_extent_header);
+
+ memcpy(path[0].p_hdr, path[1].p_hdr, s);
+ path[0].p_depth = 0;
+ path[0].p_ext = EXT_FIRST_EXTENT(path[0].p_hdr) +
+ (path[1].p_ext - EXT_FIRST_EXTENT(path[1].p_hdr));
+ path[0].p_hdr->eh_max = cpu_to_le16(max_root);
+
+ brelse(path[1].p_bh);
+ ext4_free_blocks(handle, inode, NULL, blk, 1,
+ EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET |
+ EXT4_FREE_BLOCKS_RESERVE);
+}
+
+/*
+ * This function tries to merge the @ex extent to neighbours in the tree.
+ * return 1 if merge left else 0.
+ */
+static void ext4_ext_try_to_merge(handle_t *handle,
+ struct inode *inode,
+ struct ext4_ext_path *path,
+ struct ext4_extent *ex) {
+ struct ext4_extent_header *eh;
+ unsigned int depth;
+ int merge_done = 0;
+
+ depth = ext_depth(inode);
+ BUG_ON(path[depth].p_hdr == NULL);
+ eh = path[depth].p_hdr;
+
+ if (ex > EXT_FIRST_EXTENT(eh))
+ merge_done = ext4_ext_try_to_merge_right(inode, path, ex - 1);
+
+ if (!merge_done)
+ (void) ext4_ext_try_to_merge_right(inode, path, ex);
+
+ ext4_ext_try_to_merge_up(handle, inode, path);
+}
+
+/*
* check if a portion of the "newext" extent overlaps with an
* existing extent.
*
@@ -1584,7 +1845,8 @@ static int ext4_ext_try_to_merge(struct inode *inode,
* such that there will be no overlap, and then returns 1.
* If there is no overlap found, it returns 0.
*/
-static unsigned int ext4_ext_check_overlap(struct inode *inode,
+static unsigned int ext4_ext_check_overlap(struct ext4_sb_info *sbi,
+ struct inode *inode,
struct ext4_extent *newext,
struct ext4_ext_path *path)
{
@@ -1597,7 +1859,7 @@ static unsigned int ext4_ext_check_overlap(struct inode *inode,
depth = ext_depth(inode);
if (!path[depth].p_ext)
goto out;
- b2 = le32_to_cpu(path[depth].p_ext->ee_block);
+ b2 = EXT4_LBLK_CMASK(sbi, le32_to_cpu(path[depth].p_ext->ee_block));
/*
* get the next allocated block if the extent in the path
@@ -1605,13 +1867,14 @@ static unsigned int ext4_ext_check_overlap(struct inode *inode,
*/
if (b2 < b1) {
b2 = ext4_ext_next_allocated_block(path);
- if (b2 == EXT_MAX_BLOCK)
+ if (b2 == EXT_MAX_BLOCKS)
goto out;
+ b2 = EXT4_LBLK_CMASK(sbi, b2);
}
/* check for wrap through zero on extent logical start block*/
if (b1 + len1 < b1) {
- len1 = EXT_MAX_BLOCK - b1;
+ len1 = EXT_MAX_BLOCKS - b1;
newext->ee_len = cpu_to_le16(len1);
ret = 1;
}
@@ -1633,7 +1896,7 @@ out:
*/
int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
struct ext4_ext_path *path,
- struct ext4_extent *newext, int flag)
+ struct ext4_extent *newext, int gb_flags)
{
struct ext4_extent_header *eh;
struct ext4_extent *ex, *fex;
@@ -1641,7 +1904,7 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
struct ext4_ext_path *npath = NULL;
int depth, len, err;
ext4_lblk_t next;
- unsigned uninitialized = 0;
+ int mb_flags = 0, unwritten;
if (unlikely(ext4_ext_get_actual_len(newext) == 0)) {
EXT4_ERROR_INODE(inode, "ext4_ext_get_actual_len(newext) == 0");
@@ -1649,42 +1912,88 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
}
depth = ext_depth(inode);
ex = path[depth].p_ext;
+ eh = path[depth].p_hdr;
if (unlikely(path[depth].p_hdr == NULL)) {
EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
return -EIO;
}
/* try to insert block into found extent and return */
- if (ex && !(flag & EXT4_GET_BLOCKS_PRE_IO)
- && ext4_can_extents_be_merged(inode, ex, newext)) {
- ext_debug("append [%d]%d block to %d:[%d]%d (from %llu)\n",
- ext4_ext_is_uninitialized(newext),
- ext4_ext_get_actual_len(newext),
- le32_to_cpu(ex->ee_block),
- ext4_ext_is_uninitialized(ex),
- ext4_ext_get_actual_len(ex),
- ext4_ext_pblock(ex));
- err = ext4_ext_get_access(handle, inode, path + depth);
- if (err)
- return err;
+ if (ex && !(gb_flags & EXT4_GET_BLOCKS_PRE_IO)) {
/*
- * ext4_can_extents_be_merged should have checked that either
- * both extents are uninitialized, or both aren't. Thus we
- * need to check only one of them here.
+ * Try to see whether we should rather test the extent on
+ * right from ex, or from the left of ex. This is because
+ * ext4_ext_find_extent() can return either extent on the
+ * left, or on the right from the searched position. This
+ * will make merging more effective.
*/
- if (ext4_ext_is_uninitialized(ex))
- uninitialized = 1;
- ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
+ if (ex < EXT_LAST_EXTENT(eh) &&
+ (le32_to_cpu(ex->ee_block) +
+ ext4_ext_get_actual_len(ex) <
+ le32_to_cpu(newext->ee_block))) {
+ ex += 1;
+ goto prepend;
+ } else if ((ex > EXT_FIRST_EXTENT(eh)) &&
+ (le32_to_cpu(newext->ee_block) +
+ ext4_ext_get_actual_len(newext) <
+ le32_to_cpu(ex->ee_block)))
+ ex -= 1;
+
+ /* Try to append newex to the ex */
+ if (ext4_can_extents_be_merged(inode, ex, newext)) {
+ ext_debug("append [%d]%d block to %u:[%d]%d"
+ "(from %llu)\n",
+ ext4_ext_is_unwritten(newext),
+ ext4_ext_get_actual_len(newext),
+ le32_to_cpu(ex->ee_block),
+ ext4_ext_is_unwritten(ex),
+ ext4_ext_get_actual_len(ex),
+ ext4_ext_pblock(ex));
+ err = ext4_ext_get_access(handle, inode,
+ path + depth);
+ if (err)
+ return err;
+ unwritten = ext4_ext_is_unwritten(ex);
+ ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
+ ext4_ext_get_actual_len(newext));
- if (uninitialized)
- ext4_ext_mark_uninitialized(ex);
- eh = path[depth].p_hdr;
- nearex = ex;
- goto merge;
+ if (unwritten)
+ ext4_ext_mark_unwritten(ex);
+ eh = path[depth].p_hdr;
+ nearex = ex;
+ goto merge;
+ }
+
+prepend:
+ /* Try to prepend newex to the ex */
+ if (ext4_can_extents_be_merged(inode, newext, ex)) {
+ ext_debug("prepend %u[%d]%d block to %u:[%d]%d"
+ "(from %llu)\n",
+ le32_to_cpu(newext->ee_block),
+ ext4_ext_is_unwritten(newext),
+ ext4_ext_get_actual_len(newext),
+ le32_to_cpu(ex->ee_block),
+ ext4_ext_is_unwritten(ex),
+ ext4_ext_get_actual_len(ex),
+ ext4_ext_pblock(ex));
+ err = ext4_ext_get_access(handle, inode,
+ path + depth);
+ if (err)
+ return err;
+
+ unwritten = ext4_ext_is_unwritten(ex);
+ ex->ee_block = newext->ee_block;
+ ext4_ext_store_pblock(ex, ext4_ext_pblock(newext));
+ ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
+ + ext4_ext_get_actual_len(newext));
+ if (unwritten)
+ ext4_ext_mark_unwritten(ex);
+ eh = path[depth].p_hdr;
+ nearex = ex;
+ goto merge;
+ }
}
-repeat:
depth = ext_depth(inode);
eh = path[depth].p_hdr;
if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max))
@@ -1692,21 +2001,22 @@ repeat:
/* probably next leaf has space for us? */
fex = EXT_LAST_EXTENT(eh);
- next = ext4_ext_next_leaf_block(inode, path);
- if (le32_to_cpu(newext->ee_block) > le32_to_cpu(fex->ee_block)
- && next != EXT_MAX_BLOCK) {
- ext_debug("next leaf block - %d\n", next);
+ next = EXT_MAX_BLOCKS;
+ if (le32_to_cpu(newext->ee_block) > le32_to_cpu(fex->ee_block))
+ next = ext4_ext_next_leaf_block(path);
+ if (next != EXT_MAX_BLOCKS) {
+ ext_debug("next leaf block - %u\n", next);
BUG_ON(npath != NULL);
- npath = ext4_ext_find_extent(inode, next, NULL);
+ npath = ext4_ext_find_extent(inode, next, NULL, 0);
if (IS_ERR(npath))
return PTR_ERR(npath);
BUG_ON(npath->p_depth != path->p_depth);
eh = npath[depth].p_hdr;
if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max)) {
- ext_debug("next leaf isnt full(%d)\n",
+ ext_debug("next leaf isn't full(%d)\n",
le16_to_cpu(eh->eh_entries));
path = npath;
- goto repeat;
+ goto has_space;
}
ext_debug("next leaf has no free space(%d,%d)\n",
le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max));
@@ -1716,7 +2026,10 @@ repeat:
* There is no free space in the found leaf.
* We're gonna add a new leaf in the tree.
*/
- err = ext4_ext_create_new_leaf(handle, inode, path, newext);
+ if (gb_flags & EXT4_GET_BLOCKS_METADATA_NOFAIL)
+ mb_flags = EXT4_MB_USE_RESERVED;
+ err = ext4_ext_create_new_leaf(handle, inode, mb_flags, gb_flags,
+ path, newext);
if (err)
goto cleanup;
depth = ext_depth(inode);
@@ -1731,94 +2044,103 @@ has_space:
if (!nearex) {
/* there is no extent in this leaf, create first one */
- ext_debug("first extent in the leaf: %d:%llu:[%d]%d\n",
+ ext_debug("first extent in the leaf: %u:%llu:[%d]%d\n",
le32_to_cpu(newext->ee_block),
ext4_ext_pblock(newext),
- ext4_ext_is_uninitialized(newext),
+ ext4_ext_is_unwritten(newext),
ext4_ext_get_actual_len(newext));
- path[depth].p_ext = EXT_FIRST_EXTENT(eh);
- } else if (le32_to_cpu(newext->ee_block)
+ nearex = EXT_FIRST_EXTENT(eh);
+ } else {
+ if (le32_to_cpu(newext->ee_block)
> le32_to_cpu(nearex->ee_block)) {
-/* BUG_ON(newext->ee_block == nearex->ee_block); */
- if (nearex != EXT_LAST_EXTENT(eh)) {
- len = EXT_MAX_EXTENT(eh) - nearex;
- len = (len - 1) * sizeof(struct ext4_extent);
- len = len < 0 ? 0 : len;
- ext_debug("insert %d:%llu:[%d]%d after: nearest 0x%p, "
- "move %d from 0x%p to 0x%p\n",
+ /* Insert after */
+ ext_debug("insert %u:%llu:[%d]%d before: "
+ "nearest %p\n",
le32_to_cpu(newext->ee_block),
ext4_ext_pblock(newext),
- ext4_ext_is_uninitialized(newext),
+ ext4_ext_is_unwritten(newext),
ext4_ext_get_actual_len(newext),
- nearex, len, nearex + 1, nearex + 2);
- memmove(nearex + 2, nearex + 1, len);
+ nearex);
+ nearex++;
+ } else {
+ /* Insert before */
+ BUG_ON(newext->ee_block == nearex->ee_block);
+ ext_debug("insert %u:%llu:[%d]%d after: "
+ "nearest %p\n",
+ le32_to_cpu(newext->ee_block),
+ ext4_ext_pblock(newext),
+ ext4_ext_is_unwritten(newext),
+ ext4_ext_get_actual_len(newext),
+ nearex);
+ }
+ len = EXT_LAST_EXTENT(eh) - nearex + 1;
+ if (len > 0) {
+ ext_debug("insert %u:%llu:[%d]%d: "
+ "move %d extents from 0x%p to 0x%p\n",
+ le32_to_cpu(newext->ee_block),
+ ext4_ext_pblock(newext),
+ ext4_ext_is_unwritten(newext),
+ ext4_ext_get_actual_len(newext),
+ len, nearex, nearex + 1);
+ memmove(nearex + 1, nearex,
+ len * sizeof(struct ext4_extent));
}
- path[depth].p_ext = nearex + 1;
- } else {
- BUG_ON(newext->ee_block == nearex->ee_block);
- len = (EXT_MAX_EXTENT(eh) - nearex) * sizeof(struct ext4_extent);
- len = len < 0 ? 0 : len;
- ext_debug("insert %d:%llu:[%d]%d before: nearest 0x%p, "
- "move %d from 0x%p to 0x%p\n",
- le32_to_cpu(newext->ee_block),
- ext4_ext_pblock(newext),
- ext4_ext_is_uninitialized(newext),
- ext4_ext_get_actual_len(newext),
- nearex, len, nearex + 1, nearex + 2);
- memmove(nearex + 1, nearex, len);
- path[depth].p_ext = nearex;
}
le16_add_cpu(&eh->eh_entries, 1);
- nearex = path[depth].p_ext;
+ path[depth].p_ext = nearex;
nearex->ee_block = newext->ee_block;
ext4_ext_store_pblock(nearex, ext4_ext_pblock(newext));
nearex->ee_len = newext->ee_len;
merge:
- /* try to merge extents to the right */
- if (!(flag & EXT4_GET_BLOCKS_PRE_IO))
- ext4_ext_try_to_merge(inode, path, nearex);
+ /* try to merge extents */
+ if (!(gb_flags & EXT4_GET_BLOCKS_PRE_IO))
+ ext4_ext_try_to_merge(handle, inode, path, nearex);
- /* try to merge extents to the left */
/* time to correct all indexes above */
err = ext4_ext_correct_indexes(handle, inode, path);
if (err)
goto cleanup;
- err = ext4_ext_dirty(handle, inode, path + depth);
+ err = ext4_ext_dirty(handle, inode, path + path->p_depth);
cleanup:
if (npath) {
ext4_ext_drop_refs(npath);
kfree(npath);
}
- ext4_ext_invalidate_cache(inode);
return err;
}
-static int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block,
- ext4_lblk_t num, ext_prepare_callback func,
- void *cbdata)
+static int ext4_fill_fiemap_extents(struct inode *inode,
+ ext4_lblk_t block, ext4_lblk_t num,
+ struct fiemap_extent_info *fieinfo)
{
struct ext4_ext_path *path = NULL;
- struct ext4_ext_cache cbex;
struct ext4_extent *ex;
- ext4_lblk_t next, start = 0, end = 0;
+ struct extent_status es;
+ ext4_lblk_t next, next_del, start = 0, end = 0;
ext4_lblk_t last = block + num;
- int depth, exists, err = 0;
-
- BUG_ON(func == NULL);
- BUG_ON(inode == NULL);
+ int exists, depth = 0, err = 0;
+ unsigned int flags = 0;
+ unsigned char blksize_bits = inode->i_sb->s_blocksize_bits;
- while (block < last && block != EXT_MAX_BLOCK) {
+ while (block < last && block != EXT_MAX_BLOCKS) {
num = last - block;
/* find extent for this block */
down_read(&EXT4_I(inode)->i_data_sem);
- path = ext4_ext_find_extent(inode, block, path);
- up_read(&EXT4_I(inode)->i_data_sem);
+
+ if (path && ext_depth(inode) != depth) {
+ /* depth was changed. we have to realloc path */
+ kfree(path);
+ path = NULL;
+ }
+
+ path = ext4_ext_find_extent(inode, block, path, 0);
if (IS_ERR(path)) {
+ up_read(&EXT4_I(inode)->i_data_sem);
err = PTR_ERR(path);
path = NULL;
break;
@@ -1826,13 +2148,16 @@ static int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block,
depth = ext_depth(inode);
if (unlikely(path[depth].p_hdr == NULL)) {
+ up_read(&EXT4_I(inode)->i_data_sem);
EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
err = -EIO;
break;
}
ex = path[depth].p_ext;
next = ext4_ext_next_allocated_block(path);
+ ext4_ext_drop_refs(path);
+ flags = 0;
exists = 0;
if (!ex) {
/* there is no extent yet, so try to allocate
@@ -1869,42 +2194,75 @@ static int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block,
BUG_ON(end <= start);
if (!exists) {
- cbex.ec_block = start;
- cbex.ec_len = end - start;
- cbex.ec_start = 0;
- cbex.ec_type = EXT4_EXT_CACHE_GAP;
+ es.es_lblk = start;
+ es.es_len = end - start;
+ es.es_pblk = 0;
} else {
- cbex.ec_block = le32_to_cpu(ex->ee_block);
- cbex.ec_len = ext4_ext_get_actual_len(ex);
- cbex.ec_start = ext4_ext_pblock(ex);
- cbex.ec_type = EXT4_EXT_CACHE_EXTENT;
+ es.es_lblk = le32_to_cpu(ex->ee_block);
+ es.es_len = ext4_ext_get_actual_len(ex);
+ es.es_pblk = ext4_ext_pblock(ex);
+ if (ext4_ext_is_unwritten(ex))
+ flags |= FIEMAP_EXTENT_UNWRITTEN;
}
- if (unlikely(cbex.ec_len == 0)) {
- EXT4_ERROR_INODE(inode, "cbex.ec_len == 0");
- err = -EIO;
- break;
+ /*
+ * Find delayed extent and update es accordingly. We call
+ * it even in !exists case to find out whether es is the
+ * last existing extent or not.
+ */
+ next_del = ext4_find_delayed_extent(inode, &es);
+ if (!exists && next_del) {
+ exists = 1;
+ flags |= (FIEMAP_EXTENT_DELALLOC |
+ FIEMAP_EXTENT_UNKNOWN);
}
- err = func(inode, path, &cbex, ex, cbdata);
- ext4_ext_drop_refs(path);
+ up_read(&EXT4_I(inode)->i_data_sem);
- if (err < 0)
+ if (unlikely(es.es_len == 0)) {
+ EXT4_ERROR_INODE(inode, "es.es_len == 0");
+ err = -EIO;
break;
+ }
- if (err == EXT_REPEAT)
- continue;
- else if (err == EXT_BREAK) {
- err = 0;
- break;
+ /*
+ * This is possible iff next == next_del == EXT_MAX_BLOCKS.
+ * we need to check next == EXT_MAX_BLOCKS because it is
+ * possible that an extent is with unwritten and delayed
+ * status due to when an extent is delayed allocated and
+ * is allocated by fallocate status tree will track both of
+ * them in a extent.
+ *
+ * So we could return a unwritten and delayed extent, and
+ * its block is equal to 'next'.
+ */
+ if (next == next_del && next == EXT_MAX_BLOCKS) {
+ flags |= FIEMAP_EXTENT_LAST;
+ if (unlikely(next_del != EXT_MAX_BLOCKS ||
+ next != EXT_MAX_BLOCKS)) {
+ EXT4_ERROR_INODE(inode,
+ "next extent == %u, next "
+ "delalloc extent = %u",
+ next, next_del);
+ err = -EIO;
+ break;
+ }
}
- if (ext_depth(inode) != depth) {
- /* depth was changed. we have to realloc path */
- kfree(path);
- path = NULL;
+ if (exists) {
+ err = fiemap_fill_next_extent(fieinfo,
+ (__u64)es.es_lblk << blksize_bits,
+ (__u64)es.es_pblk << blksize_bits,
+ (__u64)es.es_len << blksize_bits,
+ flags);
+ if (err < 0)
+ break;
+ if (err == 1) {
+ err = 0;
+ break;
+ }
}
- block = cbex.ec_block + cbex.ec_len;
+ block = es.es_lblk + es.es_len;
}
if (path) {
@@ -1915,21 +2273,6 @@ static int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block,
return err;
}
-static void
-ext4_ext_put_in_cache(struct inode *inode, ext4_lblk_t block,
- __u32 len, ext4_fsblk_t start, int type)
-{
- struct ext4_ext_cache *cex;
- BUG_ON(len == 0);
- spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
- cex = &EXT4_I(inode)->i_cached_extent;
- cex->ec_type = type;
- cex->ec_block = block;
- cex->ec_len = len;
- cex->ec_start = start;
- spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
-}
-
/*
* ext4_ext_put_gap_in_cache:
* calculate boundaries of the gap that the requested block fits into
@@ -1940,15 +2283,16 @@ ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path,
ext4_lblk_t block)
{
int depth = ext_depth(inode);
- unsigned long len;
- ext4_lblk_t lblock;
+ unsigned long len = 0;
+ ext4_lblk_t lblock = 0;
struct ext4_extent *ex;
ex = path[depth].p_ext;
if (ex == NULL) {
- /* there is no extent yet, so gap is [0;-] */
- lblock = 0;
- len = EXT_MAX_BLOCK;
+ /*
+ * there is no extent yet, so gap is [0;-] and we
+ * don't cache it
+ */
ext_debug("cache gap(whole file):");
} else if (block < le32_to_cpu(ex->ee_block)) {
lblock = block;
@@ -1957,6 +2301,9 @@ ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path,
block,
le32_to_cpu(ex->ee_block),
ext4_ext_get_actual_len(ex));
+ if (!ext4_find_delalloc_range(inode, lblock, lblock + len - 1))
+ ext4_es_insert_extent(inode, lblock, len, ~0,
+ EXTENT_STATUS_HOLE);
} else if (block >= le32_to_cpu(ex->ee_block)
+ ext4_ext_get_actual_len(ex)) {
ext4_lblk_t next;
@@ -1970,62 +2317,29 @@ ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path,
block);
BUG_ON(next == lblock);
len = next - lblock;
+ if (!ext4_find_delalloc_range(inode, lblock, lblock + len - 1))
+ ext4_es_insert_extent(inode, lblock, len, ~0,
+ EXTENT_STATUS_HOLE);
} else {
- lblock = len = 0;
BUG();
}
ext_debug(" -> %u:%lu\n", lblock, len);
- ext4_ext_put_in_cache(inode, lblock, len, 0, EXT4_EXT_CACHE_GAP);
-}
-
-static int
-ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block,
- struct ext4_extent *ex)
-{
- struct ext4_ext_cache *cex;
- int ret = EXT4_EXT_CACHE_NO;
-
- /*
- * We borrow i_block_reservation_lock to protect i_cached_extent
- */
- spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
- cex = &EXT4_I(inode)->i_cached_extent;
-
- /* has cache valid data? */
- if (cex->ec_type == EXT4_EXT_CACHE_NO)
- goto errout;
-
- BUG_ON(cex->ec_type != EXT4_EXT_CACHE_GAP &&
- cex->ec_type != EXT4_EXT_CACHE_EXTENT);
- if (in_range(block, cex->ec_block, cex->ec_len)) {
- ex->ee_block = cpu_to_le32(cex->ec_block);
- ext4_ext_store_pblock(ex, cex->ec_start);
- ex->ee_len = cpu_to_le16(cex->ec_len);
- ext_debug("%u cached by %u:%u:%llu\n",
- block,
- cex->ec_block, cex->ec_len, cex->ec_start);
- ret = cex->ec_type;
- }
-errout:
- spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
- return ret;
}
/*
* ext4_ext_rm_idx:
* removes index from the index block.
- * It's used in truncate case only, thus all requests are for
- * last index in the block only.
*/
static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode,
- struct ext4_ext_path *path)
+ struct ext4_ext_path *path, int depth)
{
int err;
ext4_fsblk_t leaf;
/* free index block */
- path--;
+ depth--;
+ path = path + depth;
leaf = ext4_idx_pblock(path->p_idx);
if (unlikely(path->p_hdr->eh_entries == 0)) {
EXT4_ERROR_INODE(inode, "path->p_hdr->eh_entries == 0");
@@ -2034,13 +2348,35 @@ static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode,
err = ext4_ext_get_access(handle, inode, path);
if (err)
return err;
+
+ if (path->p_idx != EXT_LAST_INDEX(path->p_hdr)) {
+ int len = EXT_LAST_INDEX(path->p_hdr) - path->p_idx;
+ len *= sizeof(struct ext4_extent_idx);
+ memmove(path->p_idx, path->p_idx + 1, len);
+ }
+
le16_add_cpu(&path->p_hdr->eh_entries, -1);
err = ext4_ext_dirty(handle, inode, path);
if (err)
return err;
ext_debug("index is empty, remove it, free block %llu\n", leaf);
- ext4_free_blocks(handle, inode, 0, leaf, 1,
+ trace_ext4_ext_rm_idx(inode, leaf);
+
+ ext4_free_blocks(handle, inode, NULL, leaf, 1,
EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET);
+
+ while (--depth >= 0) {
+ if (path->p_idx != EXT_FIRST_INDEX(path->p_hdr))
+ break;
+ path--;
+ err = ext4_ext_get_access(handle, inode, path);
+ if (err)
+ break;
+ path->p_idx->ei_block = (path+1)->p_idx->ei_block;
+ err = ext4_ext_dirty(handle, inode, path);
+ if (err)
+ break;
+ }
return err;
}
@@ -2067,7 +2403,7 @@ int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int nrblocks,
* need to account for leaf block credit
*
* bitmaps and block group descriptor blocks
- * and other metadat blocks still need to be
+ * and other metadata blocks still need to be
* accounted.
*/
/* 1 bitmap, 1 block group descriptor */
@@ -2080,22 +2416,26 @@ int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int nrblocks,
}
/*
- * How many index/leaf blocks need to change/allocate to modify nrblocks?
+ * How many index/leaf blocks need to change/allocate to add @extents extents?
*
- * if nrblocks are fit in a single extent (chunk flag is 1), then
- * in the worse case, each tree level index/leaf need to be changed
- * if the tree split due to insert a new extent, then the old tree
- * index/leaf need to be updated too
+ * If we add a single extent, then in the worse case, each tree level
+ * index/leaf need to be changed in case of the tree split.
*
- * If the nrblocks are discontiguous, they could cause
- * the whole tree split more than once, but this is really rare.
+ * If more extents are inserted, they could cause the whole tree split more
+ * than once, but this is really rare.
*/
-int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
+int ext4_ext_index_trans_blocks(struct inode *inode, int extents)
{
int index;
- int depth = ext_depth(inode);
+ int depth;
+
+ /* If we are converting the inline data, only one is needed here. */
+ if (ext4_has_inline_data(inode))
+ return 1;
- if (chunk)
+ depth = ext_depth(inode);
+
+ if (extents <= 1)
index = depth * 2;
else
index = depth * 3;
@@ -2103,15 +2443,49 @@ int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
return index;
}
+static inline int get_default_free_blocks_flags(struct inode *inode)
+{
+ if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
+ return EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET;
+ else if (ext4_should_journal_data(inode))
+ return EXT4_FREE_BLOCKS_FORGET;
+ return 0;
+}
+
static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
- struct ext4_extent *ex,
- ext4_lblk_t from, ext4_lblk_t to)
+ struct ext4_extent *ex,
+ long long *partial_cluster,
+ ext4_lblk_t from, ext4_lblk_t to)
{
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
unsigned short ee_len = ext4_ext_get_actual_len(ex);
- int flags = EXT4_FREE_BLOCKS_FORGET;
+ ext4_fsblk_t pblk;
+ int flags = get_default_free_blocks_flags(inode);
+
+ /*
+ * For bigalloc file systems, we never free a partial cluster
+ * at the beginning of the extent. Instead, we make a note
+ * that we tried freeing the cluster, and check to see if we
+ * need to free it on a subsequent call to ext4_remove_blocks,
+ * or at the end of the ext4_truncate() operation.
+ */
+ flags |= EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER;
+
+ trace_ext4_remove_blocks(inode, ex, from, to, *partial_cluster);
+ /*
+ * If we have a partial cluster, and it's different from the
+ * cluster of the last block, we need to explicitly free the
+ * partial cluster here.
+ */
+ pblk = ext4_ext_pblock(ex) + ee_len - 1;
+ if ((*partial_cluster > 0) &&
+ (EXT4_B2C(sbi, pblk) != *partial_cluster)) {
+ ext4_free_blocks(handle, inode, NULL,
+ EXT4_C2B(sbi, *partial_cluster),
+ sbi->s_cluster_ratio, flags);
+ *partial_cluster = 0;
+ }
- if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
- flags |= EXT4_FREE_BLOCKS_METADATA;
#ifdef EXTENTS_STATS
{
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
@@ -2131,40 +2505,84 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
&& to == le32_to_cpu(ex->ee_block) + ee_len - 1) {
/* tail removal */
ext4_lblk_t num;
- ext4_fsblk_t start;
+ unsigned int unaligned;
num = le32_to_cpu(ex->ee_block) + ee_len - from;
- start = ext4_ext_pblock(ex) + ee_len - num;
- ext_debug("free last %u blocks starting %llu\n", num, start);
- ext4_free_blocks(handle, inode, 0, start, num, flags);
- } else if (from == le32_to_cpu(ex->ee_block)
- && to <= le32_to_cpu(ex->ee_block) + ee_len - 1) {
- printk(KERN_INFO "strange request: removal %u-%u from %u:%u\n",
- from, to, le32_to_cpu(ex->ee_block), ee_len);
- } else {
- printk(KERN_INFO "strange request: removal(2) "
- "%u-%u from %u:%u\n",
- from, to, le32_to_cpu(ex->ee_block), ee_len);
- }
+ pblk = ext4_ext_pblock(ex) + ee_len - num;
+ /*
+ * Usually we want to free partial cluster at the end of the
+ * extent, except for the situation when the cluster is still
+ * used by any other extent (partial_cluster is negative).
+ */
+ if (*partial_cluster < 0 &&
+ -(*partial_cluster) == EXT4_B2C(sbi, pblk + num - 1))
+ flags |= EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER;
+
+ ext_debug("free last %u blocks starting %llu partial %lld\n",
+ num, pblk, *partial_cluster);
+ ext4_free_blocks(handle, inode, NULL, pblk, num, flags);
+ /*
+ * If the block range to be freed didn't start at the
+ * beginning of a cluster, and we removed the entire
+ * extent and the cluster is not used by any other extent,
+ * save the partial cluster here, since we might need to
+ * delete if we determine that the truncate operation has
+ * removed all of the blocks in the cluster.
+ *
+ * On the other hand, if we did not manage to free the whole
+ * extent, we have to mark the cluster as used (store negative
+ * cluster number in partial_cluster).
+ */
+ unaligned = EXT4_PBLK_COFF(sbi, pblk);
+ if (unaligned && (ee_len == num) &&
+ (*partial_cluster != -((long long)EXT4_B2C(sbi, pblk))))
+ *partial_cluster = EXT4_B2C(sbi, pblk);
+ else if (unaligned)
+ *partial_cluster = -((long long)EXT4_B2C(sbi, pblk));
+ else if (*partial_cluster > 0)
+ *partial_cluster = 0;
+ } else
+ ext4_error(sbi->s_sb, "strange request: removal(2) "
+ "%u-%u from %u:%u\n",
+ from, to, le32_to_cpu(ex->ee_block), ee_len);
return 0;
}
+
+/*
+ * ext4_ext_rm_leaf() Removes the extents associated with the
+ * blocks appearing between "start" and "end", and splits the extents
+ * if "start" and "end" appear in the same extent
+ *
+ * @handle: The journal handle
+ * @inode: The files inode
+ * @path: The path to the leaf
+ * @partial_cluster: The cluster which we'll have to free if all extents
+ * has been released from it. It gets negative in case
+ * that the cluster is still used.
+ * @start: The first block to remove
+ * @end: The last block to remove
+ */
static int
ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
- struct ext4_ext_path *path, ext4_lblk_t start)
+ struct ext4_ext_path *path,
+ long long *partial_cluster,
+ ext4_lblk_t start, ext4_lblk_t end)
{
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
int err = 0, correct_index = 0;
int depth = ext_depth(inode), credits;
struct ext4_extent_header *eh;
- ext4_lblk_t a, b, block;
+ ext4_lblk_t a, b;
unsigned num;
ext4_lblk_t ex_ee_block;
unsigned short ex_ee_len;
- unsigned uninitialized = 0;
+ unsigned unwritten = 0;
struct ext4_extent *ex;
+ ext4_fsblk_t pblk;
/* the header must be checked already in ext4_ext_remove_space() */
- ext_debug("truncate since %u in leaf\n", start);
+ ext_debug("truncate since %u in leaf to %u\n", start, end);
if (!path[depth].p_hdr)
path[depth].p_hdr = ext_block_hdr(path[depth].p_bh);
eh = path[depth].p_hdr;
@@ -2173,51 +2591,85 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
return -EIO;
}
/* find where to start removing */
- ex = EXT_LAST_EXTENT(eh);
+ ex = path[depth].p_ext;
+ if (!ex)
+ ex = EXT_LAST_EXTENT(eh);
ex_ee_block = le32_to_cpu(ex->ee_block);
ex_ee_len = ext4_ext_get_actual_len(ex);
+ /*
+ * If we're starting with an extent other than the last one in the
+ * node, we need to see if it shares a cluster with the extent to
+ * the right (towards the end of the file). If its leftmost cluster
+ * is this extent's rightmost cluster and it is not cluster aligned,
+ * we'll mark it as a partial that is not to be deallocated.
+ */
+
+ if (ex != EXT_LAST_EXTENT(eh)) {
+ ext4_fsblk_t current_pblk, right_pblk;
+ long long current_cluster, right_cluster;
+
+ current_pblk = ext4_ext_pblock(ex) + ex_ee_len - 1;
+ current_cluster = (long long)EXT4_B2C(sbi, current_pblk);
+ right_pblk = ext4_ext_pblock(ex + 1);
+ right_cluster = (long long)EXT4_B2C(sbi, right_pblk);
+ if (current_cluster == right_cluster &&
+ EXT4_PBLK_COFF(sbi, right_pblk))
+ *partial_cluster = -right_cluster;
+ }
+
+ trace_ext4_ext_rm_leaf(inode, start, ex, *partial_cluster);
+
while (ex >= EXT_FIRST_EXTENT(eh) &&
ex_ee_block + ex_ee_len > start) {
- if (ext4_ext_is_uninitialized(ex))
- uninitialized = 1;
+ if (ext4_ext_is_unwritten(ex))
+ unwritten = 1;
else
- uninitialized = 0;
+ unwritten = 0;
ext_debug("remove ext %u:[%d]%d\n", ex_ee_block,
- uninitialized, ex_ee_len);
+ unwritten, ex_ee_len);
path[depth].p_ext = ex;
a = ex_ee_block > start ? ex_ee_block : start;
- b = ex_ee_block + ex_ee_len - 1 < EXT_MAX_BLOCK ?
- ex_ee_block + ex_ee_len - 1 : EXT_MAX_BLOCK;
+ b = ex_ee_block+ex_ee_len - 1 < end ?
+ ex_ee_block+ex_ee_len - 1 : end;
ext_debug(" border %u:%u\n", a, b);
- if (a != ex_ee_block && b != ex_ee_block + ex_ee_len - 1) {
- block = 0;
- num = 0;
- BUG();
+ /* If this extent is beyond the end of the hole, skip it */
+ if (end < ex_ee_block) {
+ /*
+ * We're going to skip this extent and move to another,
+ * so if this extent is not cluster aligned we have
+ * to mark the current cluster as used to avoid
+ * accidentally freeing it later on
+ */
+ pblk = ext4_ext_pblock(ex);
+ if (EXT4_PBLK_COFF(sbi, pblk))
+ *partial_cluster =
+ -((long long)EXT4_B2C(sbi, pblk));
+ ex--;
+ ex_ee_block = le32_to_cpu(ex->ee_block);
+ ex_ee_len = ext4_ext_get_actual_len(ex);
+ continue;
+ } else if (b != ex_ee_block + ex_ee_len - 1) {
+ EXT4_ERROR_INODE(inode,
+ "can not handle truncate %u:%u "
+ "on extent %u:%u",
+ start, end, ex_ee_block,
+ ex_ee_block + ex_ee_len - 1);
+ err = -EIO;
+ goto out;
} else if (a != ex_ee_block) {
/* remove tail of the extent */
- block = ex_ee_block;
- num = a - block;
- } else if (b != ex_ee_block + ex_ee_len - 1) {
- /* remove head of the extent */
- block = a;
- num = b - a;
- /* there is no "make a hole" API yet */
- BUG();
+ num = a - ex_ee_block;
} else {
/* remove whole extent: excellent! */
- block = ex_ee_block;
num = 0;
- BUG_ON(a != ex_ee_block);
- BUG_ON(b != ex_ee_block + ex_ee_len - 1);
}
-
/*
* 3 for leaf, sb, and inode plus 2 (bmap and group
* descriptor) for each block group; assume two block
@@ -2239,30 +2691,49 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
if (err)
goto out;
- err = ext4_remove_blocks(handle, inode, ex, a, b);
+ err = ext4_remove_blocks(handle, inode, ex, partial_cluster,
+ a, b);
if (err)
goto out;
- if (num == 0) {
+ if (num == 0)
/* this extent is removed; mark slot entirely unused */
ext4_ext_store_pblock(ex, 0);
- le16_add_cpu(&eh->eh_entries, -1);
- }
- ex->ee_block = cpu_to_le32(block);
ex->ee_len = cpu_to_le16(num);
/*
- * Do not mark uninitialized if all the blocks in the
+ * Do not mark unwritten if all the blocks in the
* extent have been removed.
*/
- if (uninitialized && num)
- ext4_ext_mark_uninitialized(ex);
+ if (unwritten && num)
+ ext4_ext_mark_unwritten(ex);
+ /*
+ * If the extent was completely released,
+ * we need to remove it from the leaf
+ */
+ if (num == 0) {
+ if (end != EXT_MAX_BLOCKS - 1) {
+ /*
+ * For hole punching, we need to scoot all the
+ * extents up when an extent is removed so that
+ * we dont have blank extents in the middle
+ */
+ memmove(ex, ex+1, (EXT_LAST_EXTENT(eh) - ex) *
+ sizeof(struct ext4_extent));
+
+ /* Now get rid of the one at the end */
+ memset(EXT_LAST_EXTENT(eh), 0,
+ sizeof(struct ext4_extent));
+ }
+ le16_add_cpu(&eh->eh_entries, -1);
+ } else if (*partial_cluster > 0)
+ *partial_cluster = 0;
err = ext4_ext_dirty(handle, inode, path + depth);
if (err)
goto out;
- ext_debug("new extent: %u:%u:%llu\n", block, num,
+ ext_debug("new extent: %u:%u:%llu\n", ex_ee_block, num,
ext4_ext_pblock(ex));
ex--;
ex_ee_block = le32_to_cpu(ex->ee_block);
@@ -2272,10 +2743,30 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
if (correct_index && eh->eh_entries)
err = ext4_ext_correct_indexes(handle, inode, path);
+ /*
+ * If there's a partial cluster and at least one extent remains in
+ * the leaf, free the partial cluster if it isn't shared with the
+ * current extent. If there's a partial cluster and no extents
+ * remain in the leaf, it can't be freed here. It can only be
+ * freed when it's possible to determine if it's not shared with
+ * any other extent - when the next leaf is processed or when space
+ * removal is complete.
+ */
+ if (*partial_cluster > 0 && eh->eh_entries &&
+ (EXT4_B2C(sbi, ext4_ext_pblock(ex) + ex_ee_len - 1) !=
+ *partial_cluster)) {
+ int flags = get_default_free_blocks_flags(inode);
+
+ ext4_free_blocks(handle, inode, NULL,
+ EXT4_C2B(sbi, *partial_cluster),
+ sbi->s_cluster_ratio, flags);
+ *partial_cluster = 0;
+ }
+
/* if this leaf is free, then we should
* remove it from index block above */
if (err == 0 && eh->eh_entries == 0 && path[depth].p_bh != NULL)
- err = ext4_ext_rm_idx(handle, inode, path + depth);
+ err = ext4_ext_rm_idx(handle, inode, path, depth);
out:
return err;
@@ -2302,46 +2793,122 @@ ext4_ext_more_to_rm(struct ext4_ext_path *path)
return 1;
}
-static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
+int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
+ ext4_lblk_t end)
{
struct super_block *sb = inode->i_sb;
int depth = ext_depth(inode);
- struct ext4_ext_path *path;
+ struct ext4_ext_path *path = NULL;
+ long long partial_cluster = 0;
handle_t *handle;
- int i, err;
+ int i = 0, err = 0;
- ext_debug("truncate since %u\n", start);
+ ext_debug("truncate since %u to %u\n", start, end);
/* probably first extent we're gonna free will be last in block */
- handle = ext4_journal_start(inode, depth + 1);
+ handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, depth + 1);
if (IS_ERR(handle))
return PTR_ERR(handle);
again:
- ext4_ext_invalidate_cache(inode);
+ trace_ext4_ext_remove_space(inode, start, end, depth);
/*
+ * Check if we are removing extents inside the extent tree. If that
+ * is the case, we are going to punch a hole inside the extent tree
+ * so we have to check whether we need to split the extent covering
+ * the last block to remove so we can easily remove the part of it
+ * in ext4_ext_rm_leaf().
+ */
+ if (end < EXT_MAX_BLOCKS - 1) {
+ struct ext4_extent *ex;
+ ext4_lblk_t ee_block;
+
+ /* find extent for this block */
+ path = ext4_ext_find_extent(inode, end, NULL, EXT4_EX_NOCACHE);
+ if (IS_ERR(path)) {
+ ext4_journal_stop(handle);
+ return PTR_ERR(path);
+ }
+ depth = ext_depth(inode);
+ /* Leaf not may not exist only if inode has no blocks at all */
+ ex = path[depth].p_ext;
+ if (!ex) {
+ if (depth) {
+ EXT4_ERROR_INODE(inode,
+ "path[%d].p_hdr == NULL",
+ depth);
+ err = -EIO;
+ }
+ goto out;
+ }
+
+ ee_block = le32_to_cpu(ex->ee_block);
+
+ /*
+ * See if the last block is inside the extent, if so split
+ * the extent at 'end' block so we can easily remove the
+ * tail of the first part of the split extent in
+ * ext4_ext_rm_leaf().
+ */
+ if (end >= ee_block &&
+ end < ee_block + ext4_ext_get_actual_len(ex) - 1) {
+ int split_flag = 0;
+
+ if (ext4_ext_is_unwritten(ex))
+ split_flag = EXT4_EXT_MARK_UNWRIT1 |
+ EXT4_EXT_MARK_UNWRIT2;
+
+ /*
+ * Split the extent in two so that 'end' is the last
+ * block in the first new extent. Also we should not
+ * fail removing space due to ENOSPC so try to use
+ * reserved block if that happens.
+ */
+ err = ext4_split_extent_at(handle, inode, path,
+ end + 1, split_flag,
+ EXT4_EX_NOCACHE |
+ EXT4_GET_BLOCKS_PRE_IO |
+ EXT4_GET_BLOCKS_METADATA_NOFAIL);
+
+ if (err < 0)
+ goto out;
+ }
+ }
+ /*
* We start scanning from right side, freeing all the blocks
* after i_size and walking into the tree depth-wise.
*/
depth = ext_depth(inode);
- path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 1), GFP_NOFS);
- if (path == NULL) {
- ext4_journal_stop(handle);
- return -ENOMEM;
- }
- path[0].p_depth = depth;
- path[0].p_hdr = ext_inode_hdr(inode);
- if (ext4_ext_check(inode, path[0].p_hdr, depth)) {
- err = -EIO;
- goto out;
+ if (path) {
+ int k = i = depth;
+ while (--k > 0)
+ path[k].p_block =
+ le16_to_cpu(path[k].p_hdr->eh_entries)+1;
+ } else {
+ path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 1),
+ GFP_NOFS);
+ if (path == NULL) {
+ ext4_journal_stop(handle);
+ return -ENOMEM;
+ }
+ path[0].p_depth = depth;
+ path[0].p_hdr = ext_inode_hdr(inode);
+ i = 0;
+
+ if (ext4_ext_check(inode, path[0].p_hdr, depth, 0)) {
+ err = -EIO;
+ goto out;
+ }
}
- i = err = 0;
+ err = 0;
while (i >= 0 && err == 0) {
if (i == depth) {
/* this is leaf block */
- err = ext4_ext_rm_leaf(handle, inode, path, start);
+ err = ext4_ext_rm_leaf(handle, inode, path,
+ &partial_cluster, start,
+ end);
/* root level has p_bh == NULL, brelse() eats this */
brelse(path[i].p_bh);
path[i].p_bh = NULL;
@@ -2376,21 +2943,21 @@ again:
ext_debug("move to level %d (block %llu)\n",
i + 1, ext4_idx_pblock(path[i].p_idx));
memset(path + i + 1, 0, sizeof(*path));
- bh = sb_bread(sb, ext4_idx_pblock(path[i].p_idx));
- if (!bh) {
+ bh = read_extent_tree_block(inode,
+ ext4_idx_pblock(path[i].p_idx), depth - i - 1,
+ EXT4_EX_NOCACHE);
+ if (IS_ERR(bh)) {
/* should we reset i_size? */
- err = -EIO;
+ err = PTR_ERR(bh);
break;
}
+ /* Yield here to deal with large extent trees.
+ * Should be a no-op if we did IO above. */
+ cond_resched();
if (WARN_ON(i + 1 > depth)) {
err = -EIO;
break;
}
- if (ext4_ext_check(inode, ext_block_hdr(bh),
- depth - i - 1)) {
- err = -EIO;
- break;
- }
path[i + 1].p_bh = bh;
/* save actual number of indexes since this
@@ -2403,7 +2970,7 @@ again:
/* index is empty, remove it;
* handle must be already prepared by the
* truncatei_leaf() */
- err = ext4_ext_rm_idx(handle, inode, path + i);
+ err = ext4_ext_rm_idx(handle, inode, path, i);
}
/* root level has p_bh == NULL, brelse() eats this */
brelse(path[i].p_bh);
@@ -2413,6 +2980,21 @@ again:
}
}
+ trace_ext4_ext_remove_space_done(inode, start, end, depth,
+ partial_cluster, path->p_hdr->eh_entries);
+
+ /* If we still have something in the partial cluster and we have removed
+ * even the first extent, then we should free the blocks in the partial
+ * cluster as well. */
+ if (partial_cluster > 0 && path->p_hdr->eh_entries == 0) {
+ int flags = get_default_free_blocks_flags(inode);
+
+ ext4_free_blocks(handle, inode, NULL,
+ EXT4_C2B(EXT4_SB(sb), partial_cluster),
+ EXT4_SB(sb)->s_cluster_ratio, flags);
+ partial_cluster = 0;
+ }
+
/* TODO: flexible tree reduction should be here */
if (path->p_hdr->eh_entries == 0) {
/*
@@ -2430,8 +3012,10 @@ again:
out:
ext4_ext_drop_refs(path);
kfree(path);
- if (err == -EAGAIN)
+ if (err == -EAGAIN) {
+ path = NULL;
goto again;
+ }
ext4_journal_stop(handle);
return err;
@@ -2448,17 +3032,17 @@ void ext4_ext_init(struct super_block *sb)
if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) {
#if defined(AGGRESSIVE_TEST) || defined(CHECK_BINSEARCH) || defined(EXTENTS_STATS)
- printk(KERN_INFO "EXT4-fs: file extents enabled");
+ printk(KERN_INFO "EXT4-fs: file extents enabled"
#ifdef AGGRESSIVE_TEST
- printk(", aggressive tests");
+ ", aggressive tests"
#endif
#ifdef CHECK_BINSEARCH
- printk(", check binsearch");
+ ", check binsearch"
#endif
#ifdef EXTENTS_STATS
- printk(", stats");
+ ", stats"
#endif
- printk("\n");
+ "\n");
#endif
#ifdef EXTENTS_STATS
spin_lock_init(&EXT4_SB(sb)->s_ext_stats_lock);
@@ -2488,6 +3072,23 @@ void ext4_ext_release(struct super_block *sb)
#endif
}
+static int ext4_zeroout_es(struct inode *inode, struct ext4_extent *ex)
+{
+ ext4_lblk_t ee_block;
+ ext4_fsblk_t ee_pblock;
+ unsigned int ee_len;
+
+ ee_block = le32_to_cpu(ex->ee_block);
+ ee_len = ext4_ext_get_actual_len(ex);
+ ee_pblock = ext4_ext_pblock(ex);
+
+ if (ee_len == 0)
+ return 0;
+
+ return ext4_es_insert_extent(inode, ee_block, ee_len, ee_pblock,
+ EXTENT_STATUS_WRITTEN);
+}
+
/* FIXME!! we need to try to merge to left or right after zero-out */
static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
{
@@ -2505,529 +3106,673 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
return ret;
}
-#define EXT4_EXT_ZERO_LEN 7
+/*
+ * ext4_split_extent_at() splits an extent at given block.
+ *
+ * @handle: the journal handle
+ * @inode: the file inode
+ * @path: the path to the extent
+ * @split: the logical block where the extent is splitted.
+ * @split_flags: indicates if the extent could be zeroout if split fails, and
+ * the states(init or unwritten) of new extents.
+ * @flags: flags used to insert new extent to extent tree.
+ *
+ *
+ * Splits extent [a, b] into two extents [a, @split) and [@split, b], states
+ * of which are deterimined by split_flag.
+ *
+ * There are two cases:
+ * a> the extent are splitted into two extent.
+ * b> split is not needed, and just mark the extent.
+ *
+ * return 0 on success.
+ */
+static int ext4_split_extent_at(handle_t *handle,
+ struct inode *inode,
+ struct ext4_ext_path *path,
+ ext4_lblk_t split,
+ int split_flag,
+ int flags)
+{
+ ext4_fsblk_t newblock;
+ ext4_lblk_t ee_block;
+ struct ext4_extent *ex, newex, orig_ex, zero_ex;
+ struct ext4_extent *ex2 = NULL;
+ unsigned int ee_len, depth;
+ int err = 0;
+
+ BUG_ON((split_flag & (EXT4_EXT_DATA_VALID1 | EXT4_EXT_DATA_VALID2)) ==
+ (EXT4_EXT_DATA_VALID1 | EXT4_EXT_DATA_VALID2));
+
+ ext_debug("ext4_split_extents_at: inode %lu, logical"
+ "block %llu\n", inode->i_ino, (unsigned long long)split);
+
+ ext4_ext_show_leaf(inode, path);
+
+ depth = ext_depth(inode);
+ ex = path[depth].p_ext;
+ ee_block = le32_to_cpu(ex->ee_block);
+ ee_len = ext4_ext_get_actual_len(ex);
+ newblock = split - ee_block + ext4_ext_pblock(ex);
+
+ BUG_ON(split < ee_block || split >= (ee_block + ee_len));
+ BUG_ON(!ext4_ext_is_unwritten(ex) &&
+ split_flag & (EXT4_EXT_MAY_ZEROOUT |
+ EXT4_EXT_MARK_UNWRIT1 |
+ EXT4_EXT_MARK_UNWRIT2));
+
+ err = ext4_ext_get_access(handle, inode, path + depth);
+ if (err)
+ goto out;
+
+ if (split == ee_block) {
+ /*
+ * case b: block @split is the block that the extent begins with
+ * then we just change the state of the extent, and splitting
+ * is not needed.
+ */
+ if (split_flag & EXT4_EXT_MARK_UNWRIT2)
+ ext4_ext_mark_unwritten(ex);
+ else
+ ext4_ext_mark_initialized(ex);
+
+ if (!(flags & EXT4_GET_BLOCKS_PRE_IO))
+ ext4_ext_try_to_merge(handle, inode, path, ex);
+
+ err = ext4_ext_dirty(handle, inode, path + path->p_depth);
+ goto out;
+ }
+
+ /* case a */
+ memcpy(&orig_ex, ex, sizeof(orig_ex));
+ ex->ee_len = cpu_to_le16(split - ee_block);
+ if (split_flag & EXT4_EXT_MARK_UNWRIT1)
+ ext4_ext_mark_unwritten(ex);
+
+ /*
+ * path may lead to new leaf, not to original leaf any more
+ * after ext4_ext_insert_extent() returns,
+ */
+ err = ext4_ext_dirty(handle, inode, path + depth);
+ if (err)
+ goto fix_extent_len;
+
+ ex2 = &newex;
+ ex2->ee_block = cpu_to_le32(split);
+ ex2->ee_len = cpu_to_le16(ee_len - (split - ee_block));
+ ext4_ext_store_pblock(ex2, newblock);
+ if (split_flag & EXT4_EXT_MARK_UNWRIT2)
+ ext4_ext_mark_unwritten(ex2);
+
+ err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
+ if (err == -ENOSPC && (EXT4_EXT_MAY_ZEROOUT & split_flag)) {
+ if (split_flag & (EXT4_EXT_DATA_VALID1|EXT4_EXT_DATA_VALID2)) {
+ if (split_flag & EXT4_EXT_DATA_VALID1) {
+ err = ext4_ext_zeroout(inode, ex2);
+ zero_ex.ee_block = ex2->ee_block;
+ zero_ex.ee_len = cpu_to_le16(
+ ext4_ext_get_actual_len(ex2));
+ ext4_ext_store_pblock(&zero_ex,
+ ext4_ext_pblock(ex2));
+ } else {
+ err = ext4_ext_zeroout(inode, ex);
+ zero_ex.ee_block = ex->ee_block;
+ zero_ex.ee_len = cpu_to_le16(
+ ext4_ext_get_actual_len(ex));
+ ext4_ext_store_pblock(&zero_ex,
+ ext4_ext_pblock(ex));
+ }
+ } else {
+ err = ext4_ext_zeroout(inode, &orig_ex);
+ zero_ex.ee_block = orig_ex.ee_block;
+ zero_ex.ee_len = cpu_to_le16(
+ ext4_ext_get_actual_len(&orig_ex));
+ ext4_ext_store_pblock(&zero_ex,
+ ext4_ext_pblock(&orig_ex));
+ }
+
+ if (err)
+ goto fix_extent_len;
+ /* update the extent length and mark as initialized */
+ ex->ee_len = cpu_to_le16(ee_len);
+ ext4_ext_try_to_merge(handle, inode, path, ex);
+ err = ext4_ext_dirty(handle, inode, path + path->p_depth);
+ if (err)
+ goto fix_extent_len;
+
+ /* update extent status tree */
+ err = ext4_zeroout_es(inode, &zero_ex);
+
+ goto out;
+ } else if (err)
+ goto fix_extent_len;
+
+out:
+ ext4_ext_show_leaf(inode, path);
+ return err;
+
+fix_extent_len:
+ ex->ee_len = orig_ex.ee_len;
+ ext4_ext_dirty(handle, inode, path + depth);
+ return err;
+}
+
+/*
+ * ext4_split_extents() splits an extent and mark extent which is covered
+ * by @map as split_flags indicates
+ *
+ * It may result in splitting the extent into multiple extents (up to three)
+ * There are three possibilities:
+ * a> There is no split required
+ * b> Splits in two extents: Split is happening at either end of the extent
+ * c> Splits in three extents: Somone is splitting in middle of the extent
+ *
+ */
+static int ext4_split_extent(handle_t *handle,
+ struct inode *inode,
+ struct ext4_ext_path *path,
+ struct ext4_map_blocks *map,
+ int split_flag,
+ int flags)
+{
+ ext4_lblk_t ee_block;
+ struct ext4_extent *ex;
+ unsigned int ee_len, depth;
+ int err = 0;
+ int unwritten;
+ int split_flag1, flags1;
+ int allocated = map->m_len;
+
+ depth = ext_depth(inode);
+ ex = path[depth].p_ext;
+ ee_block = le32_to_cpu(ex->ee_block);
+ ee_len = ext4_ext_get_actual_len(ex);
+ unwritten = ext4_ext_is_unwritten(ex);
+
+ if (map->m_lblk + map->m_len < ee_block + ee_len) {
+ split_flag1 = split_flag & EXT4_EXT_MAY_ZEROOUT;
+ flags1 = flags | EXT4_GET_BLOCKS_PRE_IO;
+ if (unwritten)
+ split_flag1 |= EXT4_EXT_MARK_UNWRIT1 |
+ EXT4_EXT_MARK_UNWRIT2;
+ if (split_flag & EXT4_EXT_DATA_VALID2)
+ split_flag1 |= EXT4_EXT_DATA_VALID1;
+ err = ext4_split_extent_at(handle, inode, path,
+ map->m_lblk + map->m_len, split_flag1, flags1);
+ if (err)
+ goto out;
+ } else {
+ allocated = ee_len - (map->m_lblk - ee_block);
+ }
+ /*
+ * Update path is required because previous ext4_split_extent_at() may
+ * result in split of original leaf or extent zeroout.
+ */
+ ext4_ext_drop_refs(path);
+ path = ext4_ext_find_extent(inode, map->m_lblk, path, 0);
+ if (IS_ERR(path))
+ return PTR_ERR(path);
+ depth = ext_depth(inode);
+ ex = path[depth].p_ext;
+ if (!ex) {
+ EXT4_ERROR_INODE(inode, "unexpected hole at %lu",
+ (unsigned long) map->m_lblk);
+ return -EIO;
+ }
+ unwritten = ext4_ext_is_unwritten(ex);
+ split_flag1 = 0;
+
+ if (map->m_lblk >= ee_block) {
+ split_flag1 = split_flag & EXT4_EXT_DATA_VALID2;
+ if (unwritten) {
+ split_flag1 |= EXT4_EXT_MARK_UNWRIT1;
+ split_flag1 |= split_flag & (EXT4_EXT_MAY_ZEROOUT |
+ EXT4_EXT_MARK_UNWRIT2);
+ }
+ err = ext4_split_extent_at(handle, inode, path,
+ map->m_lblk, split_flag1, flags);
+ if (err)
+ goto out;
+ }
+
+ ext4_ext_show_leaf(inode, path);
+out:
+ return err ? err : allocated;
+}
+
/*
* This function is called by ext4_ext_map_blocks() if someone tries to write
- * to an uninitialized extent. It may result in splitting the uninitialized
- * extent into multiple extents (upto three - one initialized and two
- * uninitialized).
+ * to an unwritten extent. It may result in splitting the unwritten
+ * extent into multiple extents (up to three - one initialized and two
+ * unwritten).
* There are three possibilities:
* a> There is no split required: Entire extent should be initialized
* b> Splits in two extents: Write is happening at either end of the extent
* c> Splits in three extents: Somone is writing in middle of the extent
+ *
+ * Pre-conditions:
+ * - The extent pointed to by 'path' is unwritten.
+ * - The extent pointed to by 'path' contains a superset
+ * of the logical span [map->m_lblk, map->m_lblk + map->m_len).
+ *
+ * Post-conditions on success:
+ * - the returned value is the number of blocks beyond map->l_lblk
+ * that are allocated and initialized.
+ * It is guaranteed to be >= map->m_len.
*/
static int ext4_ext_convert_to_initialized(handle_t *handle,
struct inode *inode,
struct ext4_map_blocks *map,
- struct ext4_ext_path *path)
+ struct ext4_ext_path *path,
+ int flags)
{
- struct ext4_extent *ex, newex, orig_ex;
- struct ext4_extent *ex1 = NULL;
- struct ext4_extent *ex2 = NULL;
- struct ext4_extent *ex3 = NULL;
+ struct ext4_sb_info *sbi;
struct ext4_extent_header *eh;
+ struct ext4_map_blocks split_map;
+ struct ext4_extent zero_ex;
+ struct ext4_extent *ex, *abut_ex;
ext4_lblk_t ee_block, eof_block;
- unsigned int allocated, ee_len, depth;
- ext4_fsblk_t newblock;
+ unsigned int ee_len, depth, map_len = map->m_len;
+ int allocated = 0, max_zeroout = 0;
int err = 0;
- int ret = 0;
- int may_zeroout;
+ int split_flag = 0;
ext_debug("ext4_ext_convert_to_initialized: inode %lu, logical"
"block %llu, max_blocks %u\n", inode->i_ino,
- (unsigned long long)map->m_lblk, map->m_len);
+ (unsigned long long)map->m_lblk, map_len);
+ sbi = EXT4_SB(inode->i_sb);
eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >>
inode->i_sb->s_blocksize_bits;
- if (eof_block < map->m_lblk + map->m_len)
- eof_block = map->m_lblk + map->m_len;
+ if (eof_block < map->m_lblk + map_len)
+ eof_block = map->m_lblk + map_len;
depth = ext_depth(inode);
eh = path[depth].p_hdr;
ex = path[depth].p_ext;
ee_block = le32_to_cpu(ex->ee_block);
ee_len = ext4_ext_get_actual_len(ex);
- allocated = ee_len - (map->m_lblk - ee_block);
- newblock = map->m_lblk - ee_block + ext4_ext_pblock(ex);
+ zero_ex.ee_len = 0;
- ex2 = ex;
- orig_ex.ee_block = ex->ee_block;
- orig_ex.ee_len = cpu_to_le16(ee_len);
- ext4_ext_store_pblock(&orig_ex, ext4_ext_pblock(ex));
+ trace_ext4_ext_convert_to_initialized_enter(inode, map, ex);
- /*
- * It is safe to convert extent to initialized via explicit
- * zeroout only if extent is fully insde i_size or new_size.
- */
- may_zeroout = ee_block + ee_len <= eof_block;
+ /* Pre-conditions */
+ BUG_ON(!ext4_ext_is_unwritten(ex));
+ BUG_ON(!in_range(map->m_lblk, ee_block, ee_len));
- err = ext4_ext_get_access(handle, inode, path + depth);
- if (err)
- goto out;
- /* If extent has less than 2*EXT4_EXT_ZERO_LEN zerout directly */
- if (ee_len <= 2*EXT4_EXT_ZERO_LEN && may_zeroout) {
- err = ext4_ext_zeroout(inode, &orig_ex);
- if (err)
- goto fix_extent_len;
- /* update the extent length and mark as initialized */
- ex->ee_block = orig_ex.ee_block;
- ex->ee_len = orig_ex.ee_len;
- ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
- ext4_ext_dirty(handle, inode, path + depth);
- /* zeroed the full extent */
- return allocated;
- }
-
- /* ex1: ee_block to map->m_lblk - 1 : uninitialized */
- if (map->m_lblk > ee_block) {
- ex1 = ex;
- ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block);
- ext4_ext_mark_uninitialized(ex1);
- ex2 = &newex;
- }
/*
- * for sanity, update the length of the ex2 extent before
- * we insert ex3, if ex1 is NULL. This is to avoid temporary
- * overlap of blocks.
+ * Attempt to transfer newly initialized blocks from the currently
+ * unwritten extent to its neighbor. This is much cheaper
+ * than an insertion followed by a merge as those involve costly
+ * memmove() calls. Transferring to the left is the common case in
+ * steady state for workloads doing fallocate(FALLOC_FL_KEEP_SIZE)
+ * followed by append writes.
+ *
+ * Limitations of the current logic:
+ * - L1: we do not deal with writes covering the whole extent.
+ * This would require removing the extent if the transfer
+ * is possible.
+ * - L2: we only attempt to merge with an extent stored in the
+ * same extent tree node.
*/
- if (!ex1 && allocated > map->m_len)
- ex2->ee_len = cpu_to_le16(map->m_len);
- /* ex3: to ee_block + ee_len : uninitialised */
- if (allocated > map->m_len) {
- unsigned int newdepth;
- /* If extent has less than EXT4_EXT_ZERO_LEN zerout directly */
- if (allocated <= EXT4_EXT_ZERO_LEN && may_zeroout) {
- /*
- * map->m_lblk == ee_block is handled by the zerouout
- * at the beginning.
- * Mark first half uninitialized.
- * Mark second half initialized and zero out the
- * initialized extent
- */
- ex->ee_block = orig_ex.ee_block;
- ex->ee_len = cpu_to_le16(ee_len - allocated);
- ext4_ext_mark_uninitialized(ex);
- ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
- ext4_ext_dirty(handle, inode, path + depth);
-
- ex3 = &newex;
- ex3->ee_block = cpu_to_le32(map->m_lblk);
- ext4_ext_store_pblock(ex3, newblock);
- ex3->ee_len = cpu_to_le16(allocated);
- err = ext4_ext_insert_extent(handle, inode, path,
- ex3, 0);
- if (err == -ENOSPC) {
- err = ext4_ext_zeroout(inode, &orig_ex);
- if (err)
- goto fix_extent_len;
- ex->ee_block = orig_ex.ee_block;
- ex->ee_len = orig_ex.ee_len;
- ext4_ext_store_pblock(ex,
- ext4_ext_pblock(&orig_ex));
- ext4_ext_dirty(handle, inode, path + depth);
- /* blocks available from map->m_lblk */
- return allocated;
-
- } else if (err)
- goto fix_extent_len;
+ if ((map->m_lblk == ee_block) &&
+ /* See if we can merge left */
+ (map_len < ee_len) && /*L1*/
+ (ex > EXT_FIRST_EXTENT(eh))) { /*L2*/
+ ext4_lblk_t prev_lblk;
+ ext4_fsblk_t prev_pblk, ee_pblk;
+ unsigned int prev_len;
+
+ abut_ex = ex - 1;
+ prev_lblk = le32_to_cpu(abut_ex->ee_block);
+ prev_len = ext4_ext_get_actual_len(abut_ex);
+ prev_pblk = ext4_ext_pblock(abut_ex);
+ ee_pblk = ext4_ext_pblock(ex);
- /*
- * We need to zero out the second half because
- * an fallocate request can update file size and
- * converting the second half to initialized extent
- * implies that we can leak some junk data to user
- * space.
- */
- err = ext4_ext_zeroout(inode, ex3);
- if (err) {
- /*
- * We should actually mark the
- * second half as uninit and return error
- * Insert would have changed the extent
- */
- depth = ext_depth(inode);
- ext4_ext_drop_refs(path);
- path = ext4_ext_find_extent(inode, map->m_lblk,
- path);
- if (IS_ERR(path)) {
- err = PTR_ERR(path);
- return err;
- }
- /* get the second half extent details */
- ex = path[depth].p_ext;
- err = ext4_ext_get_access(handle, inode,
- path + depth);
- if (err)
- return err;
- ext4_ext_mark_uninitialized(ex);
- ext4_ext_dirty(handle, inode, path + depth);
- return err;
- }
-
- /* zeroed the second half */
- return allocated;
- }
- ex3 = &newex;
- ex3->ee_block = cpu_to_le32(map->m_lblk + map->m_len);
- ext4_ext_store_pblock(ex3, newblock + map->m_len);
- ex3->ee_len = cpu_to_le16(allocated - map->m_len);
- ext4_ext_mark_uninitialized(ex3);
- err = ext4_ext_insert_extent(handle, inode, path, ex3, 0);
- if (err == -ENOSPC && may_zeroout) {
- err = ext4_ext_zeroout(inode, &orig_ex);
- if (err)
- goto fix_extent_len;
- /* update the extent length and mark as initialized */
- ex->ee_block = orig_ex.ee_block;
- ex->ee_len = orig_ex.ee_len;
- ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
- ext4_ext_dirty(handle, inode, path + depth);
- /* zeroed the full extent */
- /* blocks available from map->m_lblk */
- return allocated;
-
- } else if (err)
- goto fix_extent_len;
/*
- * The depth, and hence eh & ex might change
- * as part of the insert above.
+ * A transfer of blocks from 'ex' to 'abut_ex' is allowed
+ * upon those conditions:
+ * - C1: abut_ex is initialized,
+ * - C2: abut_ex is logically abutting ex,
+ * - C3: abut_ex is physically abutting ex,
+ * - C4: abut_ex can receive the additional blocks without
+ * overflowing the (initialized) length limit.
*/
- newdepth = ext_depth(inode);
- /*
- * update the extent length after successful insert of the
- * split extent
- */
- ee_len -= ext4_ext_get_actual_len(ex3);
- orig_ex.ee_len = cpu_to_le16(ee_len);
- may_zeroout = ee_block + ee_len <= eof_block;
+ if ((!ext4_ext_is_unwritten(abut_ex)) && /*C1*/
+ ((prev_lblk + prev_len) == ee_block) && /*C2*/
+ ((prev_pblk + prev_len) == ee_pblk) && /*C3*/
+ (prev_len < (EXT_INIT_MAX_LEN - map_len))) { /*C4*/
+ err = ext4_ext_get_access(handle, inode, path + depth);
+ if (err)
+ goto out;
- depth = newdepth;
- ext4_ext_drop_refs(path);
- path = ext4_ext_find_extent(inode, map->m_lblk, path);
- if (IS_ERR(path)) {
- err = PTR_ERR(path);
- goto out;
- }
- eh = path[depth].p_hdr;
- ex = path[depth].p_ext;
- if (ex2 != &newex)
- ex2 = ex;
+ trace_ext4_ext_convert_to_initialized_fastpath(inode,
+ map, ex, abut_ex);
- err = ext4_ext_get_access(handle, inode, path + depth);
- if (err)
- goto out;
+ /* Shift the start of ex by 'map_len' blocks */
+ ex->ee_block = cpu_to_le32(ee_block + map_len);
+ ext4_ext_store_pblock(ex, ee_pblk + map_len);
+ ex->ee_len = cpu_to_le16(ee_len - map_len);
+ ext4_ext_mark_unwritten(ex); /* Restore the flag */
- allocated = map->m_len;
+ /* Extend abut_ex by 'map_len' blocks */
+ abut_ex->ee_len = cpu_to_le16(prev_len + map_len);
- /* If extent has less than EXT4_EXT_ZERO_LEN and we are trying
- * to insert a extent in the middle zerout directly
- * otherwise give the extent a chance to merge to left
- */
- if (le16_to_cpu(orig_ex.ee_len) <= EXT4_EXT_ZERO_LEN &&
- map->m_lblk != ee_block && may_zeroout) {
- err = ext4_ext_zeroout(inode, &orig_ex);
- if (err)
- goto fix_extent_len;
- /* update the extent length and mark as initialized */
- ex->ee_block = orig_ex.ee_block;
- ex->ee_len = orig_ex.ee_len;
- ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
- ext4_ext_dirty(handle, inode, path + depth);
- /* zero out the first half */
- /* blocks available from map->m_lblk */
- return allocated;
+ /* Result: number of initialized blocks past m_lblk */
+ allocated = map_len;
}
- }
- /*
- * If there was a change of depth as part of the
- * insertion of ex3 above, we need to update the length
- * of the ex1 extent again here
- */
- if (ex1 && ex1 != ex) {
- ex1 = ex;
- ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block);
- ext4_ext_mark_uninitialized(ex1);
- ex2 = &newex;
- }
- /* ex2: map->m_lblk to map->m_lblk + maxblocks-1 : initialised */
- ex2->ee_block = cpu_to_le32(map->m_lblk);
- ext4_ext_store_pblock(ex2, newblock);
- ex2->ee_len = cpu_to_le16(allocated);
- if (ex2 != ex)
- goto insert;
- /*
- * New (initialized) extent starts from the first block
- * in the current extent. i.e., ex2 == ex
- * We have to see if it can be merged with the extent
- * on the left.
- */
- if (ex2 > EXT_FIRST_EXTENT(eh)) {
+ } else if (((map->m_lblk + map_len) == (ee_block + ee_len)) &&
+ (map_len < ee_len) && /*L1*/
+ ex < EXT_LAST_EXTENT(eh)) { /*L2*/
+ /* See if we can merge right */
+ ext4_lblk_t next_lblk;
+ ext4_fsblk_t next_pblk, ee_pblk;
+ unsigned int next_len;
+
+ abut_ex = ex + 1;
+ next_lblk = le32_to_cpu(abut_ex->ee_block);
+ next_len = ext4_ext_get_actual_len(abut_ex);
+ next_pblk = ext4_ext_pblock(abut_ex);
+ ee_pblk = ext4_ext_pblock(ex);
+
/*
- * To merge left, pass "ex2 - 1" to try_to_merge(),
- * since it merges towards right _only_.
+ * A transfer of blocks from 'ex' to 'abut_ex' is allowed
+ * upon those conditions:
+ * - C1: abut_ex is initialized,
+ * - C2: abut_ex is logically abutting ex,
+ * - C3: abut_ex is physically abutting ex,
+ * - C4: abut_ex can receive the additional blocks without
+ * overflowing the (initialized) length limit.
*/
- ret = ext4_ext_try_to_merge(inode, path, ex2 - 1);
- if (ret) {
- err = ext4_ext_correct_indexes(handle, inode, path);
+ if ((!ext4_ext_is_unwritten(abut_ex)) && /*C1*/
+ ((map->m_lblk + map_len) == next_lblk) && /*C2*/
+ ((ee_pblk + ee_len) == next_pblk) && /*C3*/
+ (next_len < (EXT_INIT_MAX_LEN - map_len))) { /*C4*/
+ err = ext4_ext_get_access(handle, inode, path + depth);
if (err)
goto out;
- depth = ext_depth(inode);
- ex2--;
+
+ trace_ext4_ext_convert_to_initialized_fastpath(inode,
+ map, ex, abut_ex);
+
+ /* Shift the start of abut_ex by 'map_len' blocks */
+ abut_ex->ee_block = cpu_to_le32(next_lblk - map_len);
+ ext4_ext_store_pblock(abut_ex, next_pblk - map_len);
+ ex->ee_len = cpu_to_le16(ee_len - map_len);
+ ext4_ext_mark_unwritten(ex); /* Restore the flag */
+
+ /* Extend abut_ex by 'map_len' blocks */
+ abut_ex->ee_len = cpu_to_le16(next_len + map_len);
+
+ /* Result: number of initialized blocks past m_lblk */
+ allocated = map_len;
}
}
+ if (allocated) {
+ /* Mark the block containing both extents as dirty */
+ ext4_ext_dirty(handle, inode, path + depth);
+
+ /* Update path to point to the right extent */
+ path[depth].p_ext = abut_ex;
+ goto out;
+ } else
+ allocated = ee_len - (map->m_lblk - ee_block);
+
+ WARN_ON(map->m_lblk < ee_block);
/*
- * Try to Merge towards right. This might be required
- * only when the whole extent is being written to.
- * i.e. ex2 == ex and ex3 == NULL.
+ * It is safe to convert extent to initialized via explicit
+ * zeroout only if extent is fully inside i_size or new_size.
*/
- if (!ex3) {
- ret = ext4_ext_try_to_merge(inode, path, ex2);
- if (ret) {
- err = ext4_ext_correct_indexes(handle, inode, path);
+ split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0;
+
+ if (EXT4_EXT_MAY_ZEROOUT & split_flag)
+ max_zeroout = sbi->s_extent_max_zeroout_kb >>
+ (inode->i_sb->s_blocksize_bits - 10);
+
+ /* If extent is less than s_max_zeroout_kb, zeroout directly */
+ if (max_zeroout && (ee_len <= max_zeroout)) {
+ err = ext4_ext_zeroout(inode, ex);
+ if (err)
+ goto out;
+ zero_ex.ee_block = ex->ee_block;
+ zero_ex.ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex));
+ ext4_ext_store_pblock(&zero_ex, ext4_ext_pblock(ex));
+
+ err = ext4_ext_get_access(handle, inode, path + depth);
+ if (err)
+ goto out;
+ ext4_ext_mark_initialized(ex);
+ ext4_ext_try_to_merge(handle, inode, path, ex);
+ err = ext4_ext_dirty(handle, inode, path + path->p_depth);
+ goto out;
+ }
+
+ /*
+ * four cases:
+ * 1. split the extent into three extents.
+ * 2. split the extent into two extents, zeroout the first half.
+ * 3. split the extent into two extents, zeroout the second half.
+ * 4. split the extent into two extents with out zeroout.
+ */
+ split_map.m_lblk = map->m_lblk;
+ split_map.m_len = map->m_len;
+
+ if (max_zeroout && (allocated > map->m_len)) {
+ if (allocated <= max_zeroout) {
+ /* case 3 */
+ zero_ex.ee_block =
+ cpu_to_le32(map->m_lblk);
+ zero_ex.ee_len = cpu_to_le16(allocated);
+ ext4_ext_store_pblock(&zero_ex,
+ ext4_ext_pblock(ex) + map->m_lblk - ee_block);
+ err = ext4_ext_zeroout(inode, &zero_ex);
if (err)
goto out;
+ split_map.m_lblk = map->m_lblk;
+ split_map.m_len = allocated;
+ } else if (map->m_lblk - ee_block + map->m_len < max_zeroout) {
+ /* case 2 */
+ if (map->m_lblk != ee_block) {
+ zero_ex.ee_block = ex->ee_block;
+ zero_ex.ee_len = cpu_to_le16(map->m_lblk -
+ ee_block);
+ ext4_ext_store_pblock(&zero_ex,
+ ext4_ext_pblock(ex));
+ err = ext4_ext_zeroout(inode, &zero_ex);
+ if (err)
+ goto out;
+ }
+
+ split_map.m_lblk = ee_block;
+ split_map.m_len = map->m_lblk - ee_block + map->m_len;
+ allocated = map->m_len;
}
}
- /* Mark modified extent as dirty */
- err = ext4_ext_dirty(handle, inode, path + depth);
- goto out;
-insert:
- err = ext4_ext_insert_extent(handle, inode, path, &newex, 0);
- if (err == -ENOSPC && may_zeroout) {
- err = ext4_ext_zeroout(inode, &orig_ex);
- if (err)
- goto fix_extent_len;
- /* update the extent length and mark as initialized */
- ex->ee_block = orig_ex.ee_block;
- ex->ee_len = orig_ex.ee_len;
- ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
- ext4_ext_dirty(handle, inode, path + depth);
- /* zero out the first half */
- return allocated;
- } else if (err)
- goto fix_extent_len;
+
+ allocated = ext4_split_extent(handle, inode, path,
+ &split_map, split_flag, flags);
+ if (allocated < 0)
+ err = allocated;
+
out:
- ext4_ext_show_leaf(inode, path);
+ /* If we have gotten a failure, don't zero out status tree */
+ if (!err)
+ err = ext4_zeroout_es(inode, &zero_ex);
return err ? err : allocated;
-
-fix_extent_len:
- ex->ee_block = orig_ex.ee_block;
- ex->ee_len = orig_ex.ee_len;
- ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
- ext4_ext_mark_uninitialized(ex);
- ext4_ext_dirty(handle, inode, path + depth);
- return err;
}
/*
* This function is called by ext4_ext_map_blocks() from
* ext4_get_blocks_dio_write() when DIO to write
- * to an uninitialized extent.
+ * to an unwritten extent.
*
- * Writing to an uninitized extent may result in splitting the uninitialized
- * extent into multiple /intialized unintialized extents (up to three)
+ * Writing to an unwritten extent may result in splitting the unwritten
+ * extent into multiple initialized/unwritten extents (up to three)
* There are three possibilities:
- * a> There is no split required: Entire extent should be uninitialized
+ * a> There is no split required: Entire extent should be unwritten
* b> Splits in two extents: Write is happening at either end of the extent
* c> Splits in three extents: Somone is writing in middle of the extent
*
+ * This works the same way in the case of initialized -> unwritten conversion.
+ *
* One of more index blocks maybe needed if the extent tree grow after
- * the unintialized extent split. To prevent ENOSPC occur at the IO
- * complete, we need to split the uninitialized extent before DIO submit
- * the IO. The uninitialized extent called at this time will be split
- * into three uninitialized extent(at most). After IO complete, the part
+ * the unwritten extent split. To prevent ENOSPC occur at the IO
+ * complete, we need to split the unwritten extent before DIO submit
+ * the IO. The unwritten extent called at this time will be split
+ * into three unwritten extent(at most). After IO complete, the part
* being filled will be convert to initialized by the end_io callback function
* via ext4_convert_unwritten_extents().
*
- * Returns the size of uninitialized extent to be written on success.
+ * Returns the size of unwritten extent to be written on success.
*/
-static int ext4_split_unwritten_extents(handle_t *handle,
+static int ext4_split_convert_extents(handle_t *handle,
struct inode *inode,
struct ext4_map_blocks *map,
struct ext4_ext_path *path,
int flags)
{
- struct ext4_extent *ex, newex, orig_ex;
- struct ext4_extent *ex1 = NULL;
- struct ext4_extent *ex2 = NULL;
- struct ext4_extent *ex3 = NULL;
- ext4_lblk_t ee_block, eof_block;
- unsigned int allocated, ee_len, depth;
- ext4_fsblk_t newblock;
- int err = 0;
- int may_zeroout;
+ ext4_lblk_t eof_block;
+ ext4_lblk_t ee_block;
+ struct ext4_extent *ex;
+ unsigned int ee_len;
+ int split_flag = 0, depth;
- ext_debug("ext4_split_unwritten_extents: inode %lu, logical"
- "block %llu, max_blocks %u\n", inode->i_ino,
- (unsigned long long)map->m_lblk, map->m_len);
+ ext_debug("%s: inode %lu, logical block %llu, max_blocks %u\n",
+ __func__, inode->i_ino,
+ (unsigned long long)map->m_lblk, map->m_len);
eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >>
inode->i_sb->s_blocksize_bits;
if (eof_block < map->m_lblk + map->m_len)
eof_block = map->m_lblk + map->m_len;
-
+ /*
+ * It is safe to convert extent to initialized via explicit
+ * zeroout only if extent is fully insde i_size or new_size.
+ */
depth = ext_depth(inode);
ex = path[depth].p_ext;
ee_block = le32_to_cpu(ex->ee_block);
ee_len = ext4_ext_get_actual_len(ex);
- allocated = ee_len - (map->m_lblk - ee_block);
- newblock = map->m_lblk - ee_block + ext4_ext_pblock(ex);
- ex2 = ex;
- orig_ex.ee_block = ex->ee_block;
- orig_ex.ee_len = cpu_to_le16(ee_len);
- ext4_ext_store_pblock(&orig_ex, ext4_ext_pblock(ex));
+ /* Convert to unwritten */
+ if (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN) {
+ split_flag |= EXT4_EXT_DATA_VALID1;
+ /* Convert to initialized */
+ } else if (flags & EXT4_GET_BLOCKS_CONVERT) {
+ split_flag |= ee_block + ee_len <= eof_block ?
+ EXT4_EXT_MAY_ZEROOUT : 0;
+ split_flag |= (EXT4_EXT_MARK_UNWRIT2 | EXT4_EXT_DATA_VALID2);
+ }
+ flags |= EXT4_GET_BLOCKS_PRE_IO;
+ return ext4_split_extent(handle, inode, path, map, split_flag, flags);
+}
- /*
- * It is safe to convert extent to initialized via explicit
- * zeroout only if extent is fully insde i_size or new_size.
- */
- may_zeroout = ee_block + ee_len <= eof_block;
+static int ext4_convert_initialized_extents(handle_t *handle,
+ struct inode *inode,
+ struct ext4_map_blocks *map,
+ struct ext4_ext_path *path)
+{
+ struct ext4_extent *ex;
+ ext4_lblk_t ee_block;
+ unsigned int ee_len;
+ int depth;
+ int err = 0;
- /*
- * If the uninitialized extent begins at the same logical
- * block where the write begins, and the write completely
- * covers the extent, then we don't need to split it.
- */
- if ((map->m_lblk == ee_block) && (allocated <= map->m_len))
- return allocated;
+ depth = ext_depth(inode);
+ ex = path[depth].p_ext;
+ ee_block = le32_to_cpu(ex->ee_block);
+ ee_len = ext4_ext_get_actual_len(ex);
- err = ext4_ext_get_access(handle, inode, path + depth);
- if (err)
- goto out;
- /* ex1: ee_block to map->m_lblk - 1 : uninitialized */
- if (map->m_lblk > ee_block) {
- ex1 = ex;
- ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block);
- ext4_ext_mark_uninitialized(ex1);
- ex2 = &newex;
- }
- /*
- * for sanity, update the length of the ex2 extent before
- * we insert ex3, if ex1 is NULL. This is to avoid temporary
- * overlap of blocks.
- */
- if (!ex1 && allocated > map->m_len)
- ex2->ee_len = cpu_to_le16(map->m_len);
- /* ex3: to ee_block + ee_len : uninitialised */
- if (allocated > map->m_len) {
- unsigned int newdepth;
- ex3 = &newex;
- ex3->ee_block = cpu_to_le32(map->m_lblk + map->m_len);
- ext4_ext_store_pblock(ex3, newblock + map->m_len);
- ex3->ee_len = cpu_to_le16(allocated - map->m_len);
- ext4_ext_mark_uninitialized(ex3);
- err = ext4_ext_insert_extent(handle, inode, path, ex3, flags);
- if (err == -ENOSPC && may_zeroout) {
- err = ext4_ext_zeroout(inode, &orig_ex);
- if (err)
- goto fix_extent_len;
- /* update the extent length and mark as initialized */
- ex->ee_block = orig_ex.ee_block;
- ex->ee_len = orig_ex.ee_len;
- ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
- ext4_ext_dirty(handle, inode, path + depth);
- /* zeroed the full extent */
- /* blocks available from map->m_lblk */
- return allocated;
-
- } else if (err)
- goto fix_extent_len;
- /*
- * The depth, and hence eh & ex might change
- * as part of the insert above.
- */
- newdepth = ext_depth(inode);
- /*
- * update the extent length after successful insert of the
- * split extent
- */
- ee_len -= ext4_ext_get_actual_len(ex3);
- orig_ex.ee_len = cpu_to_le16(ee_len);
- may_zeroout = ee_block + ee_len <= eof_block;
+ ext_debug("%s: inode %lu, logical"
+ "block %llu, max_blocks %u\n", __func__, inode->i_ino,
+ (unsigned long long)ee_block, ee_len);
- depth = newdepth;
+ if (ee_block != map->m_lblk || ee_len > map->m_len) {
+ err = ext4_split_convert_extents(handle, inode, map, path,
+ EXT4_GET_BLOCKS_CONVERT_UNWRITTEN);
+ if (err < 0)
+ goto out;
ext4_ext_drop_refs(path);
- path = ext4_ext_find_extent(inode, map->m_lblk, path);
+ path = ext4_ext_find_extent(inode, map->m_lblk, path, 0);
if (IS_ERR(path)) {
err = PTR_ERR(path);
goto out;
}
+ depth = ext_depth(inode);
ex = path[depth].p_ext;
- if (ex2 != &newex)
- ex2 = ex;
-
- err = ext4_ext_get_access(handle, inode, path + depth);
- if (err)
+ if (!ex) {
+ EXT4_ERROR_INODE(inode, "unexpected hole at %lu",
+ (unsigned long) map->m_lblk);
+ err = -EIO;
goto out;
-
- allocated = map->m_len;
- }
- /*
- * If there was a change of depth as part of the
- * insertion of ex3 above, we need to update the length
- * of the ex1 extent again here
- */
- if (ex1 && ex1 != ex) {
- ex1 = ex;
- ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block);
- ext4_ext_mark_uninitialized(ex1);
- ex2 = &newex;
+ }
}
- /*
- * ex2: map->m_lblk to map->m_lblk + map->m_len-1 : to be written
- * using direct I/O, uninitialised still.
+
+ err = ext4_ext_get_access(handle, inode, path + depth);
+ if (err)
+ goto out;
+ /* first mark the extent as unwritten */
+ ext4_ext_mark_unwritten(ex);
+
+ /* note: ext4_ext_correct_indexes() isn't needed here because
+ * borders are not changed
*/
- ex2->ee_block = cpu_to_le32(map->m_lblk);
- ext4_ext_store_pblock(ex2, newblock);
- ex2->ee_len = cpu_to_le16(allocated);
- ext4_ext_mark_uninitialized(ex2);
- if (ex2 != ex)
- goto insert;
+ ext4_ext_try_to_merge(handle, inode, path, ex);
+
/* Mark modified extent as dirty */
- err = ext4_ext_dirty(handle, inode, path + depth);
- ext_debug("out here\n");
- goto out;
-insert:
- err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
- if (err == -ENOSPC && may_zeroout) {
- err = ext4_ext_zeroout(inode, &orig_ex);
- if (err)
- goto fix_extent_len;
- /* update the extent length and mark as initialized */
- ex->ee_block = orig_ex.ee_block;
- ex->ee_len = orig_ex.ee_len;
- ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
- ext4_ext_dirty(handle, inode, path + depth);
- /* zero out the first half */
- return allocated;
- } else if (err)
- goto fix_extent_len;
+ err = ext4_ext_dirty(handle, inode, path + path->p_depth);
out:
ext4_ext_show_leaf(inode, path);
- return err ? err : allocated;
-
-fix_extent_len:
- ex->ee_block = orig_ex.ee_block;
- ex->ee_len = orig_ex.ee_len;
- ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
- ext4_ext_mark_uninitialized(ex);
- ext4_ext_dirty(handle, inode, path + depth);
return err;
}
+
+
static int ext4_convert_unwritten_extents_endio(handle_t *handle,
- struct inode *inode,
- struct ext4_ext_path *path)
+ struct inode *inode,
+ struct ext4_map_blocks *map,
+ struct ext4_ext_path *path)
{
struct ext4_extent *ex;
- struct ext4_extent_header *eh;
+ ext4_lblk_t ee_block;
+ unsigned int ee_len;
int depth;
int err = 0;
- int ret = 0;
depth = ext_depth(inode);
- eh = path[depth].p_hdr;
ex = path[depth].p_ext;
+ ee_block = le32_to_cpu(ex->ee_block);
+ ee_len = ext4_ext_get_actual_len(ex);
+
+ ext_debug("ext4_convert_unwritten_extents_endio: inode %lu, logical"
+ "block %llu, max_blocks %u\n", inode->i_ino,
+ (unsigned long long)ee_block, ee_len);
+
+ /* If extent is larger than requested it is a clear sign that we still
+ * have some extent state machine issues left. So extent_split is still
+ * required.
+ * TODO: Once all related issues will be fixed this situation should be
+ * illegal.
+ */
+ if (ee_block != map->m_lblk || ee_len > map->m_len) {
+#ifdef EXT4_DEBUG
+ ext4_warning("Inode (%ld) finished: extent logical block %llu,"
+ " len %u; IO logical block %llu, len %u\n",
+ inode->i_ino, (unsigned long long)ee_block, ee_len,
+ (unsigned long long)map->m_lblk, map->m_len);
+#endif
+ err = ext4_split_convert_extents(handle, inode, map, path,
+ EXT4_GET_BLOCKS_CONVERT);
+ if (err < 0)
+ goto out;
+ ext4_ext_drop_refs(path);
+ path = ext4_ext_find_extent(inode, map->m_lblk, path, 0);
+ if (IS_ERR(path)) {
+ err = PTR_ERR(path);
+ goto out;
+ }
+ depth = ext_depth(inode);
+ ex = path[depth].p_ext;
+ }
err = ext4_ext_get_access(handle, inode, path + depth);
if (err)
@@ -3035,36 +3780,13 @@ static int ext4_convert_unwritten_extents_endio(handle_t *handle,
/* first mark the extent as initialized */
ext4_ext_mark_initialized(ex);
- /*
- * We have to see if it can be merged with the extent
- * on the left.
- */
- if (ex > EXT_FIRST_EXTENT(eh)) {
- /*
- * To merge left, pass "ex - 1" to try_to_merge(),
- * since it merges towards right _only_.
- */
- ret = ext4_ext_try_to_merge(inode, path, ex - 1);
- if (ret) {
- err = ext4_ext_correct_indexes(handle, inode, path);
- if (err)
- goto out;
- depth = ext_depth(inode);
- ex--;
- }
- }
- /*
- * Try to Merge towards right.
+ /* note: ext4_ext_correct_indexes() isn't needed here because
+ * borders are not changed
*/
- ret = ext4_ext_try_to_merge(inode, path, ex);
- if (ret) {
- err = ext4_ext_correct_indexes(handle, inode, path);
- if (err)
- goto out;
- depth = ext_depth(inode);
- }
+ ext4_ext_try_to_merge(handle, inode, path, ex);
+
/* Mark modified extent as dirty */
- err = ext4_ext_dirty(handle, inode, path + depth);
+ err = ext4_ext_dirty(handle, inode, path + path->p_depth);
out:
ext4_ext_show_leaf(inode, path);
return err;
@@ -3082,26 +3804,27 @@ static void unmap_underlying_metadata_blocks(struct block_device *bdev,
* Handle EOFBLOCKS_FL flag, clearing it if necessary
*/
static int check_eofblocks_fl(handle_t *handle, struct inode *inode,
- struct ext4_map_blocks *map,
+ ext4_lblk_t lblk,
struct ext4_ext_path *path,
unsigned int len)
{
int i, depth;
struct ext4_extent_header *eh;
- struct ext4_extent *ex, *last_ex;
+ struct ext4_extent *last_ex;
if (!ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS))
return 0;
depth = ext_depth(inode);
eh = path[depth].p_hdr;
- ex = path[depth].p_ext;
- if (unlikely(!eh->eh_entries)) {
- EXT4_ERROR_INODE(inode, "eh->eh_entries == 0 and "
- "EOFBLOCKS_FL set");
- return -EIO;
- }
+ /*
+ * We're going to remove EOFBLOCKS_FL entirely in future so we
+ * do not care for this case anymore. Simply remove the flag
+ * if there are no extents.
+ */
+ if (unlikely(!eh->eh_entries))
+ goto out;
last_ex = EXT_LAST_EXTENT(eh);
/*
* We should clear the EOFBLOCKS_FL flag if we are writing the
@@ -3112,7 +3835,7 @@ static int check_eofblocks_fl(handle_t *handle, struct inode *inode,
* this turns out to be false, we can bail out from this
* function immediately.
*/
- if (map->m_lblk + len < le32_to_cpu(last_ex->ee_block) +
+ if (lblk + len < le32_to_cpu(last_ex->ee_block) +
ext4_ext_get_actual_len(last_ex))
return 0;
/*
@@ -3125,53 +3848,209 @@ static int check_eofblocks_fl(handle_t *handle, struct inode *inode,
for (i = depth-1; i >= 0; i--)
if (path[i].p_idx != EXT_LAST_INDEX(path[i].p_hdr))
return 0;
+out:
ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
return ext4_mark_inode_dirty(handle, inode);
}
+/**
+ * ext4_find_delalloc_range: find delayed allocated block in the given range.
+ *
+ * Return 1 if there is a delalloc block in the range, otherwise 0.
+ */
+int ext4_find_delalloc_range(struct inode *inode,
+ ext4_lblk_t lblk_start,
+ ext4_lblk_t lblk_end)
+{
+ struct extent_status es;
+
+ ext4_es_find_delayed_extent_range(inode, lblk_start, lblk_end, &es);
+ if (es.es_len == 0)
+ return 0; /* there is no delay extent in this tree */
+ else if (es.es_lblk <= lblk_start &&
+ lblk_start < es.es_lblk + es.es_len)
+ return 1;
+ else if (lblk_start <= es.es_lblk && es.es_lblk <= lblk_end)
+ return 1;
+ else
+ return 0;
+}
+
+int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ ext4_lblk_t lblk_start, lblk_end;
+ lblk_start = EXT4_LBLK_CMASK(sbi, lblk);
+ lblk_end = lblk_start + sbi->s_cluster_ratio - 1;
+
+ return ext4_find_delalloc_range(inode, lblk_start, lblk_end);
+}
+
+/**
+ * Determines how many complete clusters (out of those specified by the 'map')
+ * are under delalloc and were reserved quota for.
+ * This function is called when we are writing out the blocks that were
+ * originally written with their allocation delayed, but then the space was
+ * allocated using fallocate() before the delayed allocation could be resolved.
+ * The cases to look for are:
+ * ('=' indicated delayed allocated blocks
+ * '-' indicates non-delayed allocated blocks)
+ * (a) partial clusters towards beginning and/or end outside of allocated range
+ * are not delalloc'ed.
+ * Ex:
+ * |----c---=|====c====|====c====|===-c----|
+ * |++++++ allocated ++++++|
+ * ==> 4 complete clusters in above example
+ *
+ * (b) partial cluster (outside of allocated range) towards either end is
+ * marked for delayed allocation. In this case, we will exclude that
+ * cluster.
+ * Ex:
+ * |----====c========|========c========|
+ * |++++++ allocated ++++++|
+ * ==> 1 complete clusters in above example
+ *
+ * Ex:
+ * |================c================|
+ * |++++++ allocated ++++++|
+ * ==> 0 complete clusters in above example
+ *
+ * The ext4_da_update_reserve_space will be called only if we
+ * determine here that there were some "entire" clusters that span
+ * this 'allocated' range.
+ * In the non-bigalloc case, this function will just end up returning num_blks
+ * without ever calling ext4_find_delalloc_range.
+ */
+static unsigned int
+get_reserved_cluster_alloc(struct inode *inode, ext4_lblk_t lblk_start,
+ unsigned int num_blks)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ ext4_lblk_t alloc_cluster_start, alloc_cluster_end;
+ ext4_lblk_t lblk_from, lblk_to, c_offset;
+ unsigned int allocated_clusters = 0;
+
+ alloc_cluster_start = EXT4_B2C(sbi, lblk_start);
+ alloc_cluster_end = EXT4_B2C(sbi, lblk_start + num_blks - 1);
+
+ /* max possible clusters for this allocation */
+ allocated_clusters = alloc_cluster_end - alloc_cluster_start + 1;
+
+ trace_ext4_get_reserved_cluster_alloc(inode, lblk_start, num_blks);
+
+ /* Check towards left side */
+ c_offset = EXT4_LBLK_COFF(sbi, lblk_start);
+ if (c_offset) {
+ lblk_from = EXT4_LBLK_CMASK(sbi, lblk_start);
+ lblk_to = lblk_from + c_offset - 1;
+
+ if (ext4_find_delalloc_range(inode, lblk_from, lblk_to))
+ allocated_clusters--;
+ }
+
+ /* Now check towards right. */
+ c_offset = EXT4_LBLK_COFF(sbi, lblk_start + num_blks);
+ if (allocated_clusters && c_offset) {
+ lblk_from = lblk_start + num_blks;
+ lblk_to = lblk_from + (sbi->s_cluster_ratio - c_offset) - 1;
+
+ if (ext4_find_delalloc_range(inode, lblk_from, lblk_to))
+ allocated_clusters--;
+ }
+
+ return allocated_clusters;
+}
+
+static int
+ext4_ext_convert_initialized_extent(handle_t *handle, struct inode *inode,
+ struct ext4_map_blocks *map,
+ struct ext4_ext_path *path, int flags,
+ unsigned int allocated, ext4_fsblk_t newblock)
+{
+ int ret = 0;
+ int err = 0;
+
+ /*
+ * Make sure that the extent is no bigger than we support with
+ * unwritten extent
+ */
+ if (map->m_len > EXT_UNWRITTEN_MAX_LEN)
+ map->m_len = EXT_UNWRITTEN_MAX_LEN / 2;
+
+ ret = ext4_convert_initialized_extents(handle, inode, map,
+ path);
+ if (ret >= 0) {
+ ext4_update_inode_fsync_trans(handle, inode, 1);
+ err = check_eofblocks_fl(handle, inode, map->m_lblk,
+ path, map->m_len);
+ } else
+ err = ret;
+ map->m_flags |= EXT4_MAP_UNWRITTEN;
+ if (allocated > map->m_len)
+ allocated = map->m_len;
+ map->m_len = allocated;
+
+ return err ? err : allocated;
+}
+
static int
-ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
+ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode,
struct ext4_map_blocks *map,
struct ext4_ext_path *path, int flags,
unsigned int allocated, ext4_fsblk_t newblock)
{
int ret = 0;
int err = 0;
- ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio;
+ ext4_io_end_t *io = ext4_inode_aio(inode);
- ext_debug("ext4_ext_handle_uninitialized_extents: inode %lu, logical"
- "block %llu, max_blocks %u, flags %d, allocated %u",
+ ext_debug("ext4_ext_handle_unwritten_extents: inode %lu, logical "
+ "block %llu, max_blocks %u, flags %x, allocated %u\n",
inode->i_ino, (unsigned long long)map->m_lblk, map->m_len,
flags, allocated);
ext4_ext_show_leaf(inode, path);
+ /*
+ * When writing into unwritten space, we should not fail to
+ * allocate metadata blocks for the new extent block if needed.
+ */
+ flags |= EXT4_GET_BLOCKS_METADATA_NOFAIL;
+
+ trace_ext4_ext_handle_unwritten_extents(inode, map, flags,
+ allocated, newblock);
+
/* get_block() before submit the IO, split the extent */
- if ((flags & EXT4_GET_BLOCKS_PRE_IO)) {
- ret = ext4_split_unwritten_extents(handle, inode, map,
- path, flags);
+ if (flags & EXT4_GET_BLOCKS_PRE_IO) {
+ ret = ext4_split_convert_extents(handle, inode, map,
+ path, flags | EXT4_GET_BLOCKS_CONVERT);
+ if (ret <= 0)
+ goto out;
/*
* Flag the inode(non aio case) or end_io struct (aio case)
- * that this IO needs to convertion to written when IO is
+ * that this IO needs to conversion to written when IO is
* completed
*/
if (io)
- io->flag = EXT4_IO_END_UNWRITTEN;
+ ext4_set_io_unwritten_flag(inode, io);
else
ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
- if (ext4_should_dioread_nolock(inode))
- map->m_flags |= EXT4_MAP_UNINIT;
+ map->m_flags |= EXT4_MAP_UNWRITTEN;
goto out;
}
/* IO end_io complete, convert the filled extent to written */
- if ((flags & EXT4_GET_BLOCKS_CONVERT)) {
- ret = ext4_convert_unwritten_extents_endio(handle, inode,
+ if (flags & EXT4_GET_BLOCKS_CONVERT) {
+ ret = ext4_convert_unwritten_extents_endio(handle, inode, map,
path);
if (ret >= 0) {
ext4_update_inode_fsync_trans(handle, inode, 1);
- err = check_eofblocks_fl(handle, inode, map, path,
- map->m_len);
+ err = check_eofblocks_fl(handle, inode, map->m_lblk,
+ path, map->m_len);
} else
err = ret;
+ map->m_flags |= EXT4_MAP_MAPPED;
+ map->m_pblk = newblock;
+ if (allocated > map->m_len)
+ allocated = map->m_len;
+ map->m_len = allocated;
goto out2;
}
/* buffered IO case */
@@ -3179,8 +4058,10 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
* repeat fallocate creation request
* we already have an unwritten extent
*/
- if (flags & EXT4_GET_BLOCKS_UNINIT_EXT)
+ if (flags & EXT4_GET_BLOCKS_UNWRIT_EXT) {
+ map->m_flags |= EXT4_MAP_UNWRITTEN;
goto map_out;
+ }
/* buffered READ or buffered write_begin() lookup */
if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
@@ -3196,14 +4077,9 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
}
/* buffered write, writepage time, convert*/
- ret = ext4_ext_convert_to_initialized(handle, inode, map, path);
- if (ret >= 0) {
+ ret = ext4_ext_convert_to_initialized(handle, inode, map, path, flags);
+ if (ret >= 0)
ext4_update_inode_fsync_trans(handle, inode, 1);
- err = check_eofblocks_fl(handle, inode, map, path, map->m_len);
- if (err < 0)
- goto out2;
- }
-
out:
if (ret <= 0) {
err = ret;
@@ -3224,6 +4100,7 @@ out:
allocated - map->m_len);
allocated = map->m_len;
}
+ map->m_len = allocated;
/*
* If we have done fallocate with the offset that is already
@@ -3232,11 +4109,24 @@ out:
* But fallocate would have already updated quota and block
* count for this offset. So cancel these reservation
*/
- if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
- ext4_da_update_reserve_space(inode, allocated, 0);
+ if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) {
+ unsigned int reserved_clusters;
+ reserved_clusters = get_reserved_cluster_alloc(inode,
+ map->m_lblk, map->m_len);
+ if (reserved_clusters)
+ ext4_da_update_reserve_space(inode,
+ reserved_clusters,
+ 0);
+ }
map_out:
map->m_flags |= EXT4_MAP_MAPPED;
+ if ((flags & EXT4_GET_BLOCKS_KEEP_SIZE) == 0) {
+ err = check_eofblocks_fl(handle, inode, map->m_lblk, path,
+ map->m_len);
+ if (err < 0)
+ goto out2;
+ }
out1:
if (allocated > map->m_len)
allocated = map->m_len;
@@ -3244,14 +4134,113 @@ out1:
map->m_pblk = newblock;
map->m_len = allocated;
out2:
- if (path) {
- ext4_ext_drop_refs(path);
- kfree(path);
- }
return err ? err : allocated;
}
/*
+ * get_implied_cluster_alloc - check to see if the requested
+ * allocation (in the map structure) overlaps with a cluster already
+ * allocated in an extent.
+ * @sb The filesystem superblock structure
+ * @map The requested lblk->pblk mapping
+ * @ex The extent structure which might contain an implied
+ * cluster allocation
+ *
+ * This function is called by ext4_ext_map_blocks() after we failed to
+ * find blocks that were already in the inode's extent tree. Hence,
+ * we know that the beginning of the requested region cannot overlap
+ * the extent from the inode's extent tree. There are three cases we
+ * want to catch. The first is this case:
+ *
+ * |--- cluster # N--|
+ * |--- extent ---| |---- requested region ---|
+ * |==========|
+ *
+ * The second case that we need to test for is this one:
+ *
+ * |--------- cluster # N ----------------|
+ * |--- requested region --| |------- extent ----|
+ * |=======================|
+ *
+ * The third case is when the requested region lies between two extents
+ * within the same cluster:
+ * |------------- cluster # N-------------|
+ * |----- ex -----| |---- ex_right ----|
+ * |------ requested region ------|
+ * |================|
+ *
+ * In each of the above cases, we need to set the map->m_pblk and
+ * map->m_len so it corresponds to the return the extent labelled as
+ * "|====|" from cluster #N, since it is already in use for data in
+ * cluster EXT4_B2C(sbi, map->m_lblk). We will then return 1 to
+ * signal to ext4_ext_map_blocks() that map->m_pblk should be treated
+ * as a new "allocated" block region. Otherwise, we will return 0 and
+ * ext4_ext_map_blocks() will then allocate one or more new clusters
+ * by calling ext4_mb_new_blocks().
+ */
+static int get_implied_cluster_alloc(struct super_block *sb,
+ struct ext4_map_blocks *map,
+ struct ext4_extent *ex,
+ struct ext4_ext_path *path)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ ext4_lblk_t c_offset = EXT4_LBLK_COFF(sbi, map->m_lblk);
+ ext4_lblk_t ex_cluster_start, ex_cluster_end;
+ ext4_lblk_t rr_cluster_start;
+ ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block);
+ ext4_fsblk_t ee_start = ext4_ext_pblock(ex);
+ unsigned short ee_len = ext4_ext_get_actual_len(ex);
+
+ /* The extent passed in that we are trying to match */
+ ex_cluster_start = EXT4_B2C(sbi, ee_block);
+ ex_cluster_end = EXT4_B2C(sbi, ee_block + ee_len - 1);
+
+ /* The requested region passed into ext4_map_blocks() */
+ rr_cluster_start = EXT4_B2C(sbi, map->m_lblk);
+
+ if ((rr_cluster_start == ex_cluster_end) ||
+ (rr_cluster_start == ex_cluster_start)) {
+ if (rr_cluster_start == ex_cluster_end)
+ ee_start += ee_len - 1;
+ map->m_pblk = EXT4_PBLK_CMASK(sbi, ee_start) + c_offset;
+ map->m_len = min(map->m_len,
+ (unsigned) sbi->s_cluster_ratio - c_offset);
+ /*
+ * Check for and handle this case:
+ *
+ * |--------- cluster # N-------------|
+ * |------- extent ----|
+ * |--- requested region ---|
+ * |===========|
+ */
+
+ if (map->m_lblk < ee_block)
+ map->m_len = min(map->m_len, ee_block - map->m_lblk);
+
+ /*
+ * Check for the case where there is already another allocated
+ * block to the right of 'ex' but before the end of the cluster.
+ *
+ * |------------- cluster # N-------------|
+ * |----- ex -----| |---- ex_right ----|
+ * |------ requested region ------|
+ * |================|
+ */
+ if (map->m_lblk > ee_block) {
+ ext4_lblk_t next = ext4_ext_next_allocated_block(path);
+ map->m_len = min(map->m_len, next - map->m_lblk);
+ }
+
+ trace_ext4_get_implied_cluster_alloc_exit(sb, map, 1);
+ return 1;
+ }
+
+ trace_ext4_get_implied_cluster_alloc_exit(sb, map, 0);
+ return 0;
+}
+
+
+/*
* Block allocation/map/preallocation routine for extents based files
*
*
@@ -3273,45 +4262,23 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
struct ext4_map_blocks *map, int flags)
{
struct ext4_ext_path *path = NULL;
- struct ext4_extent_header *eh;
- struct ext4_extent newex, *ex;
- ext4_fsblk_t newblock;
- int err = 0, depth, ret, cache_type;
- unsigned int allocated = 0;
+ struct ext4_extent newex, *ex, *ex2;
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ ext4_fsblk_t newblock = 0;
+ int free_on_err = 0, err = 0, depth, ret;
+ unsigned int allocated = 0, offset = 0;
+ unsigned int allocated_clusters = 0;
struct ext4_allocation_request ar;
- ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio;
+ ext4_io_end_t *io = ext4_inode_aio(inode);
+ ext4_lblk_t cluster_offset;
+ int set_unwritten = 0;
ext_debug("blocks %u/%u requested for inode %lu\n",
map->m_lblk, map->m_len, inode->i_ino);
-
- /* check in cache */
- cache_type = ext4_ext_in_cache(inode, map->m_lblk, &newex);
- if (cache_type) {
- if (cache_type == EXT4_EXT_CACHE_GAP) {
- if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
- /*
- * block isn't allocated yet and
- * user doesn't want to allocate it
- */
- goto out2;
- }
- /* we should allocate requested block */
- } else if (cache_type == EXT4_EXT_CACHE_EXTENT) {
- /* block is already allocated */
- newblock = map->m_lblk
- - le32_to_cpu(newex.ee_block)
- + ext4_ext_pblock(&newex);
- /* number of remaining blocks in the extent */
- allocated = ext4_ext_get_actual_len(&newex) -
- (map->m_lblk - le32_to_cpu(newex.ee_block));
- goto out;
- } else {
- BUG();
- }
- }
+ trace_ext4_ext_map_blocks_enter(inode, map->m_lblk, map->m_len, flags);
/* find extent for this block */
- path = ext4_ext_find_extent(inode, map->m_lblk, NULL);
+ path = ext4_ext_find_extent(inode, map->m_lblk, NULL, 0);
if (IS_ERR(path)) {
err = PTR_ERR(path);
path = NULL;
@@ -3333,7 +4300,6 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
err = -EIO;
goto out2;
}
- eh = path[depth].p_hdr;
ex = path[depth].p_ext;
if (ex) {
@@ -3341,11 +4307,15 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
ext4_fsblk_t ee_start = ext4_ext_pblock(ex);
unsigned short ee_len;
+
/*
- * Uninitialized extents are treated as holes, except that
+ * unwritten extents are treated as holes, except that
* we split out initialized portions during a write.
*/
ee_len = ext4_ext_get_actual_len(ex);
+
+ trace_ext4_ext_show_extent(inode, ee_block, ee_start, ee_len);
+
/* if found extent covers block, simply return it */
if (in_range(map->m_lblk, ee_block, ee_len)) {
newblock = map->m_lblk - ee_block + ee_start;
@@ -3354,20 +4324,34 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
ext_debug("%u fit into %u:%d -> %llu\n", map->m_lblk,
ee_block, ee_len, newblock);
- /* Do not put uninitialized extent in the cache */
- if (!ext4_ext_is_uninitialized(ex)) {
- ext4_ext_put_in_cache(inode, ee_block,
- ee_len, ee_start,
- EXT4_EXT_CACHE_EXTENT);
+ /*
+ * If the extent is initialized check whether the
+ * caller wants to convert it to unwritten.
+ */
+ if ((!ext4_ext_is_unwritten(ex)) &&
+ (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN)) {
+ allocated = ext4_ext_convert_initialized_extent(
+ handle, inode, map, path, flags,
+ allocated, newblock);
+ goto out2;
+ } else if (!ext4_ext_is_unwritten(ex))
goto out;
- }
- ret = ext4_ext_handle_uninitialized_extents(handle,
- inode, map, path, flags, allocated,
- newblock);
- return ret;
+
+ ret = ext4_ext_handle_unwritten_extents(
+ handle, inode, map, path, flags,
+ allocated, newblock);
+ if (ret < 0)
+ err = ret;
+ else
+ allocated = ret;
+ goto out2;
}
}
+ if ((sbi->s_cluster_ratio > 1) &&
+ ext4_find_delalloc_cluster(inode, map->m_lblk))
+ map->m_flags |= EXT4_MAP_FROM_CLUSTER;
+
/*
* requested block isn't allocated yet;
* we couldn't try to create block if create flag is zero
@@ -3377,12 +4361,29 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
* put just found gap into cache to speed up
* subsequent requests
*/
- ext4_ext_put_gap_in_cache(inode, path, map->m_lblk);
+ if ((flags & EXT4_GET_BLOCKS_NO_PUT_HOLE) == 0)
+ ext4_ext_put_gap_in_cache(inode, path, map->m_lblk);
goto out2;
}
+
/*
* Okay, we need to do block allocation.
*/
+ map->m_flags &= ~EXT4_MAP_FROM_CLUSTER;
+ newex.ee_block = cpu_to_le32(map->m_lblk);
+ cluster_offset = EXT4_LBLK_COFF(sbi, map->m_lblk);
+
+ /*
+ * If we are doing bigalloc, check to see if the extent returned
+ * by ext4_ext_find_extent() implies a cluster we can use.
+ */
+ if (cluster_offset && ex &&
+ get_implied_cluster_alloc(inode->i_sb, map, ex, path)) {
+ ar.len = allocated = map->m_len;
+ newblock = map->m_pblk;
+ map->m_flags |= EXT4_MAP_FROM_CLUSTER;
+ goto got_allocated_blocks;
+ }
/* find neighbour allocated blocks */
ar.lleft = map->m_lblk;
@@ -3390,27 +4391,37 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
if (err)
goto out2;
ar.lright = map->m_lblk;
- err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright);
+ ex2 = NULL;
+ err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright, &ex2);
if (err)
goto out2;
+ /* Check if the extent after searching to the right implies a
+ * cluster we can use. */
+ if ((sbi->s_cluster_ratio > 1) && ex2 &&
+ get_implied_cluster_alloc(inode->i_sb, map, ex2, path)) {
+ ar.len = allocated = map->m_len;
+ newblock = map->m_pblk;
+ map->m_flags |= EXT4_MAP_FROM_CLUSTER;
+ goto got_allocated_blocks;
+ }
+
/*
* See if request is beyond maximum number of blocks we can have in
* a single extent. For an initialized extent this limit is
- * EXT_INIT_MAX_LEN and for an uninitialized extent this limit is
- * EXT_UNINIT_MAX_LEN.
+ * EXT_INIT_MAX_LEN and for an unwritten extent this limit is
+ * EXT_UNWRITTEN_MAX_LEN.
*/
if (map->m_len > EXT_INIT_MAX_LEN &&
- !(flags & EXT4_GET_BLOCKS_UNINIT_EXT))
+ !(flags & EXT4_GET_BLOCKS_UNWRIT_EXT))
map->m_len = EXT_INIT_MAX_LEN;
- else if (map->m_len > EXT_UNINIT_MAX_LEN &&
- (flags & EXT4_GET_BLOCKS_UNINIT_EXT))
- map->m_len = EXT_UNINIT_MAX_LEN;
+ else if (map->m_len > EXT_UNWRITTEN_MAX_LEN &&
+ (flags & EXT4_GET_BLOCKS_UNWRIT_EXT))
+ map->m_len = EXT_UNWRITTEN_MAX_LEN;
/* Check if we can really insert (m_lblk)::(m_lblk + m_len) extent */
- newex.ee_block = cpu_to_le32(map->m_lblk);
newex.ee_len = cpu_to_le16(map->m_len);
- err = ext4_ext_check_overlap(inode, &newex, path);
+ err = ext4_ext_check_overlap(sbi, inode, &newex, path);
if (err)
allocated = ext4_ext_get_actual_len(&newex);
else
@@ -3420,54 +4431,80 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
ar.inode = inode;
ar.goal = ext4_ext_find_goal(inode, path, map->m_lblk);
ar.logical = map->m_lblk;
- ar.len = allocated;
+ /*
+ * We calculate the offset from the beginning of the cluster
+ * for the logical block number, since when we allocate a
+ * physical cluster, the physical block should start at the
+ * same offset from the beginning of the cluster. This is
+ * needed so that future calls to get_implied_cluster_alloc()
+ * work correctly.
+ */
+ offset = EXT4_LBLK_COFF(sbi, map->m_lblk);
+ ar.len = EXT4_NUM_B2C(sbi, offset+allocated);
+ ar.goal -= offset;
+ ar.logical -= offset;
if (S_ISREG(inode->i_mode))
ar.flags = EXT4_MB_HINT_DATA;
else
/* disable in-core preallocation for non-regular files */
ar.flags = 0;
+ if (flags & EXT4_GET_BLOCKS_NO_NORMALIZE)
+ ar.flags |= EXT4_MB_HINT_NOPREALLOC;
newblock = ext4_mb_new_blocks(handle, &ar, &err);
if (!newblock)
goto out2;
ext_debug("allocate new block: goal %llu, found %llu/%u\n",
ar.goal, newblock, allocated);
+ free_on_err = 1;
+ allocated_clusters = ar.len;
+ ar.len = EXT4_C2B(sbi, ar.len) - offset;
+ if (ar.len > allocated)
+ ar.len = allocated;
+got_allocated_blocks:
/* try to insert new extent into found leaf and return */
- ext4_ext_store_pblock(&newex, newblock);
+ ext4_ext_store_pblock(&newex, newblock + offset);
newex.ee_len = cpu_to_le16(ar.len);
- /* Mark uninitialized */
- if (flags & EXT4_GET_BLOCKS_UNINIT_EXT){
- ext4_ext_mark_uninitialized(&newex);
+ /* Mark unwritten */
+ if (flags & EXT4_GET_BLOCKS_UNWRIT_EXT){
+ ext4_ext_mark_unwritten(&newex);
+ map->m_flags |= EXT4_MAP_UNWRITTEN;
/*
* io_end structure was created for every IO write to an
- * uninitialized extent. To avoid unecessary conversion,
+ * unwritten extent. To avoid unnecessary conversion,
* here we flag the IO that really needs the conversion.
* For non asycn direct IO case, flag the inode state
- * that we need to perform convertion when IO is done.
+ * that we need to perform conversion when IO is done.
*/
- if ((flags & EXT4_GET_BLOCKS_PRE_IO)) {
- if (io)
- io->flag = EXT4_IO_END_UNWRITTEN;
- else
- ext4_set_inode_state(inode,
- EXT4_STATE_DIO_UNWRITTEN);
- }
- if (ext4_should_dioread_nolock(inode))
- map->m_flags |= EXT4_MAP_UNINIT;
+ if (flags & EXT4_GET_BLOCKS_PRE_IO)
+ set_unwritten = 1;
}
- err = check_eofblocks_fl(handle, inode, map, path, ar.len);
- if (err)
- goto out2;
+ err = 0;
+ if ((flags & EXT4_GET_BLOCKS_KEEP_SIZE) == 0)
+ err = check_eofblocks_fl(handle, inode, map->m_lblk,
+ path, ar.len);
+ if (!err)
+ err = ext4_ext_insert_extent(handle, inode, path,
+ &newex, flags);
- err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
- if (err) {
+ if (!err && set_unwritten) {
+ if (io)
+ ext4_set_io_unwritten_flag(inode, io);
+ else
+ ext4_set_inode_state(inode,
+ EXT4_STATE_DIO_UNWRITTEN);
+ }
+
+ if (err && free_on_err) {
+ int fb_flags = flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE ?
+ EXT4_FREE_BLOCKS_NO_QUOT_UPDATE : 0;
/* free data blocks we just allocated */
/* not a good idea to call discard here directly,
* but otherwise we'd need to call it every free() */
ext4_discard_preallocations(inode);
- ext4_free_blocks(handle, inode, 0, ext4_ext_pblock(&newex),
- ext4_ext_get_actual_len(&newex), 0);
+ ext4_free_blocks(handle, inode, NULL, newblock,
+ EXT4_C2B(sbi, allocated_clusters), fb_flags);
goto out2;
}
@@ -3482,18 +4519,96 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
* Update reserved blocks/metadata blocks after successful
* block allocation which had been deferred till now.
*/
- if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
- ext4_da_update_reserve_space(inode, allocated, 1);
+ if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) {
+ unsigned int reserved_clusters;
+ /*
+ * Check how many clusters we had reserved this allocated range
+ */
+ reserved_clusters = get_reserved_cluster_alloc(inode,
+ map->m_lblk, allocated);
+ if (map->m_flags & EXT4_MAP_FROM_CLUSTER) {
+ if (reserved_clusters) {
+ /*
+ * We have clusters reserved for this range.
+ * But since we are not doing actual allocation
+ * and are simply using blocks from previously
+ * allocated cluster, we should release the
+ * reservation and not claim quota.
+ */
+ ext4_da_update_reserve_space(inode,
+ reserved_clusters, 0);
+ }
+ } else {
+ BUG_ON(allocated_clusters < reserved_clusters);
+ if (reserved_clusters < allocated_clusters) {
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ int reservation = allocated_clusters -
+ reserved_clusters;
+ /*
+ * It seems we claimed few clusters outside of
+ * the range of this allocation. We should give
+ * it back to the reservation pool. This can
+ * happen in the following case:
+ *
+ * * Suppose s_cluster_ratio is 4 (i.e., each
+ * cluster has 4 blocks. Thus, the clusters
+ * are [0-3],[4-7],[8-11]...
+ * * First comes delayed allocation write for
+ * logical blocks 10 & 11. Since there were no
+ * previous delayed allocated blocks in the
+ * range [8-11], we would reserve 1 cluster
+ * for this write.
+ * * Next comes write for logical blocks 3 to 8.
+ * In this case, we will reserve 2 clusters
+ * (for [0-3] and [4-7]; and not for [8-11] as
+ * that range has a delayed allocated blocks.
+ * Thus total reserved clusters now becomes 3.
+ * * Now, during the delayed allocation writeout
+ * time, we will first write blocks [3-8] and
+ * allocate 3 clusters for writing these
+ * blocks. Also, we would claim all these
+ * three clusters above.
+ * * Now when we come here to writeout the
+ * blocks [10-11], we would expect to claim
+ * the reservation of 1 cluster we had made
+ * (and we would claim it since there are no
+ * more delayed allocated blocks in the range
+ * [8-11]. But our reserved cluster count had
+ * already gone to 0.
+ *
+ * Thus, at the step 4 above when we determine
+ * that there are still some unwritten delayed
+ * allocated blocks outside of our current
+ * block range, we should increment the
+ * reserved clusters count so that when the
+ * remaining blocks finally gets written, we
+ * could claim them.
+ */
+ dquot_reserve_block(inode,
+ EXT4_C2B(sbi, reservation));
+ spin_lock(&ei->i_block_reservation_lock);
+ ei->i_reserved_data_blocks += reservation;
+ spin_unlock(&ei->i_block_reservation_lock);
+ }
+ /*
+ * We will claim quota for all newly allocated blocks.
+ * We're updating the reserved space *after* the
+ * correction above so we do not accidentally free
+ * all the metadata reservation because we might
+ * actually need it later on.
+ */
+ ext4_da_update_reserve_space(inode, allocated_clusters,
+ 1);
+ }
+ }
/*
* Cache the extent and update transaction to commit on fdatasync only
- * when it is _not_ an uninitialized extent.
+ * when it is _not_ an unwritten extent.
*/
- if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0) {
- ext4_ext_put_in_cache(inode, map->m_lblk, allocated, newblock,
- EXT4_EXT_CACHE_EXTENT);
+ if ((flags & EXT4_GET_BLOCKS_UNWRIT_EXT) == 0)
ext4_update_inode_fsync_trans(handle, inode, 1);
- } else
+ else
ext4_update_inode_fsync_trans(handle, inode, 0);
out:
if (allocated > map->m_len)
@@ -3507,37 +4622,20 @@ out2:
ext4_ext_drop_refs(path);
kfree(path);
}
+
+ trace_ext4_ext_map_blocks_exit(inode, flags, map,
+ err ? err : allocated);
+ ext4_es_lru_add(inode);
return err ? err : allocated;
}
-void ext4_ext_truncate(struct inode *inode)
+void ext4_ext_truncate(handle_t *handle, struct inode *inode)
{
- struct address_space *mapping = inode->i_mapping;
struct super_block *sb = inode->i_sb;
ext4_lblk_t last_block;
- handle_t *handle;
int err = 0;
/*
- * probably first extent we're gonna free will be last in block
- */
- err = ext4_writepage_trans_blocks(inode);
- handle = ext4_journal_start(inode, err);
- if (IS_ERR(handle))
- return;
-
- if (inode->i_size & (sb->s_blocksize - 1))
- ext4_block_truncate_page(handle, mapping, inode->i_size);
-
- if (ext4_orphan_add(handle, inode))
- goto out_stop;
-
- down_write(&EXT4_I(inode)->i_data_sem);
- ext4_ext_invalidate_cache(inode);
-
- ext4_discard_preallocations(inode);
-
- /*
* TODO: optimization is possible here.
* Probably we need not scan at all,
* because page truncation is enough.
@@ -3549,78 +4647,258 @@ void ext4_ext_truncate(struct inode *inode)
last_block = (inode->i_size + sb->s_blocksize - 1)
>> EXT4_BLOCK_SIZE_BITS(sb);
- err = ext4_ext_remove_space(inode, last_block);
+retry:
+ err = ext4_es_remove_extent(inode, last_block,
+ EXT_MAX_BLOCKS - last_block);
+ if (err == -ENOMEM) {
+ cond_resched();
+ congestion_wait(BLK_RW_ASYNC, HZ/50);
+ goto retry;
+ }
+ if (err) {
+ ext4_std_error(inode->i_sb, err);
+ return;
+ }
+ err = ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1);
+ ext4_std_error(inode->i_sb, err);
+}
- /* In a multi-transaction truncate, we only make the final
- * transaction synchronous.
+static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset,
+ ext4_lblk_t len, int flags, int mode)
+{
+ struct inode *inode = file_inode(file);
+ handle_t *handle;
+ int ret = 0;
+ int ret2 = 0;
+ int retries = 0;
+ struct ext4_map_blocks map;
+ unsigned int credits;
+
+ map.m_lblk = offset;
+ /*
+ * Don't normalize the request if it can fit in one extent so
+ * that it doesn't get unnecessarily split into multiple
+ * extents.
*/
- if (IS_SYNC(inode))
- ext4_handle_sync(handle);
+ if (len <= EXT_UNWRITTEN_MAX_LEN)
+ flags |= EXT4_GET_BLOCKS_NO_NORMALIZE;
-out_stop:
- up_write(&EXT4_I(inode)->i_data_sem);
/*
- * If this was a simple ftruncate() and the file will remain alive,
- * then we need to clear up the orphan record which we created above.
- * However, if this was a real unlink then we were called by
- * ext4_delete_inode(), and we allow that function to clean up the
- * orphan info for us.
+ * credits to insert 1 extent into extent tree
*/
- if (inode->i_nlink)
- ext4_orphan_del(handle, inode);
+ credits = ext4_chunk_trans_blocks(inode, len);
- inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
- ext4_mark_inode_dirty(handle, inode);
- ext4_journal_stop(handle);
+retry:
+ while (ret >= 0 && ret < len) {
+ map.m_lblk = map.m_lblk + ret;
+ map.m_len = len = len - ret;
+ handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS,
+ credits);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ break;
+ }
+ ret = ext4_map_blocks(handle, inode, &map, flags);
+ if (ret <= 0) {
+ ext4_debug("inode #%lu: block %u: len %u: "
+ "ext4_ext_map_blocks returned %d",
+ inode->i_ino, map.m_lblk,
+ map.m_len, ret);
+ ext4_mark_inode_dirty(handle, inode);
+ ret2 = ext4_journal_stop(handle);
+ break;
+ }
+ ret2 = ext4_journal_stop(handle);
+ if (ret2)
+ break;
+ }
+ if (ret == -ENOSPC &&
+ ext4_should_retry_alloc(inode->i_sb, &retries)) {
+ ret = 0;
+ goto retry;
+ }
+
+ return ret > 0 ? ret2 : ret;
}
-static void ext4_falloc_update_inode(struct inode *inode,
- int mode, loff_t new_size, int update_ctime)
+static long ext4_zero_range(struct file *file, loff_t offset,
+ loff_t len, int mode)
{
- struct timespec now;
+ struct inode *inode = file_inode(file);
+ handle_t *handle = NULL;
+ unsigned int max_blocks;
+ loff_t new_size = 0;
+ int ret = 0;
+ int flags;
+ int partial;
+ loff_t start, end;
+ ext4_lblk_t lblk;
+ struct address_space *mapping = inode->i_mapping;
+ unsigned int blkbits = inode->i_blkbits;
+
+ trace_ext4_zero_range(inode, offset, len, mode);
+
+ if (!S_ISREG(inode->i_mode))
+ return -EINVAL;
+
+ /* Call ext4_force_commit to flush all data in case of data=journal. */
+ if (ext4_should_journal_data(inode)) {
+ ret = ext4_force_commit(inode->i_sb);
+ if (ret)
+ return ret;
+ }
- if (update_ctime) {
- now = current_fs_time(inode->i_sb);
- if (!timespec_equal(&inode->i_ctime, &now))
- inode->i_ctime = now;
+ /*
+ * Write out all dirty pages to avoid race conditions
+ * Then release them.
+ */
+ if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
+ ret = filemap_write_and_wait_range(mapping, offset,
+ offset + len - 1);
+ if (ret)
+ return ret;
}
+
/*
- * Update only when preallocation was requested beyond
- * the file size.
+ * Round up offset. This is not fallocate, we neet to zero out
+ * blocks, so convert interior block aligned part of the range to
+ * unwritten and possibly manually zero out unaligned parts of the
+ * range.
*/
- if (!(mode & FALLOC_FL_KEEP_SIZE)) {
+ start = round_up(offset, 1 << blkbits);
+ end = round_down((offset + len), 1 << blkbits);
+
+ if (start < offset || end > offset + len)
+ return -EINVAL;
+ partial = (offset + len) & ((1 << blkbits) - 1);
+
+ lblk = start >> blkbits;
+ max_blocks = (end >> blkbits);
+ if (max_blocks < lblk)
+ max_blocks = 0;
+ else
+ max_blocks -= lblk;
+
+ flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT |
+ EXT4_GET_BLOCKS_CONVERT_UNWRITTEN;
+ if (mode & FALLOC_FL_KEEP_SIZE)
+ flags |= EXT4_GET_BLOCKS_KEEP_SIZE;
+
+ mutex_lock(&inode->i_mutex);
+
+ /*
+ * Indirect files do not support unwritten extnets
+ */
+ if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
+ ret = -EOPNOTSUPP;
+ goto out_mutex;
+ }
+
+ if (!(mode & FALLOC_FL_KEEP_SIZE) &&
+ offset + len > i_size_read(inode)) {
+ new_size = offset + len;
+ ret = inode_newsize_ok(inode, new_size);
+ if (ret)
+ goto out_mutex;
+ /*
+ * If we have a partial block after EOF we have to allocate
+ * the entire block.
+ */
+ if (partial)
+ max_blocks += 1;
+ }
+
+ if (max_blocks > 0) {
+
+ /* Now release the pages and zero block aligned part of pages*/
+ truncate_pagecache_range(inode, start, end - 1);
+
+ /* Wait all existing dio workers, newcomers will block on i_mutex */
+ ext4_inode_block_unlocked_dio(inode);
+ inode_dio_wait(inode);
+
+ /*
+ * Remove entire range from the extent status tree.
+ */
+ ret = ext4_es_remove_extent(inode, lblk, max_blocks);
+ if (ret)
+ goto out_dio;
+
+ ret = ext4_alloc_file_blocks(file, lblk, max_blocks, flags,
+ mode);
+ if (ret)
+ goto out_dio;
+ }
+
+ handle = ext4_journal_start(inode, EXT4_HT_MISC, 4);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ ext4_std_error(inode->i_sb, ret);
+ goto out_dio;
+ }
+
+ inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
+
+ if (new_size) {
if (new_size > i_size_read(inode))
i_size_write(inode, new_size);
if (new_size > EXT4_I(inode)->i_disksize)
ext4_update_i_disksize(inode, new_size);
} else {
/*
- * Mark that we allocate beyond EOF so the subsequent truncate
- * can proceed even if the new size is the same as i_size.
- */
- if (new_size > i_size_read(inode))
+ * Mark that we allocate beyond EOF so the subsequent truncate
+ * can proceed even if the new size is the same as i_size.
+ */
+ if ((offset + len) > i_size_read(inode))
ext4_set_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
}
+ ext4_mark_inode_dirty(handle, inode);
+
+ /* Zero out partial block at the edges of the range */
+ ret = ext4_zero_partial_blocks(handle, inode, offset, len);
+
+ if (file->f_flags & O_SYNC)
+ ext4_handle_sync(handle);
+
+ ext4_journal_stop(handle);
+out_dio:
+ ext4_inode_resume_unlocked_dio(inode);
+out_mutex:
+ mutex_unlock(&inode->i_mutex);
+ return ret;
}
/*
- * preallocate space for a file. This implements ext4's fallocate inode
+ * preallocate space for a file. This implements ext4's fallocate file
* operation, which gets called from sys_fallocate system call.
* For block-mapped files, posix_fallocate should fall back to the method
* of writing zeroes to the required new blocks (the same behavior which is
* expected for file systems which do not support fallocate() system call).
*/
-long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len)
+long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
{
+ struct inode *inode = file_inode(file);
handle_t *handle;
- loff_t new_size;
+ loff_t new_size = 0;
unsigned int max_blocks;
int ret = 0;
- int ret2 = 0;
- int retries = 0;
- struct ext4_map_blocks map;
- unsigned int credits, blkbits = inode->i_blkbits;
+ int flags;
+ ext4_lblk_t lblk;
+ struct timespec tv;
+ unsigned int blkbits = inode->i_blkbits;
+
+ /* Return error if mode is not supported */
+ if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
+ FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE))
+ return -EOPNOTSUPP;
+
+ if (mode & FALLOC_FL_PUNCH_HOLE)
+ return ext4_punch_hole(inode, offset, len);
+
+ ret = ext4_convert_inline_data(inode);
+ if (ret)
+ return ret;
/*
* currently supporting (pre)allocate mode for extent-based
@@ -3629,70 +4907,69 @@ long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len)
if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
return -EOPNOTSUPP;
- /* preallocation to directories is currently not supported */
- if (S_ISDIR(inode->i_mode))
- return -ENODEV;
+ if (mode & FALLOC_FL_COLLAPSE_RANGE)
+ return ext4_collapse_range(inode, offset, len);
- map.m_lblk = offset >> blkbits;
+ if (mode & FALLOC_FL_ZERO_RANGE)
+ return ext4_zero_range(file, offset, len, mode);
+
+ trace_ext4_fallocate_enter(inode, offset, len, mode);
+ lblk = offset >> blkbits;
/*
* We can't just convert len to max_blocks because
* If blocksize = 4096 offset = 3072 and len = 2048
*/
max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits)
- - map.m_lblk;
- /*
- * credits to insert 1 extent into extent tree
- */
- credits = ext4_chunk_trans_blocks(inode, max_blocks);
+ - lblk;
+
+ flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT;
+ if (mode & FALLOC_FL_KEEP_SIZE)
+ flags |= EXT4_GET_BLOCKS_KEEP_SIZE;
+
mutex_lock(&inode->i_mutex);
- ret = inode_newsize_ok(inode, (len + offset));
- if (ret) {
- mutex_unlock(&inode->i_mutex);
- return ret;
- }
-retry:
- while (ret >= 0 && ret < max_blocks) {
- map.m_lblk = map.m_lblk + ret;
- map.m_len = max_blocks = max_blocks - ret;
- handle = ext4_journal_start(inode, credits);
- if (IS_ERR(handle)) {
- ret = PTR_ERR(handle);
- break;
- }
- ret = ext4_map_blocks(handle, inode, &map,
- EXT4_GET_BLOCKS_CREATE_UNINIT_EXT);
- if (ret <= 0) {
-#ifdef EXT4FS_DEBUG
- WARN_ON(ret <= 0);
- printk(KERN_ERR "%s: ext4_ext_map_blocks "
- "returned error inode#%lu, block=%u, "
- "max_blocks=%u", __func__,
- inode->i_ino, map.m_lblk, max_blocks);
-#endif
- ext4_mark_inode_dirty(handle, inode);
- ret2 = ext4_journal_stop(handle);
- break;
- }
- if ((map.m_lblk + ret) >= (EXT4_BLOCK_ALIGN(offset + len,
- blkbits) >> blkbits))
- new_size = offset + len;
- else
- new_size = (map.m_lblk + ret) << blkbits;
- ext4_falloc_update_inode(inode, mode, new_size,
- (map.m_flags & EXT4_MAP_NEW));
- ext4_mark_inode_dirty(handle, inode);
- ret2 = ext4_journal_stop(handle);
- if (ret2)
- break;
+ if (!(mode & FALLOC_FL_KEEP_SIZE) &&
+ offset + len > i_size_read(inode)) {
+ new_size = offset + len;
+ ret = inode_newsize_ok(inode, new_size);
+ if (ret)
+ goto out;
}
- if (ret == -ENOSPC &&
- ext4_should_retry_alloc(inode->i_sb, &retries)) {
- ret = 0;
- goto retry;
+
+ ret = ext4_alloc_file_blocks(file, lblk, max_blocks, flags, mode);
+ if (ret)
+ goto out;
+
+ handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
+ if (IS_ERR(handle))
+ goto out;
+
+ tv = inode->i_ctime = ext4_current_time(inode);
+
+ if (new_size) {
+ if (new_size > i_size_read(inode)) {
+ i_size_write(inode, new_size);
+ inode->i_mtime = tv;
+ }
+ if (new_size > EXT4_I(inode)->i_disksize)
+ ext4_update_i_disksize(inode, new_size);
+ } else {
+ /*
+ * Mark that we allocate beyond EOF so the subsequent truncate
+ * can proceed even if the new size is the same as i_size.
+ */
+ if ((offset + len) > i_size_read(inode))
+ ext4_set_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
}
+ ext4_mark_inode_dirty(handle, inode);
+ if (file->f_flags & O_SYNC)
+ ext4_handle_sync(handle);
+
+ ext4_journal_stop(handle);
+out:
mutex_unlock(&inode->i_mutex);
- return ret > 0 ? ret2 : ret;
+ trace_ext4_fallocate_exit(inode, offset, max_blocks, ret);
+ return ret;
}
/*
@@ -3705,10 +4982,9 @@ retry:
* function, to convert the fallocated extents after IO is completed.
* Returns 0 on success.
*/
-int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
- ssize_t len)
+int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode,
+ loff_t offset, ssize_t len)
{
- handle_t *handle;
unsigned int max_blocks;
int ret = 0;
int ret2 = 0;
@@ -3723,109 +4999,98 @@ int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
max_blocks = ((EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits) -
map.m_lblk);
/*
- * credits to insert 1 extent into extent tree
+ * This is somewhat ugly but the idea is clear: When transaction is
+ * reserved, everything goes into it. Otherwise we rather start several
+ * smaller transactions for conversion of each extent separately.
*/
- credits = ext4_chunk_trans_blocks(inode, max_blocks);
+ if (handle) {
+ handle = ext4_journal_start_reserved(handle,
+ EXT4_HT_EXT_CONVERT);
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
+ credits = 0;
+ } else {
+ /*
+ * credits to insert 1 extent into extent tree
+ */
+ credits = ext4_chunk_trans_blocks(inode, max_blocks);
+ }
while (ret >= 0 && ret < max_blocks) {
map.m_lblk += ret;
map.m_len = (max_blocks -= ret);
- handle = ext4_journal_start(inode, credits);
- if (IS_ERR(handle)) {
- ret = PTR_ERR(handle);
- break;
+ if (credits) {
+ handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS,
+ credits);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ break;
+ }
}
ret = ext4_map_blocks(handle, inode, &map,
EXT4_GET_BLOCKS_IO_CONVERT_EXT);
- if (ret <= 0) {
- WARN_ON(ret <= 0);
- printk(KERN_ERR "%s: ext4_ext_map_blocks "
- "returned error inode#%lu, block=%u, "
- "max_blocks=%u", __func__,
- inode->i_ino, map.m_lblk, map.m_len);
- }
+ if (ret <= 0)
+ ext4_warning(inode->i_sb,
+ "inode #%lu: block %u: len %u: "
+ "ext4_ext_map_blocks returned %d",
+ inode->i_ino, map.m_lblk,
+ map.m_len, ret);
ext4_mark_inode_dirty(handle, inode);
- ret2 = ext4_journal_stop(handle);
- if (ret <= 0 || ret2 )
+ if (credits)
+ ret2 = ext4_journal_stop(handle);
+ if (ret <= 0 || ret2)
break;
}
+ if (!credits)
+ ret2 = ext4_journal_stop(handle);
return ret > 0 ? ret2 : ret;
}
+
/*
- * Callback function called for each extent to gather FIEMAP information.
+ * If newes is not existing extent (newes->ec_pblk equals zero) find
+ * delayed extent at start of newes and update newes accordingly and
+ * return start of the next delayed extent.
+ *
+ * If newes is existing extent (newes->ec_pblk is not equal zero)
+ * return start of next delayed extent or EXT_MAX_BLOCKS if no delayed
+ * extent found. Leave newes unmodified.
*/
-static int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path,
- struct ext4_ext_cache *newex, struct ext4_extent *ex,
- void *data)
+static int ext4_find_delayed_extent(struct inode *inode,
+ struct extent_status *newes)
{
- struct fiemap_extent_info *fieinfo = data;
- unsigned char blksize_bits = inode->i_sb->s_blocksize_bits;
- __u64 logical;
- __u64 physical;
- __u64 length;
- __u32 flags = 0;
- int error;
-
- logical = (__u64)newex->ec_block << blksize_bits;
-
- if (newex->ec_type == EXT4_EXT_CACHE_GAP) {
- pgoff_t offset;
- struct page *page;
- struct buffer_head *bh = NULL;
+ struct extent_status es;
+ ext4_lblk_t block, next_del;
- offset = logical >> PAGE_SHIFT;
- page = find_get_page(inode->i_mapping, offset);
- if (!page || !page_has_buffers(page))
- return EXT_CONTINUE;
+ if (newes->es_pblk == 0) {
+ ext4_es_find_delayed_extent_range(inode, newes->es_lblk,
+ newes->es_lblk + newes->es_len - 1, &es);
- bh = page_buffers(page);
-
- if (!bh)
- return EXT_CONTINUE;
+ /*
+ * No extent in extent-tree contains block @newes->es_pblk,
+ * then the block may stay in 1)a hole or 2)delayed-extent.
+ */
+ if (es.es_len == 0)
+ /* A hole found. */
+ return 0;
- if (buffer_delay(bh)) {
- flags |= FIEMAP_EXTENT_DELALLOC;
- page_cache_release(page);
- } else {
- page_cache_release(page);
- return EXT_CONTINUE;
+ if (es.es_lblk > newes->es_lblk) {
+ /* A hole found. */
+ newes->es_len = min(es.es_lblk - newes->es_lblk,
+ newes->es_len);
+ return 0;
}
- }
-
- physical = (__u64)newex->ec_start << blksize_bits;
- length = (__u64)newex->ec_len << blksize_bits;
-
- if (ex && ext4_ext_is_uninitialized(ex))
- flags |= FIEMAP_EXTENT_UNWRITTEN;
-
- /*
- * If this extent reaches EXT_MAX_BLOCK, it must be last.
- *
- * Or if ext4_ext_next_allocated_block is EXT_MAX_BLOCK,
- * this also indicates no more allocated blocks.
- *
- * XXX this might miss a single-block extent at EXT_MAX_BLOCK
- */
- if (ext4_ext_next_allocated_block(path) == EXT_MAX_BLOCK ||
- newex->ec_block + newex->ec_len - 1 == EXT_MAX_BLOCK) {
- loff_t size = i_size_read(inode);
- loff_t bs = EXT4_BLOCK_SIZE(inode->i_sb);
- flags |= FIEMAP_EXTENT_LAST;
- if ((flags & FIEMAP_EXTENT_DELALLOC) &&
- logical+length > size)
- length = (size - logical + bs - 1) & ~(bs-1);
+ newes->es_len = es.es_lblk + es.es_len - newes->es_lblk;
}
- error = fiemap_fill_next_extent(fieinfo, logical, physical,
- length, flags);
- if (error < 0)
- return error;
- if (error == 1)
- return EXT_BREAK;
+ block = newes->es_lblk + newes->es_len;
+ ext4_es_find_delayed_extent_range(inode, block, EXT_MAX_BLOCKS, &es);
+ if (es.es_len == 0)
+ next_del = EXT_MAX_BLOCKS;
+ else
+ next_del = es.es_lblk;
- return EXT_CONTINUE;
+ return next_del;
}
-
/* fiemap flags we can handle specified here */
#define EXT4_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR)
@@ -3846,7 +5111,7 @@ static int ext4_xattr_fiemap(struct inode *inode,
error = ext4_get_inode_loc(inode, &iloc);
if (error)
return error;
- physical = iloc.bh->b_blocknr << blockbits;
+ physical = (__u64)iloc.bh->b_blocknr << blockbits;
offset = EXT4_GOOD_OLD_INODE_SIZE +
EXT4_I(inode)->i_extra_isize;
physical += offset;
@@ -3854,7 +5119,7 @@ static int ext4_xattr_fiemap(struct inode *inode,
flags |= FIEMAP_EXTENT_DATA_INLINE;
brelse(iloc.bh);
} else { /* external block */
- physical = EXT4_I(inode)->i_file_acl << blockbits;
+ physical = (__u64)EXT4_I(inode)->i_file_acl << blockbits;
length = inode->i_sb->s_blocksize;
}
@@ -3870,6 +5135,21 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
ext4_lblk_t start_blk;
int error = 0;
+ if (ext4_has_inline_data(inode)) {
+ int has_inline = 1;
+
+ error = ext4_inline_data_fiemap(inode, fieinfo, &has_inline);
+
+ if (has_inline)
+ return error;
+ }
+
+ if (fieinfo->fi_flags & FIEMAP_FLAG_CACHE) {
+ error = ext4_ext_precache(inode);
+ if (error)
+ return error;
+ }
+
/* fallback to generic here if not in extents fmt */
if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
return generic_block_fiemap(inode, fieinfo, start, len,
@@ -3886,18 +5166,347 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
start_blk = start >> inode->i_sb->s_blocksize_bits;
last_blk = (start + len - 1) >> inode->i_sb->s_blocksize_bits;
- if (last_blk >= EXT_MAX_BLOCK)
- last_blk = EXT_MAX_BLOCK-1;
+ if (last_blk >= EXT_MAX_BLOCKS)
+ last_blk = EXT_MAX_BLOCKS-1;
len_blks = ((ext4_lblk_t) last_blk) - start_blk + 1;
/*
- * Walk the extent tree gathering extent information.
- * ext4_ext_fiemap_cb will push extents back to user.
+ * Walk the extent tree gathering extent information
+ * and pushing extents back to the user.
*/
- error = ext4_ext_walk_space(inode, start_blk, len_blks,
- ext4_ext_fiemap_cb, fieinfo);
+ error = ext4_fill_fiemap_extents(inode, start_blk,
+ len_blks, fieinfo);
}
-
+ ext4_es_lru_add(inode);
return error;
}
+/*
+ * ext4_access_path:
+ * Function to access the path buffer for marking it dirty.
+ * It also checks if there are sufficient credits left in the journal handle
+ * to update path.
+ */
+static int
+ext4_access_path(handle_t *handle, struct inode *inode,
+ struct ext4_ext_path *path)
+{
+ int credits, err;
+
+ if (!ext4_handle_valid(handle))
+ return 0;
+
+ /*
+ * Check if need to extend journal credits
+ * 3 for leaf, sb, and inode plus 2 (bmap and group
+ * descriptor) for each block group; assume two block
+ * groups
+ */
+ if (handle->h_buffer_credits < 7) {
+ credits = ext4_writepage_trans_blocks(inode);
+ err = ext4_ext_truncate_extend_restart(handle, inode, credits);
+ /* EAGAIN is success */
+ if (err && err != -EAGAIN)
+ return err;
+ }
+
+ err = ext4_ext_get_access(handle, inode, path);
+ return err;
+}
+
+/*
+ * ext4_ext_shift_path_extents:
+ * Shift the extents of a path structure lying between path[depth].p_ext
+ * and EXT_LAST_EXTENT(path[depth].p_hdr) downwards, by subtracting shift
+ * from starting block for each extent.
+ */
+static int
+ext4_ext_shift_path_extents(struct ext4_ext_path *path, ext4_lblk_t shift,
+ struct inode *inode, handle_t *handle,
+ ext4_lblk_t *start)
+{
+ int depth, err = 0;
+ struct ext4_extent *ex_start, *ex_last;
+ bool update = 0;
+ depth = path->p_depth;
+
+ while (depth >= 0) {
+ if (depth == path->p_depth) {
+ ex_start = path[depth].p_ext;
+ if (!ex_start)
+ return -EIO;
+
+ ex_last = EXT_LAST_EXTENT(path[depth].p_hdr);
+ if (!ex_last)
+ return -EIO;
+
+ err = ext4_access_path(handle, inode, path + depth);
+ if (err)
+ goto out;
+
+ if (ex_start == EXT_FIRST_EXTENT(path[depth].p_hdr))
+ update = 1;
+
+ *start = le32_to_cpu(ex_last->ee_block) +
+ ext4_ext_get_actual_len(ex_last);
+
+ while (ex_start <= ex_last) {
+ le32_add_cpu(&ex_start->ee_block, -shift);
+ /* Try to merge to the left. */
+ if ((ex_start >
+ EXT_FIRST_EXTENT(path[depth].p_hdr)) &&
+ ext4_ext_try_to_merge_right(inode,
+ path, ex_start - 1))
+ ex_last--;
+ else
+ ex_start++;
+ }
+ err = ext4_ext_dirty(handle, inode, path + depth);
+ if (err)
+ goto out;
+
+ if (--depth < 0 || !update)
+ break;
+ }
+
+ /* Update index too */
+ err = ext4_access_path(handle, inode, path + depth);
+ if (err)
+ goto out;
+
+ le32_add_cpu(&path[depth].p_idx->ei_block, -shift);
+ err = ext4_ext_dirty(handle, inode, path + depth);
+ if (err)
+ goto out;
+
+ /* we are done if current index is not a starting index */
+ if (path[depth].p_idx != EXT_FIRST_INDEX(path[depth].p_hdr))
+ break;
+
+ depth--;
+ }
+
+out:
+ return err;
+}
+
+/*
+ * ext4_ext_shift_extents:
+ * All the extents which lies in the range from start to the last allocated
+ * block for the file are shifted downwards by shift blocks.
+ * On success, 0 is returned, error otherwise.
+ */
+static int
+ext4_ext_shift_extents(struct inode *inode, handle_t *handle,
+ ext4_lblk_t start, ext4_lblk_t shift)
+{
+ struct ext4_ext_path *path;
+ int ret = 0, depth;
+ struct ext4_extent *extent;
+ ext4_lblk_t stop_block, current_block;
+ ext4_lblk_t ex_start, ex_end;
+
+ /* Let path point to the last extent */
+ path = ext4_ext_find_extent(inode, EXT_MAX_BLOCKS - 1, NULL, 0);
+ if (IS_ERR(path))
+ return PTR_ERR(path);
+
+ depth = path->p_depth;
+ extent = path[depth].p_ext;
+ if (!extent) {
+ ext4_ext_drop_refs(path);
+ kfree(path);
+ return ret;
+ }
+
+ stop_block = le32_to_cpu(extent->ee_block) +
+ ext4_ext_get_actual_len(extent);
+ ext4_ext_drop_refs(path);
+ kfree(path);
+
+ /* Nothing to shift, if hole is at the end of file */
+ if (start >= stop_block)
+ return ret;
+
+ /*
+ * Don't start shifting extents until we make sure the hole is big
+ * enough to accomodate the shift.
+ */
+ path = ext4_ext_find_extent(inode, start - 1, NULL, 0);
+ if (IS_ERR(path))
+ return PTR_ERR(path);
+ depth = path->p_depth;
+ extent = path[depth].p_ext;
+ if (extent) {
+ ex_start = le32_to_cpu(extent->ee_block);
+ ex_end = le32_to_cpu(extent->ee_block) +
+ ext4_ext_get_actual_len(extent);
+ } else {
+ ex_start = 0;
+ ex_end = 0;
+ }
+ ext4_ext_drop_refs(path);
+ kfree(path);
+
+ if ((start == ex_start && shift > ex_start) ||
+ (shift > start - ex_end))
+ return -EINVAL;
+
+ /* Its safe to start updating extents */
+ while (start < stop_block) {
+ path = ext4_ext_find_extent(inode, start, NULL, 0);
+ if (IS_ERR(path))
+ return PTR_ERR(path);
+ depth = path->p_depth;
+ extent = path[depth].p_ext;
+ if (!extent) {
+ EXT4_ERROR_INODE(inode, "unexpected hole at %lu",
+ (unsigned long) start);
+ return -EIO;
+ }
+
+ current_block = le32_to_cpu(extent->ee_block);
+ if (start > current_block) {
+ /* Hole, move to the next extent */
+ ret = mext_next_extent(inode, path, &extent);
+ if (ret != 0) {
+ ext4_ext_drop_refs(path);
+ kfree(path);
+ if (ret == 1)
+ ret = 0;
+ break;
+ }
+ }
+ ret = ext4_ext_shift_path_extents(path, shift, inode,
+ handle, &start);
+ ext4_ext_drop_refs(path);
+ kfree(path);
+ if (ret)
+ break;
+ }
+
+ return ret;
+}
+
+/*
+ * ext4_collapse_range:
+ * This implements the fallocate's collapse range functionality for ext4
+ * Returns: 0 and non-zero on error.
+ */
+int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
+{
+ struct super_block *sb = inode->i_sb;
+ ext4_lblk_t punch_start, punch_stop;
+ handle_t *handle;
+ unsigned int credits;
+ loff_t new_size, ioffset;
+ int ret;
+
+ /* Collapse range works only on fs block size aligned offsets. */
+ if (offset & (EXT4_BLOCK_SIZE(sb) - 1) ||
+ len & (EXT4_BLOCK_SIZE(sb) - 1))
+ return -EINVAL;
+
+ if (!S_ISREG(inode->i_mode))
+ return -EINVAL;
+
+ if (EXT4_SB(inode->i_sb)->s_cluster_ratio > 1)
+ return -EOPNOTSUPP;
+
+ trace_ext4_collapse_range(inode, offset, len);
+
+ punch_start = offset >> EXT4_BLOCK_SIZE_BITS(sb);
+ punch_stop = (offset + len) >> EXT4_BLOCK_SIZE_BITS(sb);
+
+ /* Call ext4_force_commit to flush all data in case of data=journal. */
+ if (ext4_should_journal_data(inode)) {
+ ret = ext4_force_commit(inode->i_sb);
+ if (ret)
+ return ret;
+ }
+
+ /*
+ * Need to round down offset to be aligned with page size boundary
+ * for page size > block size.
+ */
+ ioffset = round_down(offset, PAGE_SIZE);
+
+ /* Write out all dirty pages */
+ ret = filemap_write_and_wait_range(inode->i_mapping, ioffset,
+ LLONG_MAX);
+ if (ret)
+ return ret;
+
+ /* Take mutex lock */
+ mutex_lock(&inode->i_mutex);
+
+ /*
+ * There is no need to overlap collapse range with EOF, in which case
+ * it is effectively a truncate operation
+ */
+ if (offset + len >= i_size_read(inode)) {
+ ret = -EINVAL;
+ goto out_mutex;
+ }
+
+ /* Currently just for extent based files */
+ if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
+ ret = -EOPNOTSUPP;
+ goto out_mutex;
+ }
+
+ truncate_pagecache(inode, ioffset);
+
+ /* Wait for existing dio to complete */
+ ext4_inode_block_unlocked_dio(inode);
+ inode_dio_wait(inode);
+
+ credits = ext4_writepage_trans_blocks(inode);
+ handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ goto out_dio;
+ }
+
+ down_write(&EXT4_I(inode)->i_data_sem);
+ ext4_discard_preallocations(inode);
+
+ ret = ext4_es_remove_extent(inode, punch_start,
+ EXT_MAX_BLOCKS - punch_start);
+ if (ret) {
+ up_write(&EXT4_I(inode)->i_data_sem);
+ goto out_stop;
+ }
+
+ ret = ext4_ext_remove_space(inode, punch_start, punch_stop - 1);
+ if (ret) {
+ up_write(&EXT4_I(inode)->i_data_sem);
+ goto out_stop;
+ }
+ ext4_discard_preallocations(inode);
+
+ ret = ext4_ext_shift_extents(inode, handle, punch_stop,
+ punch_stop - punch_start);
+ if (ret) {
+ up_write(&EXT4_I(inode)->i_data_sem);
+ goto out_stop;
+ }
+
+ new_size = i_size_read(inode) - len;
+ i_size_write(inode, new_size);
+ EXT4_I(inode)->i_disksize = new_size;
+
+ up_write(&EXT4_I(inode)->i_data_sem);
+ if (IS_SYNC(inode))
+ ext4_handle_sync(handle);
+ inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
+ ext4_mark_inode_dirty(handle, inode);
+
+out_stop:
+ ext4_journal_stop(handle);
+out_dio:
+ ext4_inode_resume_unlocked_dio(inode);
+out_mutex:
+ mutex_unlock(&inode->i_mutex);
+ return ret;
+}
diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
new file mode 100644
index 00000000000..0b7e28e7eaa
--- /dev/null
+++ b/fs/ext4/extents_status.c
@@ -0,0 +1,1127 @@
+/*
+ * fs/ext4/extents_status.c
+ *
+ * Written by Yongqiang Yang <xiaoqiangnk@gmail.com>
+ * Modified by
+ * Allison Henderson <achender@linux.vnet.ibm.com>
+ * Hugh Dickins <hughd@google.com>
+ * Zheng Liu <wenqing.lz@taobao.com>
+ *
+ * Ext4 extents status tree core functions.
+ */
+#include <linux/rbtree.h>
+#include <linux/list_sort.h>
+#include "ext4.h"
+#include "extents_status.h"
+
+#include <trace/events/ext4.h>
+
+/*
+ * According to previous discussion in Ext4 Developer Workshop, we
+ * will introduce a new structure called io tree to track all extent
+ * status in order to solve some problems that we have met
+ * (e.g. Reservation space warning), and provide extent-level locking.
+ * Delay extent tree is the first step to achieve this goal. It is
+ * original built by Yongqiang Yang. At that time it is called delay
+ * extent tree, whose goal is only track delayed extents in memory to
+ * simplify the implementation of fiemap and bigalloc, and introduce
+ * lseek SEEK_DATA/SEEK_HOLE support. That is why it is still called
+ * delay extent tree at the first commit. But for better understand
+ * what it does, it has been rename to extent status tree.
+ *
+ * Step1:
+ * Currently the first step has been done. All delayed extents are
+ * tracked in the tree. It maintains the delayed extent when a delayed
+ * allocation is issued, and the delayed extent is written out or
+ * invalidated. Therefore the implementation of fiemap and bigalloc
+ * are simplified, and SEEK_DATA/SEEK_HOLE are introduced.
+ *
+ * The following comment describes the implemenmtation of extent
+ * status tree and future works.
+ *
+ * Step2:
+ * In this step all extent status are tracked by extent status tree.
+ * Thus, we can first try to lookup a block mapping in this tree before
+ * finding it in extent tree. Hence, single extent cache can be removed
+ * because extent status tree can do a better job. Extents in status
+ * tree are loaded on-demand. Therefore, the extent status tree may not
+ * contain all of the extents in a file. Meanwhile we define a shrinker
+ * to reclaim memory from extent status tree because fragmented extent
+ * tree will make status tree cost too much memory. written/unwritten/-
+ * hole extents in the tree will be reclaimed by this shrinker when we
+ * are under high memory pressure. Delayed extents will not be
+ * reclimed because fiemap, bigalloc, and seek_data/hole need it.
+ */
+
+/*
+ * Extent status tree implementation for ext4.
+ *
+ *
+ * ==========================================================================
+ * Extent status tree tracks all extent status.
+ *
+ * 1. Why we need to implement extent status tree?
+ *
+ * Without extent status tree, ext4 identifies a delayed extent by looking
+ * up page cache, this has several deficiencies - complicated, buggy,
+ * and inefficient code.
+ *
+ * FIEMAP, SEEK_HOLE/DATA, bigalloc, and writeout all need to know if a
+ * block or a range of blocks are belonged to a delayed extent.
+ *
+ * Let us have a look at how they do without extent status tree.
+ * -- FIEMAP
+ * FIEMAP looks up page cache to identify delayed allocations from holes.
+ *
+ * -- SEEK_HOLE/DATA
+ * SEEK_HOLE/DATA has the same problem as FIEMAP.
+ *
+ * -- bigalloc
+ * bigalloc looks up page cache to figure out if a block is
+ * already under delayed allocation or not to determine whether
+ * quota reserving is needed for the cluster.
+ *
+ * -- writeout
+ * Writeout looks up whole page cache to see if a buffer is
+ * mapped, If there are not very many delayed buffers, then it is
+ * time comsuming.
+ *
+ * With extent status tree implementation, FIEMAP, SEEK_HOLE/DATA,
+ * bigalloc and writeout can figure out if a block or a range of
+ * blocks is under delayed allocation(belonged to a delayed extent) or
+ * not by searching the extent tree.
+ *
+ *
+ * ==========================================================================
+ * 2. Ext4 extent status tree impelmentation
+ *
+ * -- extent
+ * A extent is a range of blocks which are contiguous logically and
+ * physically. Unlike extent in extent tree, this extent in ext4 is
+ * a in-memory struct, there is no corresponding on-disk data. There
+ * is no limit on length of extent, so an extent can contain as many
+ * blocks as they are contiguous logically and physically.
+ *
+ * -- extent status tree
+ * Every inode has an extent status tree and all allocation blocks
+ * are added to the tree with different status. The extent in the
+ * tree are ordered by logical block no.
+ *
+ * -- operations on a extent status tree
+ * There are three important operations on a delayed extent tree: find
+ * next extent, adding a extent(a range of blocks) and removing a extent.
+ *
+ * -- race on a extent status tree
+ * Extent status tree is protected by inode->i_es_lock.
+ *
+ * -- memory consumption
+ * Fragmented extent tree will make extent status tree cost too much
+ * memory. Hence, we will reclaim written/unwritten/hole extents from
+ * the tree under a heavy memory pressure.
+ *
+ *
+ * ==========================================================================
+ * 3. Performance analysis
+ *
+ * -- overhead
+ * 1. There is a cache extent for write access, so if writes are
+ * not very random, adding space operaions are in O(1) time.
+ *
+ * -- gain
+ * 2. Code is much simpler, more readable, more maintainable and
+ * more efficient.
+ *
+ *
+ * ==========================================================================
+ * 4. TODO list
+ *
+ * -- Refactor delayed space reservation
+ *
+ * -- Extent-level locking
+ */
+
+static struct kmem_cache *ext4_es_cachep;
+
+static int __es_insert_extent(struct inode *inode, struct extent_status *newes);
+static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
+ ext4_lblk_t end);
+static int __es_try_to_reclaim_extents(struct ext4_inode_info *ei,
+ int nr_to_scan);
+static int __ext4_es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
+ struct ext4_inode_info *locked_ei);
+
+int __init ext4_init_es(void)
+{
+ ext4_es_cachep = kmem_cache_create("ext4_extent_status",
+ sizeof(struct extent_status),
+ 0, (SLAB_RECLAIM_ACCOUNT), NULL);
+ if (ext4_es_cachep == NULL)
+ return -ENOMEM;
+ return 0;
+}
+
+void ext4_exit_es(void)
+{
+ if (ext4_es_cachep)
+ kmem_cache_destroy(ext4_es_cachep);
+}
+
+void ext4_es_init_tree(struct ext4_es_tree *tree)
+{
+ tree->root = RB_ROOT;
+ tree->cache_es = NULL;
+}
+
+#ifdef ES_DEBUG__
+static void ext4_es_print_tree(struct inode *inode)
+{
+ struct ext4_es_tree *tree;
+ struct rb_node *node;
+
+ printk(KERN_DEBUG "status extents for inode %lu:", inode->i_ino);
+ tree = &EXT4_I(inode)->i_es_tree;
+ node = rb_first(&tree->root);
+ while (node) {
+ struct extent_status *es;
+ es = rb_entry(node, struct extent_status, rb_node);
+ printk(KERN_DEBUG " [%u/%u) %llu %x",
+ es->es_lblk, es->es_len,
+ ext4_es_pblock(es), ext4_es_status(es));
+ node = rb_next(node);
+ }
+ printk(KERN_DEBUG "\n");
+}
+#else
+#define ext4_es_print_tree(inode)
+#endif
+
+static inline ext4_lblk_t ext4_es_end(struct extent_status *es)
+{
+ BUG_ON(es->es_lblk + es->es_len < es->es_lblk);
+ return es->es_lblk + es->es_len - 1;
+}
+
+/*
+ * search through the tree for an delayed extent with a given offset. If
+ * it can't be found, try to find next extent.
+ */
+static struct extent_status *__es_tree_search(struct rb_root *root,
+ ext4_lblk_t lblk)
+{
+ struct rb_node *node = root->rb_node;
+ struct extent_status *es = NULL;
+
+ while (node) {
+ es = rb_entry(node, struct extent_status, rb_node);
+ if (lblk < es->es_lblk)
+ node = node->rb_left;
+ else if (lblk > ext4_es_end(es))
+ node = node->rb_right;
+ else
+ return es;
+ }
+
+ if (es && lblk < es->es_lblk)
+ return es;
+
+ if (es && lblk > ext4_es_end(es)) {
+ node = rb_next(&es->rb_node);
+ return node ? rb_entry(node, struct extent_status, rb_node) :
+ NULL;
+ }
+
+ return NULL;
+}
+
+/*
+ * ext4_es_find_delayed_extent_range: find the 1st delayed extent covering
+ * @es->lblk if it exists, otherwise, the next extent after @es->lblk.
+ *
+ * @inode: the inode which owns delayed extents
+ * @lblk: the offset where we start to search
+ * @end: the offset where we stop to search
+ * @es: delayed extent that we found
+ */
+void ext4_es_find_delayed_extent_range(struct inode *inode,
+ ext4_lblk_t lblk, ext4_lblk_t end,
+ struct extent_status *es)
+{
+ struct ext4_es_tree *tree = NULL;
+ struct extent_status *es1 = NULL;
+ struct rb_node *node;
+
+ BUG_ON(es == NULL);
+ BUG_ON(end < lblk);
+ trace_ext4_es_find_delayed_extent_range_enter(inode, lblk);
+
+ read_lock(&EXT4_I(inode)->i_es_lock);
+ tree = &EXT4_I(inode)->i_es_tree;
+
+ /* find extent in cache firstly */
+ es->es_lblk = es->es_len = es->es_pblk = 0;
+ if (tree->cache_es) {
+ es1 = tree->cache_es;
+ if (in_range(lblk, es1->es_lblk, es1->es_len)) {
+ es_debug("%u cached by [%u/%u) %llu %x\n",
+ lblk, es1->es_lblk, es1->es_len,
+ ext4_es_pblock(es1), ext4_es_status(es1));
+ goto out;
+ }
+ }
+
+ es1 = __es_tree_search(&tree->root, lblk);
+
+out:
+ if (es1 && !ext4_es_is_delayed(es1)) {
+ while ((node = rb_next(&es1->rb_node)) != NULL) {
+ es1 = rb_entry(node, struct extent_status, rb_node);
+ if (es1->es_lblk > end) {
+ es1 = NULL;
+ break;
+ }
+ if (ext4_es_is_delayed(es1))
+ break;
+ }
+ }
+
+ if (es1 && ext4_es_is_delayed(es1)) {
+ tree->cache_es = es1;
+ es->es_lblk = es1->es_lblk;
+ es->es_len = es1->es_len;
+ es->es_pblk = es1->es_pblk;
+ }
+
+ read_unlock(&EXT4_I(inode)->i_es_lock);
+
+ trace_ext4_es_find_delayed_extent_range_exit(inode, es);
+}
+
+static struct extent_status *
+ext4_es_alloc_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len,
+ ext4_fsblk_t pblk)
+{
+ struct extent_status *es;
+ es = kmem_cache_alloc(ext4_es_cachep, GFP_ATOMIC);
+ if (es == NULL)
+ return NULL;
+ es->es_lblk = lblk;
+ es->es_len = len;
+ es->es_pblk = pblk;
+
+ /*
+ * We don't count delayed extent because we never try to reclaim them
+ */
+ if (!ext4_es_is_delayed(es)) {
+ EXT4_I(inode)->i_es_lru_nr++;
+ percpu_counter_inc(&EXT4_SB(inode->i_sb)->s_extent_cache_cnt);
+ }
+
+ return es;
+}
+
+static void ext4_es_free_extent(struct inode *inode, struct extent_status *es)
+{
+ /* Decrease the lru counter when this es is not delayed */
+ if (!ext4_es_is_delayed(es)) {
+ BUG_ON(EXT4_I(inode)->i_es_lru_nr == 0);
+ EXT4_I(inode)->i_es_lru_nr--;
+ percpu_counter_dec(&EXT4_SB(inode->i_sb)->s_extent_cache_cnt);
+ }
+
+ kmem_cache_free(ext4_es_cachep, es);
+}
+
+/*
+ * Check whether or not two extents can be merged
+ * Condition:
+ * - logical block number is contiguous
+ * - physical block number is contiguous
+ * - status is equal
+ */
+static int ext4_es_can_be_merged(struct extent_status *es1,
+ struct extent_status *es2)
+{
+ if (ext4_es_status(es1) != ext4_es_status(es2))
+ return 0;
+
+ if (((__u64) es1->es_len) + es2->es_len > EXT_MAX_BLOCKS) {
+ pr_warn("ES assertion failed when merging extents. "
+ "The sum of lengths of es1 (%d) and es2 (%d) "
+ "is bigger than allowed file size (%d)\n",
+ es1->es_len, es2->es_len, EXT_MAX_BLOCKS);
+ WARN_ON(1);
+ return 0;
+ }
+
+ if (((__u64) es1->es_lblk) + es1->es_len != es2->es_lblk)
+ return 0;
+
+ if ((ext4_es_is_written(es1) || ext4_es_is_unwritten(es1)) &&
+ (ext4_es_pblock(es1) + es1->es_len == ext4_es_pblock(es2)))
+ return 1;
+
+ if (ext4_es_is_hole(es1))
+ return 1;
+
+ /* we need to check delayed extent is without unwritten status */
+ if (ext4_es_is_delayed(es1) && !ext4_es_is_unwritten(es1))
+ return 1;
+
+ return 0;
+}
+
+static struct extent_status *
+ext4_es_try_to_merge_left(struct inode *inode, struct extent_status *es)
+{
+ struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree;
+ struct extent_status *es1;
+ struct rb_node *node;
+
+ node = rb_prev(&es->rb_node);
+ if (!node)
+ return es;
+
+ es1 = rb_entry(node, struct extent_status, rb_node);
+ if (ext4_es_can_be_merged(es1, es)) {
+ es1->es_len += es->es_len;
+ rb_erase(&es->rb_node, &tree->root);
+ ext4_es_free_extent(inode, es);
+ es = es1;
+ }
+
+ return es;
+}
+
+static struct extent_status *
+ext4_es_try_to_merge_right(struct inode *inode, struct extent_status *es)
+{
+ struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree;
+ struct extent_status *es1;
+ struct rb_node *node;
+
+ node = rb_next(&es->rb_node);
+ if (!node)
+ return es;
+
+ es1 = rb_entry(node, struct extent_status, rb_node);
+ if (ext4_es_can_be_merged(es, es1)) {
+ es->es_len += es1->es_len;
+ rb_erase(node, &tree->root);
+ ext4_es_free_extent(inode, es1);
+ }
+
+ return es;
+}
+
+#ifdef ES_AGGRESSIVE_TEST
+#include "ext4_extents.h" /* Needed when ES_AGGRESSIVE_TEST is defined */
+
+static void ext4_es_insert_extent_ext_check(struct inode *inode,
+ struct extent_status *es)
+{
+ struct ext4_ext_path *path = NULL;
+ struct ext4_extent *ex;
+ ext4_lblk_t ee_block;
+ ext4_fsblk_t ee_start;
+ unsigned short ee_len;
+ int depth, ee_status, es_status;
+
+ path = ext4_ext_find_extent(inode, es->es_lblk, NULL, EXT4_EX_NOCACHE);
+ if (IS_ERR(path))
+ return;
+
+ depth = ext_depth(inode);
+ ex = path[depth].p_ext;
+
+ if (ex) {
+
+ ee_block = le32_to_cpu(ex->ee_block);
+ ee_start = ext4_ext_pblock(ex);
+ ee_len = ext4_ext_get_actual_len(ex);
+
+ ee_status = ext4_ext_is_unwritten(ex) ? 1 : 0;
+ es_status = ext4_es_is_unwritten(es) ? 1 : 0;
+
+ /*
+ * Make sure ex and es are not overlap when we try to insert
+ * a delayed/hole extent.
+ */
+ if (!ext4_es_is_written(es) && !ext4_es_is_unwritten(es)) {
+ if (in_range(es->es_lblk, ee_block, ee_len)) {
+ pr_warn("ES insert assertion failed for "
+ "inode: %lu we can find an extent "
+ "at block [%d/%d/%llu/%c], but we "
+ "want to add a delayed/hole extent "
+ "[%d/%d/%llu/%x]\n",
+ inode->i_ino, ee_block, ee_len,
+ ee_start, ee_status ? 'u' : 'w',
+ es->es_lblk, es->es_len,
+ ext4_es_pblock(es), ext4_es_status(es));
+ }
+ goto out;
+ }
+
+ /*
+ * We don't check ee_block == es->es_lblk, etc. because es
+ * might be a part of whole extent, vice versa.
+ */
+ if (es->es_lblk < ee_block ||
+ ext4_es_pblock(es) != ee_start + es->es_lblk - ee_block) {
+ pr_warn("ES insert assertion failed for inode: %lu "
+ "ex_status [%d/%d/%llu/%c] != "
+ "es_status [%d/%d/%llu/%c]\n", inode->i_ino,
+ ee_block, ee_len, ee_start,
+ ee_status ? 'u' : 'w', es->es_lblk, es->es_len,
+ ext4_es_pblock(es), es_status ? 'u' : 'w');
+ goto out;
+ }
+
+ if (ee_status ^ es_status) {
+ pr_warn("ES insert assertion failed for inode: %lu "
+ "ex_status [%d/%d/%llu/%c] != "
+ "es_status [%d/%d/%llu/%c]\n", inode->i_ino,
+ ee_block, ee_len, ee_start,
+ ee_status ? 'u' : 'w', es->es_lblk, es->es_len,
+ ext4_es_pblock(es), es_status ? 'u' : 'w');
+ }
+ } else {
+ /*
+ * We can't find an extent on disk. So we need to make sure
+ * that we don't want to add an written/unwritten extent.
+ */
+ if (!ext4_es_is_delayed(es) && !ext4_es_is_hole(es)) {
+ pr_warn("ES insert assertion failed for inode: %lu "
+ "can't find an extent at block %d but we want "
+ "to add a written/unwritten extent "
+ "[%d/%d/%llu/%x]\n", inode->i_ino,
+ es->es_lblk, es->es_lblk, es->es_len,
+ ext4_es_pblock(es), ext4_es_status(es));
+ }
+ }
+out:
+ if (path) {
+ ext4_ext_drop_refs(path);
+ kfree(path);
+ }
+}
+
+static void ext4_es_insert_extent_ind_check(struct inode *inode,
+ struct extent_status *es)
+{
+ struct ext4_map_blocks map;
+ int retval;
+
+ /*
+ * Here we call ext4_ind_map_blocks to lookup a block mapping because
+ * 'Indirect' structure is defined in indirect.c. So we couldn't
+ * access direct/indirect tree from outside. It is too dirty to define
+ * this function in indirect.c file.
+ */
+
+ map.m_lblk = es->es_lblk;
+ map.m_len = es->es_len;
+
+ retval = ext4_ind_map_blocks(NULL, inode, &map, 0);
+ if (retval > 0) {
+ if (ext4_es_is_delayed(es) || ext4_es_is_hole(es)) {
+ /*
+ * We want to add a delayed/hole extent but this
+ * block has been allocated.
+ */
+ pr_warn("ES insert assertion failed for inode: %lu "
+ "We can find blocks but we want to add a "
+ "delayed/hole extent [%d/%d/%llu/%x]\n",
+ inode->i_ino, es->es_lblk, es->es_len,
+ ext4_es_pblock(es), ext4_es_status(es));
+ return;
+ } else if (ext4_es_is_written(es)) {
+ if (retval != es->es_len) {
+ pr_warn("ES insert assertion failed for "
+ "inode: %lu retval %d != es_len %d\n",
+ inode->i_ino, retval, es->es_len);
+ return;
+ }
+ if (map.m_pblk != ext4_es_pblock(es)) {
+ pr_warn("ES insert assertion failed for "
+ "inode: %lu m_pblk %llu != "
+ "es_pblk %llu\n",
+ inode->i_ino, map.m_pblk,
+ ext4_es_pblock(es));
+ return;
+ }
+ } else {
+ /*
+ * We don't need to check unwritten extent because
+ * indirect-based file doesn't have it.
+ */
+ BUG_ON(1);
+ }
+ } else if (retval == 0) {
+ if (ext4_es_is_written(es)) {
+ pr_warn("ES insert assertion failed for inode: %lu "
+ "We can't find the block but we want to add "
+ "a written extent [%d/%d/%llu/%x]\n",
+ inode->i_ino, es->es_lblk, es->es_len,
+ ext4_es_pblock(es), ext4_es_status(es));
+ return;
+ }
+ }
+}
+
+static inline void ext4_es_insert_extent_check(struct inode *inode,
+ struct extent_status *es)
+{
+ /*
+ * We don't need to worry about the race condition because
+ * caller takes i_data_sem locking.
+ */
+ BUG_ON(!rwsem_is_locked(&EXT4_I(inode)->i_data_sem));
+ if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
+ ext4_es_insert_extent_ext_check(inode, es);
+ else
+ ext4_es_insert_extent_ind_check(inode, es);
+}
+#else
+static inline void ext4_es_insert_extent_check(struct inode *inode,
+ struct extent_status *es)
+{
+}
+#endif
+
+static int __es_insert_extent(struct inode *inode, struct extent_status *newes)
+{
+ struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree;
+ struct rb_node **p = &tree->root.rb_node;
+ struct rb_node *parent = NULL;
+ struct extent_status *es;
+
+ while (*p) {
+ parent = *p;
+ es = rb_entry(parent, struct extent_status, rb_node);
+
+ if (newes->es_lblk < es->es_lblk) {
+ if (ext4_es_can_be_merged(newes, es)) {
+ /*
+ * Here we can modify es_lblk directly
+ * because it isn't overlapped.
+ */
+ es->es_lblk = newes->es_lblk;
+ es->es_len += newes->es_len;
+ if (ext4_es_is_written(es) ||
+ ext4_es_is_unwritten(es))
+ ext4_es_store_pblock(es,
+ newes->es_pblk);
+ es = ext4_es_try_to_merge_left(inode, es);
+ goto out;
+ }
+ p = &(*p)->rb_left;
+ } else if (newes->es_lblk > ext4_es_end(es)) {
+ if (ext4_es_can_be_merged(es, newes)) {
+ es->es_len += newes->es_len;
+ es = ext4_es_try_to_merge_right(inode, es);
+ goto out;
+ }
+ p = &(*p)->rb_right;
+ } else {
+ BUG_ON(1);
+ return -EINVAL;
+ }
+ }
+
+ es = ext4_es_alloc_extent(inode, newes->es_lblk, newes->es_len,
+ newes->es_pblk);
+ if (!es)
+ return -ENOMEM;
+ rb_link_node(&es->rb_node, parent, p);
+ rb_insert_color(&es->rb_node, &tree->root);
+
+out:
+ tree->cache_es = es;
+ return 0;
+}
+
+/*
+ * ext4_es_insert_extent() adds information to an inode's extent
+ * status tree.
+ *
+ * Return 0 on success, error code on failure.
+ */
+int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
+ ext4_lblk_t len, ext4_fsblk_t pblk,
+ unsigned int status)
+{
+ struct extent_status newes;
+ ext4_lblk_t end = lblk + len - 1;
+ int err = 0;
+
+ es_debug("add [%u/%u) %llu %x to extent status tree of inode %lu\n",
+ lblk, len, pblk, status, inode->i_ino);
+
+ if (!len)
+ return 0;
+
+ BUG_ON(end < lblk);
+
+ newes.es_lblk = lblk;
+ newes.es_len = len;
+ ext4_es_store_pblock_status(&newes, pblk, status);
+ trace_ext4_es_insert_extent(inode, &newes);
+
+ ext4_es_insert_extent_check(inode, &newes);
+
+ write_lock(&EXT4_I(inode)->i_es_lock);
+ err = __es_remove_extent(inode, lblk, end);
+ if (err != 0)
+ goto error;
+retry:
+ err = __es_insert_extent(inode, &newes);
+ if (err == -ENOMEM && __ext4_es_shrink(EXT4_SB(inode->i_sb), 1,
+ EXT4_I(inode)))
+ goto retry;
+ if (err == -ENOMEM && !ext4_es_is_delayed(&newes))
+ err = 0;
+
+error:
+ write_unlock(&EXT4_I(inode)->i_es_lock);
+
+ ext4_es_print_tree(inode);
+
+ return err;
+}
+
+/*
+ * ext4_es_cache_extent() inserts information into the extent status
+ * tree if and only if there isn't information about the range in
+ * question already.
+ */
+void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk,
+ ext4_lblk_t len, ext4_fsblk_t pblk,
+ unsigned int status)
+{
+ struct extent_status *es;
+ struct extent_status newes;
+ ext4_lblk_t end = lblk + len - 1;
+
+ newes.es_lblk = lblk;
+ newes.es_len = len;
+ ext4_es_store_pblock_status(&newes, pblk, status);
+ trace_ext4_es_cache_extent(inode, &newes);
+
+ if (!len)
+ return;
+
+ BUG_ON(end < lblk);
+
+ write_lock(&EXT4_I(inode)->i_es_lock);
+
+ es = __es_tree_search(&EXT4_I(inode)->i_es_tree.root, lblk);
+ if (!es || es->es_lblk > end)
+ __es_insert_extent(inode, &newes);
+ write_unlock(&EXT4_I(inode)->i_es_lock);
+}
+
+/*
+ * ext4_es_lookup_extent() looks up an extent in extent status tree.
+ *
+ * ext4_es_lookup_extent is called by ext4_map_blocks/ext4_da_map_blocks.
+ *
+ * Return: 1 on found, 0 on not
+ */
+int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk,
+ struct extent_status *es)
+{
+ struct ext4_es_tree *tree;
+ struct extent_status *es1 = NULL;
+ struct rb_node *node;
+ int found = 0;
+
+ trace_ext4_es_lookup_extent_enter(inode, lblk);
+ es_debug("lookup extent in block %u\n", lblk);
+
+ tree = &EXT4_I(inode)->i_es_tree;
+ read_lock(&EXT4_I(inode)->i_es_lock);
+
+ /* find extent in cache firstly */
+ es->es_lblk = es->es_len = es->es_pblk = 0;
+ if (tree->cache_es) {
+ es1 = tree->cache_es;
+ if (in_range(lblk, es1->es_lblk, es1->es_len)) {
+ es_debug("%u cached by [%u/%u)\n",
+ lblk, es1->es_lblk, es1->es_len);
+ found = 1;
+ goto out;
+ }
+ }
+
+ node = tree->root.rb_node;
+ while (node) {
+ es1 = rb_entry(node, struct extent_status, rb_node);
+ if (lblk < es1->es_lblk)
+ node = node->rb_left;
+ else if (lblk > ext4_es_end(es1))
+ node = node->rb_right;
+ else {
+ found = 1;
+ break;
+ }
+ }
+
+out:
+ if (found) {
+ BUG_ON(!es1);
+ es->es_lblk = es1->es_lblk;
+ es->es_len = es1->es_len;
+ es->es_pblk = es1->es_pblk;
+ }
+
+ read_unlock(&EXT4_I(inode)->i_es_lock);
+
+ trace_ext4_es_lookup_extent_exit(inode, es, found);
+ return found;
+}
+
+static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
+ ext4_lblk_t end)
+{
+ struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree;
+ struct rb_node *node;
+ struct extent_status *es;
+ struct extent_status orig_es;
+ ext4_lblk_t len1, len2;
+ ext4_fsblk_t block;
+ int err;
+
+retry:
+ err = 0;
+ es = __es_tree_search(&tree->root, lblk);
+ if (!es)
+ goto out;
+ if (es->es_lblk > end)
+ goto out;
+
+ /* Simply invalidate cache_es. */
+ tree->cache_es = NULL;
+
+ orig_es.es_lblk = es->es_lblk;
+ orig_es.es_len = es->es_len;
+ orig_es.es_pblk = es->es_pblk;
+
+ len1 = lblk > es->es_lblk ? lblk - es->es_lblk : 0;
+ len2 = ext4_es_end(es) > end ? ext4_es_end(es) - end : 0;
+ if (len1 > 0)
+ es->es_len = len1;
+ if (len2 > 0) {
+ if (len1 > 0) {
+ struct extent_status newes;
+
+ newes.es_lblk = end + 1;
+ newes.es_len = len2;
+ block = 0x7FDEADBEEFULL;
+ if (ext4_es_is_written(&orig_es) ||
+ ext4_es_is_unwritten(&orig_es))
+ block = ext4_es_pblock(&orig_es) +
+ orig_es.es_len - len2;
+ ext4_es_store_pblock_status(&newes, block,
+ ext4_es_status(&orig_es));
+ err = __es_insert_extent(inode, &newes);
+ if (err) {
+ es->es_lblk = orig_es.es_lblk;
+ es->es_len = orig_es.es_len;
+ if ((err == -ENOMEM) &&
+ __ext4_es_shrink(EXT4_SB(inode->i_sb), 1,
+ EXT4_I(inode)))
+ goto retry;
+ goto out;
+ }
+ } else {
+ es->es_lblk = end + 1;
+ es->es_len = len2;
+ if (ext4_es_is_written(es) ||
+ ext4_es_is_unwritten(es)) {
+ block = orig_es.es_pblk + orig_es.es_len - len2;
+ ext4_es_store_pblock(es, block);
+ }
+ }
+ goto out;
+ }
+
+ if (len1 > 0) {
+ node = rb_next(&es->rb_node);
+ if (node)
+ es = rb_entry(node, struct extent_status, rb_node);
+ else
+ es = NULL;
+ }
+
+ while (es && ext4_es_end(es) <= end) {
+ node = rb_next(&es->rb_node);
+ rb_erase(&es->rb_node, &tree->root);
+ ext4_es_free_extent(inode, es);
+ if (!node) {
+ es = NULL;
+ break;
+ }
+ es = rb_entry(node, struct extent_status, rb_node);
+ }
+
+ if (es && es->es_lblk < end + 1) {
+ ext4_lblk_t orig_len = es->es_len;
+
+ len1 = ext4_es_end(es) - end;
+ es->es_lblk = end + 1;
+ es->es_len = len1;
+ if (ext4_es_is_written(es) || ext4_es_is_unwritten(es)) {
+ block = es->es_pblk + orig_len - len1;
+ ext4_es_store_pblock(es, block);
+ }
+ }
+
+out:
+ return err;
+}
+
+/*
+ * ext4_es_remove_extent() removes a space from a extent status tree.
+ *
+ * Return 0 on success, error code on failure.
+ */
+int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
+ ext4_lblk_t len)
+{
+ ext4_lblk_t end;
+ int err = 0;
+
+ trace_ext4_es_remove_extent(inode, lblk, len);
+ es_debug("remove [%u/%u) from extent status tree of inode %lu\n",
+ lblk, len, inode->i_ino);
+
+ if (!len)
+ return err;
+
+ end = lblk + len - 1;
+ BUG_ON(end < lblk);
+
+ write_lock(&EXT4_I(inode)->i_es_lock);
+ err = __es_remove_extent(inode, lblk, end);
+ write_unlock(&EXT4_I(inode)->i_es_lock);
+ ext4_es_print_tree(inode);
+ return err;
+}
+
+static int ext4_inode_touch_time_cmp(void *priv, struct list_head *a,
+ struct list_head *b)
+{
+ struct ext4_inode_info *eia, *eib;
+ eia = list_entry(a, struct ext4_inode_info, i_es_lru);
+ eib = list_entry(b, struct ext4_inode_info, i_es_lru);
+
+ if (ext4_test_inode_state(&eia->vfs_inode, EXT4_STATE_EXT_PRECACHED) &&
+ !ext4_test_inode_state(&eib->vfs_inode, EXT4_STATE_EXT_PRECACHED))
+ return 1;
+ if (!ext4_test_inode_state(&eia->vfs_inode, EXT4_STATE_EXT_PRECACHED) &&
+ ext4_test_inode_state(&eib->vfs_inode, EXT4_STATE_EXT_PRECACHED))
+ return -1;
+ if (eia->i_touch_when == eib->i_touch_when)
+ return 0;
+ if (time_after(eia->i_touch_when, eib->i_touch_when))
+ return 1;
+ else
+ return -1;
+}
+
+static int __ext4_es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
+ struct ext4_inode_info *locked_ei)
+{
+ struct ext4_inode_info *ei;
+ struct list_head *cur, *tmp;
+ LIST_HEAD(skipped);
+ int nr_shrunk = 0;
+ int retried = 0, skip_precached = 1, nr_skipped = 0;
+
+ spin_lock(&sbi->s_es_lru_lock);
+
+retry:
+ list_for_each_safe(cur, tmp, &sbi->s_es_lru) {
+ int shrunk;
+
+ /*
+ * If we have already reclaimed all extents from extent
+ * status tree, just stop the loop immediately.
+ */
+ if (percpu_counter_read_positive(&sbi->s_extent_cache_cnt) == 0)
+ break;
+
+ ei = list_entry(cur, struct ext4_inode_info, i_es_lru);
+
+ /*
+ * Skip the inode that is newer than the last_sorted
+ * time. Normally we try hard to avoid shrinking
+ * precached inodes, but we will as a last resort.
+ */
+ if ((sbi->s_es_last_sorted < ei->i_touch_when) ||
+ (skip_precached && ext4_test_inode_state(&ei->vfs_inode,
+ EXT4_STATE_EXT_PRECACHED))) {
+ nr_skipped++;
+ list_move_tail(cur, &skipped);
+ continue;
+ }
+
+ if (ei->i_es_lru_nr == 0 || ei == locked_ei ||
+ !write_trylock(&ei->i_es_lock))
+ continue;
+
+ shrunk = __es_try_to_reclaim_extents(ei, nr_to_scan);
+ if (ei->i_es_lru_nr == 0)
+ list_del_init(&ei->i_es_lru);
+ write_unlock(&ei->i_es_lock);
+
+ nr_shrunk += shrunk;
+ nr_to_scan -= shrunk;
+ if (nr_to_scan == 0)
+ break;
+ }
+
+ /* Move the newer inodes into the tail of the LRU list. */
+ list_splice_tail(&skipped, &sbi->s_es_lru);
+ INIT_LIST_HEAD(&skipped);
+
+ /*
+ * If we skipped any inodes, and we weren't able to make any
+ * forward progress, sort the list and try again.
+ */
+ if ((nr_shrunk == 0) && nr_skipped && !retried) {
+ retried++;
+ list_sort(NULL, &sbi->s_es_lru, ext4_inode_touch_time_cmp);
+ sbi->s_es_last_sorted = jiffies;
+ ei = list_first_entry(&sbi->s_es_lru, struct ext4_inode_info,
+ i_es_lru);
+ /*
+ * If there are no non-precached inodes left on the
+ * list, start releasing precached extents.
+ */
+ if (ext4_test_inode_state(&ei->vfs_inode,
+ EXT4_STATE_EXT_PRECACHED))
+ skip_precached = 0;
+ goto retry;
+ }
+
+ spin_unlock(&sbi->s_es_lru_lock);
+
+ if (locked_ei && nr_shrunk == 0)
+ nr_shrunk = __es_try_to_reclaim_extents(locked_ei, nr_to_scan);
+
+ return nr_shrunk;
+}
+
+static unsigned long ext4_es_count(struct shrinker *shrink,
+ struct shrink_control *sc)
+{
+ unsigned long nr;
+ struct ext4_sb_info *sbi;
+
+ sbi = container_of(shrink, struct ext4_sb_info, s_es_shrinker);
+ nr = percpu_counter_read_positive(&sbi->s_extent_cache_cnt);
+ trace_ext4_es_shrink_enter(sbi->s_sb, sc->nr_to_scan, nr);
+ return nr;
+}
+
+static unsigned long ext4_es_scan(struct shrinker *shrink,
+ struct shrink_control *sc)
+{
+ struct ext4_sb_info *sbi = container_of(shrink,
+ struct ext4_sb_info, s_es_shrinker);
+ int nr_to_scan = sc->nr_to_scan;
+ int ret, nr_shrunk;
+
+ ret = percpu_counter_read_positive(&sbi->s_extent_cache_cnt);
+ trace_ext4_es_shrink_enter(sbi->s_sb, nr_to_scan, ret);
+
+ if (!nr_to_scan)
+ return ret;
+
+ nr_shrunk = __ext4_es_shrink(sbi, nr_to_scan, NULL);
+
+ trace_ext4_es_shrink_exit(sbi->s_sb, nr_shrunk, ret);
+ return nr_shrunk;
+}
+
+void ext4_es_register_shrinker(struct ext4_sb_info *sbi)
+{
+ INIT_LIST_HEAD(&sbi->s_es_lru);
+ spin_lock_init(&sbi->s_es_lru_lock);
+ sbi->s_es_last_sorted = 0;
+ sbi->s_es_shrinker.scan_objects = ext4_es_scan;
+ sbi->s_es_shrinker.count_objects = ext4_es_count;
+ sbi->s_es_shrinker.seeks = DEFAULT_SEEKS;
+ register_shrinker(&sbi->s_es_shrinker);
+}
+
+void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi)
+{
+ unregister_shrinker(&sbi->s_es_shrinker);
+}
+
+void ext4_es_lru_add(struct inode *inode)
+{
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+
+ ei->i_touch_when = jiffies;
+
+ if (!list_empty(&ei->i_es_lru))
+ return;
+
+ spin_lock(&sbi->s_es_lru_lock);
+ if (list_empty(&ei->i_es_lru))
+ list_add_tail(&ei->i_es_lru, &sbi->s_es_lru);
+ spin_unlock(&sbi->s_es_lru_lock);
+}
+
+void ext4_es_lru_del(struct inode *inode)
+{
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+
+ spin_lock(&sbi->s_es_lru_lock);
+ if (!list_empty(&ei->i_es_lru))
+ list_del_init(&ei->i_es_lru);
+ spin_unlock(&sbi->s_es_lru_lock);
+}
+
+static int __es_try_to_reclaim_extents(struct ext4_inode_info *ei,
+ int nr_to_scan)
+{
+ struct inode *inode = &ei->vfs_inode;
+ struct ext4_es_tree *tree = &ei->i_es_tree;
+ struct rb_node *node;
+ struct extent_status *es;
+ unsigned long nr_shrunk = 0;
+ static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
+ DEFAULT_RATELIMIT_BURST);
+
+ if (ei->i_es_lru_nr == 0)
+ return 0;
+
+ if (ext4_test_inode_state(inode, EXT4_STATE_EXT_PRECACHED) &&
+ __ratelimit(&_rs))
+ ext4_warning(inode->i_sb, "forced shrink of precached extents");
+
+ node = rb_first(&tree->root);
+ while (node != NULL) {
+ es = rb_entry(node, struct extent_status, rb_node);
+ node = rb_next(&es->rb_node);
+ /*
+ * We can't reclaim delayed extent from status tree because
+ * fiemap, bigallic, and seek_data/hole need to use it.
+ */
+ if (!ext4_es_is_delayed(es)) {
+ rb_erase(&es->rb_node, &tree->root);
+ ext4_es_free_extent(inode, es);
+ nr_shrunk++;
+ if (--nr_to_scan == 0)
+ break;
+ }
+ }
+ tree->cache_es = NULL;
+ return nr_shrunk;
+}
diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h
new file mode 100644
index 00000000000..f1b62a41992
--- /dev/null
+++ b/fs/ext4/extents_status.h
@@ -0,0 +1,146 @@
+/*
+ * fs/ext4/extents_status.h
+ *
+ * Written by Yongqiang Yang <xiaoqiangnk@gmail.com>
+ * Modified by
+ * Allison Henderson <achender@linux.vnet.ibm.com>
+ * Zheng Liu <wenqing.lz@taobao.com>
+ *
+ */
+
+#ifndef _EXT4_EXTENTS_STATUS_H
+#define _EXT4_EXTENTS_STATUS_H
+
+/*
+ * Turn on ES_DEBUG__ to get lots of info about extent status operations.
+ */
+#ifdef ES_DEBUG__
+#define es_debug(fmt, ...) printk(fmt, ##__VA_ARGS__)
+#else
+#define es_debug(fmt, ...) no_printk(fmt, ##__VA_ARGS__)
+#endif
+
+/*
+ * With ES_AGGRESSIVE_TEST defined, the result of es caching will be
+ * checked with old map_block's result.
+ */
+#define ES_AGGRESSIVE_TEST__
+
+/*
+ * These flags live in the high bits of extent_status.es_pblk
+ */
+#define ES_SHIFT 60
+
+#define EXTENT_STATUS_WRITTEN (1 << 3)
+#define EXTENT_STATUS_UNWRITTEN (1 << 2)
+#define EXTENT_STATUS_DELAYED (1 << 1)
+#define EXTENT_STATUS_HOLE (1 << 0)
+
+#define EXTENT_STATUS_FLAGS (EXTENT_STATUS_WRITTEN | \
+ EXTENT_STATUS_UNWRITTEN | \
+ EXTENT_STATUS_DELAYED | \
+ EXTENT_STATUS_HOLE)
+
+#define ES_WRITTEN (1ULL << 63)
+#define ES_UNWRITTEN (1ULL << 62)
+#define ES_DELAYED (1ULL << 61)
+#define ES_HOLE (1ULL << 60)
+
+#define ES_MASK (ES_WRITTEN | ES_UNWRITTEN | \
+ ES_DELAYED | ES_HOLE)
+
+struct ext4_sb_info;
+struct ext4_extent;
+
+struct extent_status {
+ struct rb_node rb_node;
+ ext4_lblk_t es_lblk; /* first logical block extent covers */
+ ext4_lblk_t es_len; /* length of extent in block */
+ ext4_fsblk_t es_pblk; /* first physical block */
+};
+
+struct ext4_es_tree {
+ struct rb_root root;
+ struct extent_status *cache_es; /* recently accessed extent */
+};
+
+extern int __init ext4_init_es(void);
+extern void ext4_exit_es(void);
+extern void ext4_es_init_tree(struct ext4_es_tree *tree);
+
+extern int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
+ ext4_lblk_t len, ext4_fsblk_t pblk,
+ unsigned int status);
+extern void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk,
+ ext4_lblk_t len, ext4_fsblk_t pblk,
+ unsigned int status);
+extern int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
+ ext4_lblk_t len);
+extern void ext4_es_find_delayed_extent_range(struct inode *inode,
+ ext4_lblk_t lblk, ext4_lblk_t end,
+ struct extent_status *es);
+extern int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk,
+ struct extent_status *es);
+
+static inline int ext4_es_is_written(struct extent_status *es)
+{
+ return (es->es_pblk & ES_WRITTEN) != 0;
+}
+
+static inline int ext4_es_is_unwritten(struct extent_status *es)
+{
+ return (es->es_pblk & ES_UNWRITTEN) != 0;
+}
+
+static inline int ext4_es_is_delayed(struct extent_status *es)
+{
+ return (es->es_pblk & ES_DELAYED) != 0;
+}
+
+static inline int ext4_es_is_hole(struct extent_status *es)
+{
+ return (es->es_pblk & ES_HOLE) != 0;
+}
+
+static inline unsigned int ext4_es_status(struct extent_status *es)
+{
+ return es->es_pblk >> ES_SHIFT;
+}
+
+static inline ext4_fsblk_t ext4_es_pblock(struct extent_status *es)
+{
+ return es->es_pblk & ~ES_MASK;
+}
+
+static inline void ext4_es_store_pblock(struct extent_status *es,
+ ext4_fsblk_t pb)
+{
+ ext4_fsblk_t block;
+
+ block = (pb & ~ES_MASK) | (es->es_pblk & ES_MASK);
+ es->es_pblk = block;
+}
+
+static inline void ext4_es_store_status(struct extent_status *es,
+ unsigned int status)
+{
+ es->es_pblk = (((ext4_fsblk_t)
+ (status & EXTENT_STATUS_FLAGS) << ES_SHIFT) |
+ (es->es_pblk & ~ES_MASK));
+}
+
+static inline void ext4_es_store_pblock_status(struct extent_status *es,
+ ext4_fsblk_t pb,
+ unsigned int status)
+{
+ es->es_pblk = (((ext4_fsblk_t)
+ (status & EXTENT_STATUS_FLAGS) << ES_SHIFT) |
+ (pb & ~ES_MASK));
+}
+
+extern void ext4_es_register_shrinker(struct ext4_sb_info *sbi);
+extern void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi);
+extern void ext4_es_lru_add(struct inode *inode);
+extern void ext4_es_lru_del(struct inode *inode);
+
+#endif /* _EXT4_EXTENTS_STATUS_H */
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 5a5c55ddcee..8695f70af1e 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -23,7 +23,9 @@
#include <linux/jbd2.h>
#include <linux/mount.h>
#include <linux/path.h>
+#include <linux/aio.h>
#include <linux/quotaops.h>
+#include <linux/pagevec.h>
#include "ext4.h"
#include "ext4_jbd2.h"
#include "xattr.h"
@@ -55,37 +57,145 @@ static int ext4_release_file(struct inode *inode, struct file *filp)
return 0;
}
+static void ext4_unwritten_wait(struct inode *inode)
+{
+ wait_queue_head_t *wq = ext4_ioend_wq(inode);
+
+ wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_unwritten) == 0));
+}
+
+/*
+ * This tests whether the IO in question is block-aligned or not.
+ * Ext4 utilizes unwritten extents when hole-filling during direct IO, and they
+ * are converted to written only after the IO is complete. Until they are
+ * mapped, these blocks appear as holes, so dio_zero_block() will assume that
+ * it needs to zero out portions of the start and/or end block. If 2 AIO
+ * threads are at work on the same unwritten block, they must be synchronized
+ * or one thread will zero the other's data, causing corruption.
+ */
+static int
+ext4_unaligned_aio(struct inode *inode, struct iov_iter *from, loff_t pos)
+{
+ struct super_block *sb = inode->i_sb;
+ int blockmask = sb->s_blocksize - 1;
+
+ if (pos >= i_size_read(inode))
+ return 0;
+
+ if ((pos | iov_iter_alignment(from)) & blockmask)
+ return 1;
+
+ return 0;
+}
+
static ssize_t
-ext4_file_write(struct kiocb *iocb, const struct iovec *iov,
- unsigned long nr_segs, loff_t pos)
+ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
- struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode;
+ struct file *file = iocb->ki_filp;
+ struct inode *inode = file_inode(iocb->ki_filp);
+ struct mutex *aio_mutex = NULL;
+ struct blk_plug plug;
+ int o_direct = file->f_flags & O_DIRECT;
+ int overwrite = 0;
+ size_t length = iov_iter_count(from);
+ ssize_t ret;
+ loff_t pos = iocb->ki_pos;
+
+ /*
+ * Unaligned direct AIO must be serialized; see comment above
+ * In the case of O_APPEND, assume that we must always serialize
+ */
+ if (o_direct &&
+ ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) &&
+ !is_sync_kiocb(iocb) &&
+ (file->f_flags & O_APPEND ||
+ ext4_unaligned_aio(inode, from, pos))) {
+ aio_mutex = ext4_aio_mutex(inode);
+ mutex_lock(aio_mutex);
+ ext4_unwritten_wait(inode);
+ }
+
+ mutex_lock(&inode->i_mutex);
+ if (file->f_flags & O_APPEND)
+ iocb->ki_pos = pos = i_size_read(inode);
/*
* If we have encountered a bitmap-format file, the size limit
* is smaller than s_maxbytes, which is for extent-mapped files.
*/
-
if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
- size_t length = iov_length(iov, nr_segs);
- if ((pos > sbi->s_bitmap_maxbytes ||
- (pos == sbi->s_bitmap_maxbytes && length > 0)))
- return -EFBIG;
+ if ((pos > sbi->s_bitmap_maxbytes) ||
+ (pos == sbi->s_bitmap_maxbytes && length > 0)) {
+ mutex_unlock(&inode->i_mutex);
+ ret = -EFBIG;
+ goto errout;
+ }
+
+ if (pos + length > sbi->s_bitmap_maxbytes)
+ iov_iter_truncate(from, sbi->s_bitmap_maxbytes - pos);
+ }
+
+ if (o_direct) {
+ blk_start_plug(&plug);
+
+ iocb->private = &overwrite;
- if (pos + length > sbi->s_bitmap_maxbytes) {
- nr_segs = iov_shorten((struct iovec *)iov, nr_segs,
- sbi->s_bitmap_maxbytes - pos);
+ /* check whether we do a DIO overwrite or not */
+ if (ext4_should_dioread_nolock(inode) && !aio_mutex &&
+ !file->f_mapping->nrpages && pos + length <= i_size_read(inode)) {
+ struct ext4_map_blocks map;
+ unsigned int blkbits = inode->i_blkbits;
+ int err, len;
+
+ map.m_lblk = pos >> blkbits;
+ map.m_len = (EXT4_BLOCK_ALIGN(pos + length, blkbits) >> blkbits)
+ - map.m_lblk;
+ len = map.m_len;
+
+ err = ext4_map_blocks(NULL, inode, &map, 0);
+ /*
+ * 'err==len' means that all of blocks has
+ * been preallocated no matter they are
+ * initialized or not. For excluding
+ * unwritten extents, we need to check
+ * m_flags. There are two conditions that
+ * indicate for initialized extents. 1) If we
+ * hit extent cache, EXT4_MAP_MAPPED flag is
+ * returned; 2) If we do a real lookup,
+ * non-flags are returned. So we should check
+ * these two conditions.
+ */
+ if (err == len && (map.m_flags & EXT4_MAP_MAPPED))
+ overwrite = 1;
}
}
- return generic_file_aio_write(iocb, iov, nr_segs, pos);
+ ret = __generic_file_write_iter(iocb, from);
+ mutex_unlock(&inode->i_mutex);
+
+ if (ret > 0) {
+ ssize_t err;
+
+ err = generic_write_sync(file, iocb->ki_pos - ret, ret);
+ if (err < 0)
+ ret = err;
+ }
+ if (o_direct)
+ blk_finish_plug(&plug);
+
+errout:
+ if (aio_mutex)
+ mutex_unlock(aio_mutex);
+ return ret;
}
static const struct vm_operations_struct ext4_file_vm_ops = {
.fault = filemap_fault,
+ .map_pages = filemap_map_pages,
.page_mkwrite = ext4_page_mkwrite,
+ .remap_pages = generic_file_remap_pages,
};
static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
@@ -96,7 +206,6 @@ static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
return -ENOEXEC;
file_accessed(file);
vma->vm_ops = &ext4_file_vm_ops;
- vma->vm_flags |= VM_CAN_NONLINEAR;
return 0;
}
@@ -122,62 +231,366 @@ static int ext4_file_open(struct inode * inode, struct file * filp)
path.dentry = mnt->mnt_root;
cp = d_path(&path, buf, sizeof(buf));
if (!IS_ERR(cp)) {
- memcpy(sbi->s_es->s_last_mounted, cp,
- sizeof(sbi->s_es->s_last_mounted));
- ext4_mark_super_dirty(sb);
+ handle_t *handle;
+ int err;
+
+ handle = ext4_journal_start_sb(sb, EXT4_HT_MISC, 1);
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
+ BUFFER_TRACE(sbi->s_sbh, "get_write_access");
+ err = ext4_journal_get_write_access(handle, sbi->s_sbh);
+ if (err) {
+ ext4_journal_stop(handle);
+ return err;
+ }
+ strlcpy(sbi->s_es->s_last_mounted, cp,
+ sizeof(sbi->s_es->s_last_mounted));
+ ext4_handle_dirty_super(handle, sb);
+ ext4_journal_stop(handle);
}
}
+ /*
+ * Set up the jbd2_inode if we are opening the inode for
+ * writing and the journal is present
+ */
+ if (filp->f_mode & FMODE_WRITE) {
+ int ret = ext4_inode_attach_jinode(inode);
+ if (ret < 0)
+ return ret;
+ }
return dquot_file_open(inode, filp);
}
/*
- * ext4_llseek() copied from generic_file_llseek() to handle both
- * block-mapped and extent-mapped maxbytes values. This should
- * otherwise be identical with generic_file_llseek().
+ * Here we use ext4_map_blocks() to get a block mapping for a extent-based
+ * file rather than ext4_ext_walk_space() because we can introduce
+ * SEEK_DATA/SEEK_HOLE for block-mapped and extent-mapped file at the same
+ * function. When extent status tree has been fully implemented, it will
+ * track all extent status for a file and we can directly use it to
+ * retrieve the offset for SEEK_DATA/SEEK_HOLE.
+ */
+
+/*
+ * When we retrieve the offset for SEEK_DATA/SEEK_HOLE, we would need to
+ * lookup page cache to check whether or not there has some data between
+ * [startoff, endoff] because, if this range contains an unwritten extent,
+ * we determine this extent as a data or a hole according to whether the
+ * page cache has data or not.
*/
-loff_t ext4_llseek(struct file *file, loff_t offset, int origin)
+static int ext4_find_unwritten_pgoff(struct inode *inode,
+ int whence,
+ struct ext4_map_blocks *map,
+ loff_t *offset)
+{
+ struct pagevec pvec;
+ unsigned int blkbits;
+ pgoff_t index;
+ pgoff_t end;
+ loff_t endoff;
+ loff_t startoff;
+ loff_t lastoff;
+ int found = 0;
+
+ blkbits = inode->i_sb->s_blocksize_bits;
+ startoff = *offset;
+ lastoff = startoff;
+ endoff = (loff_t)(map->m_lblk + map->m_len) << blkbits;
+
+ index = startoff >> PAGE_CACHE_SHIFT;
+ end = endoff >> PAGE_CACHE_SHIFT;
+
+ pagevec_init(&pvec, 0);
+ do {
+ int i, num;
+ unsigned long nr_pages;
+
+ num = min_t(pgoff_t, end - index, PAGEVEC_SIZE);
+ nr_pages = pagevec_lookup(&pvec, inode->i_mapping, index,
+ (pgoff_t)num);
+ if (nr_pages == 0) {
+ if (whence == SEEK_DATA)
+ break;
+
+ BUG_ON(whence != SEEK_HOLE);
+ /*
+ * If this is the first time to go into the loop and
+ * offset is not beyond the end offset, it will be a
+ * hole at this offset
+ */
+ if (lastoff == startoff || lastoff < endoff)
+ found = 1;
+ break;
+ }
+
+ /*
+ * If this is the first time to go into the loop and
+ * offset is smaller than the first page offset, it will be a
+ * hole at this offset.
+ */
+ if (lastoff == startoff && whence == SEEK_HOLE &&
+ lastoff < page_offset(pvec.pages[0])) {
+ found = 1;
+ break;
+ }
+
+ for (i = 0; i < nr_pages; i++) {
+ struct page *page = pvec.pages[i];
+ struct buffer_head *bh, *head;
+
+ /*
+ * If the current offset is not beyond the end of given
+ * range, it will be a hole.
+ */
+ if (lastoff < endoff && whence == SEEK_HOLE &&
+ page->index > end) {
+ found = 1;
+ *offset = lastoff;
+ goto out;
+ }
+
+ lock_page(page);
+
+ if (unlikely(page->mapping != inode->i_mapping)) {
+ unlock_page(page);
+ continue;
+ }
+
+ if (!page_has_buffers(page)) {
+ unlock_page(page);
+ continue;
+ }
+
+ if (page_has_buffers(page)) {
+ lastoff = page_offset(page);
+ bh = head = page_buffers(page);
+ do {
+ if (buffer_uptodate(bh) ||
+ buffer_unwritten(bh)) {
+ if (whence == SEEK_DATA)
+ found = 1;
+ } else {
+ if (whence == SEEK_HOLE)
+ found = 1;
+ }
+ if (found) {
+ *offset = max_t(loff_t,
+ startoff, lastoff);
+ unlock_page(page);
+ goto out;
+ }
+ lastoff += bh->b_size;
+ bh = bh->b_this_page;
+ } while (bh != head);
+ }
+
+ lastoff = page_offset(page) + PAGE_SIZE;
+ unlock_page(page);
+ }
+
+ /*
+ * The no. of pages is less than our desired, that would be a
+ * hole in there.
+ */
+ if (nr_pages < num && whence == SEEK_HOLE) {
+ found = 1;
+ *offset = lastoff;
+ break;
+ }
+
+ index = pvec.pages[i - 1]->index + 1;
+ pagevec_release(&pvec);
+ } while (index <= end);
+
+out:
+ pagevec_release(&pvec);
+ return found;
+}
+
+/*
+ * ext4_seek_data() retrieves the offset for SEEK_DATA.
+ */
+static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize)
{
struct inode *inode = file->f_mapping->host;
- loff_t maxbytes;
+ struct ext4_map_blocks map;
+ struct extent_status es;
+ ext4_lblk_t start, last, end;
+ loff_t dataoff, isize;
+ int blkbits;
+ int ret = 0;
- if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
- maxbytes = EXT4_SB(inode->i_sb)->s_bitmap_maxbytes;
- else
- maxbytes = inode->i_sb->s_maxbytes;
mutex_lock(&inode->i_mutex);
- switch (origin) {
- case SEEK_END:
- offset += inode->i_size;
- break;
- case SEEK_CUR:
- if (offset == 0) {
- mutex_unlock(&inode->i_mutex);
- return file->f_pos;
- }
- offset += file->f_pos;
- break;
- }
- if (offset < 0 || offset > maxbytes) {
+ isize = i_size_read(inode);
+ if (offset >= isize) {
mutex_unlock(&inode->i_mutex);
- return -EINVAL;
+ return -ENXIO;
}
- if (offset != file->f_pos) {
- file->f_pos = offset;
- file->f_version = 0;
+ blkbits = inode->i_sb->s_blocksize_bits;
+ start = offset >> blkbits;
+ last = start;
+ end = isize >> blkbits;
+ dataoff = offset;
+
+ do {
+ map.m_lblk = last;
+ map.m_len = end - last + 1;
+ ret = ext4_map_blocks(NULL, inode, &map, 0);
+ if (ret > 0 && !(map.m_flags & EXT4_MAP_UNWRITTEN)) {
+ if (last != start)
+ dataoff = (loff_t)last << blkbits;
+ break;
+ }
+
+ /*
+ * If there is a delay extent at this offset,
+ * it will be as a data.
+ */
+ ext4_es_find_delayed_extent_range(inode, last, last, &es);
+ if (es.es_len != 0 && in_range(last, es.es_lblk, es.es_len)) {
+ if (last != start)
+ dataoff = (loff_t)last << blkbits;
+ break;
+ }
+
+ /*
+ * If there is a unwritten extent at this offset,
+ * it will be as a data or a hole according to page
+ * cache that has data or not.
+ */
+ if (map.m_flags & EXT4_MAP_UNWRITTEN) {
+ int unwritten;
+ unwritten = ext4_find_unwritten_pgoff(inode, SEEK_DATA,
+ &map, &dataoff);
+ if (unwritten)
+ break;
+ }
+
+ last++;
+ dataoff = (loff_t)last << blkbits;
+ } while (last <= end);
+
+ mutex_unlock(&inode->i_mutex);
+
+ if (dataoff > isize)
+ return -ENXIO;
+
+ return vfs_setpos(file, dataoff, maxsize);
+}
+
+/*
+ * ext4_seek_hole() retrieves the offset for SEEK_HOLE.
+ */
+static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize)
+{
+ struct inode *inode = file->f_mapping->host;
+ struct ext4_map_blocks map;
+ struct extent_status es;
+ ext4_lblk_t start, last, end;
+ loff_t holeoff, isize;
+ int blkbits;
+ int ret = 0;
+
+ mutex_lock(&inode->i_mutex);
+
+ isize = i_size_read(inode);
+ if (offset >= isize) {
+ mutex_unlock(&inode->i_mutex);
+ return -ENXIO;
}
+
+ blkbits = inode->i_sb->s_blocksize_bits;
+ start = offset >> blkbits;
+ last = start;
+ end = isize >> blkbits;
+ holeoff = offset;
+
+ do {
+ map.m_lblk = last;
+ map.m_len = end - last + 1;
+ ret = ext4_map_blocks(NULL, inode, &map, 0);
+ if (ret > 0 && !(map.m_flags & EXT4_MAP_UNWRITTEN)) {
+ last += ret;
+ holeoff = (loff_t)last << blkbits;
+ continue;
+ }
+
+ /*
+ * If there is a delay extent at this offset,
+ * we will skip this extent.
+ */
+ ext4_es_find_delayed_extent_range(inode, last, last, &es);
+ if (es.es_len != 0 && in_range(last, es.es_lblk, es.es_len)) {
+ last = es.es_lblk + es.es_len;
+ holeoff = (loff_t)last << blkbits;
+ continue;
+ }
+
+ /*
+ * If there is a unwritten extent at this offset,
+ * it will be as a data or a hole according to page
+ * cache that has data or not.
+ */
+ if (map.m_flags & EXT4_MAP_UNWRITTEN) {
+ int unwritten;
+ unwritten = ext4_find_unwritten_pgoff(inode, SEEK_HOLE,
+ &map, &holeoff);
+ if (!unwritten) {
+ last += ret;
+ holeoff = (loff_t)last << blkbits;
+ continue;
+ }
+ }
+
+ /* find a hole */
+ break;
+ } while (last <= end);
+
mutex_unlock(&inode->i_mutex);
- return offset;
+ if (holeoff > isize)
+ holeoff = isize;
+
+ return vfs_setpos(file, holeoff, maxsize);
+}
+
+/*
+ * ext4_llseek() handles both block-mapped and extent-mapped maxbytes values
+ * by calling generic_file_llseek_size() with the appropriate maxbytes
+ * value for each.
+ */
+loff_t ext4_llseek(struct file *file, loff_t offset, int whence)
+{
+ struct inode *inode = file->f_mapping->host;
+ loff_t maxbytes;
+
+ if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
+ maxbytes = EXT4_SB(inode->i_sb)->s_bitmap_maxbytes;
+ else
+ maxbytes = inode->i_sb->s_maxbytes;
+
+ switch (whence) {
+ case SEEK_SET:
+ case SEEK_CUR:
+ case SEEK_END:
+ return generic_file_llseek_size(file, offset, whence,
+ maxbytes, i_size_read(inode));
+ case SEEK_DATA:
+ return ext4_seek_data(file, offset, maxbytes);
+ case SEEK_HOLE:
+ return ext4_seek_hole(file, offset, maxbytes);
+ }
+
+ return -EINVAL;
}
const struct file_operations ext4_file_operations = {
.llseek = ext4_llseek,
- .read = do_sync_read,
- .write = do_sync_write,
- .aio_read = generic_file_aio_read,
- .aio_write = ext4_file_write,
+ .read = new_sync_read,
+ .write = new_sync_write,
+ .read_iter = generic_file_read_iter,
+ .write_iter = ext4_file_write_iter,
.unlocked_ioctl = ext4_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = ext4_compat_ioctl,
@@ -187,21 +600,19 @@ const struct file_operations ext4_file_operations = {
.release = ext4_release_file,
.fsync = ext4_sync_file,
.splice_read = generic_file_splice_read,
- .splice_write = generic_file_splice_write,
+ .splice_write = iter_file_splice_write,
+ .fallocate = ext4_fallocate,
};
const struct inode_operations ext4_file_inode_operations = {
- .truncate = ext4_truncate,
.setattr = ext4_setattr,
.getattr = ext4_getattr,
-#ifdef CONFIG_EXT4_FS_XATTR
.setxattr = generic_setxattr,
.getxattr = generic_getxattr,
.listxattr = ext4_listxattr,
.removexattr = generic_removexattr,
-#endif
- .check_acl = ext4_check_acl,
- .fallocate = ext4_fallocate,
+ .get_acl = ext4_get_acl,
+ .set_acl = ext4_set_acl,
.fiemap = ext4_fiemap,
};
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index c1a7bc923cf..a8bc47f75fa 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -34,89 +34,6 @@
#include <trace/events/ext4.h>
-static void dump_completed_IO(struct inode * inode)
-{
-#ifdef EXT4_DEBUG
- struct list_head *cur, *before, *after;
- ext4_io_end_t *io, *io0, *io1;
- unsigned long flags;
-
- if (list_empty(&EXT4_I(inode)->i_completed_io_list)){
- ext4_debug("inode %lu completed_io list is empty\n", inode->i_ino);
- return;
- }
-
- ext4_debug("Dump inode %lu completed_io list \n", inode->i_ino);
- spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags);
- list_for_each_entry(io, &EXT4_I(inode)->i_completed_io_list, list){
- cur = &io->list;
- before = cur->prev;
- io0 = container_of(before, ext4_io_end_t, list);
- after = cur->next;
- io1 = container_of(after, ext4_io_end_t, list);
-
- ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n",
- io, inode->i_ino, io0, io1);
- }
- spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags);
-#endif
-}
-
-/*
- * This function is called from ext4_sync_file().
- *
- * When IO is completed, the work to convert unwritten extents to
- * written is queued on workqueue but may not get immediately
- * scheduled. When fsync is called, we need to ensure the
- * conversion is complete before fsync returns.
- * The inode keeps track of a list of pending/completed IO that
- * might needs to do the conversion. This function walks through
- * the list and convert the related unwritten extents for completed IO
- * to written.
- * The function return the number of pending IOs on success.
- */
-static int flush_completed_IO(struct inode *inode)
-{
- ext4_io_end_t *io;
- struct ext4_inode_info *ei = EXT4_I(inode);
- unsigned long flags;
- int ret = 0;
- int ret2 = 0;
-
- if (list_empty(&ei->i_completed_io_list))
- return ret;
-
- dump_completed_IO(inode);
- spin_lock_irqsave(&ei->i_completed_io_lock, flags);
- while (!list_empty(&ei->i_completed_io_list)){
- io = list_entry(ei->i_completed_io_list.next,
- ext4_io_end_t, list);
- /*
- * Calling ext4_end_io_nolock() to convert completed
- * IO to written.
- *
- * When ext4_sync_file() is called, run_queue() may already
- * about to flush the work corresponding to this io structure.
- * It will be upset if it founds the io structure related
- * to the work-to-be schedule is freed.
- *
- * Thus we need to keep the io structure still valid here after
- * convertion finished. The io structure has a flag to
- * avoid double converting from both fsync and background work
- * queue work.
- */
- spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
- ret = ext4_end_io_nolock(io);
- spin_lock_irqsave(&ei->i_completed_io_lock, flags);
- if (ret < 0)
- ret2 = ret;
- else
- list_del_init(&io->list);
- }
- spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
- return (ret2 < 0) ? ret2 : 0;
-}
-
/*
* If we're not journaling and this is a just-created file, we have to
* sync our parent directory (if it was freshly created) since
@@ -125,19 +42,35 @@ static int flush_completed_IO(struct inode *inode)
* the parent directory's parent as well, and so on recursively, if
* they are also freshly created.
*/
-static void ext4_sync_parent(struct inode *inode)
+static int ext4_sync_parent(struct inode *inode)
{
struct dentry *dentry = NULL;
+ struct inode *next;
+ int ret = 0;
- while (inode && ext4_test_inode_state(inode, EXT4_STATE_NEWENTRY)) {
+ if (!ext4_test_inode_state(inode, EXT4_STATE_NEWENTRY))
+ return 0;
+ inode = igrab(inode);
+ while (ext4_test_inode_state(inode, EXT4_STATE_NEWENTRY)) {
ext4_clear_inode_state(inode, EXT4_STATE_NEWENTRY);
- dentry = list_entry(inode->i_dentry.next,
- struct dentry, d_alias);
- if (!dentry || !dentry->d_parent || !dentry->d_parent->d_inode)
+ dentry = d_find_any_alias(inode);
+ if (!dentry)
+ break;
+ next = igrab(dentry->d_parent->d_inode);
+ dput(dentry);
+ if (!next)
+ break;
+ iput(inode);
+ inode = next;
+ ret = sync_mapping_buffers(inode->i_mapping);
+ if (ret)
+ break;
+ ret = sync_inode_metadata(inode, 1);
+ if (ret)
break;
- inode = dentry->d_parent->d_inode;
- sync_mapping_buffers(inode->i_mapping);
}
+ iput(inode);
+ return ret;
}
/*
@@ -150,36 +83,39 @@ static void ext4_sync_parent(struct inode *inode)
*
* What we do is just kick off a commit and wait on it. This will snapshot the
* inode to disk.
- *
- * i_mutex lock is held when entering and exiting this function
*/
-int ext4_sync_file(struct file *file, int datasync)
+int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
{
struct inode *inode = file->f_mapping->host;
struct ext4_inode_info *ei = EXT4_I(inode);
journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
- int ret;
+ int ret = 0, err;
tid_t commit_tid;
+ bool needs_barrier = false;
J_ASSERT(ext4_journal_current_handle() == NULL);
- trace_ext4_sync_file(file, datasync);
+ trace_ext4_sync_file_enter(file, datasync);
- if (inode->i_sb->s_flags & MS_RDONLY)
- return 0;
-
- ret = flush_completed_IO(inode);
- if (ret < 0)
- return ret;
+ if (inode->i_sb->s_flags & MS_RDONLY) {
+ /* Make sure that we read updated s_mount_flags value */
+ smp_rmb();
+ if (EXT4_SB(inode->i_sb)->s_mount_flags & EXT4_MF_FS_ABORTED)
+ ret = -EROFS;
+ goto out;
+ }
if (!journal) {
- ret = generic_file_fsync(file, datasync);
- if (!ret && !list_empty(&inode->i_dentry))
- ext4_sync_parent(inode);
- return ret;
+ ret = generic_file_fsync(file, start, end, datasync);
+ if (!ret && !hlist_empty(&inode->i_dentry))
+ ret = ext4_sync_parent(inode);
+ goto out;
}
+ ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
+ if (ret)
+ return ret;
/*
* data=writeback,ordered:
* The caller's filemap_fdatawrite()/wait will sync the data.
@@ -194,26 +130,22 @@ int ext4_sync_file(struct file *file, int datasync)
* (they were dirtied by commit). But that's OK - the blocks are
* safe in-journal, which is all fsync() needs to ensure.
*/
- if (ext4_should_journal_data(inode))
- return ext4_force_commit(inode->i_sb);
+ if (ext4_should_journal_data(inode)) {
+ ret = ext4_force_commit(inode->i_sb);
+ goto out;
+ }
commit_tid = datasync ? ei->i_datasync_tid : ei->i_sync_tid;
- if (jbd2_log_start_commit(journal, commit_tid)) {
- /*
- * When the journal is on a different device than the
- * fs data disk, we need to issue the barrier in
- * writeback mode. (In ordered mode, the jbd2 layer
- * will take care of issuing the barrier. In
- * data=journal, all of the data blocks are written to
- * the journal device.)
- */
- if (ext4_should_writeback_data(inode) &&
- (journal->j_fs_dev != journal->j_dev) &&
- (journal->j_flags & JBD2_BARRIER))
- blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL,
- NULL);
- ret = jbd2_log_wait_commit(journal, commit_tid);
- } else if (journal->j_flags & JBD2_BARRIER)
- blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
+ if (journal->j_flags & JBD2_BARRIER &&
+ !jbd2_trans_will_send_data_barrier(journal, commit_tid))
+ needs_barrier = true;
+ ret = jbd2_complete_transaction(journal, commit_tid);
+ if (needs_barrier) {
+ err = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
+ if (!ret)
+ ret = err;
+ }
+out:
+ trace_ext4_sync_file_exit(inode, ret);
return ret;
}
diff --git a/fs/ext4/hash.c b/fs/ext4/hash.c
index ac8f168c8ab..3d586f02883 100644
--- a/fs/ext4/hash.c
+++ b/fs/ext4/hash.c
@@ -155,11 +155,11 @@ int ext4fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo)
/* Check to see if the seed is all zero's */
if (hinfo->seed) {
for (i = 0; i < 4; i++) {
- if (hinfo->seed[i])
+ if (hinfo->seed[i]) {
+ memcpy(buf, hinfo->seed, sizeof(buf));
break;
+ }
}
- if (i < 4)
- memcpy(buf, hinfo->seed, sizeof(buf));
}
switch (hinfo->hash_version) {
@@ -200,8 +200,8 @@ int ext4fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo)
return -1;
}
hash = hash & ~1;
- if (hash == (EXT4_HTREE_EOF << 1))
- hash = (EXT4_HTREE_EOF-1) << 1;
+ if (hash == (EXT4_HTREE_EOF_32BIT << 1))
+ hash = (EXT4_HTREE_EOF_32BIT - 1) << 1;
hinfo->hash = hash;
hinfo->minor_hash = minor_hash;
return 0;
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 1ce240a23eb..5b87fc36aab 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -70,28 +70,49 @@ static unsigned ext4_init_inode_bitmap(struct super_block *sb,
ext4_group_t block_group,
struct ext4_group_desc *gdp)
{
+ struct ext4_group_info *grp;
struct ext4_sb_info *sbi = EXT4_SB(sb);
-
J_ASSERT_BH(bh, buffer_locked(bh));
/* If checksum is bad mark all blocks and inodes use to prevent
* allocation, essentially implementing a per-group read-only flag. */
- if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) {
+ if (!ext4_group_desc_csum_verify(sb, block_group, gdp)) {
ext4_error(sb, "Checksum bad for group %u", block_group);
- ext4_free_blks_set(sb, gdp, 0);
- ext4_free_inodes_set(sb, gdp, 0);
- ext4_itable_unused_set(sb, gdp, 0);
- memset(bh->b_data, 0xff, sb->s_blocksize);
+ grp = ext4_get_group_info(sb, block_group);
+ if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp))
+ percpu_counter_sub(&sbi->s_freeclusters_counter,
+ grp->bb_free);
+ set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state);
+ if (!EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) {
+ int count;
+ count = ext4_free_inodes_count(sb, gdp);
+ percpu_counter_sub(&sbi->s_freeinodes_counter,
+ count);
+ }
+ set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state);
return 0;
}
memset(bh->b_data, 0, (EXT4_INODES_PER_GROUP(sb) + 7) / 8);
ext4_mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8,
bh->b_data);
+ ext4_inode_bitmap_csum_set(sb, block_group, gdp, bh,
+ EXT4_INODES_PER_GROUP(sb) / 8);
+ ext4_group_desc_csum_set(sb, block_group, gdp);
return EXT4_INODES_PER_GROUP(sb);
}
+void ext4_end_bitmap_read(struct buffer_head *bh, int uptodate)
+{
+ if (uptodate) {
+ set_buffer_uptodate(bh);
+ set_bitmap_uptodate(bh);
+ }
+ unlock_buffer(bh);
+ put_bh(bh);
+}
+
/*
* Read the inode allocation bitmap for a given block_group, reading
* into the specified slot in the superblock's bitmap cache.
@@ -104,6 +125,8 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
struct ext4_group_desc *desc;
struct buffer_head *bh = NULL;
ext4_fsblk_t bitmap_blk;
+ struct ext4_group_info *grp;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
desc = ext4_get_group_desc(sb, block_group, NULL);
if (!desc)
@@ -118,12 +141,12 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
return NULL;
}
if (bitmap_uptodate(bh))
- return bh;
+ goto verify;
lock_buffer(bh);
if (bitmap_uptodate(bh)) {
unlock_buffer(bh);
- return bh;
+ goto verify;
}
ext4_lock_group(sb, block_group);
@@ -131,6 +154,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
ext4_init_inode_bitmap(sb, bh, block_group, desc);
set_bitmap_uptodate(bh);
set_buffer_uptodate(bh);
+ set_buffer_verified(bh);
ext4_unlock_group(sb, block_group);
unlock_buffer(bh);
return bh;
@@ -144,22 +168,45 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
*/
set_bitmap_uptodate(bh);
unlock_buffer(bh);
- return bh;
+ goto verify;
}
/*
- * submit the buffer_head for read. We can
- * safely mark the bitmap as uptodate now.
- * We do it here so the bitmap uptodate bit
- * get set with buffer lock held.
+ * submit the buffer_head for reading
*/
- set_bitmap_uptodate(bh);
- if (bh_submit_read(bh) < 0) {
+ trace_ext4_load_inode_bitmap(sb, block_group);
+ bh->b_end_io = ext4_end_bitmap_read;
+ get_bh(bh);
+ submit_bh(READ | REQ_META | REQ_PRIO, bh);
+ wait_on_buffer(bh);
+ if (!buffer_uptodate(bh)) {
put_bh(bh);
ext4_error(sb, "Cannot read inode bitmap - "
- "block_group = %u, inode_bitmap = %llu",
- block_group, bitmap_blk);
+ "block_group = %u, inode_bitmap = %llu",
+ block_group, bitmap_blk);
return NULL;
}
+
+verify:
+ ext4_lock_group(sb, block_group);
+ if (!buffer_verified(bh) &&
+ !ext4_inode_bitmap_csum_verify(sb, block_group, desc, bh,
+ EXT4_INODES_PER_GROUP(sb) / 8)) {
+ ext4_unlock_group(sb, block_group);
+ put_bh(bh);
+ ext4_error(sb, "Corrupt inode bitmap - block_group = %u, "
+ "inode_bitmap = %llu", block_group, bitmap_blk);
+ grp = ext4_get_group_info(sb, block_group);
+ if (!EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) {
+ int count;
+ count = ext4_free_inodes_count(sb, desc);
+ percpu_counter_sub(&sbi->s_freeinodes_counter,
+ count);
+ }
+ set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state);
+ return NULL;
+ }
+ ext4_unlock_group(sb, block_group);
+ set_buffer_verified(bh);
return bh;
}
@@ -192,20 +239,22 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
struct ext4_super_block *es;
struct ext4_sb_info *sbi;
int fatal = 0, err, count, cleared;
+ struct ext4_group_info *grp;
- if (atomic_read(&inode->i_count) > 1) {
- printk(KERN_ERR "ext4_free_inode: inode has count=%d\n",
- atomic_read(&inode->i_count));
+ if (!sb) {
+ printk(KERN_ERR "EXT4-fs: %s:%d: inode on "
+ "nonexistent device\n", __func__, __LINE__);
return;
}
- if (inode->i_nlink) {
- printk(KERN_ERR "ext4_free_inode: inode has nlink=%d\n",
- inode->i_nlink);
+ if (atomic_read(&inode->i_count) > 1) {
+ ext4_msg(sb, KERN_ERR, "%s:%d: inode #%lu: count=%d",
+ __func__, __LINE__, inode->i_ino,
+ atomic_read(&inode->i_count));
return;
}
- if (!sb) {
- printk(KERN_ERR "ext4_free_inode: inode on "
- "nonexistent device\n");
+ if (inode->i_nlink) {
+ ext4_msg(sb, KERN_ERR, "%s:%d: inode #%lu: nlink=%d\n",
+ __func__, __LINE__, inode->i_ino, inode->i_nlink);
return;
}
sbi = EXT4_SB(sb);
@@ -236,7 +285,9 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb);
bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb);
bitmap_bh = ext4_read_inode_bitmap(sb, block_group);
- if (!bitmap_bh)
+ /* Don't bother if the inode bitmap is corrupt. */
+ grp = ext4_get_group_info(sb, block_group);
+ if (unlikely(EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) || !bitmap_bh)
goto error_return;
BUFFER_TRACE(bitmap_bh, "get_write_access");
@@ -251,7 +302,7 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
fatal = ext4_journal_get_write_access(handle, bh2);
}
ext4_lock_group(sb, block_group);
- cleared = ext4_clear_bit(bit, bitmap_bh->b_data);
+ cleared = ext4_test_and_clear_bit(bit, bitmap_bh->b_data);
if (fatal || !cleared) {
ext4_unlock_group(sb, block_group);
goto out;
@@ -264,7 +315,9 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
ext4_used_dirs_set(sb, gdp, count);
percpu_counter_dec(&sbi->s_dirs_counter);
}
- gdp->bg_checksum = ext4_group_desc_csum(sbi, block_group, gdp);
+ ext4_inode_bitmap_csum_set(sb, block_group, gdp, bitmap_bh,
+ EXT4_INODES_PER_GROUP(sb) / 8);
+ ext4_group_desc_csum_set(sb, block_group, gdp);
ext4_unlock_group(sb, block_group);
percpu_counter_inc(&sbi->s_freeinodes_counter);
@@ -283,130 +336,25 @@ out:
err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
if (!fatal)
fatal = err;
- ext4_mark_super_dirty(sb);
- } else
+ } else {
ext4_error(sb, "bit already cleared for inode %lu", ino);
+ if (gdp && !EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) {
+ int count;
+ count = ext4_free_inodes_count(sb, gdp);
+ percpu_counter_sub(&sbi->s_freeinodes_counter,
+ count);
+ }
+ set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state);
+ }
error_return:
brelse(bitmap_bh);
ext4_std_error(sb, fatal);
}
-/*
- * There are two policies for allocating an inode. If the new inode is
- * a directory, then a forward search is made for a block group with both
- * free space and a low directory-to-inode ratio; if that fails, then of
- * the groups with above-average free space, that group with the fewest
- * directories already is chosen.
- *
- * For other inodes, search forward from the parent directory\'s block
- * group to find a free inode.
- */
-static int find_group_dir(struct super_block *sb, struct inode *parent,
- ext4_group_t *best_group)
-{
- ext4_group_t ngroups = ext4_get_groups_count(sb);
- unsigned int freei, avefreei;
- struct ext4_group_desc *desc, *best_desc = NULL;
- ext4_group_t group;
- int ret = -1;
-
- freei = percpu_counter_read_positive(&EXT4_SB(sb)->s_freeinodes_counter);
- avefreei = freei / ngroups;
-
- for (group = 0; group < ngroups; group++) {
- desc = ext4_get_group_desc(sb, group, NULL);
- if (!desc || !ext4_free_inodes_count(sb, desc))
- continue;
- if (ext4_free_inodes_count(sb, desc) < avefreei)
- continue;
- if (!best_desc ||
- (ext4_free_blks_count(sb, desc) >
- ext4_free_blks_count(sb, best_desc))) {
- *best_group = group;
- best_desc = desc;
- ret = 0;
- }
- }
- return ret;
-}
-
-#define free_block_ratio 10
-
-static int find_group_flex(struct super_block *sb, struct inode *parent,
- ext4_group_t *best_group)
-{
- struct ext4_sb_info *sbi = EXT4_SB(sb);
- struct ext4_group_desc *desc;
- struct flex_groups *flex_group = sbi->s_flex_groups;
- ext4_group_t parent_group = EXT4_I(parent)->i_block_group;
- ext4_group_t parent_fbg_group = ext4_flex_group(sbi, parent_group);
- ext4_group_t ngroups = ext4_get_groups_count(sb);
- int flex_size = ext4_flex_bg_size(sbi);
- ext4_group_t best_flex = parent_fbg_group;
- int blocks_per_flex = sbi->s_blocks_per_group * flex_size;
- int flexbg_free_blocks;
- int flex_freeb_ratio;
- ext4_group_t n_fbg_groups;
- ext4_group_t i;
-
- n_fbg_groups = (ngroups + flex_size - 1) >>
- sbi->s_log_groups_per_flex;
-
-find_close_to_parent:
- flexbg_free_blocks = atomic_read(&flex_group[best_flex].free_blocks);
- flex_freeb_ratio = flexbg_free_blocks * 100 / blocks_per_flex;
- if (atomic_read(&flex_group[best_flex].free_inodes) &&
- flex_freeb_ratio > free_block_ratio)
- goto found_flexbg;
-
- if (best_flex && best_flex == parent_fbg_group) {
- best_flex--;
- goto find_close_to_parent;
- }
-
- for (i = 0; i < n_fbg_groups; i++) {
- if (i == parent_fbg_group || i == parent_fbg_group - 1)
- continue;
-
- flexbg_free_blocks = atomic_read(&flex_group[i].free_blocks);
- flex_freeb_ratio = flexbg_free_blocks * 100 / blocks_per_flex;
-
- if (flex_freeb_ratio > free_block_ratio &&
- (atomic_read(&flex_group[i].free_inodes))) {
- best_flex = i;
- goto found_flexbg;
- }
-
- if ((atomic_read(&flex_group[best_flex].free_inodes) == 0) ||
- ((atomic_read(&flex_group[i].free_blocks) >
- atomic_read(&flex_group[best_flex].free_blocks)) &&
- atomic_read(&flex_group[i].free_inodes)))
- best_flex = i;
- }
-
- if (!atomic_read(&flex_group[best_flex].free_inodes) ||
- !atomic_read(&flex_group[best_flex].free_blocks))
- return -1;
-
-found_flexbg:
- for (i = best_flex * flex_size; i < ngroups &&
- i < (best_flex + 1) * flex_size; i++) {
- desc = ext4_get_group_desc(sb, i, NULL);
- if (ext4_free_inodes_count(sb, desc)) {
- *best_group = i;
- goto out;
- }
- }
-
- return -1;
-out:
- return 0;
-}
-
struct orlov_stats {
+ __u64 free_clusters;
__u32 free_inodes;
- __u32 free_blocks;
__u32 used_dirs;
};
@@ -423,7 +371,7 @@ static void get_orlov_stats(struct super_block *sb, ext4_group_t g,
if (flex_size > 1) {
stats->free_inodes = atomic_read(&flex_group[g].free_inodes);
- stats->free_blocks = atomic_read(&flex_group[g].free_blocks);
+ stats->free_clusters = atomic64_read(&flex_group[g].free_clusters);
stats->used_dirs = atomic_read(&flex_group[g].used_dirs);
return;
}
@@ -431,11 +379,11 @@ static void get_orlov_stats(struct super_block *sb, ext4_group_t g,
desc = ext4_get_group_desc(sb, g, NULL);
if (desc) {
stats->free_inodes = ext4_free_inodes_count(sb, desc);
- stats->free_blocks = ext4_free_blks_count(sb, desc);
+ stats->free_clusters = ext4_free_group_clusters(sb, desc);
stats->used_dirs = ext4_used_dirs_count(sb, desc);
} else {
stats->free_inodes = 0;
- stats->free_blocks = 0;
+ stats->free_clusters = 0;
stats->used_dirs = 0;
}
}
@@ -462,18 +410,18 @@ static void get_orlov_stats(struct super_block *sb, ext4_group_t g,
*/
static int find_group_orlov(struct super_block *sb, struct inode *parent,
- ext4_group_t *group, int mode,
+ ext4_group_t *group, umode_t mode,
const struct qstr *qstr)
{
ext4_group_t parent_group = EXT4_I(parent)->i_block_group;
struct ext4_sb_info *sbi = EXT4_SB(sb);
ext4_group_t real_ngroups = ext4_get_groups_count(sb);
int inodes_per_group = EXT4_INODES_PER_GROUP(sb);
- unsigned int freei, avefreei;
- ext4_fsblk_t freeb, avefreeb;
+ unsigned int freei, avefreei, grp_free;
+ ext4_fsblk_t freeb, avefreec;
unsigned int ndirs;
int max_dirs, min_inodes;
- ext4_grpblk_t min_blocks;
+ ext4_grpblk_t min_clusters;
ext4_group_t i, grp, g, ngroups;
struct ext4_group_desc *desc;
struct orlov_stats stats;
@@ -489,9 +437,10 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
freei = percpu_counter_read_positive(&sbi->s_freeinodes_counter);
avefreei = freei / ngroups;
- freeb = percpu_counter_read_positive(&sbi->s_freeblocks_counter);
- avefreeb = freeb;
- do_div(avefreeb, ngroups);
+ freeb = EXT4_C2B(sbi,
+ percpu_counter_read_positive(&sbi->s_freeclusters_counter));
+ avefreec = freeb;
+ do_div(avefreec, ngroups);
ndirs = percpu_counter_read_positive(&sbi->s_dirs_counter);
if (S_ISDIR(mode) &&
@@ -506,7 +455,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
ext4fs_dirhash(qstr->name, qstr->len, &hinfo);
grp = hinfo.hash;
} else
- get_random_bytes(&grp, sizeof(grp));
+ grp = prandom_u32();
parent_group = (unsigned)grp % ngroups;
for (i = 0; i < ngroups; i++) {
g = (parent_group + i) % ngroups;
@@ -517,7 +466,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
continue;
if (stats.free_inodes < avefreei)
continue;
- if (stats.free_blocks < avefreeb)
+ if (stats.free_clusters < avefreec)
continue;
grp = g;
ret = 0;
@@ -555,7 +504,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
min_inodes = avefreei - inodes_per_group*flex_size / 4;
if (min_inodes < 1)
min_inodes = 1;
- min_blocks = avefreeb - EXT4_BLOCKS_PER_GROUP(sb)*flex_size / 4;
+ min_clusters = avefreec - EXT4_CLUSTERS_PER_GROUP(sb)*flex_size / 4;
/*
* Start looking in the flex group where we last allocated an
@@ -574,7 +523,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
continue;
if (stats.free_inodes < min_inodes)
continue;
- if (stats.free_blocks < min_blocks)
+ if (stats.free_clusters < min_clusters)
continue;
goto found_flex_bg;
}
@@ -587,10 +536,12 @@ fallback_retry:
for (i = 0; i < ngroups; i++) {
grp = (parent_group + i) % ngroups;
desc = ext4_get_group_desc(sb, grp, NULL);
- if (desc && ext4_free_inodes_count(sb, desc) &&
- ext4_free_inodes_count(sb, desc) >= avefreei) {
- *group = grp;
- return 0;
+ if (desc) {
+ grp_free = ext4_free_inodes_count(sb, desc);
+ if (grp_free && grp_free >= avefreei) {
+ *group = grp;
+ return 0;
+ }
}
}
@@ -607,7 +558,7 @@ fallback_retry:
}
static int find_group_other(struct super_block *sb, struct inode *parent,
- ext4_group_t *group, int mode)
+ ext4_group_t *group, umode_t mode)
{
ext4_group_t parent_group = EXT4_I(parent)->i_block_group;
ext4_group_t i, last, ngroups = ext4_get_groups_count(sb);
@@ -649,7 +600,7 @@ static int find_group_other(struct super_block *sb, struct inode *parent,
*group = parent_group + flex_size;
if (*group > ngroups)
*group = 0;
- return find_group_orlov(sb, parent, group, mode, 0);
+ return find_group_orlov(sb, parent, group, mode, NULL);
}
/*
@@ -658,7 +609,7 @@ static int find_group_other(struct super_block *sb, struct inode *parent,
*group = parent_group;
desc = ext4_get_group_desc(sb, *group, NULL);
if (desc && ext4_free_inodes_count(sb, desc) &&
- ext4_free_blks_count(sb, desc))
+ ext4_free_group_clusters(sb, desc))
return 0;
/*
@@ -682,7 +633,7 @@ static int find_group_other(struct super_block *sb, struct inode *parent,
*group -= ngroups;
desc = ext4_get_group_desc(sb, *group, NULL);
if (desc && ext4_free_inodes_count(sb, desc) &&
- ext4_free_blks_count(sb, desc))
+ ext4_free_group_clusters(sb, desc))
return 0;
}
@@ -703,91 +654,48 @@ static int find_group_other(struct super_block *sb, struct inode *parent,
}
/*
- * claim the inode from the inode bitmap. If the group
- * is uninit we need to take the groups's ext4_group_lock
- * and clear the uninit flag. The inode bitmap update
- * and group desc uninit flag clear should be done
- * after holding ext4_group_lock so that ext4_read_inode_bitmap
- * doesn't race with the ext4_claim_inode
+ * In no journal mode, if an inode has recently been deleted, we want
+ * to avoid reusing it until we're reasonably sure the inode table
+ * block has been written back to disk. (Yes, these values are
+ * somewhat arbitrary...)
*/
-static int ext4_claim_inode(struct super_block *sb,
- struct buffer_head *inode_bitmap_bh,
- unsigned long ino, ext4_group_t group, int mode)
-{
- int free = 0, retval = 0, count;
- struct ext4_sb_info *sbi = EXT4_SB(sb);
- struct ext4_group_info *grp = ext4_get_group_info(sb, group);
- struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, NULL);
+#define RECENTCY_MIN 5
+#define RECENTCY_DIRTY 30
- /*
- * We have to be sure that new inode allocation does not race with
- * inode table initialization, because otherwise we may end up
- * allocating and writing new inode right before sb_issue_zeroout
- * takes place and overwriting our new inode with zeroes. So we
- * take alloc_sem to prevent it.
- */
- down_read(&grp->alloc_sem);
- ext4_lock_group(sb, group);
- if (ext4_set_bit(ino, inode_bitmap_bh->b_data)) {
- /* not a free inode */
- retval = 1;
- goto err_ret;
- }
- ino++;
- if ((group == 0 && ino < EXT4_FIRST_INO(sb)) ||
- ino > EXT4_INODES_PER_GROUP(sb)) {
- ext4_unlock_group(sb, group);
- up_read(&grp->alloc_sem);
- ext4_error(sb, "reserved inode or inode > inodes count - "
- "block_group = %u, inode=%lu", group,
- ino + group * EXT4_INODES_PER_GROUP(sb));
- return 1;
- }
- /* If we didn't allocate from within the initialized part of the inode
- * table then we need to initialize up to this inode. */
- if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) {
-
- if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
- gdp->bg_flags &= cpu_to_le16(~EXT4_BG_INODE_UNINIT);
- /* When marking the block group with
- * ~EXT4_BG_INODE_UNINIT we don't want to depend
- * on the value of bg_itable_unused even though
- * mke2fs could have initialized the same for us.
- * Instead we calculated the value below
- */
-
- free = 0;
- } else {
- free = EXT4_INODES_PER_GROUP(sb) -
- ext4_itable_unused_count(sb, gdp);
- }
+static int recently_deleted(struct super_block *sb, ext4_group_t group, int ino)
+{
+ struct ext4_group_desc *gdp;
+ struct ext4_inode *raw_inode;
+ struct buffer_head *bh;
+ unsigned long dtime, now;
+ int inodes_per_block = EXT4_SB(sb)->s_inodes_per_block;
+ int offset, ret = 0, recentcy = RECENTCY_MIN;
+
+ gdp = ext4_get_group_desc(sb, group, NULL);
+ if (unlikely(!gdp))
+ return 0;
+ bh = sb_getblk(sb, ext4_inode_table(sb, gdp) +
+ (ino / inodes_per_block));
+ if (unlikely(!bh) || !buffer_uptodate(bh))
/*
- * Check the relative inode number against the last used
- * relative inode number in this group. if it is greater
- * we need to update the bg_itable_unused count
- *
+ * If the block is not in the buffer cache, then it
+ * must have been written out.
*/
- if (ino > free)
- ext4_itable_unused_set(sb, gdp,
- (EXT4_INODES_PER_GROUP(sb) - ino));
- }
- count = ext4_free_inodes_count(sb, gdp) - 1;
- ext4_free_inodes_set(sb, gdp, count);
- if (S_ISDIR(mode)) {
- count = ext4_used_dirs_count(sb, gdp) + 1;
- ext4_used_dirs_set(sb, gdp, count);
- if (sbi->s_log_groups_per_flex) {
- ext4_group_t f = ext4_flex_group(sbi, group);
+ goto out;
- atomic_inc(&sbi->s_flex_groups[f].used_dirs);
- }
- }
- gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp);
-err_ret:
- ext4_unlock_group(sb, group);
- up_read(&grp->alloc_sem);
- return retval;
+ offset = (ino % inodes_per_block) * EXT4_INODE_SIZE(sb);
+ raw_inode = (struct ext4_inode *) (bh->b_data + offset);
+ dtime = le32_to_cpu(raw_inode->i_dtime);
+ now = get_seconds();
+ if (buffer_dirty(bh))
+ recentcy += RECENTCY_DIRTY;
+
+ if (dtime && (dtime < now) && (now < dtime + recentcy))
+ ret = 1;
+out:
+ brelse(bh);
+ return ret;
}
/*
@@ -800,8 +708,10 @@ err_ret:
* For other inodes, search forward from the parent directory's block
* group to find a free inode.
*/
-struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode,
- const struct qstr *qstr, __u32 goal)
+struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir,
+ umode_t mode, const struct qstr *qstr,
+ __u32 goal, uid_t *owner, int handle_type,
+ unsigned int line_no, int nblocks)
{
struct super_block *sb;
struct buffer_head *inode_bitmap_bh = NULL;
@@ -815,9 +725,8 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode,
int ret2, err = 0;
struct inode *ret;
ext4_group_t i;
- int free = 0;
- static int once = 1;
ext4_group_t flex_group;
+ struct ext4_group_info *grp;
/* Cannot create files in a deleted directory */
if (!dir || !dir->i_nlink)
@@ -832,6 +741,23 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode,
ei = EXT4_I(inode);
sbi = EXT4_SB(sb);
+ /*
+ * Initalize owners and quota early so that we don't have to account
+ * for quota initialization worst case in standard inode creating
+ * transaction
+ */
+ if (owner) {
+ inode->i_mode = mode;
+ i_uid_write(inode, owner[0]);
+ i_gid_write(inode, owner[1]);
+ } else if (test_opt(sb, GRPID)) {
+ inode->i_mode = mode;
+ inode->i_uid = current_fsuid();
+ inode->i_gid = dir->i_gid;
+ } else
+ inode_init_owner(inode, dir, mode);
+ dquot_initialize(inode);
+
if (!goal)
goal = sbi->s_inode_goal;
@@ -842,26 +768,9 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode,
goto got_group;
}
- if (sbi->s_log_groups_per_flex && test_opt(sb, OLDALLOC)) {
- ret2 = find_group_flex(sb, dir, &group);
- if (ret2 == -1) {
- ret2 = find_group_other(sb, dir, &group, mode);
- if (ret2 == 0 && once) {
- once = 0;
- printk(KERN_NOTICE "ext4: find_group_flex "
- "failed, fallback succeeded dir %lu\n",
- dir->i_ino);
- }
- }
- goto got_group;
- }
-
- if (S_ISDIR(mode)) {
- if (test_opt(sb, OLDALLOC))
- ret2 = find_group_dir(sb, dir, &group);
- else
- ret2 = find_group_orlov(sb, dir, &group, mode, qstr);
- } else
+ if (S_ISDIR(mode))
+ ret2 = find_group_orlov(sb, dir, &group, mode, qstr);
+ else
ret2 = find_group_other(sb, dir, &group, mode);
got_group:
@@ -870,65 +779,87 @@ got_group:
if (ret2 == -1)
goto out;
+ /*
+ * Normally we will only go through one pass of this loop,
+ * unless we get unlucky and it turns out the group we selected
+ * had its last inode grabbed by someone else.
+ */
for (i = 0; i < ngroups; i++, ino = 0) {
err = -EIO;
gdp = ext4_get_group_desc(sb, group, &group_desc_bh);
if (!gdp)
- goto fail;
+ goto out;
+
+ /*
+ * Check free inodes count before loading bitmap.
+ */
+ if (ext4_free_inodes_count(sb, gdp) == 0) {
+ if (++group == ngroups)
+ group = 0;
+ continue;
+ }
+
+ grp = ext4_get_group_info(sb, group);
+ /* Skip groups with already-known suspicious inode tables */
+ if (EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) {
+ if (++group == ngroups)
+ group = 0;
+ continue;
+ }
brelse(inode_bitmap_bh);
inode_bitmap_bh = ext4_read_inode_bitmap(sb, group);
- if (!inode_bitmap_bh)
- goto fail;
+ /* Skip groups with suspicious inode tables */
+ if (EXT4_MB_GRP_IBITMAP_CORRUPT(grp) || !inode_bitmap_bh) {
+ if (++group == ngroups)
+ group = 0;
+ continue;
+ }
repeat_in_this_group:
ino = ext4_find_next_zero_bit((unsigned long *)
inode_bitmap_bh->b_data,
EXT4_INODES_PER_GROUP(sb), ino);
-
- if (ino < EXT4_INODES_PER_GROUP(sb)) {
-
- BUFFER_TRACE(inode_bitmap_bh, "get_write_access");
- err = ext4_journal_get_write_access(handle,
- inode_bitmap_bh);
- if (err)
- goto fail;
-
- BUFFER_TRACE(group_desc_bh, "get_write_access");
- err = ext4_journal_get_write_access(handle,
- group_desc_bh);
- if (err)
- goto fail;
- if (!ext4_claim_inode(sb, inode_bitmap_bh,
- ino, group, mode)) {
- /* we won it */
- BUFFER_TRACE(inode_bitmap_bh,
- "call ext4_handle_dirty_metadata");
- err = ext4_handle_dirty_metadata(handle,
- NULL,
- inode_bitmap_bh);
- if (err)
- goto fail;
- /* zero bit is inode number 1*/
- ino++;
- goto got;
+ if (ino >= EXT4_INODES_PER_GROUP(sb))
+ goto next_group;
+ if (group == 0 && (ino+1) < EXT4_FIRST_INO(sb)) {
+ ext4_error(sb, "reserved inode found cleared - "
+ "inode=%lu", ino + 1);
+ continue;
+ }
+ if ((EXT4_SB(sb)->s_journal == NULL) &&
+ recently_deleted(sb, group, ino)) {
+ ino++;
+ goto next_inode;
+ }
+ if (!handle) {
+ BUG_ON(nblocks <= 0);
+ handle = __ext4_journal_start_sb(dir->i_sb, line_no,
+ handle_type, nblocks,
+ 0);
+ if (IS_ERR(handle)) {
+ err = PTR_ERR(handle);
+ ext4_std_error(sb, err);
+ goto out;
}
- /* we lost it */
- ext4_handle_release_buffer(handle, inode_bitmap_bh);
- ext4_handle_release_buffer(handle, group_desc_bh);
-
- if (++ino < EXT4_INODES_PER_GROUP(sb))
- goto repeat_in_this_group;
}
-
- /*
- * This case is possible in concurrent environment. It is very
- * rare. We cannot repeat the find_group_xxx() call because
- * that will simply return the same blockgroup, because the
- * group descriptor metadata has not yet been updated.
- * So we just go onto the next blockgroup.
- */
+ BUFFER_TRACE(inode_bitmap_bh, "get_write_access");
+ err = ext4_journal_get_write_access(handle, inode_bitmap_bh);
+ if (err) {
+ ext4_std_error(sb, err);
+ goto out;
+ }
+ ext4_lock_group(sb, group);
+ ret2 = ext4_test_and_set_bit(ino, inode_bitmap_bh->b_data);
+ ext4_unlock_group(sb, group);
+ ino++; /* the inode bitmap is zero-based */
+ if (!ret2)
+ goto got; /* we grabbed the inode! */
+next_inode:
+ if (ino < EXT4_INODES_PER_GROUP(sb))
+ goto repeat_in_this_group;
+next_group:
if (++group == ngroups)
group = 0;
}
@@ -936,8 +867,22 @@ repeat_in_this_group:
goto out;
got:
+ BUFFER_TRACE(inode_bitmap_bh, "call ext4_handle_dirty_metadata");
+ err = ext4_handle_dirty_metadata(handle, NULL, inode_bitmap_bh);
+ if (err) {
+ ext4_std_error(sb, err);
+ goto out;
+ }
+
+ BUFFER_TRACE(group_desc_bh, "get_write_access");
+ err = ext4_journal_get_write_access(handle, group_desc_bh);
+ if (err) {
+ ext4_std_error(sb, err);
+ goto out;
+ }
+
/* We may have to initialize the block bitmap if it isn't already */
- if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM) &&
+ if (ext4_has_group_desc_csum(sb) &&
gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
struct buffer_head *block_bitmap_bh;
@@ -946,54 +891,90 @@ got:
err = ext4_journal_get_write_access(handle, block_bitmap_bh);
if (err) {
brelse(block_bitmap_bh);
- goto fail;
+ ext4_std_error(sb, err);
+ goto out;
}
- free = 0;
- ext4_lock_group(sb, group);
+ BUFFER_TRACE(block_bitmap_bh, "dirty block bitmap");
+ err = ext4_handle_dirty_metadata(handle, NULL, block_bitmap_bh);
+
/* recheck and clear flag under lock if we still need to */
+ ext4_lock_group(sb, group);
if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
- free = ext4_free_blocks_after_init(sb, group, gdp);
gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
- ext4_free_blks_set(sb, gdp, free);
- gdp->bg_checksum = ext4_group_desc_csum(sbi, group,
- gdp);
+ ext4_free_group_clusters_set(sb, gdp,
+ ext4_free_clusters_after_init(sb, group, gdp));
+ ext4_block_bitmap_csum_set(sb, group, gdp,
+ block_bitmap_bh);
+ ext4_group_desc_csum_set(sb, group, gdp);
}
ext4_unlock_group(sb, group);
+ brelse(block_bitmap_bh);
- /* Don't need to dirty bitmap block if we didn't change it */
- if (free) {
- BUFFER_TRACE(block_bitmap_bh, "dirty block bitmap");
- err = ext4_handle_dirty_metadata(handle,
- NULL, block_bitmap_bh);
+ if (err) {
+ ext4_std_error(sb, err);
+ goto out;
}
+ }
- brelse(block_bitmap_bh);
- if (err)
- goto fail;
+ /* Update the relevant bg descriptor fields */
+ if (ext4_has_group_desc_csum(sb)) {
+ int free;
+ struct ext4_group_info *grp = ext4_get_group_info(sb, group);
+
+ down_read(&grp->alloc_sem); /* protect vs itable lazyinit */
+ ext4_lock_group(sb, group); /* while we modify the bg desc */
+ free = EXT4_INODES_PER_GROUP(sb) -
+ ext4_itable_unused_count(sb, gdp);
+ if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
+ gdp->bg_flags &= cpu_to_le16(~EXT4_BG_INODE_UNINIT);
+ free = 0;
+ }
+ /*
+ * Check the relative inode number against the last used
+ * relative inode number in this group. if it is greater
+ * we need to update the bg_itable_unused count
+ */
+ if (ino > free)
+ ext4_itable_unused_set(sb, gdp,
+ (EXT4_INODES_PER_GROUP(sb) - ino));
+ up_read(&grp->alloc_sem);
+ } else {
+ ext4_lock_group(sb, group);
}
+
+ ext4_free_inodes_set(sb, gdp, ext4_free_inodes_count(sb, gdp) - 1);
+ if (S_ISDIR(mode)) {
+ ext4_used_dirs_set(sb, gdp, ext4_used_dirs_count(sb, gdp) + 1);
+ if (sbi->s_log_groups_per_flex) {
+ ext4_group_t f = ext4_flex_group(sbi, group);
+
+ atomic_inc(&sbi->s_flex_groups[f].used_dirs);
+ }
+ }
+ if (ext4_has_group_desc_csum(sb)) {
+ ext4_inode_bitmap_csum_set(sb, group, gdp, inode_bitmap_bh,
+ EXT4_INODES_PER_GROUP(sb) / 8);
+ ext4_group_desc_csum_set(sb, group, gdp);
+ }
+ ext4_unlock_group(sb, group);
+
BUFFER_TRACE(group_desc_bh, "call ext4_handle_dirty_metadata");
err = ext4_handle_dirty_metadata(handle, NULL, group_desc_bh);
- if (err)
- goto fail;
+ if (err) {
+ ext4_std_error(sb, err);
+ goto out;
+ }
percpu_counter_dec(&sbi->s_freeinodes_counter);
if (S_ISDIR(mode))
percpu_counter_inc(&sbi->s_dirs_counter);
- ext4_mark_super_dirty(sb);
if (sbi->s_log_groups_per_flex) {
flex_group = ext4_flex_group(sbi, group);
atomic_dec(&sbi->s_flex_groups[flex_group].free_inodes);
}
- if (test_opt(sb, GRPID)) {
- inode->i_mode = mode;
- inode->i_uid = current_fsuid();
- inode->i_gid = dir->i_gid;
- } else
- inode_init_owner(inode, dir, mode);
-
inode->i_ino = ino + group * EXT4_INODES_PER_GROUP(sb);
/* This is the optimal IO size (for stat), not the fs block size */
inode->i_blocks = 0;
@@ -1004,11 +985,7 @@ got:
ei->i_dir_start_lookup = 0;
ei->i_disksize = 0;
- /*
- * Don't inherit extent flag from directory, amongst others. We set
- * extent flag on newly created directory and file only if -o extent
- * mount option is specified
- */
+ /* Don't inherit extent flag from directory, amongst others. */
ei->i_flags =
ext4_mask_flags(mode, EXT4_I(dir)->i_flags & EXT4_FL_INHERITED);
ei->i_file_acl = 0;
@@ -1020,20 +997,41 @@ got:
if (IS_DIRSYNC(inode))
ext4_handle_sync(handle);
if (insert_inode_locked(inode) < 0) {
- err = -EINVAL;
- goto fail_drop;
+ /*
+ * Likely a bitmap corruption causing inode to be allocated
+ * twice.
+ */
+ err = -EIO;
+ ext4_error(sb, "failed to insert inode %lu: doubly allocated?",
+ inode->i_ino);
+ goto out;
}
spin_lock(&sbi->s_next_gen_lock);
inode->i_generation = sbi->s_next_generation++;
spin_unlock(&sbi->s_next_gen_lock);
- ei->i_state_flags = 0;
+ /* Precompute checksum seed for inode metadata */
+ if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
+ EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) {
+ __u32 csum;
+ __le32 inum = cpu_to_le32(inode->i_ino);
+ __le32 gen = cpu_to_le32(inode->i_generation);
+ csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&inum,
+ sizeof(inum));
+ ei->i_csum_seed = ext4_chksum(sbi, csum, (__u8 *)&gen,
+ sizeof(gen));
+ }
+
+ ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */
ext4_set_inode_state(inode, EXT4_STATE_NEW);
ei->i_extra_isize = EXT4_SB(sb)->s_want_extra_isize;
+ ei->i_inline_off = 0;
+ if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_INLINE_DATA))
+ ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
+
ret = inode;
- dquot_initialize(inode);
err = dquot_alloc_inode(inode);
if (err)
goto fail_drop;
@@ -1042,7 +1040,7 @@ got:
if (err)
goto fail_free_drop;
- err = ext4_init_security(handle, inode, dir);
+ err = ext4_init_security(handle, inode, dir, qstr);
if (err)
goto fail_free_drop;
@@ -1054,6 +1052,11 @@ got:
}
}
+ if (ext4_handle_valid(handle)) {
+ ei->i_sync_tid = handle->h_transaction->t_tid;
+ ei->i_datasync_tid = handle->h_transaction->t_tid;
+ }
+
err = ext4_mark_inode_dirty(handle, inode);
if (err) {
ext4_std_error(sb, err);
@@ -1062,24 +1065,17 @@ got:
ext4_debug("allocating inode %lu\n", inode->i_ino);
trace_ext4_allocate_inode(inode, dir, mode);
- goto really_out;
-fail:
- ext4_std_error(sb, err);
-out:
- iput(inode);
- ret = ERR_PTR(err);
-really_out:
brelse(inode_bitmap_bh);
return ret;
fail_free_drop:
dquot_free_inode(inode);
-
fail_drop:
+ clear_nlink(inode);
+ unlock_new_inode(inode);
+out:
dquot_drop(inode);
inode->i_flags |= S_NOQUOTA;
- inode->i_nlink = 0;
- unlock_new_inode(inode);
iput(inode);
brelse(inode_bitmap_bh);
return ERR_PTR(err);
@@ -1138,17 +1134,17 @@ iget_failed:
inode = NULL;
bad_orphan:
ext4_warning(sb, "bad orphan inode %lu! e2fsck was run?", ino);
- printk(KERN_NOTICE "ext4_test_bit(bit=%d, block=%llu) = %d\n",
+ printk(KERN_WARNING "ext4_test_bit(bit=%d, block=%llu) = %d\n",
bit, (unsigned long long)bitmap_bh->b_blocknr,
ext4_test_bit(bit, bitmap_bh->b_data));
- printk(KERN_NOTICE "inode=%p\n", inode);
+ printk(KERN_WARNING "inode=%p\n", inode);
if (inode) {
- printk(KERN_NOTICE "is_bad_inode(inode)=%d\n",
+ printk(KERN_WARNING "is_bad_inode(inode)=%d\n",
is_bad_inode(inode));
- printk(KERN_NOTICE "NEXT_ORPHAN(inode)=%u\n",
+ printk(KERN_WARNING "NEXT_ORPHAN(inode)=%u\n",
NEXT_ORPHAN(inode));
- printk(KERN_NOTICE "max_ino=%lu\n", max_ino);
- printk(KERN_NOTICE "i_nlink=%u\n", inode->i_nlink);
+ printk(KERN_WARNING "max_ino=%lu\n", max_ino);
+ printk(KERN_WARNING "i_nlink=%u\n", inode->i_nlink);
/* Avoid freeing blocks if we got a bad deleted inode */
if (inode->i_nlink == 0)
inode->i_blocks = 0;
@@ -1183,7 +1179,8 @@ unsigned long ext4_count_free_inodes(struct super_block *sb)
if (!bitmap_bh)
continue;
- x = ext4_count_free(bitmap_bh, EXT4_INODES_PER_GROUP(sb) / 8);
+ x = ext4_count_free(bitmap_bh->b_data,
+ EXT4_INODES_PER_GROUP(sb) / 8);
printk(KERN_DEBUG "group %lu: stored = %d, counted = %lu\n",
(unsigned long) i, ext4_free_inodes_count(sb, gdp), x);
bitmap_count += x;
@@ -1227,9 +1224,9 @@ unsigned long ext4_count_dirs(struct super_block * sb)
* where it is called from on active part of filesystem is ext4lazyinit
* thread, so we do not need any special locks, however we have to prevent
* inode allocation from the current group, so we take alloc_sem lock, to
- * block ext4_claim_inode until we are finished.
+ * block ext4_new_inode() until we are finished.
*/
-extern int ext4_init_inode_table(struct super_block *sb, ext4_group_t group,
+int ext4_init_inode_table(struct super_block *sb, ext4_group_t group,
int barrier)
{
struct ext4_group_info *grp = ext4_get_group_info(sb, group);
@@ -1257,7 +1254,7 @@ extern int ext4_init_inode_table(struct super_block *sb, ext4_group_t group,
if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED))
goto out;
- handle = ext4_journal_start_sb(sb, 1);
+ handle = ext4_journal_start_sb(sb, EXT4_HT_MISC, 1);
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
goto out;
@@ -1275,13 +1272,13 @@ extern int ext4_init_inode_table(struct super_block *sb, ext4_group_t group,
sbi->s_inodes_per_block);
if ((used_blks < 0) || (used_blks > sbi->s_itb_per_group)) {
- ext4_error(sb, "Something is wrong with group %u\n"
- "Used itable blocks: %d"
- "itable unused count: %u\n",
+ ext4_error(sb, "Something is wrong with group %u: "
+ "used itable blocks: %d; "
+ "itable unused count: %u",
group, used_blks,
ext4_itable_unused_count(sb, gdp));
ret = 1;
- goto out;
+ goto err_out;
}
blk = ext4_inode_table(sb, gdp) + used_blks;
@@ -1312,7 +1309,7 @@ extern int ext4_init_inode_table(struct super_block *sb, ext4_group_t group,
skip_zeroout:
ext4_lock_group(sb, group);
gdp->bg_flags |= cpu_to_le16(EXT4_BG_INODE_ZEROED);
- gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp);
+ ext4_group_desc_csum_set(sb, group, gdp);
ext4_unlock_group(sb, group);
BUFFER_TRACE(group_desc_bh,
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
new file mode 100644
index 00000000000..fd69da19482
--- /dev/null
+++ b/fs/ext4/indirect.c
@@ -0,0 +1,1391 @@
+/*
+ * linux/fs/ext4/indirect.c
+ *
+ * from
+ *
+ * linux/fs/ext4/inode.c
+ *
+ * Copyright (C) 1992, 1993, 1994, 1995
+ * Remy Card (card@masi.ibp.fr)
+ * Laboratoire MASI - Institut Blaise Pascal
+ * Universite Pierre et Marie Curie (Paris VI)
+ *
+ * from
+ *
+ * linux/fs/minix/inode.c
+ *
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ *
+ * Goal-directed block allocation by Stephen Tweedie
+ * (sct@redhat.com), 1993, 1998
+ */
+
+#include <linux/aio.h>
+#include "ext4_jbd2.h"
+#include "truncate.h"
+
+#include <trace/events/ext4.h>
+
+typedef struct {
+ __le32 *p;
+ __le32 key;
+ struct buffer_head *bh;
+} Indirect;
+
+static inline void add_chain(Indirect *p, struct buffer_head *bh, __le32 *v)
+{
+ p->key = *(p->p = v);
+ p->bh = bh;
+}
+
+/**
+ * ext4_block_to_path - parse the block number into array of offsets
+ * @inode: inode in question (we are only interested in its superblock)
+ * @i_block: block number to be parsed
+ * @offsets: array to store the offsets in
+ * @boundary: set this non-zero if the referred-to block is likely to be
+ * followed (on disk) by an indirect block.
+ *
+ * To store the locations of file's data ext4 uses a data structure common
+ * for UNIX filesystems - tree of pointers anchored in the inode, with
+ * data blocks at leaves and indirect blocks in intermediate nodes.
+ * This function translates the block number into path in that tree -
+ * return value is the path length and @offsets[n] is the offset of
+ * pointer to (n+1)th node in the nth one. If @block is out of range
+ * (negative or too large) warning is printed and zero returned.
+ *
+ * Note: function doesn't find node addresses, so no IO is needed. All
+ * we need to know is the capacity of indirect blocks (taken from the
+ * inode->i_sb).
+ */
+
+/*
+ * Portability note: the last comparison (check that we fit into triple
+ * indirect block) is spelled differently, because otherwise on an
+ * architecture with 32-bit longs and 8Kb pages we might get into trouble
+ * if our filesystem had 8Kb blocks. We might use long long, but that would
+ * kill us on x86. Oh, well, at least the sign propagation does not matter -
+ * i_block would have to be negative in the very beginning, so we would not
+ * get there at all.
+ */
+
+static int ext4_block_to_path(struct inode *inode,
+ ext4_lblk_t i_block,
+ ext4_lblk_t offsets[4], int *boundary)
+{
+ int ptrs = EXT4_ADDR_PER_BLOCK(inode->i_sb);
+ int ptrs_bits = EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb);
+ const long direct_blocks = EXT4_NDIR_BLOCKS,
+ indirect_blocks = ptrs,
+ double_blocks = (1 << (ptrs_bits * 2));
+ int n = 0;
+ int final = 0;
+
+ if (i_block < direct_blocks) {
+ offsets[n++] = i_block;
+ final = direct_blocks;
+ } else if ((i_block -= direct_blocks) < indirect_blocks) {
+ offsets[n++] = EXT4_IND_BLOCK;
+ offsets[n++] = i_block;
+ final = ptrs;
+ } else if ((i_block -= indirect_blocks) < double_blocks) {
+ offsets[n++] = EXT4_DIND_BLOCK;
+ offsets[n++] = i_block >> ptrs_bits;
+ offsets[n++] = i_block & (ptrs - 1);
+ final = ptrs;
+ } else if (((i_block -= double_blocks) >> (ptrs_bits * 2)) < ptrs) {
+ offsets[n++] = EXT4_TIND_BLOCK;
+ offsets[n++] = i_block >> (ptrs_bits * 2);
+ offsets[n++] = (i_block >> ptrs_bits) & (ptrs - 1);
+ offsets[n++] = i_block & (ptrs - 1);
+ final = ptrs;
+ } else {
+ ext4_warning(inode->i_sb, "block %lu > max in inode %lu",
+ i_block + direct_blocks +
+ indirect_blocks + double_blocks, inode->i_ino);
+ }
+ if (boundary)
+ *boundary = final - 1 - (i_block & (ptrs - 1));
+ return n;
+}
+
+/**
+ * ext4_get_branch - read the chain of indirect blocks leading to data
+ * @inode: inode in question
+ * @depth: depth of the chain (1 - direct pointer, etc.)
+ * @offsets: offsets of pointers in inode/indirect blocks
+ * @chain: place to store the result
+ * @err: here we store the error value
+ *
+ * Function fills the array of triples <key, p, bh> and returns %NULL
+ * if everything went OK or the pointer to the last filled triple
+ * (incomplete one) otherwise. Upon the return chain[i].key contains
+ * the number of (i+1)-th block in the chain (as it is stored in memory,
+ * i.e. little-endian 32-bit), chain[i].p contains the address of that
+ * number (it points into struct inode for i==0 and into the bh->b_data
+ * for i>0) and chain[i].bh points to the buffer_head of i-th indirect
+ * block for i>0 and NULL for i==0. In other words, it holds the block
+ * numbers of the chain, addresses they were taken from (and where we can
+ * verify that chain did not change) and buffer_heads hosting these
+ * numbers.
+ *
+ * Function stops when it stumbles upon zero pointer (absent block)
+ * (pointer to last triple returned, *@err == 0)
+ * or when it gets an IO error reading an indirect block
+ * (ditto, *@err == -EIO)
+ * or when it reads all @depth-1 indirect blocks successfully and finds
+ * the whole chain, all way to the data (returns %NULL, *err == 0).
+ *
+ * Need to be called with
+ * down_read(&EXT4_I(inode)->i_data_sem)
+ */
+static Indirect *ext4_get_branch(struct inode *inode, int depth,
+ ext4_lblk_t *offsets,
+ Indirect chain[4], int *err)
+{
+ struct super_block *sb = inode->i_sb;
+ Indirect *p = chain;
+ struct buffer_head *bh;
+ int ret = -EIO;
+
+ *err = 0;
+ /* i_data is not going away, no lock needed */
+ add_chain(chain, NULL, EXT4_I(inode)->i_data + *offsets);
+ if (!p->key)
+ goto no_block;
+ while (--depth) {
+ bh = sb_getblk(sb, le32_to_cpu(p->key));
+ if (unlikely(!bh)) {
+ ret = -ENOMEM;
+ goto failure;
+ }
+
+ if (!bh_uptodate_or_lock(bh)) {
+ if (bh_submit_read(bh) < 0) {
+ put_bh(bh);
+ goto failure;
+ }
+ /* validate block references */
+ if (ext4_check_indirect_blockref(inode, bh)) {
+ put_bh(bh);
+ goto failure;
+ }
+ }
+
+ add_chain(++p, bh, (__le32 *)bh->b_data + *++offsets);
+ /* Reader: end */
+ if (!p->key)
+ goto no_block;
+ }
+ return NULL;
+
+failure:
+ *err = ret;
+no_block:
+ return p;
+}
+
+/**
+ * ext4_find_near - find a place for allocation with sufficient locality
+ * @inode: owner
+ * @ind: descriptor of indirect block.
+ *
+ * This function returns the preferred place for block allocation.
+ * It is used when heuristic for sequential allocation fails.
+ * Rules are:
+ * + if there is a block to the left of our position - allocate near it.
+ * + if pointer will live in indirect block - allocate near that block.
+ * + if pointer will live in inode - allocate in the same
+ * cylinder group.
+ *
+ * In the latter case we colour the starting block by the callers PID to
+ * prevent it from clashing with concurrent allocations for a different inode
+ * in the same block group. The PID is used here so that functionally related
+ * files will be close-by on-disk.
+ *
+ * Caller must make sure that @ind is valid and will stay that way.
+ */
+static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind)
+{
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ __le32 *start = ind->bh ? (__le32 *) ind->bh->b_data : ei->i_data;
+ __le32 *p;
+
+ /* Try to find previous block */
+ for (p = ind->p - 1; p >= start; p--) {
+ if (*p)
+ return le32_to_cpu(*p);
+ }
+
+ /* No such thing, so let's try location of indirect block */
+ if (ind->bh)
+ return ind->bh->b_blocknr;
+
+ /*
+ * It is going to be referred to from the inode itself? OK, just put it
+ * into the same cylinder group then.
+ */
+ return ext4_inode_to_goal_block(inode);
+}
+
+/**
+ * ext4_find_goal - find a preferred place for allocation.
+ * @inode: owner
+ * @block: block we want
+ * @partial: pointer to the last triple within a chain
+ *
+ * Normally this function find the preferred place for block allocation,
+ * returns it.
+ * Because this is only used for non-extent files, we limit the block nr
+ * to 32 bits.
+ */
+static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block,
+ Indirect *partial)
+{
+ ext4_fsblk_t goal;
+
+ /*
+ * XXX need to get goal block from mballoc's data structures
+ */
+
+ goal = ext4_find_near(inode, partial);
+ goal = goal & EXT4_MAX_BLOCK_FILE_PHYS;
+ return goal;
+}
+
+/**
+ * ext4_blks_to_allocate - Look up the block map and count the number
+ * of direct blocks need to be allocated for the given branch.
+ *
+ * @branch: chain of indirect blocks
+ * @k: number of blocks need for indirect blocks
+ * @blks: number of data blocks to be mapped.
+ * @blocks_to_boundary: the offset in the indirect block
+ *
+ * return the total number of blocks to be allocate, including the
+ * direct and indirect blocks.
+ */
+static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned int blks,
+ int blocks_to_boundary)
+{
+ unsigned int count = 0;
+
+ /*
+ * Simple case, [t,d]Indirect block(s) has not allocated yet
+ * then it's clear blocks on that path have not allocated
+ */
+ if (k > 0) {
+ /* right now we don't handle cross boundary allocation */
+ if (blks < blocks_to_boundary + 1)
+ count += blks;
+ else
+ count += blocks_to_boundary + 1;
+ return count;
+ }
+
+ count++;
+ while (count < blks && count <= blocks_to_boundary &&
+ le32_to_cpu(*(branch[0].p + count)) == 0) {
+ count++;
+ }
+ return count;
+}
+
+/**
+ * ext4_alloc_branch - allocate and set up a chain of blocks.
+ * @handle: handle for this transaction
+ * @inode: owner
+ * @indirect_blks: number of allocated indirect blocks
+ * @blks: number of allocated direct blocks
+ * @goal: preferred place for allocation
+ * @offsets: offsets (in the blocks) to store the pointers to next.
+ * @branch: place to store the chain in.
+ *
+ * This function allocates blocks, zeroes out all but the last one,
+ * links them into chain and (if we are synchronous) writes them to disk.
+ * In other words, it prepares a branch that can be spliced onto the
+ * inode. It stores the information about that chain in the branch[], in
+ * the same format as ext4_get_branch() would do. We are calling it after
+ * we had read the existing part of chain and partial points to the last
+ * triple of that (one with zero ->key). Upon the exit we have the same
+ * picture as after the successful ext4_get_block(), except that in one
+ * place chain is disconnected - *branch->p is still zero (we did not
+ * set the last link), but branch->key contains the number that should
+ * be placed into *branch->p to fill that gap.
+ *
+ * If allocation fails we free all blocks we've allocated (and forget
+ * their buffer_heads) and return the error value the from failed
+ * ext4_alloc_block() (normally -ENOSPC). Otherwise we set the chain
+ * as described above and return 0.
+ */
+static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
+ ext4_lblk_t iblock, int indirect_blks,
+ int *blks, ext4_fsblk_t goal,
+ ext4_lblk_t *offsets, Indirect *branch)
+{
+ struct ext4_allocation_request ar;
+ struct buffer_head * bh;
+ ext4_fsblk_t b, new_blocks[4];
+ __le32 *p;
+ int i, j, err, len = 1;
+
+ /*
+ * Set up for the direct block allocation
+ */
+ memset(&ar, 0, sizeof(ar));
+ ar.inode = inode;
+ ar.len = *blks;
+ ar.logical = iblock;
+ if (S_ISREG(inode->i_mode))
+ ar.flags = EXT4_MB_HINT_DATA;
+
+ for (i = 0; i <= indirect_blks; i++) {
+ if (i == indirect_blks) {
+ ar.goal = goal;
+ new_blocks[i] = ext4_mb_new_blocks(handle, &ar, &err);
+ } else
+ goal = new_blocks[i] = ext4_new_meta_blocks(handle, inode,
+ goal, 0, NULL, &err);
+ if (err) {
+ i--;
+ goto failed;
+ }
+ branch[i].key = cpu_to_le32(new_blocks[i]);
+ if (i == 0)
+ continue;
+
+ bh = branch[i].bh = sb_getblk(inode->i_sb, new_blocks[i-1]);
+ if (unlikely(!bh)) {
+ err = -ENOMEM;
+ goto failed;
+ }
+ lock_buffer(bh);
+ BUFFER_TRACE(bh, "call get_create_access");
+ err = ext4_journal_get_create_access(handle, bh);
+ if (err) {
+ unlock_buffer(bh);
+ goto failed;
+ }
+
+ memset(bh->b_data, 0, bh->b_size);
+ p = branch[i].p = (__le32 *) bh->b_data + offsets[i];
+ b = new_blocks[i];
+
+ if (i == indirect_blks)
+ len = ar.len;
+ for (j = 0; j < len; j++)
+ *p++ = cpu_to_le32(b++);
+
+ BUFFER_TRACE(bh, "marking uptodate");
+ set_buffer_uptodate(bh);
+ unlock_buffer(bh);
+
+ BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
+ err = ext4_handle_dirty_metadata(handle, inode, bh);
+ if (err)
+ goto failed;
+ }
+ *blks = ar.len;
+ return 0;
+failed:
+ for (; i >= 0; i--) {
+ /*
+ * We want to ext4_forget() only freshly allocated indirect
+ * blocks. Buffer for new_blocks[i-1] is at branch[i].bh and
+ * buffer at branch[0].bh is indirect block / inode already
+ * existing before ext4_alloc_branch() was called.
+ */
+ if (i > 0 && i != indirect_blks && branch[i].bh)
+ ext4_forget(handle, 1, inode, branch[i].bh,
+ branch[i].bh->b_blocknr);
+ ext4_free_blocks(handle, inode, NULL, new_blocks[i],
+ (i == indirect_blks) ? ar.len : 1, 0);
+ }
+ return err;
+}
+
+/**
+ * ext4_splice_branch - splice the allocated branch onto inode.
+ * @handle: handle for this transaction
+ * @inode: owner
+ * @block: (logical) number of block we are adding
+ * @chain: chain of indirect blocks (with a missing link - see
+ * ext4_alloc_branch)
+ * @where: location of missing link
+ * @num: number of indirect blocks we are adding
+ * @blks: number of direct blocks we are adding
+ *
+ * This function fills the missing link and does all housekeeping needed in
+ * inode (->i_blocks, etc.). In case of success we end up with the full
+ * chain to new block and return 0.
+ */
+static int ext4_splice_branch(handle_t *handle, struct inode *inode,
+ ext4_lblk_t block, Indirect *where, int num,
+ int blks)
+{
+ int i;
+ int err = 0;
+ ext4_fsblk_t current_block;
+
+ /*
+ * If we're splicing into a [td]indirect block (as opposed to the
+ * inode) then we need to get write access to the [td]indirect block
+ * before the splice.
+ */
+ if (where->bh) {
+ BUFFER_TRACE(where->bh, "get_write_access");
+ err = ext4_journal_get_write_access(handle, where->bh);
+ if (err)
+ goto err_out;
+ }
+ /* That's it */
+
+ *where->p = where->key;
+
+ /*
+ * Update the host buffer_head or inode to point to more just allocated
+ * direct blocks blocks
+ */
+ if (num == 0 && blks > 1) {
+ current_block = le32_to_cpu(where->key) + 1;
+ for (i = 1; i < blks; i++)
+ *(where->p + i) = cpu_to_le32(current_block++);
+ }
+
+ /* We are done with atomic stuff, now do the rest of housekeeping */
+ /* had we spliced it onto indirect block? */
+ if (where->bh) {
+ /*
+ * If we spliced it onto an indirect block, we haven't
+ * altered the inode. Note however that if it is being spliced
+ * onto an indirect block at the very end of the file (the
+ * file is growing) then we *will* alter the inode to reflect
+ * the new i_size. But that is not done here - it is done in
+ * generic_commit_write->__mark_inode_dirty->ext4_dirty_inode.
+ */
+ jbd_debug(5, "splicing indirect only\n");
+ BUFFER_TRACE(where->bh, "call ext4_handle_dirty_metadata");
+ err = ext4_handle_dirty_metadata(handle, inode, where->bh);
+ if (err)
+ goto err_out;
+ } else {
+ /*
+ * OK, we spliced it into the inode itself on a direct block.
+ */
+ ext4_mark_inode_dirty(handle, inode);
+ jbd_debug(5, "splicing direct\n");
+ }
+ return err;
+
+err_out:
+ for (i = 1; i <= num; i++) {
+ /*
+ * branch[i].bh is newly allocated, so there is no
+ * need to revoke the block, which is why we don't
+ * need to set EXT4_FREE_BLOCKS_METADATA.
+ */
+ ext4_free_blocks(handle, inode, where[i].bh, 0, 1,
+ EXT4_FREE_BLOCKS_FORGET);
+ }
+ ext4_free_blocks(handle, inode, NULL, le32_to_cpu(where[num].key),
+ blks, 0);
+
+ return err;
+}
+
+/*
+ * The ext4_ind_map_blocks() function handles non-extents inodes
+ * (i.e., using the traditional indirect/double-indirect i_blocks
+ * scheme) for ext4_map_blocks().
+ *
+ * Allocation strategy is simple: if we have to allocate something, we will
+ * have to go the whole way to leaf. So let's do it before attaching anything
+ * to tree, set linkage between the newborn blocks, write them if sync is
+ * required, recheck the path, free and repeat if check fails, otherwise
+ * set the last missing link (that will protect us from any truncate-generated
+ * removals - all blocks on the path are immune now) and possibly force the
+ * write on the parent block.
+ * That has a nice additional property: no special recovery from the failed
+ * allocations is needed - we simply release blocks and do not touch anything
+ * reachable from inode.
+ *
+ * `handle' can be NULL if create == 0.
+ *
+ * return > 0, # of blocks mapped or allocated.
+ * return = 0, if plain lookup failed.
+ * return < 0, error case.
+ *
+ * The ext4_ind_get_blocks() function should be called with
+ * down_write(&EXT4_I(inode)->i_data_sem) if allocating filesystem
+ * blocks (i.e., flags has EXT4_GET_BLOCKS_CREATE set) or
+ * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system
+ * blocks.
+ */
+int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
+ struct ext4_map_blocks *map,
+ int flags)
+{
+ int err = -EIO;
+ ext4_lblk_t offsets[4];
+ Indirect chain[4];
+ Indirect *partial;
+ ext4_fsblk_t goal;
+ int indirect_blks;
+ int blocks_to_boundary = 0;
+ int depth;
+ int count = 0;
+ ext4_fsblk_t first_block = 0;
+
+ trace_ext4_ind_map_blocks_enter(inode, map->m_lblk, map->m_len, flags);
+ J_ASSERT(!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)));
+ J_ASSERT(handle != NULL || (flags & EXT4_GET_BLOCKS_CREATE) == 0);
+ depth = ext4_block_to_path(inode, map->m_lblk, offsets,
+ &blocks_to_boundary);
+
+ if (depth == 0)
+ goto out;
+
+ partial = ext4_get_branch(inode, depth, offsets, chain, &err);
+
+ /* Simplest case - block found, no allocation needed */
+ if (!partial) {
+ first_block = le32_to_cpu(chain[depth - 1].key);
+ count++;
+ /*map more blocks*/
+ while (count < map->m_len && count <= blocks_to_boundary) {
+ ext4_fsblk_t blk;
+
+ blk = le32_to_cpu(*(chain[depth-1].p + count));
+
+ if (blk == first_block + count)
+ count++;
+ else
+ break;
+ }
+ goto got_it;
+ }
+
+ /* Next simple case - plain lookup or failed read of indirect block */
+ if ((flags & EXT4_GET_BLOCKS_CREATE) == 0 || err == -EIO)
+ goto cleanup;
+
+ /*
+ * Okay, we need to do block allocation.
+ */
+ if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
+ EXT4_FEATURE_RO_COMPAT_BIGALLOC)) {
+ EXT4_ERROR_INODE(inode, "Can't allocate blocks for "
+ "non-extent mapped inodes with bigalloc");
+ return -ENOSPC;
+ }
+
+ goal = ext4_find_goal(inode, map->m_lblk, partial);
+
+ /* the number of blocks need to allocate for [d,t]indirect blocks */
+ indirect_blks = (chain + depth) - partial - 1;
+
+ /*
+ * Next look up the indirect map to count the totoal number of
+ * direct blocks to allocate for this branch.
+ */
+ count = ext4_blks_to_allocate(partial, indirect_blks,
+ map->m_len, blocks_to_boundary);
+ /*
+ * Block out ext4_truncate while we alter the tree
+ */
+ err = ext4_alloc_branch(handle, inode, map->m_lblk, indirect_blks,
+ &count, goal,
+ offsets + (partial - chain), partial);
+
+ /*
+ * The ext4_splice_branch call will free and forget any buffers
+ * on the new chain if there is a failure, but that risks using
+ * up transaction credits, especially for bitmaps where the
+ * credits cannot be returned. Can we handle this somehow? We
+ * may need to return -EAGAIN upwards in the worst case. --sct
+ */
+ if (!err)
+ err = ext4_splice_branch(handle, inode, map->m_lblk,
+ partial, indirect_blks, count);
+ if (err)
+ goto cleanup;
+
+ map->m_flags |= EXT4_MAP_NEW;
+
+ ext4_update_inode_fsync_trans(handle, inode, 1);
+got_it:
+ map->m_flags |= EXT4_MAP_MAPPED;
+ map->m_pblk = le32_to_cpu(chain[depth-1].key);
+ map->m_len = count;
+ if (count > blocks_to_boundary)
+ map->m_flags |= EXT4_MAP_BOUNDARY;
+ err = count;
+ /* Clean up and exit */
+ partial = chain + depth - 1; /* the whole chain */
+cleanup:
+ while (partial > chain) {
+ BUFFER_TRACE(partial->bh, "call brelse");
+ brelse(partial->bh);
+ partial--;
+ }
+out:
+ trace_ext4_ind_map_blocks_exit(inode, flags, map, err);
+ return err;
+}
+
+/*
+ * O_DIRECT for ext3 (or indirect map) based files
+ *
+ * If the O_DIRECT write will extend the file then add this inode to the
+ * orphan list. So recovery will truncate it back to the original size
+ * if the machine crashes during the write.
+ *
+ * If the O_DIRECT write is intantiating holes inside i_size and the machine
+ * crashes then stale disk data _may_ be exposed inside the file. But current
+ * VFS code falls back into buffered path in that case so we are safe.
+ */
+ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,
+ struct iov_iter *iter, loff_t offset)
+{
+ struct file *file = iocb->ki_filp;
+ struct inode *inode = file->f_mapping->host;
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ handle_t *handle;
+ ssize_t ret;
+ int orphan = 0;
+ size_t count = iov_iter_count(iter);
+ int retries = 0;
+
+ if (rw == WRITE) {
+ loff_t final_size = offset + count;
+
+ if (final_size > inode->i_size) {
+ /* Credits for sb + inode write */
+ handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ goto out;
+ }
+ ret = ext4_orphan_add(handle, inode);
+ if (ret) {
+ ext4_journal_stop(handle);
+ goto out;
+ }
+ orphan = 1;
+ ei->i_disksize = inode->i_size;
+ ext4_journal_stop(handle);
+ }
+ }
+
+retry:
+ if (rw == READ && ext4_should_dioread_nolock(inode)) {
+ /*
+ * Nolock dioread optimization may be dynamically disabled
+ * via ext4_inode_block_unlocked_dio(). Check inode's state
+ * while holding extra i_dio_count ref.
+ */
+ atomic_inc(&inode->i_dio_count);
+ smp_mb();
+ if (unlikely(ext4_test_inode_state(inode,
+ EXT4_STATE_DIOREAD_LOCK))) {
+ inode_dio_done(inode);
+ goto locked;
+ }
+ ret = __blockdev_direct_IO(rw, iocb, inode,
+ inode->i_sb->s_bdev, iter, offset,
+ ext4_get_block, NULL, NULL, 0);
+ inode_dio_done(inode);
+ } else {
+locked:
+ ret = blockdev_direct_IO(rw, iocb, inode, iter,
+ offset, ext4_get_block);
+
+ if (unlikely((rw & WRITE) && ret < 0)) {
+ loff_t isize = i_size_read(inode);
+ loff_t end = offset + count;
+
+ if (end > isize)
+ ext4_truncate_failed_write(inode);
+ }
+ }
+ if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
+ goto retry;
+
+ if (orphan) {
+ int err;
+
+ /* Credits for sb + inode write */
+ handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
+ if (IS_ERR(handle)) {
+ /* This is really bad luck. We've written the data
+ * but cannot extend i_size. Bail out and pretend
+ * the write failed... */
+ ret = PTR_ERR(handle);
+ if (inode->i_nlink)
+ ext4_orphan_del(NULL, inode);
+
+ goto out;
+ }
+ if (inode->i_nlink)
+ ext4_orphan_del(handle, inode);
+ if (ret > 0) {
+ loff_t end = offset + ret;
+ if (end > inode->i_size) {
+ ei->i_disksize = end;
+ i_size_write(inode, end);
+ /*
+ * We're going to return a positive `ret'
+ * here due to non-zero-length I/O, so there's
+ * no way of reporting error returns from
+ * ext4_mark_inode_dirty() to userspace. So
+ * ignore it.
+ */
+ ext4_mark_inode_dirty(handle, inode);
+ }
+ }
+ err = ext4_journal_stop(handle);
+ if (ret == 0)
+ ret = err;
+ }
+out:
+ return ret;
+}
+
+/*
+ * Calculate the number of metadata blocks need to reserve
+ * to allocate a new block at @lblocks for non extent file based file
+ */
+int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock)
+{
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ sector_t dind_mask = ~((sector_t)EXT4_ADDR_PER_BLOCK(inode->i_sb) - 1);
+ int blk_bits;
+
+ if (lblock < EXT4_NDIR_BLOCKS)
+ return 0;
+
+ lblock -= EXT4_NDIR_BLOCKS;
+
+ if (ei->i_da_metadata_calc_len &&
+ (lblock & dind_mask) == ei->i_da_metadata_calc_last_lblock) {
+ ei->i_da_metadata_calc_len++;
+ return 0;
+ }
+ ei->i_da_metadata_calc_last_lblock = lblock & dind_mask;
+ ei->i_da_metadata_calc_len = 1;
+ blk_bits = order_base_2(lblock);
+ return (blk_bits / EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb)) + 1;
+}
+
+/*
+ * Calculate number of indirect blocks touched by mapping @nrblocks logically
+ * contiguous blocks
+ */
+int ext4_ind_trans_blocks(struct inode *inode, int nrblocks)
+{
+ /*
+ * With N contiguous data blocks, we need at most
+ * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) + 1 indirect blocks,
+ * 2 dindirect blocks, and 1 tindirect block
+ */
+ return DIV_ROUND_UP(nrblocks, EXT4_ADDR_PER_BLOCK(inode->i_sb)) + 4;
+}
+
+/*
+ * Truncate transactions can be complex and absolutely huge. So we need to
+ * be able to restart the transaction at a conventient checkpoint to make
+ * sure we don't overflow the journal.
+ *
+ * Try to extend this transaction for the purposes of truncation. If
+ * extend fails, we need to propagate the failure up and restart the
+ * transaction in the top-level truncate loop. --sct
+ *
+ * Returns 0 if we managed to create more room. If we can't create more
+ * room, and the transaction must be restarted we return 1.
+ */
+static int try_to_extend_transaction(handle_t *handle, struct inode *inode)
+{
+ if (!ext4_handle_valid(handle))
+ return 0;
+ if (ext4_handle_has_enough_credits(handle, EXT4_RESERVE_TRANS_BLOCKS+1))
+ return 0;
+ if (!ext4_journal_extend(handle, ext4_blocks_for_truncate(inode)))
+ return 0;
+ return 1;
+}
+
+/*
+ * Probably it should be a library function... search for first non-zero word
+ * or memcmp with zero_page, whatever is better for particular architecture.
+ * Linus?
+ */
+static inline int all_zeroes(__le32 *p, __le32 *q)
+{
+ while (p < q)
+ if (*p++)
+ return 0;
+ return 1;
+}
+
+/**
+ * ext4_find_shared - find the indirect blocks for partial truncation.
+ * @inode: inode in question
+ * @depth: depth of the affected branch
+ * @offsets: offsets of pointers in that branch (see ext4_block_to_path)
+ * @chain: place to store the pointers to partial indirect blocks
+ * @top: place to the (detached) top of branch
+ *
+ * This is a helper function used by ext4_truncate().
+ *
+ * When we do truncate() we may have to clean the ends of several
+ * indirect blocks but leave the blocks themselves alive. Block is
+ * partially truncated if some data below the new i_size is referred
+ * from it (and it is on the path to the first completely truncated
+ * data block, indeed). We have to free the top of that path along
+ * with everything to the right of the path. Since no allocation
+ * past the truncation point is possible until ext4_truncate()
+ * finishes, we may safely do the latter, but top of branch may
+ * require special attention - pageout below the truncation point
+ * might try to populate it.
+ *
+ * We atomically detach the top of branch from the tree, store the
+ * block number of its root in *@top, pointers to buffer_heads of
+ * partially truncated blocks - in @chain[].bh and pointers to
+ * their last elements that should not be removed - in
+ * @chain[].p. Return value is the pointer to last filled element
+ * of @chain.
+ *
+ * The work left to caller to do the actual freeing of subtrees:
+ * a) free the subtree starting from *@top
+ * b) free the subtrees whose roots are stored in
+ * (@chain[i].p+1 .. end of @chain[i].bh->b_data)
+ * c) free the subtrees growing from the inode past the @chain[0].
+ * (no partially truncated stuff there). */
+
+static Indirect *ext4_find_shared(struct inode *inode, int depth,
+ ext4_lblk_t offsets[4], Indirect chain[4],
+ __le32 *top)
+{
+ Indirect *partial, *p;
+ int k, err;
+
+ *top = 0;
+ /* Make k index the deepest non-null offset + 1 */
+ for (k = depth; k > 1 && !offsets[k-1]; k--)
+ ;
+ partial = ext4_get_branch(inode, k, offsets, chain, &err);
+ /* Writer: pointers */
+ if (!partial)
+ partial = chain + k-1;
+ /*
+ * If the branch acquired continuation since we've looked at it -
+ * fine, it should all survive and (new) top doesn't belong to us.
+ */
+ if (!partial->key && *partial->p)
+ /* Writer: end */
+ goto no_top;
+ for (p = partial; (p > chain) && all_zeroes((__le32 *) p->bh->b_data, p->p); p--)
+ ;
+ /*
+ * OK, we've found the last block that must survive. The rest of our
+ * branch should be detached before unlocking. However, if that rest
+ * of branch is all ours and does not grow immediately from the inode
+ * it's easier to cheat and just decrement partial->p.
+ */
+ if (p == chain + k - 1 && p > chain) {
+ p->p--;
+ } else {
+ *top = *p->p;
+ /* Nope, don't do this in ext4. Must leave the tree intact */
+#if 0
+ *p->p = 0;
+#endif
+ }
+ /* Writer: end */
+
+ while (partial > p) {
+ brelse(partial->bh);
+ partial--;
+ }
+no_top:
+ return partial;
+}
+
+/*
+ * Zero a number of block pointers in either an inode or an indirect block.
+ * If we restart the transaction we must again get write access to the
+ * indirect block for further modification.
+ *
+ * We release `count' blocks on disk, but (last - first) may be greater
+ * than `count' because there can be holes in there.
+ *
+ * Return 0 on success, 1 on invalid block range
+ * and < 0 on fatal error.
+ */
+static int ext4_clear_blocks(handle_t *handle, struct inode *inode,
+ struct buffer_head *bh,
+ ext4_fsblk_t block_to_free,
+ unsigned long count, __le32 *first,
+ __le32 *last)
+{
+ __le32 *p;
+ int flags = EXT4_FREE_BLOCKS_VALIDATED;
+ int err;
+
+ if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
+ flags |= EXT4_FREE_BLOCKS_FORGET | EXT4_FREE_BLOCKS_METADATA;
+ else if (ext4_should_journal_data(inode))
+ flags |= EXT4_FREE_BLOCKS_FORGET;
+
+ if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), block_to_free,
+ count)) {
+ EXT4_ERROR_INODE(inode, "attempt to clear invalid "
+ "blocks %llu len %lu",
+ (unsigned long long) block_to_free, count);
+ return 1;
+ }
+
+ if (try_to_extend_transaction(handle, inode)) {
+ if (bh) {
+ BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
+ err = ext4_handle_dirty_metadata(handle, inode, bh);
+ if (unlikely(err))
+ goto out_err;
+ }
+ err = ext4_mark_inode_dirty(handle, inode);
+ if (unlikely(err))
+ goto out_err;
+ err = ext4_truncate_restart_trans(handle, inode,
+ ext4_blocks_for_truncate(inode));
+ if (unlikely(err))
+ goto out_err;
+ if (bh) {
+ BUFFER_TRACE(bh, "retaking write access");
+ err = ext4_journal_get_write_access(handle, bh);
+ if (unlikely(err))
+ goto out_err;
+ }
+ }
+
+ for (p = first; p < last; p++)
+ *p = 0;
+
+ ext4_free_blocks(handle, inode, NULL, block_to_free, count, flags);
+ return 0;
+out_err:
+ ext4_std_error(inode->i_sb, err);
+ return err;
+}
+
+/**
+ * ext4_free_data - free a list of data blocks
+ * @handle: handle for this transaction
+ * @inode: inode we are dealing with
+ * @this_bh: indirect buffer_head which contains *@first and *@last
+ * @first: array of block numbers
+ * @last: points immediately past the end of array
+ *
+ * We are freeing all blocks referred from that array (numbers are stored as
+ * little-endian 32-bit) and updating @inode->i_blocks appropriately.
+ *
+ * We accumulate contiguous runs of blocks to free. Conveniently, if these
+ * blocks are contiguous then releasing them at one time will only affect one
+ * or two bitmap blocks (+ group descriptor(s) and superblock) and we won't
+ * actually use a lot of journal space.
+ *
+ * @this_bh will be %NULL if @first and @last point into the inode's direct
+ * block pointers.
+ */
+static void ext4_free_data(handle_t *handle, struct inode *inode,
+ struct buffer_head *this_bh,
+ __le32 *first, __le32 *last)
+{
+ ext4_fsblk_t block_to_free = 0; /* Starting block # of a run */
+ unsigned long count = 0; /* Number of blocks in the run */
+ __le32 *block_to_free_p = NULL; /* Pointer into inode/ind
+ corresponding to
+ block_to_free */
+ ext4_fsblk_t nr; /* Current block # */
+ __le32 *p; /* Pointer into inode/ind
+ for current block */
+ int err = 0;
+
+ if (this_bh) { /* For indirect block */
+ BUFFER_TRACE(this_bh, "get_write_access");
+ err = ext4_journal_get_write_access(handle, this_bh);
+ /* Important: if we can't update the indirect pointers
+ * to the blocks, we can't free them. */
+ if (err)
+ return;
+ }
+
+ for (p = first; p < last; p++) {
+ nr = le32_to_cpu(*p);
+ if (nr) {
+ /* accumulate blocks to free if they're contiguous */
+ if (count == 0) {
+ block_to_free = nr;
+ block_to_free_p = p;
+ count = 1;
+ } else if (nr == block_to_free + count) {
+ count++;
+ } else {
+ err = ext4_clear_blocks(handle, inode, this_bh,
+ block_to_free, count,
+ block_to_free_p, p);
+ if (err)
+ break;
+ block_to_free = nr;
+ block_to_free_p = p;
+ count = 1;
+ }
+ }
+ }
+
+ if (!err && count > 0)
+ err = ext4_clear_blocks(handle, inode, this_bh, block_to_free,
+ count, block_to_free_p, p);
+ if (err < 0)
+ /* fatal error */
+ return;
+
+ if (this_bh) {
+ BUFFER_TRACE(this_bh, "call ext4_handle_dirty_metadata");
+
+ /*
+ * The buffer head should have an attached journal head at this
+ * point. However, if the data is corrupted and an indirect
+ * block pointed to itself, it would have been detached when
+ * the block was cleared. Check for this instead of OOPSing.
+ */
+ if ((EXT4_JOURNAL(inode) == NULL) || bh2jh(this_bh))
+ ext4_handle_dirty_metadata(handle, inode, this_bh);
+ else
+ EXT4_ERROR_INODE(inode,
+ "circular indirect block detected at "
+ "block %llu",
+ (unsigned long long) this_bh->b_blocknr);
+ }
+}
+
+/**
+ * ext4_free_branches - free an array of branches
+ * @handle: JBD handle for this transaction
+ * @inode: inode we are dealing with
+ * @parent_bh: the buffer_head which contains *@first and *@last
+ * @first: array of block numbers
+ * @last: pointer immediately past the end of array
+ * @depth: depth of the branches to free
+ *
+ * We are freeing all blocks referred from these branches (numbers are
+ * stored as little-endian 32-bit) and updating @inode->i_blocks
+ * appropriately.
+ */
+static void ext4_free_branches(handle_t *handle, struct inode *inode,
+ struct buffer_head *parent_bh,
+ __le32 *first, __le32 *last, int depth)
+{
+ ext4_fsblk_t nr;
+ __le32 *p;
+
+ if (ext4_handle_is_aborted(handle))
+ return;
+
+ if (depth--) {
+ struct buffer_head *bh;
+ int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb);
+ p = last;
+ while (--p >= first) {
+ nr = le32_to_cpu(*p);
+ if (!nr)
+ continue; /* A hole */
+
+ if (!ext4_data_block_valid(EXT4_SB(inode->i_sb),
+ nr, 1)) {
+ EXT4_ERROR_INODE(inode,
+ "invalid indirect mapped "
+ "block %lu (level %d)",
+ (unsigned long) nr, depth);
+ break;
+ }
+
+ /* Go read the buffer for the next level down */
+ bh = sb_bread(inode->i_sb, nr);
+
+ /*
+ * A read failure? Report error and clear slot
+ * (should be rare).
+ */
+ if (!bh) {
+ EXT4_ERROR_INODE_BLOCK(inode, nr,
+ "Read failure");
+ continue;
+ }
+
+ /* This zaps the entire block. Bottom up. */
+ BUFFER_TRACE(bh, "free child branches");
+ ext4_free_branches(handle, inode, bh,
+ (__le32 *) bh->b_data,
+ (__le32 *) bh->b_data + addr_per_block,
+ depth);
+ brelse(bh);
+
+ /*
+ * Everything below this this pointer has been
+ * released. Now let this top-of-subtree go.
+ *
+ * We want the freeing of this indirect block to be
+ * atomic in the journal with the updating of the
+ * bitmap block which owns it. So make some room in
+ * the journal.
+ *
+ * We zero the parent pointer *after* freeing its
+ * pointee in the bitmaps, so if extend_transaction()
+ * for some reason fails to put the bitmap changes and
+ * the release into the same transaction, recovery
+ * will merely complain about releasing a free block,
+ * rather than leaking blocks.
+ */
+ if (ext4_handle_is_aborted(handle))
+ return;
+ if (try_to_extend_transaction(handle, inode)) {
+ ext4_mark_inode_dirty(handle, inode);
+ ext4_truncate_restart_trans(handle, inode,
+ ext4_blocks_for_truncate(inode));
+ }
+
+ /*
+ * The forget flag here is critical because if
+ * we are journaling (and not doing data
+ * journaling), we have to make sure a revoke
+ * record is written to prevent the journal
+ * replay from overwriting the (former)
+ * indirect block if it gets reallocated as a
+ * data block. This must happen in the same
+ * transaction where the data blocks are
+ * actually freed.
+ */
+ ext4_free_blocks(handle, inode, NULL, nr, 1,
+ EXT4_FREE_BLOCKS_METADATA|
+ EXT4_FREE_BLOCKS_FORGET);
+
+ if (parent_bh) {
+ /*
+ * The block which we have just freed is
+ * pointed to by an indirect block: journal it
+ */
+ BUFFER_TRACE(parent_bh, "get_write_access");
+ if (!ext4_journal_get_write_access(handle,
+ parent_bh)){
+ *p = 0;
+ BUFFER_TRACE(parent_bh,
+ "call ext4_handle_dirty_metadata");
+ ext4_handle_dirty_metadata(handle,
+ inode,
+ parent_bh);
+ }
+ }
+ }
+ } else {
+ /* We have reached the bottom of the tree. */
+ BUFFER_TRACE(parent_bh, "free data blocks");
+ ext4_free_data(handle, inode, parent_bh, first, last);
+ }
+}
+
+void ext4_ind_truncate(handle_t *handle, struct inode *inode)
+{
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ __le32 *i_data = ei->i_data;
+ int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb);
+ ext4_lblk_t offsets[4];
+ Indirect chain[4];
+ Indirect *partial;
+ __le32 nr = 0;
+ int n = 0;
+ ext4_lblk_t last_block, max_block;
+ unsigned blocksize = inode->i_sb->s_blocksize;
+
+ last_block = (inode->i_size + blocksize-1)
+ >> EXT4_BLOCK_SIZE_BITS(inode->i_sb);
+ max_block = (EXT4_SB(inode->i_sb)->s_bitmap_maxbytes + blocksize-1)
+ >> EXT4_BLOCK_SIZE_BITS(inode->i_sb);
+
+ if (last_block != max_block) {
+ n = ext4_block_to_path(inode, last_block, offsets, NULL);
+ if (n == 0)
+ return;
+ }
+
+ ext4_es_remove_extent(inode, last_block, EXT_MAX_BLOCKS - last_block);
+
+ /*
+ * The orphan list entry will now protect us from any crash which
+ * occurs before the truncate completes, so it is now safe to propagate
+ * the new, shorter inode size (held for now in i_size) into the
+ * on-disk inode. We do this via i_disksize, which is the value which
+ * ext4 *really* writes onto the disk inode.
+ */
+ ei->i_disksize = inode->i_size;
+
+ if (last_block == max_block) {
+ /*
+ * It is unnecessary to free any data blocks if last_block is
+ * equal to the indirect block limit.
+ */
+ return;
+ } else if (n == 1) { /* direct blocks */
+ ext4_free_data(handle, inode, NULL, i_data+offsets[0],
+ i_data + EXT4_NDIR_BLOCKS);
+ goto do_indirects;
+ }
+
+ partial = ext4_find_shared(inode, n, offsets, chain, &nr);
+ /* Kill the top of shared branch (not detached) */
+ if (nr) {
+ if (partial == chain) {
+ /* Shared branch grows from the inode */
+ ext4_free_branches(handle, inode, NULL,
+ &nr, &nr+1, (chain+n-1) - partial);
+ *partial->p = 0;
+ /*
+ * We mark the inode dirty prior to restart,
+ * and prior to stop. No need for it here.
+ */
+ } else {
+ /* Shared branch grows from an indirect block */
+ BUFFER_TRACE(partial->bh, "get_write_access");
+ ext4_free_branches(handle, inode, partial->bh,
+ partial->p,
+ partial->p+1, (chain+n-1) - partial);
+ }
+ }
+ /* Clear the ends of indirect blocks on the shared branch */
+ while (partial > chain) {
+ ext4_free_branches(handle, inode, partial->bh, partial->p + 1,
+ (__le32*)partial->bh->b_data+addr_per_block,
+ (chain+n-1) - partial);
+ BUFFER_TRACE(partial->bh, "call brelse");
+ brelse(partial->bh);
+ partial--;
+ }
+do_indirects:
+ /* Kill the remaining (whole) subtrees */
+ switch (offsets[0]) {
+ default:
+ nr = i_data[EXT4_IND_BLOCK];
+ if (nr) {
+ ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 1);
+ i_data[EXT4_IND_BLOCK] = 0;
+ }
+ case EXT4_IND_BLOCK:
+ nr = i_data[EXT4_DIND_BLOCK];
+ if (nr) {
+ ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 2);
+ i_data[EXT4_DIND_BLOCK] = 0;
+ }
+ case EXT4_DIND_BLOCK:
+ nr = i_data[EXT4_TIND_BLOCK];
+ if (nr) {
+ ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 3);
+ i_data[EXT4_TIND_BLOCK] = 0;
+ }
+ case EXT4_TIND_BLOCK:
+ ;
+ }
+}
+
+static int free_hole_blocks(handle_t *handle, struct inode *inode,
+ struct buffer_head *parent_bh, __le32 *i_data,
+ int level, ext4_lblk_t first,
+ ext4_lblk_t count, int max)
+{
+ struct buffer_head *bh = NULL;
+ int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb);
+ int ret = 0;
+ int i, inc;
+ ext4_lblk_t offset;
+ __le32 blk;
+
+ inc = 1 << ((EXT4_BLOCK_SIZE_BITS(inode->i_sb) - 2) * level);
+ for (i = 0, offset = 0; i < max; i++, i_data++, offset += inc) {
+ if (offset >= count + first)
+ break;
+ if (*i_data == 0 || (offset + inc) <= first)
+ continue;
+ blk = *i_data;
+ if (level > 0) {
+ ext4_lblk_t first2;
+ ext4_lblk_t count2;
+
+ bh = sb_bread(inode->i_sb, le32_to_cpu(blk));
+ if (!bh) {
+ EXT4_ERROR_INODE_BLOCK(inode, le32_to_cpu(blk),
+ "Read failure");
+ return -EIO;
+ }
+ if (first > offset) {
+ first2 = first - offset;
+ count2 = count;
+ } else {
+ first2 = 0;
+ count2 = count - (offset - first);
+ }
+ ret = free_hole_blocks(handle, inode, bh,
+ (__le32 *)bh->b_data, level - 1,
+ first2, count2,
+ inode->i_sb->s_blocksize >> 2);
+ if (ret) {
+ brelse(bh);
+ goto err;
+ }
+ }
+ if (level == 0 ||
+ (bh && all_zeroes((__le32 *)bh->b_data,
+ (__le32 *)bh->b_data + addr_per_block))) {
+ ext4_free_data(handle, inode, parent_bh,
+ i_data, i_data + 1);
+ }
+ brelse(bh);
+ bh = NULL;
+ }
+
+err:
+ return ret;
+}
+
+int ext4_free_hole_blocks(handle_t *handle, struct inode *inode,
+ ext4_lblk_t first, ext4_lblk_t stop)
+{
+ int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb);
+ int level, ret = 0;
+ int num = EXT4_NDIR_BLOCKS;
+ ext4_lblk_t count, max = EXT4_NDIR_BLOCKS;
+ __le32 *i_data = EXT4_I(inode)->i_data;
+
+ count = stop - first;
+ for (level = 0; level < 4; level++, max *= addr_per_block) {
+ if (first < max) {
+ ret = free_hole_blocks(handle, inode, NULL, i_data,
+ level, first, count, num);
+ if (ret)
+ goto err;
+ if (count > max - first)
+ count -= max - first;
+ else
+ break;
+ first = 0;
+ } else {
+ first -= max;
+ }
+ i_data += num;
+ if (level == 0) {
+ num = 1;
+ max = 1;
+ }
+ }
+
+err:
+ return ret;
+}
+
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
new file mode 100644
index 00000000000..645205d8ada
--- /dev/null
+++ b/fs/ext4/inline.c
@@ -0,0 +1,2000 @@
+/*
+ * Copyright (c) 2012 Taobao.
+ * Written by Tao Ma <boyu.mt@taobao.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2.1 of the GNU Lesser General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+#include "ext4_jbd2.h"
+#include "ext4.h"
+#include "xattr.h"
+#include "truncate.h"
+#include <linux/fiemap.h>
+
+#define EXT4_XATTR_SYSTEM_DATA "data"
+#define EXT4_MIN_INLINE_DATA_SIZE ((sizeof(__le32) * EXT4_N_BLOCKS))
+#define EXT4_INLINE_DOTDOT_OFFSET 2
+#define EXT4_INLINE_DOTDOT_SIZE 4
+
+static int ext4_get_inline_size(struct inode *inode)
+{
+ if (EXT4_I(inode)->i_inline_off)
+ return EXT4_I(inode)->i_inline_size;
+
+ return 0;
+}
+
+static int get_max_inline_xattr_value_size(struct inode *inode,
+ struct ext4_iloc *iloc)
+{
+ struct ext4_xattr_ibody_header *header;
+ struct ext4_xattr_entry *entry;
+ struct ext4_inode *raw_inode;
+ int free, min_offs;
+
+ min_offs = EXT4_SB(inode->i_sb)->s_inode_size -
+ EXT4_GOOD_OLD_INODE_SIZE -
+ EXT4_I(inode)->i_extra_isize -
+ sizeof(struct ext4_xattr_ibody_header);
+
+ /*
+ * We need to subtract another sizeof(__u32) since an in-inode xattr
+ * needs an empty 4 bytes to indicate the gap between the xattr entry
+ * and the name/value pair.
+ */
+ if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR))
+ return EXT4_XATTR_SIZE(min_offs -
+ EXT4_XATTR_LEN(strlen(EXT4_XATTR_SYSTEM_DATA)) -
+ EXT4_XATTR_ROUND - sizeof(__u32));
+
+ raw_inode = ext4_raw_inode(iloc);
+ header = IHDR(inode, raw_inode);
+ entry = IFIRST(header);
+
+ /* Compute min_offs. */
+ for (; !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry)) {
+ if (!entry->e_value_block && entry->e_value_size) {
+ size_t offs = le16_to_cpu(entry->e_value_offs);
+ if (offs < min_offs)
+ min_offs = offs;
+ }
+ }
+ free = min_offs -
+ ((void *)entry - (void *)IFIRST(header)) - sizeof(__u32);
+
+ if (EXT4_I(inode)->i_inline_off) {
+ entry = (struct ext4_xattr_entry *)
+ ((void *)raw_inode + EXT4_I(inode)->i_inline_off);
+
+ free += EXT4_XATTR_SIZE(le32_to_cpu(entry->e_value_size));
+ goto out;
+ }
+
+ free -= EXT4_XATTR_LEN(strlen(EXT4_XATTR_SYSTEM_DATA));
+
+ if (free > EXT4_XATTR_ROUND)
+ free = EXT4_XATTR_SIZE(free - EXT4_XATTR_ROUND);
+ else
+ free = 0;
+
+out:
+ return free;
+}
+
+/*
+ * Get the maximum size we now can store in an inode.
+ * If we can't find the space for a xattr entry, don't use the space
+ * of the extents since we have no space to indicate the inline data.
+ */
+int ext4_get_max_inline_size(struct inode *inode)
+{
+ int error, max_inline_size;
+ struct ext4_iloc iloc;
+
+ if (EXT4_I(inode)->i_extra_isize == 0)
+ return 0;
+
+ error = ext4_get_inode_loc(inode, &iloc);
+ if (error) {
+ ext4_error_inode(inode, __func__, __LINE__, 0,
+ "can't get inode location %lu",
+ inode->i_ino);
+ return 0;
+ }
+
+ down_read(&EXT4_I(inode)->xattr_sem);
+ max_inline_size = get_max_inline_xattr_value_size(inode, &iloc);
+ up_read(&EXT4_I(inode)->xattr_sem);
+
+ brelse(iloc.bh);
+
+ if (!max_inline_size)
+ return 0;
+
+ return max_inline_size + EXT4_MIN_INLINE_DATA_SIZE;
+}
+
+int ext4_has_inline_data(struct inode *inode)
+{
+ return ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA) &&
+ EXT4_I(inode)->i_inline_off;
+}
+
+/*
+ * this function does not take xattr_sem, which is OK because it is
+ * currently only used in a code path coming form ext4_iget, before
+ * the new inode has been unlocked
+ */
+int ext4_find_inline_data_nolock(struct inode *inode)
+{
+ struct ext4_xattr_ibody_find is = {
+ .s = { .not_found = -ENODATA, },
+ };
+ struct ext4_xattr_info i = {
+ .name_index = EXT4_XATTR_INDEX_SYSTEM,
+ .name = EXT4_XATTR_SYSTEM_DATA,
+ };
+ int error;
+
+ if (EXT4_I(inode)->i_extra_isize == 0)
+ return 0;
+
+ error = ext4_get_inode_loc(inode, &is.iloc);
+ if (error)
+ return error;
+
+ error = ext4_xattr_ibody_find(inode, &i, &is);
+ if (error)
+ goto out;
+
+ if (!is.s.not_found) {
+ EXT4_I(inode)->i_inline_off = (u16)((void *)is.s.here -
+ (void *)ext4_raw_inode(&is.iloc));
+ EXT4_I(inode)->i_inline_size = EXT4_MIN_INLINE_DATA_SIZE +
+ le32_to_cpu(is.s.here->e_value_size);
+ ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
+ }
+out:
+ brelse(is.iloc.bh);
+ return error;
+}
+
+static int ext4_read_inline_data(struct inode *inode, void *buffer,
+ unsigned int len,
+ struct ext4_iloc *iloc)
+{
+ struct ext4_xattr_entry *entry;
+ struct ext4_xattr_ibody_header *header;
+ int cp_len = 0;
+ struct ext4_inode *raw_inode;
+
+ if (!len)
+ return 0;
+
+ BUG_ON(len > EXT4_I(inode)->i_inline_size);
+
+ cp_len = len < EXT4_MIN_INLINE_DATA_SIZE ?
+ len : EXT4_MIN_INLINE_DATA_SIZE;
+
+ raw_inode = ext4_raw_inode(iloc);
+ memcpy(buffer, (void *)(raw_inode->i_block), cp_len);
+
+ len -= cp_len;
+ buffer += cp_len;
+
+ if (!len)
+ goto out;
+
+ header = IHDR(inode, raw_inode);
+ entry = (struct ext4_xattr_entry *)((void *)raw_inode +
+ EXT4_I(inode)->i_inline_off);
+ len = min_t(unsigned int, len,
+ (unsigned int)le32_to_cpu(entry->e_value_size));
+
+ memcpy(buffer,
+ (void *)IFIRST(header) + le16_to_cpu(entry->e_value_offs), len);
+ cp_len += len;
+
+out:
+ return cp_len;
+}
+
+/*
+ * write the buffer to the inline inode.
+ * If 'create' is set, we don't need to do the extra copy in the xattr
+ * value since it is already handled by ext4_xattr_ibody_inline_set.
+ * That saves us one memcpy.
+ */
+static void ext4_write_inline_data(struct inode *inode, struct ext4_iloc *iloc,
+ void *buffer, loff_t pos, unsigned int len)
+{
+ struct ext4_xattr_entry *entry;
+ struct ext4_xattr_ibody_header *header;
+ struct ext4_inode *raw_inode;
+ int cp_len = 0;
+
+ BUG_ON(!EXT4_I(inode)->i_inline_off);
+ BUG_ON(pos + len > EXT4_I(inode)->i_inline_size);
+
+ raw_inode = ext4_raw_inode(iloc);
+ buffer += pos;
+
+ if (pos < EXT4_MIN_INLINE_DATA_SIZE) {
+ cp_len = pos + len > EXT4_MIN_INLINE_DATA_SIZE ?
+ EXT4_MIN_INLINE_DATA_SIZE - pos : len;
+ memcpy((void *)raw_inode->i_block + pos, buffer, cp_len);
+
+ len -= cp_len;
+ buffer += cp_len;
+ pos += cp_len;
+ }
+
+ if (!len)
+ return;
+
+ pos -= EXT4_MIN_INLINE_DATA_SIZE;
+ header = IHDR(inode, raw_inode);
+ entry = (struct ext4_xattr_entry *)((void *)raw_inode +
+ EXT4_I(inode)->i_inline_off);
+
+ memcpy((void *)IFIRST(header) + le16_to_cpu(entry->e_value_offs) + pos,
+ buffer, len);
+}
+
+static int ext4_create_inline_data(handle_t *handle,
+ struct inode *inode, unsigned len)
+{
+ int error;
+ void *value = NULL;
+ struct ext4_xattr_ibody_find is = {
+ .s = { .not_found = -ENODATA, },
+ };
+ struct ext4_xattr_info i = {
+ .name_index = EXT4_XATTR_INDEX_SYSTEM,
+ .name = EXT4_XATTR_SYSTEM_DATA,
+ };
+
+ error = ext4_get_inode_loc(inode, &is.iloc);
+ if (error)
+ return error;
+
+ BUFFER_TRACE(is.iloc.bh, "get_write_access");
+ error = ext4_journal_get_write_access(handle, is.iloc.bh);
+ if (error)
+ goto out;
+
+ if (len > EXT4_MIN_INLINE_DATA_SIZE) {
+ value = EXT4_ZERO_XATTR_VALUE;
+ len -= EXT4_MIN_INLINE_DATA_SIZE;
+ } else {
+ value = "";
+ len = 0;
+ }
+
+ /* Insert the the xttr entry. */
+ i.value = value;
+ i.value_len = len;
+
+ error = ext4_xattr_ibody_find(inode, &i, &is);
+ if (error)
+ goto out;
+
+ BUG_ON(!is.s.not_found);
+
+ error = ext4_xattr_ibody_inline_set(handle, inode, &i, &is);
+ if (error) {
+ if (error == -ENOSPC)
+ ext4_clear_inode_state(inode,
+ EXT4_STATE_MAY_INLINE_DATA);
+ goto out;
+ }
+
+ memset((void *)ext4_raw_inode(&is.iloc)->i_block,
+ 0, EXT4_MIN_INLINE_DATA_SIZE);
+
+ EXT4_I(inode)->i_inline_off = (u16)((void *)is.s.here -
+ (void *)ext4_raw_inode(&is.iloc));
+ EXT4_I(inode)->i_inline_size = len + EXT4_MIN_INLINE_DATA_SIZE;
+ ext4_clear_inode_flag(inode, EXT4_INODE_EXTENTS);
+ ext4_set_inode_flag(inode, EXT4_INODE_INLINE_DATA);
+ get_bh(is.iloc.bh);
+ error = ext4_mark_iloc_dirty(handle, inode, &is.iloc);
+
+out:
+ brelse(is.iloc.bh);
+ return error;
+}
+
+static int ext4_update_inline_data(handle_t *handle, struct inode *inode,
+ unsigned int len)
+{
+ int error;
+ void *value = NULL;
+ struct ext4_xattr_ibody_find is = {
+ .s = { .not_found = -ENODATA, },
+ };
+ struct ext4_xattr_info i = {
+ .name_index = EXT4_XATTR_INDEX_SYSTEM,
+ .name = EXT4_XATTR_SYSTEM_DATA,
+ };
+
+ /* If the old space is ok, write the data directly. */
+ if (len <= EXT4_I(inode)->i_inline_size)
+ return 0;
+
+ error = ext4_get_inode_loc(inode, &is.iloc);
+ if (error)
+ return error;
+
+ error = ext4_xattr_ibody_find(inode, &i, &is);
+ if (error)
+ goto out;
+
+ BUG_ON(is.s.not_found);
+
+ len -= EXT4_MIN_INLINE_DATA_SIZE;
+ value = kzalloc(len, GFP_NOFS);
+ if (!value)
+ goto out;
+
+ error = ext4_xattr_ibody_get(inode, i.name_index, i.name,
+ value, len);
+ if (error == -ENODATA)
+ goto out;
+
+ BUFFER_TRACE(is.iloc.bh, "get_write_access");
+ error = ext4_journal_get_write_access(handle, is.iloc.bh);
+ if (error)
+ goto out;
+
+ /* Update the xttr entry. */
+ i.value = value;
+ i.value_len = len;
+
+ error = ext4_xattr_ibody_inline_set(handle, inode, &i, &is);
+ if (error)
+ goto out;
+
+ EXT4_I(inode)->i_inline_off = (u16)((void *)is.s.here -
+ (void *)ext4_raw_inode(&is.iloc));
+ EXT4_I(inode)->i_inline_size = EXT4_MIN_INLINE_DATA_SIZE +
+ le32_to_cpu(is.s.here->e_value_size);
+ ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
+ get_bh(is.iloc.bh);
+ error = ext4_mark_iloc_dirty(handle, inode, &is.iloc);
+
+out:
+ kfree(value);
+ brelse(is.iloc.bh);
+ return error;
+}
+
+static int ext4_prepare_inline_data(handle_t *handle, struct inode *inode,
+ unsigned int len)
+{
+ int ret, size;
+ struct ext4_inode_info *ei = EXT4_I(inode);
+
+ if (!ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA))
+ return -ENOSPC;
+
+ size = ext4_get_max_inline_size(inode);
+ if (size < len)
+ return -ENOSPC;
+
+ down_write(&EXT4_I(inode)->xattr_sem);
+
+ if (ei->i_inline_off)
+ ret = ext4_update_inline_data(handle, inode, len);
+ else
+ ret = ext4_create_inline_data(handle, inode, len);
+
+ up_write(&EXT4_I(inode)->xattr_sem);
+
+ return ret;
+}
+
+static int ext4_destroy_inline_data_nolock(handle_t *handle,
+ struct inode *inode)
+{
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ struct ext4_xattr_ibody_find is = {
+ .s = { .not_found = 0, },
+ };
+ struct ext4_xattr_info i = {
+ .name_index = EXT4_XATTR_INDEX_SYSTEM,
+ .name = EXT4_XATTR_SYSTEM_DATA,
+ .value = NULL,
+ .value_len = 0,
+ };
+ int error;
+
+ if (!ei->i_inline_off)
+ return 0;
+
+ error = ext4_get_inode_loc(inode, &is.iloc);
+ if (error)
+ return error;
+
+ error = ext4_xattr_ibody_find(inode, &i, &is);
+ if (error)
+ goto out;
+
+ BUFFER_TRACE(is.iloc.bh, "get_write_access");
+ error = ext4_journal_get_write_access(handle, is.iloc.bh);
+ if (error)
+ goto out;
+
+ error = ext4_xattr_ibody_inline_set(handle, inode, &i, &is);
+ if (error)
+ goto out;
+
+ memset((void *)ext4_raw_inode(&is.iloc)->i_block,
+ 0, EXT4_MIN_INLINE_DATA_SIZE);
+
+ if (EXT4_HAS_INCOMPAT_FEATURE(inode->i_sb,
+ EXT4_FEATURE_INCOMPAT_EXTENTS)) {
+ if (S_ISDIR(inode->i_mode) ||
+ S_ISREG(inode->i_mode) || S_ISLNK(inode->i_mode)) {
+ ext4_set_inode_flag(inode, EXT4_INODE_EXTENTS);
+ ext4_ext_tree_init(handle, inode);
+ }
+ }
+ ext4_clear_inode_flag(inode, EXT4_INODE_INLINE_DATA);
+
+ get_bh(is.iloc.bh);
+ error = ext4_mark_iloc_dirty(handle, inode, &is.iloc);
+
+ EXT4_I(inode)->i_inline_off = 0;
+ EXT4_I(inode)->i_inline_size = 0;
+ ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
+out:
+ brelse(is.iloc.bh);
+ if (error == -ENODATA)
+ error = 0;
+ return error;
+}
+
+static int ext4_read_inline_page(struct inode *inode, struct page *page)
+{
+ void *kaddr;
+ int ret = 0;
+ size_t len;
+ struct ext4_iloc iloc;
+
+ BUG_ON(!PageLocked(page));
+ BUG_ON(!ext4_has_inline_data(inode));
+ BUG_ON(page->index);
+
+ if (!EXT4_I(inode)->i_inline_off) {
+ ext4_warning(inode->i_sb, "inode %lu doesn't have inline data.",
+ inode->i_ino);
+ goto out;
+ }
+
+ ret = ext4_get_inode_loc(inode, &iloc);
+ if (ret)
+ goto out;
+
+ len = min_t(size_t, ext4_get_inline_size(inode), i_size_read(inode));
+ kaddr = kmap_atomic(page);
+ ret = ext4_read_inline_data(inode, kaddr, len, &iloc);
+ flush_dcache_page(page);
+ kunmap_atomic(kaddr);
+ zero_user_segment(page, len, PAGE_CACHE_SIZE);
+ SetPageUptodate(page);
+ brelse(iloc.bh);
+
+out:
+ return ret;
+}
+
+int ext4_readpage_inline(struct inode *inode, struct page *page)
+{
+ int ret = 0;
+
+ down_read(&EXT4_I(inode)->xattr_sem);
+ if (!ext4_has_inline_data(inode)) {
+ up_read(&EXT4_I(inode)->xattr_sem);
+ return -EAGAIN;
+ }
+
+ /*
+ * Current inline data can only exist in the 1st page,
+ * So for all the other pages, just set them uptodate.
+ */
+ if (!page->index)
+ ret = ext4_read_inline_page(inode, page);
+ else if (!PageUptodate(page)) {
+ zero_user_segment(page, 0, PAGE_CACHE_SIZE);
+ SetPageUptodate(page);
+ }
+
+ up_read(&EXT4_I(inode)->xattr_sem);
+
+ unlock_page(page);
+ return ret >= 0 ? 0 : ret;
+}
+
+static int ext4_convert_inline_data_to_extent(struct address_space *mapping,
+ struct inode *inode,
+ unsigned flags)
+{
+ int ret, needed_blocks;
+ handle_t *handle = NULL;
+ int retries = 0, sem_held = 0;
+ struct page *page = NULL;
+ unsigned from, to;
+ struct ext4_iloc iloc;
+
+ if (!ext4_has_inline_data(inode)) {
+ /*
+ * clear the flag so that no new write
+ * will trap here again.
+ */
+ ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
+ return 0;
+ }
+
+ needed_blocks = ext4_writepage_trans_blocks(inode);
+
+ ret = ext4_get_inode_loc(inode, &iloc);
+ if (ret)
+ return ret;
+
+retry:
+ handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, needed_blocks);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ handle = NULL;
+ goto out;
+ }
+
+ /* We cannot recurse into the filesystem as the transaction is already
+ * started */
+ flags |= AOP_FLAG_NOFS;
+
+ page = grab_cache_page_write_begin(mapping, 0, flags);
+ if (!page) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ down_write(&EXT4_I(inode)->xattr_sem);
+ sem_held = 1;
+ /* If some one has already done this for us, just exit. */
+ if (!ext4_has_inline_data(inode)) {
+ ret = 0;
+ goto out;
+ }
+
+ from = 0;
+ to = ext4_get_inline_size(inode);
+ if (!PageUptodate(page)) {
+ ret = ext4_read_inline_page(inode, page);
+ if (ret < 0)
+ goto out;
+ }
+
+ ret = ext4_destroy_inline_data_nolock(handle, inode);
+ if (ret)
+ goto out;
+
+ if (ext4_should_dioread_nolock(inode))
+ ret = __block_write_begin(page, from, to, ext4_get_block_write);
+ else
+ ret = __block_write_begin(page, from, to, ext4_get_block);
+
+ if (!ret && ext4_should_journal_data(inode)) {
+ ret = ext4_walk_page_buffers(handle, page_buffers(page),
+ from, to, NULL,
+ do_journal_get_write_access);
+ }
+
+ if (ret) {
+ unlock_page(page);
+ page_cache_release(page);
+ ext4_orphan_add(handle, inode);
+ up_write(&EXT4_I(inode)->xattr_sem);
+ sem_held = 0;
+ ext4_journal_stop(handle);
+ handle = NULL;
+ ext4_truncate_failed_write(inode);
+ /*
+ * If truncate failed early the inode might
+ * still be on the orphan list; we need to
+ * make sure the inode is removed from the
+ * orphan list in that case.
+ */
+ if (inode->i_nlink)
+ ext4_orphan_del(NULL, inode);
+ }
+
+ if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
+ goto retry;
+
+ block_commit_write(page, from, to);
+out:
+ if (page) {
+ unlock_page(page);
+ page_cache_release(page);
+ }
+ if (sem_held)
+ up_write(&EXT4_I(inode)->xattr_sem);
+ if (handle)
+ ext4_journal_stop(handle);
+ brelse(iloc.bh);
+ return ret;
+}
+
+/*
+ * Try to write data in the inode.
+ * If the inode has inline data, check whether the new write can be
+ * in the inode also. If not, create the page the handle, move the data
+ * to the page make it update and let the later codes create extent for it.
+ */
+int ext4_try_to_write_inline_data(struct address_space *mapping,
+ struct inode *inode,
+ loff_t pos, unsigned len,
+ unsigned flags,
+ struct page **pagep)
+{
+ int ret;
+ handle_t *handle;
+ struct page *page;
+ struct ext4_iloc iloc;
+
+ if (pos + len > ext4_get_max_inline_size(inode))
+ goto convert;
+
+ ret = ext4_get_inode_loc(inode, &iloc);
+ if (ret)
+ return ret;
+
+ /*
+ * The possible write could happen in the inode,
+ * so try to reserve the space in inode first.
+ */
+ handle = ext4_journal_start(inode, EXT4_HT_INODE, 1);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ handle = NULL;
+ goto out;
+ }
+
+ ret = ext4_prepare_inline_data(handle, inode, pos + len);
+ if (ret && ret != -ENOSPC)
+ goto out;
+
+ /* We don't have space in inline inode, so convert it to extent. */
+ if (ret == -ENOSPC) {
+ ext4_journal_stop(handle);
+ brelse(iloc.bh);
+ goto convert;
+ }
+
+ flags |= AOP_FLAG_NOFS;
+
+ page = grab_cache_page_write_begin(mapping, 0, flags);
+ if (!page) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ *pagep = page;
+ down_read(&EXT4_I(inode)->xattr_sem);
+ if (!ext4_has_inline_data(inode)) {
+ ret = 0;
+ unlock_page(page);
+ page_cache_release(page);
+ goto out_up_read;
+ }
+
+ if (!PageUptodate(page)) {
+ ret = ext4_read_inline_page(inode, page);
+ if (ret < 0)
+ goto out_up_read;
+ }
+
+ ret = 1;
+ handle = NULL;
+out_up_read:
+ up_read(&EXT4_I(inode)->xattr_sem);
+out:
+ if (handle)
+ ext4_journal_stop(handle);
+ brelse(iloc.bh);
+ return ret;
+convert:
+ return ext4_convert_inline_data_to_extent(mapping,
+ inode, flags);
+}
+
+int ext4_write_inline_data_end(struct inode *inode, loff_t pos, unsigned len,
+ unsigned copied, struct page *page)
+{
+ int ret;
+ void *kaddr;
+ struct ext4_iloc iloc;
+
+ if (unlikely(copied < len)) {
+ if (!PageUptodate(page)) {
+ copied = 0;
+ goto out;
+ }
+ }
+
+ ret = ext4_get_inode_loc(inode, &iloc);
+ if (ret) {
+ ext4_std_error(inode->i_sb, ret);
+ copied = 0;
+ goto out;
+ }
+
+ down_write(&EXT4_I(inode)->xattr_sem);
+ BUG_ON(!ext4_has_inline_data(inode));
+
+ kaddr = kmap_atomic(page);
+ ext4_write_inline_data(inode, &iloc, kaddr, pos, len);
+ kunmap_atomic(kaddr);
+ SetPageUptodate(page);
+ /* clear page dirty so that writepages wouldn't work for us. */
+ ClearPageDirty(page);
+
+ up_write(&EXT4_I(inode)->xattr_sem);
+ brelse(iloc.bh);
+out:
+ return copied;
+}
+
+struct buffer_head *
+ext4_journalled_write_inline_data(struct inode *inode,
+ unsigned len,
+ struct page *page)
+{
+ int ret;
+ void *kaddr;
+ struct ext4_iloc iloc;
+
+ ret = ext4_get_inode_loc(inode, &iloc);
+ if (ret) {
+ ext4_std_error(inode->i_sb, ret);
+ return NULL;
+ }
+
+ down_write(&EXT4_I(inode)->xattr_sem);
+ kaddr = kmap_atomic(page);
+ ext4_write_inline_data(inode, &iloc, kaddr, 0, len);
+ kunmap_atomic(kaddr);
+ up_write(&EXT4_I(inode)->xattr_sem);
+
+ return iloc.bh;
+}
+
+/*
+ * Try to make the page cache and handle ready for the inline data case.
+ * We can call this function in 2 cases:
+ * 1. The inode is created and the first write exceeds inline size. We can
+ * clear the inode state safely.
+ * 2. The inode has inline data, then we need to read the data, make it
+ * update and dirty so that ext4_da_writepages can handle it. We don't
+ * need to start the journal since the file's metatdata isn't changed now.
+ */
+static int ext4_da_convert_inline_data_to_extent(struct address_space *mapping,
+ struct inode *inode,
+ unsigned flags,
+ void **fsdata)
+{
+ int ret = 0, inline_size;
+ struct page *page;
+
+ page = grab_cache_page_write_begin(mapping, 0, flags);
+ if (!page)
+ return -ENOMEM;
+
+ down_read(&EXT4_I(inode)->xattr_sem);
+ if (!ext4_has_inline_data(inode)) {
+ ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
+ goto out;
+ }
+
+ inline_size = ext4_get_inline_size(inode);
+
+ if (!PageUptodate(page)) {
+ ret = ext4_read_inline_page(inode, page);
+ if (ret < 0)
+ goto out;
+ }
+
+ ret = __block_write_begin(page, 0, inline_size,
+ ext4_da_get_block_prep);
+ if (ret) {
+ ext4_truncate_failed_write(inode);
+ goto out;
+ }
+
+ SetPageDirty(page);
+ SetPageUptodate(page);
+ ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
+ *fsdata = (void *)CONVERT_INLINE_DATA;
+
+out:
+ up_read(&EXT4_I(inode)->xattr_sem);
+ if (page) {
+ unlock_page(page);
+ page_cache_release(page);
+ }
+ return ret;
+}
+
+/*
+ * Prepare the write for the inline data.
+ * If the the data can be written into the inode, we just read
+ * the page and make it uptodate, and start the journal.
+ * Otherwise read the page, makes it dirty so that it can be
+ * handle in writepages(the i_disksize update is left to the
+ * normal ext4_da_write_end).
+ */
+int ext4_da_write_inline_data_begin(struct address_space *mapping,
+ struct inode *inode,
+ loff_t pos, unsigned len,
+ unsigned flags,
+ struct page **pagep,
+ void **fsdata)
+{
+ int ret, inline_size;
+ handle_t *handle;
+ struct page *page;
+ struct ext4_iloc iloc;
+ int retries;
+
+ ret = ext4_get_inode_loc(inode, &iloc);
+ if (ret)
+ return ret;
+
+retry_journal:
+ handle = ext4_journal_start(inode, EXT4_HT_INODE, 1);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ goto out;
+ }
+
+ inline_size = ext4_get_max_inline_size(inode);
+
+ ret = -ENOSPC;
+ if (inline_size >= pos + len) {
+ ret = ext4_prepare_inline_data(handle, inode, pos + len);
+ if (ret && ret != -ENOSPC)
+ goto out_journal;
+ }
+
+ if (ret == -ENOSPC) {
+ ret = ext4_da_convert_inline_data_to_extent(mapping,
+ inode,
+ flags,
+ fsdata);
+ ext4_journal_stop(handle);
+ if (ret == -ENOSPC &&
+ ext4_should_retry_alloc(inode->i_sb, &retries))
+ goto retry_journal;
+ goto out;
+ }
+
+ /*
+ * We cannot recurse into the filesystem as the transaction
+ * is already started.
+ */
+ flags |= AOP_FLAG_NOFS;
+
+ page = grab_cache_page_write_begin(mapping, 0, flags);
+ if (!page) {
+ ret = -ENOMEM;
+ goto out_journal;
+ }
+
+ down_read(&EXT4_I(inode)->xattr_sem);
+ if (!ext4_has_inline_data(inode)) {
+ ret = 0;
+ goto out_release_page;
+ }
+
+ if (!PageUptodate(page)) {
+ ret = ext4_read_inline_page(inode, page);
+ if (ret < 0)
+ goto out_release_page;
+ }
+
+ up_read(&EXT4_I(inode)->xattr_sem);
+ *pagep = page;
+ brelse(iloc.bh);
+ return 1;
+out_release_page:
+ up_read(&EXT4_I(inode)->xattr_sem);
+ unlock_page(page);
+ page_cache_release(page);
+out_journal:
+ ext4_journal_stop(handle);
+out:
+ brelse(iloc.bh);
+ return ret;
+}
+
+int ext4_da_write_inline_data_end(struct inode *inode, loff_t pos,
+ unsigned len, unsigned copied,
+ struct page *page)
+{
+ int i_size_changed = 0;
+
+ copied = ext4_write_inline_data_end(inode, pos, len, copied, page);
+
+ /*
+ * No need to use i_size_read() here, the i_size
+ * cannot change under us because we hold i_mutex.
+ *
+ * But it's important to update i_size while still holding page lock:
+ * page writeout could otherwise come in and zero beyond i_size.
+ */
+ if (pos+copied > inode->i_size) {
+ i_size_write(inode, pos+copied);
+ i_size_changed = 1;
+ }
+ unlock_page(page);
+ page_cache_release(page);
+
+ /*
+ * Don't mark the inode dirty under page lock. First, it unnecessarily
+ * makes the holding time of page lock longer. Second, it forces lock
+ * ordering of page lock and transaction start for journaling
+ * filesystems.
+ */
+ if (i_size_changed)
+ mark_inode_dirty(inode);
+
+ return copied;
+}
+
+#ifdef INLINE_DIR_DEBUG
+void ext4_show_inline_dir(struct inode *dir, struct buffer_head *bh,
+ void *inline_start, int inline_size)
+{
+ int offset;
+ unsigned short de_len;
+ struct ext4_dir_entry_2 *de = inline_start;
+ void *dlimit = inline_start + inline_size;
+
+ trace_printk("inode %lu\n", dir->i_ino);
+ offset = 0;
+ while ((void *)de < dlimit) {
+ de_len = ext4_rec_len_from_disk(de->rec_len, inline_size);
+ trace_printk("de: off %u rlen %u name %*.s nlen %u ino %u\n",
+ offset, de_len, de->name_len, de->name,
+ de->name_len, le32_to_cpu(de->inode));
+ if (ext4_check_dir_entry(dir, NULL, de, bh,
+ inline_start, inline_size, offset))
+ BUG();
+
+ offset += de_len;
+ de = (struct ext4_dir_entry_2 *) ((char *) de + de_len);
+ }
+}
+#else
+#define ext4_show_inline_dir(dir, bh, inline_start, inline_size)
+#endif
+
+/*
+ * Add a new entry into a inline dir.
+ * It will return -ENOSPC if no space is available, and -EIO
+ * and -EEXIST if directory entry already exists.
+ */
+static int ext4_add_dirent_to_inline(handle_t *handle,
+ struct dentry *dentry,
+ struct inode *inode,
+ struct ext4_iloc *iloc,
+ void *inline_start, int inline_size)
+{
+ struct inode *dir = dentry->d_parent->d_inode;
+ const char *name = dentry->d_name.name;
+ int namelen = dentry->d_name.len;
+ int err;
+ struct ext4_dir_entry_2 *de;
+
+ err = ext4_find_dest_de(dir, inode, iloc->bh,
+ inline_start, inline_size,
+ name, namelen, &de);
+ if (err)
+ return err;
+
+ BUFFER_TRACE(iloc->bh, "get_write_access");
+ err = ext4_journal_get_write_access(handle, iloc->bh);
+ if (err)
+ return err;
+ ext4_insert_dentry(inode, de, inline_size, name, namelen);
+
+ ext4_show_inline_dir(dir, iloc->bh, inline_start, inline_size);
+
+ /*
+ * XXX shouldn't update any times until successful
+ * completion of syscall, but too many callers depend
+ * on this.
+ *
+ * XXX similarly, too many callers depend on
+ * ext4_new_inode() setting the times, but error
+ * recovery deletes the inode, so the worst that can
+ * happen is that the times are slightly out of date
+ * and/or different from the directory change time.
+ */
+ dir->i_mtime = dir->i_ctime = ext4_current_time(dir);
+ ext4_update_dx_flag(dir);
+ dir->i_version++;
+ ext4_mark_inode_dirty(handle, dir);
+ return 1;
+}
+
+static void *ext4_get_inline_xattr_pos(struct inode *inode,
+ struct ext4_iloc *iloc)
+{
+ struct ext4_xattr_entry *entry;
+ struct ext4_xattr_ibody_header *header;
+
+ BUG_ON(!EXT4_I(inode)->i_inline_off);
+
+ header = IHDR(inode, ext4_raw_inode(iloc));
+ entry = (struct ext4_xattr_entry *)((void *)ext4_raw_inode(iloc) +
+ EXT4_I(inode)->i_inline_off);
+
+ return (void *)IFIRST(header) + le16_to_cpu(entry->e_value_offs);
+}
+
+/* Set the final de to cover the whole block. */
+static void ext4_update_final_de(void *de_buf, int old_size, int new_size)
+{
+ struct ext4_dir_entry_2 *de, *prev_de;
+ void *limit;
+ int de_len;
+
+ de = (struct ext4_dir_entry_2 *)de_buf;
+ if (old_size) {
+ limit = de_buf + old_size;
+ do {
+ prev_de = de;
+ de_len = ext4_rec_len_from_disk(de->rec_len, old_size);
+ de_buf += de_len;
+ de = (struct ext4_dir_entry_2 *)de_buf;
+ } while (de_buf < limit);
+
+ prev_de->rec_len = ext4_rec_len_to_disk(de_len + new_size -
+ old_size, new_size);
+ } else {
+ /* this is just created, so create an empty entry. */
+ de->inode = 0;
+ de->rec_len = ext4_rec_len_to_disk(new_size, new_size);
+ }
+}
+
+static int ext4_update_inline_dir(handle_t *handle, struct inode *dir,
+ struct ext4_iloc *iloc)
+{
+ int ret;
+ int old_size = EXT4_I(dir)->i_inline_size - EXT4_MIN_INLINE_DATA_SIZE;
+ int new_size = get_max_inline_xattr_value_size(dir, iloc);
+
+ if (new_size - old_size <= EXT4_DIR_REC_LEN(1))
+ return -ENOSPC;
+
+ ret = ext4_update_inline_data(handle, dir,
+ new_size + EXT4_MIN_INLINE_DATA_SIZE);
+ if (ret)
+ return ret;
+
+ ext4_update_final_de(ext4_get_inline_xattr_pos(dir, iloc), old_size,
+ EXT4_I(dir)->i_inline_size -
+ EXT4_MIN_INLINE_DATA_SIZE);
+ dir->i_size = EXT4_I(dir)->i_disksize = EXT4_I(dir)->i_inline_size;
+ return 0;
+}
+
+static void ext4_restore_inline_data(handle_t *handle, struct inode *inode,
+ struct ext4_iloc *iloc,
+ void *buf, int inline_size)
+{
+ ext4_create_inline_data(handle, inode, inline_size);
+ ext4_write_inline_data(inode, iloc, buf, 0, inline_size);
+ ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
+}
+
+static int ext4_finish_convert_inline_dir(handle_t *handle,
+ struct inode *inode,
+ struct buffer_head *dir_block,
+ void *buf,
+ int inline_size)
+{
+ int err, csum_size = 0, header_size = 0;
+ struct ext4_dir_entry_2 *de;
+ struct ext4_dir_entry_tail *t;
+ void *target = dir_block->b_data;
+
+ /*
+ * First create "." and ".." and then copy the dir information
+ * back to the block.
+ */
+ de = (struct ext4_dir_entry_2 *)target;
+ de = ext4_init_dot_dotdot(inode, de,
+ inode->i_sb->s_blocksize, csum_size,
+ le32_to_cpu(((struct ext4_dir_entry_2 *)buf)->inode), 1);
+ header_size = (void *)de - target;
+
+ memcpy((void *)de, buf + EXT4_INLINE_DOTDOT_SIZE,
+ inline_size - EXT4_INLINE_DOTDOT_SIZE);
+
+ if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
+ EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+ csum_size = sizeof(struct ext4_dir_entry_tail);
+
+ inode->i_size = inode->i_sb->s_blocksize;
+ i_size_write(inode, inode->i_sb->s_blocksize);
+ EXT4_I(inode)->i_disksize = inode->i_sb->s_blocksize;
+ ext4_update_final_de(dir_block->b_data,
+ inline_size - EXT4_INLINE_DOTDOT_SIZE + header_size,
+ inode->i_sb->s_blocksize - csum_size);
+
+ if (csum_size) {
+ t = EXT4_DIRENT_TAIL(dir_block->b_data,
+ inode->i_sb->s_blocksize);
+ initialize_dirent_tail(t, inode->i_sb->s_blocksize);
+ }
+ set_buffer_uptodate(dir_block);
+ err = ext4_handle_dirty_dirent_node(handle, inode, dir_block);
+ if (err)
+ goto out;
+ set_buffer_verified(dir_block);
+out:
+ return err;
+}
+
+static int ext4_convert_inline_data_nolock(handle_t *handle,
+ struct inode *inode,
+ struct ext4_iloc *iloc)
+{
+ int error;
+ void *buf = NULL;
+ struct buffer_head *data_bh = NULL;
+ struct ext4_map_blocks map;
+ int inline_size;
+
+ inline_size = ext4_get_inline_size(inode);
+ buf = kmalloc(inline_size, GFP_NOFS);
+ if (!buf) {
+ error = -ENOMEM;
+ goto out;
+ }
+
+ error = ext4_read_inline_data(inode, buf, inline_size, iloc);
+ if (error < 0)
+ goto out;
+
+ error = ext4_destroy_inline_data_nolock(handle, inode);
+ if (error)
+ goto out;
+
+ map.m_lblk = 0;
+ map.m_len = 1;
+ map.m_flags = 0;
+ error = ext4_map_blocks(handle, inode, &map, EXT4_GET_BLOCKS_CREATE);
+ if (error < 0)
+ goto out_restore;
+ if (!(map.m_flags & EXT4_MAP_MAPPED)) {
+ error = -EIO;
+ goto out_restore;
+ }
+
+ data_bh = sb_getblk(inode->i_sb, map.m_pblk);
+ if (!data_bh) {
+ error = -ENOMEM;
+ goto out_restore;
+ }
+
+ lock_buffer(data_bh);
+ error = ext4_journal_get_create_access(handle, data_bh);
+ if (error) {
+ unlock_buffer(data_bh);
+ error = -EIO;
+ goto out_restore;
+ }
+ memset(data_bh->b_data, 0, inode->i_sb->s_blocksize);
+
+ if (!S_ISDIR(inode->i_mode)) {
+ memcpy(data_bh->b_data, buf, inline_size);
+ set_buffer_uptodate(data_bh);
+ error = ext4_handle_dirty_metadata(handle,
+ inode, data_bh);
+ } else {
+ error = ext4_finish_convert_inline_dir(handle, inode, data_bh,
+ buf, inline_size);
+ }
+
+ unlock_buffer(data_bh);
+out_restore:
+ if (error)
+ ext4_restore_inline_data(handle, inode, iloc, buf, inline_size);
+
+out:
+ brelse(data_bh);
+ kfree(buf);
+ return error;
+}
+
+/*
+ * Try to add the new entry to the inline data.
+ * If succeeds, return 0. If not, extended the inline dir and copied data to
+ * the new created block.
+ */
+int ext4_try_add_inline_entry(handle_t *handle, struct dentry *dentry,
+ struct inode *inode)
+{
+ int ret, inline_size;
+ void *inline_start;
+ struct ext4_iloc iloc;
+ struct inode *dir = dentry->d_parent->d_inode;
+
+ ret = ext4_get_inode_loc(dir, &iloc);
+ if (ret)
+ return ret;
+
+ down_write(&EXT4_I(dir)->xattr_sem);
+ if (!ext4_has_inline_data(dir))
+ goto out;
+
+ inline_start = (void *)ext4_raw_inode(&iloc)->i_block +
+ EXT4_INLINE_DOTDOT_SIZE;
+ inline_size = EXT4_MIN_INLINE_DATA_SIZE - EXT4_INLINE_DOTDOT_SIZE;
+
+ ret = ext4_add_dirent_to_inline(handle, dentry, inode, &iloc,
+ inline_start, inline_size);
+ if (ret != -ENOSPC)
+ goto out;
+
+ /* check whether it can be inserted to inline xattr space. */
+ inline_size = EXT4_I(dir)->i_inline_size -
+ EXT4_MIN_INLINE_DATA_SIZE;
+ if (!inline_size) {
+ /* Try to use the xattr space.*/
+ ret = ext4_update_inline_dir(handle, dir, &iloc);
+ if (ret && ret != -ENOSPC)
+ goto out;
+
+ inline_size = EXT4_I(dir)->i_inline_size -
+ EXT4_MIN_INLINE_DATA_SIZE;
+ }
+
+ if (inline_size) {
+ inline_start = ext4_get_inline_xattr_pos(dir, &iloc);
+
+ ret = ext4_add_dirent_to_inline(handle, dentry, inode, &iloc,
+ inline_start, inline_size);
+
+ if (ret != -ENOSPC)
+ goto out;
+ }
+
+ /*
+ * The inline space is filled up, so create a new block for it.
+ * As the extent tree will be created, we have to save the inline
+ * dir first.
+ */
+ ret = ext4_convert_inline_data_nolock(handle, dir, &iloc);
+
+out:
+ ext4_mark_inode_dirty(handle, dir);
+ up_write(&EXT4_I(dir)->xattr_sem);
+ brelse(iloc.bh);
+ return ret;
+}
+
+/*
+ * This function fills a red-black tree with information from an
+ * inlined dir. It returns the number directory entries loaded
+ * into the tree. If there is an error it is returned in err.
+ */
+int htree_inlinedir_to_tree(struct file *dir_file,
+ struct inode *dir, ext4_lblk_t block,
+ struct dx_hash_info *hinfo,
+ __u32 start_hash, __u32 start_minor_hash,
+ int *has_inline_data)
+{
+ int err = 0, count = 0;
+ unsigned int parent_ino;
+ int pos;
+ struct ext4_dir_entry_2 *de;
+ struct inode *inode = file_inode(dir_file);
+ int ret, inline_size = 0;
+ struct ext4_iloc iloc;
+ void *dir_buf = NULL;
+ struct ext4_dir_entry_2 fake;
+
+ ret = ext4_get_inode_loc(inode, &iloc);
+ if (ret)
+ return ret;
+
+ down_read(&EXT4_I(inode)->xattr_sem);
+ if (!ext4_has_inline_data(inode)) {
+ up_read(&EXT4_I(inode)->xattr_sem);
+ *has_inline_data = 0;
+ goto out;
+ }
+
+ inline_size = ext4_get_inline_size(inode);
+ dir_buf = kmalloc(inline_size, GFP_NOFS);
+ if (!dir_buf) {
+ ret = -ENOMEM;
+ up_read(&EXT4_I(inode)->xattr_sem);
+ goto out;
+ }
+
+ ret = ext4_read_inline_data(inode, dir_buf, inline_size, &iloc);
+ up_read(&EXT4_I(inode)->xattr_sem);
+ if (ret < 0)
+ goto out;
+
+ pos = 0;
+ parent_ino = le32_to_cpu(((struct ext4_dir_entry_2 *)dir_buf)->inode);
+ while (pos < inline_size) {
+ /*
+ * As inlined dir doesn't store any information about '.' and
+ * only the inode number of '..' is stored, we have to handle
+ * them differently.
+ */
+ if (pos == 0) {
+ fake.inode = cpu_to_le32(inode->i_ino);
+ fake.name_len = 1;
+ strcpy(fake.name, ".");
+ fake.rec_len = ext4_rec_len_to_disk(
+ EXT4_DIR_REC_LEN(fake.name_len),
+ inline_size);
+ ext4_set_de_type(inode->i_sb, &fake, S_IFDIR);
+ de = &fake;
+ pos = EXT4_INLINE_DOTDOT_OFFSET;
+ } else if (pos == EXT4_INLINE_DOTDOT_OFFSET) {
+ fake.inode = cpu_to_le32(parent_ino);
+ fake.name_len = 2;
+ strcpy(fake.name, "..");
+ fake.rec_len = ext4_rec_len_to_disk(
+ EXT4_DIR_REC_LEN(fake.name_len),
+ inline_size);
+ ext4_set_de_type(inode->i_sb, &fake, S_IFDIR);
+ de = &fake;
+ pos = EXT4_INLINE_DOTDOT_SIZE;
+ } else {
+ de = (struct ext4_dir_entry_2 *)(dir_buf + pos);
+ pos += ext4_rec_len_from_disk(de->rec_len, inline_size);
+ if (ext4_check_dir_entry(inode, dir_file, de,
+ iloc.bh, dir_buf,
+ inline_size, pos)) {
+ ret = count;
+ goto out;
+ }
+ }
+
+ ext4fs_dirhash(de->name, de->name_len, hinfo);
+ if ((hinfo->hash < start_hash) ||
+ ((hinfo->hash == start_hash) &&
+ (hinfo->minor_hash < start_minor_hash)))
+ continue;
+ if (de->inode == 0)
+ continue;
+ err = ext4_htree_store_dirent(dir_file,
+ hinfo->hash, hinfo->minor_hash, de);
+ if (err) {
+ count = err;
+ goto out;
+ }
+ count++;
+ }
+ ret = count;
+out:
+ kfree(dir_buf);
+ brelse(iloc.bh);
+ return ret;
+}
+
+/*
+ * So this function is called when the volume is mkfsed with
+ * dir_index disabled. In order to keep f_pos persistent
+ * after we convert from an inlined dir to a blocked based,
+ * we just pretend that we are a normal dir and return the
+ * offset as if '.' and '..' really take place.
+ *
+ */
+int ext4_read_inline_dir(struct file *file,
+ struct dir_context *ctx,
+ int *has_inline_data)
+{
+ unsigned int offset, parent_ino;
+ int i;
+ struct ext4_dir_entry_2 *de;
+ struct super_block *sb;
+ struct inode *inode = file_inode(file);
+ int ret, inline_size = 0;
+ struct ext4_iloc iloc;
+ void *dir_buf = NULL;
+ int dotdot_offset, dotdot_size, extra_offset, extra_size;
+
+ ret = ext4_get_inode_loc(inode, &iloc);
+ if (ret)
+ return ret;
+
+ down_read(&EXT4_I(inode)->xattr_sem);
+ if (!ext4_has_inline_data(inode)) {
+ up_read(&EXT4_I(inode)->xattr_sem);
+ *has_inline_data = 0;
+ goto out;
+ }
+
+ inline_size = ext4_get_inline_size(inode);
+ dir_buf = kmalloc(inline_size, GFP_NOFS);
+ if (!dir_buf) {
+ ret = -ENOMEM;
+ up_read(&EXT4_I(inode)->xattr_sem);
+ goto out;
+ }
+
+ ret = ext4_read_inline_data(inode, dir_buf, inline_size, &iloc);
+ up_read(&EXT4_I(inode)->xattr_sem);
+ if (ret < 0)
+ goto out;
+
+ ret = 0;
+ sb = inode->i_sb;
+ parent_ino = le32_to_cpu(((struct ext4_dir_entry_2 *)dir_buf)->inode);
+ offset = ctx->pos;
+
+ /*
+ * dotdot_offset and dotdot_size is the real offset and
+ * size for ".." and "." if the dir is block based while
+ * the real size for them are only EXT4_INLINE_DOTDOT_SIZE.
+ * So we will use extra_offset and extra_size to indicate them
+ * during the inline dir iteration.
+ */
+ dotdot_offset = EXT4_DIR_REC_LEN(1);
+ dotdot_size = dotdot_offset + EXT4_DIR_REC_LEN(2);
+ extra_offset = dotdot_size - EXT4_INLINE_DOTDOT_SIZE;
+ extra_size = extra_offset + inline_size;
+
+ /*
+ * If the version has changed since the last call to
+ * readdir(2), then we might be pointing to an invalid
+ * dirent right now. Scan from the start of the inline
+ * dir to make sure.
+ */
+ if (file->f_version != inode->i_version) {
+ for (i = 0; i < extra_size && i < offset;) {
+ /*
+ * "." is with offset 0 and
+ * ".." is dotdot_offset.
+ */
+ if (!i) {
+ i = dotdot_offset;
+ continue;
+ } else if (i == dotdot_offset) {
+ i = dotdot_size;
+ continue;
+ }
+ /* for other entry, the real offset in
+ * the buf has to be tuned accordingly.
+ */
+ de = (struct ext4_dir_entry_2 *)
+ (dir_buf + i - extra_offset);
+ /* It's too expensive to do a full
+ * dirent test each time round this
+ * loop, but we do have to test at
+ * least that it is non-zero. A
+ * failure will be detected in the
+ * dirent test below. */
+ if (ext4_rec_len_from_disk(de->rec_len, extra_size)
+ < EXT4_DIR_REC_LEN(1))
+ break;
+ i += ext4_rec_len_from_disk(de->rec_len,
+ extra_size);
+ }
+ offset = i;
+ ctx->pos = offset;
+ file->f_version = inode->i_version;
+ }
+
+ while (ctx->pos < extra_size) {
+ if (ctx->pos == 0) {
+ if (!dir_emit(ctx, ".", 1, inode->i_ino, DT_DIR))
+ goto out;
+ ctx->pos = dotdot_offset;
+ continue;
+ }
+
+ if (ctx->pos == dotdot_offset) {
+ if (!dir_emit(ctx, "..", 2, parent_ino, DT_DIR))
+ goto out;
+ ctx->pos = dotdot_size;
+ continue;
+ }
+
+ de = (struct ext4_dir_entry_2 *)
+ (dir_buf + ctx->pos - extra_offset);
+ if (ext4_check_dir_entry(inode, file, de, iloc.bh, dir_buf,
+ extra_size, ctx->pos))
+ goto out;
+ if (le32_to_cpu(de->inode)) {
+ if (!dir_emit(ctx, de->name, de->name_len,
+ le32_to_cpu(de->inode),
+ get_dtype(sb, de->file_type)))
+ goto out;
+ }
+ ctx->pos += ext4_rec_len_from_disk(de->rec_len, extra_size);
+ }
+out:
+ kfree(dir_buf);
+ brelse(iloc.bh);
+ return ret;
+}
+
+struct buffer_head *ext4_get_first_inline_block(struct inode *inode,
+ struct ext4_dir_entry_2 **parent_de,
+ int *retval)
+{
+ struct ext4_iloc iloc;
+
+ *retval = ext4_get_inode_loc(inode, &iloc);
+ if (*retval)
+ return NULL;
+
+ *parent_de = (struct ext4_dir_entry_2 *)ext4_raw_inode(&iloc)->i_block;
+
+ return iloc.bh;
+}
+
+/*
+ * Try to create the inline data for the new dir.
+ * If it succeeds, return 0, otherwise return the error.
+ * In case of ENOSPC, the caller should create the normal disk layout dir.
+ */
+int ext4_try_create_inline_dir(handle_t *handle, struct inode *parent,
+ struct inode *inode)
+{
+ int ret, inline_size = EXT4_MIN_INLINE_DATA_SIZE;
+ struct ext4_iloc iloc;
+ struct ext4_dir_entry_2 *de;
+
+ ret = ext4_get_inode_loc(inode, &iloc);
+ if (ret)
+ return ret;
+
+ ret = ext4_prepare_inline_data(handle, inode, inline_size);
+ if (ret)
+ goto out;
+
+ /*
+ * For inline dir, we only save the inode information for the ".."
+ * and create a fake dentry to cover the left space.
+ */
+ de = (struct ext4_dir_entry_2 *)ext4_raw_inode(&iloc)->i_block;
+ de->inode = cpu_to_le32(parent->i_ino);
+ de = (struct ext4_dir_entry_2 *)((void *)de + EXT4_INLINE_DOTDOT_SIZE);
+ de->inode = 0;
+ de->rec_len = ext4_rec_len_to_disk(
+ inline_size - EXT4_INLINE_DOTDOT_SIZE,
+ inline_size);
+ set_nlink(inode, 2);
+ inode->i_size = EXT4_I(inode)->i_disksize = inline_size;
+out:
+ brelse(iloc.bh);
+ return ret;
+}
+
+struct buffer_head *ext4_find_inline_entry(struct inode *dir,
+ const struct qstr *d_name,
+ struct ext4_dir_entry_2 **res_dir,
+ int *has_inline_data)
+{
+ int ret;
+ struct ext4_iloc iloc;
+ void *inline_start;
+ int inline_size;
+
+ if (ext4_get_inode_loc(dir, &iloc))
+ return NULL;
+
+ down_read(&EXT4_I(dir)->xattr_sem);
+ if (!ext4_has_inline_data(dir)) {
+ *has_inline_data = 0;
+ goto out;
+ }
+
+ inline_start = (void *)ext4_raw_inode(&iloc)->i_block +
+ EXT4_INLINE_DOTDOT_SIZE;
+ inline_size = EXT4_MIN_INLINE_DATA_SIZE - EXT4_INLINE_DOTDOT_SIZE;
+ ret = search_dir(iloc.bh, inline_start, inline_size,
+ dir, d_name, 0, res_dir);
+ if (ret == 1)
+ goto out_find;
+ if (ret < 0)
+ goto out;
+
+ if (ext4_get_inline_size(dir) == EXT4_MIN_INLINE_DATA_SIZE)
+ goto out;
+
+ inline_start = ext4_get_inline_xattr_pos(dir, &iloc);
+ inline_size = ext4_get_inline_size(dir) - EXT4_MIN_INLINE_DATA_SIZE;
+
+ ret = search_dir(iloc.bh, inline_start, inline_size,
+ dir, d_name, 0, res_dir);
+ if (ret == 1)
+ goto out_find;
+
+out:
+ brelse(iloc.bh);
+ iloc.bh = NULL;
+out_find:
+ up_read(&EXT4_I(dir)->xattr_sem);
+ return iloc.bh;
+}
+
+int ext4_delete_inline_entry(handle_t *handle,
+ struct inode *dir,
+ struct ext4_dir_entry_2 *de_del,
+ struct buffer_head *bh,
+ int *has_inline_data)
+{
+ int err, inline_size;
+ struct ext4_iloc iloc;
+ void *inline_start;
+
+ err = ext4_get_inode_loc(dir, &iloc);
+ if (err)
+ return err;
+
+ down_write(&EXT4_I(dir)->xattr_sem);
+ if (!ext4_has_inline_data(dir)) {
+ *has_inline_data = 0;
+ goto out;
+ }
+
+ if ((void *)de_del - ((void *)ext4_raw_inode(&iloc)->i_block) <
+ EXT4_MIN_INLINE_DATA_SIZE) {
+ inline_start = (void *)ext4_raw_inode(&iloc)->i_block +
+ EXT4_INLINE_DOTDOT_SIZE;
+ inline_size = EXT4_MIN_INLINE_DATA_SIZE -
+ EXT4_INLINE_DOTDOT_SIZE;
+ } else {
+ inline_start = ext4_get_inline_xattr_pos(dir, &iloc);
+ inline_size = ext4_get_inline_size(dir) -
+ EXT4_MIN_INLINE_DATA_SIZE;
+ }
+
+ BUFFER_TRACE(bh, "get_write_access");
+ err = ext4_journal_get_write_access(handle, bh);
+ if (err)
+ goto out;
+
+ err = ext4_generic_delete_entry(handle, dir, de_del, bh,
+ inline_start, inline_size, 0);
+ if (err)
+ goto out;
+
+ BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
+ err = ext4_mark_inode_dirty(handle, dir);
+ if (unlikely(err))
+ goto out;
+
+ ext4_show_inline_dir(dir, iloc.bh, inline_start, inline_size);
+out:
+ up_write(&EXT4_I(dir)->xattr_sem);
+ brelse(iloc.bh);
+ if (err != -ENOENT)
+ ext4_std_error(dir->i_sb, err);
+ return err;
+}
+
+/*
+ * Get the inline dentry at offset.
+ */
+static inline struct ext4_dir_entry_2 *
+ext4_get_inline_entry(struct inode *inode,
+ struct ext4_iloc *iloc,
+ unsigned int offset,
+ void **inline_start,
+ int *inline_size)
+{
+ void *inline_pos;
+
+ BUG_ON(offset > ext4_get_inline_size(inode));
+
+ if (offset < EXT4_MIN_INLINE_DATA_SIZE) {
+ inline_pos = (void *)ext4_raw_inode(iloc)->i_block;
+ *inline_size = EXT4_MIN_INLINE_DATA_SIZE;
+ } else {
+ inline_pos = ext4_get_inline_xattr_pos(inode, iloc);
+ offset -= EXT4_MIN_INLINE_DATA_SIZE;
+ *inline_size = ext4_get_inline_size(inode) -
+ EXT4_MIN_INLINE_DATA_SIZE;
+ }
+
+ if (inline_start)
+ *inline_start = inline_pos;
+ return (struct ext4_dir_entry_2 *)(inline_pos + offset);
+}
+
+int empty_inline_dir(struct inode *dir, int *has_inline_data)
+{
+ int err, inline_size;
+ struct ext4_iloc iloc;
+ void *inline_pos;
+ unsigned int offset;
+ struct ext4_dir_entry_2 *de;
+ int ret = 1;
+
+ err = ext4_get_inode_loc(dir, &iloc);
+ if (err) {
+ EXT4_ERROR_INODE(dir, "error %d getting inode %lu block",
+ err, dir->i_ino);
+ return 1;
+ }
+
+ down_read(&EXT4_I(dir)->xattr_sem);
+ if (!ext4_has_inline_data(dir)) {
+ *has_inline_data = 0;
+ goto out;
+ }
+
+ de = (struct ext4_dir_entry_2 *)ext4_raw_inode(&iloc)->i_block;
+ if (!le32_to_cpu(de->inode)) {
+ ext4_warning(dir->i_sb,
+ "bad inline directory (dir #%lu) - no `..'",
+ dir->i_ino);
+ ret = 1;
+ goto out;
+ }
+
+ offset = EXT4_INLINE_DOTDOT_SIZE;
+ while (offset < dir->i_size) {
+ de = ext4_get_inline_entry(dir, &iloc, offset,
+ &inline_pos, &inline_size);
+ if (ext4_check_dir_entry(dir, NULL, de,
+ iloc.bh, inline_pos,
+ inline_size, offset)) {
+ ext4_warning(dir->i_sb,
+ "bad inline directory (dir #%lu) - "
+ "inode %u, rec_len %u, name_len %d"
+ "inline size %d\n",
+ dir->i_ino, le32_to_cpu(de->inode),
+ le16_to_cpu(de->rec_len), de->name_len,
+ inline_size);
+ ret = 1;
+ goto out;
+ }
+ if (le32_to_cpu(de->inode)) {
+ ret = 0;
+ goto out;
+ }
+ offset += ext4_rec_len_from_disk(de->rec_len, inline_size);
+ }
+
+out:
+ up_read(&EXT4_I(dir)->xattr_sem);
+ brelse(iloc.bh);
+ return ret;
+}
+
+int ext4_destroy_inline_data(handle_t *handle, struct inode *inode)
+{
+ int ret;
+
+ down_write(&EXT4_I(inode)->xattr_sem);
+ ret = ext4_destroy_inline_data_nolock(handle, inode);
+ up_write(&EXT4_I(inode)->xattr_sem);
+
+ return ret;
+}
+
+int ext4_inline_data_fiemap(struct inode *inode,
+ struct fiemap_extent_info *fieinfo,
+ int *has_inline)
+{
+ __u64 physical = 0;
+ __u64 length;
+ __u32 flags = FIEMAP_EXTENT_DATA_INLINE | FIEMAP_EXTENT_LAST;
+ int error = 0;
+ struct ext4_iloc iloc;
+
+ down_read(&EXT4_I(inode)->xattr_sem);
+ if (!ext4_has_inline_data(inode)) {
+ *has_inline = 0;
+ goto out;
+ }
+
+ error = ext4_get_inode_loc(inode, &iloc);
+ if (error)
+ goto out;
+
+ physical = (__u64)iloc.bh->b_blocknr << inode->i_sb->s_blocksize_bits;
+ physical += (char *)ext4_raw_inode(&iloc) - iloc.bh->b_data;
+ physical += offsetof(struct ext4_inode, i_block);
+ length = i_size_read(inode);
+
+ if (physical)
+ error = fiemap_fill_next_extent(fieinfo, 0, physical,
+ length, flags);
+ brelse(iloc.bh);
+out:
+ up_read(&EXT4_I(inode)->xattr_sem);
+ return (error < 0 ? error : 0);
+}
+
+/*
+ * Called during xattr set, and if we can sparse space 'needed',
+ * just create the extent tree evict the data to the outer block.
+ *
+ * We use jbd2 instead of page cache to move data to the 1st block
+ * so that the whole transaction can be committed as a whole and
+ * the data isn't lost because of the delayed page cache write.
+ */
+int ext4_try_to_evict_inline_data(handle_t *handle,
+ struct inode *inode,
+ int needed)
+{
+ int error;
+ struct ext4_xattr_entry *entry;
+ struct ext4_inode *raw_inode;
+ struct ext4_iloc iloc;
+
+ error = ext4_get_inode_loc(inode, &iloc);
+ if (error)
+ return error;
+
+ raw_inode = ext4_raw_inode(&iloc);
+ entry = (struct ext4_xattr_entry *)((void *)raw_inode +
+ EXT4_I(inode)->i_inline_off);
+ if (EXT4_XATTR_LEN(entry->e_name_len) +
+ EXT4_XATTR_SIZE(le32_to_cpu(entry->e_value_size)) < needed) {
+ error = -ENOSPC;
+ goto out;
+ }
+
+ error = ext4_convert_inline_data_nolock(handle, inode, &iloc);
+out:
+ brelse(iloc.bh);
+ return error;
+}
+
+void ext4_inline_data_truncate(struct inode *inode, int *has_inline)
+{
+ handle_t *handle;
+ int inline_size, value_len, needed_blocks;
+ size_t i_size;
+ void *value = NULL;
+ struct ext4_xattr_ibody_find is = {
+ .s = { .not_found = -ENODATA, },
+ };
+ struct ext4_xattr_info i = {
+ .name_index = EXT4_XATTR_INDEX_SYSTEM,
+ .name = EXT4_XATTR_SYSTEM_DATA,
+ };
+
+
+ needed_blocks = ext4_writepage_trans_blocks(inode);
+ handle = ext4_journal_start(inode, EXT4_HT_INODE, needed_blocks);
+ if (IS_ERR(handle))
+ return;
+
+ down_write(&EXT4_I(inode)->xattr_sem);
+ if (!ext4_has_inline_data(inode)) {
+ *has_inline = 0;
+ ext4_journal_stop(handle);
+ return;
+ }
+
+ if (ext4_orphan_add(handle, inode))
+ goto out;
+
+ if (ext4_get_inode_loc(inode, &is.iloc))
+ goto out;
+
+ down_write(&EXT4_I(inode)->i_data_sem);
+ i_size = inode->i_size;
+ inline_size = ext4_get_inline_size(inode);
+ EXT4_I(inode)->i_disksize = i_size;
+
+ if (i_size < inline_size) {
+ /* Clear the content in the xattr space. */
+ if (inline_size > EXT4_MIN_INLINE_DATA_SIZE) {
+ if (ext4_xattr_ibody_find(inode, &i, &is))
+ goto out_error;
+
+ BUG_ON(is.s.not_found);
+
+ value_len = le32_to_cpu(is.s.here->e_value_size);
+ value = kmalloc(value_len, GFP_NOFS);
+ if (!value)
+ goto out_error;
+
+ if (ext4_xattr_ibody_get(inode, i.name_index, i.name,
+ value, value_len))
+ goto out_error;
+
+ i.value = value;
+ i.value_len = i_size > EXT4_MIN_INLINE_DATA_SIZE ?
+ i_size - EXT4_MIN_INLINE_DATA_SIZE : 0;
+ if (ext4_xattr_ibody_inline_set(handle, inode, &i, &is))
+ goto out_error;
+ }
+
+ /* Clear the content within i_blocks. */
+ if (i_size < EXT4_MIN_INLINE_DATA_SIZE) {
+ void *p = (void *) ext4_raw_inode(&is.iloc)->i_block;
+ memset(p + i_size, 0,
+ EXT4_MIN_INLINE_DATA_SIZE - i_size);
+ }
+
+ EXT4_I(inode)->i_inline_size = i_size <
+ EXT4_MIN_INLINE_DATA_SIZE ?
+ EXT4_MIN_INLINE_DATA_SIZE : i_size;
+ }
+
+out_error:
+ up_write(&EXT4_I(inode)->i_data_sem);
+out:
+ brelse(is.iloc.bh);
+ up_write(&EXT4_I(inode)->xattr_sem);
+ kfree(value);
+ if (inode->i_nlink)
+ ext4_orphan_del(handle, inode);
+
+ inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
+ ext4_mark_inode_dirty(handle, inode);
+ if (IS_SYNC(inode))
+ ext4_handle_sync(handle);
+
+ ext4_journal_stop(handle);
+ return;
+}
+
+int ext4_convert_inline_data(struct inode *inode)
+{
+ int error, needed_blocks;
+ handle_t *handle;
+ struct ext4_iloc iloc;
+
+ if (!ext4_has_inline_data(inode)) {
+ ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
+ return 0;
+ }
+
+ needed_blocks = ext4_writepage_trans_blocks(inode);
+
+ iloc.bh = NULL;
+ error = ext4_get_inode_loc(inode, &iloc);
+ if (error)
+ return error;
+
+ handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, needed_blocks);
+ if (IS_ERR(handle)) {
+ error = PTR_ERR(handle);
+ goto out_free;
+ }
+
+ down_write(&EXT4_I(inode)->xattr_sem);
+ if (!ext4_has_inline_data(inode)) {
+ up_write(&EXT4_I(inode)->xattr_sem);
+ goto out;
+ }
+
+ error = ext4_convert_inline_data_nolock(handle, inode, &iloc);
+ up_write(&EXT4_I(inode)->xattr_sem);
+out:
+ ext4_journal_stop(handle);
+out_free:
+ brelse(iloc.bh);
+ return error;
+}
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index bdbe6990220..8a064734e6e 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -12,17 +12,12 @@
*
* Copyright (C) 1991, 1992 Linus Torvalds
*
- * Goal-directed block allocation by Stephen Tweedie
- * (sct@redhat.com), 1993, 1998
- * Big-endian to little-endian byte-swapping/bitmaps by
- * David S. Miller (davem@caip.rutgers.edu), 1995
* 64-bit file support on 64-bit platforms by Jakub Jelinek
* (jj@sunsite.ms.mff.cuni.cz)
*
* Assorted race fixes, rewrite of ext4_get_block() by Al Viro, 2000
*/
-#include <linux/module.h>
#include <linux/fs.h>
#include <linux/time.h>
#include <linux/jbd2.h>
@@ -39,110 +34,124 @@
#include <linux/bio.h>
#include <linux/workqueue.h>
#include <linux/kernel.h>
+#include <linux/printk.h>
#include <linux/slab.h>
+#include <linux/ratelimit.h>
+#include <linux/aio.h>
+#include <linux/bitops.h>
#include "ext4_jbd2.h"
#include "xattr.h"
#include "acl.h"
-#include "ext4_extents.h"
+#include "truncate.h"
#include <trace/events/ext4.h>
#define MPAGE_DA_EXTENT_TAIL 0x01
-static inline int ext4_begin_ordered_truncate(struct inode *inode,
- loff_t new_size)
+static __u32 ext4_inode_csum(struct inode *inode, struct ext4_inode *raw,
+ struct ext4_inode_info *ei)
{
- trace_ext4_begin_ordered_truncate(inode, new_size);
- return jbd2_journal_begin_ordered_truncate(
- EXT4_SB(inode->i_sb)->s_journal,
- &EXT4_I(inode)->jinode,
- new_size);
-}
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ __u16 csum_lo;
+ __u16 csum_hi = 0;
+ __u32 csum;
-static void ext4_invalidatepage(struct page *page, unsigned long offset);
-static int noalloc_get_block_write(struct inode *inode, sector_t iblock,
- struct buffer_head *bh_result, int create);
-static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode);
-static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate);
-static int __ext4_journalled_writepage(struct page *page, unsigned int len);
-static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh);
+ csum_lo = le16_to_cpu(raw->i_checksum_lo);
+ raw->i_checksum_lo = 0;
+ if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE &&
+ EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi)) {
+ csum_hi = le16_to_cpu(raw->i_checksum_hi);
+ raw->i_checksum_hi = 0;
+ }
-/*
- * Test whether an inode is a fast symlink.
- */
-static int ext4_inode_is_fast_symlink(struct inode *inode)
-{
- int ea_blocks = EXT4_I(inode)->i_file_acl ?
- (inode->i_sb->s_blocksize >> 9) : 0;
+ csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)raw,
+ EXT4_INODE_SIZE(inode->i_sb));
- return (S_ISLNK(inode->i_mode) && inode->i_blocks - ea_blocks == 0);
+ raw->i_checksum_lo = cpu_to_le16(csum_lo);
+ if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE &&
+ EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi))
+ raw->i_checksum_hi = cpu_to_le16(csum_hi);
+
+ return csum;
}
-/*
- * Work out how many blocks we need to proceed with the next chunk of a
- * truncate transaction.
- */
-static unsigned long blocks_for_truncate(struct inode *inode)
+static int ext4_inode_csum_verify(struct inode *inode, struct ext4_inode *raw,
+ struct ext4_inode_info *ei)
{
- ext4_lblk_t needed;
-
- needed = inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9);
+ __u32 provided, calculated;
- /* Give ourselves just enough room to cope with inodes in which
- * i_blocks is corrupt: we've seen disk corruptions in the past
- * which resulted in random data in an inode which looked enough
- * like a regular file for ext4 to try to delete it. Things
- * will go a bit crazy if that happens, but at least we should
- * try not to panic the whole kernel. */
- if (needed < 2)
- needed = 2;
+ if (EXT4_SB(inode->i_sb)->s_es->s_creator_os !=
+ cpu_to_le32(EXT4_OS_LINUX) ||
+ !EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
+ EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+ return 1;
- /* But we need to bound the transaction so we don't overflow the
- * journal. */
- if (needed > EXT4_MAX_TRANS_DATA)
- needed = EXT4_MAX_TRANS_DATA;
+ provided = le16_to_cpu(raw->i_checksum_lo);
+ calculated = ext4_inode_csum(inode, raw, ei);
+ if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE &&
+ EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi))
+ provided |= ((__u32)le16_to_cpu(raw->i_checksum_hi)) << 16;
+ else
+ calculated &= 0xFFFF;
- return EXT4_DATA_TRANS_BLOCKS(inode->i_sb) + needed;
+ return provided == calculated;
}
-/*
- * Truncate transactions can be complex and absolutely huge. So we need to
- * be able to restart the transaction at a conventient checkpoint to make
- * sure we don't overflow the journal.
- *
- * start_transaction gets us a new handle for a truncate transaction,
- * and extend_transaction tries to extend the existing one a bit. If
- * extend fails, we need to propagate the failure up and restart the
- * transaction in the top-level truncate loop. --sct
- */
-static handle_t *start_transaction(struct inode *inode)
+static void ext4_inode_csum_set(struct inode *inode, struct ext4_inode *raw,
+ struct ext4_inode_info *ei)
{
- handle_t *result;
+ __u32 csum;
- result = ext4_journal_start(inode, blocks_for_truncate(inode));
- if (!IS_ERR(result))
- return result;
+ if (EXT4_SB(inode->i_sb)->s_es->s_creator_os !=
+ cpu_to_le32(EXT4_OS_LINUX) ||
+ !EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
+ EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+ return;
- ext4_std_error(inode->i_sb, PTR_ERR(result));
- return result;
+ csum = ext4_inode_csum(inode, raw, ei);
+ raw->i_checksum_lo = cpu_to_le16(csum & 0xFFFF);
+ if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE &&
+ EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi))
+ raw->i_checksum_hi = cpu_to_le16(csum >> 16);
}
+static inline int ext4_begin_ordered_truncate(struct inode *inode,
+ loff_t new_size)
+{
+ trace_ext4_begin_ordered_truncate(inode, new_size);
+ /*
+ * If jinode is zero, then we never opened the file for
+ * writing, so there's no need to call
+ * jbd2_journal_begin_ordered_truncate() since there's no
+ * outstanding writes we need to flush.
+ */
+ if (!EXT4_I(inode)->jinode)
+ return 0;
+ return jbd2_journal_begin_ordered_truncate(EXT4_JOURNAL(inode),
+ EXT4_I(inode)->jinode,
+ new_size);
+}
+
+static void ext4_invalidatepage(struct page *page, unsigned int offset,
+ unsigned int length);
+static int __ext4_journalled_writepage(struct page *page, unsigned int len);
+static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh);
+static int ext4_meta_trans_blocks(struct inode *inode, int lblocks,
+ int pextents);
+
/*
- * Try to extend this transaction for the purposes of truncation.
- *
- * Returns 0 if we managed to create more room. If we can't create more
- * room, and the transaction must be restarted we return 1.
+ * Test whether an inode is a fast symlink.
*/
-static int try_to_extend_transaction(handle_t *handle, struct inode *inode)
+static int ext4_inode_is_fast_symlink(struct inode *inode)
{
- if (!ext4_handle_valid(handle))
- return 0;
- if (ext4_handle_has_enough_credits(handle, EXT4_RESERVE_TRANS_BLOCKS+1))
- return 0;
- if (!ext4_journal_extend(handle, blocks_for_truncate(inode)))
+ int ea_blocks = EXT4_I(inode)->i_file_acl ?
+ EXT4_CLUSTER_SIZE(inode->i_sb) >> 9 : 0;
+
+ if (ext4_has_inline_data(inode))
return 0;
- return 1;
+
+ return (S_ISLNK(inode->i_mode) && inode->i_blocks - ea_blocks == 0);
}
/*
@@ -164,7 +173,7 @@ int ext4_truncate_restart_trans(handle_t *handle, struct inode *inode,
BUG_ON(EXT4_JOURNAL(inode) == NULL);
jbd_debug(2, "restarting handle %p\n", handle);
up_write(&EXT4_I(inode)->i_data_sem);
- ret = ext4_journal_restart(handle, blocks_for_truncate(inode));
+ ret = ext4_journal_restart(handle, nblocks);
down_write(&EXT4_I(inode)->i_data_sem);
ext4_discard_preallocations(inode);
@@ -180,8 +189,38 @@ void ext4_evict_inode(struct inode *inode)
int err;
trace_ext4_evict_inode(inode);
+
if (inode->i_nlink) {
- truncate_inode_pages(&inode->i_data, 0);
+ /*
+ * When journalling data dirty buffers are tracked only in the
+ * journal. So although mm thinks everything is clean and
+ * ready for reaping the inode might still have some pages to
+ * write in the running transaction or waiting to be
+ * checkpointed. Thus calling jbd2_journal_invalidatepage()
+ * (via truncate_inode_pages()) to discard these buffers can
+ * cause data loss. Also even if we did not discard these
+ * buffers, we would have no way to find them after the inode
+ * is reaped and thus user could see stale data if he tries to
+ * read them before the transaction is checkpointed. So be
+ * careful and force everything to disk here... We use
+ * ei->i_datasync_tid to store the newest transaction
+ * containing inode's data.
+ *
+ * Note that directories do not have this problem because they
+ * don't use page cache.
+ */
+ if (ext4_should_journal_data(inode) &&
+ (S_ISLNK(inode->i_mode) || S_ISREG(inode->i_mode)) &&
+ inode->i_ino != EXT4_JOURNAL_INO) {
+ journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
+ tid_t commit_tid = EXT4_I(inode)->i_datasync_tid;
+
+ jbd2_complete_transaction(journal, commit_tid);
+ filemap_write_and_wait(&inode->i_data);
+ }
+ truncate_inode_pages_final(&inode->i_data);
+
+ WARN_ON(atomic_read(&EXT4_I(inode)->i_ioend_count));
goto no_delete;
}
@@ -190,12 +229,19 @@ void ext4_evict_inode(struct inode *inode)
if (ext4_should_order_data(inode))
ext4_begin_ordered_truncate(inode, 0);
- truncate_inode_pages(&inode->i_data, 0);
+ truncate_inode_pages_final(&inode->i_data);
+ WARN_ON(atomic_read(&EXT4_I(inode)->i_ioend_count));
if (is_bad_inode(inode))
goto no_delete;
- handle = ext4_journal_start(inode, blocks_for_truncate(inode)+3);
+ /*
+ * Protect us against freezing - iput() caller didn't have to have any
+ * protection against it
+ */
+ sb_start_intwrite(inode->i_sb);
+ handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE,
+ ext4_blocks_for_truncate(inode)+3);
if (IS_ERR(handle)) {
ext4_std_error(inode->i_sb, PTR_ERR(handle));
/*
@@ -204,6 +250,7 @@ void ext4_evict_inode(struct inode *inode)
* cleaned up.
*/
ext4_orphan_del(NULL, inode);
+ sb_end_intwrite(inode->i_sb);
goto no_delete;
}
@@ -235,6 +282,7 @@ void ext4_evict_inode(struct inode *inode)
stop_handle:
ext4_journal_stop(handle);
ext4_orphan_del(NULL, inode);
+ sb_end_intwrite(inode->i_sb);
goto no_delete;
}
}
@@ -263,786 +311,12 @@ void ext4_evict_inode(struct inode *inode)
else
ext4_free_inode(handle, inode);
ext4_journal_stop(handle);
+ sb_end_intwrite(inode->i_sb);
return;
no_delete:
ext4_clear_inode(inode); /* We must guarantee clearing of inode... */
}
-typedef struct {
- __le32 *p;
- __le32 key;
- struct buffer_head *bh;
-} Indirect;
-
-static inline void add_chain(Indirect *p, struct buffer_head *bh, __le32 *v)
-{
- p->key = *(p->p = v);
- p->bh = bh;
-}
-
-/**
- * ext4_block_to_path - parse the block number into array of offsets
- * @inode: inode in question (we are only interested in its superblock)
- * @i_block: block number to be parsed
- * @offsets: array to store the offsets in
- * @boundary: set this non-zero if the referred-to block is likely to be
- * followed (on disk) by an indirect block.
- *
- * To store the locations of file's data ext4 uses a data structure common
- * for UNIX filesystems - tree of pointers anchored in the inode, with
- * data blocks at leaves and indirect blocks in intermediate nodes.
- * This function translates the block number into path in that tree -
- * return value is the path length and @offsets[n] is the offset of
- * pointer to (n+1)th node in the nth one. If @block is out of range
- * (negative or too large) warning is printed and zero returned.
- *
- * Note: function doesn't find node addresses, so no IO is needed. All
- * we need to know is the capacity of indirect blocks (taken from the
- * inode->i_sb).
- */
-
-/*
- * Portability note: the last comparison (check that we fit into triple
- * indirect block) is spelled differently, because otherwise on an
- * architecture with 32-bit longs and 8Kb pages we might get into trouble
- * if our filesystem had 8Kb blocks. We might use long long, but that would
- * kill us on x86. Oh, well, at least the sign propagation does not matter -
- * i_block would have to be negative in the very beginning, so we would not
- * get there at all.
- */
-
-static int ext4_block_to_path(struct inode *inode,
- ext4_lblk_t i_block,
- ext4_lblk_t offsets[4], int *boundary)
-{
- int ptrs = EXT4_ADDR_PER_BLOCK(inode->i_sb);
- int ptrs_bits = EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb);
- const long direct_blocks = EXT4_NDIR_BLOCKS,
- indirect_blocks = ptrs,
- double_blocks = (1 << (ptrs_bits * 2));
- int n = 0;
- int final = 0;
-
- if (i_block < direct_blocks) {
- offsets[n++] = i_block;
- final = direct_blocks;
- } else if ((i_block -= direct_blocks) < indirect_blocks) {
- offsets[n++] = EXT4_IND_BLOCK;
- offsets[n++] = i_block;
- final = ptrs;
- } else if ((i_block -= indirect_blocks) < double_blocks) {
- offsets[n++] = EXT4_DIND_BLOCK;
- offsets[n++] = i_block >> ptrs_bits;
- offsets[n++] = i_block & (ptrs - 1);
- final = ptrs;
- } else if (((i_block -= double_blocks) >> (ptrs_bits * 2)) < ptrs) {
- offsets[n++] = EXT4_TIND_BLOCK;
- offsets[n++] = i_block >> (ptrs_bits * 2);
- offsets[n++] = (i_block >> ptrs_bits) & (ptrs - 1);
- offsets[n++] = i_block & (ptrs - 1);
- final = ptrs;
- } else {
- ext4_warning(inode->i_sb, "block %lu > max in inode %lu",
- i_block + direct_blocks +
- indirect_blocks + double_blocks, inode->i_ino);
- }
- if (boundary)
- *boundary = final - 1 - (i_block & (ptrs - 1));
- return n;
-}
-
-static int __ext4_check_blockref(const char *function, unsigned int line,
- struct inode *inode,
- __le32 *p, unsigned int max)
-{
- struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
- __le32 *bref = p;
- unsigned int blk;
-
- while (bref < p+max) {
- blk = le32_to_cpu(*bref++);
- if (blk &&
- unlikely(!ext4_data_block_valid(EXT4_SB(inode->i_sb),
- blk, 1))) {
- es->s_last_error_block = cpu_to_le64(blk);
- ext4_error_inode(inode, function, line, blk,
- "invalid block");
- return -EIO;
- }
- }
- return 0;
-}
-
-
-#define ext4_check_indirect_blockref(inode, bh) \
- __ext4_check_blockref(__func__, __LINE__, inode, \
- (__le32 *)(bh)->b_data, \
- EXT4_ADDR_PER_BLOCK((inode)->i_sb))
-
-#define ext4_check_inode_blockref(inode) \
- __ext4_check_blockref(__func__, __LINE__, inode, \
- EXT4_I(inode)->i_data, \
- EXT4_NDIR_BLOCKS)
-
-/**
- * ext4_get_branch - read the chain of indirect blocks leading to data
- * @inode: inode in question
- * @depth: depth of the chain (1 - direct pointer, etc.)
- * @offsets: offsets of pointers in inode/indirect blocks
- * @chain: place to store the result
- * @err: here we store the error value
- *
- * Function fills the array of triples <key, p, bh> and returns %NULL
- * if everything went OK or the pointer to the last filled triple
- * (incomplete one) otherwise. Upon the return chain[i].key contains
- * the number of (i+1)-th block in the chain (as it is stored in memory,
- * i.e. little-endian 32-bit), chain[i].p contains the address of that
- * number (it points into struct inode for i==0 and into the bh->b_data
- * for i>0) and chain[i].bh points to the buffer_head of i-th indirect
- * block for i>0 and NULL for i==0. In other words, it holds the block
- * numbers of the chain, addresses they were taken from (and where we can
- * verify that chain did not change) and buffer_heads hosting these
- * numbers.
- *
- * Function stops when it stumbles upon zero pointer (absent block)
- * (pointer to last triple returned, *@err == 0)
- * or when it gets an IO error reading an indirect block
- * (ditto, *@err == -EIO)
- * or when it reads all @depth-1 indirect blocks successfully and finds
- * the whole chain, all way to the data (returns %NULL, *err == 0).
- *
- * Need to be called with
- * down_read(&EXT4_I(inode)->i_data_sem)
- */
-static Indirect *ext4_get_branch(struct inode *inode, int depth,
- ext4_lblk_t *offsets,
- Indirect chain[4], int *err)
-{
- struct super_block *sb = inode->i_sb;
- Indirect *p = chain;
- struct buffer_head *bh;
-
- *err = 0;
- /* i_data is not going away, no lock needed */
- add_chain(chain, NULL, EXT4_I(inode)->i_data + *offsets);
- if (!p->key)
- goto no_block;
- while (--depth) {
- bh = sb_getblk(sb, le32_to_cpu(p->key));
- if (unlikely(!bh))
- goto failure;
-
- if (!bh_uptodate_or_lock(bh)) {
- if (bh_submit_read(bh) < 0) {
- put_bh(bh);
- goto failure;
- }
- /* validate block references */
- if (ext4_check_indirect_blockref(inode, bh)) {
- put_bh(bh);
- goto failure;
- }
- }
-
- add_chain(++p, bh, (__le32 *)bh->b_data + *++offsets);
- /* Reader: end */
- if (!p->key)
- goto no_block;
- }
- return NULL;
-
-failure:
- *err = -EIO;
-no_block:
- return p;
-}
-
-/**
- * ext4_find_near - find a place for allocation with sufficient locality
- * @inode: owner
- * @ind: descriptor of indirect block.
- *
- * This function returns the preferred place for block allocation.
- * It is used when heuristic for sequential allocation fails.
- * Rules are:
- * + if there is a block to the left of our position - allocate near it.
- * + if pointer will live in indirect block - allocate near that block.
- * + if pointer will live in inode - allocate in the same
- * cylinder group.
- *
- * In the latter case we colour the starting block by the callers PID to
- * prevent it from clashing with concurrent allocations for a different inode
- * in the same block group. The PID is used here so that functionally related
- * files will be close-by on-disk.
- *
- * Caller must make sure that @ind is valid and will stay that way.
- */
-static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind)
-{
- struct ext4_inode_info *ei = EXT4_I(inode);
- __le32 *start = ind->bh ? (__le32 *) ind->bh->b_data : ei->i_data;
- __le32 *p;
- ext4_fsblk_t bg_start;
- ext4_fsblk_t last_block;
- ext4_grpblk_t colour;
- ext4_group_t block_group;
- int flex_size = ext4_flex_bg_size(EXT4_SB(inode->i_sb));
-
- /* Try to find previous block */
- for (p = ind->p - 1; p >= start; p--) {
- if (*p)
- return le32_to_cpu(*p);
- }
-
- /* No such thing, so let's try location of indirect block */
- if (ind->bh)
- return ind->bh->b_blocknr;
-
- /*
- * It is going to be referred to from the inode itself? OK, just put it
- * into the same cylinder group then.
- */
- block_group = ei->i_block_group;
- if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) {
- block_group &= ~(flex_size-1);
- if (S_ISREG(inode->i_mode))
- block_group++;
- }
- bg_start = ext4_group_first_block_no(inode->i_sb, block_group);
- last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1;
-
- /*
- * If we are doing delayed allocation, we don't need take
- * colour into account.
- */
- if (test_opt(inode->i_sb, DELALLOC))
- return bg_start;
-
- if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block)
- colour = (current->pid % 16) *
- (EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16);
- else
- colour = (current->pid % 16) * ((last_block - bg_start) / 16);
- return bg_start + colour;
-}
-
-/**
- * ext4_find_goal - find a preferred place for allocation.
- * @inode: owner
- * @block: block we want
- * @partial: pointer to the last triple within a chain
- *
- * Normally this function find the preferred place for block allocation,
- * returns it.
- * Because this is only used for non-extent files, we limit the block nr
- * to 32 bits.
- */
-static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block,
- Indirect *partial)
-{
- ext4_fsblk_t goal;
-
- /*
- * XXX need to get goal block from mballoc's data structures
- */
-
- goal = ext4_find_near(inode, partial);
- goal = goal & EXT4_MAX_BLOCK_FILE_PHYS;
- return goal;
-}
-
-/**
- * ext4_blks_to_allocate: Look up the block map and count the number
- * of direct blocks need to be allocated for the given branch.
- *
- * @branch: chain of indirect blocks
- * @k: number of blocks need for indirect blocks
- * @blks: number of data blocks to be mapped.
- * @blocks_to_boundary: the offset in the indirect block
- *
- * return the total number of blocks to be allocate, including the
- * direct and indirect blocks.
- */
-static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned int blks,
- int blocks_to_boundary)
-{
- unsigned int count = 0;
-
- /*
- * Simple case, [t,d]Indirect block(s) has not allocated yet
- * then it's clear blocks on that path have not allocated
- */
- if (k > 0) {
- /* right now we don't handle cross boundary allocation */
- if (blks < blocks_to_boundary + 1)
- count += blks;
- else
- count += blocks_to_boundary + 1;
- return count;
- }
-
- count++;
- while (count < blks && count <= blocks_to_boundary &&
- le32_to_cpu(*(branch[0].p + count)) == 0) {
- count++;
- }
- return count;
-}
-
-/**
- * ext4_alloc_blocks: multiple allocate blocks needed for a branch
- * @indirect_blks: the number of blocks need to allocate for indirect
- * blocks
- *
- * @new_blocks: on return it will store the new block numbers for
- * the indirect blocks(if needed) and the first direct block,
- * @blks: on return it will store the total number of allocated
- * direct blocks
- */
-static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
- ext4_lblk_t iblock, ext4_fsblk_t goal,
- int indirect_blks, int blks,
- ext4_fsblk_t new_blocks[4], int *err)
-{
- struct ext4_allocation_request ar;
- int target, i;
- unsigned long count = 0, blk_allocated = 0;
- int index = 0;
- ext4_fsblk_t current_block = 0;
- int ret = 0;
-
- /*
- * Here we try to allocate the requested multiple blocks at once,
- * on a best-effort basis.
- * To build a branch, we should allocate blocks for
- * the indirect blocks(if not allocated yet), and at least
- * the first direct block of this branch. That's the
- * minimum number of blocks need to allocate(required)
- */
- /* first we try to allocate the indirect blocks */
- target = indirect_blks;
- while (target > 0) {
- count = target;
- /* allocating blocks for indirect blocks and direct blocks */
- current_block = ext4_new_meta_blocks(handle, inode,
- goal, &count, err);
- if (*err)
- goto failed_out;
-
- if (unlikely(current_block + count > EXT4_MAX_BLOCK_FILE_PHYS)) {
- EXT4_ERROR_INODE(inode,
- "current_block %llu + count %lu > %d!",
- current_block, count,
- EXT4_MAX_BLOCK_FILE_PHYS);
- *err = -EIO;
- goto failed_out;
- }
-
- target -= count;
- /* allocate blocks for indirect blocks */
- while (index < indirect_blks && count) {
- new_blocks[index++] = current_block++;
- count--;
- }
- if (count > 0) {
- /*
- * save the new block number
- * for the first direct block
- */
- new_blocks[index] = current_block;
- printk(KERN_INFO "%s returned more blocks than "
- "requested\n", __func__);
- WARN_ON(1);
- break;
- }
- }
-
- target = blks - count ;
- blk_allocated = count;
- if (!target)
- goto allocated;
- /* Now allocate data blocks */
- memset(&ar, 0, sizeof(ar));
- ar.inode = inode;
- ar.goal = goal;
- ar.len = target;
- ar.logical = iblock;
- if (S_ISREG(inode->i_mode))
- /* enable in-core preallocation only for regular files */
- ar.flags = EXT4_MB_HINT_DATA;
-
- current_block = ext4_mb_new_blocks(handle, &ar, err);
- if (unlikely(current_block + ar.len > EXT4_MAX_BLOCK_FILE_PHYS)) {
- EXT4_ERROR_INODE(inode,
- "current_block %llu + ar.len %d > %d!",
- current_block, ar.len,
- EXT4_MAX_BLOCK_FILE_PHYS);
- *err = -EIO;
- goto failed_out;
- }
-
- if (*err && (target == blks)) {
- /*
- * if the allocation failed and we didn't allocate
- * any blocks before
- */
- goto failed_out;
- }
- if (!*err) {
- if (target == blks) {
- /*
- * save the new block number
- * for the first direct block
- */
- new_blocks[index] = current_block;
- }
- blk_allocated += ar.len;
- }
-allocated:
- /* total number of blocks allocated for direct blocks */
- ret = blk_allocated;
- *err = 0;
- return ret;
-failed_out:
- for (i = 0; i < index; i++)
- ext4_free_blocks(handle, inode, 0, new_blocks[i], 1, 0);
- return ret;
-}
-
-/**
- * ext4_alloc_branch - allocate and set up a chain of blocks.
- * @inode: owner
- * @indirect_blks: number of allocated indirect blocks
- * @blks: number of allocated direct blocks
- * @offsets: offsets (in the blocks) to store the pointers to next.
- * @branch: place to store the chain in.
- *
- * This function allocates blocks, zeroes out all but the last one,
- * links them into chain and (if we are synchronous) writes them to disk.
- * In other words, it prepares a branch that can be spliced onto the
- * inode. It stores the information about that chain in the branch[], in
- * the same format as ext4_get_branch() would do. We are calling it after
- * we had read the existing part of chain and partial points to the last
- * triple of that (one with zero ->key). Upon the exit we have the same
- * picture as after the successful ext4_get_block(), except that in one
- * place chain is disconnected - *branch->p is still zero (we did not
- * set the last link), but branch->key contains the number that should
- * be placed into *branch->p to fill that gap.
- *
- * If allocation fails we free all blocks we've allocated (and forget
- * their buffer_heads) and return the error value the from failed
- * ext4_alloc_block() (normally -ENOSPC). Otherwise we set the chain
- * as described above and return 0.
- */
-static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
- ext4_lblk_t iblock, int indirect_blks,
- int *blks, ext4_fsblk_t goal,
- ext4_lblk_t *offsets, Indirect *branch)
-{
- int blocksize = inode->i_sb->s_blocksize;
- int i, n = 0;
- int err = 0;
- struct buffer_head *bh;
- int num;
- ext4_fsblk_t new_blocks[4];
- ext4_fsblk_t current_block;
-
- num = ext4_alloc_blocks(handle, inode, iblock, goal, indirect_blks,
- *blks, new_blocks, &err);
- if (err)
- return err;
-
- branch[0].key = cpu_to_le32(new_blocks[0]);
- /*
- * metadata blocks and data blocks are allocated.
- */
- for (n = 1; n <= indirect_blks; n++) {
- /*
- * Get buffer_head for parent block, zero it out
- * and set the pointer to new one, then send
- * parent to disk.
- */
- bh = sb_getblk(inode->i_sb, new_blocks[n-1]);
- if (unlikely(!bh)) {
- err = -EIO;
- goto failed;
- }
-
- branch[n].bh = bh;
- lock_buffer(bh);
- BUFFER_TRACE(bh, "call get_create_access");
- err = ext4_journal_get_create_access(handle, bh);
- if (err) {
- /* Don't brelse(bh) here; it's done in
- * ext4_journal_forget() below */
- unlock_buffer(bh);
- goto failed;
- }
-
- memset(bh->b_data, 0, blocksize);
- branch[n].p = (__le32 *) bh->b_data + offsets[n];
- branch[n].key = cpu_to_le32(new_blocks[n]);
- *branch[n].p = branch[n].key;
- if (n == indirect_blks) {
- current_block = new_blocks[n];
- /*
- * End of chain, update the last new metablock of
- * the chain to point to the new allocated
- * data blocks numbers
- */
- for (i = 1; i < num; i++)
- *(branch[n].p + i) = cpu_to_le32(++current_block);
- }
- BUFFER_TRACE(bh, "marking uptodate");
- set_buffer_uptodate(bh);
- unlock_buffer(bh);
-
- BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
- err = ext4_handle_dirty_metadata(handle, inode, bh);
- if (err)
- goto failed;
- }
- *blks = num;
- return err;
-failed:
- /* Allocation failed, free what we already allocated */
- ext4_free_blocks(handle, inode, 0, new_blocks[0], 1, 0);
- for (i = 1; i <= n ; i++) {
- /*
- * branch[i].bh is newly allocated, so there is no
- * need to revoke the block, which is why we don't
- * need to set EXT4_FREE_BLOCKS_METADATA.
- */
- ext4_free_blocks(handle, inode, 0, new_blocks[i], 1,
- EXT4_FREE_BLOCKS_FORGET);
- }
- for (i = n+1; i < indirect_blks; i++)
- ext4_free_blocks(handle, inode, 0, new_blocks[i], 1, 0);
-
- ext4_free_blocks(handle, inode, 0, new_blocks[i], num, 0);
-
- return err;
-}
-
-/**
- * ext4_splice_branch - splice the allocated branch onto inode.
- * @inode: owner
- * @block: (logical) number of block we are adding
- * @chain: chain of indirect blocks (with a missing link - see
- * ext4_alloc_branch)
- * @where: location of missing link
- * @num: number of indirect blocks we are adding
- * @blks: number of direct blocks we are adding
- *
- * This function fills the missing link and does all housekeeping needed in
- * inode (->i_blocks, etc.). In case of success we end up with the full
- * chain to new block and return 0.
- */
-static int ext4_splice_branch(handle_t *handle, struct inode *inode,
- ext4_lblk_t block, Indirect *where, int num,
- int blks)
-{
- int i;
- int err = 0;
- ext4_fsblk_t current_block;
-
- /*
- * If we're splicing into a [td]indirect block (as opposed to the
- * inode) then we need to get write access to the [td]indirect block
- * before the splice.
- */
- if (where->bh) {
- BUFFER_TRACE(where->bh, "get_write_access");
- err = ext4_journal_get_write_access(handle, where->bh);
- if (err)
- goto err_out;
- }
- /* That's it */
-
- *where->p = where->key;
-
- /*
- * Update the host buffer_head or inode to point to more just allocated
- * direct blocks blocks
- */
- if (num == 0 && blks > 1) {
- current_block = le32_to_cpu(where->key) + 1;
- for (i = 1; i < blks; i++)
- *(where->p + i) = cpu_to_le32(current_block++);
- }
-
- /* We are done with atomic stuff, now do the rest of housekeeping */
- /* had we spliced it onto indirect block? */
- if (where->bh) {
- /*
- * If we spliced it onto an indirect block, we haven't
- * altered the inode. Note however that if it is being spliced
- * onto an indirect block at the very end of the file (the
- * file is growing) then we *will* alter the inode to reflect
- * the new i_size. But that is not done here - it is done in
- * generic_commit_write->__mark_inode_dirty->ext4_dirty_inode.
- */
- jbd_debug(5, "splicing indirect only\n");
- BUFFER_TRACE(where->bh, "call ext4_handle_dirty_metadata");
- err = ext4_handle_dirty_metadata(handle, inode, where->bh);
- if (err)
- goto err_out;
- } else {
- /*
- * OK, we spliced it into the inode itself on a direct block.
- */
- ext4_mark_inode_dirty(handle, inode);
- jbd_debug(5, "splicing direct\n");
- }
- return err;
-
-err_out:
- for (i = 1; i <= num; i++) {
- /*
- * branch[i].bh is newly allocated, so there is no
- * need to revoke the block, which is why we don't
- * need to set EXT4_FREE_BLOCKS_METADATA.
- */
- ext4_free_blocks(handle, inode, where[i].bh, 0, 1,
- EXT4_FREE_BLOCKS_FORGET);
- }
- ext4_free_blocks(handle, inode, 0, le32_to_cpu(where[num].key),
- blks, 0);
-
- return err;
-}
-
-/*
- * The ext4_ind_map_blocks() function handles non-extents inodes
- * (i.e., using the traditional indirect/double-indirect i_blocks
- * scheme) for ext4_map_blocks().
- *
- * Allocation strategy is simple: if we have to allocate something, we will
- * have to go the whole way to leaf. So let's do it before attaching anything
- * to tree, set linkage between the newborn blocks, write them if sync is
- * required, recheck the path, free and repeat if check fails, otherwise
- * set the last missing link (that will protect us from any truncate-generated
- * removals - all blocks on the path are immune now) and possibly force the
- * write on the parent block.
- * That has a nice additional property: no special recovery from the failed
- * allocations is needed - we simply release blocks and do not touch anything
- * reachable from inode.
- *
- * `handle' can be NULL if create == 0.
- *
- * return > 0, # of blocks mapped or allocated.
- * return = 0, if plain lookup failed.
- * return < 0, error case.
- *
- * The ext4_ind_get_blocks() function should be called with
- * down_write(&EXT4_I(inode)->i_data_sem) if allocating filesystem
- * blocks (i.e., flags has EXT4_GET_BLOCKS_CREATE set) or
- * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system
- * blocks.
- */
-static int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
- struct ext4_map_blocks *map,
- int flags)
-{
- int err = -EIO;
- ext4_lblk_t offsets[4];
- Indirect chain[4];
- Indirect *partial;
- ext4_fsblk_t goal;
- int indirect_blks;
- int blocks_to_boundary = 0;
- int depth;
- int count = 0;
- ext4_fsblk_t first_block = 0;
-
- J_ASSERT(!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)));
- J_ASSERT(handle != NULL || (flags & EXT4_GET_BLOCKS_CREATE) == 0);
- depth = ext4_block_to_path(inode, map->m_lblk, offsets,
- &blocks_to_boundary);
-
- if (depth == 0)
- goto out;
-
- partial = ext4_get_branch(inode, depth, offsets, chain, &err);
-
- /* Simplest case - block found, no allocation needed */
- if (!partial) {
- first_block = le32_to_cpu(chain[depth - 1].key);
- count++;
- /*map more blocks*/
- while (count < map->m_len && count <= blocks_to_boundary) {
- ext4_fsblk_t blk;
-
- blk = le32_to_cpu(*(chain[depth-1].p + count));
-
- if (blk == first_block + count)
- count++;
- else
- break;
- }
- goto got_it;
- }
-
- /* Next simple case - plain lookup or failed read of indirect block */
- if ((flags & EXT4_GET_BLOCKS_CREATE) == 0 || err == -EIO)
- goto cleanup;
-
- /*
- * Okay, we need to do block allocation.
- */
- goal = ext4_find_goal(inode, map->m_lblk, partial);
-
- /* the number of blocks need to allocate for [d,t]indirect blocks */
- indirect_blks = (chain + depth) - partial - 1;
-
- /*
- * Next look up the indirect map to count the totoal number of
- * direct blocks to allocate for this branch.
- */
- count = ext4_blks_to_allocate(partial, indirect_blks,
- map->m_len, blocks_to_boundary);
- /*
- * Block out ext4_truncate while we alter the tree
- */
- err = ext4_alloc_branch(handle, inode, map->m_lblk, indirect_blks,
- &count, goal,
- offsets + (partial - chain), partial);
-
- /*
- * The ext4_splice_branch call will free and forget any buffers
- * on the new chain if there is a failure, but that risks using
- * up transaction credits, especially for bitmaps where the
- * credits cannot be returned. Can we handle this somehow? We
- * may need to return -EAGAIN upwards in the worst case. --sct
- */
- if (!err)
- err = ext4_splice_branch(handle, inode, map->m_lblk,
- partial, indirect_blks, count);
- if (err)
- goto cleanup;
-
- map->m_flags |= EXT4_MAP_NEW;
-
- ext4_update_inode_fsync_trans(handle, inode, 1);
-got_it:
- map->m_flags |= EXT4_MAP_MAPPED;
- map->m_pblk = le32_to_cpu(chain[depth-1].key);
- map->m_len = count;
- if (count > blocks_to_boundary)
- map->m_flags |= EXT4_MAP_BOUNDARY;
- err = count;
- /* Clean up and exit */
- partial = chain + depth - 1; /* the whole chain */
-cleanup:
- while (partial > chain) {
- BUFFER_TRACE(partial->bh, "call brelse");
- brelse(partial->bh);
- partial--;
- }
-out:
- return err;
-}
-
#ifdef CONFIG_QUOTA
qsize_t *ext4_get_reserved_space(struct inode *inode)
{
@@ -1052,41 +326,14 @@ qsize_t *ext4_get_reserved_space(struct inode *inode)
/*
* Calculate the number of metadata blocks need to reserve
- * to allocate a new block at @lblocks for non extent file based file
- */
-static int ext4_indirect_calc_metadata_amount(struct inode *inode,
- sector_t lblock)
-{
- struct ext4_inode_info *ei = EXT4_I(inode);
- sector_t dind_mask = ~((sector_t)EXT4_ADDR_PER_BLOCK(inode->i_sb) - 1);
- int blk_bits;
-
- if (lblock < EXT4_NDIR_BLOCKS)
- return 0;
-
- lblock -= EXT4_NDIR_BLOCKS;
-
- if (ei->i_da_metadata_calc_len &&
- (lblock & dind_mask) == ei->i_da_metadata_calc_last_lblock) {
- ei->i_da_metadata_calc_len++;
- return 0;
- }
- ei->i_da_metadata_calc_last_lblock = lblock & dind_mask;
- ei->i_da_metadata_calc_len = 1;
- blk_bits = order_base_2(lblock);
- return (blk_bits / EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb)) + 1;
-}
-
-/*
- * Calculate the number of metadata blocks need to reserve
* to allocate a block located at @lblock
*/
-static int ext4_calc_metadata_amount(struct inode *inode, sector_t lblock)
+static int ext4_calc_metadata_amount(struct inode *inode, ext4_lblk_t lblock)
{
if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
return ext4_ext_calc_metadata_amount(inode, lblock);
- return ext4_indirect_calc_metadata_amount(inode, lblock);
+ return ext4_ind_calc_metadata_amount(inode, lblock);
}
/*
@@ -1100,20 +347,31 @@ void ext4_da_update_reserve_space(struct inode *inode,
struct ext4_inode_info *ei = EXT4_I(inode);
spin_lock(&ei->i_block_reservation_lock);
- trace_ext4_da_update_reserve_space(inode, used);
+ trace_ext4_da_update_reserve_space(inode, used, quota_claim);
if (unlikely(used > ei->i_reserved_data_blocks)) {
- ext4_msg(inode->i_sb, KERN_NOTICE, "%s: ino %lu, used %d "
- "with only %d reserved data blocks\n",
+ ext4_warning(inode->i_sb, "%s: ino %lu, used %d "
+ "with only %d reserved data blocks",
__func__, inode->i_ino, used,
ei->i_reserved_data_blocks);
WARN_ON(1);
used = ei->i_reserved_data_blocks;
}
+ if (unlikely(ei->i_allocated_meta_blocks > ei->i_reserved_meta_blocks)) {
+ ext4_warning(inode->i_sb, "ino %lu, allocated %d "
+ "with only %d reserved metadata blocks "
+ "(releasing %d blocks with reserved %d data blocks)",
+ inode->i_ino, ei->i_allocated_meta_blocks,
+ ei->i_reserved_meta_blocks, used,
+ ei->i_reserved_data_blocks);
+ WARN_ON(1);
+ ei->i_allocated_meta_blocks = ei->i_reserved_meta_blocks;
+ }
+
/* Update per-inode reservations */
ei->i_reserved_data_blocks -= used;
ei->i_reserved_meta_blocks -= ei->i_allocated_meta_blocks;
- percpu_counter_sub(&sbi->s_dirtyblocks_counter,
+ percpu_counter_sub(&sbi->s_dirtyclusters_counter,
used + ei->i_allocated_meta_blocks);
ei->i_allocated_meta_blocks = 0;
@@ -1123,7 +381,7 @@ void ext4_da_update_reserve_space(struct inode *inode,
* only when we have written all of the delayed
* allocation blocks.
*/
- percpu_counter_sub(&sbi->s_dirtyblocks_counter,
+ percpu_counter_sub(&sbi->s_dirtyclusters_counter,
ei->i_reserved_meta_blocks);
ei->i_reserved_meta_blocks = 0;
ei->i_da_metadata_calc_len = 0;
@@ -1132,14 +390,14 @@ void ext4_da_update_reserve_space(struct inode *inode,
/* Update quota subsystem for data blocks */
if (quota_claim)
- dquot_claim_block(inode, used);
+ dquot_claim_block(inode, EXT4_C2B(sbi, used));
else {
/*
* We did fallocate with an offset that is already delayed
* allocated. So on delayed allocated writeback we should
* not re-claim the quota for fallocated blocks.
*/
- dquot_release_reservation_block(inode, used);
+ dquot_release_reservation_block(inode, EXT4_C2B(sbi, used));
}
/*
@@ -1170,65 +428,57 @@ static int __check_block_validity(struct inode *inode, const char *func,
#define check_block_validity(inode, map) \
__check_block_validity((inode), __func__, __LINE__, (map))
-/*
- * Return the number of contiguous dirty pages in a given inode
- * starting at page frame idx.
- */
-static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx,
- unsigned int max_pages)
+#ifdef ES_AGGRESSIVE_TEST
+static void ext4_map_blocks_es_recheck(handle_t *handle,
+ struct inode *inode,
+ struct ext4_map_blocks *es_map,
+ struct ext4_map_blocks *map,
+ int flags)
{
- struct address_space *mapping = inode->i_mapping;
- pgoff_t index;
- struct pagevec pvec;
- pgoff_t num = 0;
- int i, nr_pages, done = 0;
+ int retval;
- if (max_pages == 0)
- return 0;
- pagevec_init(&pvec, 0);
- while (!done) {
- index = idx;
- nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
- PAGECACHE_TAG_DIRTY,
- (pgoff_t)PAGEVEC_SIZE);
- if (nr_pages == 0)
- break;
- for (i = 0; i < nr_pages; i++) {
- struct page *page = pvec.pages[i];
- struct buffer_head *bh, *head;
+ map->m_flags = 0;
+ /*
+ * There is a race window that the result is not the same.
+ * e.g. xfstests #223 when dioread_nolock enables. The reason
+ * is that we lookup a block mapping in extent status tree with
+ * out taking i_data_sem. So at the time the unwritten extent
+ * could be converted.
+ */
+ if (!(flags & EXT4_GET_BLOCKS_NO_LOCK))
+ down_read(&EXT4_I(inode)->i_data_sem);
+ if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
+ retval = ext4_ext_map_blocks(handle, inode, map, flags &
+ EXT4_GET_BLOCKS_KEEP_SIZE);
+ } else {
+ retval = ext4_ind_map_blocks(handle, inode, map, flags &
+ EXT4_GET_BLOCKS_KEEP_SIZE);
+ }
+ if (!(flags & EXT4_GET_BLOCKS_NO_LOCK))
+ up_read((&EXT4_I(inode)->i_data_sem));
+ /*
+ * Clear EXT4_MAP_FROM_CLUSTER and EXT4_MAP_BOUNDARY flag
+ * because it shouldn't be marked in es_map->m_flags.
+ */
+ map->m_flags &= ~(EXT4_MAP_FROM_CLUSTER | EXT4_MAP_BOUNDARY);
- lock_page(page);
- if (unlikely(page->mapping != mapping) ||
- !PageDirty(page) ||
- PageWriteback(page) ||
- page->index != idx) {
- done = 1;
- unlock_page(page);
- break;
- }
- if (page_has_buffers(page)) {
- bh = head = page_buffers(page);
- do {
- if (!buffer_delay(bh) &&
- !buffer_unwritten(bh))
- done = 1;
- bh = bh->b_this_page;
- } while (!done && (bh != head));
- }
- unlock_page(page);
- if (done)
- break;
- idx++;
- num++;
- if (num >= max_pages) {
- done = 1;
- break;
- }
- }
- pagevec_release(&pvec);
+ /*
+ * We don't check m_len because extent will be collpased in status
+ * tree. So the m_len might not equal.
+ */
+ if (es_map->m_lblk != map->m_lblk ||
+ es_map->m_flags != map->m_flags ||
+ es_map->m_pblk != map->m_pblk) {
+ printk("ES cache assertion failed for inode: %lu "
+ "es_cached ex [%d/%d/%llu/%x] != "
+ "found ex [%d/%d/%llu/%x] retval %d flags %x\n",
+ inode->i_ino, es_map->m_lblk, es_map->m_len,
+ es_map->m_pblk, es_map->m_flags, map->m_lblk,
+ map->m_len, map->m_pblk, map->m_flags,
+ retval, flags);
}
- return num;
}
+#endif /* ES_AGGRESSIVE_TEST */
/*
* The ext4_map_blocks() function tries to look up the requested blocks,
@@ -1242,39 +492,108 @@ static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx,
* Otherwise, call with ext4_ind_map_blocks() to handle indirect mapping
* based files
*
- * On success, it returns the number of blocks being mapped or allocate.
- * if create==0 and the blocks are pre-allocated and uninitialized block,
+ * On success, it returns the number of blocks being mapped or allocated.
+ * if create==0 and the blocks are pre-allocated and unwritten block,
* the result buffer head is unmapped. If the create ==1, it will make sure
* the buffer head is mapped.
*
* It returns 0 if plain look up failed (blocks have not been allocated), in
- * that casem, buffer head is unmapped
+ * that case, buffer head is unmapped
*
* It returns the error in case of allocation failure.
*/
int ext4_map_blocks(handle_t *handle, struct inode *inode,
struct ext4_map_blocks *map, int flags)
{
+ struct extent_status es;
int retval;
+ int ret = 0;
+#ifdef ES_AGGRESSIVE_TEST
+ struct ext4_map_blocks orig_map;
+
+ memcpy(&orig_map, map, sizeof(*map));
+#endif
map->m_flags = 0;
ext_debug("ext4_map_blocks(): inode %lu, flag %d, max_blocks %u,"
"logical block %lu\n", inode->i_ino, flags, map->m_len,
(unsigned long) map->m_lblk);
+
+ /*
+ * ext4_map_blocks returns an int, and m_len is an unsigned int
+ */
+ if (unlikely(map->m_len > INT_MAX))
+ map->m_len = INT_MAX;
+
+ /* We can handle the block number less than EXT_MAX_BLOCKS */
+ if (unlikely(map->m_lblk >= EXT_MAX_BLOCKS))
+ return -EIO;
+
+ /* Lookup extent status tree firstly */
+ if (ext4_es_lookup_extent(inode, map->m_lblk, &es)) {
+ ext4_es_lru_add(inode);
+ if (ext4_es_is_written(&es) || ext4_es_is_unwritten(&es)) {
+ map->m_pblk = ext4_es_pblock(&es) +
+ map->m_lblk - es.es_lblk;
+ map->m_flags |= ext4_es_is_written(&es) ?
+ EXT4_MAP_MAPPED : EXT4_MAP_UNWRITTEN;
+ retval = es.es_len - (map->m_lblk - es.es_lblk);
+ if (retval > map->m_len)
+ retval = map->m_len;
+ map->m_len = retval;
+ } else if (ext4_es_is_delayed(&es) || ext4_es_is_hole(&es)) {
+ retval = 0;
+ } else {
+ BUG_ON(1);
+ }
+#ifdef ES_AGGRESSIVE_TEST
+ ext4_map_blocks_es_recheck(handle, inode, map,
+ &orig_map, flags);
+#endif
+ goto found;
+ }
+
/*
* Try to see if we can get the block without requesting a new
* file system block.
*/
- down_read((&EXT4_I(inode)->i_data_sem));
+ if (!(flags & EXT4_GET_BLOCKS_NO_LOCK))
+ down_read(&EXT4_I(inode)->i_data_sem);
if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
- retval = ext4_ext_map_blocks(handle, inode, map, 0);
+ retval = ext4_ext_map_blocks(handle, inode, map, flags &
+ EXT4_GET_BLOCKS_KEEP_SIZE);
} else {
- retval = ext4_ind_map_blocks(handle, inode, map, 0);
+ retval = ext4_ind_map_blocks(handle, inode, map, flags &
+ EXT4_GET_BLOCKS_KEEP_SIZE);
}
- up_read((&EXT4_I(inode)->i_data_sem));
+ if (retval > 0) {
+ unsigned int status;
+
+ if (unlikely(retval != map->m_len)) {
+ ext4_warning(inode->i_sb,
+ "ES len assertion failed for inode "
+ "%lu: retval %d != map->m_len %d",
+ inode->i_ino, retval, map->m_len);
+ WARN_ON(1);
+ }
+ status = map->m_flags & EXT4_MAP_UNWRITTEN ?
+ EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
+ if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) &&
+ ext4_find_delalloc_range(inode, map->m_lblk,
+ map->m_lblk + map->m_len - 1))
+ status |= EXTENT_STATUS_DELAYED;
+ ret = ext4_es_insert_extent(inode, map->m_lblk,
+ map->m_len, map->m_pblk, status);
+ if (ret < 0)
+ retval = ret;
+ }
+ if (!(flags & EXT4_GET_BLOCKS_NO_LOCK))
+ up_read((&EXT4_I(inode)->i_data_sem));
+
+found:
if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
- int ret = check_block_validity(inode, map);
+ ret = check_block_validity(inode, map);
if (ret != 0)
return ret;
}
@@ -1287,31 +606,31 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
* Returns if the blocks have already allocated
*
* Note that if blocks have been preallocated
- * ext4_ext_get_block() returns th create = 0
+ * ext4_ext_get_block() returns the create = 0
* with buffer head unmapped.
*/
if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED)
- return retval;
+ /*
+ * If we need to convert extent to unwritten
+ * we continue and do the actual work in
+ * ext4_ext_map_blocks()
+ */
+ if (!(flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN))
+ return retval;
/*
- * When we call get_blocks without the create flag, the
- * BH_Unwritten flag could have gotten set if the blocks
- * requested were part of a uninitialized extent. We need to
- * clear this flag now that we are committed to convert all or
- * part of the uninitialized extent to be an initialized
- * extent. This is because we need to avoid the combination
- * of BH_Unwritten and BH_Mapped flags being simultaneously
- * set on the buffer_head.
+ * Here we clear m_flags because after allocating an new extent,
+ * it will be set again.
*/
- map->m_flags &= ~EXT4_MAP_UNWRITTEN;
+ map->m_flags &= ~EXT4_MAP_FLAGS;
/*
- * New blocks allocate and/or writing to uninitialized extent
+ * New blocks allocate and/or writing to unwritten extent
* will possibly result in updating i_data, so we take
* the write lock of i_data_sem, and call get_blocks()
* with create == 1 flag.
*/
- down_write((&EXT4_I(inode)->i_data_sem));
+ down_write(&EXT4_I(inode)->i_data_sem);
/*
* if the caller is from delayed allocation writeout path
@@ -1320,7 +639,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
* avoid double accounting
*/
if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
- EXT4_I(inode)->i_delalloc_reserved_flag = 1;
+ ext4_set_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED);
/*
* We need to check for EXT4 here because migrate
* could have changed the inode type in between
@@ -1350,11 +669,44 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
ext4_da_update_reserve_space(inode, retval, 1);
}
if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
- EXT4_I(inode)->i_delalloc_reserved_flag = 0;
+ ext4_clear_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED);
+
+ if (retval > 0) {
+ unsigned int status;
+ if (unlikely(retval != map->m_len)) {
+ ext4_warning(inode->i_sb,
+ "ES len assertion failed for inode "
+ "%lu: retval %d != map->m_len %d",
+ inode->i_ino, retval, map->m_len);
+ WARN_ON(1);
+ }
+
+ /*
+ * If the extent has been zeroed out, we don't need to update
+ * extent status tree.
+ */
+ if ((flags & EXT4_GET_BLOCKS_PRE_IO) &&
+ ext4_es_lookup_extent(inode, map->m_lblk, &es)) {
+ if (ext4_es_is_written(&es))
+ goto has_zeroout;
+ }
+ status = map->m_flags & EXT4_MAP_UNWRITTEN ?
+ EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
+ if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) &&
+ ext4_find_delalloc_range(inode, map->m_lblk,
+ map->m_lblk + map->m_len - 1))
+ status |= EXTENT_STATUS_DELAYED;
+ ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
+ map->m_pblk, status);
+ if (ret < 0)
+ retval = ret;
+ }
+
+has_zeroout:
up_write((&EXT4_I(inode)->i_data_sem));
if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
- int ret = check_block_validity(inode, map);
+ ret = check_block_validity(inode, map);
if (ret != 0)
return ret;
}
@@ -1372,15 +724,19 @@ static int _ext4_get_block(struct inode *inode, sector_t iblock,
int ret = 0, started = 0;
int dio_credits;
+ if (ext4_has_inline_data(inode))
+ return -ERANGE;
+
map.m_lblk = iblock;
map.m_len = bh->b_size >> inode->i_blkbits;
- if (flags && !handle) {
+ if (flags && !(flags & EXT4_GET_BLOCKS_NO_LOCK) && !handle) {
/* Direct IO write... */
if (map.m_len > DIO_MAX_BLOCKS)
map.m_len = DIO_MAX_BLOCKS;
dio_credits = ext4_chunk_trans_blocks(inode, map.m_len);
- handle = ext4_journal_start(inode, dio_credits);
+ handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS,
+ dio_credits);
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
return ret;
@@ -1390,8 +746,12 @@ static int _ext4_get_block(struct inode *inode, sector_t iblock,
ret = ext4_map_blocks(handle, inode, &map, flags);
if (ret > 0) {
+ ext4_io_end_t *io_end = ext4_inode_aio(inode);
+
map_bh(bh, inode->i_sb, map.m_pblk);
bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags;
+ if (io_end && io_end->flag & EXT4_IO_END_UNWRITTEN)
+ set_buffer_defer_completion(bh);
bh->b_size = inode->i_sb->s_blocksize * map.m_len;
ret = 0;
}
@@ -1424,15 +784,19 @@ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
err = ext4_map_blocks(handle, inode, &map,
create ? EXT4_GET_BLOCKS_CREATE : 0);
+ /* ensure we send some value back into *errp */
+ *errp = 0;
+
+ if (create && err == 0)
+ err = -ENOSPC; /* should never happen */
if (err < 0)
*errp = err;
if (err <= 0)
return NULL;
- *errp = 0;
bh = sb_getblk(inode->i_sb, map.m_pblk);
- if (!bh) {
- *errp = -EIO;
+ if (unlikely(!bh)) {
+ *errp = -ENOMEM;
return NULL;
}
if (map.m_flags & EXT4_MAP_NEW) {
@@ -1479,7 +843,7 @@ struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode,
return bh;
if (buffer_uptodate(bh))
return bh;
- ll_rw_block(READ_META, 1, &bh);
+ ll_rw_block(READ | REQ_META | REQ_PRIO, 1, &bh);
wait_on_buffer(bh);
if (buffer_uptodate(bh))
return bh;
@@ -1488,13 +852,13 @@ struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode,
return NULL;
}
-static int walk_page_buffers(handle_t *handle,
- struct buffer_head *head,
- unsigned from,
- unsigned to,
- int *partial,
- int (*fn)(handle_t *handle,
- struct buffer_head *bh))
+int ext4_walk_page_buffers(handle_t *handle,
+ struct buffer_head *head,
+ unsigned from,
+ unsigned to,
+ int *partial,
+ int (*fn)(handle_t *handle,
+ struct buffer_head *bh))
{
struct buffer_head *bh;
unsigned block_start, block_end;
@@ -1526,11 +890,10 @@ static int walk_page_buffers(handle_t *handle,
* and the commit_write(). So doing the jbd2_journal_start at the start of
* prepare_write() is the right place.
*
- * Also, this function can nest inside ext4_writepage() ->
- * block_write_full_page(). In that case, we *know* that ext4_writepage()
- * has generated enough buffer credits to do the whole page. So we won't
- * block on the journal in that case, which is good, because the caller may
- * be PF_MEMALLOC.
+ * Also, this function can nest inside ext4_writepage(). In that case, we
+ * *know* that ext4_writepage() has generated enough buffer credits to do the
+ * whole page. So we won't block on the journal in that case, which is good,
+ * because the caller may be PF_MEMALLOC.
*
* By accident, ext4 can be reentered when a transaction is open via
* quota file writes. If we were to commit the transaction while thus
@@ -1544,8 +907,8 @@ static int walk_page_buffers(handle_t *handle,
* is elevated. We'll still have enough credits for the tiny quotafile
* write.
*/
-static int do_journal_get_write_access(handle_t *handle,
- struct buffer_head *bh)
+int do_journal_get_write_access(handle_t *handle,
+ struct buffer_head *bh)
{
int dirty = buffer_dirty(bh);
int ret;
@@ -1562,23 +925,14 @@ static int do_journal_get_write_access(handle_t *handle,
*/
if (dirty)
clear_buffer_dirty(bh);
+ BUFFER_TRACE(bh, "get write access");
ret = ext4_journal_get_write_access(handle, bh);
if (!ret && dirty)
ret = ext4_handle_dirty_metadata(handle, NULL, bh);
return ret;
}
-/*
- * Truncate blocks that were not used by write. We have to truncate the
- * pagecache as well so that corresponding buffers get properly unmapped.
- */
-static void ext4_truncate_failed_write(struct inode *inode)
-{
- truncate_inode_pages(inode->i_mapping, inode->i_size);
- ext4_truncate(inode);
-}
-
-static int ext4_get_block_write(struct inode *inode, sector_t iblock,
+static int ext4_get_block_write_nolock(struct inode *inode, sector_t iblock,
struct buffer_head *bh_result, int create);
static int ext4_write_begin(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned flags,
@@ -1602,24 +956,45 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping,
from = pos & (PAGE_CACHE_SIZE - 1);
to = from + len;
-retry:
- handle = ext4_journal_start(inode, needed_blocks);
- if (IS_ERR(handle)) {
- ret = PTR_ERR(handle);
- goto out;
+ if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) {
+ ret = ext4_try_to_write_inline_data(mapping, inode, pos, len,
+ flags, pagep);
+ if (ret < 0)
+ return ret;
+ if (ret == 1)
+ return 0;
}
- /* We cannot recurse into the filesystem as the transaction is already
- * started */
- flags |= AOP_FLAG_NOFS;
-
+ /*
+ * grab_cache_page_write_begin() can take a long time if the
+ * system is thrashing due to memory pressure, or if the page
+ * is being written back. So grab it first before we start
+ * the transaction handle. This also allows us to allocate
+ * the page (if needed) without using GFP_NOFS.
+ */
+retry_grab:
page = grab_cache_page_write_begin(mapping, index, flags);
- if (!page) {
+ if (!page)
+ return -ENOMEM;
+ unlock_page(page);
+
+retry_journal:
+ handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, needed_blocks);
+ if (IS_ERR(handle)) {
+ page_cache_release(page);
+ return PTR_ERR(handle);
+ }
+
+ lock_page(page);
+ if (page->mapping != mapping) {
+ /* The page got truncated from under us */
+ unlock_page(page);
+ page_cache_release(page);
ext4_journal_stop(handle);
- ret = -ENOMEM;
- goto out;
+ goto retry_grab;
}
- *pagep = page;
+ /* In case writeback began while the page was unlocked */
+ wait_for_stable_page(page);
if (ext4_should_dioread_nolock(inode))
ret = __block_write_begin(page, pos, len, ext4_get_block_write);
@@ -1627,13 +1002,13 @@ retry:
ret = __block_write_begin(page, pos, len, ext4_get_block);
if (!ret && ext4_should_journal_data(inode)) {
- ret = walk_page_buffers(handle, page_buffers(page),
- from, to, NULL, do_journal_get_write_access);
+ ret = ext4_walk_page_buffers(handle, page_buffers(page),
+ from, to, NULL,
+ do_journal_get_write_access);
}
if (ret) {
unlock_page(page);
- page_cache_release(page);
/*
* __block_write_begin may have instantiated a few blocks
* outside i_size. Trim these off again. Don't need
@@ -1657,37 +1032,70 @@ retry:
if (inode->i_nlink)
ext4_orphan_del(NULL, inode);
}
- }
- if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
- goto retry;
-out:
+ if (ret == -ENOSPC &&
+ ext4_should_retry_alloc(inode->i_sb, &retries))
+ goto retry_journal;
+ page_cache_release(page);
+ return ret;
+ }
+ *pagep = page;
return ret;
}
/* For write_end() in data=journal mode */
static int write_end_fn(handle_t *handle, struct buffer_head *bh)
{
+ int ret;
if (!buffer_mapped(bh) || buffer_freed(bh))
return 0;
set_buffer_uptodate(bh);
- return ext4_handle_dirty_metadata(handle, NULL, bh);
+ ret = ext4_handle_dirty_metadata(handle, NULL, bh);
+ clear_buffer_meta(bh);
+ clear_buffer_prio(bh);
+ return ret;
}
-static int ext4_generic_write_end(struct file *file,
- struct address_space *mapping,
- loff_t pos, unsigned len, unsigned copied,
- struct page *page, void *fsdata)
+/*
+ * We need to pick up the new inode size which generic_commit_write gave us
+ * `file' can be NULL - eg, when called from page_symlink().
+ *
+ * ext4 never places buffers on inode->i_mapping->private_list. metadata
+ * buffers are managed internally.
+ */
+static int ext4_write_end(struct file *file,
+ struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned copied,
+ struct page *page, void *fsdata)
{
- int i_size_changed = 0;
- struct inode *inode = mapping->host;
handle_t *handle = ext4_journal_current_handle();
+ struct inode *inode = mapping->host;
+ int ret = 0, ret2;
+ int i_size_changed = 0;
+
+ trace_ext4_write_end(inode, pos, len, copied);
+ if (ext4_test_inode_state(inode, EXT4_STATE_ORDERED_MODE)) {
+ ret = ext4_jbd2_file_inode(handle, inode);
+ if (ret) {
+ unlock_page(page);
+ page_cache_release(page);
+ goto errout;
+ }
+ }
- copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
+ if (ext4_has_inline_data(inode)) {
+ ret = ext4_write_inline_data_end(inode, pos, len,
+ copied, page);
+ if (ret < 0)
+ goto errout;
+ copied = ret;
+ } else
+ copied = block_write_end(file, mapping, pos,
+ len, copied, page, fsdata);
/*
* No need to use i_size_read() here, the i_size
- * cannot change under us because we hold i_mutex.
+ * cannot change under us because we hole i_mutex.
*
* But it's important to update i_size while still holding page lock:
* page writeout could otherwise come in and zero beyond i_size.
@@ -1697,10 +1105,10 @@ static int ext4_generic_write_end(struct file *file,
i_size_changed = 1;
}
- if (pos + copied > EXT4_I(inode)->i_disksize) {
+ if (pos + copied > EXT4_I(inode)->i_disksize) {
/* We need to mark inode dirty even if
* new_i_size is less that inode->i_size
- * bu greater than i_disksize.(hint delalloc)
+ * but greater than i_disksize. (hint delalloc)
*/
ext4_update_i_disksize(inode, (pos + copied));
i_size_changed = 1;
@@ -1717,83 +1125,13 @@ static int ext4_generic_write_end(struct file *file,
if (i_size_changed)
ext4_mark_inode_dirty(handle, inode);
- return copied;
-}
-
-/*
- * We need to pick up the new inode size which generic_commit_write gave us
- * `file' can be NULL - eg, when called from page_symlink().
- *
- * ext4 never places buffers on inode->i_mapping->private_list. metadata
- * buffers are managed internally.
- */
-static int ext4_ordered_write_end(struct file *file,
- struct address_space *mapping,
- loff_t pos, unsigned len, unsigned copied,
- struct page *page, void *fsdata)
-{
- handle_t *handle = ext4_journal_current_handle();
- struct inode *inode = mapping->host;
- int ret = 0, ret2;
-
- trace_ext4_ordered_write_end(inode, pos, len, copied);
- ret = ext4_jbd2_file_inode(handle, inode);
-
- if (ret == 0) {
- ret2 = ext4_generic_write_end(file, mapping, pos, len, copied,
- page, fsdata);
- copied = ret2;
- if (pos + len > inode->i_size && ext4_can_truncate(inode))
- /* if we have allocated more blocks and copied
- * less. We will have blocks allocated outside
- * inode->i_size. So truncate them
- */
- ext4_orphan_add(handle, inode);
- if (ret2 < 0)
- ret = ret2;
- }
- ret2 = ext4_journal_stop(handle);
- if (!ret)
- ret = ret2;
-
- if (pos + len > inode->i_size) {
- ext4_truncate_failed_write(inode);
- /*
- * If truncate failed early the inode might still be
- * on the orphan list; we need to make sure the inode
- * is removed from the orphan list in that case.
- */
- if (inode->i_nlink)
- ext4_orphan_del(NULL, inode);
- }
-
-
- return ret ? ret : copied;
-}
-
-static int ext4_writeback_write_end(struct file *file,
- struct address_space *mapping,
- loff_t pos, unsigned len, unsigned copied,
- struct page *page, void *fsdata)
-{
- handle_t *handle = ext4_journal_current_handle();
- struct inode *inode = mapping->host;
- int ret = 0, ret2;
-
- trace_ext4_writeback_write_end(inode, pos, len, copied);
- ret2 = ext4_generic_write_end(file, mapping, pos, len, copied,
- page, fsdata);
- copied = ret2;
if (pos + len > inode->i_size && ext4_can_truncate(inode))
/* if we have allocated more blocks and copied
* less. We will have blocks allocated outside
* inode->i_size. So truncate them
*/
ext4_orphan_add(handle, inode);
-
- if (ret2 < 0)
- ret = ret2;
-
+errout:
ret2 = ext4_journal_stop(handle);
if (!ret)
ret = ret2;
@@ -1828,20 +1166,28 @@ static int ext4_journalled_write_end(struct file *file,
from = pos & (PAGE_CACHE_SIZE - 1);
to = from + len;
- if (copied < len) {
- if (!PageUptodate(page))
- copied = 0;
- page_zero_new_buffers(page, from+copied, to);
- }
+ BUG_ON(!ext4_handle_valid(handle));
- ret = walk_page_buffers(handle, page_buffers(page), from,
- to, &partial, write_end_fn);
- if (!partial)
- SetPageUptodate(page);
+ if (ext4_has_inline_data(inode))
+ copied = ext4_write_inline_data_end(inode, pos, len,
+ copied, page);
+ else {
+ if (copied < len) {
+ if (!PageUptodate(page))
+ copied = 0;
+ page_zero_new_buffers(page, from+copied, to);
+ }
+
+ ret = ext4_walk_page_buffers(handle, page_buffers(page), from,
+ to, &partial, write_end_fn);
+ if (!partial)
+ SetPageUptodate(page);
+ }
new_i_size = pos + copied;
if (new_i_size > inode->i_size)
i_size_write(inode, pos+copied);
ext4_set_inode_state(inode, EXT4_STATE_JDATA);
+ EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid;
if (new_i_size > EXT4_I(inode)->i_disksize) {
ext4_update_i_disksize(inode, new_i_size);
ret2 = ext4_mark_inode_dirty(handle, inode);
@@ -1876,48 +1222,96 @@ static int ext4_journalled_write_end(struct file *file,
}
/*
- * Reserve a single block located at lblock
+ * Reserve a metadata for a single block located at lblock
*/
-static int ext4_da_reserve_space(struct inode *inode, sector_t lblock)
+static int ext4_da_reserve_metadata(struct inode *inode, ext4_lblk_t lblock)
{
- int retries = 0;
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
struct ext4_inode_info *ei = EXT4_I(inode);
- unsigned long md_needed;
- int ret;
+ unsigned int md_needed;
+ ext4_lblk_t save_last_lblock;
+ int save_len;
/*
* recalculate the amount of metadata blocks to reserve
* in order to allocate nrblocks
* worse case is one extent per block
*/
-repeat:
spin_lock(&ei->i_block_reservation_lock);
- md_needed = ext4_calc_metadata_amount(inode, lblock);
+ /*
+ * ext4_calc_metadata_amount() has side effects, which we have
+ * to be prepared undo if we fail to claim space.
+ */
+ save_len = ei->i_da_metadata_calc_len;
+ save_last_lblock = ei->i_da_metadata_calc_last_lblock;
+ md_needed = EXT4_NUM_B2C(sbi,
+ ext4_calc_metadata_amount(inode, lblock));
trace_ext4_da_reserve_space(inode, md_needed);
+
+ /*
+ * We do still charge estimated metadata to the sb though;
+ * we cannot afford to run out of free blocks.
+ */
+ if (ext4_claim_free_clusters(sbi, md_needed, 0)) {
+ ei->i_da_metadata_calc_len = save_len;
+ ei->i_da_metadata_calc_last_lblock = save_last_lblock;
+ spin_unlock(&ei->i_block_reservation_lock);
+ return -ENOSPC;
+ }
+ ei->i_reserved_meta_blocks += md_needed;
spin_unlock(&ei->i_block_reservation_lock);
+ return 0; /* success */
+}
+
+/*
+ * Reserve a single cluster located at lblock
+ */
+static int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ unsigned int md_needed;
+ int ret;
+ ext4_lblk_t save_last_lblock;
+ int save_len;
+
/*
* We will charge metadata quota at writeout time; this saves
* us from metadata over-estimation, though we may go over by
* a small amount in the end. Here we just reserve for data.
*/
- ret = dquot_reserve_block(inode, 1);
+ ret = dquot_reserve_block(inode, EXT4_C2B(sbi, 1));
if (ret)
return ret;
+
+ /*
+ * recalculate the amount of metadata blocks to reserve
+ * in order to allocate nrblocks
+ * worse case is one extent per block
+ */
+ spin_lock(&ei->i_block_reservation_lock);
+ /*
+ * ext4_calc_metadata_amount() has side effects, which we have
+ * to be prepared undo if we fail to claim space.
+ */
+ save_len = ei->i_da_metadata_calc_len;
+ save_last_lblock = ei->i_da_metadata_calc_last_lblock;
+ md_needed = EXT4_NUM_B2C(sbi,
+ ext4_calc_metadata_amount(inode, lblock));
+ trace_ext4_da_reserve_space(inode, md_needed);
+
/*
* We do still charge estimated metadata to the sb though;
* we cannot afford to run out of free blocks.
*/
- if (ext4_claim_free_blocks(sbi, md_needed + 1)) {
- dquot_release_reservation_block(inode, 1);
- if (ext4_should_retry_alloc(inode->i_sb, &retries)) {
- yield();
- goto repeat;
- }
+ if (ext4_claim_free_clusters(sbi, md_needed + 1, 0)) {
+ ei->i_da_metadata_calc_len = save_len;
+ ei->i_da_metadata_calc_last_lblock = save_last_lblock;
+ spin_unlock(&ei->i_block_reservation_lock);
+ dquot_release_reservation_block(inode, EXT4_C2B(sbi, 1));
return -ENOSPC;
}
- spin_lock(&ei->i_block_reservation_lock);
ei->i_reserved_data_blocks++;
ei->i_reserved_meta_blocks += md_needed;
spin_unlock(&ei->i_block_reservation_lock);
@@ -1943,9 +1337,9 @@ static void ext4_da_release_space(struct inode *inode, int to_free)
* function is called from invalidate page, it's
* harmless to return without any action.
*/
- ext4_msg(inode->i_sb, KERN_NOTICE, "ext4_da_release_space: "
+ ext4_warning(inode->i_sb, "ext4_da_release_space: "
"ino %lu, to_free %d with only %d reserved "
- "data blocks\n", inode->i_ino, to_free,
+ "data blocks", inode->i_ino, to_free,
ei->i_reserved_data_blocks);
WARN_ON(1);
to_free = ei->i_reserved_data_blocks;
@@ -1957,195 +1351,94 @@ static void ext4_da_release_space(struct inode *inode, int to_free)
* We can release all of the reserved metadata blocks
* only when we have written all of the delayed
* allocation blocks.
+ * Note that in case of bigalloc, i_reserved_meta_blocks,
+ * i_reserved_data_blocks, etc. refer to number of clusters.
*/
- percpu_counter_sub(&sbi->s_dirtyblocks_counter,
+ percpu_counter_sub(&sbi->s_dirtyclusters_counter,
ei->i_reserved_meta_blocks);
ei->i_reserved_meta_blocks = 0;
ei->i_da_metadata_calc_len = 0;
}
/* update fs dirty data blocks counter */
- percpu_counter_sub(&sbi->s_dirtyblocks_counter, to_free);
+ percpu_counter_sub(&sbi->s_dirtyclusters_counter, to_free);
spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
- dquot_release_reservation_block(inode, to_free);
+ dquot_release_reservation_block(inode, EXT4_C2B(sbi, to_free));
}
static void ext4_da_page_release_reservation(struct page *page,
- unsigned long offset)
+ unsigned int offset,
+ unsigned int length)
{
int to_release = 0;
struct buffer_head *head, *bh;
unsigned int curr_off = 0;
+ struct inode *inode = page->mapping->host;
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ unsigned int stop = offset + length;
+ int num_clusters;
+ ext4_fsblk_t lblk;
+
+ BUG_ON(stop > PAGE_CACHE_SIZE || stop < length);
head = page_buffers(page);
bh = head;
do {
unsigned int next_off = curr_off + bh->b_size;
+ if (next_off > stop)
+ break;
+
if ((offset <= curr_off) && (buffer_delay(bh))) {
to_release++;
clear_buffer_delay(bh);
}
curr_off = next_off;
} while ((bh = bh->b_this_page) != head);
- ext4_da_release_space(page->mapping->host, to_release);
+
+ if (to_release) {
+ lblk = page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
+ ext4_es_remove_extent(inode, lblk, to_release);
+ }
+
+ /* If we have released all the blocks belonging to a cluster, then we
+ * need to release the reserved space for that cluster. */
+ num_clusters = EXT4_NUM_B2C(sbi, to_release);
+ while (num_clusters > 0) {
+ lblk = (page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits)) +
+ ((num_clusters - 1) << sbi->s_cluster_bits);
+ if (sbi->s_cluster_ratio == 1 ||
+ !ext4_find_delalloc_cluster(inode, lblk))
+ ext4_da_release_space(inode, 1);
+
+ num_clusters--;
+ }
}
/*
* Delayed allocation stuff
*/
-/*
- * mpage_da_submit_io - walks through extent of pages and try to write
- * them with writepage() call back
- *
- * @mpd->inode: inode
- * @mpd->first_page: first page of the extent
- * @mpd->next_page: page after the last page of the extent
- *
- * By the time mpage_da_submit_io() is called we expect all blocks
- * to be allocated. this may be wrong if allocation failed.
- *
- * As pages are already locked by write_cache_pages(), we can't use it
- */
-static int mpage_da_submit_io(struct mpage_da_data *mpd,
- struct ext4_map_blocks *map)
-{
- struct pagevec pvec;
- unsigned long index, end;
- int ret = 0, err, nr_pages, i;
- struct inode *inode = mpd->inode;
- struct address_space *mapping = inode->i_mapping;
- loff_t size = i_size_read(inode);
- unsigned int len, block_start;
- struct buffer_head *bh, *page_bufs = NULL;
- int journal_data = ext4_should_journal_data(inode);
- sector_t pblock = 0, cur_logical = 0;
- struct ext4_io_submit io_submit;
+struct mpage_da_data {
+ struct inode *inode;
+ struct writeback_control *wbc;
- BUG_ON(mpd->next_page <= mpd->first_page);
- memset(&io_submit, 0, sizeof(io_submit));
+ pgoff_t first_page; /* The first page to write */
+ pgoff_t next_page; /* Current page to examine */
+ pgoff_t last_page; /* Last page to examine */
/*
- * We need to start from the first_page to the next_page - 1
- * to make sure we also write the mapped dirty buffer_heads.
- * If we look at mpd->b_blocknr we would only be looking
- * at the currently mapped buffer_heads.
+ * Extent to map - this can be after first_page because that can be
+ * fully mapped. We somewhat abuse m_flags to store whether the extent
+ * is delalloc or unwritten.
*/
- index = mpd->first_page;
- end = mpd->next_page - 1;
-
- pagevec_init(&pvec, 0);
- while (index <= end) {
- nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
- if (nr_pages == 0)
- break;
- for (i = 0; i < nr_pages; i++) {
- int commit_write = 0, redirty_page = 0;
- struct page *page = pvec.pages[i];
-
- index = page->index;
- if (index > end)
- break;
-
- if (index == size >> PAGE_CACHE_SHIFT)
- len = size & ~PAGE_CACHE_MASK;
- else
- len = PAGE_CACHE_SIZE;
- if (map) {
- cur_logical = index << (PAGE_CACHE_SHIFT -
- inode->i_blkbits);
- pblock = map->m_pblk + (cur_logical -
- map->m_lblk);
- }
- index++;
-
- BUG_ON(!PageLocked(page));
- BUG_ON(PageWriteback(page));
-
- /*
- * If the page does not have buffers (for
- * whatever reason), try to create them using
- * __block_write_begin. If this fails,
- * redirty the page and move on.
- */
- if (!page_has_buffers(page)) {
- if (__block_write_begin(page, 0, len,
- noalloc_get_block_write)) {
- redirty_page:
- redirty_page_for_writepage(mpd->wbc,
- page);
- unlock_page(page);
- continue;
- }
- commit_write = 1;
- }
-
- bh = page_bufs = page_buffers(page);
- block_start = 0;
- do {
- if (!bh)
- goto redirty_page;
- if (map && (cur_logical >= map->m_lblk) &&
- (cur_logical <= (map->m_lblk +
- (map->m_len - 1)))) {
- if (buffer_delay(bh)) {
- clear_buffer_delay(bh);
- bh->b_blocknr = pblock;
- }
- if (buffer_unwritten(bh) ||
- buffer_mapped(bh))
- BUG_ON(bh->b_blocknr != pblock);
- if (map->m_flags & EXT4_MAP_UNINIT)
- set_buffer_uninit(bh);
- clear_buffer_unwritten(bh);
- }
-
- /* redirty page if block allocation undone */
- if (buffer_delay(bh) || buffer_unwritten(bh))
- redirty_page = 1;
- bh = bh->b_this_page;
- block_start += bh->b_size;
- cur_logical++;
- pblock++;
- } while (bh != page_bufs);
-
- if (redirty_page)
- goto redirty_page;
-
- if (commit_write)
- /* mark the buffer_heads as dirty & uptodate */
- block_commit_write(page, 0, len);
-
- /*
- * Delalloc doesn't support data journalling,
- * but eventually maybe we'll lift this
- * restriction.
- */
- if (unlikely(journal_data && PageChecked(page)))
- err = __ext4_journalled_writepage(page, len);
- else
- err = ext4_bio_write_page(&io_submit, page,
- len, mpd->wbc);
-
- if (!err)
- mpd->pages_written++;
- /*
- * In error case, we have to continue because
- * remaining pages are still locked
- */
- if (ret == 0)
- ret = err;
- }
- pagevec_release(&pvec);
- }
- ext4_io_submit(&io_submit);
- return ret;
-}
+ struct ext4_map_blocks map;
+ struct ext4_io_submit io_submit; /* IO submission data */
+};
-static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd,
- sector_t logical, long blk_cnt)
+static void mpage_release_unused_pages(struct mpage_da_data *mpd,
+ bool invalidate)
{
int nr_pages, i;
pgoff_t index, end;
@@ -2153,9 +1446,20 @@ static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd,
struct inode *inode = mpd->inode;
struct address_space *mapping = inode->i_mapping;
- index = logical >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
- end = (logical + blk_cnt - 1) >>
- (PAGE_CACHE_SHIFT - inode->i_blkbits);
+ /* This is necessary when next_page == 0. */
+ if (mpd->first_page >= mpd->next_page)
+ return;
+
+ index = mpd->first_page;
+ end = mpd->next_page - 1;
+ if (invalidate) {
+ ext4_lblk_t start, last;
+ start = index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
+ last = end << (PAGE_CACHE_SHIFT - inode->i_blkbits);
+ ext4_es_remove_extent(inode, start, last - start + 1);
+ }
+
+ pagevec_init(&pvec, 0);
while (index <= end) {
nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
if (nr_pages == 0)
@@ -2166,350 +1470,204 @@ static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd,
break;
BUG_ON(!PageLocked(page));
BUG_ON(PageWriteback(page));
- block_invalidatepage(page, 0);
- ClearPageUptodate(page);
+ if (invalidate) {
+ block_invalidatepage(page, 0, PAGE_CACHE_SIZE);
+ ClearPageUptodate(page);
+ }
unlock_page(page);
}
index = pvec.pages[nr_pages - 1]->index + 1;
pagevec_release(&pvec);
}
- return;
}
static void ext4_print_free_blocks(struct inode *inode)
{
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
- printk(KERN_CRIT "Total free blocks count %lld\n",
- ext4_count_free_blocks(inode->i_sb));
- printk(KERN_CRIT "Free/Dirty block details\n");
- printk(KERN_CRIT "free_blocks=%lld\n",
- (long long) percpu_counter_sum(&sbi->s_freeblocks_counter));
- printk(KERN_CRIT "dirty_blocks=%lld\n",
- (long long) percpu_counter_sum(&sbi->s_dirtyblocks_counter));
- printk(KERN_CRIT "Block reservation details\n");
- printk(KERN_CRIT "i_reserved_data_blocks=%u\n",
- EXT4_I(inode)->i_reserved_data_blocks);
- printk(KERN_CRIT "i_reserved_meta_blocks=%u\n",
- EXT4_I(inode)->i_reserved_meta_blocks);
+ struct super_block *sb = inode->i_sb;
+ struct ext4_inode_info *ei = EXT4_I(inode);
+
+ ext4_msg(sb, KERN_CRIT, "Total free blocks count %lld",
+ EXT4_C2B(EXT4_SB(inode->i_sb),
+ ext4_count_free_clusters(sb)));
+ ext4_msg(sb, KERN_CRIT, "Free/Dirty block details");
+ ext4_msg(sb, KERN_CRIT, "free_blocks=%lld",
+ (long long) EXT4_C2B(EXT4_SB(sb),
+ percpu_counter_sum(&sbi->s_freeclusters_counter)));
+ ext4_msg(sb, KERN_CRIT, "dirty_blocks=%lld",
+ (long long) EXT4_C2B(EXT4_SB(sb),
+ percpu_counter_sum(&sbi->s_dirtyclusters_counter)));
+ ext4_msg(sb, KERN_CRIT, "Block reservation details");
+ ext4_msg(sb, KERN_CRIT, "i_reserved_data_blocks=%u",
+ ei->i_reserved_data_blocks);
+ ext4_msg(sb, KERN_CRIT, "i_reserved_meta_blocks=%u",
+ ei->i_reserved_meta_blocks);
+ ext4_msg(sb, KERN_CRIT, "i_allocated_meta_blocks=%u",
+ ei->i_allocated_meta_blocks);
return;
}
+static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh)
+{
+ return (buffer_delay(bh) || buffer_unwritten(bh)) && buffer_dirty(bh);
+}
+
/*
- * mpage_da_map_and_submit - go through given space, map them
- * if necessary, and then submit them for I/O
- *
- * @mpd - bh describing space
- *
- * The function skips space we know is already mapped to disk blocks.
- *
+ * This function is grabs code from the very beginning of
+ * ext4_map_blocks, but assumes that the caller is from delayed write
+ * time. This function looks up the requested blocks and sets the
+ * buffer delay bit under the protection of i_data_sem.
*/
-static void mpage_da_map_and_submit(struct mpage_da_data *mpd)
+static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
+ struct ext4_map_blocks *map,
+ struct buffer_head *bh)
{
- int err, blks, get_blocks_flags;
- struct ext4_map_blocks map, *mapp = NULL;
- sector_t next = mpd->b_blocknr;
- unsigned max_blocks = mpd->b_size >> mpd->inode->i_blkbits;
- loff_t disksize = EXT4_I(mpd->inode)->i_disksize;
- handle_t *handle = NULL;
-
- /*
- * If the blocks are mapped already, or we couldn't accumulate
- * any blocks, then proceed immediately to the submission stage.
- */
- if ((mpd->b_size == 0) ||
- ((mpd->b_state & (1 << BH_Mapped)) &&
- !(mpd->b_state & (1 << BH_Delay)) &&
- !(mpd->b_state & (1 << BH_Unwritten))))
- goto submit_io;
-
- handle = ext4_journal_current_handle();
- BUG_ON(!handle);
+ struct extent_status es;
+ int retval;
+ sector_t invalid_block = ~((sector_t) 0xffff);
+#ifdef ES_AGGRESSIVE_TEST
+ struct ext4_map_blocks orig_map;
- /*
- * Call ext4_map_blocks() to allocate any delayed allocation
- * blocks, or to convert an uninitialized extent to be
- * initialized (in the case where we have written into
- * one or more preallocated blocks).
- *
- * We pass in the magic EXT4_GET_BLOCKS_DELALLOC_RESERVE to
- * indicate that we are on the delayed allocation path. This
- * affects functions in many different parts of the allocation
- * call path. This flag exists primarily because we don't
- * want to change *many* call functions, so ext4_map_blocks()
- * will set the magic i_delalloc_reserved_flag once the
- * inode's allocation semaphore is taken.
- *
- * If the blocks in questions were delalloc blocks, set
- * EXT4_GET_BLOCKS_DELALLOC_RESERVE so the delalloc accounting
- * variables are updated after the blocks have been allocated.
- */
- map.m_lblk = next;
- map.m_len = max_blocks;
- get_blocks_flags = EXT4_GET_BLOCKS_CREATE;
- if (ext4_should_dioread_nolock(mpd->inode))
- get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT;
- if (mpd->b_state & (1 << BH_Delay))
- get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE;
+ memcpy(&orig_map, map, sizeof(*map));
+#endif
- blks = ext4_map_blocks(handle, mpd->inode, &map, get_blocks_flags);
- if (blks < 0) {
- struct super_block *sb = mpd->inode->i_sb;
+ if (invalid_block < ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es))
+ invalid_block = ~0;
- err = blks;
- /*
- * If get block returns EAGAIN or ENOSPC and there
- * appears to be free blocks we will call
- * ext4_writepage() for all of the pages which will
- * just redirty the pages.
- */
- if (err == -EAGAIN)
- goto submit_io;
+ map->m_flags = 0;
+ ext_debug("ext4_da_map_blocks(): inode %lu, max_blocks %u,"
+ "logical block %lu\n", inode->i_ino, map->m_len,
+ (unsigned long) map->m_lblk);
- if (err == -ENOSPC &&
- ext4_count_free_blocks(sb)) {
- mpd->retval = err;
- goto submit_io;
+ /* Lookup extent status tree firstly */
+ if (ext4_es_lookup_extent(inode, iblock, &es)) {
+ ext4_es_lru_add(inode);
+ if (ext4_es_is_hole(&es)) {
+ retval = 0;
+ down_read(&EXT4_I(inode)->i_data_sem);
+ goto add_delayed;
}
/*
- * get block failure will cause us to loop in
- * writepages, because a_ops->writepage won't be able
- * to make progress. The page will be redirtied by
- * writepage and writepages will again try to write
- * the same.
+ * Delayed extent could be allocated by fallocate.
+ * So we need to check it.
*/
- if (!(EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)) {
- ext4_msg(sb, KERN_CRIT,
- "delayed block allocation failed for inode %lu "
- "at logical offset %llu with max blocks %zd "
- "with error %d", mpd->inode->i_ino,
- (unsigned long long) next,
- mpd->b_size >> mpd->inode->i_blkbits, err);
- ext4_msg(sb, KERN_CRIT,
- "This should not happen!! Data will be lost\n");
- if (err == -ENOSPC)
- ext4_print_free_blocks(mpd->inode);
+ if (ext4_es_is_delayed(&es) && !ext4_es_is_unwritten(&es)) {
+ map_bh(bh, inode->i_sb, invalid_block);
+ set_buffer_new(bh);
+ set_buffer_delay(bh);
+ return 0;
}
- /* invalidate all the pages */
- ext4_da_block_invalidatepages(mpd, next,
- mpd->b_size >> mpd->inode->i_blkbits);
- return;
- }
- BUG_ON(blks == 0);
-
- mapp = &map;
- if (map.m_flags & EXT4_MAP_NEW) {
- struct block_device *bdev = mpd->inode->i_sb->s_bdev;
- int i;
-
- for (i = 0; i < map.m_len; i++)
- unmap_underlying_metadata(bdev, map.m_pblk + i);
- }
-
- if (ext4_should_order_data(mpd->inode)) {
- err = ext4_jbd2_file_inode(handle, mpd->inode);
- if (err)
- /* This only happens if the journal is aborted */
- return;
- }
-
- /*
- * Update on-disk size along with block allocation.
- */
- disksize = ((loff_t) next + blks) << mpd->inode->i_blkbits;
- if (disksize > i_size_read(mpd->inode))
- disksize = i_size_read(mpd->inode);
- if (disksize > EXT4_I(mpd->inode)->i_disksize) {
- ext4_update_i_disksize(mpd->inode, disksize);
- err = ext4_mark_inode_dirty(handle, mpd->inode);
- if (err)
- ext4_error(mpd->inode->i_sb,
- "Failed to mark inode %lu dirty",
- mpd->inode->i_ino);
- }
-submit_io:
- mpage_da_submit_io(mpd, mapp);
- mpd->io_done = 1;
-}
-
-#define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | \
- (1 << BH_Delay) | (1 << BH_Unwritten))
-
-/*
- * mpage_add_bh_to_extent - try to add one more block to extent of blocks
- *
- * @mpd->lbh - extent of blocks
- * @logical - logical number of the block in the file
- * @bh - bh of the block (used to access block's state)
- *
- * the function is used to collect contig. blocks in same state
- */
-static void mpage_add_bh_to_extent(struct mpage_da_data *mpd,
- sector_t logical, size_t b_size,
- unsigned long b_state)
-{
- sector_t next;
- int nrblocks = mpd->b_size >> mpd->inode->i_blkbits;
-
- /*
- * XXX Don't go larger than mballoc is willing to allocate
- * This is a stopgap solution. We eventually need to fold
- * mpage_da_submit_io() into this function and then call
- * ext4_map_blocks() multiple times in a loop
- */
- if (nrblocks >= 8*1024*1024/mpd->inode->i_sb->s_blocksize)
- goto flush_it;
-
- /* check if thereserved journal credits might overflow */
- if (!(ext4_test_inode_flag(mpd->inode, EXT4_INODE_EXTENTS))) {
- if (nrblocks >= EXT4_MAX_TRANS_DATA) {
- /*
- * With non-extent format we are limited by the journal
- * credit available. Total credit needed to insert
- * nrblocks contiguous blocks is dependent on the
- * nrblocks. So limit nrblocks.
- */
- goto flush_it;
- } else if ((nrblocks + (b_size >> mpd->inode->i_blkbits)) >
- EXT4_MAX_TRANS_DATA) {
- /*
- * Adding the new buffer_head would make it cross the
- * allowed limit for which we have journal credit
- * reserved. So limit the new bh->b_size
- */
- b_size = (EXT4_MAX_TRANS_DATA - nrblocks) <<
- mpd->inode->i_blkbits;
- /* we will do mpage_da_submit_io in the next loop */
- }
- }
- /*
- * First block in the extent
- */
- if (mpd->b_size == 0) {
- mpd->b_blocknr = logical;
- mpd->b_size = b_size;
- mpd->b_state = b_state & BH_FLAGS;
- return;
- }
+ map->m_pblk = ext4_es_pblock(&es) + iblock - es.es_lblk;
+ retval = es.es_len - (iblock - es.es_lblk);
+ if (retval > map->m_len)
+ retval = map->m_len;
+ map->m_len = retval;
+ if (ext4_es_is_written(&es))
+ map->m_flags |= EXT4_MAP_MAPPED;
+ else if (ext4_es_is_unwritten(&es))
+ map->m_flags |= EXT4_MAP_UNWRITTEN;
+ else
+ BUG_ON(1);
- next = mpd->b_blocknr + nrblocks;
- /*
- * Can we merge the block to our big extent?
- */
- if (logical == next && (b_state & BH_FLAGS) == mpd->b_state) {
- mpd->b_size += b_size;
- return;
+#ifdef ES_AGGRESSIVE_TEST
+ ext4_map_blocks_es_recheck(NULL, inode, map, &orig_map, 0);
+#endif
+ return retval;
}
-flush_it:
- /*
- * We couldn't merge the block to our extent, so we
- * need to flush current extent and start new one
- */
- mpage_da_map_and_submit(mpd);
- return;
-}
-
-static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh)
-{
- return (buffer_delay(bh) || buffer_unwritten(bh)) && buffer_dirty(bh);
-}
-
-/*
- * __mpage_da_writepage - finds extent of pages and blocks
- *
- * @page: page to consider
- * @wbc: not used, we just follow rules
- * @data: context
- *
- * The function finds extents of pages and scan them for all blocks.
- */
-static int __mpage_da_writepage(struct page *page,
- struct writeback_control *wbc,
- struct mpage_da_data *mpd)
-{
- struct inode *inode = mpd->inode;
- struct buffer_head *bh, *head;
- sector_t logical;
-
/*
- * Can we merge this page to current extent?
+ * Try to see if we can get the block without requesting a new
+ * file system block.
*/
- if (mpd->next_page != page->index) {
+ down_read(&EXT4_I(inode)->i_data_sem);
+ if (ext4_has_inline_data(inode)) {
/*
- * Nope, we can't. So, we map non-allocated blocks
- * and start IO on them
+ * We will soon create blocks for this page, and let
+ * us pretend as if the blocks aren't allocated yet.
+ * In case of clusters, we have to handle the work
+ * of mapping from cluster so that the reserved space
+ * is calculated properly.
*/
- if (mpd->next_page != mpd->first_page) {
- mpage_da_map_and_submit(mpd);
- /*
- * skip rest of the page in the page_vec
- */
- redirty_page_for_writepage(wbc, page);
- unlock_page(page);
- return MPAGE_DA_EXTENT_TAIL;
- }
+ if ((EXT4_SB(inode->i_sb)->s_cluster_ratio > 1) &&
+ ext4_find_delalloc_cluster(inode, map->m_lblk))
+ map->m_flags |= EXT4_MAP_FROM_CLUSTER;
+ retval = 0;
+ } else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
+ retval = ext4_ext_map_blocks(NULL, inode, map,
+ EXT4_GET_BLOCKS_NO_PUT_HOLE);
+ else
+ retval = ext4_ind_map_blocks(NULL, inode, map,
+ EXT4_GET_BLOCKS_NO_PUT_HOLE);
+add_delayed:
+ if (retval == 0) {
+ int ret;
/*
- * Start next extent of pages ...
+ * XXX: __block_prepare_write() unmaps passed block,
+ * is it OK?
*/
- mpd->first_page = page->index;
-
/*
- * ... and blocks
+ * If the block was allocated from previously allocated cluster,
+ * then we don't need to reserve it again. However we still need
+ * to reserve metadata for every block we're going to write.
*/
- mpd->b_size = 0;
- mpd->b_state = 0;
- mpd->b_blocknr = 0;
- }
+ if (!(map->m_flags & EXT4_MAP_FROM_CLUSTER)) {
+ ret = ext4_da_reserve_space(inode, iblock);
+ if (ret) {
+ /* not enough space to reserve */
+ retval = ret;
+ goto out_unlock;
+ }
+ } else {
+ ret = ext4_da_reserve_metadata(inode, iblock);
+ if (ret) {
+ /* not enough space to reserve */
+ retval = ret;
+ goto out_unlock;
+ }
+ }
- mpd->next_page = page->index + 1;
- logical = (sector_t) page->index <<
- (PAGE_CACHE_SHIFT - inode->i_blkbits);
+ ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
+ ~0, EXTENT_STATUS_DELAYED);
+ if (ret) {
+ retval = ret;
+ goto out_unlock;
+ }
- if (!page_has_buffers(page)) {
- mpage_add_bh_to_extent(mpd, logical, PAGE_CACHE_SIZE,
- (1 << BH_Dirty) | (1 << BH_Uptodate));
- if (mpd->io_done)
- return MPAGE_DA_EXTENT_TAIL;
- } else {
- /*
- * Page with regular buffer heads, just add all dirty ones
+ /* Clear EXT4_MAP_FROM_CLUSTER flag since its purpose is served
+ * and it should not appear on the bh->b_state.
*/
- head = page_buffers(page);
- bh = head;
- do {
- BUG_ON(buffer_locked(bh));
- /*
- * We need to try to allocate
- * unmapped blocks in the same page.
- * Otherwise we won't make progress
- * with the page in ext4_writepage
- */
- if (ext4_bh_delay_or_unwritten(NULL, bh)) {
- mpage_add_bh_to_extent(mpd, logical,
- bh->b_size,
- bh->b_state);
- if (mpd->io_done)
- return MPAGE_DA_EXTENT_TAIL;
- } else if (buffer_dirty(bh) && (buffer_mapped(bh))) {
- /*
- * mapped dirty buffer. We need to update
- * the b_state because we look at
- * b_state in mpage_da_map_blocks. We don't
- * update b_size because if we find an
- * unmapped buffer_head later we need to
- * use the b_state flag of that buffer_head.
- */
- if (mpd->b_size == 0)
- mpd->b_state = bh->b_state & BH_FLAGS;
- }
- logical++;
- } while ((bh = bh->b_this_page) != head);
+ map->m_flags &= ~EXT4_MAP_FROM_CLUSTER;
+
+ map_bh(bh, inode->i_sb, invalid_block);
+ set_buffer_new(bh);
+ set_buffer_delay(bh);
+ } else if (retval > 0) {
+ int ret;
+ unsigned int status;
+
+ if (unlikely(retval != map->m_len)) {
+ ext4_warning(inode->i_sb,
+ "ES len assertion failed for inode "
+ "%lu: retval %d != map->m_len %d",
+ inode->i_ino, retval, map->m_len);
+ WARN_ON(1);
+ }
+
+ status = map->m_flags & EXT4_MAP_UNWRITTEN ?
+ EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
+ ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
+ map->m_pblk, status);
+ if (ret != 0)
+ retval = ret;
}
- return 0;
+out_unlock:
+ up_read((&EXT4_I(inode)->i_data_sem));
+
+ return retval;
}
/*
@@ -2524,15 +1682,11 @@ static int __mpage_da_writepage(struct page *page,
* We also have b_blocknr = physicalblock mapping unwritten extent and b_bdev
* initialized properly.
*/
-static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
- struct buffer_head *bh, int create)
+int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh, int create)
{
struct ext4_map_blocks map;
int ret = 0;
- sector_t invalid_block = ~((sector_t) 0xffff);
-
- if (invalid_block < ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es))
- invalid_block = ~0;
BUG_ON(create == 0);
BUG_ON(bh->b_size != inode->i_sb->s_blocksize);
@@ -2545,25 +1699,9 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
* preallocated blocks are unmapped but should treated
* the same as allocated blocks.
*/
- ret = ext4_map_blocks(NULL, inode, &map, 0);
- if (ret < 0)
+ ret = ext4_da_map_blocks(inode, iblock, &map, bh);
+ if (ret <= 0)
return ret;
- if (ret == 0) {
- if (buffer_delay(bh))
- return 0; /* Not sure this could or should happen */
- /*
- * XXX: __block_write_begin() unmaps passed block, is it OK?
- */
- ret = ext4_da_reserve_space(inode, iblock);
- if (ret)
- /* not enough space to reserve */
- return ret;
-
- map_bh(bh, inode->i_sb, invalid_block);
- set_buffer_new(bh);
- set_buffer_delay(bh);
- return 0;
- }
map_bh(bh, inode->i_sb, map.m_pblk);
bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags;
@@ -2581,27 +1719,6 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
return 0;
}
-/*
- * This function is used as a standard get_block_t calback function
- * when there is no desire to allocate any blocks. It is used as a
- * callback function for block_write_begin() and block_write_full_page().
- * These functions should only try to map a single block at a time.
- *
- * Since this function doesn't do block allocations even if the caller
- * requests it by passing in create=1, it is critically important that
- * any caller checks to make sure that any buffer heads are returned
- * by this function are either all already mapped or marked for
- * delayed allocation before calling block_write_full_page(). Otherwise,
- * b_blocknr could be left unitialized, and the page write functions will
- * be taken by surprise.
- */
-static int noalloc_get_block_write(struct inode *inode, sector_t iblock,
- struct buffer_head *bh_result, int create)
-{
- BUG_ON(bh_result->b_size != inode->i_sb->s_blocksize);
- return _ext4_get_block(inode, iblock, bh_result, 0);
-}
-
static int bget_one(handle_t *handle, struct buffer_head *bh)
{
get_bh(bh);
@@ -2619,59 +1736,85 @@ static int __ext4_journalled_writepage(struct page *page,
{
struct address_space *mapping = page->mapping;
struct inode *inode = mapping->host;
- struct buffer_head *page_bufs;
+ struct buffer_head *page_bufs = NULL;
handle_t *handle = NULL;
- int ret = 0;
- int err;
+ int ret = 0, err = 0;
+ int inline_data = ext4_has_inline_data(inode);
+ struct buffer_head *inode_bh = NULL;
ClearPageChecked(page);
- page_bufs = page_buffers(page);
- BUG_ON(!page_bufs);
- walk_page_buffers(handle, page_bufs, 0, len, NULL, bget_one);
+
+ if (inline_data) {
+ BUG_ON(page->index != 0);
+ BUG_ON(len > ext4_get_max_inline_size(inode));
+ inode_bh = ext4_journalled_write_inline_data(inode, len, page);
+ if (inode_bh == NULL)
+ goto out;
+ } else {
+ page_bufs = page_buffers(page);
+ if (!page_bufs) {
+ BUG();
+ goto out;
+ }
+ ext4_walk_page_buffers(handle, page_bufs, 0, len,
+ NULL, bget_one);
+ }
/* As soon as we unlock the page, it can go away, but we have
* references to buffers so we are safe */
unlock_page(page);
- handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode));
+ handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE,
+ ext4_writepage_trans_blocks(inode));
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
goto out;
}
- ret = walk_page_buffers(handle, page_bufs, 0, len, NULL,
- do_journal_get_write_access);
+ BUG_ON(!ext4_handle_valid(handle));
+
+ if (inline_data) {
+ BUFFER_TRACE(inode_bh, "get write access");
+ ret = ext4_journal_get_write_access(handle, inode_bh);
+
+ err = ext4_handle_dirty_metadata(handle, inode, inode_bh);
+
+ } else {
+ ret = ext4_walk_page_buffers(handle, page_bufs, 0, len, NULL,
+ do_journal_get_write_access);
- err = walk_page_buffers(handle, page_bufs, 0, len, NULL,
- write_end_fn);
+ err = ext4_walk_page_buffers(handle, page_bufs, 0, len, NULL,
+ write_end_fn);
+ }
if (ret == 0)
ret = err;
+ EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid;
err = ext4_journal_stop(handle);
if (!ret)
ret = err;
- walk_page_buffers(handle, page_bufs, 0, len, NULL, bput_one);
+ if (!ext4_has_inline_data(inode))
+ ext4_walk_page_buffers(NULL, page_bufs, 0, len,
+ NULL, bput_one);
ext4_set_inode_state(inode, EXT4_STATE_JDATA);
out:
+ brelse(inode_bh);
return ret;
}
-static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode);
-static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate);
-
/*
* Note that we don't need to start a transaction unless we're journaling data
* because we should have holes filled from ext4_page_mkwrite(). We even don't
* need to file the inode to the transaction's list in ordered mode because if
* we are writing back data added by write(), the inode is already there and if
- * we are writing back data modified via mmap(), noone guarantees in which
+ * we are writing back data modified via mmap(), no one guarantees in which
* transaction the data will hit the disk. In case we are journaling data, we
* cannot start transaction directly because transaction start ranks above page
* lock so we have to do some magic.
*
* This function can get called via...
- * - ext4_da_writepages after taking page lock (have journal handle)
+ * - ext4_writepages after taking page lock (have journal handle)
* - journal_submit_inode_data_buffers (no journal handle)
- * - shrink_page_list via pdflush (no journal handle)
+ * - shrink_page_list via the kswapd/direct reclaim (no journal handle)
* - grab_page_cache when doing write_begin (have journal handle)
*
* We don't do any block allocation in this function. If we have page with
@@ -2682,7 +1825,7 @@ static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate);
* a[0] = 'a';
* truncate(f, 4096);
* we have in the page first buffer_head mapped via page_mkwrite call back
- * but other bufer_heads would be unmapped but dirty(dirty done via the
+ * but other buffer_heads would be unmapped but dirty (dirty done via the
* do_wp_page). So writepage should write the first block. If we modify
* the mmap area beyond 1024 we will again get a page_fault and the
* page_mkwrite callback will do the block allocation and mark the
@@ -2702,48 +1845,45 @@ static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate);
static int ext4_writepage(struct page *page,
struct writeback_control *wbc)
{
- int ret = 0, commit_write = 0;
+ int ret = 0;
loff_t size;
unsigned int len;
struct buffer_head *page_bufs = NULL;
struct inode *inode = page->mapping->host;
+ struct ext4_io_submit io_submit;
+ bool keep_towrite = false;
- trace_ext4_writepage(inode, page);
+ trace_ext4_writepage(page);
size = i_size_read(inode);
if (page->index == size >> PAGE_CACHE_SHIFT)
len = size & ~PAGE_CACHE_MASK;
else
len = PAGE_CACHE_SIZE;
+ page_bufs = page_buffers(page);
/*
- * If the page does not have buffers (for whatever reason),
- * try to create them using __block_write_begin. If this
- * fails, redirty the page and move on.
+ * We cannot do block allocation or other extent handling in this
+ * function. If there are buffers needing that, we have to redirty
+ * the page. But we may reach here when we do a journal commit via
+ * journal_submit_inode_data_buffers() and in that case we must write
+ * allocated buffers to achieve data=ordered mode guarantees.
*/
- if (!page_has_buffers(page)) {
- if (__block_write_begin(page, 0, len,
- noalloc_get_block_write)) {
- redirty_page:
- redirty_page_for_writepage(wbc, page);
+ if (ext4_walk_page_buffers(NULL, page_bufs, 0, len, NULL,
+ ext4_bh_delay_or_unwritten)) {
+ redirty_page_for_writepage(wbc, page);
+ if (current->flags & PF_MEMALLOC) {
+ /*
+ * For memory cleaning there's no point in writing only
+ * some buffers. So just bail out. Warn if we came here
+ * from direct reclaim.
+ */
+ WARN_ON_ONCE((current->flags & (PF_MEMALLOC|PF_KSWAPD))
+ == PF_MEMALLOC);
unlock_page(page);
return 0;
}
- commit_write = 1;
+ keep_towrite = true;
}
- page_bufs = page_buffers(page);
- if (walk_page_buffers(NULL, page_bufs, 0, len, NULL,
- ext4_bh_delay_or_unwritten)) {
- /*
- * We don't want to do block allocation, so redirty
- * the page and return. We may reach here when we do
- * a journal commit via journal_submit_inode_data_buffers.
- * We can also reach here via shrink_page_list
- */
- goto redirty_page;
- }
- if (commit_write)
- /* now mark the buffer_heads as dirty and uptodate */
- block_commit_write(page, 0, len);
if (PageChecked(page) && ext4_should_journal_data(inode))
/*
@@ -2752,82 +1892,456 @@ static int ext4_writepage(struct page *page,
*/
return __ext4_journalled_writepage(page, len);
- if (buffer_uninit(page_bufs)) {
- ext4_set_bh_endio(page_bufs, inode);
- ret = block_write_full_page_endio(page, noalloc_get_block_write,
- wbc, ext4_end_io_buffer_write);
- } else
- ret = block_write_full_page(page, noalloc_get_block_write,
- wbc);
-
+ ext4_io_submit_init(&io_submit, wbc);
+ io_submit.io_end = ext4_init_io_end(inode, GFP_NOFS);
+ if (!io_submit.io_end) {
+ redirty_page_for_writepage(wbc, page);
+ unlock_page(page);
+ return -ENOMEM;
+ }
+ ret = ext4_bio_write_page(&io_submit, page, len, wbc, keep_towrite);
+ ext4_io_submit(&io_submit);
+ /* Drop io_end reference we got from init */
+ ext4_put_io_end_defer(io_submit.io_end);
return ret;
}
+static int mpage_submit_page(struct mpage_da_data *mpd, struct page *page)
+{
+ int len;
+ loff_t size = i_size_read(mpd->inode);
+ int err;
+
+ BUG_ON(page->index != mpd->first_page);
+ if (page->index == size >> PAGE_CACHE_SHIFT)
+ len = size & ~PAGE_CACHE_MASK;
+ else
+ len = PAGE_CACHE_SIZE;
+ clear_page_dirty_for_io(page);
+ err = ext4_bio_write_page(&mpd->io_submit, page, len, mpd->wbc, false);
+ if (!err)
+ mpd->wbc->nr_to_write--;
+ mpd->first_page++;
+
+ return err;
+}
+
+#define BH_FLAGS ((1 << BH_Unwritten) | (1 << BH_Delay))
+
/*
- * This is called via ext4_da_writepages() to
- * calulate the total number of credits to reserve to fit
- * a single extent allocation into a single transaction,
- * ext4_da_writpeages() will loop calling this before
- * the block allocation.
+ * mballoc gives us at most this number of blocks...
+ * XXX: That seems to be only a limitation of ext4_mb_normalize_request().
+ * The rest of mballoc seems to handle chunks up to full group size.
*/
+#define MAX_WRITEPAGES_EXTENT_LEN 2048
-static int ext4_da_writepages_trans_blocks(struct inode *inode)
+/*
+ * mpage_add_bh_to_extent - try to add bh to extent of blocks to map
+ *
+ * @mpd - extent of blocks
+ * @lblk - logical number of the block in the file
+ * @bh - buffer head we want to add to the extent
+ *
+ * The function is used to collect contig. blocks in the same state. If the
+ * buffer doesn't require mapping for writeback and we haven't started the
+ * extent of buffers to map yet, the function returns 'true' immediately - the
+ * caller can write the buffer right away. Otherwise the function returns true
+ * if the block has been added to the extent, false if the block couldn't be
+ * added.
+ */
+static bool mpage_add_bh_to_extent(struct mpage_da_data *mpd, ext4_lblk_t lblk,
+ struct buffer_head *bh)
{
- int max_blocks = EXT4_I(inode)->i_reserved_data_blocks;
+ struct ext4_map_blocks *map = &mpd->map;
+
+ /* Buffer that doesn't need mapping for writeback? */
+ if (!buffer_dirty(bh) || !buffer_mapped(bh) ||
+ (!buffer_delay(bh) && !buffer_unwritten(bh))) {
+ /* So far no extent to map => we write the buffer right away */
+ if (map->m_len == 0)
+ return true;
+ return false;
+ }
+
+ /* First block in the extent? */
+ if (map->m_len == 0) {
+ map->m_lblk = lblk;
+ map->m_len = 1;
+ map->m_flags = bh->b_state & BH_FLAGS;
+ return true;
+ }
+
+ /* Don't go larger than mballoc is willing to allocate */
+ if (map->m_len >= MAX_WRITEPAGES_EXTENT_LEN)
+ return false;
+ /* Can we merge the block to our big extent? */
+ if (lblk == map->m_lblk + map->m_len &&
+ (bh->b_state & BH_FLAGS) == map->m_flags) {
+ map->m_len++;
+ return true;
+ }
+ return false;
+}
+
+/*
+ * mpage_process_page_bufs - submit page buffers for IO or add them to extent
+ *
+ * @mpd - extent of blocks for mapping
+ * @head - the first buffer in the page
+ * @bh - buffer we should start processing from
+ * @lblk - logical number of the block in the file corresponding to @bh
+ *
+ * Walk through page buffers from @bh upto @head (exclusive) and either submit
+ * the page for IO if all buffers in this page were mapped and there's no
+ * accumulated extent of buffers to map or add buffers in the page to the
+ * extent of buffers to map. The function returns 1 if the caller can continue
+ * by processing the next page, 0 if it should stop adding buffers to the
+ * extent to map because we cannot extend it anymore. It can also return value
+ * < 0 in case of error during IO submission.
+ */
+static int mpage_process_page_bufs(struct mpage_da_data *mpd,
+ struct buffer_head *head,
+ struct buffer_head *bh,
+ ext4_lblk_t lblk)
+{
+ struct inode *inode = mpd->inode;
+ int err;
+ ext4_lblk_t blocks = (i_size_read(inode) + (1 << inode->i_blkbits) - 1)
+ >> inode->i_blkbits;
+
+ do {
+ BUG_ON(buffer_locked(bh));
+
+ if (lblk >= blocks || !mpage_add_bh_to_extent(mpd, lblk, bh)) {
+ /* Found extent to map? */
+ if (mpd->map.m_len)
+ return 0;
+ /* Everything mapped so far and we hit EOF */
+ break;
+ }
+ } while (lblk++, (bh = bh->b_this_page) != head);
+ /* So far everything mapped? Submit the page for IO. */
+ if (mpd->map.m_len == 0) {
+ err = mpage_submit_page(mpd, head->b_page);
+ if (err < 0)
+ return err;
+ }
+ return lblk < blocks;
+}
+
+/*
+ * mpage_map_buffers - update buffers corresponding to changed extent and
+ * submit fully mapped pages for IO
+ *
+ * @mpd - description of extent to map, on return next extent to map
+ *
+ * Scan buffers corresponding to changed extent (we expect corresponding pages
+ * to be already locked) and update buffer state according to new extent state.
+ * We map delalloc buffers to their physical location, clear unwritten bits,
+ * and mark buffers as uninit when we perform writes to unwritten extents
+ * and do extent conversion after IO is finished. If the last page is not fully
+ * mapped, we update @map to the next extent in the last page that needs
+ * mapping. Otherwise we submit the page for IO.
+ */
+static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd)
+{
+ struct pagevec pvec;
+ int nr_pages, i;
+ struct inode *inode = mpd->inode;
+ struct buffer_head *head, *bh;
+ int bpp_bits = PAGE_CACHE_SHIFT - inode->i_blkbits;
+ pgoff_t start, end;
+ ext4_lblk_t lblk;
+ sector_t pblock;
+ int err;
+
+ start = mpd->map.m_lblk >> bpp_bits;
+ end = (mpd->map.m_lblk + mpd->map.m_len - 1) >> bpp_bits;
+ lblk = start << bpp_bits;
+ pblock = mpd->map.m_pblk;
+
+ pagevec_init(&pvec, 0);
+ while (start <= end) {
+ nr_pages = pagevec_lookup(&pvec, inode->i_mapping, start,
+ PAGEVEC_SIZE);
+ if (nr_pages == 0)
+ break;
+ for (i = 0; i < nr_pages; i++) {
+ struct page *page = pvec.pages[i];
+
+ if (page->index > end)
+ break;
+ /* Up to 'end' pages must be contiguous */
+ BUG_ON(page->index != start);
+ bh = head = page_buffers(page);
+ do {
+ if (lblk < mpd->map.m_lblk)
+ continue;
+ if (lblk >= mpd->map.m_lblk + mpd->map.m_len) {
+ /*
+ * Buffer after end of mapped extent.
+ * Find next buffer in the page to map.
+ */
+ mpd->map.m_len = 0;
+ mpd->map.m_flags = 0;
+ /*
+ * FIXME: If dioread_nolock supports
+ * blocksize < pagesize, we need to make
+ * sure we add size mapped so far to
+ * io_end->size as the following call
+ * can submit the page for IO.
+ */
+ err = mpage_process_page_bufs(mpd, head,
+ bh, lblk);
+ pagevec_release(&pvec);
+ if (err > 0)
+ err = 0;
+ return err;
+ }
+ if (buffer_delay(bh)) {
+ clear_buffer_delay(bh);
+ bh->b_blocknr = pblock++;
+ }
+ clear_buffer_unwritten(bh);
+ } while (lblk++, (bh = bh->b_this_page) != head);
+
+ /*
+ * FIXME: This is going to break if dioread_nolock
+ * supports blocksize < pagesize as we will try to
+ * convert potentially unmapped parts of inode.
+ */
+ mpd->io_submit.io_end->size += PAGE_CACHE_SIZE;
+ /* Page fully mapped - let IO run! */
+ err = mpage_submit_page(mpd, page);
+ if (err < 0) {
+ pagevec_release(&pvec);
+ return err;
+ }
+ start++;
+ }
+ pagevec_release(&pvec);
+ }
+ /* Extent fully mapped and matches with page boundary. We are done. */
+ mpd->map.m_len = 0;
+ mpd->map.m_flags = 0;
+ return 0;
+}
+
+static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd)
+{
+ struct inode *inode = mpd->inode;
+ struct ext4_map_blocks *map = &mpd->map;
+ int get_blocks_flags;
+ int err, dioread_nolock;
+
+ trace_ext4_da_write_pages_extent(inode, map);
/*
- * With non-extent format the journal credit needed to
- * insert nrblocks contiguous block is dependent on
- * number of contiguous block. So we will limit
- * number of contiguous block to a sane value
+ * Call ext4_map_blocks() to allocate any delayed allocation blocks, or
+ * to convert an unwritten extent to be initialized (in the case
+ * where we have written into one or more preallocated blocks). It is
+ * possible that we're going to need more metadata blocks than
+ * previously reserved. However we must not fail because we're in
+ * writeback and there is nothing we can do about it so it might result
+ * in data loss. So use reserved blocks to allocate metadata if
+ * possible.
+ *
+ * We pass in the magic EXT4_GET_BLOCKS_DELALLOC_RESERVE if the blocks
+ * in question are delalloc blocks. This affects functions in many
+ * different parts of the allocation call path. This flag exists
+ * primarily because we don't want to change *many* call functions, so
+ * ext4_map_blocks() will set the EXT4_STATE_DELALLOC_RESERVED flag
+ * once the inode's allocation semaphore is taken.
*/
- if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) &&
- (max_blocks > EXT4_MAX_TRANS_DATA))
- max_blocks = EXT4_MAX_TRANS_DATA;
+ get_blocks_flags = EXT4_GET_BLOCKS_CREATE |
+ EXT4_GET_BLOCKS_METADATA_NOFAIL;
+ dioread_nolock = ext4_should_dioread_nolock(inode);
+ if (dioread_nolock)
+ get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT;
+ if (map->m_flags & (1 << BH_Delay))
+ get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE;
+
+ err = ext4_map_blocks(handle, inode, map, get_blocks_flags);
+ if (err < 0)
+ return err;
+ if (dioread_nolock && (map->m_flags & EXT4_MAP_UNWRITTEN)) {
+ if (!mpd->io_submit.io_end->handle &&
+ ext4_handle_valid(handle)) {
+ mpd->io_submit.io_end->handle = handle->h_rsv_handle;
+ handle->h_rsv_handle = NULL;
+ }
+ ext4_set_io_unwritten_flag(inode, mpd->io_submit.io_end);
+ }
+
+ BUG_ON(map->m_len == 0);
+ if (map->m_flags & EXT4_MAP_NEW) {
+ struct block_device *bdev = inode->i_sb->s_bdev;
+ int i;
- return ext4_chunk_trans_blocks(inode, max_blocks);
+ for (i = 0; i < map->m_len; i++)
+ unmap_underlying_metadata(bdev, map->m_pblk + i);
+ }
+ return 0;
}
/*
- * write_cache_pages_da - walk the list of dirty pages of the given
- * address space and call the callback function (which usually writes
- * the pages).
+ * mpage_map_and_submit_extent - map extent starting at mpd->lblk of length
+ * mpd->len and submit pages underlying it for IO
*
- * This is a forked version of write_cache_pages(). Differences:
- * Range cyclic is ignored.
- * no_nrwrite_index_update is always presumed true
+ * @handle - handle for journal operations
+ * @mpd - extent to map
+ * @give_up_on_write - we set this to true iff there is a fatal error and there
+ * is no hope of writing the data. The caller should discard
+ * dirty pages to avoid infinite loops.
+ *
+ * The function maps extent starting at mpd->lblk of length mpd->len. If it is
+ * delayed, blocks are allocated, if it is unwritten, we may need to convert
+ * them to initialized or split the described range from larger unwritten
+ * extent. Note that we need not map all the described range since allocation
+ * can return less blocks or the range is covered by more unwritten extents. We
+ * cannot map more because we are limited by reserved transaction credits. On
+ * the other hand we always make sure that the last touched page is fully
+ * mapped so that it can be written out (and thus forward progress is
+ * guaranteed). After mapping we submit all mapped pages for IO.
*/
-static int write_cache_pages_da(struct address_space *mapping,
- struct writeback_control *wbc,
- struct mpage_da_data *mpd,
- pgoff_t *done_index)
+static int mpage_map_and_submit_extent(handle_t *handle,
+ struct mpage_da_data *mpd,
+ bool *give_up_on_write)
{
- int ret = 0;
- int done = 0;
+ struct inode *inode = mpd->inode;
+ struct ext4_map_blocks *map = &mpd->map;
+ int err;
+ loff_t disksize;
+
+ mpd->io_submit.io_end->offset =
+ ((loff_t)map->m_lblk) << inode->i_blkbits;
+ do {
+ err = mpage_map_one_extent(handle, mpd);
+ if (err < 0) {
+ struct super_block *sb = inode->i_sb;
+
+ if (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)
+ goto invalidate_dirty_pages;
+ /*
+ * Let the uper layers retry transient errors.
+ * In the case of ENOSPC, if ext4_count_free_blocks()
+ * is non-zero, a commit should free up blocks.
+ */
+ if ((err == -ENOMEM) ||
+ (err == -ENOSPC && ext4_count_free_clusters(sb)))
+ return err;
+ ext4_msg(sb, KERN_CRIT,
+ "Delayed block allocation failed for "
+ "inode %lu at logical offset %llu with"
+ " max blocks %u with error %d",
+ inode->i_ino,
+ (unsigned long long)map->m_lblk,
+ (unsigned)map->m_len, -err);
+ ext4_msg(sb, KERN_CRIT,
+ "This should not happen!! Data will "
+ "be lost\n");
+ if (err == -ENOSPC)
+ ext4_print_free_blocks(inode);
+ invalidate_dirty_pages:
+ *give_up_on_write = true;
+ return err;
+ }
+ /*
+ * Update buffer state, submit mapped pages, and get us new
+ * extent to map
+ */
+ err = mpage_map_and_submit_buffers(mpd);
+ if (err < 0)
+ return err;
+ } while (map->m_len);
+
+ /*
+ * Update on-disk size after IO is submitted. Races with
+ * truncate are avoided by checking i_size under i_data_sem.
+ */
+ disksize = ((loff_t)mpd->first_page) << PAGE_CACHE_SHIFT;
+ if (disksize > EXT4_I(inode)->i_disksize) {
+ int err2;
+ loff_t i_size;
+
+ down_write(&EXT4_I(inode)->i_data_sem);
+ i_size = i_size_read(inode);
+ if (disksize > i_size)
+ disksize = i_size;
+ if (disksize > EXT4_I(inode)->i_disksize)
+ EXT4_I(inode)->i_disksize = disksize;
+ err2 = ext4_mark_inode_dirty(handle, inode);
+ up_write(&EXT4_I(inode)->i_data_sem);
+ if (err2)
+ ext4_error(inode->i_sb,
+ "Failed to mark inode %lu dirty",
+ inode->i_ino);
+ if (!err)
+ err = err2;
+ }
+ return err;
+}
+
+/*
+ * Calculate the total number of credits to reserve for one writepages
+ * iteration. This is called from ext4_writepages(). We map an extent of
+ * up to MAX_WRITEPAGES_EXTENT_LEN blocks and then we go on and finish mapping
+ * the last partial page. So in total we can map MAX_WRITEPAGES_EXTENT_LEN +
+ * bpp - 1 blocks in bpp different extents.
+ */
+static int ext4_da_writepages_trans_blocks(struct inode *inode)
+{
+ int bpp = ext4_journal_blocks_per_page(inode);
+
+ return ext4_meta_trans_blocks(inode,
+ MAX_WRITEPAGES_EXTENT_LEN + bpp - 1, bpp);
+}
+
+/*
+ * mpage_prepare_extent_to_map - find & lock contiguous range of dirty pages
+ * and underlying extent to map
+ *
+ * @mpd - where to look for pages
+ *
+ * Walk dirty pages in the mapping. If they are fully mapped, submit them for
+ * IO immediately. When we find a page which isn't mapped we start accumulating
+ * extent of buffers underlying these pages that needs mapping (formed by
+ * either delayed or unwritten buffers). We also lock the pages containing
+ * these buffers. The extent found is returned in @mpd structure (starting at
+ * mpd->lblk with length mpd->len blocks).
+ *
+ * Note that this function can attach bios to one io_end structure which are
+ * neither logically nor physically contiguous. Although it may seem as an
+ * unnecessary complication, it is actually inevitable in blocksize < pagesize
+ * case as we need to track IO to all buffers underlying a page in one io_end.
+ */
+static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd)
+{
+ struct address_space *mapping = mpd->inode->i_mapping;
struct pagevec pvec;
- unsigned nr_pages;
- pgoff_t index;
- pgoff_t end; /* Inclusive */
- long nr_to_write = wbc->nr_to_write;
+ unsigned int nr_pages;
+ long left = mpd->wbc->nr_to_write;
+ pgoff_t index = mpd->first_page;
+ pgoff_t end = mpd->last_page;
int tag;
+ int i, err = 0;
+ int blkbits = mpd->inode->i_blkbits;
+ ext4_lblk_t lblk;
+ struct buffer_head *head;
- pagevec_init(&pvec, 0);
- index = wbc->range_start >> PAGE_CACHE_SHIFT;
- end = wbc->range_end >> PAGE_CACHE_SHIFT;
-
- if (wbc->sync_mode == WB_SYNC_ALL)
+ if (mpd->wbc->sync_mode == WB_SYNC_ALL || mpd->wbc->tagged_writepages)
tag = PAGECACHE_TAG_TOWRITE;
else
tag = PAGECACHE_TAG_DIRTY;
- *done_index = index;
- while (!done && (index <= end)) {
- int i;
-
+ pagevec_init(&pvec, 0);
+ mpd->map.m_len = 0;
+ mpd->next_page = index;
+ while (index <= end) {
nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
if (nr_pages == 0)
- break;
+ goto out;
for (i = 0; i < nr_pages; i++) {
struct page *page = pvec.pages[i];
@@ -2839,102 +2353,91 @@ static int write_cache_pages_da(struct address_space *mapping,
* mapping. However, page->index will not change
* because we have a reference on the page.
*/
- if (page->index > end) {
- done = 1;
- break;
- }
+ if (page->index > end)
+ goto out;
- *done_index = page->index + 1;
+ /*
+ * Accumulated enough dirty pages? This doesn't apply
+ * to WB_SYNC_ALL mode. For integrity sync we have to
+ * keep going because someone may be concurrently
+ * dirtying pages, and we might have synced a lot of
+ * newly appeared dirty pages, but have not synced all
+ * of the old dirty pages.
+ */
+ if (mpd->wbc->sync_mode == WB_SYNC_NONE && left <= 0)
+ goto out;
- lock_page(page);
+ /* If we can't merge this page, we are done. */
+ if (mpd->map.m_len > 0 && mpd->next_page != page->index)
+ goto out;
+ lock_page(page);
/*
- * Page truncated or invalidated. We can freely skip it
- * then, even for data integrity operations: the page
- * has disappeared concurrently, so there could be no
- * real expectation of this data interity operation
- * even if there is now a new, dirty page at the same
- * pagecache address.
+ * If the page is no longer dirty, or its mapping no
+ * longer corresponds to inode we are writing (which
+ * means it has been truncated or invalidated), or the
+ * page is already under writeback and we are not doing
+ * a data integrity writeback, skip the page
*/
- if (unlikely(page->mapping != mapping)) {
-continue_unlock:
+ if (!PageDirty(page) ||
+ (PageWriteback(page) &&
+ (mpd->wbc->sync_mode == WB_SYNC_NONE)) ||
+ unlikely(page->mapping != mapping)) {
unlock_page(page);
continue;
}
- if (!PageDirty(page)) {
- /* someone wrote it for us */
- goto continue_unlock;
- }
-
- if (PageWriteback(page)) {
- if (wbc->sync_mode != WB_SYNC_NONE)
- wait_on_page_writeback(page);
- else
- goto continue_unlock;
- }
-
+ wait_on_page_writeback(page);
BUG_ON(PageWriteback(page));
- if (!clear_page_dirty_for_io(page))
- goto continue_unlock;
-
- ret = __mpage_da_writepage(page, wbc, mpd);
- if (unlikely(ret)) {
- if (ret == AOP_WRITEPAGE_ACTIVATE) {
- unlock_page(page);
- ret = 0;
- } else {
- done = 1;
- break;
- }
- }
- if (nr_to_write > 0) {
- nr_to_write--;
- if (nr_to_write == 0 &&
- wbc->sync_mode == WB_SYNC_NONE) {
- /*
- * We stop writing back only if we are
- * not doing integrity sync. In case of
- * integrity sync we have to keep going
- * because someone may be concurrently
- * dirtying pages, and we might have
- * synced a lot of newly appeared dirty
- * pages, but have not synced all of the
- * old dirty pages.
- */
- done = 1;
- break;
- }
- }
+ if (mpd->map.m_len == 0)
+ mpd->first_page = page->index;
+ mpd->next_page = page->index + 1;
+ /* Add all dirty buffers to mpd */
+ lblk = ((ext4_lblk_t)page->index) <<
+ (PAGE_CACHE_SHIFT - blkbits);
+ head = page_buffers(page);
+ err = mpage_process_page_bufs(mpd, head, head, lblk);
+ if (err <= 0)
+ goto out;
+ err = 0;
+ left--;
}
pagevec_release(&pvec);
cond_resched();
}
- return ret;
+ return 0;
+out:
+ pagevec_release(&pvec);
+ return err;
}
+static int __writepage(struct page *page, struct writeback_control *wbc,
+ void *data)
+{
+ struct address_space *mapping = data;
+ int ret = ext4_writepage(page, wbc);
+ mapping_set_error(mapping, ret);
+ return ret;
+}
-static int ext4_da_writepages(struct address_space *mapping,
- struct writeback_control *wbc)
+static int ext4_writepages(struct address_space *mapping,
+ struct writeback_control *wbc)
{
- pgoff_t index;
+ pgoff_t writeback_index = 0;
+ long nr_to_write = wbc->nr_to_write;
int range_whole = 0;
+ int cycled = 1;
handle_t *handle = NULL;
struct mpage_da_data mpd;
struct inode *inode = mapping->host;
- int pages_written = 0;
- long pages_skipped;
- unsigned int max_pages;
- int range_cyclic, cycled = 1, io_done = 0;
- int needed_blocks, ret = 0;
- long desired_nr_to_write, nr_to_writebump = 0;
- loff_t range_start = wbc->range_start;
+ int needed_blocks, rsv_blocks = 0, ret = 0;
struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
- pgoff_t done_index = 0;
- pgoff_t end;
+ bool done;
+ struct blk_plug plug;
+ bool give_up_on_write = false;
- trace_ext4_da_writepages(inode, wbc);
+ trace_ext4_writepages(inode, wbc);
/*
* No pages to write? This is mainly a kludge to avoid starting
@@ -2942,7 +2445,16 @@ static int ext4_da_writepages(struct address_space *mapping,
* because that could violate lock ordering on umount
*/
if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
- return 0;
+ goto out_writepages;
+
+ if (ext4_should_journal_data(inode)) {
+ struct blk_plug plug;
+
+ blk_start_plug(&plug);
+ ret = write_cache_pages(mapping, wbc, __writepage, mapping);
+ blk_finish_plug(&plug);
+ goto out_writepages;
+ }
/*
* If the filesystem has aborted, it is read-only, so return
@@ -2950,183 +2462,157 @@ static int ext4_da_writepages(struct address_space *mapping,
* will obscure the real source of the problem. We test
* EXT4_MF_FS_ABORTED instead of sb->s_flag's MS_RDONLY because
* the latter could be true if the filesystem is mounted
- * read-only, and in that case, ext4_da_writepages should
+ * read-only, and in that case, ext4_writepages should
* *never* be called, so if that ever happens, we would want
* the stack trace.
*/
- if (unlikely(sbi->s_mount_flags & EXT4_MF_FS_ABORTED))
- return -EROFS;
+ if (unlikely(sbi->s_mount_flags & EXT4_MF_FS_ABORTED)) {
+ ret = -EROFS;
+ goto out_writepages;
+ }
+
+ if (ext4_should_dioread_nolock(inode)) {
+ /*
+ * We may need to convert up to one extent per block in
+ * the page and we may dirty the inode.
+ */
+ rsv_blocks = 1 + (PAGE_CACHE_SIZE >> inode->i_blkbits);
+ }
+
+ /*
+ * If we have inline data and arrive here, it means that
+ * we will soon create the block for the 1st page, so
+ * we'd better clear the inline data here.
+ */
+ if (ext4_has_inline_data(inode)) {
+ /* Just inode will be modified... */
+ handle = ext4_journal_start(inode, EXT4_HT_INODE, 1);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ goto out_writepages;
+ }
+ BUG_ON(ext4_test_inode_state(inode,
+ EXT4_STATE_MAY_INLINE_DATA));
+ ext4_destroy_inline_data(handle, inode);
+ ext4_journal_stop(handle);
+ }
if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
range_whole = 1;
- range_cyclic = wbc->range_cyclic;
if (wbc->range_cyclic) {
- index = mapping->writeback_index;
- if (index)
+ writeback_index = mapping->writeback_index;
+ if (writeback_index)
cycled = 0;
- wbc->range_start = index << PAGE_CACHE_SHIFT;
- wbc->range_end = LLONG_MAX;
- wbc->range_cyclic = 0;
- end = -1;
+ mpd.first_page = writeback_index;
+ mpd.last_page = -1;
} else {
- index = wbc->range_start >> PAGE_CACHE_SHIFT;
- end = wbc->range_end >> PAGE_CACHE_SHIFT;
- }
-
- /*
- * This works around two forms of stupidity. The first is in
- * the writeback code, which caps the maximum number of pages
- * written to be 1024 pages. This is wrong on multiple
- * levels; different architectues have a different page size,
- * which changes the maximum amount of data which gets
- * written. Secondly, 4 megabytes is way too small. XFS
- * forces this value to be 16 megabytes by multiplying
- * nr_to_write parameter by four, and then relies on its
- * allocator to allocate larger extents to make them
- * contiguous. Unfortunately this brings us to the second
- * stupidity, which is that ext4's mballoc code only allocates
- * at most 2048 blocks. So we force contiguous writes up to
- * the number of dirty blocks in the inode, or
- * sbi->max_writeback_mb_bump whichever is smaller.
- */
- max_pages = sbi->s_max_writeback_mb_bump << (20 - PAGE_CACHE_SHIFT);
- if (!range_cyclic && range_whole) {
- if (wbc->nr_to_write == LONG_MAX)
- desired_nr_to_write = wbc->nr_to_write;
- else
- desired_nr_to_write = wbc->nr_to_write * 8;
- } else
- desired_nr_to_write = ext4_num_dirty_pages(inode, index,
- max_pages);
- if (desired_nr_to_write > max_pages)
- desired_nr_to_write = max_pages;
-
- if (wbc->nr_to_write < desired_nr_to_write) {
- nr_to_writebump = desired_nr_to_write - wbc->nr_to_write;
- wbc->nr_to_write = desired_nr_to_write;
+ mpd.first_page = wbc->range_start >> PAGE_CACHE_SHIFT;
+ mpd.last_page = wbc->range_end >> PAGE_CACHE_SHIFT;
}
+ mpd.inode = inode;
mpd.wbc = wbc;
- mpd.inode = mapping->host;
-
- pages_skipped = wbc->pages_skipped;
-
+ ext4_io_submit_init(&mpd.io_submit, wbc);
retry:
- if (wbc->sync_mode == WB_SYNC_ALL)
- tag_pages_for_writeback(mapping, index, end);
-
- while (!ret && wbc->nr_to_write > 0) {
+ if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
+ tag_pages_for_writeback(mapping, mpd.first_page, mpd.last_page);
+ done = false;
+ blk_start_plug(&plug);
+ while (!done && mpd.first_page <= mpd.last_page) {
+ /* For each extent of pages we use new io_end */
+ mpd.io_submit.io_end = ext4_init_io_end(inode, GFP_KERNEL);
+ if (!mpd.io_submit.io_end) {
+ ret = -ENOMEM;
+ break;
+ }
/*
- * we insert one extent at a time. So we need
- * credit needed for single extent allocation.
- * journalled mode is currently not supported
- * by delalloc
+ * We have two constraints: We find one extent to map and we
+ * must always write out whole page (makes a difference when
+ * blocksize < pagesize) so that we don't block on IO when we
+ * try to write out the rest of the page. Journalled mode is
+ * not supported by delalloc.
*/
BUG_ON(ext4_should_journal_data(inode));
needed_blocks = ext4_da_writepages_trans_blocks(inode);
- /* start a new transaction*/
- handle = ext4_journal_start(inode, needed_blocks);
+ /* start a new transaction */
+ handle = ext4_journal_start_with_reserve(inode,
+ EXT4_HT_WRITE_PAGE, needed_blocks, rsv_blocks);
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: "
"%ld pages, ino %lu; err %d", __func__,
wbc->nr_to_write, inode->i_ino, ret);
- goto out_writepages;
+ /* Release allocated io_end */
+ ext4_put_io_end(mpd.io_submit.io_end);
+ break;
}
- /*
- * Now call __mpage_da_writepage to find the next
- * contiguous region of logical blocks that need
- * blocks to be allocated by ext4. We don't actually
- * submit the blocks for I/O here, even though
- * write_cache_pages thinks it will, and will set the
- * pages as clean for write before calling
- * __mpage_da_writepage().
- */
- mpd.b_size = 0;
- mpd.b_state = 0;
- mpd.b_blocknr = 0;
- mpd.first_page = 0;
- mpd.next_page = 0;
- mpd.io_done = 0;
- mpd.pages_written = 0;
- mpd.retval = 0;
- ret = write_cache_pages_da(mapping, wbc, &mpd, &done_index);
- /*
- * If we have a contiguous extent of pages and we
- * haven't done the I/O yet, map the blocks and submit
- * them for I/O.
- */
- if (!mpd.io_done && mpd.next_page != mpd.first_page) {
- mpage_da_map_and_submit(&mpd);
- ret = MPAGE_DA_EXTENT_TAIL;
+ trace_ext4_da_write_pages(inode, mpd.first_page, mpd.wbc);
+ ret = mpage_prepare_extent_to_map(&mpd);
+ if (!ret) {
+ if (mpd.map.m_len)
+ ret = mpage_map_and_submit_extent(handle, &mpd,
+ &give_up_on_write);
+ else {
+ /*
+ * We scanned the whole range (or exhausted
+ * nr_to_write), submitted what was mapped and
+ * didn't find anything needing mapping. We are
+ * done.
+ */
+ done = true;
+ }
}
- trace_ext4_da_write_pages(inode, &mpd);
- wbc->nr_to_write -= mpd.pages_written;
-
ext4_journal_stop(handle);
-
- if ((mpd.retval == -ENOSPC) && sbi->s_journal) {
- /* commit the transaction which would
+ /* Submit prepared bio */
+ ext4_io_submit(&mpd.io_submit);
+ /* Unlock pages we didn't use */
+ mpage_release_unused_pages(&mpd, give_up_on_write);
+ /* Drop our io_end reference we got from init */
+ ext4_put_io_end(mpd.io_submit.io_end);
+
+ if (ret == -ENOSPC && sbi->s_journal) {
+ /*
+ * Commit the transaction which would
* free blocks released in the transaction
* and try again
*/
jbd2_journal_force_commit_nested(sbi->s_journal);
- wbc->pages_skipped = pages_skipped;
ret = 0;
- } else if (ret == MPAGE_DA_EXTENT_TAIL) {
- /*
- * got one extent now try with
- * rest of the pages
- */
- pages_written += mpd.pages_written;
- wbc->pages_skipped = pages_skipped;
- ret = 0;
- io_done = 1;
- } else if (wbc->nr_to_write)
- /*
- * There is no more writeout needed
- * or we requested for a noblocking writeout
- * and we found the device congested
- */
+ continue;
+ }
+ /* Fatal error - ENOMEM, EIO... */
+ if (ret)
break;
}
- if (!io_done && !cycled) {
+ blk_finish_plug(&plug);
+ if (!ret && !cycled && wbc->nr_to_write > 0) {
cycled = 1;
- index = 0;
- wbc->range_start = index << PAGE_CACHE_SHIFT;
- wbc->range_end = mapping->writeback_index - 1;
+ mpd.last_page = writeback_index - 1;
+ mpd.first_page = 0;
goto retry;
}
- if (pages_skipped != wbc->pages_skipped)
- ext4_msg(inode->i_sb, KERN_CRIT,
- "This should not happen leaving %s "
- "with nr_to_write = %ld ret = %d",
- __func__, wbc->nr_to_write, ret);
/* Update index */
- wbc->range_cyclic = range_cyclic;
if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
/*
- * set the writeback_index so that range_cyclic
+ * Set the writeback_index so that range_cyclic
* mode will write it back later
*/
- mapping->writeback_index = done_index;
+ mapping->writeback_index = mpd.first_page;
out_writepages:
- wbc->nr_to_write -= nr_to_writebump;
- wbc->range_start = range_start;
- trace_ext4_da_writepages_result(inode, wbc, ret, pages_written);
+ trace_ext4_writepages_result(inode, wbc, ret,
+ nr_to_write - wbc->nr_to_write);
return ret;
}
-#define FALL_BACK_TO_NONDELALLOC 1
static int ext4_nonda_switch(struct super_block *sb)
{
- s64 free_blocks, dirty_blocks;
+ s64 free_clusters, dirty_clusters;
struct ext4_sb_info *sbi = EXT4_SB(sb);
/*
@@ -3137,23 +2623,24 @@ static int ext4_nonda_switch(struct super_block *sb)
* Delalloc need an accurate free block accounting. So switch
* to non delalloc when we are near to error range.
*/
- free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter);
- dirty_blocks = percpu_counter_read_positive(&sbi->s_dirtyblocks_counter);
- if (2 * free_blocks < 3 * dirty_blocks ||
- free_blocks < (dirty_blocks + EXT4_FREEBLOCKS_WATERMARK)) {
+ free_clusters =
+ percpu_counter_read_positive(&sbi->s_freeclusters_counter);
+ dirty_clusters =
+ percpu_counter_read_positive(&sbi->s_dirtyclusters_counter);
+ /*
+ * Start pushing delalloc when 1/2 of free blocks are dirty.
+ */
+ if (dirty_clusters && (free_clusters < 2 * dirty_clusters))
+ try_to_writeback_inodes_sb(sb, WB_REASON_FS_FREE_SPACE);
+
+ if (2 * free_clusters < 3 * dirty_clusters ||
+ free_clusters < (dirty_clusters + EXT4_FREECLUSTERS_WATERMARK)) {
/*
* free block count is less than 150% of dirty blocks
* or free blocks is less than watermark
*/
return 1;
}
- /*
- * Even if we don't switch but are nearing capacity,
- * start pushing delalloc when 1/2 of free blocks are dirty.
- */
- if (free_blocks < 2 * dirty_blocks)
- writeback_inodes_sb_if_idle(sb);
-
return 0;
}
@@ -3176,35 +2663,58 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
}
*fsdata = (void *)0;
trace_ext4_da_write_begin(inode, pos, len, flags);
-retry:
+
+ if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) {
+ ret = ext4_da_write_inline_data_begin(mapping, inode,
+ pos, len, flags,
+ pagep, fsdata);
+ if (ret < 0)
+ return ret;
+ if (ret == 1)
+ return 0;
+ }
+
+ /*
+ * grab_cache_page_write_begin() can take a long time if the
+ * system is thrashing due to memory pressure, or if the page
+ * is being written back. So grab it first before we start
+ * the transaction handle. This also allows us to allocate
+ * the page (if needed) without using GFP_NOFS.
+ */
+retry_grab:
+ page = grab_cache_page_write_begin(mapping, index, flags);
+ if (!page)
+ return -ENOMEM;
+ unlock_page(page);
+
/*
* With delayed allocation, we don't log the i_disksize update
* if there is delayed block allocation. But we still need
* to journalling the i_disksize update if writes to the end
* of file which has an already mapped buffer.
*/
- handle = ext4_journal_start(inode, 1);
+retry_journal:
+ handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, 1);
if (IS_ERR(handle)) {
- ret = PTR_ERR(handle);
- goto out;
+ page_cache_release(page);
+ return PTR_ERR(handle);
}
- /* We cannot recurse into the filesystem as the transaction is already
- * started */
- flags |= AOP_FLAG_NOFS;
- page = grab_cache_page_write_begin(mapping, index, flags);
- if (!page) {
+ lock_page(page);
+ if (page->mapping != mapping) {
+ /* The page got truncated from under us */
+ unlock_page(page);
+ page_cache_release(page);
ext4_journal_stop(handle);
- ret = -ENOMEM;
- goto out;
+ goto retry_grab;
}
- *pagep = page;
+ /* In case writeback began while the page was unlocked */
+ wait_for_stable_page(page);
ret = __block_write_begin(page, pos, len, ext4_da_get_block_prep);
if (ret < 0) {
unlock_page(page);
ext4_journal_stop(handle);
- page_cache_release(page);
/*
* block_write_begin may have instantiated a few blocks
* outside i_size. Trim these off again. Don't need
@@ -3212,11 +2722,16 @@ retry:
*/
if (pos + len > inode->i_size)
ext4_truncate_failed_write(inode);
+
+ if (ret == -ENOSPC &&
+ ext4_should_retry_alloc(inode->i_sb, &retries))
+ goto retry_journal;
+
+ page_cache_release(page);
+ return ret;
}
- if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
- goto retry;
-out:
+ *pagep = page;
return ret;
}
@@ -3255,17 +2770,9 @@ static int ext4_da_write_end(struct file *file,
unsigned long start, end;
int write_mode = (int)(unsigned long)fsdata;
- if (write_mode == FALL_BACK_TO_NONDELALLOC) {
- if (ext4_should_order_data(inode)) {
- return ext4_ordered_write_end(file, mapping, pos,
- len, copied, page, fsdata);
- } else if (ext4_should_writeback_data(inode)) {
- return ext4_writeback_write_end(file, mapping, pos,
- len, copied, page, fsdata);
- } else {
- BUG();
- }
- }
+ if (write_mode == FALL_BACK_TO_NONDELALLOC)
+ return ext4_write_end(file, mapping, pos,
+ len, copied, page, fsdata);
trace_ext4_da_write_end(inode, pos, len, copied);
start = pos & (PAGE_CACHE_SIZE - 1);
@@ -3276,22 +2783,13 @@ static int ext4_da_write_end(struct file *file,
* changes. So let's piggyback the i_disksize mark_inode_dirty
* into that.
*/
-
new_i_size = pos + copied;
- if (new_i_size > EXT4_I(inode)->i_disksize) {
- if (ext4_da_should_update_i_disksize(page, end)) {
+ if (copied && new_i_size > EXT4_I(inode)->i_disksize) {
+ if (ext4_has_inline_data(inode) ||
+ ext4_da_should_update_i_disksize(page, end)) {
down_write(&EXT4_I(inode)->i_data_sem);
- if (new_i_size > EXT4_I(inode)->i_disksize) {
- /*
- * Updating i_disksize when extending file
- * without needing block allocation
- */
- if (ext4_should_order_data(inode))
- ret = ext4_jbd2_file_inode(handle,
- inode);
-
+ if (new_i_size > EXT4_I(inode)->i_disksize)
EXT4_I(inode)->i_disksize = new_i_size;
- }
up_write(&EXT4_I(inode)->i_data_sem);
/* We need to mark inode dirty even if
* new_i_size is less that inode->i_size
@@ -3300,8 +2798,16 @@ static int ext4_da_write_end(struct file *file,
ext4_mark_inode_dirty(handle, inode);
}
}
- ret2 = generic_write_end(file, mapping, pos, len, copied,
+
+ if (write_mode != CONVERT_INLINE_DATA &&
+ ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA) &&
+ ext4_has_inline_data(inode))
+ ret2 = ext4_da_write_inline_data_end(inode, pos, len, copied,
+ page);
+ else
+ ret2 = generic_write_end(file, mapping, pos, len, copied,
page, fsdata);
+
copied = ret2;
if (ret2 < 0)
ret = ret2;
@@ -3312,7 +2818,8 @@ static int ext4_da_write_end(struct file *file,
return ret ? ret : copied;
}
-static void ext4_da_invalidatepage(struct page *page, unsigned long offset)
+static void ext4_da_invalidatepage(struct page *page, unsigned int offset,
+ unsigned int length)
{
/*
* Drop reserved blocks
@@ -3321,10 +2828,10 @@ static void ext4_da_invalidatepage(struct page *page, unsigned long offset)
if (!page_has_buffers(page))
goto out;
- ext4_da_page_release_reservation(page, offset);
+ ext4_da_page_release_reservation(page, offset, length);
out:
- ext4_invalidatepage(page, offset);
+ ext4_invalidatepage(page, offset, length);
return;
}
@@ -3347,7 +2854,7 @@ int ext4_alloc_da_blocks(struct inode *inode)
* laptop_mode, not even desirable). However, to do otherwise
* would require replicating code paths in:
*
- * ext4_da_writepages() ->
+ * ext4_writepages() ->
* write_cache_pages() ---> (via passed in callback function)
* __mpage_da_writepage() -->
* mpage_add_bh_to_extent()
@@ -3359,10 +2866,10 @@ int ext4_alloc_da_blocks(struct inode *inode)
* doing I/O at all.
*
* We could call write_cache_pages(), and then redirty all of
- * the pages by calling redirty_page_for_writeback() but that
+ * the pages by calling redirty_page_for_writepage() but that
* would be ugly in the extreme. So instead we would need to
* replicate parts of the code in the above functions,
- * simplifying them becuase we wouldn't actually intend to
+ * simplifying them because we wouldn't actually intend to
* write out the pages, but rather only collect contiguous
* logical block extents, call the multi-block allocator, and
* then update the buffer heads with the block allocations.
@@ -3394,6 +2901,12 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
journal_t *journal;
int err;
+ /*
+ * We can get here for an inline file via the FIBMAP ioctl
+ */
+ if (ext4_has_inline_data(inode))
+ return 0;
+
if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) &&
test_opt(inode->i_sb, DELALLOC)) {
/*
@@ -3439,63 +2952,77 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
static int ext4_readpage(struct file *file, struct page *page)
{
- return mpage_readpage(page, ext4_get_block);
+ int ret = -EAGAIN;
+ struct inode *inode = page->mapping->host;
+
+ trace_ext4_readpage(page);
+
+ if (ext4_has_inline_data(inode))
+ ret = ext4_readpage_inline(inode, page);
+
+ if (ret == -EAGAIN)
+ return mpage_readpage(page, ext4_get_block);
+
+ return ret;
}
static int
ext4_readpages(struct file *file, struct address_space *mapping,
struct list_head *pages, unsigned nr_pages)
{
+ struct inode *inode = mapping->host;
+
+ /* If the file has inline data, no need to do readpages. */
+ if (ext4_has_inline_data(inode))
+ return 0;
+
return mpage_readpages(mapping, pages, nr_pages, ext4_get_block);
}
-static void ext4_invalidatepage_free_endio(struct page *page, unsigned long offset)
+static void ext4_invalidatepage(struct page *page, unsigned int offset,
+ unsigned int length)
{
- struct buffer_head *head, *bh;
- unsigned int curr_off = 0;
+ trace_ext4_invalidatepage(page, offset, length);
- if (!page_has_buffers(page))
- return;
- head = bh = page_buffers(page);
- do {
- if (offset <= curr_off && test_clear_buffer_uninit(bh)
- && bh->b_private) {
- ext4_free_io_end(bh->b_private);
- bh->b_private = NULL;
- bh->b_end_io = NULL;
- }
- curr_off = curr_off + bh->b_size;
- bh = bh->b_this_page;
- } while (bh != head);
+ /* No journalling happens on data buffers when this function is used */
+ WARN_ON(page_has_buffers(page) && buffer_jbd(page_buffers(page)));
+
+ block_invalidatepage(page, offset, length);
}
-static void ext4_invalidatepage(struct page *page, unsigned long offset)
+static int __ext4_journalled_invalidatepage(struct page *page,
+ unsigned int offset,
+ unsigned int length)
{
journal_t *journal = EXT4_JOURNAL(page->mapping->host);
- /*
- * free any io_end structure allocated for buffers to be discarded
- */
- if (ext4_should_dioread_nolock(page->mapping->host))
- ext4_invalidatepage_free_endio(page, offset);
+ trace_ext4_journalled_invalidatepage(page, offset, length);
+
/*
* If it's a full truncate we just forget about the pending dirtying
*/
- if (offset == 0)
+ if (offset == 0 && length == PAGE_CACHE_SIZE)
ClearPageChecked(page);
- if (journal)
- jbd2_journal_invalidatepage(journal, page, offset);
- else
- block_invalidatepage(page, offset);
+ return jbd2_journal_invalidatepage(journal, page, offset, length);
+}
+
+/* Wrapper for aops... */
+static void ext4_journalled_invalidatepage(struct page *page,
+ unsigned int offset,
+ unsigned int length)
+{
+ WARN_ON(__ext4_journalled_invalidatepage(page, offset, length) < 0);
}
static int ext4_releasepage(struct page *page, gfp_t wait)
{
journal_t *journal = EXT4_JOURNAL(page->mapping->host);
- WARN_ON(PageChecked(page));
- if (!page_has_buffers(page))
+ trace_ext4_releasepage(page);
+
+ /* Page has dirty journalled data -> cannot release */
+ if (PageChecked(page))
return 0;
if (journal)
return jbd2_journal_try_to_free_buffers(journal, page, wait);
@@ -3504,119 +3031,11 @@ static int ext4_releasepage(struct page *page, gfp_t wait)
}
/*
- * O_DIRECT for ext3 (or indirect map) based files
- *
- * If the O_DIRECT write will extend the file then add this inode to the
- * orphan list. So recovery will truncate it back to the original size
- * if the machine crashes during the write.
- *
- * If the O_DIRECT write is intantiating holes inside i_size and the machine
- * crashes then stale disk data _may_ be exposed inside the file. But current
- * VFS code falls back into buffered path in that case so we are safe.
- */
-static ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,
- const struct iovec *iov, loff_t offset,
- unsigned long nr_segs)
-{
- struct file *file = iocb->ki_filp;
- struct inode *inode = file->f_mapping->host;
- struct ext4_inode_info *ei = EXT4_I(inode);
- handle_t *handle;
- ssize_t ret;
- int orphan = 0;
- size_t count = iov_length(iov, nr_segs);
- int retries = 0;
-
- if (rw == WRITE) {
- loff_t final_size = offset + count;
-
- if (final_size > inode->i_size) {
- /* Credits for sb + inode write */
- handle = ext4_journal_start(inode, 2);
- if (IS_ERR(handle)) {
- ret = PTR_ERR(handle);
- goto out;
- }
- ret = ext4_orphan_add(handle, inode);
- if (ret) {
- ext4_journal_stop(handle);
- goto out;
- }
- orphan = 1;
- ei->i_disksize = inode->i_size;
- ext4_journal_stop(handle);
- }
- }
-
-retry:
- if (rw == READ && ext4_should_dioread_nolock(inode))
- ret = __blockdev_direct_IO(rw, iocb, inode,
- inode->i_sb->s_bdev, iov,
- offset, nr_segs,
- ext4_get_block, NULL, NULL, 0);
- else {
- ret = blockdev_direct_IO(rw, iocb, inode,
- inode->i_sb->s_bdev, iov,
- offset, nr_segs,
- ext4_get_block, NULL);
-
- if (unlikely((rw & WRITE) && ret < 0)) {
- loff_t isize = i_size_read(inode);
- loff_t end = offset + iov_length(iov, nr_segs);
-
- if (end > isize)
- vmtruncate(inode, isize);
- }
- }
- if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
- goto retry;
-
- if (orphan) {
- int err;
-
- /* Credits for sb + inode write */
- handle = ext4_journal_start(inode, 2);
- if (IS_ERR(handle)) {
- /* This is really bad luck. We've written the data
- * but cannot extend i_size. Bail out and pretend
- * the write failed... */
- ret = PTR_ERR(handle);
- if (inode->i_nlink)
- ext4_orphan_del(NULL, inode);
-
- goto out;
- }
- if (inode->i_nlink)
- ext4_orphan_del(handle, inode);
- if (ret > 0) {
- loff_t end = offset + ret;
- if (end > inode->i_size) {
- ei->i_disksize = end;
- i_size_write(inode, end);
- /*
- * We're going to return a positive `ret'
- * here due to non-zero-length I/O, so there's
- * no way of reporting error returns from
- * ext4_mark_inode_dirty() to userspace. So
- * ignore it.
- */
- ext4_mark_inode_dirty(handle, inode);
- }
- }
- err = ext4_journal_stop(handle);
- if (ret == 0)
- ret = err;
- }
-out:
- return ret;
-}
-
-/*
* ext4_get_block used when preparing for a DIO write or buffer write.
* We allocate an uinitialized extent if blocks haven't been allocated.
* The extent will be converted to initialized after the IO is complete.
*/
-static int ext4_get_block_write(struct inode *inode, sector_t iblock,
+int ext4_get_block_write(struct inode *inode, sector_t iblock,
struct buffer_head *bh_result, int create)
{
ext4_debug("ext4_get_block_write: inode %lu, create flag %d\n",
@@ -3625,116 +3044,33 @@ static int ext4_get_block_write(struct inode *inode, sector_t iblock,
EXT4_GET_BLOCKS_IO_CREATE_EXT);
}
+static int ext4_get_block_write_nolock(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh_result, int create)
+{
+ ext4_debug("ext4_get_block_write_nolock: inode %lu, create flag %d\n",
+ inode->i_ino, create);
+ return _ext4_get_block(inode, iblock, bh_result,
+ EXT4_GET_BLOCKS_NO_LOCK);
+}
+
static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
- ssize_t size, void *private, int ret,
- bool is_async)
+ ssize_t size, void *private)
{
ext4_io_end_t *io_end = iocb->private;
- struct workqueue_struct *wq;
- unsigned long flags;
- struct ext4_inode_info *ei;
- /* if not async direct IO or dio with 0 bytes write, just return */
- if (!io_end || !size)
- goto out;
+ /* if not async direct IO just return */
+ if (!io_end)
+ return;
- ext_debug("ext4_end_io_dio(): io_end 0x%p"
- "for inode %lu, iocb 0x%p, offset %llu, size %llu\n",
+ ext_debug("ext4_end_io_dio(): io_end 0x%p "
+ "for inode %lu, iocb 0x%p, offset %llu, size %zd\n",
iocb->private, io_end->inode->i_ino, iocb, offset,
size);
- /* if not aio dio with unwritten extents, just free io and return */
- if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
- ext4_free_io_end(io_end);
- iocb->private = NULL;
-out:
- if (is_async)
- aio_complete(iocb, ret, 0);
- return;
- }
-
- io_end->offset = offset;
- io_end->size = size;
- if (is_async) {
- io_end->iocb = iocb;
- io_end->result = ret;
- }
- wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq;
-
- /* Add the io_end to per-inode completed aio dio list*/
- ei = EXT4_I(io_end->inode);
- spin_lock_irqsave(&ei->i_completed_io_lock, flags);
- list_add_tail(&io_end->list, &ei->i_completed_io_list);
- spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
-
- /* queue the work to convert unwritten extents to written */
- queue_work(wq, &io_end->work);
iocb->private = NULL;
-}
-
-static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate)
-{
- ext4_io_end_t *io_end = bh->b_private;
- struct workqueue_struct *wq;
- struct inode *inode;
- unsigned long flags;
-
- if (!test_clear_buffer_uninit(bh) || !io_end)
- goto out;
-
- if (!(io_end->inode->i_sb->s_flags & MS_ACTIVE)) {
- printk("sb umounted, discard end_io request for inode %lu\n",
- io_end->inode->i_ino);
- ext4_free_io_end(io_end);
- goto out;
- }
-
- io_end->flag = EXT4_IO_END_UNWRITTEN;
- inode = io_end->inode;
-
- /* Add the io_end to per-inode completed io list*/
- spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags);
- list_add_tail(&io_end->list, &EXT4_I(inode)->i_completed_io_list);
- spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags);
-
- wq = EXT4_SB(inode->i_sb)->dio_unwritten_wq;
- /* queue the work to convert unwritten extents to written */
- queue_work(wq, &io_end->work);
-out:
- bh->b_private = NULL;
- bh->b_end_io = NULL;
- clear_buffer_uninit(bh);
- end_buffer_async_write(bh, uptodate);
-}
-
-static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode)
-{
- ext4_io_end_t *io_end;
- struct page *page = bh->b_page;
- loff_t offset = (sector_t)page->index << PAGE_CACHE_SHIFT;
- size_t size = bh->b_size;
-
-retry:
- io_end = ext4_init_io_end(inode, GFP_ATOMIC);
- if (!io_end) {
- if (printk_ratelimit())
- printk(KERN_WARNING "%s: allocation fail\n", __func__);
- schedule();
- goto retry;
- }
io_end->offset = offset;
io_end->size = size;
- /*
- * We need to hold a reference to the page to make sure it
- * doesn't get evicted before ext4_end_io_work() has a chance
- * to convert the extent from written to unwritten.
- */
- io_end->page = page;
- get_page(io_end->page);
-
- bh->b_private = io_end;
- bh->b_end_io = ext4_end_io_buffer_write;
- return 0;
+ ext4_put_io_end(io_end);
}
/*
@@ -3742,13 +3078,13 @@ retry:
* preallocated extents, and those write extend the file, no need to
* fall back to buffered IO.
*
- * For holes, we fallocate those blocks, mark them as unintialized
- * If those blocks were preallocated, we mark sure they are splited, but
- * still keep the range to write as unintialized.
+ * For holes, we fallocate those blocks, mark them as unwritten
+ * If those blocks were preallocated, we mark sure they are split, but
+ * still keep the range to write as unwritten.
*
- * The unwrritten extents will be converted to written when DIO is completed.
+ * The unwritten extents will be converted to written when DIO is completed.
* For async direct IO, since the IO may still pending when return, we
- * set up an end_io call back function, which will do the convertion
+ * set up an end_io call back function, which will do the conversion
* when async direct IO completed.
*
* If the O_DIRECT write will extend the file then add this inode to the
@@ -3757,107 +3093,166 @@ retry:
*
*/
static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
- const struct iovec *iov, loff_t offset,
- unsigned long nr_segs)
+ struct iov_iter *iter, loff_t offset)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file->f_mapping->host;
ssize_t ret;
- size_t count = iov_length(iov, nr_segs);
-
+ size_t count = iov_iter_count(iter);
+ int overwrite = 0;
+ get_block_t *get_block_func = NULL;
+ int dio_flags = 0;
loff_t final_size = offset + count;
- if (rw == WRITE && final_size <= inode->i_size) {
- /*
- * We could direct write to holes and fallocate.
- *
- * Allocated blocks to fill the hole are marked as uninitialized
- * to prevent paralel buffered read to expose the stale data
- * before DIO complete the data IO.
- *
- * As to previously fallocated extents, ext4 get_block
- * will just simply mark the buffer mapped but still
- * keep the extents uninitialized.
- *
- * for non AIO case, we will convert those unwritten extents
- * to written after return back from blockdev_direct_IO.
- *
- * for async DIO, the conversion needs to be defered when
- * the IO is completed. The ext4 end_io callback function
- * will be called to take care of the conversion work.
- * Here for async case, we allocate an io_end structure to
- * hook to the iocb.
- */
- iocb->private = NULL;
- EXT4_I(inode)->cur_aio_dio = NULL;
- if (!is_sync_kiocb(iocb)) {
- iocb->private = ext4_init_io_end(inode, GFP_NOFS);
- if (!iocb->private)
- return -ENOMEM;
- /*
- * we save the io structure for current async
- * direct IO, so that later ext4_map_blocks()
- * could flag the io structure whether there
- * is a unwritten extents needs to be converted
- * when IO is completed.
- */
- EXT4_I(inode)->cur_aio_dio = iocb->private;
+ ext4_io_end_t *io_end = NULL;
+
+ /* Use the old path for reads and writes beyond i_size. */
+ if (rw != WRITE || final_size > inode->i_size)
+ return ext4_ind_direct_IO(rw, iocb, iter, offset);
+
+ BUG_ON(iocb->private == NULL);
+
+ /*
+ * Make all waiters for direct IO properly wait also for extent
+ * conversion. This also disallows race between truncate() and
+ * overwrite DIO as i_dio_count needs to be incremented under i_mutex.
+ */
+ if (rw == WRITE)
+ atomic_inc(&inode->i_dio_count);
+
+ /* If we do a overwrite dio, i_mutex locking can be released */
+ overwrite = *((int *)iocb->private);
+
+ if (overwrite) {
+ down_read(&EXT4_I(inode)->i_data_sem);
+ mutex_unlock(&inode->i_mutex);
+ }
+
+ /*
+ * We could direct write to holes and fallocate.
+ *
+ * Allocated blocks to fill the hole are marked as
+ * unwritten to prevent parallel buffered read to expose
+ * the stale data before DIO complete the data IO.
+ *
+ * As to previously fallocated extents, ext4 get_block will
+ * just simply mark the buffer mapped but still keep the
+ * extents unwritten.
+ *
+ * For non AIO case, we will convert those unwritten extents
+ * to written after return back from blockdev_direct_IO.
+ *
+ * For async DIO, the conversion needs to be deferred when the
+ * IO is completed. The ext4 end_io callback function will be
+ * called to take care of the conversion work. Here for async
+ * case, we allocate an io_end structure to hook to the iocb.
+ */
+ iocb->private = NULL;
+ ext4_inode_aio_set(inode, NULL);
+ if (!is_sync_kiocb(iocb)) {
+ io_end = ext4_init_io_end(inode, GFP_NOFS);
+ if (!io_end) {
+ ret = -ENOMEM;
+ goto retake_lock;
}
+ /*
+ * Grab reference for DIO. Will be dropped in ext4_end_io_dio()
+ */
+ iocb->private = ext4_get_io_end(io_end);
+ /*
+ * we save the io structure for current async direct
+ * IO, so that later ext4_map_blocks() could flag the
+ * io structure whether there is a unwritten extents
+ * needs to be converted when IO is completed.
+ */
+ ext4_inode_aio_set(inode, io_end);
+ }
- ret = blockdev_direct_IO(rw, iocb, inode,
- inode->i_sb->s_bdev, iov,
- offset, nr_segs,
- ext4_get_block_write,
- ext4_end_io_dio);
- if (iocb->private)
- EXT4_I(inode)->cur_aio_dio = NULL;
+ if (overwrite) {
+ get_block_func = ext4_get_block_write_nolock;
+ } else {
+ get_block_func = ext4_get_block_write;
+ dio_flags = DIO_LOCKING;
+ }
+ ret = __blockdev_direct_IO(rw, iocb, inode,
+ inode->i_sb->s_bdev, iter,
+ offset,
+ get_block_func,
+ ext4_end_io_dio,
+ NULL,
+ dio_flags);
+
+ /*
+ * Put our reference to io_end. This can free the io_end structure e.g.
+ * in sync IO case or in case of error. It can even perform extent
+ * conversion if all bios we submitted finished before we got here.
+ * Note that in that case iocb->private can be already set to NULL
+ * here.
+ */
+ if (io_end) {
+ ext4_inode_aio_set(inode, NULL);
+ ext4_put_io_end(io_end);
/*
- * The io_end structure takes a reference to the inode,
- * that structure needs to be destroyed and the
- * reference to the inode need to be dropped, when IO is
- * complete, even with 0 byte write, or failed.
- *
- * In the successful AIO DIO case, the io_end structure will be
- * desctroyed and the reference to the inode will be dropped
- * after the end_io call back function is called.
- *
- * In the case there is 0 byte write, or error case, since
- * VFS direct IO won't invoke the end_io call back function,
- * we need to free the end_io structure here.
+ * When no IO was submitted ext4_end_io_dio() was not
+ * called so we have to put iocb's reference.
*/
- if (ret != -EIOCBQUEUED && ret <= 0 && iocb->private) {
- ext4_free_io_end(iocb->private);
+ if (ret <= 0 && ret != -EIOCBQUEUED && iocb->private) {
+ WARN_ON(iocb->private != io_end);
+ WARN_ON(io_end->flag & EXT4_IO_END_UNWRITTEN);
+ ext4_put_io_end(io_end);
iocb->private = NULL;
- } else if (ret > 0 && ext4_test_inode_state(inode,
- EXT4_STATE_DIO_UNWRITTEN)) {
- int err;
- /*
- * for non AIO case, since the IO is already
- * completed, we could do the convertion right here
- */
- err = ext4_convert_unwritten_extents(inode,
- offset, ret);
- if (err < 0)
- ret = err;
- ext4_clear_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
}
- return ret;
+ }
+ if (ret > 0 && !overwrite && ext4_test_inode_state(inode,
+ EXT4_STATE_DIO_UNWRITTEN)) {
+ int err;
+ /*
+ * for non AIO case, since the IO is already
+ * completed, we could do the conversion right here
+ */
+ err = ext4_convert_unwritten_extents(NULL, inode,
+ offset, ret);
+ if (err < 0)
+ ret = err;
+ ext4_clear_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
}
- /* for write the the end of file case, we fall back to old way */
- return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs);
+retake_lock:
+ if (rw == WRITE)
+ inode_dio_done(inode);
+ /* take i_mutex locking again if we do a ovewrite dio */
+ if (overwrite) {
+ up_read(&EXT4_I(inode)->i_data_sem);
+ mutex_lock(&inode->i_mutex);
+ }
+
+ return ret;
}
static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
- const struct iovec *iov, loff_t offset,
- unsigned long nr_segs)
+ struct iov_iter *iter, loff_t offset)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file->f_mapping->host;
+ size_t count = iov_iter_count(iter);
+ ssize_t ret;
- if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
- return ext4_ext_direct_IO(rw, iocb, iov, offset, nr_segs);
+ /*
+ * If we are doing data journalling we don't support O_DIRECT
+ */
+ if (ext4_should_journal_data(inode))
+ return 0;
- return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs);
+ /* Let buffer I/O handle the inline data case. */
+ if (ext4_has_inline_data(inode))
+ return 0;
+
+ trace_ext4_direct_IO_enter(inode, offset, count, rw);
+ if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
+ ret = ext4_ext_direct_IO(rw, iocb, iter, offset);
+ else
+ ret = ext4_ind_direct_IO(rw, iocb, iter, offset);
+ trace_ext4_direct_IO_exit(inode, offset, count, rw, ret);
+ return ret;
}
/*
@@ -3879,29 +3274,13 @@ static int ext4_journalled_set_page_dirty(struct page *page)
return __set_page_dirty_nobuffers(page);
}
-static const struct address_space_operations ext4_ordered_aops = {
- .readpage = ext4_readpage,
- .readpages = ext4_readpages,
- .writepage = ext4_writepage,
- .sync_page = block_sync_page,
- .write_begin = ext4_write_begin,
- .write_end = ext4_ordered_write_end,
- .bmap = ext4_bmap,
- .invalidatepage = ext4_invalidatepage,
- .releasepage = ext4_releasepage,
- .direct_IO = ext4_direct_IO,
- .migratepage = buffer_migrate_page,
- .is_partially_uptodate = block_is_partially_uptodate,
- .error_remove_page = generic_error_remove_page,
-};
-
-static const struct address_space_operations ext4_writeback_aops = {
+static const struct address_space_operations ext4_aops = {
.readpage = ext4_readpage,
.readpages = ext4_readpages,
.writepage = ext4_writepage,
- .sync_page = block_sync_page,
+ .writepages = ext4_writepages,
.write_begin = ext4_write_begin,
- .write_end = ext4_writeback_write_end,
+ .write_end = ext4_write_end,
.bmap = ext4_bmap,
.invalidatepage = ext4_invalidatepage,
.releasepage = ext4_releasepage,
@@ -3915,13 +3294,14 @@ static const struct address_space_operations ext4_journalled_aops = {
.readpage = ext4_readpage,
.readpages = ext4_readpages,
.writepage = ext4_writepage,
- .sync_page = block_sync_page,
+ .writepages = ext4_writepages,
.write_begin = ext4_write_begin,
.write_end = ext4_journalled_write_end,
.set_page_dirty = ext4_journalled_set_page_dirty,
.bmap = ext4_bmap,
- .invalidatepage = ext4_invalidatepage,
+ .invalidatepage = ext4_journalled_invalidatepage,
.releasepage = ext4_releasepage,
+ .direct_IO = ext4_direct_IO,
.is_partially_uptodate = block_is_partially_uptodate,
.error_remove_page = generic_error_remove_page,
};
@@ -3930,8 +3310,7 @@ static const struct address_space_operations ext4_da_aops = {
.readpage = ext4_readpage,
.readpages = ext4_readpages,
.writepage = ext4_writepage,
- .writepages = ext4_da_writepages,
- .sync_page = block_sync_page,
+ .writepages = ext4_writepages,
.write_begin = ext4_da_write_begin,
.write_end = ext4_da_write_end,
.bmap = ext4_bmap,
@@ -3945,32 +3324,38 @@ static const struct address_space_operations ext4_da_aops = {
void ext4_set_aops(struct inode *inode)
{
- if (ext4_should_order_data(inode) &&
- test_opt(inode->i_sb, DELALLOC))
- inode->i_mapping->a_ops = &ext4_da_aops;
- else if (ext4_should_order_data(inode))
- inode->i_mapping->a_ops = &ext4_ordered_aops;
- else if (ext4_should_writeback_data(inode) &&
- test_opt(inode->i_sb, DELALLOC))
+ switch (ext4_inode_journal_mode(inode)) {
+ case EXT4_INODE_ORDERED_DATA_MODE:
+ ext4_set_inode_state(inode, EXT4_STATE_ORDERED_MODE);
+ break;
+ case EXT4_INODE_WRITEBACK_DATA_MODE:
+ ext4_clear_inode_state(inode, EXT4_STATE_ORDERED_MODE);
+ break;
+ case EXT4_INODE_JOURNAL_DATA_MODE:
+ inode->i_mapping->a_ops = &ext4_journalled_aops;
+ return;
+ default:
+ BUG();
+ }
+ if (test_opt(inode->i_sb, DELALLOC))
inode->i_mapping->a_ops = &ext4_da_aops;
- else if (ext4_should_writeback_data(inode))
- inode->i_mapping->a_ops = &ext4_writeback_aops;
else
- inode->i_mapping->a_ops = &ext4_journalled_aops;
+ inode->i_mapping->a_ops = &ext4_aops;
}
/*
- * ext4_block_truncate_page() zeroes out a mapping from file offset `from'
- * up to the end of the block which corresponds to `from'.
- * This required during truncate. We need to physically zero the tail end
- * of that block so it doesn't yield old data if the file is later grown.
+ * ext4_block_zero_page_range() zeros out a mapping of length 'length'
+ * starting from file offset 'from'. The range to be zero'd must
+ * be contained with in one block. If the specified range exceeds
+ * the end of the block it will be shortened to end of the block
+ * that cooresponds to 'from'
*/
-int ext4_block_truncate_page(handle_t *handle,
- struct address_space *mapping, loff_t from)
+static int ext4_block_zero_page_range(handle_t *handle,
+ struct address_space *mapping, loff_t from, loff_t length)
{
ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT;
unsigned offset = from & (PAGE_CACHE_SIZE-1);
- unsigned blocksize, length, pos;
+ unsigned blocksize, max, pos;
ext4_lblk_t iblock;
struct inode *inode = mapping->host;
struct buffer_head *bh;
@@ -3980,10 +3365,18 @@ int ext4_block_truncate_page(handle_t *handle,
page = find_or_create_page(mapping, from >> PAGE_CACHE_SHIFT,
mapping_gfp_mask(mapping) & ~__GFP_FS);
if (!page)
- return -EINVAL;
+ return -ENOMEM;
blocksize = inode->i_sb->s_blocksize;
- length = blocksize - (offset & (blocksize - 1));
+ max = blocksize - (offset & (blocksize - 1));
+
+ /*
+ * correct length if it does not fall between
+ * 'from' and the end of the block
+ */
+ if (length > max || length < 0)
+ length = max;
+
iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
if (!page_has_buffers(page))
@@ -3997,13 +3390,10 @@ int ext4_block_truncate_page(handle_t *handle,
iblock++;
pos += blocksize;
}
-
- err = 0;
if (buffer_freed(bh)) {
BUFFER_TRACE(bh, "freed: skip");
goto unlock;
}
-
if (!buffer_mapped(bh)) {
BUFFER_TRACE(bh, "unmapped");
ext4_get_block(inode, iblock, bh, 0);
@@ -4026,25 +3416,22 @@ int ext4_block_truncate_page(handle_t *handle,
if (!buffer_uptodate(bh))
goto unlock;
}
-
if (ext4_should_journal_data(inode)) {
BUFFER_TRACE(bh, "get write access");
err = ext4_journal_get_write_access(handle, bh);
if (err)
goto unlock;
}
-
zero_user(page, offset, length);
-
BUFFER_TRACE(bh, "zeroed end of block");
- err = 0;
if (ext4_should_journal_data(inode)) {
err = ext4_handle_dirty_metadata(handle, inode, bh);
} else {
- if (ext4_should_order_data(inode))
- err = ext4_jbd2_file_inode(handle, inode);
+ err = 0;
mark_buffer_dirty(bh);
+ if (ext4_test_inode_state(inode, EXT4_STATE_ORDERED_MODE))
+ err = ext4_jbd2_file_inode(handle, inode);
}
unlock:
@@ -4054,372 +3441,234 @@ unlock:
}
/*
- * Probably it should be a library function... search for first non-zero word
- * or memcmp with zero_page, whatever is better for particular architecture.
- * Linus?
+ * ext4_block_truncate_page() zeroes out a mapping from file offset `from'
+ * up to the end of the block which corresponds to `from'.
+ * This required during truncate. We need to physically zero the tail end
+ * of that block so it doesn't yield old data if the file is later grown.
*/
-static inline int all_zeroes(__le32 *p, __le32 *q)
+static int ext4_block_truncate_page(handle_t *handle,
+ struct address_space *mapping, loff_t from)
{
- while (p < q)
- if (*p++)
- return 0;
- return 1;
-}
+ unsigned offset = from & (PAGE_CACHE_SIZE-1);
+ unsigned length;
+ unsigned blocksize;
+ struct inode *inode = mapping->host;
-/**
- * ext4_find_shared - find the indirect blocks for partial truncation.
- * @inode: inode in question
- * @depth: depth of the affected branch
- * @offsets: offsets of pointers in that branch (see ext4_block_to_path)
- * @chain: place to store the pointers to partial indirect blocks
- * @top: place to the (detached) top of branch
- *
- * This is a helper function used by ext4_truncate().
- *
- * When we do truncate() we may have to clean the ends of several
- * indirect blocks but leave the blocks themselves alive. Block is
- * partially truncated if some data below the new i_size is refered
- * from it (and it is on the path to the first completely truncated
- * data block, indeed). We have to free the top of that path along
- * with everything to the right of the path. Since no allocation
- * past the truncation point is possible until ext4_truncate()
- * finishes, we may safely do the latter, but top of branch may
- * require special attention - pageout below the truncation point
- * might try to populate it.
- *
- * We atomically detach the top of branch from the tree, store the
- * block number of its root in *@top, pointers to buffer_heads of
- * partially truncated blocks - in @chain[].bh and pointers to
- * their last elements that should not be removed - in
- * @chain[].p. Return value is the pointer to last filled element
- * of @chain.
- *
- * The work left to caller to do the actual freeing of subtrees:
- * a) free the subtree starting from *@top
- * b) free the subtrees whose roots are stored in
- * (@chain[i].p+1 .. end of @chain[i].bh->b_data)
- * c) free the subtrees growing from the inode past the @chain[0].
- * (no partially truncated stuff there). */
-
-static Indirect *ext4_find_shared(struct inode *inode, int depth,
- ext4_lblk_t offsets[4], Indirect chain[4],
- __le32 *top)
-{
- Indirect *partial, *p;
- int k, err;
-
- *top = 0;
- /* Make k index the deepest non-null offset + 1 */
- for (k = depth; k > 1 && !offsets[k-1]; k--)
- ;
- partial = ext4_get_branch(inode, k, offsets, chain, &err);
- /* Writer: pointers */
- if (!partial)
- partial = chain + k-1;
- /*
- * If the branch acquired continuation since we've looked at it -
- * fine, it should all survive and (new) top doesn't belong to us.
- */
- if (!partial->key && *partial->p)
- /* Writer: end */
- goto no_top;
- for (p = partial; (p > chain) && all_zeroes((__le32 *) p->bh->b_data, p->p); p--)
- ;
- /*
- * OK, we've found the last block that must survive. The rest of our
- * branch should be detached before unlocking. However, if that rest
- * of branch is all ours and does not grow immediately from the inode
- * it's easier to cheat and just decrement partial->p.
- */
- if (p == chain + k - 1 && p > chain) {
- p->p--;
- } else {
- *top = *p->p;
- /* Nope, don't do this in ext4. Must leave the tree intact */
-#if 0
- *p->p = 0;
-#endif
- }
- /* Writer: end */
+ blocksize = inode->i_sb->s_blocksize;
+ length = blocksize - (offset & (blocksize - 1));
- while (partial > p) {
- brelse(partial->bh);
- partial--;
- }
-no_top:
- return partial;
+ return ext4_block_zero_page_range(handle, mapping, from, length);
}
-/*
- * Zero a number of block pointers in either an inode or an indirect block.
- * If we restart the transaction we must again get write access to the
- * indirect block for further modification.
- *
- * We release `count' blocks on disk, but (last - first) may be greater
- * than `count' because there can be holes in there.
- */
-static int ext4_clear_blocks(handle_t *handle, struct inode *inode,
- struct buffer_head *bh,
- ext4_fsblk_t block_to_free,
- unsigned long count, __le32 *first,
- __le32 *last)
+int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode,
+ loff_t lstart, loff_t length)
{
- __le32 *p;
- int flags = EXT4_FREE_BLOCKS_FORGET | EXT4_FREE_BLOCKS_VALIDATED;
+ struct super_block *sb = inode->i_sb;
+ struct address_space *mapping = inode->i_mapping;
+ unsigned partial_start, partial_end;
+ ext4_fsblk_t start, end;
+ loff_t byte_end = (lstart + length - 1);
+ int err = 0;
- if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
- flags |= EXT4_FREE_BLOCKS_METADATA;
+ partial_start = lstart & (sb->s_blocksize - 1);
+ partial_end = byte_end & (sb->s_blocksize - 1);
- if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), block_to_free,
- count)) {
- EXT4_ERROR_INODE(inode, "attempt to clear invalid "
- "blocks %llu len %lu",
- (unsigned long long) block_to_free, count);
- return 1;
- }
+ start = lstart >> sb->s_blocksize_bits;
+ end = byte_end >> sb->s_blocksize_bits;
- if (try_to_extend_transaction(handle, inode)) {
- if (bh) {
- BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
- ext4_handle_dirty_metadata(handle, inode, bh);
- }
- ext4_mark_inode_dirty(handle, inode);
- ext4_truncate_restart_trans(handle, inode,
- blocks_for_truncate(inode));
- if (bh) {
- BUFFER_TRACE(bh, "retaking write access");
- ext4_journal_get_write_access(handle, bh);
- }
+ /* Handle partial zero within the single block */
+ if (start == end &&
+ (partial_start || (partial_end != sb->s_blocksize - 1))) {
+ err = ext4_block_zero_page_range(handle, mapping,
+ lstart, length);
+ return err;
}
+ /* Handle partial zero out on the start of the range */
+ if (partial_start) {
+ err = ext4_block_zero_page_range(handle, mapping,
+ lstart, sb->s_blocksize);
+ if (err)
+ return err;
+ }
+ /* Handle partial zero out on the end of the range */
+ if (partial_end != sb->s_blocksize - 1)
+ err = ext4_block_zero_page_range(handle, mapping,
+ byte_end - partial_end,
+ partial_end + 1);
+ return err;
+}
- for (p = first; p < last; p++)
- *p = 0;
-
- ext4_free_blocks(handle, inode, 0, block_to_free, count, flags);
+int ext4_can_truncate(struct inode *inode)
+{
+ if (S_ISREG(inode->i_mode))
+ return 1;
+ if (S_ISDIR(inode->i_mode))
+ return 1;
+ if (S_ISLNK(inode->i_mode))
+ return !ext4_inode_is_fast_symlink(inode);
return 0;
}
-/**
- * ext4_free_data - free a list of data blocks
- * @handle: handle for this transaction
- * @inode: inode we are dealing with
- * @this_bh: indirect buffer_head which contains *@first and *@last
- * @first: array of block numbers
- * @last: points immediately past the end of array
- *
- * We are freeing all blocks refered from that array (numbers are stored as
- * little-endian 32-bit) and updating @inode->i_blocks appropriately.
+/*
+ * ext4_punch_hole: punches a hole in a file by releaseing the blocks
+ * associated with the given offset and length
*
- * We accumulate contiguous runs of blocks to free. Conveniently, if these
- * blocks are contiguous then releasing them at one time will only affect one
- * or two bitmap blocks (+ group descriptor(s) and superblock) and we won't
- * actually use a lot of journal space.
+ * @inode: File inode
+ * @offset: The offset where the hole will begin
+ * @len: The length of the hole
*
- * @this_bh will be %NULL if @first and @last point into the inode's direct
- * block pointers.
+ * Returns: 0 on success or negative on failure
*/
-static void ext4_free_data(handle_t *handle, struct inode *inode,
- struct buffer_head *this_bh,
- __le32 *first, __le32 *last)
+
+int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
{
- ext4_fsblk_t block_to_free = 0; /* Starting block # of a run */
- unsigned long count = 0; /* Number of blocks in the run */
- __le32 *block_to_free_p = NULL; /* Pointer into inode/ind
- corresponding to
- block_to_free */
- ext4_fsblk_t nr; /* Current block # */
- __le32 *p; /* Pointer into inode/ind
- for current block */
- int err;
+ struct super_block *sb = inode->i_sb;
+ ext4_lblk_t first_block, stop_block;
+ struct address_space *mapping = inode->i_mapping;
+ loff_t first_block_offset, last_block_offset;
+ handle_t *handle;
+ unsigned int credits;
+ int ret = 0;
- if (this_bh) { /* For indirect block */
- BUFFER_TRACE(this_bh, "get_write_access");
- err = ext4_journal_get_write_access(handle, this_bh);
- /* Important: if we can't update the indirect pointers
- * to the blocks, we can't free them. */
- if (err)
- return;
- }
+ if (!S_ISREG(inode->i_mode))
+ return -EOPNOTSUPP;
- for (p = first; p < last; p++) {
- nr = le32_to_cpu(*p);
- if (nr) {
- /* accumulate blocks to free if they're contiguous */
- if (count == 0) {
- block_to_free = nr;
- block_to_free_p = p;
- count = 1;
- } else if (nr == block_to_free + count) {
- count++;
- } else {
- if (ext4_clear_blocks(handle, inode, this_bh,
- block_to_free, count,
- block_to_free_p, p))
- break;
- block_to_free = nr;
- block_to_free_p = p;
- count = 1;
- }
- }
+ trace_ext4_punch_hole(inode, offset, length, 0);
+
+ /*
+ * Write out all dirty pages to avoid race conditions
+ * Then release them.
+ */
+ if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
+ ret = filemap_write_and_wait_range(mapping, offset,
+ offset + length - 1);
+ if (ret)
+ return ret;
}
- if (count > 0)
- ext4_clear_blocks(handle, inode, this_bh, block_to_free,
- count, block_to_free_p, p);
+ mutex_lock(&inode->i_mutex);
- if (this_bh) {
- BUFFER_TRACE(this_bh, "call ext4_handle_dirty_metadata");
+ /* No need to punch hole beyond i_size */
+ if (offset >= inode->i_size)
+ goto out_mutex;
+ /*
+ * If the hole extends beyond i_size, set the hole
+ * to end after the page that contains i_size
+ */
+ if (offset + length > inode->i_size) {
+ length = inode->i_size +
+ PAGE_CACHE_SIZE - (inode->i_size & (PAGE_CACHE_SIZE - 1)) -
+ offset;
+ }
+
+ if (offset & (sb->s_blocksize - 1) ||
+ (offset + length) & (sb->s_blocksize - 1)) {
/*
- * The buffer head should have an attached journal head at this
- * point. However, if the data is corrupted and an indirect
- * block pointed to itself, it would have been detached when
- * the block was cleared. Check for this instead of OOPSing.
+ * Attach jinode to inode for jbd2 if we do any zeroing of
+ * partial block
*/
- if ((EXT4_JOURNAL(inode) == NULL) || bh2jh(this_bh))
- ext4_handle_dirty_metadata(handle, inode, this_bh);
- else
- EXT4_ERROR_INODE(inode,
- "circular indirect block detected at "
- "block %llu",
- (unsigned long long) this_bh->b_blocknr);
+ ret = ext4_inode_attach_jinode(inode);
+ if (ret < 0)
+ goto out_mutex;
+
}
-}
-/**
- * ext4_free_branches - free an array of branches
- * @handle: JBD handle for this transaction
- * @inode: inode we are dealing with
- * @parent_bh: the buffer_head which contains *@first and *@last
- * @first: array of block numbers
- * @last: pointer immediately past the end of array
- * @depth: depth of the branches to free
- *
- * We are freeing all blocks refered from these branches (numbers are
- * stored as little-endian 32-bit) and updating @inode->i_blocks
- * appropriately.
- */
-static void ext4_free_branches(handle_t *handle, struct inode *inode,
- struct buffer_head *parent_bh,
- __le32 *first, __le32 *last, int depth)
-{
- ext4_fsblk_t nr;
- __le32 *p;
+ first_block_offset = round_up(offset, sb->s_blocksize);
+ last_block_offset = round_down((offset + length), sb->s_blocksize) - 1;
- if (ext4_handle_is_aborted(handle))
- return;
+ /* Now release the pages and zero block aligned part of pages*/
+ if (last_block_offset > first_block_offset)
+ truncate_pagecache_range(inode, first_block_offset,
+ last_block_offset);
- if (depth--) {
- struct buffer_head *bh;
- int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb);
- p = last;
- while (--p >= first) {
- nr = le32_to_cpu(*p);
- if (!nr)
- continue; /* A hole */
-
- if (!ext4_data_block_valid(EXT4_SB(inode->i_sb),
- nr, 1)) {
- EXT4_ERROR_INODE(inode,
- "invalid indirect mapped "
- "block %lu (level %d)",
- (unsigned long) nr, depth);
- break;
- }
+ /* Wait all existing dio workers, newcomers will block on i_mutex */
+ ext4_inode_block_unlocked_dio(inode);
+ inode_dio_wait(inode);
- /* Go read the buffer for the next level down */
- bh = sb_bread(inode->i_sb, nr);
+ if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
+ credits = ext4_writepage_trans_blocks(inode);
+ else
+ credits = ext4_blocks_for_truncate(inode);
+ handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ ext4_std_error(sb, ret);
+ goto out_dio;
+ }
- /*
- * A read failure? Report error and clear slot
- * (should be rare).
- */
- if (!bh) {
- EXT4_ERROR_INODE_BLOCK(inode, nr,
- "Read failure");
- continue;
- }
+ ret = ext4_zero_partial_blocks(handle, inode, offset,
+ length);
+ if (ret)
+ goto out_stop;
- /* This zaps the entire block. Bottom up. */
- BUFFER_TRACE(bh, "free child branches");
- ext4_free_branches(handle, inode, bh,
- (__le32 *) bh->b_data,
- (__le32 *) bh->b_data + addr_per_block,
- depth);
+ first_block = (offset + sb->s_blocksize - 1) >>
+ EXT4_BLOCK_SIZE_BITS(sb);
+ stop_block = (offset + length) >> EXT4_BLOCK_SIZE_BITS(sb);
- /*
- * Everything below this this pointer has been
- * released. Now let this top-of-subtree go.
- *
- * We want the freeing of this indirect block to be
- * atomic in the journal with the updating of the
- * bitmap block which owns it. So make some room in
- * the journal.
- *
- * We zero the parent pointer *after* freeing its
- * pointee in the bitmaps, so if extend_transaction()
- * for some reason fails to put the bitmap changes and
- * the release into the same transaction, recovery
- * will merely complain about releasing a free block,
- * rather than leaking blocks.
- */
- if (ext4_handle_is_aborted(handle))
- return;
- if (try_to_extend_transaction(handle, inode)) {
- ext4_mark_inode_dirty(handle, inode);
- ext4_truncate_restart_trans(handle, inode,
- blocks_for_truncate(inode));
- }
+ /* If there are no blocks to remove, return now */
+ if (first_block >= stop_block)
+ goto out_stop;
- /*
- * The forget flag here is critical because if
- * we are journaling (and not doing data
- * journaling), we have to make sure a revoke
- * record is written to prevent the journal
- * replay from overwriting the (former)
- * indirect block if it gets reallocated as a
- * data block. This must happen in the same
- * transaction where the data blocks are
- * actually freed.
- */
- ext4_free_blocks(handle, inode, 0, nr, 1,
- EXT4_FREE_BLOCKS_METADATA|
- EXT4_FREE_BLOCKS_FORGET);
+ down_write(&EXT4_I(inode)->i_data_sem);
+ ext4_discard_preallocations(inode);
- if (parent_bh) {
- /*
- * The block which we have just freed is
- * pointed to by an indirect block: journal it
- */
- BUFFER_TRACE(parent_bh, "get_write_access");
- if (!ext4_journal_get_write_access(handle,
- parent_bh)){
- *p = 0;
- BUFFER_TRACE(parent_bh,
- "call ext4_handle_dirty_metadata");
- ext4_handle_dirty_metadata(handle,
- inode,
- parent_bh);
- }
- }
- }
- } else {
- /* We have reached the bottom of the tree. */
- BUFFER_TRACE(parent_bh, "free data blocks");
- ext4_free_data(handle, inode, parent_bh, first, last);
+ ret = ext4_es_remove_extent(inode, first_block,
+ stop_block - first_block);
+ if (ret) {
+ up_write(&EXT4_I(inode)->i_data_sem);
+ goto out_stop;
}
+
+ if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
+ ret = ext4_ext_remove_space(inode, first_block,
+ stop_block - 1);
+ else
+ ret = ext4_free_hole_blocks(handle, inode, first_block,
+ stop_block);
+
+ up_write(&EXT4_I(inode)->i_data_sem);
+ if (IS_SYNC(inode))
+ ext4_handle_sync(handle);
+
+ /* Now release the pages again to reduce race window */
+ if (last_block_offset > first_block_offset)
+ truncate_pagecache_range(inode, first_block_offset,
+ last_block_offset);
+
+ inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
+ ext4_mark_inode_dirty(handle, inode);
+out_stop:
+ ext4_journal_stop(handle);
+out_dio:
+ ext4_inode_resume_unlocked_dio(inode);
+out_mutex:
+ mutex_unlock(&inode->i_mutex);
+ return ret;
}
-int ext4_can_truncate(struct inode *inode)
+int ext4_inode_attach_jinode(struct inode *inode)
{
- if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ struct jbd2_inode *jinode;
+
+ if (ei->jinode || !EXT4_SB(inode->i_sb)->s_journal)
return 0;
- if (S_ISREG(inode->i_mode))
- return 1;
- if (S_ISDIR(inode->i_mode))
- return 1;
- if (S_ISLNK(inode->i_mode))
- return !ext4_inode_is_fast_symlink(inode);
+
+ jinode = jbd2_alloc_inode(GFP_KERNEL);
+ spin_lock(&inode->i_lock);
+ if (!ei->jinode) {
+ if (!jinode) {
+ spin_unlock(&inode->i_lock);
+ return -ENOMEM;
+ }
+ ei->jinode = jinode;
+ jbd2_journal_init_jbd_inode(ei->jinode, inode);
+ jinode = NULL;
+ }
+ spin_unlock(&inode->i_lock);
+ if (unlikely(jinode != NULL))
+ jbd2_free_inode(jinode);
return 0;
}
@@ -4430,7 +3679,7 @@ int ext4_can_truncate(struct inode *inode)
* transaction, and VFS/VM ensures that ext4_truncate() cannot run
* simultaneously on behalf of the same inode.
*
- * As we work through the truncate and commmit bits of it to the journal there
+ * As we work through the truncate and commit bits of it to the journal there
* is one core, guiding principle: the file's tree must always be consistent on
* disk. We must be able to restart the truncate after a crash.
*
@@ -4453,18 +3702,19 @@ int ext4_can_truncate(struct inode *inode)
*/
void ext4_truncate(struct inode *inode)
{
- handle_t *handle;
struct ext4_inode_info *ei = EXT4_I(inode);
- __le32 *i_data = ei->i_data;
- int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb);
+ unsigned int credits;
+ handle_t *handle;
struct address_space *mapping = inode->i_mapping;
- ext4_lblk_t offsets[4];
- Indirect chain[4];
- Indirect *partial;
- __le32 nr = 0;
- int n;
- ext4_lblk_t last_block;
- unsigned blocksize = inode->i_sb->s_blocksize;
+
+ /*
+ * There is a possibility that we're either freeing the inode
+ * or it's a completely new inode. In those cases we might not
+ * have i_mutex locked because it's not necessary.
+ */
+ if (!(inode->i_state & (I_NEW|I_FREEING)))
+ WARN_ON(!mutex_is_locked(&inode->i_mutex));
+ trace_ext4_truncate_enter(inode);
if (!ext4_can_truncate(inode))
return;
@@ -4474,31 +3724,39 @@ void ext4_truncate(struct inode *inode)
if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC))
ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE);
- if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
- ext4_ext_truncate(inode);
- return;
+ if (ext4_has_inline_data(inode)) {
+ int has_inline = 1;
+
+ ext4_inline_data_truncate(inode, &has_inline);
+ if (has_inline)
+ return;
}
- handle = start_transaction(inode);
- if (IS_ERR(handle))
- return; /* AKPM: return what? */
+ /* If we zero-out tail of the page, we have to create jinode for jbd2 */
+ if (inode->i_size & (inode->i_sb->s_blocksize - 1)) {
+ if (ext4_inode_attach_jinode(inode) < 0)
+ return;
+ }
- last_block = (inode->i_size + blocksize-1)
- >> EXT4_BLOCK_SIZE_BITS(inode->i_sb);
+ if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
+ credits = ext4_writepage_trans_blocks(inode);
+ else
+ credits = ext4_blocks_for_truncate(inode);
- if (inode->i_size & (blocksize - 1))
- if (ext4_block_truncate_page(handle, mapping, inode->i_size))
- goto out_stop;
+ handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
+ if (IS_ERR(handle)) {
+ ext4_std_error(inode->i_sb, PTR_ERR(handle));
+ return;
+ }
- n = ext4_block_to_path(inode, last_block, offsets, NULL);
- if (n == 0)
- goto out_stop; /* error */
+ if (inode->i_size & (inode->i_sb->s_blocksize - 1))
+ ext4_block_truncate_page(handle, mapping, inode->i_size);
/*
- * OK. This truncate is going to happen. We add the inode to the
- * orphan list, so that if this truncate spans multiple transactions,
- * and we crash, we will resume the truncate when the filesystem
- * recovers. It also marks the inode dirty, to catch the new size.
+ * We add the inode to the orphan list, so that if this
+ * truncate spans multiple transactions, and we crash, we will
+ * resume the truncate when the filesystem recovers. It also
+ * marks the inode dirty, to catch the new size.
*
* Implication: the file must always be in a sane, consistent
* truncatable state while each transaction commits.
@@ -4506,96 +3764,23 @@ void ext4_truncate(struct inode *inode)
if (ext4_orphan_add(handle, inode))
goto out_stop;
- /*
- * From here we block out all ext4_get_block() callers who want to
- * modify the block allocation tree.
- */
- down_write(&ei->i_data_sem);
+ down_write(&EXT4_I(inode)->i_data_sem);
ext4_discard_preallocations(inode);
- /*
- * The orphan list entry will now protect us from any crash which
- * occurs before the truncate completes, so it is now safe to propagate
- * the new, shorter inode size (held for now in i_size) into the
- * on-disk inode. We do this via i_disksize, which is the value which
- * ext4 *really* writes onto the disk inode.
- */
- ei->i_disksize = inode->i_size;
-
- if (n == 1) { /* direct blocks */
- ext4_free_data(handle, inode, NULL, i_data+offsets[0],
- i_data + EXT4_NDIR_BLOCKS);
- goto do_indirects;
- }
-
- partial = ext4_find_shared(inode, n, offsets, chain, &nr);
- /* Kill the top of shared branch (not detached) */
- if (nr) {
- if (partial == chain) {
- /* Shared branch grows from the inode */
- ext4_free_branches(handle, inode, NULL,
- &nr, &nr+1, (chain+n-1) - partial);
- *partial->p = 0;
- /*
- * We mark the inode dirty prior to restart,
- * and prior to stop. No need for it here.
- */
- } else {
- /* Shared branch grows from an indirect block */
- BUFFER_TRACE(partial->bh, "get_write_access");
- ext4_free_branches(handle, inode, partial->bh,
- partial->p,
- partial->p+1, (chain+n-1) - partial);
- }
- }
- /* Clear the ends of indirect blocks on the shared branch */
- while (partial > chain) {
- ext4_free_branches(handle, inode, partial->bh, partial->p + 1,
- (__le32*)partial->bh->b_data+addr_per_block,
- (chain+n-1) - partial);
- BUFFER_TRACE(partial->bh, "call brelse");
- brelse(partial->bh);
- partial--;
- }
-do_indirects:
- /* Kill the remaining (whole) subtrees */
- switch (offsets[0]) {
- default:
- nr = i_data[EXT4_IND_BLOCK];
- if (nr) {
- ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 1);
- i_data[EXT4_IND_BLOCK] = 0;
- }
- case EXT4_IND_BLOCK:
- nr = i_data[EXT4_DIND_BLOCK];
- if (nr) {
- ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 2);
- i_data[EXT4_DIND_BLOCK] = 0;
- }
- case EXT4_DIND_BLOCK:
- nr = i_data[EXT4_TIND_BLOCK];
- if (nr) {
- ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 3);
- i_data[EXT4_TIND_BLOCK] = 0;
- }
- case EXT4_TIND_BLOCK:
- ;
- }
+ if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
+ ext4_ext_truncate(handle, inode);
+ else
+ ext4_ind_truncate(handle, inode);
up_write(&ei->i_data_sem);
- inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
- ext4_mark_inode_dirty(handle, inode);
- /*
- * In a multi-transaction truncate, we only make the final transaction
- * synchronous
- */
if (IS_SYNC(inode))
ext4_handle_sync(handle);
+
out_stop:
/*
- * If this was a simple ftruncate(), and the file will remain alive
+ * If this was a simple ftruncate() and the file will remain alive,
* then we need to clear up the orphan record which we created above.
* However, if this was a real unlink then we were called by
* ext4_delete_inode(), and we allow that function to clean up the
@@ -4604,7 +3789,11 @@ out_stop:
if (inode->i_nlink)
ext4_orphan_del(handle, inode);
+ inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
+ ext4_mark_inode_dirty(handle, inode);
ext4_journal_stop(handle);
+
+ trace_ext4_truncate_exit(inode);
}
/*
@@ -4634,18 +3823,15 @@ static int __ext4_get_inode_loc(struct inode *inode,
/*
* Figure out the offset within the block group inode table
*/
- inodes_per_block = (EXT4_BLOCK_SIZE(sb) / EXT4_INODE_SIZE(sb));
+ inodes_per_block = EXT4_SB(sb)->s_inodes_per_block;
inode_offset = ((inode->i_ino - 1) %
EXT4_INODES_PER_GROUP(sb));
block = ext4_inode_table(sb, gdp) + (inode_offset / inodes_per_block);
iloc->offset = (inode_offset % inodes_per_block) * EXT4_INODE_SIZE(sb);
bh = sb_getblk(sb, block);
- if (!bh) {
- EXT4_ERROR_INODE_BLOCK(inode, block,
- "unable to read itable block");
- return -EIO;
- }
+ if (unlikely(!bh))
+ return -ENOMEM;
if (!buffer_uptodate(bh)) {
lock_buffer(bh);
@@ -4677,7 +3863,7 @@ static int __ext4_get_inode_loc(struct inode *inode,
/* Is the inode bitmap in cache? */
bitmap_bh = sb_getblk(sb, ext4_inode_bitmap(sb, gdp));
- if (!bitmap_bh)
+ if (unlikely(!bitmap_bh))
goto make_io;
/*
@@ -4713,16 +3899,16 @@ make_io:
if (EXT4_SB(sb)->s_inode_readahead_blks) {
ext4_fsblk_t b, end, table;
unsigned num;
+ __u32 ra_blks = EXT4_SB(sb)->s_inode_readahead_blks;
table = ext4_inode_table(sb, gdp);
/* s_inode_readahead_blks is always a power of 2 */
- b = block & ~(EXT4_SB(sb)->s_inode_readahead_blks-1);
+ b = block & ~((ext4_fsblk_t) ra_blks - 1);
if (table > b)
b = table;
- end = b + EXT4_SB(sb)->s_inode_readahead_blks;
+ end = b + ra_blks;
num = EXT4_INODES_PER_GROUP(sb);
- if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
- EXT4_FEATURE_RO_COMPAT_GDT_CSUM))
+ if (ext4_has_group_desc_csum(sb))
num -= ext4_itable_unused_count(sb, gdp);
table += num / inodes_per_block;
if (end > table)
@@ -4736,9 +3922,10 @@ make_io:
* has in-inode xattrs, or we don't have this inode in memory.
* Read the block from disk.
*/
+ trace_ext4_load_inode(inode);
get_bh(bh);
bh->b_end_io = end_buffer_read_sync;
- submit_bh(READ_META, bh);
+ submit_bh(READ | REQ_META | REQ_PRIO, bh);
wait_on_buffer(bh);
if (!buffer_uptodate(bh)) {
EXT4_ERROR_INODE_BLOCK(inode, block,
@@ -4762,18 +3949,20 @@ int ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc)
void ext4_set_inode_flags(struct inode *inode)
{
unsigned int flags = EXT4_I(inode)->i_flags;
+ unsigned int new_fl = 0;
- inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC);
if (flags & EXT4_SYNC_FL)
- inode->i_flags |= S_SYNC;
+ new_fl |= S_SYNC;
if (flags & EXT4_APPEND_FL)
- inode->i_flags |= S_APPEND;
+ new_fl |= S_APPEND;
if (flags & EXT4_IMMUTABLE_FL)
- inode->i_flags |= S_IMMUTABLE;
+ new_fl |= S_IMMUTABLE;
if (flags & EXT4_NOATIME_FL)
- inode->i_flags |= S_NOATIME;
+ new_fl |= S_NOATIME;
if (flags & EXT4_DIRSYNC_FL)
- inode->i_flags |= S_DIRSYNC;
+ new_fl |= S_DIRSYNC;
+ inode_set_flags(inode, new_fl,
+ S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC);
}
/* Propagate flags from i_flags to EXT4_I(inode)->i_flags */
@@ -4824,6 +4013,19 @@ static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode,
}
}
+static inline void ext4_iget_extra_inode(struct inode *inode,
+ struct ext4_inode *raw_inode,
+ struct ext4_inode_info *ei)
+{
+ __le32 *magic = (void *)raw_inode +
+ EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize;
+ if (*magic == cpu_to_le32(EXT4_XATTR_MAGIC)) {
+ ext4_set_inode_state(inode, EXT4_STATE_XATTR);
+ ext4_find_inline_data_nolock(inode);
+ } else
+ EXT4_I(inode)->i_inline_off = 0;
+}
+
struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
{
struct ext4_iloc iloc;
@@ -4833,6 +4035,8 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
journal_t *journal = EXT4_SB(sb)->s_journal;
long ret;
int block;
+ uid_t i_uid;
+ gid_t i_gid;
inode = iget_locked(sb, ino);
if (!inode)
@@ -4841,22 +4045,58 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
return inode;
ei = EXT4_I(inode);
- iloc.bh = 0;
+ iloc.bh = NULL;
ret = __ext4_get_inode_loc(inode, &iloc, 0);
if (ret < 0)
goto bad_inode;
raw_inode = ext4_raw_inode(&iloc);
+
+ if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
+ ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize);
+ if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize >
+ EXT4_INODE_SIZE(inode->i_sb)) {
+ EXT4_ERROR_INODE(inode, "bad extra_isize (%u != %u)",
+ EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize,
+ EXT4_INODE_SIZE(inode->i_sb));
+ ret = -EIO;
+ goto bad_inode;
+ }
+ } else
+ ei->i_extra_isize = 0;
+
+ /* Precompute checksum seed for inode metadata */
+ if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
+ EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) {
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ __u32 csum;
+ __le32 inum = cpu_to_le32(inode->i_ino);
+ __le32 gen = raw_inode->i_generation;
+ csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&inum,
+ sizeof(inum));
+ ei->i_csum_seed = ext4_chksum(sbi, csum, (__u8 *)&gen,
+ sizeof(gen));
+ }
+
+ if (!ext4_inode_csum_verify(inode, raw_inode, ei)) {
+ EXT4_ERROR_INODE(inode, "checksum invalid");
+ ret = -EIO;
+ goto bad_inode;
+ }
+
inode->i_mode = le16_to_cpu(raw_inode->i_mode);
- inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low);
- inode->i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low);
+ i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low);
+ i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low);
if (!(test_opt(inode->i_sb, NO_UID32))) {
- inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16;
- inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16;
+ i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16;
+ i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16;
}
- inode->i_nlink = le16_to_cpu(raw_inode->i_links_count);
+ i_uid_write(inode, i_uid);
+ i_gid_write(inode, i_gid);
+ set_nlink(inode, le16_to_cpu(raw_inode->i_links_count));
- ei->i_state_flags = 0;
+ ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */
+ ei->i_inline_off = 0;
ei->i_dir_start_lookup = 0;
ei->i_dtime = le32_to_cpu(raw_inode->i_dtime);
/* We now have enough fields to check if the inode was active or not.
@@ -4865,8 +4105,9 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
* NeilBrown 1999oct15
*/
if (inode->i_nlink == 0) {
- if (inode->i_mode == 0 ||
- !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) {
+ if ((inode->i_mode == 0 ||
+ !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) &&
+ ino != EXT4_BOOT_LOADER_INO) {
/* this inode is deleted */
ret = -ESTALE;
goto bad_inode;
@@ -4874,7 +4115,9 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
/* The only unlinked inodes we let through here have
* valid i_mode and are being read by the orphan
* recovery code: that's fine, we're about to complete
- * the process of deleting those. */
+ * the process of deleting those.
+ * OR it is the EXT4_BOOT_LOADER_INO which is
+ * not initialized on a new filesystem. */
}
ei->i_flags = le32_to_cpu(raw_inode->i_flags);
inode->i_blocks = ext4_inode_blocks(raw_inode, ei);
@@ -4924,36 +4167,27 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
}
if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
- ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize);
- if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize >
- EXT4_INODE_SIZE(inode->i_sb)) {
- ret = -EIO;
- goto bad_inode;
- }
if (ei->i_extra_isize == 0) {
/* The extra space is currently unused. Use it. */
ei->i_extra_isize = sizeof(struct ext4_inode) -
EXT4_GOOD_OLD_INODE_SIZE;
} else {
- __le32 *magic = (void *)raw_inode +
- EXT4_GOOD_OLD_INODE_SIZE +
- ei->i_extra_isize;
- if (*magic == cpu_to_le32(EXT4_XATTR_MAGIC))
- ext4_set_inode_state(inode, EXT4_STATE_XATTR);
+ ext4_iget_extra_inode(inode, raw_inode, ei);
}
- } else
- ei->i_extra_isize = 0;
+ }
EXT4_INODE_GET_XTIME(i_ctime, inode, raw_inode);
EXT4_INODE_GET_XTIME(i_mtime, inode, raw_inode);
EXT4_INODE_GET_XTIME(i_atime, inode, raw_inode);
EXT4_EINODE_GET_XTIME(i_crtime, ei, raw_inode);
- inode->i_version = le32_to_cpu(raw_inode->i_disk_version);
- if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
- if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi))
- inode->i_version |=
- (__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32;
+ if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) {
+ inode->i_version = le32_to_cpu(raw_inode->i_disk_version);
+ if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
+ if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi))
+ inode->i_version |=
+ (__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32;
+ }
}
ret = 0;
@@ -4963,17 +4197,19 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
ei->i_file_acl);
ret = -EIO;
goto bad_inode;
- } else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
- if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
- (S_ISLNK(inode->i_mode) &&
- !ext4_inode_is_fast_symlink(inode)))
- /* Validate extent which is part of inode */
- ret = ext4_ext_check_inode(inode);
- } else if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
- (S_ISLNK(inode->i_mode) &&
- !ext4_inode_is_fast_symlink(inode))) {
- /* Validate block references which are part of inode */
- ret = ext4_check_inode_blockref(inode);
+ } else if (!ext4_has_inline_data(inode)) {
+ if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
+ if ((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
+ (S_ISLNK(inode->i_mode) &&
+ !ext4_inode_is_fast_symlink(inode))))
+ /* Validate extent which is part of inode */
+ ret = ext4_ext_check_inode(inode);
+ } else if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
+ (S_ISLNK(inode->i_mode) &&
+ !ext4_inode_is_fast_symlink(inode))) {
+ /* Validate block references which are part of inode */
+ ret = ext4_ind_check_inode(inode);
+ }
}
if (ret)
goto bad_inode;
@@ -5003,6 +4239,8 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
else
init_special_inode(inode, inode->i_mode,
new_decode_dev(le32_to_cpu(raw_inode->i_block[1])));
+ } else if (ino == EXT4_BOOT_LOADER_INO) {
+ make_bad_inode(inode);
} else {
ret = -EIO;
EXT4_ERROR_INODE(inode, "bogus i_mode (%o)", inode->i_mode);
@@ -5029,7 +4267,7 @@ static int ext4_inode_blocks_set(handle_t *handle,
if (i_blocks <= ~0U) {
/*
- * i_blocks can be represnted in a 32 bit variable
+ * i_blocks can be represented in a 32 bit variable
* as multiple of 512 bytes
*/
raw_inode->i_blocks_lo = cpu_to_le32(i_blocks);
@@ -5072,36 +4310,42 @@ static int ext4_do_update_inode(handle_t *handle,
struct ext4_inode *raw_inode = ext4_raw_inode(iloc);
struct ext4_inode_info *ei = EXT4_I(inode);
struct buffer_head *bh = iloc->bh;
+ struct super_block *sb = inode->i_sb;
int err = 0, rc, block;
+ int need_datasync = 0, set_large_file = 0;
+ uid_t i_uid;
+ gid_t i_gid;
- /* For fields not not tracking in the in-memory inode,
+ spin_lock(&ei->i_raw_lock);
+
+ /* For fields not tracked in the in-memory inode,
* initialise them to zero for new inodes. */
if (ext4_test_inode_state(inode, EXT4_STATE_NEW))
memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size);
ext4_get_inode_flags(ei);
raw_inode->i_mode = cpu_to_le16(inode->i_mode);
+ i_uid = i_uid_read(inode);
+ i_gid = i_gid_read(inode);
if (!(test_opt(inode->i_sb, NO_UID32))) {
- raw_inode->i_uid_low = cpu_to_le16(low_16_bits(inode->i_uid));
- raw_inode->i_gid_low = cpu_to_le16(low_16_bits(inode->i_gid));
+ raw_inode->i_uid_low = cpu_to_le16(low_16_bits(i_uid));
+ raw_inode->i_gid_low = cpu_to_le16(low_16_bits(i_gid));
/*
* Fix up interoperability with old kernels. Otherwise, old inodes get
* re-used with the upper 16 bits of the uid/gid intact
*/
if (!ei->i_dtime) {
raw_inode->i_uid_high =
- cpu_to_le16(high_16_bits(inode->i_uid));
+ cpu_to_le16(high_16_bits(i_uid));
raw_inode->i_gid_high =
- cpu_to_le16(high_16_bits(inode->i_gid));
+ cpu_to_le16(high_16_bits(i_gid));
} else {
raw_inode->i_uid_high = 0;
raw_inode->i_gid_high = 0;
}
} else {
- raw_inode->i_uid_low =
- cpu_to_le16(fs_high2lowuid(inode->i_uid));
- raw_inode->i_gid_low =
- cpu_to_le16(fs_high2lowgid(inode->i_gid));
+ raw_inode->i_uid_low = cpu_to_le16(fs_high2lowuid(i_uid));
+ raw_inode->i_gid_low = cpu_to_le16(fs_high2lowgid(i_gid));
raw_inode->i_uid_high = 0;
raw_inode->i_gid_high = 0;
}
@@ -5112,37 +4356,26 @@ static int ext4_do_update_inode(handle_t *handle,
EXT4_INODE_SET_XTIME(i_atime, inode, raw_inode);
EXT4_EINODE_SET_XTIME(i_crtime, ei, raw_inode);
- if (ext4_inode_blocks_set(handle, raw_inode, ei))
+ if (ext4_inode_blocks_set(handle, raw_inode, ei)) {
+ spin_unlock(&ei->i_raw_lock);
goto out_brelse;
+ }
raw_inode->i_dtime = cpu_to_le32(ei->i_dtime);
- raw_inode->i_flags = cpu_to_le32(ei->i_flags);
- if (EXT4_SB(inode->i_sb)->s_es->s_creator_os !=
- cpu_to_le32(EXT4_OS_HURD))
+ raw_inode->i_flags = cpu_to_le32(ei->i_flags & 0xFFFFFFFF);
+ if (likely(!test_opt2(inode->i_sb, HURD_COMPAT)))
raw_inode->i_file_acl_high =
cpu_to_le16(ei->i_file_acl >> 32);
raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl);
- ext4_isize_set(raw_inode, ei->i_disksize);
+ if (ei->i_disksize != ext4_isize(raw_inode)) {
+ ext4_isize_set(raw_inode, ei->i_disksize);
+ need_datasync = 1;
+ }
if (ei->i_disksize > 0x7fffffffULL) {
- struct super_block *sb = inode->i_sb;
if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
EXT4_FEATURE_RO_COMPAT_LARGE_FILE) ||
EXT4_SB(sb)->s_es->s_rev_level ==
- cpu_to_le32(EXT4_GOOD_OLD_REV)) {
- /* If this is the first large file
- * created, add a flag to the superblock.
- */
- err = ext4_journal_get_write_access(handle,
- EXT4_SB(sb)->s_sbh);
- if (err)
- goto out_brelse;
- ext4_update_dynamic_rev(sb);
- EXT4_SET_RO_COMPAT_FEATURE(sb,
- EXT4_FEATURE_RO_COMPAT_LARGE_FILE);
- sb->s_dirt = 1;
- ext4_handle_sync(handle);
- err = ext4_handle_dirty_metadata(handle, NULL,
- EXT4_SB(sb)->s_sbh);
- }
+ cpu_to_le32(EXT4_GOOD_OLD_REV))
+ set_large_file = 1;
}
raw_inode->i_generation = cpu_to_le32(inode->i_generation);
if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {
@@ -5156,25 +4389,43 @@ static int ext4_do_update_inode(handle_t *handle,
cpu_to_le32(new_encode_dev(inode->i_rdev));
raw_inode->i_block[2] = 0;
}
- } else
+ } else if (!ext4_has_inline_data(inode)) {
for (block = 0; block < EXT4_N_BLOCKS; block++)
raw_inode->i_block[block] = ei->i_data[block];
+ }
- raw_inode->i_disk_version = cpu_to_le32(inode->i_version);
- if (ei->i_extra_isize) {
- if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi))
- raw_inode->i_version_hi =
- cpu_to_le32(inode->i_version >> 32);
- raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize);
+ if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) {
+ raw_inode->i_disk_version = cpu_to_le32(inode->i_version);
+ if (ei->i_extra_isize) {
+ if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi))
+ raw_inode->i_version_hi =
+ cpu_to_le32(inode->i_version >> 32);
+ raw_inode->i_extra_isize =
+ cpu_to_le16(ei->i_extra_isize);
+ }
}
+ ext4_inode_csum_set(inode, raw_inode, ei);
+
+ spin_unlock(&ei->i_raw_lock);
+
BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
rc = ext4_handle_dirty_metadata(handle, NULL, bh);
if (!err)
err = rc;
ext4_clear_inode_state(inode, EXT4_STATE_NEW);
-
- ext4_update_inode_fsync_trans(handle, inode, 0);
+ if (set_large_file) {
+ BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get write access");
+ err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh);
+ if (err)
+ goto out_brelse;
+ ext4_update_dynamic_rev(sb);
+ EXT4_SET_RO_COMPAT_FEATURE(sb,
+ EXT4_FEATURE_RO_COMPAT_LARGE_FILE);
+ ext4_handle_sync(handle);
+ err = ext4_handle_dirty_super(handle, sb);
+ }
+ ext4_update_inode_fsync_trans(handle, inode, need_datasync);
out_brelse:
brelse(bh);
ext4_std_error(inode->i_sb, err);
@@ -5186,21 +4437,20 @@ out_brelse:
*
* We are called from a few places:
*
- * - Within generic_file_write() for O_SYNC files.
+ * - Within generic_file_aio_write() -> generic_write_sync() for O_SYNC files.
* Here, there will be no transaction running. We wait for any running
- * trasnaction to commit.
+ * transaction to commit.
*
- * - Within sys_sync(), kupdate and such.
- * We wait on commit, if tol to.
+ * - Within flush work (sys_sync(), kupdate and such).
+ * We wait on commit, if told to.
*
- * - Within prune_icache() (PF_MEMALLOC == true)
- * Here we simply return. We can't afford to block kswapd on the
- * journal commit.
+ * - Within iput_final() -> write_inode_now()
+ * We wait on commit, if told to.
*
* In all cases it is actually safe for us to return without doing anything,
* because the inode has been copied into a raw inode buffer in
- * ext4_mark_inode_dirty(). This is a correctness thing for O_SYNC and for
- * knfsd.
+ * ext4_mark_inode_dirty(). This is a correctness thing for WB_SYNC_ALL
+ * writeback.
*
* Note that we are absolutely dependent upon all inode dirtiers doing the
* right thing: they *must* call mark_inode_dirty() after dirtying info in
@@ -5212,15 +4462,15 @@ out_brelse:
* stuff();
* inode->i_size = expr;
*
- * is in error because a kswapd-driven write_inode() could occur while
- * `stuff()' is running, and the new i_size will be lost. Plus the inode
- * will no longer be on the superblock's dirty inode list.
+ * is in error because write_inode() could occur while `stuff()' is running,
+ * and the new i_size will be lost. Plus the inode will no longer be on the
+ * superblock's dirty inode list.
*/
int ext4_write_inode(struct inode *inode, struct writeback_control *wbc)
{
int err;
- if (current->flags & PF_MEMALLOC)
+ if (WARN_ON_ONCE(current->flags & PF_MEMALLOC))
return 0;
if (EXT4_SB(inode->i_sb)->s_journal) {
@@ -5230,7 +4480,12 @@ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc)
return -EIO;
}
- if (wbc->sync_mode != WB_SYNC_ALL)
+ /*
+ * No need to force transaction in WB_SYNC_NONE mode. Also
+ * ext4_sync_fs() will force the commit after everything is
+ * written.
+ */
+ if (wbc->sync_mode != WB_SYNC_ALL || wbc->for_sync)
return 0;
err = ext4_force_commit(inode->i_sb);
@@ -5240,7 +4495,11 @@ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc)
err = __ext4_get_inode_loc(inode, &iloc, 0);
if (err)
return err;
- if (wbc->sync_mode == WB_SYNC_ALL)
+ /*
+ * sync(2) will flush the whole buffer cache. No need to do
+ * it here separately for each inode.
+ */
+ if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync)
sync_dirty_buffer(iloc.bh);
if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) {
EXT4_ERROR_INODE_BLOCK(inode, iloc.bh->b_blocknr,
@@ -5253,6 +4512,48 @@ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc)
}
/*
+ * In data=journal mode ext4_journalled_invalidatepage() may fail to invalidate
+ * buffers that are attached to a page stradding i_size and are undergoing
+ * commit. In that case we have to wait for commit to finish and try again.
+ */
+static void ext4_wait_for_tail_page_commit(struct inode *inode)
+{
+ struct page *page;
+ unsigned offset;
+ journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
+ tid_t commit_tid = 0;
+ int ret;
+
+ offset = inode->i_size & (PAGE_CACHE_SIZE - 1);
+ /*
+ * All buffers in the last page remain valid? Then there's nothing to
+ * do. We do the check mainly to optimize the common PAGE_CACHE_SIZE ==
+ * blocksize case
+ */
+ if (offset > PAGE_CACHE_SIZE - (1 << inode->i_blkbits))
+ return;
+ while (1) {
+ page = find_lock_page(inode->i_mapping,
+ inode->i_size >> PAGE_CACHE_SHIFT);
+ if (!page)
+ return;
+ ret = __ext4_journalled_invalidatepage(page, offset,
+ PAGE_CACHE_SIZE - offset);
+ unlock_page(page);
+ page_cache_release(page);
+ if (ret != -EBUSY)
+ return;
+ commit_tid = 0;
+ read_lock(&journal->j_state_lock);
+ if (journal->j_committing_transaction)
+ commit_tid = journal->j_committing_transaction->t_tid;
+ read_unlock(&journal->j_state_lock);
+ if (commit_tid)
+ jbd2_log_wait_commit(journal, commit_tid);
+ }
+}
+
+/*
* ext4_setattr()
*
* Called from notify_change.
@@ -5289,14 +4590,15 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
if (is_quota_modification(inode, attr))
dquot_initialize(inode);
- if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
- (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) {
+ if ((ia_valid & ATTR_UID && !uid_eq(attr->ia_uid, inode->i_uid)) ||
+ (ia_valid & ATTR_GID && !gid_eq(attr->ia_gid, inode->i_gid))) {
handle_t *handle;
/* (user+group)*(old+new) structure, inode write (sb,
* inode block, ? - but truncate inode update has it) */
- handle = ext4_journal_start(inode, (EXT4_MAXQUOTAS_INIT_BLOCKS(inode->i_sb)+
- EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb))+3);
+ handle = ext4_journal_start(inode, EXT4_HT_QUOTA,
+ (EXT4_MAXQUOTAS_INIT_BLOCKS(inode->i_sb) +
+ EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb)) + 3);
if (IS_ERR(handle)) {
error = PTR_ERR(handle);
goto err_out;
@@ -5316,60 +4618,82 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
ext4_journal_stop(handle);
}
- if (attr->ia_valid & ATTR_SIZE) {
+ if (attr->ia_valid & ATTR_SIZE && attr->ia_size != inode->i_size) {
+ handle_t *handle;
+
if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
if (attr->ia_size > sbi->s_bitmap_maxbytes)
return -EFBIG;
}
- }
-
- if (S_ISREG(inode->i_mode) &&
- attr->ia_valid & ATTR_SIZE &&
- (attr->ia_size < inode->i_size ||
- (ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS)))) {
- handle_t *handle;
- handle = ext4_journal_start(inode, 3);
- if (IS_ERR(handle)) {
- error = PTR_ERR(handle);
- goto err_out;
- }
- if (ext4_handle_valid(handle)) {
- error = ext4_orphan_add(handle, inode);
- orphan = 1;
- }
- EXT4_I(inode)->i_disksize = attr->ia_size;
- rc = ext4_mark_inode_dirty(handle, inode);
- if (!error)
- error = rc;
- ext4_journal_stop(handle);
+ if (IS_I_VERSION(inode) && attr->ia_size != inode->i_size)
+ inode_inc_iversion(inode);
- if (ext4_should_order_data(inode)) {
- error = ext4_begin_ordered_truncate(inode,
+ if (S_ISREG(inode->i_mode) &&
+ (attr->ia_size < inode->i_size)) {
+ if (ext4_should_order_data(inode)) {
+ error = ext4_begin_ordered_truncate(inode,
attr->ia_size);
- if (error) {
- /* Do as much error cleanup as possible */
- handle = ext4_journal_start(inode, 3);
- if (IS_ERR(handle)) {
- ext4_orphan_del(NULL, inode);
+ if (error)
goto err_out;
- }
- ext4_orphan_del(handle, inode);
- orphan = 0;
- ext4_journal_stop(handle);
+ }
+ handle = ext4_journal_start(inode, EXT4_HT_INODE, 3);
+ if (IS_ERR(handle)) {
+ error = PTR_ERR(handle);
goto err_out;
}
+ if (ext4_handle_valid(handle)) {
+ error = ext4_orphan_add(handle, inode);
+ orphan = 1;
+ }
+ down_write(&EXT4_I(inode)->i_data_sem);
+ EXT4_I(inode)->i_disksize = attr->ia_size;
+ rc = ext4_mark_inode_dirty(handle, inode);
+ if (!error)
+ error = rc;
+ /*
+ * We have to update i_size under i_data_sem together
+ * with i_disksize to avoid races with writeback code
+ * running ext4_wb_update_i_disksize().
+ */
+ if (!error)
+ i_size_write(inode, attr->ia_size);
+ up_write(&EXT4_I(inode)->i_data_sem);
+ ext4_journal_stop(handle);
+ if (error) {
+ ext4_orphan_del(NULL, inode);
+ goto err_out;
+ }
+ } else
+ i_size_write(inode, attr->ia_size);
+
+ /*
+ * Blocks are going to be removed from the inode. Wait
+ * for dio in flight. Temporarily disable
+ * dioread_nolock to prevent livelock.
+ */
+ if (orphan) {
+ if (!ext4_should_journal_data(inode)) {
+ ext4_inode_block_unlocked_dio(inode);
+ inode_dio_wait(inode);
+ ext4_inode_resume_unlocked_dio(inode);
+ } else
+ ext4_wait_for_tail_page_commit(inode);
}
- /* ext4_truncate will clear the flag */
- if ((ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS)))
- ext4_truncate(inode);
+ /*
+ * Truncate pagecache after we've waited for commit
+ * in data=journal mode to make pages freeable.
+ */
+ truncate_pagecache(inode, inode->i_size);
}
-
- if ((attr->ia_valid & ATTR_SIZE) &&
- attr->ia_size != i_size_read(inode))
- rc = vmtruncate(inode, attr->ia_size);
+ /*
+ * We want to call ext4_truncate() even if attr->ia_size ==
+ * inode->i_size for cases like truncation of fallocated space
+ */
+ if (attr->ia_valid & ATTR_SIZE)
+ ext4_truncate(inode);
if (!rc) {
setattr_copy(inode, attr);
@@ -5384,7 +4708,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
ext4_orphan_del(NULL, inode);
if (!rc && (ia_valid & ATTR_MODE))
- rc = ext4_acl_chmod(inode);
+ rc = posix_acl_chmod(inode, inode->i_mode);
err_out:
ext4_std_error(inode->i_sb, error);
@@ -5397,12 +4721,21 @@ int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,
struct kstat *stat)
{
struct inode *inode;
- unsigned long delalloc_blocks;
+ unsigned long long delalloc_blocks;
inode = dentry->d_inode;
generic_fillattr(inode, stat);
/*
+ * If there is inline data in the inode, the inode will normally not
+ * have data blocks allocated (it may have an external xattr block).
+ * Report at least one sector for such files, so tools like tar, rsync,
+ * others doen't incorrectly think the file is completely sparse.
+ */
+ if (unlikely(ext4_has_inline_data(inode)))
+ stat->blocks += (stat->size + 511) >> 9;
+
+ /*
* We can't update i_blocks if the block allocation is delayed
* otherwise in the case of system crash before the real block
* allocation is done, we will have i_blocks inconsistent with
@@ -5412,42 +4745,18 @@ int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,
* will return the blocks that include the delayed allocation
* blocks for this file.
*/
- delalloc_blocks = EXT4_I(inode)->i_reserved_data_blocks;
-
- stat->blocks += (delalloc_blocks << inode->i_sb->s_blocksize_bits)>>9;
+ delalloc_blocks = EXT4_C2B(EXT4_SB(inode->i_sb),
+ EXT4_I(inode)->i_reserved_data_blocks);
+ stat->blocks += delalloc_blocks << (inode->i_sb->s_blocksize_bits - 9);
return 0;
}
-static int ext4_indirect_trans_blocks(struct inode *inode, int nrblocks,
- int chunk)
-{
- int indirects;
-
- /* if nrblocks are contiguous */
- if (chunk) {
- /*
- * With N contiguous data blocks, it need at most
- * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) indirect blocks
- * 2 dindirect blocks
- * 1 tindirect block
- */
- indirects = nrblocks / EXT4_ADDR_PER_BLOCK(inode->i_sb);
- return indirects + 3;
- }
- /*
- * if nrblocks are not contiguous, worse case, each block touch
- * a indirect block, and each indirect block touch a double indirect
- * block, plus a triple indirect block
- */
- indirects = nrblocks * 2 + 1;
- return indirects;
-}
-
-static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
+static int ext4_index_trans_blocks(struct inode *inode, int lblocks,
+ int pextents)
{
if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
- return ext4_indirect_trans_blocks(inode, nrblocks, chunk);
- return ext4_ext_index_trans_blocks(inode, nrblocks, chunk);
+ return ext4_ind_trans_blocks(inode, lblocks);
+ return ext4_ext_index_trans_blocks(inode, pextents);
}
/*
@@ -5456,12 +4765,13 @@ static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
* worse case, the indexs blocks spread over different block groups
*
* If datablocks are discontiguous, they are possible to spread over
- * different block groups too. If they are contiuguous, with flexbg,
+ * different block groups too. If they are contiguous, with flexbg,
* they could still across block group boundary.
*
* Also account for superblock, inode, quota and xattr blocks
*/
-static int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk)
+static int ext4_meta_trans_blocks(struct inode *inode, int lblocks,
+ int pextents)
{
ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb);
int gdpblocks;
@@ -5469,14 +4779,10 @@ static int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk)
int ret = 0;
/*
- * How many index blocks need to touch to modify nrblocks?
- * The "Chunk" flag indicating whether the nrblocks is
- * physically contiguous on disk
- *
- * For Direct IO and fallocate, they calls get_block to allocate
- * one single extent at a time, so they could set the "Chunk" flag
+ * How many index blocks need to touch to map @lblocks logical blocks
+ * to @pextents physical extents?
*/
- idxblocks = ext4_index_trans_blocks(inode, nrblocks, chunk);
+ idxblocks = ext4_index_trans_blocks(inode, lblocks, pextents);
ret = idxblocks;
@@ -5484,12 +4790,7 @@ static int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk)
* Now let's see how many group bitmaps and group descriptors need
* to account
*/
- groups = idxblocks;
- if (chunk)
- groups += 1;
- else
- groups += nrblocks;
-
+ groups = idxblocks + pextents;
gdpblocks = groups;
if (groups > ngroups)
groups = ngroups;
@@ -5506,7 +4807,7 @@ static int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk)
}
/*
- * Calulate the total number of credits to reserve to fit
+ * Calculate the total number of credits to reserve to fit
* the modification of a single pages into a single transaction,
* which may include multiple chunks of block allocations.
*
@@ -5520,7 +4821,7 @@ int ext4_writepage_trans_blocks(struct inode *inode)
int bpp = ext4_journal_blocks_per_page(inode);
int ret;
- ret = ext4_meta_trans_blocks(inode, bpp, 0);
+ ret = ext4_meta_trans_blocks(inode, bpp, bpp);
/* Account for data blocks for journalled mode */
if (ext4_should_journal_data(inode))
@@ -5551,7 +4852,7 @@ int ext4_mark_iloc_dirty(handle_t *handle,
{
int err = 0;
- if (test_opt(inode->i_sb, I_VERSION))
+ if (IS_I_VERSION(inode))
inode_inc_iversion(inode);
/* the do_update_inode consumes one bh->b_count */
@@ -5632,14 +4933,6 @@ static int ext4_expand_extra_isize(struct inode *inode,
* inode out, but prune_icache isn't a user-visible syncing function.
* Whenever the user wants stuff synced (sys_sync, sys_msync, sys_fsync)
* we start and wait on commits.
- *
- * Is this efficient/effective? Well, we're being nice to the system
- * by cleaning up our inodes proactively so they can be reaped
- * without I/O. But we are potentially leaving up to five seconds'
- * worth of inodes floating about which prune_icache wants us to
- * write out. One way to fix that would be to get prune_icache()
- * to do a write_super() to free up some memory. It has the desired
- * effect.
*/
int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
{
@@ -5700,11 +4993,11 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
* so would cause a commit on atime updates, which we don't bother doing.
* We handle synchronous inodes at the highest possible level.
*/
-void ext4_dirty_inode(struct inode *inode)
+void ext4_dirty_inode(struct inode *inode, int flags)
{
handle_t *handle;
- handle = ext4_journal_start(inode, 2);
+ handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
if (IS_ERR(handle))
goto out;
@@ -5766,9 +5059,23 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
return 0;
if (is_journal_aborted(journal))
return -EROFS;
+ /* We have to allocate physical blocks for delalloc blocks
+ * before flushing journal. otherwise delalloc blocks can not
+ * be allocated any more. even more truncate on delalloc blocks
+ * could trigger BUG by flushing delalloc blocks in journal.
+ * There is no delalloc block in non-journal data mode.
+ */
+ if (val && test_opt(inode->i_sb, DELALLOC)) {
+ err = ext4_alloc_da_blocks(inode);
+ if (err < 0)
+ return err;
+ }
+
+ /* Wait for all existing dio workers */
+ ext4_inode_block_unlocked_dio(inode);
+ inode_dio_wait(inode);
jbd2_journal_lock_updates(journal);
- jbd2_journal_flush(journal);
/*
* OK, there are no updates running now, and all cached data is
@@ -5780,15 +5087,18 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
if (val)
ext4_set_inode_flag(inode, EXT4_INODE_JOURNAL_DATA);
- else
+ else {
+ jbd2_journal_flush(journal);
ext4_clear_inode_flag(inode, EXT4_INODE_JOURNAL_DATA);
+ }
ext4_set_aops(inode);
jbd2_journal_unlock_updates(journal);
+ ext4_inode_resume_unlocked_dio(inode);
/* Finally we can mark the inode as dirty. */
- handle = ext4_journal_start(inode, 1);
+ handle = ext4_journal_start(inode, EXT4_HT_INODE, 1);
if (IS_ERR(handle))
return PTR_ERR(handle);
@@ -5810,66 +5120,85 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
struct page *page = vmf->page;
loff_t size;
unsigned long len;
- int ret = -EINVAL;
- void *fsdata;
+ int ret;
struct file *file = vma->vm_file;
- struct inode *inode = file->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(file);
struct address_space *mapping = inode->i_mapping;
+ handle_t *handle;
+ get_block_t *get_block;
+ int retries = 0;
- /*
- * Get i_alloc_sem to stop truncates messing with the inode. We cannot
- * get i_mutex because we are already holding mmap_sem.
- */
- down_read(&inode->i_alloc_sem);
+ sb_start_pagefault(inode->i_sb);
+ file_update_time(vma->vm_file);
+ /* Delalloc case is easy... */
+ if (test_opt(inode->i_sb, DELALLOC) &&
+ !ext4_should_journal_data(inode) &&
+ !ext4_nonda_switch(inode->i_sb)) {
+ do {
+ ret = __block_page_mkwrite(vma, vmf,
+ ext4_da_get_block_prep);
+ } while (ret == -ENOSPC &&
+ ext4_should_retry_alloc(inode->i_sb, &retries));
+ goto out_ret;
+ }
+
+ lock_page(page);
size = i_size_read(inode);
- if (page->mapping != mapping || size <= page_offset(page)
- || !PageUptodate(page)) {
- /* page got truncated from under us? */
- goto out_unlock;
+ /* Page got truncated from under us? */
+ if (page->mapping != mapping || page_offset(page) > size) {
+ unlock_page(page);
+ ret = VM_FAULT_NOPAGE;
+ goto out;
}
- ret = 0;
- if (PageMappedToDisk(page))
- goto out_unlock;
if (page->index == size >> PAGE_CACHE_SHIFT)
len = size & ~PAGE_CACHE_MASK;
else
len = PAGE_CACHE_SIZE;
-
- lock_page(page);
/*
- * return if we have all the buffers mapped. This avoid
- * the need to call write_begin/write_end which does a
- * journal_start/journal_stop which can block and take
- * long time
+ * Return if we have all the buffers mapped. This avoids the need to do
+ * journal_start/journal_stop which can block and take a long time
*/
if (page_has_buffers(page)) {
- if (!walk_page_buffers(NULL, page_buffers(page), 0, len, NULL,
- ext4_bh_unmapped)) {
- unlock_page(page);
- goto out_unlock;
+ if (!ext4_walk_page_buffers(NULL, page_buffers(page),
+ 0, len, NULL,
+ ext4_bh_unmapped)) {
+ /* Wait so that we don't change page under IO */
+ wait_for_stable_page(page);
+ ret = VM_FAULT_LOCKED;
+ goto out;
}
}
unlock_page(page);
- /*
- * OK, we need to fill the hole... Do write_begin write_end
- * to do block allocation/reservation.We are not holding
- * inode.i__mutex here. That allow * parallel write_begin,
- * write_end call. lock_page prevent this from happening
- * on the same page though
- */
- ret = mapping->a_ops->write_begin(file, mapping, page_offset(page),
- len, AOP_FLAG_UNINTERRUPTIBLE, &page, &fsdata);
- if (ret < 0)
- goto out_unlock;
- ret = mapping->a_ops->write_end(file, mapping, page_offset(page),
- len, len, page, fsdata);
- if (ret < 0)
- goto out_unlock;
- ret = 0;
-out_unlock:
- if (ret)
+ /* OK, we need to fill the hole... */
+ if (ext4_should_dioread_nolock(inode))
+ get_block = ext4_get_block_write;
+ else
+ get_block = ext4_get_block;
+retry_alloc:
+ handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE,
+ ext4_writepage_trans_blocks(inode));
+ if (IS_ERR(handle)) {
ret = VM_FAULT_SIGBUS;
- up_read(&inode->i_alloc_sem);
+ goto out;
+ }
+ ret = __block_page_mkwrite(vma, vmf, get_block);
+ if (!ret && ext4_should_journal_data(inode)) {
+ if (ext4_walk_page_buffers(handle, page_buffers(page), 0,
+ PAGE_CACHE_SIZE, NULL, do_journal_get_write_access)) {
+ unlock_page(page);
+ ret = VM_FAULT_SIGBUS;
+ ext4_journal_stop(handle);
+ goto out;
+ }
+ ext4_set_inode_state(inode, EXT4_STATE_JDATA);
+ }
+ ext4_journal_stop(handle);
+ if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
+ goto retry_alloc;
+out_ret:
+ ret = block_page_mkwrite_return(ret);
+out:
+ sb_end_pagefault(inode->i_sb);
return ret;
}
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index bf5ae883b1b..0f2252ec274 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -18,9 +18,190 @@
#include "ext4_jbd2.h"
#include "ext4.h"
+#define MAX_32_NUM ((((unsigned long long) 1) << 32) - 1)
+
+/**
+ * Swap memory between @a and @b for @len bytes.
+ *
+ * @a: pointer to first memory area
+ * @b: pointer to second memory area
+ * @len: number of bytes to swap
+ *
+ */
+static void memswap(void *a, void *b, size_t len)
+{
+ unsigned char *ap, *bp;
+ unsigned char tmp;
+
+ ap = (unsigned char *)a;
+ bp = (unsigned char *)b;
+ while (len-- > 0) {
+ tmp = *ap;
+ *ap = *bp;
+ *bp = tmp;
+ ap++;
+ bp++;
+ }
+}
+
+/**
+ * Swap i_data and associated attributes between @inode1 and @inode2.
+ * This function is used for the primary swap between inode1 and inode2
+ * and also to revert this primary swap in case of errors.
+ *
+ * Therefore you have to make sure, that calling this method twice
+ * will revert all changes.
+ *
+ * @inode1: pointer to first inode
+ * @inode2: pointer to second inode
+ */
+static void swap_inode_data(struct inode *inode1, struct inode *inode2)
+{
+ loff_t isize;
+ struct ext4_inode_info *ei1;
+ struct ext4_inode_info *ei2;
+
+ ei1 = EXT4_I(inode1);
+ ei2 = EXT4_I(inode2);
+
+ memswap(&inode1->i_flags, &inode2->i_flags, sizeof(inode1->i_flags));
+ memswap(&inode1->i_version, &inode2->i_version,
+ sizeof(inode1->i_version));
+ memswap(&inode1->i_blocks, &inode2->i_blocks,
+ sizeof(inode1->i_blocks));
+ memswap(&inode1->i_bytes, &inode2->i_bytes, sizeof(inode1->i_bytes));
+ memswap(&inode1->i_atime, &inode2->i_atime, sizeof(inode1->i_atime));
+ memswap(&inode1->i_mtime, &inode2->i_mtime, sizeof(inode1->i_mtime));
+
+ memswap(ei1->i_data, ei2->i_data, sizeof(ei1->i_data));
+ memswap(&ei1->i_flags, &ei2->i_flags, sizeof(ei1->i_flags));
+ memswap(&ei1->i_disksize, &ei2->i_disksize, sizeof(ei1->i_disksize));
+ ext4_es_remove_extent(inode1, 0, EXT_MAX_BLOCKS);
+ ext4_es_remove_extent(inode2, 0, EXT_MAX_BLOCKS);
+ ext4_es_lru_del(inode1);
+ ext4_es_lru_del(inode2);
+
+ isize = i_size_read(inode1);
+ i_size_write(inode1, i_size_read(inode2));
+ i_size_write(inode2, isize);
+}
+
+/**
+ * Swap the information from the given @inode and the inode
+ * EXT4_BOOT_LOADER_INO. It will basically swap i_data and all other
+ * important fields of the inodes.
+ *
+ * @sb: the super block of the filesystem
+ * @inode: the inode to swap with EXT4_BOOT_LOADER_INO
+ *
+ */
+static long swap_inode_boot_loader(struct super_block *sb,
+ struct inode *inode)
+{
+ handle_t *handle;
+ int err;
+ struct inode *inode_bl;
+ struct ext4_inode_info *ei_bl;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+ if (inode->i_nlink != 1 || !S_ISREG(inode->i_mode))
+ return -EINVAL;
+
+ if (!inode_owner_or_capable(inode) || !capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ inode_bl = ext4_iget(sb, EXT4_BOOT_LOADER_INO);
+ if (IS_ERR(inode_bl))
+ return PTR_ERR(inode_bl);
+ ei_bl = EXT4_I(inode_bl);
+
+ filemap_flush(inode->i_mapping);
+ filemap_flush(inode_bl->i_mapping);
+
+ /* Protect orig inodes against a truncate and make sure,
+ * that only 1 swap_inode_boot_loader is running. */
+ lock_two_nondirectories(inode, inode_bl);
+
+ truncate_inode_pages(&inode->i_data, 0);
+ truncate_inode_pages(&inode_bl->i_data, 0);
+
+ /* Wait for all existing dio workers */
+ ext4_inode_block_unlocked_dio(inode);
+ ext4_inode_block_unlocked_dio(inode_bl);
+ inode_dio_wait(inode);
+ inode_dio_wait(inode_bl);
+
+ handle = ext4_journal_start(inode_bl, EXT4_HT_MOVE_EXTENTS, 2);
+ if (IS_ERR(handle)) {
+ err = -EINVAL;
+ goto journal_err_out;
+ }
+
+ /* Protect extent tree against block allocations via delalloc */
+ ext4_double_down_write_data_sem(inode, inode_bl);
+
+ if (inode_bl->i_nlink == 0) {
+ /* this inode has never been used as a BOOT_LOADER */
+ set_nlink(inode_bl, 1);
+ i_uid_write(inode_bl, 0);
+ i_gid_write(inode_bl, 0);
+ inode_bl->i_flags = 0;
+ ei_bl->i_flags = 0;
+ inode_bl->i_version = 1;
+ i_size_write(inode_bl, 0);
+ inode_bl->i_mode = S_IFREG;
+ if (EXT4_HAS_INCOMPAT_FEATURE(sb,
+ EXT4_FEATURE_INCOMPAT_EXTENTS)) {
+ ext4_set_inode_flag(inode_bl, EXT4_INODE_EXTENTS);
+ ext4_ext_tree_init(handle, inode_bl);
+ } else
+ memset(ei_bl->i_data, 0, sizeof(ei_bl->i_data));
+ }
+
+ swap_inode_data(inode, inode_bl);
+
+ inode->i_ctime = inode_bl->i_ctime = ext4_current_time(inode);
+
+ spin_lock(&sbi->s_next_gen_lock);
+ inode->i_generation = sbi->s_next_generation++;
+ inode_bl->i_generation = sbi->s_next_generation++;
+ spin_unlock(&sbi->s_next_gen_lock);
+
+ ext4_discard_preallocations(inode);
+
+ err = ext4_mark_inode_dirty(handle, inode);
+ if (err < 0) {
+ ext4_warning(inode->i_sb,
+ "couldn't mark inode #%lu dirty (err %d)",
+ inode->i_ino, err);
+ /* Revert all changes: */
+ swap_inode_data(inode, inode_bl);
+ } else {
+ err = ext4_mark_inode_dirty(handle, inode_bl);
+ if (err < 0) {
+ ext4_warning(inode_bl->i_sb,
+ "couldn't mark inode #%lu dirty (err %d)",
+ inode_bl->i_ino, err);
+ /* Revert all changes: */
+ swap_inode_data(inode, inode_bl);
+ ext4_mark_inode_dirty(handle, inode);
+ }
+ }
+ ext4_journal_stop(handle);
+ ext4_double_up_write_data_sem(inode, inode_bl);
+
+journal_err_out:
+ ext4_inode_resume_unlocked_dio(inode);
+ ext4_inode_resume_unlocked_dio(inode_bl);
+ unlock_two_nondirectories(inode, inode_bl);
+ iput(inode_bl);
+ return err;
+}
+
long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
- struct inode *inode = filp->f_dentry->d_inode;
+ struct inode *inode = file_inode(filp);
+ struct super_block *sb = inode->i_sb;
struct ext4_inode_info *ei = EXT4_I(inode);
unsigned int flags;
@@ -35,16 +216,16 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
handle_t *handle = NULL;
int err, migrate = 0;
struct ext4_iloc iloc;
- unsigned int oldflags;
+ unsigned int oldflags, mask, i;
unsigned int jflag;
- if (!is_owner_or_cap(inode))
+ if (!inode_owner_or_capable(inode))
return -EACCES;
if (get_user(flags, (int __user *) arg))
return -EFAULT;
- err = mnt_want_write(filp->f_path.mnt);
+ err = mnt_want_write_file(filp);
if (err)
return err;
@@ -80,17 +261,8 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
if (!capable(CAP_SYS_RESOURCE))
goto flags_out;
}
- if (oldflags & EXT4_EXTENTS_FL) {
- /* We don't support clearning extent flags */
- if (!(flags & EXT4_EXTENTS_FL)) {
- err = -EOPNOTSUPP;
- goto flags_out;
- }
- } else if (flags & EXT4_EXTENTS_FL) {
- /* migrate the file */
+ if ((flags ^ oldflags) & EXT4_EXTENTS_FL)
migrate = 1;
- flags &= ~EXT4_EXTENTS_FL;
- }
if (flags & EXT4_EOFBLOCKS_FL) {
/* we don't support adding EOFBLOCKS flag */
@@ -101,7 +273,7 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
} else if (oldflags & EXT4_EOFBLOCKS_FL)
ext4_truncate(inode);
- handle = ext4_journal_start(inode, 1);
+ handle = ext4_journal_start(inode, EXT4_HT_INODE, 1);
if (IS_ERR(handle)) {
err = PTR_ERR(handle);
goto flags_out;
@@ -112,9 +284,14 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
if (err)
goto flags_err;
- flags = flags & EXT4_FL_USER_MODIFIABLE;
- flags |= oldflags & ~EXT4_FL_USER_MODIFIABLE;
- ei->i_flags = flags;
+ for (i = 0, mask = 1; i < 32; i++, mask <<= 1) {
+ if (!(mask & EXT4_FL_USER_MODIFIABLE))
+ continue;
+ if (mask & flags)
+ ext4_set_inode_flag(inode, i);
+ else
+ ext4_clear_inode_flag(inode, i);
+ }
ext4_set_inode_flags(inode);
inode->i_ctime = ext4_current_time(inode);
@@ -129,11 +306,16 @@ flags_err:
err = ext4_change_inode_journal_flag(inode, jflag);
if (err)
goto flags_out;
- if (migrate)
- err = ext4_ext_migrate(inode);
+ if (migrate) {
+ if (flags & EXT4_EXTENTS_FL)
+ err = ext4_ext_migrate(inode);
+ else
+ err = ext4_ind_migrate(inode);
+ }
+
flags_out:
mutex_unlock(&inode->i_mutex);
- mnt_drop_write(filp->f_path.mnt);
+ mnt_drop_write_file(filp);
return err;
}
case EXT4_IOC_GETVERSION:
@@ -146,10 +328,17 @@ flags_out:
__u32 generation;
int err;
- if (!is_owner_or_cap(inode))
+ if (!inode_owner_or_capable(inode))
return -EPERM;
- err = mnt_want_write(filp->f_path.mnt);
+ if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
+ EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) {
+ ext4_warning(sb, "Setting inode version is not "
+ "supported with metadata_csum enabled.");
+ return -ENOTTY;
+ }
+
+ err = mnt_want_write_file(filp);
if (err)
return err;
if (get_user(generation, (int __user *) arg)) {
@@ -157,10 +346,11 @@ flags_out:
goto setversion_out;
}
- handle = ext4_journal_start(inode, 1);
+ mutex_lock(&inode->i_mutex);
+ handle = ext4_journal_start(inode, EXT4_HT_INODE, 1);
if (IS_ERR(handle)) {
err = PTR_ERR(handle);
- goto setversion_out;
+ goto unlock_out;
}
err = ext4_reserve_inode_write(handle, inode, &iloc);
if (err == 0) {
@@ -169,48 +359,37 @@ flags_out:
err = ext4_mark_iloc_dirty(handle, inode, &iloc);
}
ext4_journal_stop(handle);
+
+unlock_out:
+ mutex_unlock(&inode->i_mutex);
setversion_out:
- mnt_drop_write(filp->f_path.mnt);
+ mnt_drop_write_file(filp);
return err;
}
-#ifdef CONFIG_JBD2_DEBUG
- case EXT4_IOC_WAIT_FOR_READONLY:
- /*
- * This is racy - by the time we're woken up and running,
- * the superblock could be released. And the module could
- * have been unloaded. So sue me.
- *
- * Returns 1 if it slept, else zero.
- */
- {
- struct super_block *sb = inode->i_sb;
- DECLARE_WAITQUEUE(wait, current);
- int ret = 0;
-
- set_current_state(TASK_INTERRUPTIBLE);
- add_wait_queue(&EXT4_SB(sb)->ro_wait_queue, &wait);
- if (timer_pending(&EXT4_SB(sb)->turn_ro_timer)) {
- schedule();
- ret = 1;
- }
- remove_wait_queue(&EXT4_SB(sb)->ro_wait_queue, &wait);
- return ret;
- }
-#endif
case EXT4_IOC_GROUP_EXTEND: {
ext4_fsblk_t n_blocks_count;
- struct super_block *sb = inode->i_sb;
int err, err2=0;
- if (!capable(CAP_SYS_RESOURCE))
- return -EPERM;
+ err = ext4_resize_begin(sb);
+ if (err)
+ return err;
- if (get_user(n_blocks_count, (__u32 __user *)arg))
- return -EFAULT;
+ if (get_user(n_blocks_count, (__u32 __user *)arg)) {
+ err = -EFAULT;
+ goto group_extend_out;
+ }
+
+ if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
+ EXT4_FEATURE_RO_COMPAT_BIGALLOC)) {
+ ext4_msg(sb, KERN_ERR,
+ "Online resizing not supported with bigalloc");
+ err = -EOPNOTSUPP;
+ goto group_extend_out;
+ }
- err = mnt_want_write(filp->f_path.mnt);
+ err = mnt_want_write_file(filp);
if (err)
- return err;
+ goto group_extend_out;
err = ext4_group_extend(sb, EXT4_SB(sb)->s_es, n_blocks_count);
if (EXT4_SB(sb)->s_journal) {
@@ -220,14 +399,15 @@ setversion_out:
}
if (err == 0)
err = err2;
- mnt_drop_write(filp->f_path.mnt);
-
+ mnt_drop_write_file(filp);
+group_extend_out:
+ ext4_resize_end(sb);
return err;
}
case EXT4_IOC_MOVE_EXT: {
struct move_extent me;
- struct file *donor_filp;
+ struct fd donor;
int err;
if (!(filp->f_mode & FMODE_READ) ||
@@ -239,48 +419,64 @@ setversion_out:
return -EFAULT;
me.moved_len = 0;
- donor_filp = fget(me.donor_fd);
- if (!donor_filp)
+ donor = fdget(me.donor_fd);
+ if (!donor.file)
return -EBADF;
- if (!(donor_filp->f_mode & FMODE_WRITE)) {
+ if (!(donor.file->f_mode & FMODE_WRITE)) {
err = -EBADF;
goto mext_out;
}
- err = mnt_want_write(filp->f_path.mnt);
+ if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
+ EXT4_FEATURE_RO_COMPAT_BIGALLOC)) {
+ ext4_msg(sb, KERN_ERR,
+ "Online defrag not supported with bigalloc");
+ err = -EOPNOTSUPP;
+ goto mext_out;
+ }
+
+ err = mnt_want_write_file(filp);
if (err)
goto mext_out;
- err = ext4_move_extents(filp, donor_filp, me.orig_start,
+ err = ext4_move_extents(filp, donor.file, me.orig_start,
me.donor_start, me.len, &me.moved_len);
- mnt_drop_write(filp->f_path.mnt);
- if (me.moved_len > 0)
- file_remove_suid(donor_filp);
+ mnt_drop_write_file(filp);
if (copy_to_user((struct move_extent __user *)arg,
&me, sizeof(me)))
err = -EFAULT;
mext_out:
- fput(donor_filp);
+ fdput(donor);
return err;
}
case EXT4_IOC_GROUP_ADD: {
struct ext4_new_group_data input;
- struct super_block *sb = inode->i_sb;
int err, err2=0;
- if (!capable(CAP_SYS_RESOURCE))
- return -EPERM;
+ err = ext4_resize_begin(sb);
+ if (err)
+ return err;
if (copy_from_user(&input, (struct ext4_new_group_input __user *)arg,
- sizeof(input)))
- return -EFAULT;
+ sizeof(input))) {
+ err = -EFAULT;
+ goto group_add_out;
+ }
- err = mnt_want_write(filp->f_path.mnt);
+ if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
+ EXT4_FEATURE_RO_COMPAT_BIGALLOC)) {
+ ext4_msg(sb, KERN_ERR,
+ "Online resizing not supported with bigalloc");
+ err = -EOPNOTSUPP;
+ goto group_add_out;
+ }
+
+ err = mnt_want_write_file(filp);
if (err)
- return err;
+ goto group_add_out;
err = ext4_group_add(sb, &input);
if (EXT4_SB(sb)->s_journal) {
@@ -290,18 +486,22 @@ mext_out:
}
if (err == 0)
err = err2;
- mnt_drop_write(filp->f_path.mnt);
-
+ mnt_drop_write_file(filp);
+ if (!err && ext4_has_group_desc_csum(sb) &&
+ test_opt(sb, INIT_INODE_TABLE))
+ err = ext4_register_li_request(sb, input.group);
+group_add_out:
+ ext4_resize_end(sb);
return err;
}
case EXT4_IOC_MIGRATE:
{
int err;
- if (!is_owner_or_cap(inode))
+ if (!inode_owner_or_capable(inode))
return -EACCES;
- err = mnt_want_write(filp->f_path.mnt);
+ err = mnt_want_write_file(filp);
if (err)
return err;
/*
@@ -313,24 +513,104 @@ mext_out:
mutex_lock(&(inode->i_mutex));
err = ext4_ext_migrate(inode);
mutex_unlock(&(inode->i_mutex));
- mnt_drop_write(filp->f_path.mnt);
+ mnt_drop_write_file(filp);
return err;
}
case EXT4_IOC_ALLOC_DA_BLKS:
{
int err;
- if (!is_owner_or_cap(inode))
+ if (!inode_owner_or_capable(inode))
return -EACCES;
- err = mnt_want_write(filp->f_path.mnt);
+ err = mnt_want_write_file(filp);
if (err)
return err;
err = ext4_alloc_da_blocks(inode);
- mnt_drop_write(filp->f_path.mnt);
+ mnt_drop_write_file(filp);
return err;
}
+ case EXT4_IOC_SWAP_BOOT:
+ if (!(filp->f_mode & FMODE_WRITE))
+ return -EBADF;
+ return swap_inode_boot_loader(sb, inode);
+
+ case EXT4_IOC_RESIZE_FS: {
+ ext4_fsblk_t n_blocks_count;
+ int err = 0, err2 = 0;
+ ext4_group_t o_group = EXT4_SB(sb)->s_groups_count;
+
+ if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
+ EXT4_FEATURE_RO_COMPAT_BIGALLOC)) {
+ ext4_msg(sb, KERN_ERR,
+ "Online resizing not (yet) supported with bigalloc");
+ return -EOPNOTSUPP;
+ }
+
+ if (copy_from_user(&n_blocks_count, (__u64 __user *)arg,
+ sizeof(__u64))) {
+ return -EFAULT;
+ }
+
+ err = ext4_resize_begin(sb);
+ if (err)
+ return err;
+
+ err = mnt_want_write_file(filp);
+ if (err)
+ goto resizefs_out;
+
+ err = ext4_resize_fs(sb, n_blocks_count);
+ if (EXT4_SB(sb)->s_journal) {
+ jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
+ err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal);
+ jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
+ }
+ if (err == 0)
+ err = err2;
+ mnt_drop_write_file(filp);
+ if (!err && (o_group > EXT4_SB(sb)->s_groups_count) &&
+ ext4_has_group_desc_csum(sb) &&
+ test_opt(sb, INIT_INODE_TABLE))
+ err = ext4_register_li_request(sb, o_group);
+
+resizefs_out:
+ ext4_resize_end(sb);
+ return err;
+ }
+
+ case FITRIM:
+ {
+ struct request_queue *q = bdev_get_queue(sb->s_bdev);
+ struct fstrim_range range;
+ int ret = 0;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (!blk_queue_discard(q))
+ return -EOPNOTSUPP;
+
+ if (copy_from_user(&range, (struct fstrim_range __user *)arg,
+ sizeof(range)))
+ return -EFAULT;
+
+ range.minlen = max((unsigned int)range.minlen,
+ q->limits.discard_granularity);
+ ret = ext4_trim_fs(sb, &range);
+ if (ret < 0)
+ return ret;
+
+ if (copy_to_user((struct fstrim_range __user *)arg, &range,
+ sizeof(range)))
+ return -EFAULT;
+
+ return 0;
+ }
+ case EXT4_IOC_PRECACHE_EXTENTS:
+ return ext4_ext_precache(inode);
+
default:
return -ENOTTY;
}
@@ -362,11 +642,6 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
case EXT4_IOC32_SETVERSION_OLD:
cmd = EXT4_IOC_SETVERSION_OLD;
break;
-#ifdef CONFIG_JBD2_DEBUG
- case EXT4_IOC32_WAIT_FOR_READONLY:
- cmd = EXT4_IOC_WAIT_FOR_READONLY;
- break;
-#endif
case EXT4_IOC32_GETRSVSZ:
cmd = EXT4_IOC_GETRSVSZ;
break;
@@ -397,6 +672,9 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
return err;
}
case EXT4_IOC_MOVE_EXT:
+ case FITRIM:
+ case EXT4_IOC_RESIZE_FS:
+ case EXT4_IOC_PRECACHE_EXTENTS:
break;
default:
return -ENOIOCTLCMD;
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 5b4d4e3a4d5..2dcb936be90 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -21,11 +21,20 @@
* mballoc.c contains the multiblocks allocation routines
*/
+#include "ext4_jbd2.h"
#include "mballoc.h"
-#include <linux/debugfs.h>
+#include <linux/log2.h>
+#include <linux/module.h>
#include <linux/slab.h>
#include <trace/events/ext4.h>
+#ifdef CONFIG_EXT4_DEBUG
+ushort ext4_mballoc_debug __read_mostly;
+
+module_param_named(mballoc_debug, ext4_mballoc_debug, ushort, 0644);
+MODULE_PARM_DESC(mballoc_debug, "Debugging level for ext4's mballoc");
+#endif
+
/*
* MUSTDO:
* - test ext4_ext_search_left() and ext4_ext_search_right()
@@ -70,13 +79,13 @@
*
* pa_lstart -> the logical start block for this prealloc space
* pa_pstart -> the physical start block for this prealloc space
- * pa_len -> length for this prealloc space
- * pa_free -> free space available in this prealloc space
+ * pa_len -> length for this prealloc space (in clusters)
+ * pa_free -> free space available in this prealloc space (in clusters)
*
* The inode preallocation space is used looking at the _logical_ start
* block. If only the logical file block falls within the range of prealloc
- * space we will consume the particular prealloc space. This make sure that
- * that the we have contiguous physical blocks representing the file blocks
+ * space we will consume the particular prealloc space. This makes sure that
+ * we have contiguous physical blocks representing the file blocks
*
* The important thing to be noted in case of inode prealloc space is that
* we don't modify the values associated to inode prealloc space except
@@ -84,7 +93,7 @@
*
* If we are not able to find blocks in the inode prealloc space and if we
* have the group allocation flag set then we look at the locality group
- * prealloc space. These are per CPU prealloc list repreasented as
+ * prealloc space. These are per CPU prealloc list represented as
*
* ext4_sb_info.s_locality_groups[smp_processor_id()]
*
@@ -92,7 +101,7 @@
* between CPUs. It is possible to get scheduled at this point.
*
* The locality group prealloc space is used looking at whether we have
- * enough free space (pa_free) withing the prealloc space.
+ * enough free space (pa_free) within the prealloc space.
*
* If we can't allocate blocks via inode prealloc or/and locality group
* prealloc then we look at the buddy cache. The buddy cache is represented
@@ -126,14 +135,16 @@
* list. In case of inode preallocation we follow a list of heuristics
* based on file size. This can be found in ext4_mb_normalize_request. If
* we are doing a group prealloc we try to normalize the request to
- * sbi->s_mb_group_prealloc. Default value of s_mb_group_prealloc is
+ * sbi->s_mb_group_prealloc. The default value of s_mb_group_prealloc is
+ * dependent on the cluster size; for non-bigalloc file systems, it is
* 512 blocks. This can be tuned via
- * /sys/fs/ext4/<partition/mb_group_prealloc. The value is represented in
+ * /sys/fs/ext4/<partition>/mb_group_prealloc. The value is represented in
* terms of number of blocks. If we have mounted the file system with -O
* stripe=<value> option the group prealloc request is normalized to the
- * stripe value (sbi->s_stripe)
+ * the smallest multiple of the stripe value (sbi->s_stripe) which is
+ * greater than the default mb_group_prealloc.
*
- * The regular allocator(using the buddy cache) supports few tunables.
+ * The regular allocator (using the buddy cache) supports a few tunables.
*
* /sys/fs/ext4/<partition>/mb_min_to_scan
* /sys/fs/ext4/<partition>/mb_max_to_scan
@@ -152,7 +163,7 @@
* best extent in the found extents. Searching for the blocks starts with
* the group specified as the goal value in allocation context via
* ac_g_ex. Each group is first checked based on the criteria whether it
- * can used for allocation. ext4_mb_good_group explains how the groups are
+ * can be used for allocation. ext4_mb_good_group explains how the groups are
* checked.
*
* Both the prealloc space are getting populated as above. So for the first
@@ -337,20 +348,26 @@
*/
static struct kmem_cache *ext4_pspace_cachep;
static struct kmem_cache *ext4_ac_cachep;
-static struct kmem_cache *ext4_free_ext_cachep;
+static struct kmem_cache *ext4_free_data_cachep;
/* We create slab caches for groupinfo data structures based on the
* superblock block size. There will be one per mounted filesystem for
* each unique s_blocksize_bits */
-#define NR_GRPINFO_CACHES \
- (EXT4_MAX_BLOCK_LOG_SIZE - EXT4_MIN_BLOCK_LOG_SIZE + 1)
+#define NR_GRPINFO_CACHES 8
static struct kmem_cache *ext4_groupinfo_caches[NR_GRPINFO_CACHES];
+static const char *ext4_groupinfo_slab_names[NR_GRPINFO_CACHES] = {
+ "ext4_groupinfo_1k", "ext4_groupinfo_2k", "ext4_groupinfo_4k",
+ "ext4_groupinfo_8k", "ext4_groupinfo_16k", "ext4_groupinfo_32k",
+ "ext4_groupinfo_64k", "ext4_groupinfo_128k"
+};
+
static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
ext4_group_t group);
static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
ext4_group_t group);
-static void release_blocks_on_commit(journal_t *journal, transaction_t *txn);
+static void ext4_free_data_callback(struct super_block *sb,
+ struct ext4_journal_cb_entry *jce, int rc);
static inline void *mb_correct_addr_and_bit(int *bit, void *addr)
{
@@ -388,6 +405,12 @@ static inline void mb_clear_bit(int bit, void *addr)
ext4_clear_bit(bit, addr);
}
+static inline int mb_test_and_clear_bit(int bit, void *addr)
+{
+ addr = mb_correct_addr_and_bit(&bit, addr);
+ return ext4_test_and_clear_bit(bit, addr);
+}
+
static inline int mb_find_next_zero_bit(void *addr, int max, int start)
{
int fix = 0, ret, tmpmax;
@@ -418,7 +441,7 @@ static void *mb_find_buddy(struct ext4_buddy *e4b, int order, int *max)
{
char *bb;
- BUG_ON(EXT4_MB_BITMAP(e4b) == EXT4_MB_BUDDY(e4b));
+ BUG_ON(e4b->bd_bitmap == e4b->bd_buddy);
BUG_ON(max == NULL);
if (order > e4b->bd_blkbits + 1) {
@@ -427,11 +450,12 @@ static void *mb_find_buddy(struct ext4_buddy *e4b, int order, int *max)
}
/* at order 0 we see each particular block */
- *max = 1 << (e4b->bd_blkbits + 3);
- if (order == 0)
- return EXT4_MB_BITMAP(e4b);
+ if (order == 0) {
+ *max = 1 << (e4b->bd_blkbits + 3);
+ return e4b->bd_bitmap;
+ }
- bb = EXT4_MB_BUDDY(e4b) + EXT4_SB(e4b->bd_sb)->s_mb_offsets[order];
+ bb = e4b->bd_buddy + EXT4_SB(e4b->bd_sb)->s_mb_offsets[order];
*max = EXT4_SB(e4b->bd_sb)->s_mb_maxs[order];
return bb;
@@ -452,7 +476,7 @@ static void mb_free_blocks_double(struct inode *inode, struct ext4_buddy *e4b,
ext4_fsblk_t blocknr;
blocknr = ext4_group_first_block_no(sb, e4b->bd_group);
- blocknr += first + i;
+ blocknr += EXT4_C2B(EXT4_SB(sb), first + i);
ext4_grp_locked_error(sb, e4b->bd_group,
inode ? inode->i_ino : 0,
blocknr,
@@ -486,10 +510,11 @@ static void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap)
b2 = (unsigned char *) bitmap;
for (i = 0; i < e4b->bd_sb->s_blocksize; i++) {
if (b1[i] != b2[i]) {
- printk(KERN_ERR "corruption in group %u "
- "at byte %u(%u): %x in copy != %x "
- "on disk/prealloc\n",
- e4b->bd_group, i, i * 8, b1[i], b2[i]);
+ ext4_msg(e4b->bd_sb, KERN_ERR,
+ "corruption in group %u "
+ "at byte %u(%u): %x in copy != %x "
+ "on disk/prealloc",
+ e4b->bd_group, i, i * 8, b1[i], b2[i]);
BUG();
}
}
@@ -572,14 +597,14 @@ static int __mb_check_buddy(struct ext4_buddy *e4b, char *file,
continue;
}
- /* both bits in buddy2 must be 0 */
+ /* both bits in buddy2 must be 1 */
MB_CHECK_ASSERT(mb_test_bit(i << 1, buddy2));
MB_CHECK_ASSERT(mb_test_bit((i << 1) + 1, buddy2));
for (j = 0; j < (1 << order); j++) {
k = (i * (1 << order)) + j;
MB_CHECK_ASSERT(
- !mb_test_bit(k, EXT4_MB_BITMAP(e4b)));
+ !mb_test_bit(k, e4b->bd_bitmap));
}
count++;
}
@@ -611,7 +636,6 @@ static int __mb_check_buddy(struct ext4_buddy *e4b, char *file,
MB_CHECK_ASSERT(e4b->bd_info->bb_fragments == fragments);
grp = ext4_get_group_info(sb, e4b->bd_group);
- buddy = mb_find_buddy(e4b, 0, &max);
list_for_each(cur, &grp->bb_prealloc_list) {
ext4_group_t groupnr;
struct ext4_prealloc_space *pa;
@@ -630,7 +654,12 @@ static int __mb_check_buddy(struct ext4_buddy *e4b, char *file,
#define mb_check_buddy(e4b)
#endif
-/* FIXME!! need more doc */
+/*
+ * Divide blocks started from @first with length @len into
+ * smaller chunks with power of 2 blocks.
+ * Clear the bits in bitmap which the blocks of the chunk(s) covered,
+ * then increase bb_counters[] for corresponded chunk size.
+ */
static void ext4_mb_mark_free_simple(struct super_block *sb,
void *buddy, ext4_grpblk_t first, ext4_grpblk_t len,
struct ext4_group_info *grp)
@@ -641,7 +670,7 @@ static void ext4_mb_mark_free_simple(struct super_block *sb,
ext4_grpblk_t chunk;
unsigned short border;
- BUG_ON(len > EXT4_BLOCKS_PER_GROUP(sb));
+ BUG_ON(len > EXT4_CLUSTERS_PER_GROUP(sb));
border = 2 << sb->s_blocksize_bits;
@@ -693,7 +722,8 @@ void ext4_mb_generate_buddy(struct super_block *sb,
void *buddy, void *bitmap, ext4_group_t group)
{
struct ext4_group_info *grp = ext4_get_group_info(sb, group);
- ext4_grpblk_t max = EXT4_BLOCKS_PER_GROUP(sb);
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ ext4_grpblk_t max = EXT4_CLUSTERS_PER_GROUP(sb);
ext4_grpblk_t i = 0;
ext4_grpblk_t first;
ext4_grpblk_t len;
@@ -722,13 +752,18 @@ void ext4_mb_generate_buddy(struct super_block *sb,
if (free != grp->bb_free) {
ext4_grp_locked_error(sb, group, 0, 0,
- "%u blocks in bitmap, %u in gd",
+ "block bitmap and bg descriptor "
+ "inconsistent: %u vs %u free clusters",
free, grp->bb_free);
/*
- * If we intent to continue, we consider group descritor
+ * If we intend to continue, we consider group descriptor
* corrupt and update bb_free using bitmap value
*/
grp->bb_free = free;
+ if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp))
+ percpu_counter_sub(&sbi->s_freeclusters_counter,
+ grp->bb_free);
+ set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state);
}
mb_set_largest_free_order(sb, grp);
@@ -741,6 +776,24 @@ void ext4_mb_generate_buddy(struct super_block *sb,
spin_unlock(&EXT4_SB(sb)->s_bal_lock);
}
+static void mb_regenerate_buddy(struct ext4_buddy *e4b)
+{
+ int count;
+ int order = 1;
+ void *buddy;
+
+ while ((buddy = mb_find_buddy(e4b, order++, &count))) {
+ ext4_set_bits(buddy, 0, count);
+ }
+ e4b->bd_info->bb_fragments = 0;
+ memset(e4b->bd_info->bb_counters, 0,
+ sizeof(*e4b->bd_info->bb_counters) *
+ (e4b->bd_sb->s_blocksize_bits + 2));
+
+ ext4_mb_generate_buddy(e4b->bd_sb, e4b->bd_buddy,
+ e4b->bd_bitmap, e4b->bd_group);
+}
+
/* The buddy information is attached the buddy cache inode
* for convenience. The information regarding each group
* is loaded via ext4_mb_load_buddy. The information involve
@@ -769,14 +822,15 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
int groups_per_page;
int err = 0;
int i;
- ext4_group_t first_group;
+ ext4_group_t first_group, group;
int first_block;
struct super_block *sb;
struct buffer_head *bhs;
- struct buffer_head **bh;
+ struct buffer_head **bh = NULL;
struct inode *inode;
char *data;
char *bitmap;
+ struct ext4_group_info *grinfo;
mb_debug(1, "init page %lu\n", page->index);
@@ -792,95 +846,58 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
/* allocate buffer_heads to read bitmaps */
if (groups_per_page > 1) {
- err = -ENOMEM;
i = sizeof(struct buffer_head *) * groups_per_page;
bh = kzalloc(i, GFP_NOFS);
- if (bh == NULL)
+ if (bh == NULL) {
+ err = -ENOMEM;
goto out;
+ }
} else
bh = &bhs;
first_group = page->index * blocks_per_page / 2;
/* read all groups the page covers into the cache */
- for (i = 0; i < groups_per_page; i++) {
- struct ext4_group_desc *desc;
-
- if (first_group + i >= ngroups)
+ for (i = 0, group = first_group; i < groups_per_page; i++, group++) {
+ if (group >= ngroups)
break;
- err = -EIO;
- desc = ext4_get_group_desc(sb, first_group + i, NULL);
- if (desc == NULL)
- goto out;
-
- err = -ENOMEM;
- bh[i] = sb_getblk(sb, ext4_block_bitmap(sb, desc));
- if (bh[i] == NULL)
- goto out;
-
- if (bitmap_uptodate(bh[i]))
- continue;
-
- lock_buffer(bh[i]);
- if (bitmap_uptodate(bh[i])) {
- unlock_buffer(bh[i]);
- continue;
- }
- ext4_lock_group(sb, first_group + i);
- if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
- ext4_init_block_bitmap(sb, bh[i],
- first_group + i, desc);
- set_bitmap_uptodate(bh[i]);
- set_buffer_uptodate(bh[i]);
- ext4_unlock_group(sb, first_group + i);
- unlock_buffer(bh[i]);
+ grinfo = ext4_get_group_info(sb, group);
+ /*
+ * If page is uptodate then we came here after online resize
+ * which added some new uninitialized group info structs, so
+ * we must skip all initialized uptodate buddies on the page,
+ * which may be currently in use by an allocating task.
+ */
+ if (PageUptodate(page) && !EXT4_MB_GRP_NEED_INIT(grinfo)) {
+ bh[i] = NULL;
continue;
}
- ext4_unlock_group(sb, first_group + i);
- if (buffer_uptodate(bh[i])) {
- /*
- * if not uninit if bh is uptodate,
- * bitmap is also uptodate
- */
- set_bitmap_uptodate(bh[i]);
- unlock_buffer(bh[i]);
- continue;
+ if (!(bh[i] = ext4_read_block_bitmap_nowait(sb, group))) {
+ err = -ENOMEM;
+ goto out;
}
- get_bh(bh[i]);
- /*
- * submit the buffer_head for read. We can
- * safely mark the bitmap as uptodate now.
- * We do it here so the bitmap uptodate bit
- * get set with buffer lock held.
- */
- set_bitmap_uptodate(bh[i]);
- bh[i]->b_end_io = end_buffer_read_sync;
- submit_bh(READ, bh[i]);
- mb_debug(1, "read bitmap for group %u\n", first_group + i);
+ mb_debug(1, "read bitmap for group %u\n", group);
}
/* wait for I/O completion */
- for (i = 0; i < groups_per_page && bh[i]; i++)
- wait_on_buffer(bh[i]);
-
- err = -EIO;
- for (i = 0; i < groups_per_page && bh[i]; i++)
- if (!buffer_uptodate(bh[i]))
+ for (i = 0, group = first_group; i < groups_per_page; i++, group++) {
+ if (bh[i] && ext4_wait_block_bitmap(sb, group, bh[i])) {
+ err = -EIO;
goto out;
+ }
+ }
- err = 0;
first_block = page->index * blocks_per_page;
- /* init the page */
- memset(page_address(page), 0xff, PAGE_CACHE_SIZE);
for (i = 0; i < blocks_per_page; i++) {
- int group;
- struct ext4_group_info *grinfo;
-
group = (first_block + i) >> 1;
if (group >= ngroups)
break;
+ if (!bh[group - first_group])
+ /* skip initialized uptodate buddy */
+ continue;
+
/*
* data carry information regarding this
* particular group in the format specified
@@ -909,6 +926,8 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
* incore got set to the group block bitmap below
*/
ext4_lock_group(sb, group);
+ /* init the buddy */
+ memset(data, 0xff, blocksize);
ext4_mb_generate_buddy(sb, data, incore, group);
ext4_unlock_group(sb, group);
incore = NULL;
@@ -938,7 +957,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
out:
if (bh) {
- for (i = 0; i < groups_per_page && bh[i]; i++)
+ for (i = 0; i < groups_per_page; i++)
brelse(bh[i]);
if (bh != &bhs)
kfree(bh);
@@ -947,22 +966,21 @@ out:
}
/*
- * lock the group_info alloc_sem of all the groups
- * belonging to the same buddy cache page. This
- * make sure other parallel operation on the buddy
- * cache doesn't happen whild holding the buddy cache
- * lock
+ * Lock the buddy and bitmap pages. This make sure other parallel init_group
+ * on the same buddy page doesn't happen whild holding the buddy page lock.
+ * Return locked buddy and bitmap pages on e4b struct. If buddy and bitmap
+ * are on the same page e4b->bd_buddy_page is NULL and return value is 0.
*/
-static int ext4_mb_get_buddy_cache_lock(struct super_block *sb,
- ext4_group_t group)
+static int ext4_mb_get_buddy_page_lock(struct super_block *sb,
+ ext4_group_t group, struct ext4_buddy *e4b)
{
- int i;
- int block, pnum;
+ struct inode *inode = EXT4_SB(sb)->s_buddy_cache;
+ int block, pnum, poff;
int blocks_per_page;
- int groups_per_page;
- ext4_group_t ngroups = ext4_get_groups_count(sb);
- ext4_group_t first_group;
- struct ext4_group_info *grp;
+ struct page *page;
+
+ e4b->bd_buddy_page = NULL;
+ e4b->bd_bitmap_page = NULL;
blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
/*
@@ -972,57 +990,39 @@ static int ext4_mb_get_buddy_cache_lock(struct super_block *sb,
*/
block = group * 2;
pnum = block / blocks_per_page;
- first_group = pnum * blocks_per_page / 2;
-
- groups_per_page = blocks_per_page >> 1;
- if (groups_per_page == 0)
- groups_per_page = 1;
- /* read all groups the page covers into the cache */
- for (i = 0; i < groups_per_page; i++) {
+ poff = block % blocks_per_page;
+ page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
+ if (!page)
+ return -ENOMEM;
+ BUG_ON(page->mapping != inode->i_mapping);
+ e4b->bd_bitmap_page = page;
+ e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize);
- if ((first_group + i) >= ngroups)
- break;
- grp = ext4_get_group_info(sb, first_group + i);
- /* take all groups write allocation
- * semaphore. This make sure there is
- * no block allocation going on in any
- * of that groups
- */
- down_write_nested(&grp->alloc_sem, i);
+ if (blocks_per_page >= 2) {
+ /* buddy and bitmap are on the same page */
+ return 0;
}
- return i;
+
+ block++;
+ pnum = block / blocks_per_page;
+ page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
+ if (!page)
+ return -ENOMEM;
+ BUG_ON(page->mapping != inode->i_mapping);
+ e4b->bd_buddy_page = page;
+ return 0;
}
-static void ext4_mb_put_buddy_cache_lock(struct super_block *sb,
- ext4_group_t group, int locked_group)
+static void ext4_mb_put_buddy_page_lock(struct ext4_buddy *e4b)
{
- int i;
- int block, pnum;
- int blocks_per_page;
- ext4_group_t first_group;
- struct ext4_group_info *grp;
-
- blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
- /*
- * the buddy cache inode stores the block bitmap
- * and buddy information in consecutive blocks.
- * So for each group we need two blocks.
- */
- block = group * 2;
- pnum = block / blocks_per_page;
- first_group = pnum * blocks_per_page / 2;
- /* release locks on all the groups */
- for (i = 0; i < locked_group; i++) {
-
- grp = ext4_get_group_info(sb, first_group + i);
- /* take all groups write allocation
- * semaphore. This make sure there is
- * no block allocation going on in any
- * of that groups
- */
- up_write(&grp->alloc_sem);
+ if (e4b->bd_bitmap_page) {
+ unlock_page(e4b->bd_bitmap_page);
+ page_cache_release(e4b->bd_bitmap_page);
+ }
+ if (e4b->bd_buddy_page) {
+ unlock_page(e4b->bd_buddy_page);
+ page_cache_release(e4b->bd_buddy_page);
}
-
}
/*
@@ -1034,93 +1034,61 @@ static noinline_for_stack
int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
{
- int ret = 0;
- void *bitmap;
- int blocks_per_page;
- int block, pnum, poff;
- int num_grp_locked = 0;
struct ext4_group_info *this_grp;
- struct ext4_sb_info *sbi = EXT4_SB(sb);
- struct inode *inode = sbi->s_buddy_cache;
- struct page *page = NULL, *bitmap_page = NULL;
+ struct ext4_buddy e4b;
+ struct page *page;
+ int ret = 0;
+ might_sleep();
mb_debug(1, "init group %u\n", group);
- blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
this_grp = ext4_get_group_info(sb, group);
/*
* This ensures that we don't reinit the buddy cache
* page which map to the group from which we are already
* allocating. If we are looking at the buddy cache we would
* have taken a reference using ext4_mb_load_buddy and that
- * would have taken the alloc_sem lock.
+ * would have pinned buddy page to page cache.
+ * The call to ext4_mb_get_buddy_page_lock will mark the
+ * page accessed.
*/
- num_grp_locked = ext4_mb_get_buddy_cache_lock(sb, group);
- if (!EXT4_MB_GRP_NEED_INIT(this_grp)) {
+ ret = ext4_mb_get_buddy_page_lock(sb, group, &e4b);
+ if (ret || !EXT4_MB_GRP_NEED_INIT(this_grp)) {
/*
* somebody initialized the group
* return without doing anything
*/
- ret = 0;
goto err;
}
- /*
- * the buddy cache inode stores the block bitmap
- * and buddy information in consecutive blocks.
- * So for each group we need two blocks.
- */
- block = group * 2;
- pnum = block / blocks_per_page;
- poff = block % blocks_per_page;
- page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
- if (page) {
- BUG_ON(page->mapping != inode->i_mapping);
- ret = ext4_mb_init_cache(page, NULL);
- if (ret) {
- unlock_page(page);
- goto err;
- }
- unlock_page(page);
- }
- if (page == NULL || !PageUptodate(page)) {
+
+ page = e4b.bd_bitmap_page;
+ ret = ext4_mb_init_cache(page, NULL);
+ if (ret)
+ goto err;
+ if (!PageUptodate(page)) {
ret = -EIO;
goto err;
}
- mark_page_accessed(page);
- bitmap_page = page;
- bitmap = page_address(page) + (poff * sb->s_blocksize);
- /* init buddy cache */
- block++;
- pnum = block / blocks_per_page;
- poff = block % blocks_per_page;
- page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
- if (page == bitmap_page) {
+ if (e4b.bd_buddy_page == NULL) {
/*
* If both the bitmap and buddy are in
* the same page we don't need to force
* init the buddy
*/
- unlock_page(page);
- } else if (page) {
- BUG_ON(page->mapping != inode->i_mapping);
- ret = ext4_mb_init_cache(page, bitmap);
- if (ret) {
- unlock_page(page);
- goto err;
- }
- unlock_page(page);
+ ret = 0;
+ goto err;
}
- if (page == NULL || !PageUptodate(page)) {
+ /* init buddy cache */
+ page = e4b.bd_buddy_page;
+ ret = ext4_mb_init_cache(page, e4b.bd_bitmap);
+ if (ret)
+ goto err;
+ if (!PageUptodate(page)) {
ret = -EIO;
goto err;
}
- mark_page_accessed(page);
err:
- ext4_mb_put_buddy_cache_lock(sb, group, num_grp_locked);
- if (bitmap_page)
- page_cache_release(bitmap_page);
- if (page)
- page_cache_release(page);
+ ext4_mb_put_buddy_page_lock(&e4b);
return ret;
}
@@ -1143,35 +1111,20 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
struct ext4_sb_info *sbi = EXT4_SB(sb);
struct inode *inode = sbi->s_buddy_cache;
+ might_sleep();
mb_debug(1, "load group %u\n", group);
blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
grp = ext4_get_group_info(sb, group);
e4b->bd_blkbits = sb->s_blocksize_bits;
- e4b->bd_info = ext4_get_group_info(sb, group);
+ e4b->bd_info = grp;
e4b->bd_sb = sb;
e4b->bd_group = group;
e4b->bd_buddy_page = NULL;
e4b->bd_bitmap_page = NULL;
- e4b->alloc_semp = &grp->alloc_sem;
-
- /* Take the read lock on the group alloc
- * sem. This would make sure a parallel
- * ext4_mb_init_group happening on other
- * groups mapped by the page is blocked
- * till we are done with allocation
- */
-repeat_load_buddy:
- down_read(e4b->alloc_semp);
if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
- /* we need to check for group need init flag
- * with alloc_semp held so that we can be sure
- * that new blocks didn't get added to the group
- * when we are loading the buddy cache
- */
- up_read(e4b->alloc_semp);
/*
* we need full data about the group
* to make a good selection
@@ -1179,7 +1132,6 @@ repeat_load_buddy:
ret = ext4_mb_init_group(sb, group);
if (ret)
return ret;
- goto repeat_load_buddy;
}
/*
@@ -1193,7 +1145,7 @@ repeat_load_buddy:
/* we could use find_or_create_page(), but it locks page
* what we'd like to avoid in fast path ... */
- page = find_get_page(inode->i_mapping, pnum);
+ page = find_get_page_flags(inode->i_mapping, pnum, FGP_ACCESSED);
if (page == NULL || !PageUptodate(page)) {
if (page)
/*
@@ -1220,19 +1172,24 @@ repeat_load_buddy:
unlock_page(page);
}
}
- if (page == NULL || !PageUptodate(page)) {
+ if (page == NULL) {
+ ret = -ENOMEM;
+ goto err;
+ }
+ if (!PageUptodate(page)) {
ret = -EIO;
goto err;
}
+
+ /* Pages marked accessed already */
e4b->bd_bitmap_page = page;
e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize);
- mark_page_accessed(page);
block++;
pnum = block / blocks_per_page;
poff = block % blocks_per_page;
- page = find_get_page(inode->i_mapping, pnum);
+ page = find_get_page_flags(inode->i_mapping, pnum, FGP_ACCESSED);
if (page == NULL || !PageUptodate(page)) {
if (page)
page_cache_release(page);
@@ -1249,13 +1206,18 @@ repeat_load_buddy:
unlock_page(page);
}
}
- if (page == NULL || !PageUptodate(page)) {
+ if (page == NULL) {
+ ret = -ENOMEM;
+ goto err;
+ }
+ if (!PageUptodate(page)) {
ret = -EIO;
goto err;
}
+
+ /* Pages marked accessed already */
e4b->bd_buddy_page = page;
e4b->bd_buddy = page_address(page) + (poff * sb->s_blocksize);
- mark_page_accessed(page);
BUG_ON(e4b->bd_bitmap_page == NULL);
BUG_ON(e4b->bd_buddy_page == NULL);
@@ -1263,15 +1225,14 @@ repeat_load_buddy:
return 0;
err:
+ if (page)
+ page_cache_release(page);
if (e4b->bd_bitmap_page)
page_cache_release(e4b->bd_bitmap_page);
if (e4b->bd_buddy_page)
page_cache_release(e4b->bd_buddy_page);
e4b->bd_buddy = NULL;
e4b->bd_bitmap = NULL;
-
- /* Done with the buddy cache */
- up_read(e4b->alloc_semp);
return ret;
}
@@ -1281,9 +1242,6 @@ static void ext4_mb_unload_buddy(struct ext4_buddy *e4b)
page_cache_release(e4b->bd_bitmap_page);
if (e4b->bd_buddy_page)
page_cache_release(e4b->bd_buddy_page);
- /* Done with the buddy cache */
- if (e4b->alloc_semp)
- up_read(e4b->alloc_semp);
}
@@ -1292,10 +1250,10 @@ static int mb_find_order_for_block(struct ext4_buddy *e4b, int block)
int order = 1;
void *bb;
- BUG_ON(EXT4_MB_BITMAP(e4b) == EXT4_MB_BUDDY(e4b));
+ BUG_ON(e4b->bd_bitmap == e4b->bd_buddy);
BUG_ON(block >= (1 << (e4b->bd_blkbits + 3)));
- bb = EXT4_MB_BUDDY(e4b);
+ bb = e4b->bd_buddy;
while (order <= e4b->bd_blkbits + 1) {
block = block >> 1;
if (!mb_test_bit(block, bb)) {
@@ -1326,7 +1284,34 @@ static void mb_clear_bits(void *bm, int cur, int len)
}
}
-static void mb_set_bits(void *bm, int cur, int len)
+/* clear bits in given range
+ * will return first found zero bit if any, -1 otherwise
+ */
+static int mb_test_and_clear_bits(void *bm, int cur, int len)
+{
+ __u32 *addr;
+ int zero_bit = -1;
+
+ len = cur + len;
+ while (cur < len) {
+ if ((cur & 31) == 0 && (len - cur) >= 32) {
+ /* fast path: clear whole word at once */
+ addr = bm + (cur >> 3);
+ if (*addr != (__u32)(-1) && zero_bit == -1)
+ zero_bit = cur + mb_find_next_zero_bit(addr, 32, 0);
+ *addr = 0;
+ cur += 32;
+ continue;
+ }
+ if (!mb_test_and_clear_bit(cur, bm) && zero_bit == -1)
+ zero_bit = cur;
+ cur++;
+ }
+
+ return zero_bit;
+}
+
+void ext4_set_bits(void *bm, int cur, int len)
{
__u32 *addr;
@@ -1344,18 +1329,95 @@ static void mb_set_bits(void *bm, int cur, int len)
}
}
+/*
+ * _________________________________________________________________ */
+
+static inline int mb_buddy_adjust_border(int* bit, void* bitmap, int side)
+{
+ if (mb_test_bit(*bit + side, bitmap)) {
+ mb_clear_bit(*bit, bitmap);
+ (*bit) -= side;
+ return 1;
+ }
+ else {
+ (*bit) += side;
+ mb_set_bit(*bit, bitmap);
+ return -1;
+ }
+}
+
+static void mb_buddy_mark_free(struct ext4_buddy *e4b, int first, int last)
+{
+ int max;
+ int order = 1;
+ void *buddy = mb_find_buddy(e4b, order, &max);
+
+ while (buddy) {
+ void *buddy2;
+
+ /* Bits in range [first; last] are known to be set since
+ * corresponding blocks were allocated. Bits in range
+ * (first; last) will stay set because they form buddies on
+ * upper layer. We just deal with borders if they don't
+ * align with upper layer and then go up.
+ * Releasing entire group is all about clearing
+ * single bit of highest order buddy.
+ */
+
+ /* Example:
+ * ---------------------------------
+ * | 1 | 1 | 1 | 1 |
+ * ---------------------------------
+ * | 0 | 1 | 1 | 1 | 1 | 1 | 1 | 1 |
+ * ---------------------------------
+ * 0 1 2 3 4 5 6 7
+ * \_____________________/
+ *
+ * Neither [1] nor [6] is aligned to above layer.
+ * Left neighbour [0] is free, so mark it busy,
+ * decrease bb_counters and extend range to
+ * [0; 6]
+ * Right neighbour [7] is busy. It can't be coaleasced with [6], so
+ * mark [6] free, increase bb_counters and shrink range to
+ * [0; 5].
+ * Then shift range to [0; 2], go up and do the same.
+ */
+
+
+ if (first & 1)
+ e4b->bd_info->bb_counters[order] += mb_buddy_adjust_border(&first, buddy, -1);
+ if (!(last & 1))
+ e4b->bd_info->bb_counters[order] += mb_buddy_adjust_border(&last, buddy, 1);
+ if (first > last)
+ break;
+ order++;
+
+ if (first == last || !(buddy2 = mb_find_buddy(e4b, order, &max))) {
+ mb_clear_bits(buddy, first, last - first + 1);
+ e4b->bd_info->bb_counters[order - 1] += last - first + 1;
+ break;
+ }
+ first >>= 1;
+ last >>= 1;
+ buddy = buddy2;
+ }
+}
+
static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
- int first, int count)
+ int first, int count)
{
- int block = 0;
- int max = 0;
- int order;
- void *buddy;
- void *buddy2;
+ int left_is_free = 0;
+ int right_is_free = 0;
+ int block;
+ int last = first + count - 1;
struct super_block *sb = e4b->bd_sb;
- BUG_ON(first + count > (sb->s_blocksize << 3));
+ BUG_ON(last >= (sb->s_blocksize << 3));
assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group));
+ /* Don't bother if the block group is corrupt. */
+ if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info)))
+ return;
+
mb_check_buddy(e4b);
mb_free_blocks_double(inode, e4b, first, count);
@@ -1363,83 +1425,77 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
if (first < e4b->bd_info->bb_first_free)
e4b->bd_info->bb_first_free = first;
- /* let's maintain fragments counter */
+ /* access memory sequentially: check left neighbour,
+ * clear range and then check right neighbour
+ */
if (first != 0)
- block = !mb_test_bit(first - 1, EXT4_MB_BITMAP(e4b));
- if (first + count < EXT4_SB(sb)->s_mb_maxs[0])
- max = !mb_test_bit(first + count, EXT4_MB_BITMAP(e4b));
- if (block && max)
+ left_is_free = !mb_test_bit(first - 1, e4b->bd_bitmap);
+ block = mb_test_and_clear_bits(e4b->bd_bitmap, first, count);
+ if (last + 1 < EXT4_SB(sb)->s_mb_maxs[0])
+ right_is_free = !mb_test_bit(last + 1, e4b->bd_bitmap);
+
+ if (unlikely(block != -1)) {
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ ext4_fsblk_t blocknr;
+
+ blocknr = ext4_group_first_block_no(sb, e4b->bd_group);
+ blocknr += EXT4_C2B(EXT4_SB(sb), block);
+ ext4_grp_locked_error(sb, e4b->bd_group,
+ inode ? inode->i_ino : 0,
+ blocknr,
+ "freeing already freed block "
+ "(bit %u); block bitmap corrupt.",
+ block);
+ if (!EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info))
+ percpu_counter_sub(&sbi->s_freeclusters_counter,
+ e4b->bd_info->bb_free);
+ /* Mark the block group as corrupt. */
+ set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT,
+ &e4b->bd_info->bb_state);
+ mb_regenerate_buddy(e4b);
+ goto done;
+ }
+
+ /* let's maintain fragments counter */
+ if (left_is_free && right_is_free)
e4b->bd_info->bb_fragments--;
- else if (!block && !max)
+ else if (!left_is_free && !right_is_free)
e4b->bd_info->bb_fragments++;
- /* let's maintain buddy itself */
- while (count-- > 0) {
- block = first++;
- order = 0;
-
- if (!mb_test_bit(block, EXT4_MB_BITMAP(e4b))) {
- ext4_fsblk_t blocknr;
-
- blocknr = ext4_group_first_block_no(sb, e4b->bd_group);
- blocknr += block;
- ext4_grp_locked_error(sb, e4b->bd_group,
- inode ? inode->i_ino : 0,
- blocknr,
- "freeing already freed block "
- "(bit %u)", block);
- }
- mb_clear_bit(block, EXT4_MB_BITMAP(e4b));
- e4b->bd_info->bb_counters[order]++;
-
- /* start of the buddy */
- buddy = mb_find_buddy(e4b, order, &max);
-
- do {
- block &= ~1UL;
- if (mb_test_bit(block, buddy) ||
- mb_test_bit(block + 1, buddy))
- break;
-
- /* both the buddies are free, try to coalesce them */
- buddy2 = mb_find_buddy(e4b, order + 1, &max);
-
- if (!buddy2)
- break;
-
- if (order > 0) {
- /* for special purposes, we don't set
- * free bits in bitmap */
- mb_set_bit(block, buddy);
- mb_set_bit(block + 1, buddy);
- }
- e4b->bd_info->bb_counters[order]--;
- e4b->bd_info->bb_counters[order]--;
+ /* buddy[0] == bd_bitmap is a special case, so handle
+ * it right away and let mb_buddy_mark_free stay free of
+ * zero order checks.
+ * Check if neighbours are to be coaleasced,
+ * adjust bitmap bb_counters and borders appropriately.
+ */
+ if (first & 1) {
+ first += !left_is_free;
+ e4b->bd_info->bb_counters[0] += left_is_free ? -1 : 1;
+ }
+ if (!(last & 1)) {
+ last -= !right_is_free;
+ e4b->bd_info->bb_counters[0] += right_is_free ? -1 : 1;
+ }
- block = block >> 1;
- order++;
- e4b->bd_info->bb_counters[order]++;
+ if (first <= last)
+ mb_buddy_mark_free(e4b, first >> 1, last >> 1);
- mb_clear_bit(block, buddy2);
- buddy = buddy2;
- } while (1);
- }
+done:
mb_set_largest_free_order(sb, e4b->bd_info);
mb_check_buddy(e4b);
}
-static int mb_find_extent(struct ext4_buddy *e4b, int order, int block,
+static int mb_find_extent(struct ext4_buddy *e4b, int block,
int needed, struct ext4_free_extent *ex)
{
int next = block;
- int max;
- int ord;
+ int max, order;
void *buddy;
assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group));
BUG_ON(ex == NULL);
- buddy = mb_find_buddy(e4b, order, &max);
+ buddy = mb_find_buddy(e4b, 0, &max);
BUG_ON(buddy == NULL);
BUG_ON(block >= max);
if (mb_test_bit(block, buddy)) {
@@ -1449,12 +1505,9 @@ static int mb_find_extent(struct ext4_buddy *e4b, int order, int block,
return 0;
}
- /* FIXME dorp order completely ? */
- if (likely(order == 0)) {
- /* find actual order */
- order = mb_find_order_for_block(e4b, block);
- block = block >> order;
- }
+ /* find actual order */
+ order = mb_find_order_for_block(e4b, block);
+ block = block >> order;
ex->fe_len = 1 << order;
ex->fe_start = block << order;
@@ -1466,18 +1519,17 @@ static int mb_find_extent(struct ext4_buddy *e4b, int order, int block,
ex->fe_start += next;
while (needed > ex->fe_len &&
- (buddy = mb_find_buddy(e4b, order, &max))) {
+ mb_find_buddy(e4b, order, &max)) {
if (block + 1 >= max)
break;
next = (block + 1) * (1 << order);
- if (mb_test_bit(next, EXT4_MB_BITMAP(e4b)))
+ if (mb_test_bit(next, e4b->bd_bitmap))
break;
- ord = mb_find_order_for_block(e4b, next);
+ order = mb_find_order_for_block(e4b, next);
- order = ord;
block = next >> order;
ex->fe_len += 1 << order;
}
@@ -1510,9 +1562,9 @@ static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex)
/* let's maintain fragments counter */
if (start != 0)
- mlen = !mb_test_bit(start - 1, EXT4_MB_BITMAP(e4b));
+ mlen = !mb_test_bit(start - 1, e4b->bd_bitmap);
if (start + len < EXT4_SB(e4b->bd_sb)->s_mb_maxs[0])
- max = !mb_test_bit(start + len, EXT4_MB_BITMAP(e4b));
+ max = !mb_test_bit(start + len, e4b->bd_bitmap);
if (mlen && max)
e4b->bd_info->bb_fragments++;
else if (!mlen && !max)
@@ -1555,7 +1607,7 @@ static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex)
}
mb_set_largest_free_order(e4b->bd_sb, e4b->bd_info);
- mb_set_bits(EXT4_MB_BITMAP(e4b), ex->fe_start, len0);
+ ext4_set_bits(e4b->bd_bitmap, ex->fe_start, len0);
mb_check_buddy(e4b);
return ret;
@@ -1596,9 +1648,6 @@ static void ext4_mb_use_best_found(struct ext4_allocation_context *ac,
get_page(ac->ac_bitmap_page);
ac->ac_buddy_page = e4b->bd_buddy_page;
get_page(ac->ac_buddy_page);
- /* on allocation we use ac to track the held semaphore */
- ac->alloc_semp = e4b->alloc_semp;
- e4b->alloc_semp = NULL;
/* store last allocated for subsequent stream allocation */
if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) {
spin_lock(&sbi->s_md_lock);
@@ -1644,7 +1693,7 @@ static void ext4_mb_check_limits(struct ext4_allocation_context *ac,
/* recheck chunk's availability - we don't know
* when it was found (within this lock-unlock
* period or not) */
- max = mb_find_extent(e4b, 0, bex->fe_start, gex->fe_len, &ex);
+ max = mb_find_extent(e4b, bex->fe_start, gex->fe_len, &ex);
if (max >= gex->fe_len) {
ext4_mb_use_best_found(ac, e4b);
return;
@@ -1670,8 +1719,8 @@ static void ext4_mb_measure_extent(struct ext4_allocation_context *ac,
struct ext4_free_extent *gex = &ac->ac_g_ex;
BUG_ON(ex->fe_len <= 0);
- BUG_ON(ex->fe_len > EXT4_BLOCKS_PER_GROUP(ac->ac_sb));
- BUG_ON(ex->fe_start >= EXT4_BLOCKS_PER_GROUP(ac->ac_sb));
+ BUG_ON(ex->fe_len > EXT4_CLUSTERS_PER_GROUP(ac->ac_sb));
+ BUG_ON(ex->fe_start >= EXT4_CLUSTERS_PER_GROUP(ac->ac_sb));
BUG_ON(ac->ac_status != AC_STATUS_CONTINUE);
ac->ac_found++;
@@ -1736,7 +1785,7 @@ int ext4_mb_try_best_found(struct ext4_allocation_context *ac,
return err;
ext4_lock_group(ac->ac_sb, group);
- max = mb_find_extent(e4b, 0, ex.fe_start, ex.fe_len, &ex);
+ max = mb_find_extent(e4b, ex.fe_start, ex.fe_len, &ex);
if (max > 0) {
ac->ac_b_ex = ex;
@@ -1757,18 +1806,27 @@ int ext4_mb_find_by_goal(struct ext4_allocation_context *ac,
int max;
int err;
struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
+ struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group);
struct ext4_free_extent ex;
if (!(ac->ac_flags & EXT4_MB_HINT_TRY_GOAL))
return 0;
+ if (grp->bb_free == 0)
+ return 0;
err = ext4_mb_load_buddy(ac->ac_sb, group, e4b);
if (err)
return err;
+ if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info))) {
+ ext4_mb_unload_buddy(e4b);
+ return 0;
+ }
+
ext4_lock_group(ac->ac_sb, group);
- max = mb_find_extent(e4b, 0, ac->ac_g_ex.fe_start,
+ max = mb_find_extent(e4b, ac->ac_g_ex.fe_start,
ac->ac_g_ex.fe_len, &ex);
+ ex.fe_logical = 0xDEADFA11; /* debug value */
if (max >= ac->ac_g_ex.fe_len && ac->ac_g_ex.fe_len == sbi->s_stripe) {
ext4_fsblk_t start;
@@ -1857,7 +1915,7 @@ void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
struct ext4_buddy *e4b)
{
struct super_block *sb = ac->ac_sb;
- void *bitmap = EXT4_MB_BITMAP(e4b);
+ void *bitmap = e4b->bd_bitmap;
struct ext4_free_extent ex;
int i;
int free;
@@ -1869,25 +1927,25 @@ void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
while (free && ac->ac_status == AC_STATUS_CONTINUE) {
i = mb_find_next_zero_bit(bitmap,
- EXT4_BLOCKS_PER_GROUP(sb), i);
- if (i >= EXT4_BLOCKS_PER_GROUP(sb)) {
+ EXT4_CLUSTERS_PER_GROUP(sb), i);
+ if (i >= EXT4_CLUSTERS_PER_GROUP(sb)) {
/*
* IF we have corrupt bitmap, we won't find any
* free blocks even though group info says we
* we have free blocks
*/
ext4_grp_locked_error(sb, e4b->bd_group, 0, 0,
- "%d free blocks as per "
+ "%d free clusters as per "
"group info. But bitmap says 0",
free);
break;
}
- mb_find_extent(e4b, 0, i, ac->ac_g_ex.fe_len, &ex);
+ mb_find_extent(e4b, i, ac->ac_g_ex.fe_len, &ex);
BUG_ON(ex.fe_len <= 0);
if (free < ex.fe_len) {
ext4_grp_locked_error(sb, e4b->bd_group, 0, 0,
- "%d free blocks as per "
+ "%d free clusters as per "
"group info. But got %d blocks",
free, ex.fe_len);
/*
@@ -1897,7 +1955,7 @@ void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
*/
break;
}
-
+ ex.fe_logical = 0xDEADC0DE; /* debug value */
ext4_mb_measure_extent(ac, &ex, e4b);
i += ex.fe_len;
@@ -1917,7 +1975,7 @@ void ext4_mb_scan_aligned(struct ext4_allocation_context *ac,
{
struct super_block *sb = ac->ac_sb;
struct ext4_sb_info *sbi = EXT4_SB(sb);
- void *bitmap = EXT4_MB_BITMAP(e4b);
+ void *bitmap = e4b->bd_bitmap;
struct ext4_free_extent ex;
ext4_fsblk_t first_group_block;
ext4_fsblk_t a;
@@ -1933,11 +1991,12 @@ void ext4_mb_scan_aligned(struct ext4_allocation_context *ac,
do_div(a, sbi->s_stripe);
i = (a * sbi->s_stripe) - first_group_block;
- while (i < EXT4_BLOCKS_PER_GROUP(sb)) {
+ while (i < EXT4_CLUSTERS_PER_GROUP(sb)) {
if (!mb_test_bit(i, bitmap)) {
- max = mb_find_extent(e4b, 0, i, sbi->s_stripe, &ex);
+ max = mb_find_extent(e4b, i, sbi->s_stripe, &ex);
if (max >= sbi->s_stripe) {
ac->ac_found++;
+ ex.fe_logical = 0xDEADF00D; /* debug value */
ac->ac_b_ex = ex;
ext4_mb_use_best_found(ac, e4b);
break;
@@ -1957,6 +2016,15 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac,
BUG_ON(cr < 0 || cr >= 4);
+ free = grp->bb_free;
+ if (free == 0)
+ return 0;
+ if (cr <= 2 && free < ac->ac_g_ex.fe_len)
+ return 0;
+
+ if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(grp)))
+ return 0;
+
/* We only do this if the grp has never been initialized */
if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
int ret = ext4_mb_init_group(ac->ac_sb, group);
@@ -1964,10 +2032,7 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac,
return 0;
}
- free = grp->bb_free;
fragments = grp->bb_fragments;
- if (free == 0)
- return 0;
if (fragments == 0)
return 0;
@@ -1975,15 +2040,19 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac,
case 0:
BUG_ON(ac->ac_2order == 0);
- if (grp->bb_largest_free_order < ac->ac_2order)
- return 0;
-
/* Avoid using the first bg of a flexgroup for data files */
if ((ac->ac_flags & EXT4_MB_HINT_DATA) &&
(flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) &&
((group % flex_size) == 0))
return 0;
+ if ((ac->ac_2order > ac->ac_sb->s_blocksize_bits+1) ||
+ (free / fragments) >= ac->ac_g_ex.fe_len)
+ return 1;
+
+ if (grp->bb_largest_free_order < ac->ac_2order)
+ return 0;
+
return 1;
case 1:
if ((free / fragments) >= ac->ac_g_ex.fe_len)
@@ -2074,7 +2143,12 @@ repeat:
group = ac->ac_g_ex.fe_group;
for (i = 0; i < ngroups; group++, i++) {
- if (group == ngroups)
+ cond_resched();
+ /*
+ * Artificially restricted ngroups for non-extent
+ * files makes group > ngroups possible on first loop.
+ */
+ if (group >= ngroups)
group = 0;
/* This now checks without needing the buddy page */
@@ -2098,7 +2172,7 @@ repeat:
}
ac->ac_groups_scanned++;
- if (cr == 0)
+ if (cr == 0 && ac->ac_2order < sb->s_blocksize_bits+2)
ext4_mb_simple_scan_group(ac, &e4b);
else if (cr == 1 && sbi->s_stripe &&
!(ac->ac_g_ex.fe_len % sbi->s_stripe))
@@ -2171,8 +2245,9 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v)
struct super_block *sb = seq->private;
ext4_group_t group = (ext4_group_t) ((unsigned long) v);
int i;
- int err;
+ int err, buddy_loaded = 0;
struct ext4_buddy e4b;
+ struct ext4_group_info *grinfo;
struct sg {
struct ext4_group_info info;
ext4_grpblk_t counters[16];
@@ -2189,15 +2264,21 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v)
i = (sb->s_blocksize_bits + 2) * sizeof(sg.info.bb_counters[0]) +
sizeof(struct ext4_group_info);
- err = ext4_mb_load_buddy(sb, group, &e4b);
- if (err) {
- seq_printf(seq, "#%-5u: I/O error\n", group);
- return 0;
+ grinfo = ext4_get_group_info(sb, group);
+ /* Load the group info in memory only if not already loaded. */
+ if (unlikely(EXT4_MB_GRP_NEED_INIT(grinfo))) {
+ err = ext4_mb_load_buddy(sb, group, &e4b);
+ if (err) {
+ seq_printf(seq, "#%-5u: I/O error\n", group);
+ return 0;
+ }
+ buddy_loaded = 1;
}
- ext4_lock_group(sb, group);
+
memcpy(&sg, ext4_get_group_info(sb, group), i);
- ext4_unlock_group(sb, group);
- ext4_mb_unload_buddy(&e4b);
+
+ if (buddy_loaded)
+ ext4_mb_unload_buddy(&e4b);
seq_printf(seq, "#%-5u: %-5u %-5u %-5u [", group, sg.info.bb_free,
sg.info.bb_fragments, sg.info.bb_first_free);
@@ -2222,7 +2303,7 @@ static const struct seq_operations ext4_mb_seq_groups_ops = {
static int ext4_mb_seq_groups_open(struct inode *inode, struct file *file)
{
- struct super_block *sb = PDE(inode)->data;
+ struct super_block *sb = PDE_DATA(inode);
int rc;
rc = seq_open(file, &ext4_mb_seq_groups_ops);
@@ -2251,6 +2332,39 @@ static struct kmem_cache *get_groupinfo_cache(int blocksize_bits)
return cachep;
}
+/*
+ * Allocate the top-level s_group_info array for the specified number
+ * of groups
+ */
+int ext4_mb_alloc_groupinfo(struct super_block *sb, ext4_group_t ngroups)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ unsigned size;
+ struct ext4_group_info ***new_groupinfo;
+
+ size = (ngroups + EXT4_DESC_PER_BLOCK(sb) - 1) >>
+ EXT4_DESC_PER_BLOCK_BITS(sb);
+ if (size <= sbi->s_group_info_size)
+ return 0;
+
+ size = roundup_pow_of_two(sizeof(*sbi->s_group_info) * size);
+ new_groupinfo = ext4_kvzalloc(size, GFP_KERNEL);
+ if (!new_groupinfo) {
+ ext4_msg(sb, KERN_ERR, "can't allocate buddy meta group");
+ return -ENOMEM;
+ }
+ if (sbi->s_group_info) {
+ memcpy(new_groupinfo, sbi->s_group_info,
+ sbi->s_group_info_size * sizeof(*sbi->s_group_info));
+ ext4_kvfree(sbi->s_group_info);
+ }
+ sbi->s_group_info = new_groupinfo;
+ sbi->s_group_info_size = size / sizeof(*sbi->s_group_info);
+ ext4_debug("allocated s_groupinfo array for %d meta_bg's\n",
+ sbi->s_group_info_size);
+ return 0;
+}
+
/* Create and initialize ext4_group_info data for the given group. */
int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
struct ext4_group_desc *desc)
@@ -2271,8 +2385,8 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
EXT4_DESC_PER_BLOCK_BITS(sb);
meta_group_info = kmalloc(metalen, GFP_KERNEL);
if (meta_group_info == NULL) {
- printk(KERN_ERR "EXT4-fs: can't allocate mem for a "
- "buddy group\n");
+ ext4_msg(sb, KERN_ERR, "can't allocate mem "
+ "for a buddy group");
goto exit_meta_group_info;
}
sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)] =
@@ -2283,12 +2397,11 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)];
i = group & (EXT4_DESC_PER_BLOCK(sb) - 1);
- meta_group_info[i] = kmem_cache_alloc(cachep, GFP_KERNEL);
+ meta_group_info[i] = kmem_cache_zalloc(cachep, GFP_KERNEL);
if (meta_group_info[i] == NULL) {
- printk(KERN_ERR "EXT4-fs: can't allocate buddy mem\n");
+ ext4_msg(sb, KERN_ERR, "can't allocate buddy mem");
goto exit_group_info;
}
- memset(meta_group_info[i], 0, kmem_cache_size(cachep));
set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT,
&(meta_group_info[i]->bb_state));
@@ -2298,10 +2411,10 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
*/
if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
meta_group_info[i]->bb_free =
- ext4_free_blocks_after_init(sb, group, desc);
+ ext4_free_clusters_after_init(sb, group, desc);
} else {
meta_group_info[i]->bb_free =
- ext4_free_blks_count(sb, desc);
+ ext4_free_group_clusters(sb, desc);
}
INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list);
@@ -2327,8 +2440,10 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
exit_group_info:
/* If a meta_group_info table has been allocated, release it now */
- if (group % EXT4_DESC_PER_BLOCK(sb) == 0)
+ if (group % EXT4_DESC_PER_BLOCK(sb) == 0) {
kfree(sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)]);
+ sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)] = NULL;
+ }
exit_meta_group_info:
return -ENOMEM;
} /* ext4_mb_add_groupinfo */
@@ -2338,61 +2453,29 @@ static int ext4_mb_init_backend(struct super_block *sb)
ext4_group_t ngroups = ext4_get_groups_count(sb);
ext4_group_t i;
struct ext4_sb_info *sbi = EXT4_SB(sb);
- struct ext4_super_block *es = sbi->s_es;
- int num_meta_group_infos;
- int num_meta_group_infos_max;
- int array_size;
+ int err;
struct ext4_group_desc *desc;
struct kmem_cache *cachep;
- /* This is the number of blocks used by GDT */
- num_meta_group_infos = (ngroups + EXT4_DESC_PER_BLOCK(sb) -
- 1) >> EXT4_DESC_PER_BLOCK_BITS(sb);
-
- /*
- * This is the total number of blocks used by GDT including
- * the number of reserved blocks for GDT.
- * The s_group_info array is allocated with this value
- * to allow a clean online resize without a complex
- * manipulation of pointer.
- * The drawback is the unused memory when no resize
- * occurs but it's very low in terms of pages
- * (see comments below)
- * Need to handle this properly when META_BG resizing is allowed
- */
- num_meta_group_infos_max = num_meta_group_infos +
- le16_to_cpu(es->s_reserved_gdt_blocks);
+ err = ext4_mb_alloc_groupinfo(sb, ngroups);
+ if (err)
+ return err;
- /*
- * array_size is the size of s_group_info array. We round it
- * to the next power of two because this approximation is done
- * internally by kmalloc so we can have some more memory
- * for free here (e.g. may be used for META_BG resize).
- */
- array_size = 1;
- while (array_size < sizeof(*sbi->s_group_info) *
- num_meta_group_infos_max)
- array_size = array_size << 1;
- /* An 8TB filesystem with 64-bit pointers requires a 4096 byte
- * kmalloc. A 128kb malloc should suffice for a 256TB filesystem.
- * So a two level scheme suffices for now. */
- sbi->s_group_info = kmalloc(array_size, GFP_KERNEL);
- if (sbi->s_group_info == NULL) {
- printk(KERN_ERR "EXT4-fs: can't allocate buddy meta group\n");
- return -ENOMEM;
- }
sbi->s_buddy_cache = new_inode(sb);
if (sbi->s_buddy_cache == NULL) {
- printk(KERN_ERR "EXT4-fs: can't get new inode\n");
+ ext4_msg(sb, KERN_ERR, "can't get new inode");
goto err_freesgi;
}
- sbi->s_buddy_cache->i_ino = get_next_ino();
+ /* To avoid potentially colliding with an valid on-disk inode number,
+ * use EXT4_BAD_INO for the buddy cache inode number. This inode is
+ * not in the inode hash, so it should never be found by iget(), but
+ * this will avoid confusion if it ever shows up during debugging. */
+ sbi->s_buddy_cache->i_ino = EXT4_BAD_INO;
EXT4_I(sbi->s_buddy_cache)->i_disksize = 0;
for (i = 0; i < ngroups; i++) {
desc = ext4_get_group_desc(sb, i, NULL);
if (desc == NULL) {
- printk(KERN_ERR
- "EXT4-fs: can't read descriptor %u\n", i);
+ ext4_msg(sb, KERN_ERR, "can't read descriptor %u", i);
goto err_freebuddy;
}
if (ext4_mb_add_groupinfo(sb, i, desc) != 0)
@@ -2405,25 +2488,72 @@ err_freebuddy:
cachep = get_groupinfo_cache(sb->s_blocksize_bits);
while (i-- > 0)
kmem_cache_free(cachep, ext4_get_group_info(sb, i));
- i = num_meta_group_infos;
+ i = sbi->s_group_info_size;
while (i-- > 0)
kfree(sbi->s_group_info[i]);
iput(sbi->s_buddy_cache);
err_freesgi:
- kfree(sbi->s_group_info);
+ ext4_kvfree(sbi->s_group_info);
return -ENOMEM;
}
-int ext4_mb_init(struct super_block *sb, int needs_recovery)
+static void ext4_groupinfo_destroy_slabs(void)
+{
+ int i;
+
+ for (i = 0; i < NR_GRPINFO_CACHES; i++) {
+ if (ext4_groupinfo_caches[i])
+ kmem_cache_destroy(ext4_groupinfo_caches[i]);
+ ext4_groupinfo_caches[i] = NULL;
+ }
+}
+
+static int ext4_groupinfo_create_slab(size_t size)
+{
+ static DEFINE_MUTEX(ext4_grpinfo_slab_create_mutex);
+ int slab_size;
+ int blocksize_bits = order_base_2(size);
+ int cache_index = blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE;
+ struct kmem_cache *cachep;
+
+ if (cache_index >= NR_GRPINFO_CACHES)
+ return -EINVAL;
+
+ if (unlikely(cache_index < 0))
+ cache_index = 0;
+
+ mutex_lock(&ext4_grpinfo_slab_create_mutex);
+ if (ext4_groupinfo_caches[cache_index]) {
+ mutex_unlock(&ext4_grpinfo_slab_create_mutex);
+ return 0; /* Already created */
+ }
+
+ slab_size = offsetof(struct ext4_group_info,
+ bb_counters[blocksize_bits + 2]);
+
+ cachep = kmem_cache_create(ext4_groupinfo_slab_names[cache_index],
+ slab_size, 0, SLAB_RECLAIM_ACCOUNT,
+ NULL);
+
+ ext4_groupinfo_caches[cache_index] = cachep;
+
+ mutex_unlock(&ext4_grpinfo_slab_create_mutex);
+ if (!cachep) {
+ printk(KERN_EMERG
+ "EXT4-fs: no memory for groupinfo slab cache\n");
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+int ext4_mb_init(struct super_block *sb)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
unsigned i, j;
unsigned offset;
unsigned max;
int ret;
- int cache_index;
- struct kmem_cache *cachep;
- char *namep = NULL;
i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_offsets);
@@ -2440,30 +2570,9 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
goto out;
}
- cache_index = sb->s_blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE;
- cachep = ext4_groupinfo_caches[cache_index];
- if (!cachep) {
- char name[32];
- int len = offsetof(struct ext4_group_info,
- bb_counters[sb->s_blocksize_bits + 2]);
-
- sprintf(name, "ext4_groupinfo_%d", sb->s_blocksize_bits);
- namep = kstrdup(name, GFP_KERNEL);
- if (!namep) {
- ret = -ENOMEM;
- goto out;
- }
-
- /* Need to free the kmem_cache_name() when we
- * destroy the slab */
- cachep = kmem_cache_create(namep, len, 0,
- SLAB_RECLAIM_ACCOUNT, NULL);
- if (!cachep) {
- ret = -ENOMEM;
- goto out;
- }
- ext4_groupinfo_caches[cache_index] = cachep;
- }
+ ret = ext4_groupinfo_create_slab(sb->s_blocksize);
+ if (ret < 0)
+ goto out;
/* order 0 is regular bitmap */
sbi->s_mb_maxs[0] = sb->s_blocksize << 3;
@@ -2480,12 +2589,6 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
i++;
} while (i <= sb->s_blocksize_bits + 1);
- /* init file for buddy data */
- ret = ext4_mb_init_backend(sb);
- if (ret != 0) {
- goto out;
- }
-
spin_lock_init(&sbi->s_md_lock);
spin_lock_init(&sbi->s_bal_lock);
@@ -2494,7 +2597,32 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
sbi->s_mb_stats = MB_DEFAULT_STATS;
sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD;
sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS;
- sbi->s_mb_group_prealloc = MB_DEFAULT_GROUP_PREALLOC;
+ /*
+ * The default group preallocation is 512, which for 4k block
+ * sizes translates to 2 megabytes. However for bigalloc file
+ * systems, this is probably too big (i.e, if the cluster size
+ * is 1 megabyte, then group preallocation size becomes half a
+ * gigabyte!). As a default, we will keep a two megabyte
+ * group pralloc size for cluster sizes up to 64k, and after
+ * that, we will force a minimum group preallocation size of
+ * 32 clusters. This translates to 8 megs when the cluster
+ * size is 256k, and 32 megs when the cluster size is 1 meg,
+ * which seems reasonable as a default.
+ */
+ sbi->s_mb_group_prealloc = max(MB_DEFAULT_GROUP_PREALLOC >>
+ sbi->s_cluster_bits, 32);
+ /*
+ * If there is a s_stripe > 1, then we set the s_mb_group_prealloc
+ * to the lowest multiple of s_stripe which is bigger than
+ * the s_mb_group_prealloc as determined above. We want
+ * the preallocation size to be an exact multiple of the
+ * RAID stripe size so that preallocations don't fragment
+ * the stripes.
+ */
+ if (sbi->s_stripe > 1) {
+ sbi->s_mb_group_prealloc = roundup(
+ sbi->s_mb_group_prealloc, sbi->s_stripe);
+ }
sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group);
if (sbi->s_locality_groups == NULL) {
@@ -2510,18 +2638,25 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
spin_lock_init(&lg->lg_prealloc_lock);
}
+ /* init file for buddy data */
+ ret = ext4_mb_init_backend(sb);
+ if (ret != 0)
+ goto out_free_locality_groups;
+
if (sbi->s_proc)
proc_create_data("mb_groups", S_IRUGO, sbi->s_proc,
&ext4_mb_seq_groups_fops, sb);
- if (sbi->s_journal)
- sbi->s_journal->j_commit_callback = release_blocks_on_commit;
+ return 0;
+
+out_free_locality_groups:
+ free_percpu(sbi->s_locality_groups);
+ sbi->s_locality_groups = NULL;
out:
- if (ret) {
- kfree(sbi->s_mb_offsets);
- kfree(sbi->s_mb_maxs);
- kfree(namep);
- }
+ kfree(sbi->s_mb_offsets);
+ sbi->s_mb_offsets = NULL;
+ kfree(sbi->s_mb_maxs);
+ sbi->s_mb_maxs = NULL;
return ret;
}
@@ -2552,6 +2687,9 @@ int ext4_mb_release(struct super_block *sb)
struct ext4_sb_info *sbi = EXT4_SB(sb);
struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits);
+ if (sbi->s_proc)
+ remove_proc_entry("mb_groups", sbi->s_proc);
+
if (sbi->s_group_info) {
for (i = 0; i < ngroups; i++) {
grinfo = ext4_get_group_info(sb, i);
@@ -2568,145 +2706,119 @@ int ext4_mb_release(struct super_block *sb)
EXT4_DESC_PER_BLOCK_BITS(sb);
for (i = 0; i < num_meta_group_infos; i++)
kfree(sbi->s_group_info[i]);
- kfree(sbi->s_group_info);
+ ext4_kvfree(sbi->s_group_info);
}
kfree(sbi->s_mb_offsets);
kfree(sbi->s_mb_maxs);
if (sbi->s_buddy_cache)
iput(sbi->s_buddy_cache);
if (sbi->s_mb_stats) {
- printk(KERN_INFO
- "EXT4-fs: mballoc: %u blocks %u reqs (%u success)\n",
+ ext4_msg(sb, KERN_INFO,
+ "mballoc: %u blocks %u reqs (%u success)",
atomic_read(&sbi->s_bal_allocated),
atomic_read(&sbi->s_bal_reqs),
atomic_read(&sbi->s_bal_success));
- printk(KERN_INFO
- "EXT4-fs: mballoc: %u extents scanned, %u goal hits, "
- "%u 2^N hits, %u breaks, %u lost\n",
+ ext4_msg(sb, KERN_INFO,
+ "mballoc: %u extents scanned, %u goal hits, "
+ "%u 2^N hits, %u breaks, %u lost",
atomic_read(&sbi->s_bal_ex_scanned),
atomic_read(&sbi->s_bal_goals),
atomic_read(&sbi->s_bal_2orders),
atomic_read(&sbi->s_bal_breaks),
atomic_read(&sbi->s_mb_lost_chunks));
- printk(KERN_INFO
- "EXT4-fs: mballoc: %lu generated and it took %Lu\n",
- sbi->s_mb_buddies_generated++,
+ ext4_msg(sb, KERN_INFO,
+ "mballoc: %lu generated and it took %Lu",
+ sbi->s_mb_buddies_generated,
sbi->s_mb_generation_time);
- printk(KERN_INFO
- "EXT4-fs: mballoc: %u preallocated, %u discarded\n",
+ ext4_msg(sb, KERN_INFO,
+ "mballoc: %u preallocated, %u discarded",
atomic_read(&sbi->s_mb_preallocated),
atomic_read(&sbi->s_mb_discarded));
}
free_percpu(sbi->s_locality_groups);
- if (sbi->s_proc)
- remove_proc_entry("mb_groups", sbi->s_proc);
return 0;
}
static inline int ext4_issue_discard(struct super_block *sb,
- ext4_group_t block_group, ext4_grpblk_t block, int count)
+ ext4_group_t block_group, ext4_grpblk_t cluster, int count)
{
- int ret;
ext4_fsblk_t discard_block;
- discard_block = block + ext4_group_first_block_no(sb, block_group);
+ discard_block = (EXT4_C2B(EXT4_SB(sb), cluster) +
+ ext4_group_first_block_no(sb, block_group));
+ count = EXT4_C2B(EXT4_SB(sb), count);
trace_ext4_discard_blocks(sb,
(unsigned long long) discard_block, count);
- ret = sb_issue_discard(sb, discard_block, count, GFP_NOFS, 0);
- if (ret == -EOPNOTSUPP) {
- ext4_warning(sb, "discard not supported, disabling");
- clear_opt(EXT4_SB(sb)->s_mount_opt, DISCARD);
- }
- return ret;
+ return sb_issue_discard(sb, discard_block, count, GFP_NOFS, 0);
}
/*
* This function is called by the jbd2 layer once the commit has finished,
* so we know we can free the blocks that were released with that commit.
*/
-static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
+static void ext4_free_data_callback(struct super_block *sb,
+ struct ext4_journal_cb_entry *jce,
+ int rc)
{
- struct super_block *sb = journal->j_private;
+ struct ext4_free_data *entry = (struct ext4_free_data *)jce;
struct ext4_buddy e4b;
struct ext4_group_info *db;
int err, count = 0, count2 = 0;
- struct ext4_free_data *entry;
- struct list_head *l, *ltmp;
-
- list_for_each_safe(l, ltmp, &txn->t_private_list) {
- entry = list_entry(l, struct ext4_free_data, list);
-
- mb_debug(1, "gonna free %u blocks in group %u (0x%p):",
- entry->count, entry->group, entry);
-
- if (test_opt(sb, DISCARD))
- ext4_issue_discard(sb, entry->group,
- entry->start_blk, entry->count);
-
- err = ext4_mb_load_buddy(sb, entry->group, &e4b);
- /* we expect to find existing buddy because it's pinned */
- BUG_ON(err != 0);
-
- db = e4b.bd_info;
- /* there are blocks to put in buddy to make them really free */
- count += entry->count;
- count2++;
- ext4_lock_group(sb, entry->group);
- /* Take it out of per group rb tree */
- rb_erase(&entry->node, &(db->bb_free_root));
- mb_free_blocks(NULL, &e4b, entry->start_blk, entry->count);
-
- if (!db->bb_free_root.rb_node) {
- /* No more items in the per group rb tree
- * balance refcounts from ext4_mb_free_metadata()
- */
- page_cache_release(e4b.bd_buddy_page);
- page_cache_release(e4b.bd_bitmap_page);
- }
- ext4_unlock_group(sb, entry->group);
- kmem_cache_free(ext4_free_ext_cachep, entry);
- ext4_mb_unload_buddy(&e4b);
- }
- mb_debug(1, "freed %u blocks in %u structures\n", count, count2);
-}
+ mb_debug(1, "gonna free %u blocks in group %u (0x%p):",
+ entry->efd_count, entry->efd_group, entry);
-#ifdef CONFIG_EXT4_DEBUG
-u8 mb_enable_debug __read_mostly;
+ if (test_opt(sb, DISCARD)) {
+ err = ext4_issue_discard(sb, entry->efd_group,
+ entry->efd_start_cluster,
+ entry->efd_count);
+ if (err && err != -EOPNOTSUPP)
+ ext4_msg(sb, KERN_WARNING, "discard request in"
+ " group:%d block:%d count:%d failed"
+ " with %d", entry->efd_group,
+ entry->efd_start_cluster,
+ entry->efd_count, err);
+ }
-static struct dentry *debugfs_dir;
-static struct dentry *debugfs_debug;
+ err = ext4_mb_load_buddy(sb, entry->efd_group, &e4b);
+ /* we expect to find existing buddy because it's pinned */
+ BUG_ON(err != 0);
-static void __init ext4_create_debugfs_entry(void)
-{
- debugfs_dir = debugfs_create_dir("ext4", NULL);
- if (debugfs_dir)
- debugfs_debug = debugfs_create_u8("mballoc-debug",
- S_IRUGO | S_IWUSR,
- debugfs_dir,
- &mb_enable_debug);
-}
-static void ext4_remove_debugfs_entry(void)
-{
- debugfs_remove(debugfs_debug);
- debugfs_remove(debugfs_dir);
-}
+ db = e4b.bd_info;
+ /* there are blocks to put in buddy to make them really free */
+ count += entry->efd_count;
+ count2++;
+ ext4_lock_group(sb, entry->efd_group);
+ /* Take it out of per group rb tree */
+ rb_erase(&entry->efd_node, &(db->bb_free_root));
+ mb_free_blocks(NULL, &e4b, entry->efd_start_cluster, entry->efd_count);
-#else
+ /*
+ * Clear the trimmed flag for the group so that the next
+ * ext4_trim_fs can trim it.
+ * If the volume is mounted with -o discard, online discard
+ * is supported and the free blocks will be trimmed online.
+ */
+ if (!test_opt(sb, DISCARD))
+ EXT4_MB_GRP_CLEAR_TRIMMED(db);
-static void __init ext4_create_debugfs_entry(void)
-{
-}
+ if (!db->bb_free_root.rb_node) {
+ /* No more items in the per group rb tree
+ * balance refcounts from ext4_mb_free_metadata()
+ */
+ page_cache_release(e4b.bd_buddy_page);
+ page_cache_release(e4b.bd_bitmap_page);
+ }
+ ext4_unlock_group(sb, entry->efd_group);
+ kmem_cache_free(ext4_free_data_cachep, entry);
+ ext4_mb_unload_buddy(&e4b);
-static void ext4_remove_debugfs_entry(void)
-{
+ mb_debug(1, "freed %u blocks in %u structures\n", count, count2);
}
-#endif
-
int __init ext4_init_mballoc(void)
{
ext4_pspace_cachep = KMEM_CACHE(ext4_prealloc_space,
@@ -2721,20 +2833,18 @@ int __init ext4_init_mballoc(void)
return -ENOMEM;
}
- ext4_free_ext_cachep = KMEM_CACHE(ext4_free_data,
- SLAB_RECLAIM_ACCOUNT);
- if (ext4_free_ext_cachep == NULL) {
+ ext4_free_data_cachep = KMEM_CACHE(ext4_free_data,
+ SLAB_RECLAIM_ACCOUNT);
+ if (ext4_free_data_cachep == NULL) {
kmem_cache_destroy(ext4_pspace_cachep);
kmem_cache_destroy(ext4_ac_cachep);
return -ENOMEM;
}
- ext4_create_debugfs_entry();
return 0;
}
void ext4_exit_mballoc(void)
{
- int i;
/*
* Wait for completion of call_rcu()'s on ext4_pspace_cachep
* before destroying the slab cache.
@@ -2742,17 +2852,8 @@ void ext4_exit_mballoc(void)
rcu_barrier();
kmem_cache_destroy(ext4_pspace_cachep);
kmem_cache_destroy(ext4_ac_cachep);
- kmem_cache_destroy(ext4_free_ext_cachep);
-
- for (i = 0; i < NR_GRPINFO_CACHES; i++) {
- struct kmem_cache *cachep = ext4_groupinfo_caches[i];
- if (cachep) {
- char *name = (char *)kmem_cache_name(cachep);
- kmem_cache_destroy(cachep);
- kfree(name);
- }
- }
- ext4_remove_debugfs_entry();
+ kmem_cache_destroy(ext4_free_data_cachep);
+ ext4_groupinfo_destroy_slabs();
}
@@ -2762,7 +2863,7 @@ void ext4_exit_mballoc(void)
*/
static noinline_for_stack int
ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
- handle_t *handle, unsigned int reserv_blks)
+ handle_t *handle, unsigned int reserv_clstrs)
{
struct buffer_head *bitmap_bh = NULL;
struct ext4_group_desc *gdp;
@@ -2783,6 +2884,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
if (!bitmap_bh)
goto out_err;
+ BUFFER_TRACE(bitmap_bh, "getting write access");
err = ext4_journal_get_write_access(handle, bitmap_bh);
if (err)
goto out_err;
@@ -2793,25 +2895,26 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
goto out_err;
ext4_debug("using block group %u(%d)\n", ac->ac_b_ex.fe_group,
- ext4_free_blks_count(sb, gdp));
+ ext4_free_group_clusters(sb, gdp));
+ BUFFER_TRACE(gdp_bh, "get_write_access");
err = ext4_journal_get_write_access(handle, gdp_bh);
if (err)
goto out_err;
block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
- len = ac->ac_b_ex.fe_len;
+ len = EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
if (!ext4_data_block_valid(sbi, block, len)) {
ext4_error(sb, "Allocating blocks %llu-%llu which overlap "
- "fs metadata\n", block, block+len);
+ "fs metadata", block, block+len);
/* File system mounted not to panic on error
* Fix the bitmap and repeat the block allocation
* We leak some of the blocks here.
*/
ext4_lock_group(sb, ac->ac_b_ex.fe_group);
- mb_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start,
- ac->ac_b_ex.fe_len);
+ ext4_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start,
+ ac->ac_b_ex.fe_len);
ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
if (!err)
@@ -2829,31 +2932,34 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
}
}
#endif
- mb_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start,ac->ac_b_ex.fe_len);
+ ext4_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start,
+ ac->ac_b_ex.fe_len);
if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
- ext4_free_blks_set(sb, gdp,
- ext4_free_blocks_after_init(sb,
- ac->ac_b_ex.fe_group, gdp));
+ ext4_free_group_clusters_set(sb, gdp,
+ ext4_free_clusters_after_init(sb,
+ ac->ac_b_ex.fe_group, gdp));
}
- len = ext4_free_blks_count(sb, gdp) - ac->ac_b_ex.fe_len;
- ext4_free_blks_set(sb, gdp, len);
- gdp->bg_checksum = ext4_group_desc_csum(sbi, ac->ac_b_ex.fe_group, gdp);
+ len = ext4_free_group_clusters(sb, gdp) - ac->ac_b_ex.fe_len;
+ ext4_free_group_clusters_set(sb, gdp, len);
+ ext4_block_bitmap_csum_set(sb, ac->ac_b_ex.fe_group, gdp, bitmap_bh);
+ ext4_group_desc_csum_set(sb, ac->ac_b_ex.fe_group, gdp);
ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
- percpu_counter_sub(&sbi->s_freeblocks_counter, ac->ac_b_ex.fe_len);
+ percpu_counter_sub(&sbi->s_freeclusters_counter, ac->ac_b_ex.fe_len);
/*
* Now reduce the dirty block count also. Should not go negative
*/
if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED))
/* release all the reserved blocks if non delalloc */
- percpu_counter_sub(&sbi->s_dirtyblocks_counter, reserv_blks);
+ percpu_counter_sub(&sbi->s_dirtyclusters_counter,
+ reserv_clstrs);
if (sbi->s_log_groups_per_flex) {
ext4_group_t flex_group = ext4_flex_group(sbi,
ac->ac_b_ex.fe_group);
- atomic_sub(ac->ac_b_ex.fe_len,
- &sbi->s_flex_groups[flex_group].free_blocks);
+ atomic64_sub(ac->ac_b_ex.fe_len,
+ &sbi->s_flex_groups[flex_group].free_clusters);
}
err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
@@ -2862,15 +2968,15 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
err = ext4_handle_dirty_metadata(handle, NULL, gdp_bh);
out_err:
- ext4_mark_super_dirty(sb);
brelse(bitmap_bh);
return err;
}
/*
* here we normalize request for locality group
- * Group request are normalized to s_strip size if we set the same via mount
- * option. If not we set it to s_mb_group_prealloc which can be configured via
+ * Group request are normalized to s_mb_group_prealloc, which goes to
+ * s_strip if we set the same via mount option.
+ * s_mb_group_prealloc can be configured via
* /sys/fs/ext4/<partition>/mb_group_prealloc
*
* XXX: should we try to preallocate more than the group has now?
@@ -2881,10 +2987,7 @@ static void ext4_mb_normalize_group_request(struct ext4_allocation_context *ac)
struct ext4_locality_group *lg = ac->ac_lg;
BUG_ON(lg == NULL);
- if (EXT4_SB(sb)->s_stripe)
- ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_stripe;
- else
- ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_mb_group_prealloc;
+ ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_mb_group_prealloc;
mb_debug(1, "#%u: goal %u blocks for locality group\n",
current->pid, ac->ac_g_ex.fe_len);
}
@@ -2897,9 +3000,11 @@ static noinline_for_stack void
ext4_mb_normalize_request(struct ext4_allocation_context *ac,
struct ext4_allocation_request *ar)
{
+ struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
int bsbits, max;
ext4_lblk_t end;
- loff_t size, orig_size, start_off;
+ loff_t size, start_off;
+ loff_t orig_size __maybe_unused;
ext4_lblk_t start;
struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);
struct ext4_prealloc_space *pa;
@@ -2927,7 +3032,7 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
/* first, let's learn actual file size
* given current request is allocated */
- size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len;
+ size = ac->ac_o_ex.fe_logical + EXT4_C2B(sbi, ac->ac_o_ex.fe_len);
size = size << bsbits;
if (size < i_size_read(ac->ac_inode))
size = i_size_read(ac->ac_inode);
@@ -2999,7 +3104,8 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
continue;
}
- pa_end = pa->pa_lstart + pa->pa_len;
+ pa_end = pa->pa_lstart + EXT4_C2B(EXT4_SB(ac->ac_sb),
+ pa->pa_len);
/* PA must not overlap original request */
BUG_ON(!(ac->ac_o_ex.fe_logical >= pa_end ||
@@ -3029,9 +3135,11 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
rcu_read_lock();
list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) {
ext4_lblk_t pa_end;
+
spin_lock(&pa->pa_lock);
if (pa->pa_deleted == 0) {
- pa_end = pa->pa_lstart + pa->pa_len;
+ pa_end = pa->pa_lstart + EXT4_C2B(EXT4_SB(ac->ac_sb),
+ pa->pa_len);
BUG_ON(!(start >= pa_end || end <= pa->pa_lstart));
}
spin_unlock(&pa->pa_lock);
@@ -3040,9 +3148,10 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
if (start + size <= ac->ac_o_ex.fe_logical &&
start > ac->ac_o_ex.fe_logical) {
- printk(KERN_ERR "start %lu, size %lu, fe_logical %lu\n",
- (unsigned long) start, (unsigned long) size,
- (unsigned long) ac->ac_o_ex.fe_logical);
+ ext4_msg(ac->ac_sb, KERN_ERR,
+ "start %lu, size %lu, fe_logical %lu",
+ (unsigned long) start, (unsigned long) size,
+ (unsigned long) ac->ac_o_ex.fe_logical);
}
BUG_ON(start + size <= ac->ac_o_ex.fe_logical &&
start > ac->ac_o_ex.fe_logical);
@@ -3053,7 +3162,7 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
/* XXX: is it better to align blocks WRT to logical
* placement or satisfy big request as is */
ac->ac_g_ex.fe_logical = start;
- ac->ac_g_ex.fe_len = size;
+ ac->ac_g_ex.fe_len = EXT4_NUM_B2C(sbi, size);
/* define goal start in order to merge */
if (ar->pright && (ar->lright == (start + size))) {
@@ -3107,13 +3216,9 @@ static void ext4_mb_collect_stats(struct ext4_allocation_context *ac)
static void ext4_discard_allocated_blocks(struct ext4_allocation_context *ac)
{
struct ext4_prealloc_space *pa = ac->ac_pa;
- int len;
-
- if (pa && pa->pa_type == MB_INODE_PA) {
- len = ac->ac_b_ex.fe_len;
- pa->pa_free += len;
- }
+ if (pa && pa->pa_type == MB_INODE_PA)
+ pa->pa_free += ac->ac_b_ex.fe_len;
}
/*
@@ -3122,14 +3227,16 @@ static void ext4_discard_allocated_blocks(struct ext4_allocation_context *ac)
static void ext4_mb_use_inode_pa(struct ext4_allocation_context *ac,
struct ext4_prealloc_space *pa)
{
+ struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
ext4_fsblk_t start;
ext4_fsblk_t end;
int len;
/* found preallocated blocks, use them */
start = pa->pa_pstart + (ac->ac_o_ex.fe_logical - pa->pa_lstart);
- end = min(pa->pa_pstart + pa->pa_len, start + ac->ac_o_ex.fe_len);
- len = end - start;
+ end = min(pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len),
+ start + EXT4_C2B(sbi, ac->ac_o_ex.fe_len));
+ len = EXT4_NUM_B2C(sbi, end - start);
ext4_get_group_no_and_offset(ac->ac_sb, start, &ac->ac_b_ex.fe_group,
&ac->ac_b_ex.fe_start);
ac->ac_b_ex.fe_len = len;
@@ -3137,7 +3244,7 @@ static void ext4_mb_use_inode_pa(struct ext4_allocation_context *ac,
ac->ac_pa = pa;
BUG_ON(start < pa->pa_pstart);
- BUG_ON(start + len > pa->pa_pstart + pa->pa_len);
+ BUG_ON(end > pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len));
BUG_ON(pa->pa_free < len);
pa->pa_free -= len;
@@ -3188,7 +3295,7 @@ ext4_mb_check_group_pa(ext4_fsblk_t goal_block,
cur_distance = abs(goal_block - cpa->pa_pstart);
new_distance = abs(goal_block - pa->pa_pstart);
- if (cur_distance < new_distance)
+ if (cur_distance <= new_distance)
return cpa;
/* drop the previous reference */
@@ -3203,6 +3310,7 @@ ext4_mb_check_group_pa(ext4_fsblk_t goal_block,
static noinline_for_stack int
ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
{
+ struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
int order, i;
struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);
struct ext4_locality_group *lg;
@@ -3220,12 +3328,14 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
/* all fields in this condition don't change,
* so we can skip locking for them */
if (ac->ac_o_ex.fe_logical < pa->pa_lstart ||
- ac->ac_o_ex.fe_logical >= pa->pa_lstart + pa->pa_len)
+ ac->ac_o_ex.fe_logical >= (pa->pa_lstart +
+ EXT4_C2B(sbi, pa->pa_len)))
continue;
/* non-extent files can't have physical blocks past 2^32 */
if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)) &&
- pa->pa_pstart + pa->pa_len > EXT4_MAX_BLOCK_FILE_PHYS)
+ (pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len) >
+ EXT4_MAX_BLOCK_FILE_PHYS))
continue;
/* found preallocated blocks, use them */
@@ -3300,8 +3410,8 @@ static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
n = rb_first(&(grp->bb_free_root));
while (n) {
- entry = rb_entry(n, struct ext4_free_data, node);
- mb_set_bits(bitmap, entry->start_blk, entry->count);
+ entry = rb_entry(n, struct ext4_free_data, efd_node);
+ ext4_set_bits(bitmap, entry->efd_start_cluster, entry->efd_count);
n = rb_next(n);
}
return;
@@ -3322,7 +3432,6 @@ void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
ext4_group_t groupnr;
ext4_grpblk_t start;
int preallocated = 0;
- int count = 0;
int len;
/* all form of preallocation discards first load group,
@@ -3343,9 +3452,8 @@ void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
if (unlikely(len == 0))
continue;
BUG_ON(groupnr != group);
- mb_set_bits(bitmap, start, len);
+ ext4_set_bits(bitmap, start, len);
preallocated += len;
- count++;
}
mb_debug(1, "prellocated %u for group %u\n", preallocated, group);
}
@@ -3354,6 +3462,9 @@ static void ext4_mb_pa_callback(struct rcu_head *head)
{
struct ext4_prealloc_space *pa;
pa = container_of(head, struct ext4_prealloc_space, u.pa_rcu);
+
+ BUG_ON(atomic_read(&pa->pa_count));
+ BUG_ON(pa->pa_deleted == 0);
kmem_cache_free(ext4_pspace_cachep, pa);
}
@@ -3367,11 +3478,13 @@ static void ext4_mb_put_pa(struct ext4_allocation_context *ac,
ext4_group_t grp;
ext4_fsblk_t grp_blk;
- if (!atomic_dec_and_test(&pa->pa_count) || pa->pa_free != 0)
- return;
-
/* in this short window concurrent discard can set pa_deleted */
spin_lock(&pa->pa_lock);
+ if (!atomic_dec_and_test(&pa->pa_count) || pa->pa_free != 0) {
+ spin_unlock(&pa->pa_lock);
+ return;
+ }
+
if (pa->pa_deleted == 1) {
spin_unlock(&pa->pa_lock);
return;
@@ -3388,7 +3501,7 @@ static void ext4_mb_put_pa(struct ext4_allocation_context *ac,
if (pa->pa_type == MB_GROUP_PA)
grp_blk--;
- ext4_get_group_no_and_offset(sb, grp_blk, &grp, NULL);
+ grp = ext4_get_group_number(sb, grp_blk);
/*
* possible race:
@@ -3422,6 +3535,7 @@ static noinline_for_stack int
ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
{
struct super_block *sb = ac->ac_sb;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
struct ext4_prealloc_space *pa;
struct ext4_group_info *grp;
struct ext4_inode_info *ei;
@@ -3453,16 +3567,18 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
winl = ac->ac_o_ex.fe_logical - ac->ac_g_ex.fe_logical;
/* also, we should cover whole original request */
- wins = ac->ac_b_ex.fe_len - ac->ac_o_ex.fe_len;
+ wins = EXT4_C2B(sbi, ac->ac_b_ex.fe_len - ac->ac_o_ex.fe_len);
/* the smallest one defines real window */
win = min(winl, wins);
- offs = ac->ac_o_ex.fe_logical % ac->ac_b_ex.fe_len;
+ offs = ac->ac_o_ex.fe_logical %
+ EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
if (offs && offs < win)
win = offs;
- ac->ac_b_ex.fe_logical = ac->ac_o_ex.fe_logical - win;
+ ac->ac_b_ex.fe_logical = ac->ac_o_ex.fe_logical -
+ EXT4_NUM_B2C(sbi, win);
BUG_ON(ac->ac_o_ex.fe_logical < ac->ac_b_ex.fe_logical);
BUG_ON(ac->ac_o_ex.fe_len > ac->ac_b_ex.fe_len);
}
@@ -3487,7 +3603,7 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
trace_ext4_mb_new_inode_pa(ac, pa);
ext4_mb_use_inode_pa(ac, pa);
- atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated);
+ atomic_add(pa->pa_free, &sbi->s_mb_preallocated);
ei = EXT4_I(ac->ac_inode);
grp = ext4_get_group_info(sb, ac->ac_b_ex.fe_group);
@@ -3602,7 +3718,7 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
BUG_ON(pa->pa_deleted == 0);
ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit);
- grp_blk_start = pa->pa_pstart - bit;
+ grp_blk_start = pa->pa_pstart - EXT4_C2B(sbi, bit);
BUG_ON(group != e4b->bd_group && pa->pa_len != 0);
end = bit + pa->pa_len;
@@ -3617,16 +3733,18 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
free += next - bit;
trace_ext4_mballoc_discard(sb, NULL, group, bit, next - bit);
- trace_ext4_mb_release_inode_pa(sb, pa->pa_inode, pa,
- grp_blk_start + bit, next - bit);
+ trace_ext4_mb_release_inode_pa(pa, (grp_blk_start +
+ EXT4_C2B(sbi, bit)),
+ next - bit);
mb_free_blocks(pa->pa_inode, e4b, bit, next - bit);
bit = next + 1;
}
if (free != pa->pa_free) {
- printk(KERN_CRIT "pa %p: logic %lu, phys. %lu, len %lu\n",
- pa, (unsigned long) pa->pa_lstart,
- (unsigned long) pa->pa_pstart,
- (unsigned long) pa->pa_len);
+ ext4_msg(e4b->bd_sb, KERN_CRIT,
+ "pa %p: logic %lu, phys. %lu, len %lu",
+ pa, (unsigned long) pa->pa_lstart,
+ (unsigned long) pa->pa_pstart,
+ (unsigned long) pa->pa_len);
ext4_grp_locked_error(sb, group, 0, 0, "free %u, pa_free %u",
free, pa->pa_free);
/*
@@ -3699,7 +3817,7 @@ ext4_mb_discard_group_preallocations(struct super_block *sb,
}
if (needed == 0)
- needed = EXT4_BLOCKS_PER_GROUP(sb) + 1;
+ needed = EXT4_CLUSTERS_PER_GROUP(sb) + 1;
INIT_LIST_HEAD(&list);
repeat:
@@ -3733,11 +3851,7 @@ repeat:
if (free < needed && busy) {
busy = 0;
ext4_unlock_group(sb, group);
- /*
- * Yield the CPU here so that we don't get soft lockup
- * in non preempt case.
- */
- yield();
+ cond_resched();
goto repeat;
}
@@ -3814,7 +3928,8 @@ repeat:
* use preallocation while we're discarding it */
spin_unlock(&pa->pa_lock);
spin_unlock(&ei->i_prealloc_lock);
- printk(KERN_ERR "uh-oh! used pa while discarding\n");
+ ext4_msg(sb, KERN_ERR,
+ "uh-oh! used pa while discarding");
WARN_ON(1);
schedule_timeout_uninterruptible(HZ);
goto repeat;
@@ -3851,7 +3966,7 @@ repeat:
list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) {
BUG_ON(pa->pa_type != MB_INODE_PA);
- ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL);
+ group = ext4_get_group_number(sb, pa->pa_pstart);
err = ext4_mb_load_buddy(sb, group, &e4b);
if (err) {
@@ -3881,34 +3996,23 @@ repeat:
}
}
-/*
- * finds all preallocated spaces and return blocks being freed to them
- * if preallocated space becomes full (no block is used from the space)
- * then the function frees space in buddy
- * XXX: at the moment, truncate (which is the only way to free blocks)
- * discards all preallocations
- */
-static void ext4_mb_return_to_preallocation(struct inode *inode,
- struct ext4_buddy *e4b,
- sector_t block, int count)
-{
- BUG_ON(!list_empty(&EXT4_I(inode)->i_prealloc_list));
-}
#ifdef CONFIG_EXT4_DEBUG
static void ext4_mb_show_ac(struct ext4_allocation_context *ac)
{
struct super_block *sb = ac->ac_sb;
ext4_group_t ngroups, i;
- if (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)
+ if (!ext4_mballoc_debug ||
+ (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED))
return;
- printk(KERN_ERR "EXT4-fs: Can't allocate:"
- " Allocation context details:\n");
- printk(KERN_ERR "EXT4-fs: status %d flags %d\n",
+ ext4_msg(ac->ac_sb, KERN_ERR, "Can't allocate:"
+ " Allocation context details:");
+ ext4_msg(ac->ac_sb, KERN_ERR, "status %d flags %d",
ac->ac_status, ac->ac_flags);
- printk(KERN_ERR "EXT4-fs: orig %lu/%lu/%lu@%lu, goal %lu/%lu/%lu@%lu, "
- "best %lu/%lu/%lu@%lu cr %d\n",
+ ext4_msg(ac->ac_sb, KERN_ERR, "orig %lu/%lu/%lu@%lu, "
+ "goal %lu/%lu/%lu@%lu, "
+ "best %lu/%lu/%lu@%lu cr %d",
(unsigned long)ac->ac_o_ex.fe_group,
(unsigned long)ac->ac_o_ex.fe_start,
(unsigned long)ac->ac_o_ex.fe_len,
@@ -3922,9 +4026,8 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac)
(unsigned long)ac->ac_b_ex.fe_len,
(unsigned long)ac->ac_b_ex.fe_logical,
(int)ac->ac_criteria);
- printk(KERN_ERR "EXT4-fs: %lu scanned, %d found\n", ac->ac_ex_scanned,
- ac->ac_found);
- printk(KERN_ERR "EXT4-fs: groups: \n");
+ ext4_msg(ac->ac_sb, KERN_ERR, "%d found", ac->ac_found);
+ ext4_msg(ac->ac_sb, KERN_ERR, "groups: ");
ngroups = ext4_get_groups_count(sb);
for (i = 0; i < ngroups; i++) {
struct ext4_group_info *grp = ext4_get_group_info(sb, i);
@@ -3977,7 +4080,7 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac)
if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
return;
- size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len;
+ size = ac->ac_o_ex.fe_logical + EXT4_C2B(sbi, ac->ac_o_ex.fe_len);
isize = (i_size_read(ac->ac_inode) + ac->ac_sb->s_blocksize - 1)
>> bsbits;
@@ -3988,6 +4091,11 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac)
return;
}
+ if (sbi->s_mb_group_prealloc <= 0) {
+ ac->ac_flags |= EXT4_MB_STREAM_ALLOC;
+ return;
+ }
+
/* don't use group allocation for large files */
size = max(size, isize);
if (size > sbi->s_mb_stream_request) {
@@ -4026,8 +4134,8 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac,
len = ar->len;
/* just a dirty hack to filter too big requests */
- if (len >= EXT4_BLOCKS_PER_GROUP(sb) - 10)
- len = EXT4_BLOCKS_PER_GROUP(sb) - 10;
+ if (len >= EXT4_CLUSTERS_PER_GROUP(sb))
+ len = EXT4_CLUSTERS_PER_GROUP(sb);
/* start searching from the goal */
goal = ar->goal;
@@ -4037,19 +4145,15 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac,
ext4_get_group_no_and_offset(sb, goal, &group, &block);
/* set up allocation goals */
- memset(ac, 0, sizeof(struct ext4_allocation_context));
- ac->ac_b_ex.fe_logical = ar->logical;
+ ac->ac_b_ex.fe_logical = EXT4_LBLK_CMASK(sbi, ar->logical);
ac->ac_status = AC_STATUS_CONTINUE;
ac->ac_sb = sb;
ac->ac_inode = ar->inode;
- ac->ac_o_ex.fe_logical = ar->logical;
+ ac->ac_o_ex.fe_logical = ac->ac_b_ex.fe_logical;
ac->ac_o_ex.fe_group = group;
ac->ac_o_ex.fe_start = block;
ac->ac_o_ex.fe_len = len;
- ac->ac_g_ex.fe_logical = ar->logical;
- ac->ac_g_ex.fe_group = group;
- ac->ac_g_ex.fe_start = block;
- ac->ac_g_ex.fe_len = len;
+ ac->ac_g_ex = ac->ac_o_ex;
ac->ac_flags = ar->flags;
/* we have to define context: we'll we work with a file or
@@ -4123,7 +4227,7 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb,
list_for_each_entry_safe(pa, tmp, &discard_list, u.pa_tmp_list) {
- ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL);
+ group = ext4_get_group_number(sb, pa->pa_pstart);
if (ext4_mb_load_buddy(sb, group, &e4b)) {
ext4_error(sb, "Error loading buddy information for %u",
group);
@@ -4161,7 +4265,7 @@ static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac)
/* The max size of hash table is PREALLOC_TB_SIZE */
order = PREALLOC_TB_SIZE - 1;
/* Add the prealloc space to lg */
- rcu_read_lock();
+ spin_lock(&lg->lg_prealloc_lock);
list_for_each_entry_rcu(tmp_pa, &lg->lg_prealloc_list[order],
pa_inode_list) {
spin_lock(&tmp_pa->pa_lock);
@@ -4185,12 +4289,12 @@ static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac)
if (!added)
list_add_tail_rcu(&pa->pa_inode_list,
&lg->lg_prealloc_list[order]);
- rcu_read_unlock();
+ spin_unlock(&lg->lg_prealloc_lock);
/* Now trim the list to be not more than 8 elements */
if (lg_prealloc_count > 8) {
ext4_mb_discard_lg_preallocations(sb, lg,
- order, lg_prealloc_count);
+ order, lg_prealloc_count);
return;
}
return ;
@@ -4201,27 +4305,25 @@ static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac)
*/
static int ext4_mb_release_context(struct ext4_allocation_context *ac)
{
+ struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
struct ext4_prealloc_space *pa = ac->ac_pa;
if (pa) {
if (pa->pa_type == MB_GROUP_PA) {
/* see comment in ext4_mb_use_group_pa() */
spin_lock(&pa->pa_lock);
- pa->pa_pstart += ac->ac_b_ex.fe_len;
- pa->pa_lstart += ac->ac_b_ex.fe_len;
+ pa->pa_pstart += EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
+ pa->pa_lstart += EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
pa->pa_free -= ac->ac_b_ex.fe_len;
pa->pa_len -= ac->ac_b_ex.fe_len;
spin_unlock(&pa->pa_lock);
}
}
- if (ac->alloc_semp)
- up_read(ac->alloc_semp);
if (pa) {
/*
* We want to add the pa to the right bucket.
* Remove it from the list and while adding
* make sure the list to which we are adding
- * doesn't grow big. We need to release
- * alloc_semp before calling ext4_mb_add_n_trim()
+ * doesn't grow big.
*/
if ((pa->pa_type == MB_GROUP_PA) && likely(pa->pa_free)) {
spin_lock(pa->pa_obj_lock);
@@ -4271,38 +4373,53 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
struct super_block *sb;
ext4_fsblk_t block = 0;
unsigned int inquota = 0;
- unsigned int reserv_blks = 0;
+ unsigned int reserv_clstrs = 0;
+ might_sleep();
sb = ar->inode->i_sb;
sbi = EXT4_SB(sb);
trace_ext4_request_blocks(ar);
+ /* Allow to use superuser reservation for quota file */
+ if (IS_NOQUOTA(ar->inode))
+ ar->flags |= EXT4_MB_USE_ROOT_BLOCKS;
+
/*
* For delayed allocation, we could skip the ENOSPC and
* EDQUOT check, as blocks and quotas have been already
* reserved when data being copied into pagecache.
*/
- if (EXT4_I(ar->inode)->i_delalloc_reserved_flag)
+ if (ext4_test_inode_state(ar->inode, EXT4_STATE_DELALLOC_RESERVED))
ar->flags |= EXT4_MB_DELALLOC_RESERVED;
else {
/* Without delayed allocation we need to verify
* there is enough free blocks to do block allocation
* and verify allocation doesn't exceed the quota limits.
*/
- while (ar->len && ext4_claim_free_blocks(sbi, ar->len)) {
+ while (ar->len &&
+ ext4_claim_free_clusters(sbi, ar->len, ar->flags)) {
+
/* let others to free the space */
- yield();
+ cond_resched();
ar->len = ar->len >> 1;
}
if (!ar->len) {
*errp = -ENOSPC;
return 0;
}
- reserv_blks = ar->len;
- while (ar->len && dquot_alloc_block(ar->inode, ar->len)) {
- ar->flags |= EXT4_MB_HINT_NOPREALLOC;
- ar->len--;
+ reserv_clstrs = ar->len;
+ if (ar->flags & EXT4_MB_USE_ROOT_BLOCKS) {
+ dquot_alloc_block_nofail(ar->inode,
+ EXT4_C2B(sbi, ar->len));
+ } else {
+ while (ar->len &&
+ dquot_alloc_block(ar->inode,
+ EXT4_C2B(sbi, ar->len))) {
+
+ ar->flags |= EXT4_MB_HINT_NOPREALLOC;
+ ar->len--;
+ }
}
inquota = ar->len;
if (ar->len == 0) {
@@ -4311,7 +4428,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
}
}
- ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
+ ac = kmem_cache_zalloc(ext4_ac_cachep, GFP_NOFS);
if (!ac) {
ar->len = 0;
*errp = -ENOMEM;
@@ -4332,17 +4449,22 @@ repeat:
/* allocate space in core */
*errp = ext4_mb_regular_allocator(ac);
if (*errp)
- goto errout;
+ goto discard_and_exit;
/* as we've just preallocated more space than
- * user requested orinally, we store allocated
+ * user requested originally, we store allocated
* space in a special descriptor */
if (ac->ac_status == AC_STATUS_FOUND &&
- ac->ac_o_ex.fe_len < ac->ac_b_ex.fe_len)
- ext4_mb_new_preallocation(ac);
+ ac->ac_o_ex.fe_len < ac->ac_b_ex.fe_len)
+ *errp = ext4_mb_new_preallocation(ac);
+ if (*errp) {
+ discard_and_exit:
+ ext4_discard_allocated_blocks(ac);
+ goto errout;
+ }
}
if (likely(ac->ac_status == AC_STATUS_FOUND)) {
- *errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_blks);
+ *errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_clstrs);
if (*errp == -EAGAIN) {
/*
* drop the reference that we took
@@ -4354,10 +4476,10 @@ repeat:
ac->ac_b_ex.fe_len = 0;
ac->ac_status = AC_STATUS_CONTINUE;
goto repeat;
- } else if (*errp)
- errout:
+ } else if (*errp) {
ext4_discard_allocated_blocks(ac);
- else {
+ goto errout;
+ } else {
block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
ar->len = ac->ac_b_ex.fe_len;
}
@@ -4368,6 +4490,7 @@ repeat:
*errp = -ENOSPC;
}
+errout:
if (*errp) {
ac->ac_b_ex.fe_len = 0;
ar->len = 0;
@@ -4378,12 +4501,13 @@ out:
if (ac)
kmem_cache_free(ext4_ac_cachep, ac);
if (inquota && ar->len < inquota)
- dquot_free_block(ar->inode, inquota - ar->len);
+ dquot_free_block(ar->inode, EXT4_C2B(sbi, inquota - ar->len));
if (!ar->len) {
- if (!EXT4_I(ar->inode)->i_delalloc_reserved_flag)
+ if (!ext4_test_inode_state(ar->inode,
+ EXT4_STATE_DELALLOC_RESERVED))
/* release all the reserved blocks if non delalloc */
- percpu_counter_sub(&sbi->s_dirtyblocks_counter,
- reserv_blks);
+ percpu_counter_sub(&sbi->s_dirtyclusters_counter,
+ reserv_clstrs);
}
trace_ext4_allocate_blocks(ar, (unsigned long long)block);
@@ -4399,9 +4523,9 @@ out:
static int can_merge(struct ext4_free_data *entry1,
struct ext4_free_data *entry2)
{
- if ((entry1->t_tid == entry2->t_tid) &&
- (entry1->group == entry2->group) &&
- ((entry1->start_blk + entry1->count) == entry2->start_blk))
+ if ((entry1->efd_tid == entry2->efd_tid) &&
+ (entry1->efd_group == entry2->efd_group) &&
+ ((entry1->efd_start_cluster + entry1->efd_count) == entry2->efd_start_cluster))
return 1;
return 0;
}
@@ -4411,7 +4535,7 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
struct ext4_free_data *new_entry)
{
ext4_group_t group = e4b->bd_group;
- ext4_grpblk_t block;
+ ext4_grpblk_t cluster;
struct ext4_free_data *entry;
struct ext4_group_info *db = e4b->bd_info;
struct super_block *sb = e4b->bd_sb;
@@ -4423,8 +4547,8 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
BUG_ON(e4b->bd_bitmap_page == NULL);
BUG_ON(e4b->bd_buddy_page == NULL);
- new_node = &new_entry->node;
- block = new_entry->start_blk;
+ new_node = &new_entry->efd_node;
+ cluster = new_entry->efd_start_cluster;
if (!*n) {
/* first free block exent. We need to
@@ -4437,14 +4561,15 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
}
while (*n) {
parent = *n;
- entry = rb_entry(parent, struct ext4_free_data, node);
- if (block < entry->start_blk)
+ entry = rb_entry(parent, struct ext4_free_data, efd_node);
+ if (cluster < entry->efd_start_cluster)
n = &(*n)->rb_left;
- else if (block >= (entry->start_blk + entry->count))
+ else if (cluster >= (entry->efd_start_cluster + entry->efd_count))
n = &(*n)->rb_right;
else {
ext4_grp_locked_error(sb, group, 0,
- ext4_group_first_block_no(sb, group) + block,
+ ext4_group_first_block_no(sb, group) +
+ EXT4_C2B(sbi, cluster),
"Block already on to-be-freed list");
return 0;
}
@@ -4456,34 +4581,29 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
/* Now try to see the extent can be merged to left and right */
node = rb_prev(new_node);
if (node) {
- entry = rb_entry(node, struct ext4_free_data, node);
- if (can_merge(entry, new_entry)) {
- new_entry->start_blk = entry->start_blk;
- new_entry->count += entry->count;
+ entry = rb_entry(node, struct ext4_free_data, efd_node);
+ if (can_merge(entry, new_entry) &&
+ ext4_journal_callback_try_del(handle, &entry->efd_jce)) {
+ new_entry->efd_start_cluster = entry->efd_start_cluster;
+ new_entry->efd_count += entry->efd_count;
rb_erase(node, &(db->bb_free_root));
- spin_lock(&sbi->s_md_lock);
- list_del(&entry->list);
- spin_unlock(&sbi->s_md_lock);
- kmem_cache_free(ext4_free_ext_cachep, entry);
+ kmem_cache_free(ext4_free_data_cachep, entry);
}
}
node = rb_next(new_node);
if (node) {
- entry = rb_entry(node, struct ext4_free_data, node);
- if (can_merge(new_entry, entry)) {
- new_entry->count += entry->count;
+ entry = rb_entry(node, struct ext4_free_data, efd_node);
+ if (can_merge(new_entry, entry) &&
+ ext4_journal_callback_try_del(handle, &entry->efd_jce)) {
+ new_entry->efd_count += entry->efd_count;
rb_erase(node, &(db->bb_free_root));
- spin_lock(&sbi->s_md_lock);
- list_del(&entry->list);
- spin_unlock(&sbi->s_md_lock);
- kmem_cache_free(ext4_free_ext_cachep, entry);
+ kmem_cache_free(ext4_free_data_cachep, entry);
}
}
/* Add the extent to transaction's private list */
- spin_lock(&sbi->s_md_lock);
- list_add(&new_entry->list, &handle->h_transaction->t_private_list);
- spin_unlock(&sbi->s_md_lock);
+ ext4_journal_callback_add(handle, ext4_free_data_callback,
+ &new_entry->efd_jce);
return 0;
}
@@ -4493,7 +4613,7 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
* @inode: inode
* @block: start physical block to free
* @count: number of blocks to count
- * @metadata: Are these metadata blocks
+ * @flags: flags used by ext4_free_blocks
*/
void ext4_free_blocks(handle_t *handle, struct inode *inode,
struct buffer_head *bh, ext4_fsblk_t block,
@@ -4502,16 +4622,18 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
struct buffer_head *bitmap_bh = NULL;
struct super_block *sb = inode->i_sb;
struct ext4_group_desc *gdp;
- unsigned long freed = 0;
unsigned int overflow;
ext4_grpblk_t bit;
struct buffer_head *gd_bh;
ext4_group_t block_group;
struct ext4_sb_info *sbi;
+ struct ext4_inode_info *ei = EXT4_I(inode);
struct ext4_buddy e4b;
+ unsigned int count_clusters;
int err = 0;
int ret;
+ might_sleep();
if (bh) {
if (block)
BUG_ON(block != bh->b_blocknr);
@@ -4537,10 +4659,11 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
BUG_ON(bh && (count > 1));
for (i = 0; i < count; i++) {
+ cond_resched();
if (!bh)
tbh = sb_find_get_block(inode->i_sb,
block + i);
- if (unlikely(!tbh))
+ if (!tbh)
continue;
ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA,
inode, tbh, block + i);
@@ -4557,18 +4680,56 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
if (!ext4_should_writeback_data(inode))
flags |= EXT4_FREE_BLOCKS_METADATA;
+ /*
+ * If the extent to be freed does not begin on a cluster
+ * boundary, we need to deal with partial clusters at the
+ * beginning and end of the extent. Normally we will free
+ * blocks at the beginning or the end unless we are explicitly
+ * requested to avoid doing so.
+ */
+ overflow = EXT4_PBLK_COFF(sbi, block);
+ if (overflow) {
+ if (flags & EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER) {
+ overflow = sbi->s_cluster_ratio - overflow;
+ block += overflow;
+ if (count > overflow)
+ count -= overflow;
+ else
+ return;
+ } else {
+ block -= overflow;
+ count += overflow;
+ }
+ }
+ overflow = EXT4_LBLK_COFF(sbi, count);
+ if (overflow) {
+ if (flags & EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER) {
+ if (count > overflow)
+ count -= overflow;
+ else
+ return;
+ } else
+ count += sbi->s_cluster_ratio - overflow;
+ }
+
do_more:
overflow = 0;
ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
+ if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(
+ ext4_get_group_info(sb, block_group))))
+ return;
+
/*
* Check to see if we are freeing blocks across a group
* boundary.
*/
- if (bit + count > EXT4_BLOCKS_PER_GROUP(sb)) {
- overflow = bit + count - EXT4_BLOCKS_PER_GROUP(sb);
+ if (EXT4_C2B(sbi, bit) + count > EXT4_BLOCKS_PER_GROUP(sb)) {
+ overflow = EXT4_C2B(sbi, bit) + count -
+ EXT4_BLOCKS_PER_GROUP(sb);
count -= overflow;
}
+ count_clusters = EXT4_NUM_B2C(sbi, count);
bitmap_bh = ext4_read_block_bitmap(sb, block_group);
if (!bitmap_bh) {
err = -EIO;
@@ -4583,9 +4744,9 @@ do_more:
if (in_range(ext4_block_bitmap(sb, gdp), block, count) ||
in_range(ext4_inode_bitmap(sb, gdp), block, count) ||
in_range(block, ext4_inode_table(sb, gdp),
- EXT4_SB(sb)->s_itb_per_group) ||
+ EXT4_SB(sb)->s_itb_per_group) ||
in_range(block + count - 1, ext4_inode_table(sb, gdp),
- EXT4_SB(sb)->s_itb_per_group)) {
+ EXT4_SB(sb)->s_itb_per_group)) {
ext4_error(sb, "Freeing blocks in system zone - "
"Block = %llu, count = %lu", block, count);
@@ -4610,11 +4771,11 @@ do_more:
#ifdef AGGRESSIVE_CHECK
{
int i;
- for (i = 0; i < count; i++)
+ for (i = 0; i < count_clusters; i++)
BUG_ON(!mb_test_bit(bit + i, bitmap_bh->b_data));
}
#endif
- trace_ext4_mballoc_free(sb, inode, block_group, bit, count);
+ trace_ext4_mballoc_free(sb, inode, block_group, bit, count_clusters);
err = ext4_mb_load_buddy(sb, block_group, &e4b);
if (err)
@@ -4626,40 +4787,74 @@ do_more:
* blocks being freed are metadata. these blocks shouldn't
* be used until this transaction is committed
*/
- new_entry = kmem_cache_alloc(ext4_free_ext_cachep, GFP_NOFS);
- new_entry->start_blk = bit;
- new_entry->group = block_group;
- new_entry->count = count;
- new_entry->t_tid = handle->h_transaction->t_tid;
+ retry:
+ new_entry = kmem_cache_alloc(ext4_free_data_cachep, GFP_NOFS);
+ if (!new_entry) {
+ /*
+ * We use a retry loop because
+ * ext4_free_blocks() is not allowed to fail.
+ */
+ cond_resched();
+ congestion_wait(BLK_RW_ASYNC, HZ/50);
+ goto retry;
+ }
+ new_entry->efd_start_cluster = bit;
+ new_entry->efd_group = block_group;
+ new_entry->efd_count = count_clusters;
+ new_entry->efd_tid = handle->h_transaction->t_tid;
ext4_lock_group(sb, block_group);
- mb_clear_bits(bitmap_bh->b_data, bit, count);
+ mb_clear_bits(bitmap_bh->b_data, bit, count_clusters);
ext4_mb_free_metadata(handle, &e4b, new_entry);
} else {
/* need to update group_info->bb_free and bitmap
* with group lock held. generate_buddy look at
* them with group lock_held
*/
+ if (test_opt(sb, DISCARD)) {
+ err = ext4_issue_discard(sb, block_group, bit, count);
+ if (err && err != -EOPNOTSUPP)
+ ext4_msg(sb, KERN_WARNING, "discard request in"
+ " group:%d block:%d count:%lu failed"
+ " with %d", block_group, bit, count,
+ err);
+ } else
+ EXT4_MB_GRP_CLEAR_TRIMMED(e4b.bd_info);
+
ext4_lock_group(sb, block_group);
- mb_clear_bits(bitmap_bh->b_data, bit, count);
- mb_free_blocks(inode, &e4b, bit, count);
- ext4_mb_return_to_preallocation(inode, &e4b, block, count);
+ mb_clear_bits(bitmap_bh->b_data, bit, count_clusters);
+ mb_free_blocks(inode, &e4b, bit, count_clusters);
}
- ret = ext4_free_blks_count(sb, gdp) + count;
- ext4_free_blks_set(sb, gdp, ret);
- gdp->bg_checksum = ext4_group_desc_csum(sbi, block_group, gdp);
+ ret = ext4_free_group_clusters(sb, gdp) + count_clusters;
+ ext4_free_group_clusters_set(sb, gdp, ret);
+ ext4_block_bitmap_csum_set(sb, block_group, gdp, bitmap_bh);
+ ext4_group_desc_csum_set(sb, block_group, gdp);
ext4_unlock_group(sb, block_group);
- percpu_counter_add(&sbi->s_freeblocks_counter, count);
if (sbi->s_log_groups_per_flex) {
ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
- atomic_add(count, &sbi->s_flex_groups[flex_group].free_blocks);
+ atomic64_add(count_clusters,
+ &sbi->s_flex_groups[flex_group].free_clusters);
}
- ext4_mb_unload_buddy(&e4b);
+ if (flags & EXT4_FREE_BLOCKS_RESERVE && ei->i_reserved_data_blocks) {
+ percpu_counter_add(&sbi->s_dirtyclusters_counter,
+ count_clusters);
+ spin_lock(&ei->i_block_reservation_lock);
+ if (flags & EXT4_FREE_BLOCKS_METADATA)
+ ei->i_reserved_meta_blocks += count_clusters;
+ else
+ ei->i_reserved_data_blocks += count_clusters;
+ spin_unlock(&ei->i_block_reservation_lock);
+ if (!(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE))
+ dquot_reclaim_block(inode,
+ EXT4_C2B(sbi, count_clusters));
+ } else if (!(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE))
+ dquot_free_block(inode, EXT4_C2B(sbi, count_clusters));
+ percpu_counter_add(&sbi->s_freeclusters_counter, count_clusters);
- freed += count;
+ ext4_mb_unload_buddy(&e4b);
/* We dirtied the bitmap block */
BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
@@ -4677,16 +4872,147 @@ do_more:
put_bh(bitmap_bh);
goto do_more;
}
- ext4_mark_super_dirty(sb);
error_return:
- if (freed)
- dquot_free_block(inode, freed);
brelse(bitmap_bh);
ext4_std_error(sb, err);
return;
}
/**
+ * ext4_group_add_blocks() -- Add given blocks to an existing group
+ * @handle: handle to this transaction
+ * @sb: super block
+ * @block: start physical block to add to the block group
+ * @count: number of blocks to free
+ *
+ * This marks the blocks as free in the bitmap and buddy.
+ */
+int ext4_group_add_blocks(handle_t *handle, struct super_block *sb,
+ ext4_fsblk_t block, unsigned long count)
+{
+ struct buffer_head *bitmap_bh = NULL;
+ struct buffer_head *gd_bh;
+ ext4_group_t block_group;
+ ext4_grpblk_t bit;
+ unsigned int i;
+ struct ext4_group_desc *desc;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_buddy e4b;
+ int err = 0, ret, blk_free_count;
+ ext4_grpblk_t blocks_freed;
+
+ ext4_debug("Adding block(s) %llu-%llu\n", block, block + count - 1);
+
+ if (count == 0)
+ return 0;
+
+ ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
+ /*
+ * Check to see if we are freeing blocks across a group
+ * boundary.
+ */
+ if (bit + count > EXT4_BLOCKS_PER_GROUP(sb)) {
+ ext4_warning(sb, "too much blocks added to group %u\n",
+ block_group);
+ err = -EINVAL;
+ goto error_return;
+ }
+
+ bitmap_bh = ext4_read_block_bitmap(sb, block_group);
+ if (!bitmap_bh) {
+ err = -EIO;
+ goto error_return;
+ }
+
+ desc = ext4_get_group_desc(sb, block_group, &gd_bh);
+ if (!desc) {
+ err = -EIO;
+ goto error_return;
+ }
+
+ if (in_range(ext4_block_bitmap(sb, desc), block, count) ||
+ in_range(ext4_inode_bitmap(sb, desc), block, count) ||
+ in_range(block, ext4_inode_table(sb, desc), sbi->s_itb_per_group) ||
+ in_range(block + count - 1, ext4_inode_table(sb, desc),
+ sbi->s_itb_per_group)) {
+ ext4_error(sb, "Adding blocks in system zones - "
+ "Block = %llu, count = %lu",
+ block, count);
+ err = -EINVAL;
+ goto error_return;
+ }
+
+ BUFFER_TRACE(bitmap_bh, "getting write access");
+ err = ext4_journal_get_write_access(handle, bitmap_bh);
+ if (err)
+ goto error_return;
+
+ /*
+ * We are about to modify some metadata. Call the journal APIs
+ * to unshare ->b_data if a currently-committing transaction is
+ * using it
+ */
+ BUFFER_TRACE(gd_bh, "get_write_access");
+ err = ext4_journal_get_write_access(handle, gd_bh);
+ if (err)
+ goto error_return;
+
+ for (i = 0, blocks_freed = 0; i < count; i++) {
+ BUFFER_TRACE(bitmap_bh, "clear bit");
+ if (!mb_test_bit(bit + i, bitmap_bh->b_data)) {
+ ext4_error(sb, "bit already cleared for block %llu",
+ (ext4_fsblk_t)(block + i));
+ BUFFER_TRACE(bitmap_bh, "bit already cleared");
+ } else {
+ blocks_freed++;
+ }
+ }
+
+ err = ext4_mb_load_buddy(sb, block_group, &e4b);
+ if (err)
+ goto error_return;
+
+ /*
+ * need to update group_info->bb_free and bitmap
+ * with group lock held. generate_buddy look at
+ * them with group lock_held
+ */
+ ext4_lock_group(sb, block_group);
+ mb_clear_bits(bitmap_bh->b_data, bit, count);
+ mb_free_blocks(NULL, &e4b, bit, count);
+ blk_free_count = blocks_freed + ext4_free_group_clusters(sb, desc);
+ ext4_free_group_clusters_set(sb, desc, blk_free_count);
+ ext4_block_bitmap_csum_set(sb, block_group, desc, bitmap_bh);
+ ext4_group_desc_csum_set(sb, block_group, desc);
+ ext4_unlock_group(sb, block_group);
+ percpu_counter_add(&sbi->s_freeclusters_counter,
+ EXT4_NUM_B2C(sbi, blocks_freed));
+
+ if (sbi->s_log_groups_per_flex) {
+ ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
+ atomic64_add(EXT4_NUM_B2C(sbi, blocks_freed),
+ &sbi->s_flex_groups[flex_group].free_clusters);
+ }
+
+ ext4_mb_unload_buddy(&e4b);
+
+ /* We dirtied the bitmap block */
+ BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
+ err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
+
+ /* And the group descriptor block */
+ BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
+ ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh);
+ if (!err)
+ err = ret;
+
+error_return:
+ brelse(bitmap_bh);
+ ext4_std_error(sb, err);
+ return err;
+}
+
+/**
* ext4_trim_extent -- function to TRIM one single free extent in the group
* @sb: super block for the file system
* @start: starting block of the free extent in the alloc. group
@@ -4699,11 +5025,15 @@ error_return:
* be called with under the group lock.
*/
static int ext4_trim_extent(struct super_block *sb, int start, int count,
- ext4_group_t group, struct ext4_buddy *e4b)
+ ext4_group_t group, struct ext4_buddy *e4b)
+__releases(bitlock)
+__acquires(bitlock)
{
struct ext4_free_extent ex;
int ret = 0;
+ trace_ext4_trim_extent(sb, group, start, count);
+
assert_spin_locked(ext4_group_lock_ptr(sb, group));
ex.fe_start = start;
@@ -4716,11 +5046,7 @@ static int ext4_trim_extent(struct super_block *sb, int start, int count,
*/
mb_mark_used(e4b, &ex);
ext4_unlock_group(sb, group);
-
ret = ext4_issue_discard(sb, group, start, count);
- if (ret)
- ext4_std_error(sb, ret);
-
ext4_lock_group(sb, group);
mb_free_blocks(NULL, e4b, start, ex.fe_len);
return ret;
@@ -4729,7 +5055,7 @@ static int ext4_trim_extent(struct super_block *sb, int start, int count,
/**
* ext4_trim_all_free -- function to trim all free space in alloc. group
* @sb: super block for file system
- * @e4b: ext4 buddy
+ * @group: group to be trimmed
* @start: first group block to examine
* @max: last group block to examine
* @minblocks: minimum extent block count
@@ -4744,35 +5070,49 @@ static int ext4_trim_extent(struct super_block *sb, int start, int count,
* bitmap. Then issue a TRIM command on this extent and free the extent in
* the group buddy bitmap. This is done until whole group is scanned.
*/
-ext4_grpblk_t ext4_trim_all_free(struct super_block *sb, struct ext4_buddy *e4b,
- ext4_grpblk_t start, ext4_grpblk_t max, ext4_grpblk_t minblocks)
+static ext4_grpblk_t
+ext4_trim_all_free(struct super_block *sb, ext4_group_t group,
+ ext4_grpblk_t start, ext4_grpblk_t max,
+ ext4_grpblk_t minblocks)
{
void *bitmap;
- ext4_grpblk_t next, count = 0;
- ext4_group_t group;
+ ext4_grpblk_t next, count = 0, free_count = 0;
+ struct ext4_buddy e4b;
int ret = 0;
- BUG_ON(e4b == NULL);
+ trace_ext4_trim_all_free(sb, group, start, max);
+
+ ret = ext4_mb_load_buddy(sb, group, &e4b);
+ if (ret) {
+ ext4_error(sb, "Error in loading buddy "
+ "information for %u", group);
+ return ret;
+ }
+ bitmap = e4b.bd_bitmap;
- bitmap = e4b->bd_bitmap;
- group = e4b->bd_group;
- start = (e4b->bd_info->bb_first_free > start) ?
- e4b->bd_info->bb_first_free : start;
ext4_lock_group(sb, group);
+ if (EXT4_MB_GRP_WAS_TRIMMED(e4b.bd_info) &&
+ minblocks >= atomic_read(&EXT4_SB(sb)->s_last_trim_minblks))
+ goto out;
- while (start < max) {
- start = mb_find_next_zero_bit(bitmap, max, start);
- if (start >= max)
+ start = (e4b.bd_info->bb_first_free > start) ?
+ e4b.bd_info->bb_first_free : start;
+
+ while (start <= max) {
+ start = mb_find_next_zero_bit(bitmap, max + 1, start);
+ if (start > max)
break;
- next = mb_find_next_bit(bitmap, max, start);
+ next = mb_find_next_bit(bitmap, max + 1, start);
if ((next - start) >= minblocks) {
ret = ext4_trim_extent(sb, start,
- next - start, group, e4b);
- if (ret < 0)
+ next - start, group, &e4b);
+ if (ret && ret != -EOPNOTSUPP)
break;
+ ret = 0;
count += next - start;
}
+ free_count += next - start;
start = next + 1;
if (fatal_signal_pending(current)) {
@@ -4786,18 +5126,22 @@ ext4_grpblk_t ext4_trim_all_free(struct super_block *sb, struct ext4_buddy *e4b,
ext4_lock_group(sb, group);
}
- if ((e4b->bd_info->bb_free - count) < minblocks)
+ if ((e4b.bd_info->bb_free - free_count) < minblocks)
break;
}
+
+ if (!ret) {
+ ret = count;
+ EXT4_MB_GRP_SET_TRIMMED(e4b.bd_info);
+ }
+out:
ext4_unlock_group(sb, group);
+ ext4_mb_unload_buddy(&e4b);
ext4_debug("trimmed %d blocks in the group %d\n",
count, group);
- if (ret < 0)
- count = ret;
-
- return count;
+ return ret;
}
/**
@@ -4814,59 +5158,79 @@ ext4_grpblk_t ext4_trim_all_free(struct super_block *sb, struct ext4_buddy *e4b,
*/
int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
{
- struct ext4_buddy e4b;
- ext4_group_t first_group, last_group;
- ext4_group_t group, ngroups = ext4_get_groups_count(sb);
- ext4_grpblk_t cnt = 0, first_block, last_block;
- uint64_t start, len, minlen, trimmed;
+ struct ext4_group_info *grp;
+ ext4_group_t group, first_group, last_group;
+ ext4_grpblk_t cnt = 0, first_cluster, last_cluster;
+ uint64_t start, end, minlen, trimmed = 0;
+ ext4_fsblk_t first_data_blk =
+ le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
+ ext4_fsblk_t max_blks = ext4_blocks_count(EXT4_SB(sb)->s_es);
int ret = 0;
start = range->start >> sb->s_blocksize_bits;
- len = range->len >> sb->s_blocksize_bits;
- minlen = range->minlen >> sb->s_blocksize_bits;
- trimmed = 0;
+ end = start + (range->len >> sb->s_blocksize_bits) - 1;
+ minlen = EXT4_NUM_B2C(EXT4_SB(sb),
+ range->minlen >> sb->s_blocksize_bits);
- if (unlikely(minlen > EXT4_BLOCKS_PER_GROUP(sb)))
+ if (minlen > EXT4_CLUSTERS_PER_GROUP(sb) ||
+ start >= max_blks ||
+ range->len < sb->s_blocksize)
return -EINVAL;
+ if (end >= max_blks)
+ end = max_blks - 1;
+ if (end <= first_data_blk)
+ goto out;
+ if (start < first_data_blk)
+ start = first_data_blk;
- /* Determine first and last group to examine based on start and len */
+ /* Determine first and last group to examine based on start and end */
ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) start,
- &first_group, &first_block);
- ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) (start + len),
- &last_group, &last_block);
- last_group = (last_group > ngroups - 1) ? ngroups - 1 : last_group;
- last_block = EXT4_BLOCKS_PER_GROUP(sb);
+ &first_group, &first_cluster);
+ ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) end,
+ &last_group, &last_cluster);
- if (first_group > last_group)
- return -EINVAL;
+ /* end now represents the last cluster to discard in this group */
+ end = EXT4_CLUSTERS_PER_GROUP(sb) - 1;
for (group = first_group; group <= last_group; group++) {
- ret = ext4_mb_load_buddy(sb, group, &e4b);
- if (ret) {
- ext4_error(sb, "Error in loading buddy "
- "information for %u", group);
- break;
+ grp = ext4_get_group_info(sb, group);
+ /* We only do this if the grp has never been initialized */
+ if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
+ ret = ext4_mb_init_group(sb, group);
+ if (ret)
+ break;
}
- if (len >= EXT4_BLOCKS_PER_GROUP(sb))
- len -= (EXT4_BLOCKS_PER_GROUP(sb) - first_block);
- else
- last_block = len;
+ /*
+ * For all the groups except the last one, last cluster will
+ * always be EXT4_CLUSTERS_PER_GROUP(sb)-1, so we only need to
+ * change it for the last group, note that last_cluster is
+ * already computed earlier by ext4_get_group_no_and_offset()
+ */
+ if (group == last_group)
+ end = last_cluster;
- if (e4b.bd_info->bb_free >= minlen) {
- cnt = ext4_trim_all_free(sb, &e4b, first_block,
- last_block, minlen);
+ if (grp->bb_free >= minlen) {
+ cnt = ext4_trim_all_free(sb, group, first_cluster,
+ end, minlen);
if (cnt < 0) {
ret = cnt;
- ext4_mb_unload_buddy(&e4b);
break;
}
+ trimmed += cnt;
}
- ext4_mb_unload_buddy(&e4b);
- trimmed += cnt;
- first_block = 0;
+
+ /*
+ * For every group except the first one, we are sure
+ * that the first cluster to discard will be cluster #0.
+ */
+ first_cluster = 0;
}
- range->len = trimmed * sb->s_blocksize;
+ if (!ret)
+ atomic_set(&EXT4_SB(sb)->s_last_trim_minblks, minlen);
+
+out:
+ range->len = EXT4_C2B(EXT4_SB(sb), trimmed) << sb->s_blocksize_bits;
return ret;
}
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index b619322c76f..d634e183b4d 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -37,18 +37,18 @@
/*
*/
#ifdef CONFIG_EXT4_DEBUG
-extern u8 mb_enable_debug;
+extern ushort ext4_mballoc_debug;
#define mb_debug(n, fmt, a...) \
do { \
- if ((n) <= mb_enable_debug) { \
+ if ((n) <= ext4_mballoc_debug) { \
printk(KERN_DEBUG "(%s, %d): %s: ", \
__FILE__, __LINE__, __func__); \
printk(fmt, ## a); \
} \
} while (0)
#else
-#define mb_debug(n, fmt, a...)
+#define mb_debug(n, fmt, a...) no_printk(fmt, ## a)
#endif
#define EXT4_MB_HISTORY_ALLOC 1 /* allocation */
@@ -65,11 +65,6 @@ extern u8 mb_enable_debug;
#define MB_DEFAULT_MIN_TO_SCAN 10
/*
- * How many groups mballoc will scan looking for the best chunk
- */
-#define MB_DEFAULT_MAX_GROUPS_TO_SCAN 5
-
-/*
* with 'ext4_mb_stats' allocator will collect stats that will be
* shown at umount. The collecting costs though!
*/
@@ -96,21 +91,23 @@ extern u8 mb_enable_debug;
struct ext4_free_data {
- /* this links the free block information from group_info */
- struct rb_node node;
+ /* MUST be the first member */
+ struct ext4_journal_cb_entry efd_jce;
+
+ /* ext4_free_data private data starts from here */
- /* this links the free block information from ext4_sb_info */
- struct list_head list;
+ /* this links the free block information from group_info */
+ struct rb_node efd_node;
/* group which free block extent belongs */
- ext4_group_t group;
+ ext4_group_t efd_group;
/* free block extent */
- ext4_grpblk_t start_blk;
- ext4_grpblk_t count;
+ ext4_grpblk_t efd_start_cluster;
+ ext4_grpblk_t efd_count;
/* transaction which freed this extent */
- tid_t t_tid;
+ tid_t efd_tid;
};
struct ext4_prealloc_space {
@@ -139,9 +136,9 @@ enum {
struct ext4_free_extent {
ext4_lblk_t fe_logical;
- ext4_grpblk_t fe_start;
+ ext4_grpblk_t fe_start; /* In cluster units */
ext4_group_t fe_group;
- ext4_grpblk_t fe_len;
+ ext4_grpblk_t fe_len; /* In cluster units */
};
/*
@@ -169,17 +166,15 @@ struct ext4_allocation_context {
/* original request */
struct ext4_free_extent ac_o_ex;
- /* goal request (after normalization) */
+ /* goal request (normalized ac_o_ex) */
struct ext4_free_extent ac_g_ex;
/* the best found extent */
struct ext4_free_extent ac_b_ex;
- /* copy of the bext found extent taken before preallocation efforts */
+ /* copy of the best found extent taken before preallocation efforts */
struct ext4_free_extent ac_f_ex;
- /* number of iterations done. we have to track to limit searching */
- unsigned long ac_ex_scanned;
__u16 ac_groups_scanned;
__u16 ac_found;
__u16 ac_tail;
@@ -187,17 +182,11 @@ struct ext4_allocation_context {
__u16 ac_flags; /* allocation hints */
__u8 ac_status;
__u8 ac_criteria;
- __u8 ac_repeats;
__u8 ac_2order; /* if request is to allocate 2^N blocks and
* N > 0, the field stores N, otherwise 0 */
__u8 ac_op; /* operation, for history only */
struct page *ac_bitmap_page;
struct page *ac_buddy_page;
- /*
- * pointer to the held semaphore upon successful
- * block allocation
- */
- struct rw_semaphore *alloc_semp;
struct ext4_prealloc_space *ac_pa;
struct ext4_locality_group *ac_lg;
};
@@ -215,14 +204,12 @@ struct ext4_buddy {
struct super_block *bd_sb;
__u16 bd_blkbits;
ext4_group_t bd_group;
- struct rw_semaphore *alloc_semp;
};
-#define EXT4_MB_BITMAP(e4b) ((e4b)->bd_bitmap)
-#define EXT4_MB_BUDDY(e4b) ((e4b)->bd_buddy)
static inline ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb,
struct ext4_free_extent *fex)
{
- return ext4_group_first_block_no(sb, fex->fe_group) + fex->fe_start;
+ return ext4_group_first_block_no(sb, fex->fe_group) +
+ (fex->fe_start << EXT4_SB(sb)->s_cluster_bits);
}
#endif
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index 25f3a974b72..ec092437d3e 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -12,7 +12,6 @@
*
*/
-#include <linux/module.h>
#include <linux/slab.h>
#include "ext4_jbd2.h"
#include "ext4_extents.h"
@@ -21,13 +20,13 @@
* The contiguous blocks details which can be
* represented by a single extent
*/
-struct list_blocks_struct {
- ext4_lblk_t first_block, last_block;
+struct migrate_struct {
+ ext4_lblk_t first_block, last_block, curr_block;
ext4_fsblk_t first_pblock, last_pblock;
};
static int finish_range(handle_t *handle, struct inode *inode,
- struct list_blocks_struct *lb)
+ struct migrate_struct *lb)
{
int retval = 0, needed;
@@ -40,7 +39,7 @@ static int finish_range(handle_t *handle, struct inode *inode,
newext.ee_block = cpu_to_le32(lb->first_block);
newext.ee_len = cpu_to_le16(lb->last_block - lb->first_block + 1);
ext4_ext_store_pblock(&newext, lb->first_pblock);
- path = ext4_ext_find_extent(inode, lb->first_block, NULL);
+ path = ext4_ext_find_extent(inode, lb->first_block, NULL, 0);
if (IS_ERR(path)) {
retval = PTR_ERR(path);
@@ -87,8 +86,7 @@ err_out:
}
static int update_extent_range(handle_t *handle, struct inode *inode,
- ext4_fsblk_t pblock, ext4_lblk_t blk_num,
- struct list_blocks_struct *lb)
+ ext4_fsblk_t pblock, struct migrate_struct *lb)
{
int retval;
/*
@@ -96,9 +94,10 @@ static int update_extent_range(handle_t *handle, struct inode *inode,
*/
if (lb->first_pblock &&
(lb->last_pblock+1 == pblock) &&
- (lb->last_block+1 == blk_num)) {
+ (lb->last_block+1 == lb->curr_block)) {
lb->last_pblock = pblock;
- lb->last_block = blk_num;
+ lb->last_block = lb->curr_block;
+ lb->curr_block++;
return 0;
}
/*
@@ -106,64 +105,49 @@ static int update_extent_range(handle_t *handle, struct inode *inode,
*/
retval = finish_range(handle, inode, lb);
lb->first_pblock = lb->last_pblock = pblock;
- lb->first_block = lb->last_block = blk_num;
-
+ lb->first_block = lb->last_block = lb->curr_block;
+ lb->curr_block++;
return retval;
}
static int update_ind_extent_range(handle_t *handle, struct inode *inode,
- ext4_fsblk_t pblock, ext4_lblk_t *blk_nump,
- struct list_blocks_struct *lb)
+ ext4_fsblk_t pblock,
+ struct migrate_struct *lb)
{
struct buffer_head *bh;
__le32 *i_data;
int i, retval = 0;
- ext4_lblk_t blk_count = *blk_nump;
unsigned long max_entries = inode->i_sb->s_blocksize >> 2;
- if (!pblock) {
- /* Only update the file block number */
- *blk_nump += max_entries;
- return 0;
- }
-
bh = sb_bread(inode->i_sb, pblock);
if (!bh)
return -EIO;
i_data = (__le32 *)bh->b_data;
- for (i = 0; i < max_entries; i++, blk_count++) {
+ for (i = 0; i < max_entries; i++) {
if (i_data[i]) {
retval = update_extent_range(handle, inode,
- le32_to_cpu(i_data[i]),
- blk_count, lb);
+ le32_to_cpu(i_data[i]), lb);
if (retval)
break;
+ } else {
+ lb->curr_block++;
}
}
-
- /* Update the file block number */
- *blk_nump = blk_count;
put_bh(bh);
return retval;
}
static int update_dind_extent_range(handle_t *handle, struct inode *inode,
- ext4_fsblk_t pblock, ext4_lblk_t *blk_nump,
- struct list_blocks_struct *lb)
+ ext4_fsblk_t pblock,
+ struct migrate_struct *lb)
{
struct buffer_head *bh;
__le32 *i_data;
int i, retval = 0;
- ext4_lblk_t blk_count = *blk_nump;
unsigned long max_entries = inode->i_sb->s_blocksize >> 2;
- if (!pblock) {
- /* Only update the file block number */
- *blk_nump += max_entries * max_entries;
- return 0;
- }
bh = sb_bread(inode->i_sb, pblock);
if (!bh)
return -EIO;
@@ -172,38 +156,28 @@ static int update_dind_extent_range(handle_t *handle, struct inode *inode,
for (i = 0; i < max_entries; i++) {
if (i_data[i]) {
retval = update_ind_extent_range(handle, inode,
- le32_to_cpu(i_data[i]),
- &blk_count, lb);
+ le32_to_cpu(i_data[i]), lb);
if (retval)
break;
} else {
/* Only update the file block number */
- blk_count += max_entries;
+ lb->curr_block += max_entries;
}
}
-
- /* Update the file block number */
- *blk_nump = blk_count;
put_bh(bh);
return retval;
}
static int update_tind_extent_range(handle_t *handle, struct inode *inode,
- ext4_fsblk_t pblock, ext4_lblk_t *blk_nump,
- struct list_blocks_struct *lb)
+ ext4_fsblk_t pblock,
+ struct migrate_struct *lb)
{
struct buffer_head *bh;
__le32 *i_data;
int i, retval = 0;
- ext4_lblk_t blk_count = *blk_nump;
unsigned long max_entries = inode->i_sb->s_blocksize >> 2;
- if (!pblock) {
- /* Only update the file block number */
- *blk_nump += max_entries * max_entries * max_entries;
- return 0;
- }
bh = sb_bread(inode->i_sb, pblock);
if (!bh)
return -EIO;
@@ -212,16 +186,14 @@ static int update_tind_extent_range(handle_t *handle, struct inode *inode,
for (i = 0; i < max_entries; i++) {
if (i_data[i]) {
retval = update_dind_extent_range(handle, inode,
- le32_to_cpu(i_data[i]),
- &blk_count, lb);
+ le32_to_cpu(i_data[i]), lb);
if (retval)
break;
- } else
+ } else {
/* Only update the file block number */
- blk_count += max_entries * max_entries;
+ lb->curr_block += max_entries * max_entries;
+ }
}
- /* Update the file block number */
- *blk_nump = blk_count;
put_bh(bh);
return retval;
@@ -263,7 +235,7 @@ static int free_dind_blocks(handle_t *handle,
for (i = 0; i < max_entries; i++) {
if (tmp_idata[i]) {
extend_credit_for_blkdel(handle, inode);
- ext4_free_blocks(handle, inode, 0,
+ ext4_free_blocks(handle, inode, NULL,
le32_to_cpu(tmp_idata[i]), 1,
EXT4_FREE_BLOCKS_METADATA |
EXT4_FREE_BLOCKS_FORGET);
@@ -271,7 +243,7 @@ static int free_dind_blocks(handle_t *handle,
}
put_bh(bh);
extend_credit_for_blkdel(handle, inode);
- ext4_free_blocks(handle, inode, 0, le32_to_cpu(i_data), 1,
+ ext4_free_blocks(handle, inode, NULL, le32_to_cpu(i_data), 1,
EXT4_FREE_BLOCKS_METADATA |
EXT4_FREE_BLOCKS_FORGET);
return 0;
@@ -302,7 +274,7 @@ static int free_tind_blocks(handle_t *handle,
}
put_bh(bh);
extend_credit_for_blkdel(handle, inode);
- ext4_free_blocks(handle, inode, 0, le32_to_cpu(i_data), 1,
+ ext4_free_blocks(handle, inode, NULL, le32_to_cpu(i_data), 1,
EXT4_FREE_BLOCKS_METADATA |
EXT4_FREE_BLOCKS_FORGET);
return 0;
@@ -315,7 +287,7 @@ static int free_ind_block(handle_t *handle, struct inode *inode, __le32 *i_data)
/* ei->i_data[EXT4_IND_BLOCK] */
if (i_data[0]) {
extend_credit_for_blkdel(handle, inode);
- ext4_free_blocks(handle, inode, 0,
+ ext4_free_blocks(handle, inode, NULL,
le32_to_cpu(i_data[0]), 1,
EXT4_FREE_BLOCKS_METADATA |
EXT4_FREE_BLOCKS_FORGET);
@@ -376,7 +348,7 @@ static int ext4_ext_swap_inode_data(handle_t *handle, struct inode *inode,
* We have the extent map build with the tmp inode.
* Now copy the i_data across
*/
- ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS);
+ ext4_set_inode_flag(inode, EXT4_INODE_EXTENTS);
memcpy(ei->i_data, tmp_ei->i_data, sizeof(ei->i_data));
/*
@@ -428,7 +400,7 @@ static int free_ext_idx(handle_t *handle, struct inode *inode,
}
put_bh(bh);
extend_credit_for_blkdel(handle, inode);
- ext4_free_blocks(handle, inode, 0, block, 1,
+ ext4_free_blocks(handle, inode, NULL, block, 1,
EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET);
return retval;
}
@@ -454,7 +426,6 @@ static int free_ext_block(handle_t *handle, struct inode *inode)
return retval;
}
return retval;
-
}
int ext4_ext_migrate(struct inode *inode)
@@ -462,12 +433,12 @@ int ext4_ext_migrate(struct inode *inode)
handle_t *handle;
int retval = 0, i;
__le32 *i_data;
- ext4_lblk_t blk_count = 0;
struct ext4_inode_info *ei;
struct inode *tmp_inode = NULL;
- struct list_blocks_struct lb;
+ struct migrate_struct lb;
unsigned long max_entries;
__u32 goal;
+ uid_t owner[2];
/*
* If the filesystem does not support extents, or the inode
@@ -484,21 +455,26 @@ int ext4_ext_migrate(struct inode *inode)
*/
return retval;
- handle = ext4_journal_start(inode,
- EXT4_DATA_TRANS_BLOCKS(inode->i_sb) +
- EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
- EXT4_MAXQUOTAS_INIT_BLOCKS(inode->i_sb)
- + 1);
+ /*
+ * Worst case we can touch the allocation bitmaps, a bgd
+ * block, and a block to link in the orphan list. We do need
+ * need to worry about credits for modifying the quota inode.
+ */
+ handle = ext4_journal_start(inode, EXT4_HT_MIGRATE,
+ 4 + EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb));
+
if (IS_ERR(handle)) {
retval = PTR_ERR(handle);
return retval;
}
goal = (((inode->i_ino - 1) / EXT4_INODES_PER_GROUP(inode->i_sb)) *
EXT4_INODES_PER_GROUP(inode->i_sb)) + 1;
+ owner[0] = i_uid_read(inode);
+ owner[1] = i_gid_read(inode);
tmp_inode = ext4_new_inode(handle, inode->i_sb->s_root->d_inode,
- S_IFREG, 0, goal);
+ S_IFREG, NULL, goal, owner);
if (IS_ERR(tmp_inode)) {
- retval = -ENOMEM;
+ retval = PTR_ERR(tmp_inode);
ext4_journal_stop(handle);
return retval;
}
@@ -507,7 +483,7 @@ int ext4_ext_migrate(struct inode *inode)
* Set the i_nlink to zero so it will be deleted later
* when we drop inode reference.
*/
- tmp_inode->i_nlink = 0;
+ clear_nlink(tmp_inode);
ext4_ext_tree_init(handle, tmp_inode);
ext4_orphan_add(handle, tmp_inode);
@@ -517,8 +493,8 @@ int ext4_ext_migrate(struct inode *inode)
* start with one credit accounted for
* superblock modification.
*
- * For the tmp_inode we already have commited the
- * trascation that created the inode. Later as and
+ * For the tmp_inode we already have committed the
+ * transaction that created the inode. Later as and
* when we add extents we extent the journal
*/
/*
@@ -529,11 +505,11 @@ int ext4_ext_migrate(struct inode *inode)
* with i_data_sem held to prevent racing with block
* allocation.
*/
- down_read((&EXT4_I(inode)->i_data_sem));
+ down_read(&EXT4_I(inode)->i_data_sem);
ext4_set_inode_state(inode, EXT4_STATE_EXT_MIGRATE);
up_read((&EXT4_I(inode)->i_data_sem));
- handle = ext4_journal_start(inode, 1);
+ handle = ext4_journal_start(inode, EXT4_HT_MIGRATE, 1);
if (IS_ERR(handle)) {
/*
* It is impossible to update on-disk structures without
@@ -551,35 +527,32 @@ int ext4_ext_migrate(struct inode *inode)
/* 32 bit block address 4 bytes */
max_entries = inode->i_sb->s_blocksize >> 2;
- for (i = 0; i < EXT4_NDIR_BLOCKS; i++, blk_count++) {
+ for (i = 0; i < EXT4_NDIR_BLOCKS; i++) {
if (i_data[i]) {
retval = update_extent_range(handle, tmp_inode,
- le32_to_cpu(i_data[i]),
- blk_count, &lb);
+ le32_to_cpu(i_data[i]), &lb);
if (retval)
goto err_out;
- }
+ } else
+ lb.curr_block++;
}
if (i_data[EXT4_IND_BLOCK]) {
retval = update_ind_extent_range(handle, tmp_inode,
- le32_to_cpu(i_data[EXT4_IND_BLOCK]),
- &blk_count, &lb);
+ le32_to_cpu(i_data[EXT4_IND_BLOCK]), &lb);
if (retval)
goto err_out;
} else
- blk_count += max_entries;
+ lb.curr_block += max_entries;
if (i_data[EXT4_DIND_BLOCK]) {
retval = update_dind_extent_range(handle, tmp_inode,
- le32_to_cpu(i_data[EXT4_DIND_BLOCK]),
- &blk_count, &lb);
+ le32_to_cpu(i_data[EXT4_DIND_BLOCK]), &lb);
if (retval)
goto err_out;
} else
- blk_count += max_entries * max_entries;
+ lb.curr_block += max_entries * max_entries;
if (i_data[EXT4_TIND_BLOCK]) {
retval = update_tind_extent_range(handle, tmp_inode,
- le32_to_cpu(i_data[EXT4_TIND_BLOCK]),
- &blk_count, &lb);
+ le32_to_cpu(i_data[EXT4_TIND_BLOCK]), &lb);
if (retval)
goto err_out;
}
@@ -632,3 +605,64 @@ out:
return retval;
}
+
+/*
+ * Migrate a simple extent-based inode to use the i_blocks[] array
+ */
+int ext4_ind_migrate(struct inode *inode)
+{
+ struct ext4_extent_header *eh;
+ struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ struct ext4_extent *ex;
+ unsigned int i, len;
+ ext4_fsblk_t blk;
+ handle_t *handle;
+ int ret;
+
+ if (!EXT4_HAS_INCOMPAT_FEATURE(inode->i_sb,
+ EXT4_FEATURE_INCOMPAT_EXTENTS) ||
+ (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
+ return -EINVAL;
+
+ if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
+ EXT4_FEATURE_RO_COMPAT_BIGALLOC))
+ return -EOPNOTSUPP;
+
+ handle = ext4_journal_start(inode, EXT4_HT_MIGRATE, 1);
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
+
+ down_write(&EXT4_I(inode)->i_data_sem);
+ ret = ext4_ext_check_inode(inode);
+ if (ret)
+ goto errout;
+
+ eh = ext_inode_hdr(inode);
+ ex = EXT_FIRST_EXTENT(eh);
+ if (ext4_blocks_count(es) > EXT4_MAX_BLOCK_FILE_PHYS ||
+ eh->eh_depth != 0 || le16_to_cpu(eh->eh_entries) > 1) {
+ ret = -EOPNOTSUPP;
+ goto errout;
+ }
+ if (eh->eh_entries == 0)
+ blk = len = 0;
+ else {
+ len = le16_to_cpu(ex->ee_len);
+ blk = ext4_ext_pblock(ex);
+ if (len > EXT4_NDIR_BLOCKS) {
+ ret = -EOPNOTSUPP;
+ goto errout;
+ }
+ }
+
+ ext4_clear_inode_flag(inode, EXT4_INODE_EXTENTS);
+ memset(ei->i_data, 0, sizeof(ei->i_data));
+ for (i=0; i < len; i++)
+ ei->i_data[i] = cpu_to_le32(blk++);
+ ext4_mark_inode_dirty(handle, inode);
+errout:
+ ext4_journal_stop(handle);
+ up_write(&EXT4_I(inode)->i_data_sem);
+ return ret;
+}
diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c
new file mode 100644
index 00000000000..32bce844c2e
--- /dev/null
+++ b/fs/ext4/mmp.c
@@ -0,0 +1,395 @@
+#include <linux/fs.h>
+#include <linux/random.h>
+#include <linux/buffer_head.h>
+#include <linux/utsname.h>
+#include <linux/kthread.h>
+
+#include "ext4.h"
+
+/* Checksumming functions */
+static __le32 ext4_mmp_csum(struct super_block *sb, struct mmp_struct *mmp)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ int offset = offsetof(struct mmp_struct, mmp_checksum);
+ __u32 csum;
+
+ csum = ext4_chksum(sbi, sbi->s_csum_seed, (char *)mmp, offset);
+
+ return cpu_to_le32(csum);
+}
+
+static int ext4_mmp_csum_verify(struct super_block *sb, struct mmp_struct *mmp)
+{
+ if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
+ EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+ return 1;
+
+ return mmp->mmp_checksum == ext4_mmp_csum(sb, mmp);
+}
+
+static void ext4_mmp_csum_set(struct super_block *sb, struct mmp_struct *mmp)
+{
+ if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
+ EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+ return;
+
+ mmp->mmp_checksum = ext4_mmp_csum(sb, mmp);
+}
+
+/*
+ * Write the MMP block using WRITE_SYNC to try to get the block on-disk
+ * faster.
+ */
+static int write_mmp_block(struct super_block *sb, struct buffer_head *bh)
+{
+ struct mmp_struct *mmp = (struct mmp_struct *)(bh->b_data);
+
+ /*
+ * We protect against freezing so that we don't create dirty buffers
+ * on frozen filesystem.
+ */
+ sb_start_write(sb);
+ ext4_mmp_csum_set(sb, mmp);
+ mark_buffer_dirty(bh);
+ lock_buffer(bh);
+ bh->b_end_io = end_buffer_write_sync;
+ get_bh(bh);
+ submit_bh(WRITE_SYNC | REQ_META | REQ_PRIO, bh);
+ wait_on_buffer(bh);
+ sb_end_write(sb);
+ if (unlikely(!buffer_uptodate(bh)))
+ return 1;
+
+ return 0;
+}
+
+/*
+ * Read the MMP block. It _must_ be read from disk and hence we clear the
+ * uptodate flag on the buffer.
+ */
+static int read_mmp_block(struct super_block *sb, struct buffer_head **bh,
+ ext4_fsblk_t mmp_block)
+{
+ struct mmp_struct *mmp;
+
+ if (*bh)
+ clear_buffer_uptodate(*bh);
+
+ /* This would be sb_bread(sb, mmp_block), except we need to be sure
+ * that the MD RAID device cache has been bypassed, and that the read
+ * is not blocked in the elevator. */
+ if (!*bh)
+ *bh = sb_getblk(sb, mmp_block);
+ if (!*bh)
+ return -ENOMEM;
+ if (*bh) {
+ get_bh(*bh);
+ lock_buffer(*bh);
+ (*bh)->b_end_io = end_buffer_read_sync;
+ submit_bh(READ_SYNC | REQ_META | REQ_PRIO, *bh);
+ wait_on_buffer(*bh);
+ if (!buffer_uptodate(*bh)) {
+ brelse(*bh);
+ *bh = NULL;
+ }
+ }
+ if (unlikely(!*bh)) {
+ ext4_warning(sb, "Error while reading MMP block %llu",
+ mmp_block);
+ return -EIO;
+ }
+
+ mmp = (struct mmp_struct *)((*bh)->b_data);
+ if (le32_to_cpu(mmp->mmp_magic) != EXT4_MMP_MAGIC ||
+ !ext4_mmp_csum_verify(sb, mmp))
+ return -EINVAL;
+
+ return 0;
+}
+
+/*
+ * Dump as much information as possible to help the admin.
+ */
+void __dump_mmp_msg(struct super_block *sb, struct mmp_struct *mmp,
+ const char *function, unsigned int line, const char *msg)
+{
+ __ext4_warning(sb, function, line, msg);
+ __ext4_warning(sb, function, line,
+ "MMP failure info: last update time: %llu, last update "
+ "node: %s, last update device: %s\n",
+ (long long unsigned int) le64_to_cpu(mmp->mmp_time),
+ mmp->mmp_nodename, mmp->mmp_bdevname);
+}
+
+/*
+ * kmmpd will update the MMP sequence every s_mmp_update_interval seconds
+ */
+static int kmmpd(void *data)
+{
+ struct super_block *sb = ((struct mmpd_data *) data)->sb;
+ struct buffer_head *bh = ((struct mmpd_data *) data)->bh;
+ struct ext4_super_block *es = EXT4_SB(sb)->s_es;
+ struct mmp_struct *mmp;
+ ext4_fsblk_t mmp_block;
+ u32 seq = 0;
+ unsigned long failed_writes = 0;
+ int mmp_update_interval = le16_to_cpu(es->s_mmp_update_interval);
+ unsigned mmp_check_interval;
+ unsigned long last_update_time;
+ unsigned long diff;
+ int retval;
+
+ mmp_block = le64_to_cpu(es->s_mmp_block);
+ mmp = (struct mmp_struct *)(bh->b_data);
+ mmp->mmp_time = cpu_to_le64(get_seconds());
+ /*
+ * Start with the higher mmp_check_interval and reduce it if
+ * the MMP block is being updated on time.
+ */
+ mmp_check_interval = max(EXT4_MMP_CHECK_MULT * mmp_update_interval,
+ EXT4_MMP_MIN_CHECK_INTERVAL);
+ mmp->mmp_check_interval = cpu_to_le16(mmp_check_interval);
+ bdevname(bh->b_bdev, mmp->mmp_bdevname);
+
+ memcpy(mmp->mmp_nodename, init_utsname()->nodename,
+ sizeof(mmp->mmp_nodename));
+
+ while (!kthread_should_stop()) {
+ if (++seq > EXT4_MMP_SEQ_MAX)
+ seq = 1;
+
+ mmp->mmp_seq = cpu_to_le32(seq);
+ mmp->mmp_time = cpu_to_le64(get_seconds());
+ last_update_time = jiffies;
+
+ retval = write_mmp_block(sb, bh);
+ /*
+ * Don't spew too many error messages. Print one every
+ * (s_mmp_update_interval * 60) seconds.
+ */
+ if (retval) {
+ if ((failed_writes % 60) == 0)
+ ext4_error(sb, "Error writing to MMP block");
+ failed_writes++;
+ }
+
+ if (!(le32_to_cpu(es->s_feature_incompat) &
+ EXT4_FEATURE_INCOMPAT_MMP)) {
+ ext4_warning(sb, "kmmpd being stopped since MMP feature"
+ " has been disabled.");
+ EXT4_SB(sb)->s_mmp_tsk = NULL;
+ goto failed;
+ }
+
+ if (sb->s_flags & MS_RDONLY) {
+ ext4_warning(sb, "kmmpd being stopped since filesystem "
+ "has been remounted as readonly.");
+ EXT4_SB(sb)->s_mmp_tsk = NULL;
+ goto failed;
+ }
+
+ diff = jiffies - last_update_time;
+ if (diff < mmp_update_interval * HZ)
+ schedule_timeout_interruptible(mmp_update_interval *
+ HZ - diff);
+
+ /*
+ * We need to make sure that more than mmp_check_interval
+ * seconds have not passed since writing. If that has happened
+ * we need to check if the MMP block is as we left it.
+ */
+ diff = jiffies - last_update_time;
+ if (diff > mmp_check_interval * HZ) {
+ struct buffer_head *bh_check = NULL;
+ struct mmp_struct *mmp_check;
+
+ retval = read_mmp_block(sb, &bh_check, mmp_block);
+ if (retval) {
+ ext4_error(sb, "error reading MMP data: %d",
+ retval);
+
+ EXT4_SB(sb)->s_mmp_tsk = NULL;
+ goto failed;
+ }
+
+ mmp_check = (struct mmp_struct *)(bh_check->b_data);
+ if (mmp->mmp_seq != mmp_check->mmp_seq ||
+ memcmp(mmp->mmp_nodename, mmp_check->mmp_nodename,
+ sizeof(mmp->mmp_nodename))) {
+ dump_mmp_msg(sb, mmp_check,
+ "Error while updating MMP info. "
+ "The filesystem seems to have been"
+ " multiply mounted.");
+ ext4_error(sb, "abort");
+ goto failed;
+ }
+ put_bh(bh_check);
+ }
+
+ /*
+ * Adjust the mmp_check_interval depending on how much time
+ * it took for the MMP block to be written.
+ */
+ mmp_check_interval = max(min(EXT4_MMP_CHECK_MULT * diff / HZ,
+ EXT4_MMP_MAX_CHECK_INTERVAL),
+ EXT4_MMP_MIN_CHECK_INTERVAL);
+ mmp->mmp_check_interval = cpu_to_le16(mmp_check_interval);
+ }
+
+ /*
+ * Unmount seems to be clean.
+ */
+ mmp->mmp_seq = cpu_to_le32(EXT4_MMP_SEQ_CLEAN);
+ mmp->mmp_time = cpu_to_le64(get_seconds());
+
+ retval = write_mmp_block(sb, bh);
+
+failed:
+ kfree(data);
+ brelse(bh);
+ return retval;
+}
+
+/*
+ * Get a random new sequence number but make sure it is not greater than
+ * EXT4_MMP_SEQ_MAX.
+ */
+static unsigned int mmp_new_seq(void)
+{
+ u32 new_seq;
+
+ do {
+ new_seq = prandom_u32();
+ } while (new_seq > EXT4_MMP_SEQ_MAX);
+
+ return new_seq;
+}
+
+/*
+ * Protect the filesystem from being mounted more than once.
+ */
+int ext4_multi_mount_protect(struct super_block *sb,
+ ext4_fsblk_t mmp_block)
+{
+ struct ext4_super_block *es = EXT4_SB(sb)->s_es;
+ struct buffer_head *bh = NULL;
+ struct mmp_struct *mmp = NULL;
+ struct mmpd_data *mmpd_data;
+ u32 seq;
+ unsigned int mmp_check_interval = le16_to_cpu(es->s_mmp_update_interval);
+ unsigned int wait_time = 0;
+ int retval;
+
+ if (mmp_block < le32_to_cpu(es->s_first_data_block) ||
+ mmp_block >= ext4_blocks_count(es)) {
+ ext4_warning(sb, "Invalid MMP block in superblock");
+ goto failed;
+ }
+
+ retval = read_mmp_block(sb, &bh, mmp_block);
+ if (retval)
+ goto failed;
+
+ mmp = (struct mmp_struct *)(bh->b_data);
+
+ if (mmp_check_interval < EXT4_MMP_MIN_CHECK_INTERVAL)
+ mmp_check_interval = EXT4_MMP_MIN_CHECK_INTERVAL;
+
+ /*
+ * If check_interval in MMP block is larger, use that instead of
+ * update_interval from the superblock.
+ */
+ if (le16_to_cpu(mmp->mmp_check_interval) > mmp_check_interval)
+ mmp_check_interval = le16_to_cpu(mmp->mmp_check_interval);
+
+ seq = le32_to_cpu(mmp->mmp_seq);
+ if (seq == EXT4_MMP_SEQ_CLEAN)
+ goto skip;
+
+ if (seq == EXT4_MMP_SEQ_FSCK) {
+ dump_mmp_msg(sb, mmp, "fsck is running on the filesystem");
+ goto failed;
+ }
+
+ wait_time = min(mmp_check_interval * 2 + 1,
+ mmp_check_interval + 60);
+
+ /* Print MMP interval if more than 20 secs. */
+ if (wait_time > EXT4_MMP_MIN_CHECK_INTERVAL * 4)
+ ext4_warning(sb, "MMP interval %u higher than expected, please"
+ " wait.\n", wait_time * 2);
+
+ if (schedule_timeout_interruptible(HZ * wait_time) != 0) {
+ ext4_warning(sb, "MMP startup interrupted, failing mount\n");
+ goto failed;
+ }
+
+ retval = read_mmp_block(sb, &bh, mmp_block);
+ if (retval)
+ goto failed;
+ mmp = (struct mmp_struct *)(bh->b_data);
+ if (seq != le32_to_cpu(mmp->mmp_seq)) {
+ dump_mmp_msg(sb, mmp,
+ "Device is already active on another node.");
+ goto failed;
+ }
+
+skip:
+ /*
+ * write a new random sequence number.
+ */
+ seq = mmp_new_seq();
+ mmp->mmp_seq = cpu_to_le32(seq);
+
+ retval = write_mmp_block(sb, bh);
+ if (retval)
+ goto failed;
+
+ /*
+ * wait for MMP interval and check mmp_seq.
+ */
+ if (schedule_timeout_interruptible(HZ * wait_time) != 0) {
+ ext4_warning(sb, "MMP startup interrupted, failing mount\n");
+ goto failed;
+ }
+
+ retval = read_mmp_block(sb, &bh, mmp_block);
+ if (retval)
+ goto failed;
+ mmp = (struct mmp_struct *)(bh->b_data);
+ if (seq != le32_to_cpu(mmp->mmp_seq)) {
+ dump_mmp_msg(sb, mmp,
+ "Device is already active on another node.");
+ goto failed;
+ }
+
+ mmpd_data = kmalloc(sizeof(struct mmpd_data), GFP_KERNEL);
+ if (!mmpd_data) {
+ ext4_warning(sb, "not enough memory for mmpd_data");
+ goto failed;
+ }
+ mmpd_data->sb = sb;
+ mmpd_data->bh = bh;
+
+ /*
+ * Start a kernel thread to update the MMP block periodically.
+ */
+ EXT4_SB(sb)->s_mmp_tsk = kthread_run(kmmpd, mmpd_data, "kmmpd-%s",
+ bdevname(bh->b_bdev,
+ mmp->mmp_bdevname));
+ if (IS_ERR(EXT4_SB(sb)->s_mmp_tsk)) {
+ EXT4_SB(sb)->s_mmp_tsk = NULL;
+ kfree(mmpd_data);
+ ext4_warning(sb, "Unable to create kmmpd thread for %s.",
+ sb->s_id);
+ goto failed;
+ }
+
+ return 0;
+
+failed:
+ brelse(bh);
+ return 1;
+}
+
+
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index b9f3e7862f1..2484c7ec6a7 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -17,8 +17,8 @@
#include <linux/quotaops.h>
#include <linux/slab.h>
#include "ext4_jbd2.h"
-#include "ext4_extents.h"
#include "ext4.h"
+#include "ext4_extents.h"
/**
* get_ext_path - Find an extent path for designated logical block number.
@@ -32,16 +32,18 @@
*/
static inline int
get_ext_path(struct inode *inode, ext4_lblk_t lblock,
- struct ext4_ext_path **path)
+ struct ext4_ext_path **orig_path)
{
int ret = 0;
+ struct ext4_ext_path *path;
- *path = ext4_ext_find_extent(inode, lblock, *path);
- if (IS_ERR(*path)) {
- ret = PTR_ERR(*path);
- *path = NULL;
- } else if ((*path)[ext_depth(inode)].p_ext == NULL)
+ path = ext4_ext_find_extent(inode, lblock, *orig_path, EXT4_EX_NOCACHE);
+ if (IS_ERR(path))
+ ret = PTR_ERR(path);
+ else if (path[ext_depth(inode)].p_ext == NULL)
ret = -ENODATA;
+ else
+ *orig_path = path;
return ret;
}
@@ -55,8 +57,8 @@ get_ext_path(struct inode *inode, ext4_lblk_t lblock,
static void
copy_extent_status(struct ext4_extent *src, struct ext4_extent *dest)
{
- if (ext4_ext_is_uninitialized(src))
- ext4_ext_mark_uninitialized(dest);
+ if (ext4_ext_is_unwritten(src))
+ ext4_ext_mark_unwritten(dest);
else
dest->ee_len = cpu_to_le16(ext4_ext_get_actual_len(dest));
}
@@ -74,7 +76,7 @@ copy_extent_status(struct ext4_extent *src, struct ext4_extent *dest)
* ext4_ext_path structure refers to the last extent, or a negative error
* value on failure.
*/
-static int
+int
mext_next_extent(struct inode *inode, struct ext4_ext_path *path,
struct ext4_extent **extent)
{
@@ -142,66 +144,34 @@ mext_next_extent(struct inode *inode, struct ext4_ext_path *path,
}
/**
- * mext_check_null_inode - NULL check for two inodes
+ * ext4_double_down_write_data_sem - Acquire two inodes' write lock
+ * of i_data_sem
*
- * If inode1 or inode2 is NULL, return -EIO. Otherwise, return 0.
+ * Acquire write lock of i_data_sem of the two inodes
*/
-static int
-mext_check_null_inode(struct inode *inode1, struct inode *inode2,
- const char *function, unsigned int line)
+void
+ext4_double_down_write_data_sem(struct inode *first, struct inode *second)
{
- int ret = 0;
-
- if (inode1 == NULL) {
- __ext4_error(inode2->i_sb, function, line,
- "Both inodes should not be NULL: "
- "inode1 NULL inode2 %lu", inode2->i_ino);
- ret = -EIO;
- } else if (inode2 == NULL) {
- __ext4_error(inode1->i_sb, function, line,
- "Both inodes should not be NULL: "
- "inode1 %lu inode2 NULL", inode1->i_ino);
- ret = -EIO;
- }
- return ret;
-}
-
-/**
- * double_down_write_data_sem - Acquire two inodes' write lock of i_data_sem
- *
- * @orig_inode: original inode structure
- * @donor_inode: donor inode structure
- * Acquire write lock of i_data_sem of the two inodes (orig and donor) by
- * i_ino order.
- */
-static void
-double_down_write_data_sem(struct inode *orig_inode, struct inode *donor_inode)
-{
- struct inode *first = orig_inode, *second = donor_inode;
+ if (first < second) {
+ down_write(&EXT4_I(first)->i_data_sem);
+ down_write_nested(&EXT4_I(second)->i_data_sem, SINGLE_DEPTH_NESTING);
+ } else {
+ down_write(&EXT4_I(second)->i_data_sem);
+ down_write_nested(&EXT4_I(first)->i_data_sem, SINGLE_DEPTH_NESTING);
- /*
- * Use the inode number to provide the stable locking order instead
- * of its address, because the C language doesn't guarantee you can
- * compare pointers that don't come from the same array.
- */
- if (donor_inode->i_ino < orig_inode->i_ino) {
- first = donor_inode;
- second = orig_inode;
}
-
- down_write(&EXT4_I(first)->i_data_sem);
- down_write_nested(&EXT4_I(second)->i_data_sem, SINGLE_DEPTH_NESTING);
}
/**
- * double_up_write_data_sem - Release two inodes' write lock of i_data_sem
+ * ext4_double_up_write_data_sem - Release two inodes' write lock of i_data_sem
*
* @orig_inode: original inode structure to be released its lock first
* @donor_inode: donor inode structure to be released its lock second
* Release write lock of i_data_sem of two inodes (orig and donor).
*/
-static void
-double_up_write_data_sem(struct inode *orig_inode, struct inode *donor_inode)
+void
+ext4_double_up_write_data_sem(struct inode *orig_inode,
+ struct inode *donor_inode)
{
up_write(&EXT4_I(orig_inode)->i_data_sem);
up_write(&EXT4_I(donor_inode)->i_data_sem);
@@ -421,6 +391,7 @@ mext_insert_extents(handle_t *handle, struct inode *orig_inode,
if (depth) {
/* Register to journal */
+ BUFFER_TRACE(orig_path->p_bh, "get_write_access");
ret = ext4_journal_get_write_access(handle, orig_path->p_bh);
if (ret)
return ret;
@@ -439,18 +410,7 @@ mext_insert_extents(handle_t *handle, struct inode *orig_inode,
mext_insert_inside_block(o_start, o_end, start_ext, new_ext,
end_ext, eh, range_to_move);
- if (depth) {
- ret = ext4_handle_dirty_metadata(handle, orig_inode,
- orig_path->p_bh);
- if (ret)
- return ret;
- } else {
- ret = ext4_mark_inode_dirty(handle, orig_inode);
- if (ret < 0)
- return ret;
- }
-
- return 0;
+ return ext4_ext_dirty(handle, orig_inode, orig_path);
}
/**
@@ -605,9 +565,8 @@ mext_calc_swap_extents(struct ext4_extent *tmp_dext,
diff = donor_off - le32_to_cpu(tmp_dext->ee_block);
ext4_ext_store_pblock(tmp_dext, ext4_ext_pblock(tmp_dext) + diff);
- tmp_dext->ee_block =
- cpu_to_le32(le32_to_cpu(tmp_dext->ee_block) + diff);
- tmp_dext->ee_len = cpu_to_le16(le16_to_cpu(tmp_dext->ee_len) - diff);
+ le32_add_cpu(&tmp_dext->ee_block, diff);
+ le16_add_cpu(&tmp_dext->ee_len, -diff);
if (max_count < ext4_ext_get_actual_len(tmp_dext))
tmp_dext->ee_len = cpu_to_le16(max_count);
@@ -630,6 +589,44 @@ mext_calc_swap_extents(struct ext4_extent *tmp_dext,
}
/**
+ * mext_check_coverage - Check that all extents in range has the same type
+ *
+ * @inode: inode in question
+ * @from: block offset of inode
+ * @count: block count to be checked
+ * @unwritten: extents expected to be unwritten
+ * @err: pointer to save error value
+ *
+ * Return 1 if all extents in range has expected type, and zero otherwise.
+ */
+static int
+mext_check_coverage(struct inode *inode, ext4_lblk_t from, ext4_lblk_t count,
+ int unwritten, int *err)
+{
+ struct ext4_ext_path *path = NULL;
+ struct ext4_extent *ext;
+ int ret = 0;
+ ext4_lblk_t last = from + count;
+ while (from < last) {
+ *err = get_ext_path(inode, from, &path);
+ if (*err)
+ goto out;
+ ext = path[ext_depth(inode)].p_ext;
+ if (unwritten != ext4_ext_is_unwritten(ext))
+ goto out;
+ from += ext4_ext_get_actual_len(ext);
+ ext4_ext_drop_refs(path);
+ }
+ ret = 1;
+out:
+ if (path) {
+ ext4_ext_drop_refs(path);
+ kfree(path);
+ }
+ return ret;
+}
+
+/**
* mext_replace_branches - Replace original extents with new extents
*
* @handle: journal handle
@@ -664,8 +661,13 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode,
int replaced_count = 0;
int dext_alen;
- /* Protect extent trees against block allocations via delalloc */
- double_down_write_data_sem(orig_inode, donor_inode);
+ *err = ext4_es_remove_extent(orig_inode, from, count);
+ if (*err)
+ goto out;
+
+ *err = ext4_es_remove_extent(donor_inode, from, count);
+ if (*err)
+ goto out;
/* Get the original extent for the block "orig_off" */
*err = get_ext_path(orig_inode, orig_off, &orig_path);
@@ -682,6 +684,8 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode,
depth = ext_depth(donor_inode);
dext = donor_path[depth].p_ext;
+ if (unlikely(!dext))
+ goto missing_donor_extent;
tmp_dext = *dext;
*err = mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off,
@@ -692,7 +696,8 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode,
/* Loop for the donor extents */
while (1) {
/* The extent for donor must be found. */
- if (!dext) {
+ if (unlikely(!dext)) {
+ missing_donor_extent:
EXT4_ERROR_INODE(donor_inode,
"The extent for donor must be found");
*err = -EIO;
@@ -724,6 +729,7 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode,
donor_off += dext_alen;
orig_off += dext_alen;
+ BUG_ON(replaced_count > count);
/* Already moved the expected blocks */
if (replaced_count >= count)
break;
@@ -762,12 +768,123 @@ out:
kfree(donor_path);
}
- ext4_ext_invalidate_cache(orig_inode);
- ext4_ext_invalidate_cache(donor_inode);
+ return replaced_count;
+}
- double_up_write_data_sem(orig_inode, donor_inode);
+/**
+ * mext_page_double_lock - Grab and lock pages on both @inode1 and @inode2
+ *
+ * @inode1: the inode structure
+ * @inode2: the inode structure
+ * @index: page index
+ * @page: result page vector
+ *
+ * Grab two locked pages for inode's by inode order
+ */
+static int
+mext_page_double_lock(struct inode *inode1, struct inode *inode2,
+ pgoff_t index, struct page *page[2])
+{
+ struct address_space *mapping[2];
+ unsigned fl = AOP_FLAG_NOFS;
- return replaced_count;
+ BUG_ON(!inode1 || !inode2);
+ if (inode1 < inode2) {
+ mapping[0] = inode1->i_mapping;
+ mapping[1] = inode2->i_mapping;
+ } else {
+ mapping[0] = inode2->i_mapping;
+ mapping[1] = inode1->i_mapping;
+ }
+
+ page[0] = grab_cache_page_write_begin(mapping[0], index, fl);
+ if (!page[0])
+ return -ENOMEM;
+
+ page[1] = grab_cache_page_write_begin(mapping[1], index, fl);
+ if (!page[1]) {
+ unlock_page(page[0]);
+ page_cache_release(page[0]);
+ return -ENOMEM;
+ }
+ /*
+ * grab_cache_page_write_begin() may not wait on page's writeback if
+ * BDI not demand that. But it is reasonable to be very conservative
+ * here and explicitly wait on page's writeback
+ */
+ wait_on_page_writeback(page[0]);
+ wait_on_page_writeback(page[1]);
+ if (inode1 > inode2) {
+ struct page *tmp;
+ tmp = page[0];
+ page[0] = page[1];
+ page[1] = tmp;
+ }
+ return 0;
+}
+
+/* Force page buffers uptodate w/o dropping page's lock */
+static int
+mext_page_mkuptodate(struct page *page, unsigned from, unsigned to)
+{
+ struct inode *inode = page->mapping->host;
+ sector_t block;
+ struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
+ unsigned int blocksize, block_start, block_end;
+ int i, err, nr = 0, partial = 0;
+ BUG_ON(!PageLocked(page));
+ BUG_ON(PageWriteback(page));
+
+ if (PageUptodate(page))
+ return 0;
+
+ blocksize = 1 << inode->i_blkbits;
+ if (!page_has_buffers(page))
+ create_empty_buffers(page, blocksize, 0);
+
+ head = page_buffers(page);
+ block = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
+ for (bh = head, block_start = 0; bh != head || !block_start;
+ block++, block_start = block_end, bh = bh->b_this_page) {
+ block_end = block_start + blocksize;
+ if (block_end <= from || block_start >= to) {
+ if (!buffer_uptodate(bh))
+ partial = 1;
+ continue;
+ }
+ if (buffer_uptodate(bh))
+ continue;
+ if (!buffer_mapped(bh)) {
+ err = ext4_get_block(inode, block, bh, 0);
+ if (err) {
+ SetPageError(page);
+ return err;
+ }
+ if (!buffer_mapped(bh)) {
+ zero_user(page, block_start, blocksize);
+ set_buffer_uptodate(bh);
+ continue;
+ }
+ }
+ BUG_ON(nr >= MAX_BUF_PER_PAGE);
+ arr[nr++] = bh;
+ }
+ /* No io required */
+ if (!nr)
+ goto out;
+
+ for (i = 0; i < nr; i++) {
+ bh = arr[i];
+ if (!bh_uptodate_or_lock(bh)) {
+ err = bh_submit_read(bh);
+ if (err)
+ return err;
+ }
+ }
+out:
+ if (!partial)
+ SetPageUptodate(page);
+ return 0;
}
/**
@@ -778,7 +895,7 @@ out:
* @orig_page_offset: page index on original file
* @data_offset_in_page: block index where data swapping starts
* @block_len_in_page: the number of blocks to be swapped
- * @uninit: orig extent is uninitialized or not
+ * @unwritten: orig extent is unwritten or not
* @err: pointer to save return value
*
* Save the data in original inode blocks and replace original inode extents
@@ -789,31 +906,28 @@ out:
static int
move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
pgoff_t orig_page_offset, int data_offset_in_page,
- int block_len_in_page, int uninit, int *err)
+ int block_len_in_page, int unwritten, int *err)
{
- struct inode *orig_inode = o_filp->f_dentry->d_inode;
- struct address_space *mapping = orig_inode->i_mapping;
- struct buffer_head *bh;
- struct page *page = NULL;
- const struct address_space_operations *a_ops = mapping->a_ops;
+ struct inode *orig_inode = file_inode(o_filp);
+ struct page *pagep[2] = {NULL, NULL};
handle_t *handle;
ext4_lblk_t orig_blk_offset;
- long long offs = orig_page_offset << PAGE_CACHE_SHIFT;
unsigned long blocksize = orig_inode->i_sb->s_blocksize;
unsigned int w_flags = 0;
unsigned int tmp_data_size, data_size, replaced_size;
- void *fsdata;
- int i, jblocks;
- int err2 = 0;
+ int err2, jblocks, retries = 0;
int replaced_count = 0;
+ int from = data_offset_in_page << orig_inode->i_blkbits;
int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits;
/*
* It needs twice the amount of ordinary journal buffers because
* inode and donor_inode may change each different metadata blocks.
*/
+again:
+ *err = 0;
jblocks = ext4_writepage_trans_blocks(orig_inode) * 2;
- handle = ext4_journal_start(orig_inode, jblocks);
+ handle = ext4_journal_start(orig_inode, EXT4_HT_MOVE_EXTENTS, jblocks);
if (IS_ERR(handle)) {
*err = PTR_ERR(handle);
return 0;
@@ -825,21 +939,6 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
orig_blk_offset = orig_page_offset * blocks_per_page +
data_offset_in_page;
- /*
- * If orig extent is uninitialized one,
- * it's not necessary force the page into memory
- * and then force it to be written out again.
- * Just swap data blocks between orig and donor.
- */
- if (uninit) {
- replaced_count = mext_replace_branches(handle, orig_inode,
- donor_inode, orig_blk_offset,
- block_len_in_page, err);
- goto out2;
- }
-
- offs = (long long)orig_blk_offset << orig_inode->i_blkbits;
-
/* Calculate data_size */
if ((orig_blk_offset + block_len_in_page - 1) ==
((orig_inode->i_size - 1) >> orig_inode->i_blkbits)) {
@@ -859,76 +958,120 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
replaced_size = data_size;
- *err = a_ops->write_begin(o_filp, mapping, offs, data_size, w_flags,
- &page, &fsdata);
+ *err = mext_page_double_lock(orig_inode, donor_inode, orig_page_offset,
+ pagep);
if (unlikely(*err < 0))
- goto out;
-
- if (!PageUptodate(page)) {
- mapping->a_ops->readpage(o_filp, page);
- lock_page(page);
- }
-
+ goto stop_journal;
/*
- * try_to_release_page() doesn't call releasepage in writeback mode.
- * We should care about the order of writing to the same file
- * by multiple move extent processes.
- * It needs to call wait_on_page_writeback() to wait for the
- * writeback of the page.
+ * If orig extent was unwritten it can become initialized
+ * at any time after i_data_sem was dropped, in order to
+ * serialize with delalloc we have recheck extent while we
+ * hold page's lock, if it is still the case data copy is not
+ * necessary, just swap data blocks between orig and donor.
*/
- if (PageWriteback(page))
- wait_on_page_writeback(page);
+ if (unwritten) {
+ ext4_double_down_write_data_sem(orig_inode, donor_inode);
+ /* If any of extents in range became initialized we have to
+ * fallback to data copying */
+ unwritten = mext_check_coverage(orig_inode, orig_blk_offset,
+ block_len_in_page, 1, err);
+ if (*err)
+ goto drop_data_sem;
- /* Release old bh and drop refs */
- try_to_release_page(page, 0);
+ unwritten &= mext_check_coverage(donor_inode, orig_blk_offset,
+ block_len_in_page, 1, err);
+ if (*err)
+ goto drop_data_sem;
+
+ if (!unwritten) {
+ ext4_double_up_write_data_sem(orig_inode, donor_inode);
+ goto data_copy;
+ }
+ if ((page_has_private(pagep[0]) &&
+ !try_to_release_page(pagep[0], 0)) ||
+ (page_has_private(pagep[1]) &&
+ !try_to_release_page(pagep[1], 0))) {
+ *err = -EBUSY;
+ goto drop_data_sem;
+ }
+ replaced_count = mext_replace_branches(handle, orig_inode,
+ donor_inode, orig_blk_offset,
+ block_len_in_page, err);
+ drop_data_sem:
+ ext4_double_up_write_data_sem(orig_inode, donor_inode);
+ goto unlock_pages;
+ }
+data_copy:
+ *err = mext_page_mkuptodate(pagep[0], from, from + replaced_size);
+ if (*err)
+ goto unlock_pages;
+
+ /* At this point all buffers in range are uptodate, old mapping layout
+ * is no longer required, try to drop it now. */
+ if ((page_has_private(pagep[0]) && !try_to_release_page(pagep[0], 0)) ||
+ (page_has_private(pagep[1]) && !try_to_release_page(pagep[1], 0))) {
+ *err = -EBUSY;
+ goto unlock_pages;
+ }
replaced_count = mext_replace_branches(handle, orig_inode, donor_inode,
- orig_blk_offset, block_len_in_page,
- &err2);
- if (err2) {
+ orig_blk_offset,
+ block_len_in_page, err);
+ if (*err) {
if (replaced_count) {
block_len_in_page = replaced_count;
replaced_size =
block_len_in_page << orig_inode->i_blkbits;
} else
- goto out;
- }
-
- if (!page_has_buffers(page))
- create_empty_buffers(page, 1 << orig_inode->i_blkbits, 0);
-
- bh = page_buffers(page);
- for (i = 0; i < data_offset_in_page; i++)
- bh = bh->b_this_page;
-
- for (i = 0; i < block_len_in_page; i++) {
- *err = ext4_get_block(orig_inode,
- (sector_t)(orig_blk_offset + i), bh, 0);
- if (*err < 0)
- goto out;
-
- if (bh->b_this_page != NULL)
- bh = bh->b_this_page;
+ goto unlock_pages;
}
+ /* Perform all necessary steps similar write_begin()/write_end()
+ * but keeping in mind that i_size will not change */
+ *err = __block_write_begin(pagep[0], from, replaced_size,
+ ext4_get_block);
+ if (!*err)
+ *err = block_commit_write(pagep[0], from, from + replaced_size);
- *err = a_ops->write_end(o_filp, mapping, offs, data_size, replaced_size,
- page, fsdata);
- page = NULL;
-
-out:
- if (unlikely(page)) {
- if (PageLocked(page))
- unlock_page(page);
- page_cache_release(page);
- ext4_journal_stop(handle);
- }
-out2:
+ if (unlikely(*err < 0))
+ goto repair_branches;
+
+ /* Even in case of data=writeback it is reasonable to pin
+ * inode to transaction, to prevent unexpected data loss */
+ *err = ext4_jbd2_file_inode(handle, orig_inode);
+
+unlock_pages:
+ unlock_page(pagep[0]);
+ page_cache_release(pagep[0]);
+ unlock_page(pagep[1]);
+ page_cache_release(pagep[1]);
+stop_journal:
ext4_journal_stop(handle);
-
- if (err2)
- *err = err2;
-
+ /* Buffer was busy because probably is pinned to journal transaction,
+ * force transaction commit may help to free it. */
+ if (*err == -EBUSY && ext4_should_retry_alloc(orig_inode->i_sb,
+ &retries))
+ goto again;
return replaced_count;
+
+repair_branches:
+ /*
+ * This should never ever happen!
+ * Extents are swapped already, but we are not able to copy data.
+ * Try to swap extents to it's original places
+ */
+ ext4_double_down_write_data_sem(orig_inode, donor_inode);
+ replaced_count = mext_replace_branches(handle, donor_inode, orig_inode,
+ orig_blk_offset,
+ block_len_in_page, &err2);
+ ext4_double_up_write_data_sem(orig_inode, donor_inode);
+ if (replaced_count != block_len_in_page) {
+ EXT4_ERROR_INODE_BLOCK(orig_inode, (sector_t)(orig_blk_offset),
+ "Unable to copy data block,"
+ " data will be lost.");
+ *err = -EIO;
+ }
+ replaced_count = 0;
+ goto unlock_pages;
}
/**
@@ -971,14 +1114,6 @@ mext_check_arguments(struct inode *orig_inode,
return -EINVAL;
}
- /* Files should be in the same ext4 FS */
- if (orig_inode->i_sb != donor_inode->i_sb) {
- ext4_debug("ext4 move extent: The argument files "
- "should be in same FS [ino:orig %lu, donor %lu]\n",
- orig_inode->i_ino, donor_inode->i_ino);
- return -EINVAL;
- }
-
/* Ext4 move extent supports only extent based file */
if (!(ext4_test_inode_flag(orig_inode, EXT4_INODE_EXTENTS))) {
ext4_debug("ext4 move extent: orig file is not extents "
@@ -1003,12 +1138,11 @@ mext_check_arguments(struct inode *orig_inode,
return -EINVAL;
}
- if ((orig_start > EXT_MAX_BLOCK) ||
- (donor_start > EXT_MAX_BLOCK) ||
- (*len > EXT_MAX_BLOCK) ||
- (orig_start + *len > EXT_MAX_BLOCK)) {
+ if ((orig_start >= EXT_MAX_BLOCKS) ||
+ (*len > EXT_MAX_BLOCKS) ||
+ (orig_start + *len >= EXT_MAX_BLOCKS)) {
ext4_debug("ext4 move extent: Can't handle over [%u] blocks "
- "[ino:orig %lu, donor %lu]\n", EXT_MAX_BLOCK,
+ "[ino:orig %lu, donor %lu]\n", EXT_MAX_BLOCKS,
orig_inode->i_ino, donor_inode->i_ino);
return -EINVAL;
}
@@ -1069,73 +1203,6 @@ mext_check_arguments(struct inode *orig_inode,
}
/**
- * mext_inode_double_lock - Lock i_mutex on both @inode1 and @inode2
- *
- * @inode1: the inode structure
- * @inode2: the inode structure
- *
- * Lock two inodes' i_mutex by i_ino order.
- * If inode1 or inode2 is NULL, return -EIO. Otherwise, return 0.
- */
-static int
-mext_inode_double_lock(struct inode *inode1, struct inode *inode2)
-{
- int ret = 0;
-
- BUG_ON(inode1 == NULL && inode2 == NULL);
-
- ret = mext_check_null_inode(inode1, inode2, __func__, __LINE__);
- if (ret < 0)
- goto out;
-
- if (inode1 == inode2) {
- mutex_lock(&inode1->i_mutex);
- goto out;
- }
-
- if (inode1->i_ino < inode2->i_ino) {
- mutex_lock_nested(&inode1->i_mutex, I_MUTEX_PARENT);
- mutex_lock_nested(&inode2->i_mutex, I_MUTEX_CHILD);
- } else {
- mutex_lock_nested(&inode2->i_mutex, I_MUTEX_PARENT);
- mutex_lock_nested(&inode1->i_mutex, I_MUTEX_CHILD);
- }
-
-out:
- return ret;
-}
-
-/**
- * mext_inode_double_unlock - Release i_mutex on both @inode1 and @inode2
- *
- * @inode1: the inode that is released first
- * @inode2: the inode that is released second
- *
- * If inode1 or inode2 is NULL, return -EIO. Otherwise, return 0.
- */
-
-static int
-mext_inode_double_unlock(struct inode *inode1, struct inode *inode2)
-{
- int ret = 0;
-
- BUG_ON(inode1 == NULL && inode2 == NULL);
-
- ret = mext_check_null_inode(inode1, inode2, __func__, __LINE__);
- if (ret < 0)
- goto out;
-
- if (inode1)
- mutex_unlock(&inode1->i_mutex);
-
- if (inode2 && inode2 != inode1)
- mutex_unlock(&inode2->i_mutex);
-
-out:
- return ret;
-}
-
-/**
* ext4_move_extents - Exchange the specified range of a file
*
* @o_filp: file structure of the original file
@@ -1181,24 +1248,31 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
__u64 orig_start, __u64 donor_start, __u64 len,
__u64 *moved_len)
{
- struct inode *orig_inode = o_filp->f_dentry->d_inode;
- struct inode *donor_inode = d_filp->f_dentry->d_inode;
+ struct inode *orig_inode = file_inode(o_filp);
+ struct inode *donor_inode = file_inode(d_filp);
struct ext4_ext_path *orig_path = NULL, *holecheck_path = NULL;
struct ext4_extent *ext_prev, *ext_cur, *ext_dummy;
ext4_lblk_t block_start = orig_start;
ext4_lblk_t block_end, seq_start, add_blocks, file_end, seq_blocks = 0;
ext4_lblk_t rest_blocks;
pgoff_t orig_page_offset = 0, seq_end_page;
- int ret1, ret2, depth, last_extent = 0;
+ int ret, depth, last_extent = 0;
int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits;
int data_offset_in_page;
int block_len_in_page;
- int uninit;
+ int unwritten;
- /* orig and donor should be different file */
- if (orig_inode->i_ino == donor_inode->i_ino) {
+ if (orig_inode->i_sb != donor_inode->i_sb) {
+ ext4_debug("ext4 move extent: The argument files "
+ "should be in same FS [ino:orig %lu, donor %lu]\n",
+ orig_inode->i_ino, donor_inode->i_ino);
+ return -EINVAL;
+ }
+
+ /* orig and donor should be different inodes */
+ if (orig_inode == donor_inode) {
ext4_debug("ext4 move extent: The argument files should not "
- "be same file [ino:orig %lu, donor %lu]\n",
+ "be same inode [ino:orig %lu, donor %lu]\n",
orig_inode->i_ino, donor_inode->i_ino);
return -EINVAL;
}
@@ -1210,18 +1284,27 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
orig_inode->i_ino, donor_inode->i_ino);
return -EINVAL;
}
-
+ /* TODO: This is non obvious task to swap blocks for inodes with full
+ jornaling enabled */
+ if (ext4_should_journal_data(orig_inode) ||
+ ext4_should_journal_data(donor_inode)) {
+ return -EINVAL;
+ }
/* Protect orig and donor inodes against a truncate */
- ret1 = mext_inode_double_lock(orig_inode, donor_inode);
- if (ret1 < 0)
- return ret1;
+ lock_two_nondirectories(orig_inode, donor_inode);
+
+ /* Wait for all existing dio workers */
+ ext4_inode_block_unlocked_dio(orig_inode);
+ ext4_inode_block_unlocked_dio(donor_inode);
+ inode_dio_wait(orig_inode);
+ inode_dio_wait(donor_inode);
/* Protect extent tree against block allocations via delalloc */
- double_down_write_data_sem(orig_inode, donor_inode);
+ ext4_double_down_write_data_sem(orig_inode, donor_inode);
/* Check the filesystem environment whether move_extent can be done */
- ret1 = mext_check_arguments(orig_inode, donor_inode, orig_start,
+ ret = mext_check_arguments(orig_inode, donor_inode, orig_start,
donor_start, &len);
- if (ret1)
+ if (ret)
goto out;
file_end = (i_size_read(orig_inode) - 1) >> orig_inode->i_blkbits;
@@ -1229,13 +1312,13 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
if (file_end < block_end)
len -= block_end - file_end;
- ret1 = get_ext_path(orig_inode, block_start, &orig_path);
- if (ret1)
+ ret = get_ext_path(orig_inode, block_start, &orig_path);
+ if (ret)
goto out;
/* Get path structure to check the hole */
- ret1 = get_ext_path(orig_inode, block_start, &holecheck_path);
- if (ret1)
+ ret = get_ext_path(orig_inode, block_start, &holecheck_path);
+ if (ret)
goto out;
depth = ext_depth(orig_inode);
@@ -1254,13 +1337,13 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
last_extent = mext_next_extent(orig_inode,
holecheck_path, &ext_cur);
if (last_extent < 0) {
- ret1 = last_extent;
+ ret = last_extent;
goto out;
}
last_extent = mext_next_extent(orig_inode, orig_path,
&ext_dummy);
if (last_extent < 0) {
- ret1 = last_extent;
+ ret = last_extent;
goto out;
}
seq_start = le32_to_cpu(ext_cur->ee_block);
@@ -1274,7 +1357,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
if (le32_to_cpu(ext_cur->ee_block) > block_end) {
ext4_debug("ext4 move extent: The specified range of file "
"may be the hole\n");
- ret1 = -EINVAL;
+ ret = -EINVAL;
goto out;
}
@@ -1294,7 +1377,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
last_extent = mext_next_extent(orig_inode, holecheck_path,
&ext_cur);
if (last_extent < 0) {
- ret1 = last_extent;
+ ret = last_extent;
break;
}
add_blocks = ext4_ext_get_actual_len(ext_cur);
@@ -1309,8 +1392,8 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
!last_extent)
continue;
- /* Is original extent is uninitialized */
- uninit = ext4_ext_is_uninitialized(ext_prev);
+ /* Is original extent is unwritten */
+ unwritten = ext4_ext_is_unwritten(ext_prev);
data_offset_in_page = seq_start % blocks_per_page;
@@ -1341,7 +1424,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
* b. racing with ->readpage, ->write_begin, and ext4_get_block
* in move_extent_per_page
*/
- double_up_write_data_sem(orig_inode, donor_inode);
+ ext4_double_up_write_data_sem(orig_inode, donor_inode);
while (orig_page_offset <= seq_end_page) {
@@ -1350,19 +1433,19 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
o_filp, donor_inode,
orig_page_offset,
data_offset_in_page,
- block_len_in_page, uninit,
- &ret1);
+ block_len_in_page,
+ unwritten, &ret);
/* Count how many blocks we have exchanged */
*moved_len += block_len_in_page;
- if (ret1 < 0)
+ if (ret < 0)
break;
if (*moved_len > len) {
EXT4_ERROR_INODE(orig_inode,
"We replaced blocks too much! "
"sum of replaced: %llu requested: %llu",
*moved_len, len);
- ret1 = -EIO;
+ ret = -EIO;
break;
}
@@ -1375,23 +1458,23 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
block_len_in_page = rest_blocks;
}
- double_down_write_data_sem(orig_inode, donor_inode);
- if (ret1 < 0)
+ ext4_double_down_write_data_sem(orig_inode, donor_inode);
+ if (ret < 0)
break;
/* Decrease buffer counter */
if (holecheck_path)
ext4_ext_drop_refs(holecheck_path);
- ret1 = get_ext_path(orig_inode, seq_start, &holecheck_path);
- if (ret1)
+ ret = get_ext_path(orig_inode, seq_start, &holecheck_path);
+ if (ret)
break;
depth = holecheck_path->p_depth;
/* Decrease buffer counter */
if (orig_path)
ext4_ext_drop_refs(orig_path);
- ret1 = get_ext_path(orig_inode, seq_start, &orig_path);
- if (ret1)
+ ret = get_ext_path(orig_inode, seq_start, &orig_path);
+ if (ret)
break;
ext_cur = holecheck_path[depth].p_ext;
@@ -1413,13 +1496,10 @@ out:
ext4_ext_drop_refs(holecheck_path);
kfree(holecheck_path);
}
- double_up_write_data_sem(orig_inode, donor_inode);
- ret2 = mext_inode_double_unlock(orig_inode, donor_inode);
+ ext4_double_up_write_data_sem(orig_inode, donor_inode);
+ ext4_inode_resume_unlocked_dio(orig_inode);
+ ext4_inode_resume_unlocked_dio(donor_inode);
+ unlock_two_nondirectories(orig_inode, donor_inode);
- if (ret1)
- return ret1;
- else if (ret2)
- return ret2;
-
- return 0;
+ return ret;
}
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 92203b8a099..3520ab8a663 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -40,30 +40,118 @@
#include "xattr.h"
#include "acl.h"
+#include <trace/events/ext4.h>
/*
* define how far ahead to read directories while searching them.
*/
#define NAMEI_RA_CHUNKS 2
#define NAMEI_RA_BLOCKS 4
#define NAMEI_RA_SIZE (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS)
-#define NAMEI_RA_INDEX(c,b) (((c) * NAMEI_RA_BLOCKS) + (b))
static struct buffer_head *ext4_append(handle_t *handle,
struct inode *inode,
- ext4_lblk_t *block, int *err)
+ ext4_lblk_t *block)
{
struct buffer_head *bh;
+ int err = 0;
+
+ if (unlikely(EXT4_SB(inode->i_sb)->s_max_dir_size_kb &&
+ ((inode->i_size >> 10) >=
+ EXT4_SB(inode->i_sb)->s_max_dir_size_kb)))
+ return ERR_PTR(-ENOSPC);
*block = inode->i_size >> inode->i_sb->s_blocksize_bits;
- bh = ext4_bread(handle, inode, *block, 1, err);
- if (bh) {
- inode->i_size += inode->i_sb->s_blocksize;
- EXT4_I(inode)->i_disksize = inode->i_size;
- *err = ext4_journal_get_write_access(handle, bh);
- if (*err) {
+ bh = ext4_bread(handle, inode, *block, 1, &err);
+ if (!bh)
+ return ERR_PTR(err);
+ inode->i_size += inode->i_sb->s_blocksize;
+ EXT4_I(inode)->i_disksize = inode->i_size;
+ BUFFER_TRACE(bh, "get_write_access");
+ err = ext4_journal_get_write_access(handle, bh);
+ if (err) {
+ brelse(bh);
+ ext4_std_error(inode->i_sb, err);
+ return ERR_PTR(err);
+ }
+ return bh;
+}
+
+static int ext4_dx_csum_verify(struct inode *inode,
+ struct ext4_dir_entry *dirent);
+
+typedef enum {
+ EITHER, INDEX, DIRENT
+} dirblock_type_t;
+
+#define ext4_read_dirblock(inode, block, type) \
+ __ext4_read_dirblock((inode), (block), (type), __LINE__)
+
+static struct buffer_head *__ext4_read_dirblock(struct inode *inode,
+ ext4_lblk_t block,
+ dirblock_type_t type,
+ unsigned int line)
+{
+ struct buffer_head *bh;
+ struct ext4_dir_entry *dirent;
+ int err = 0, is_dx_block = 0;
+
+ bh = ext4_bread(NULL, inode, block, 0, &err);
+ if (!bh) {
+ if (err == 0) {
+ ext4_error_inode(inode, __func__, line, block,
+ "Directory hole found");
+ return ERR_PTR(-EIO);
+ }
+ __ext4_warning(inode->i_sb, __func__, line,
+ "error reading directory block "
+ "(ino %lu, block %lu)", inode->i_ino,
+ (unsigned long) block);
+ return ERR_PTR(err);
+ }
+ dirent = (struct ext4_dir_entry *) bh->b_data;
+ /* Determine whether or not we have an index block */
+ if (is_dx(inode)) {
+ if (block == 0)
+ is_dx_block = 1;
+ else if (ext4_rec_len_from_disk(dirent->rec_len,
+ inode->i_sb->s_blocksize) ==
+ inode->i_sb->s_blocksize)
+ is_dx_block = 1;
+ }
+ if (!is_dx_block && type == INDEX) {
+ ext4_error_inode(inode, __func__, line, block,
+ "directory leaf block found instead of index block");
+ return ERR_PTR(-EIO);
+ }
+ if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
+ EXT4_FEATURE_RO_COMPAT_METADATA_CSUM) ||
+ buffer_verified(bh))
+ return bh;
+
+ /*
+ * An empty leaf block can get mistaken for a index block; for
+ * this reason, we can only check the index checksum when the
+ * caller is sure it should be an index block.
+ */
+ if (is_dx_block && type == INDEX) {
+ if (ext4_dx_csum_verify(inode, dirent))
+ set_buffer_verified(bh);
+ else {
+ ext4_error_inode(inode, __func__, line, block,
+ "Directory index failed checksum");
brelse(bh);
- bh = NULL;
+ return ERR_PTR(-EIO);
+ }
+ }
+ if (!is_dx_block) {
+ if (ext4_dirent_csum_verify(inode, dirent))
+ set_buffer_verified(bh);
+ else {
+ ext4_error_inode(inode, __func__, line, block,
+ "Directory block failed checksum");
+ brelse(bh);
+ return ERR_PTR(-EIO);
}
}
return bh;
@@ -144,6 +232,14 @@ struct dx_map_entry
u16 size;
};
+/*
+ * This goes at the end of each htree block.
+ */
+struct dx_tail {
+ u32 dt_reserved;
+ __le32 dt_checksum; /* crc32c(uuid+inum+dirblock) */
+};
+
static inline ext4_lblk_t dx_get_block(struct dx_entry *entry);
static void dx_set_block(struct dx_entry *entry, ext4_lblk_t value);
static inline unsigned dx_get_hash(struct dx_entry *entry);
@@ -179,6 +275,228 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir,
static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
struct inode *inode);
+/* checksumming functions */
+void initialize_dirent_tail(struct ext4_dir_entry_tail *t,
+ unsigned int blocksize)
+{
+ memset(t, 0, sizeof(struct ext4_dir_entry_tail));
+ t->det_rec_len = ext4_rec_len_to_disk(
+ sizeof(struct ext4_dir_entry_tail), blocksize);
+ t->det_reserved_ft = EXT4_FT_DIR_CSUM;
+}
+
+/* Walk through a dirent block to find a checksum "dirent" at the tail */
+static struct ext4_dir_entry_tail *get_dirent_tail(struct inode *inode,
+ struct ext4_dir_entry *de)
+{
+ struct ext4_dir_entry_tail *t;
+
+#ifdef PARANOID
+ struct ext4_dir_entry *d, *top;
+
+ d = de;
+ top = (struct ext4_dir_entry *)(((void *)de) +
+ (EXT4_BLOCK_SIZE(inode->i_sb) -
+ sizeof(struct ext4_dir_entry_tail)));
+ while (d < top && d->rec_len)
+ d = (struct ext4_dir_entry *)(((void *)d) +
+ le16_to_cpu(d->rec_len));
+
+ if (d != top)
+ return NULL;
+
+ t = (struct ext4_dir_entry_tail *)d;
+#else
+ t = EXT4_DIRENT_TAIL(de, EXT4_BLOCK_SIZE(inode->i_sb));
+#endif
+
+ if (t->det_reserved_zero1 ||
+ le16_to_cpu(t->det_rec_len) != sizeof(struct ext4_dir_entry_tail) ||
+ t->det_reserved_zero2 ||
+ t->det_reserved_ft != EXT4_FT_DIR_CSUM)
+ return NULL;
+
+ return t;
+}
+
+static __le32 ext4_dirent_csum(struct inode *inode,
+ struct ext4_dir_entry *dirent, int size)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ __u32 csum;
+
+ csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)dirent, size);
+ return cpu_to_le32(csum);
+}
+
+static void warn_no_space_for_csum(struct inode *inode)
+{
+ ext4_warning(inode->i_sb, "no space in directory inode %lu leaf for "
+ "checksum. Please run e2fsck -D.", inode->i_ino);
+}
+
+int ext4_dirent_csum_verify(struct inode *inode, struct ext4_dir_entry *dirent)
+{
+ struct ext4_dir_entry_tail *t;
+
+ if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
+ EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+ return 1;
+
+ t = get_dirent_tail(inode, dirent);
+ if (!t) {
+ warn_no_space_for_csum(inode);
+ return 0;
+ }
+
+ if (t->det_checksum != ext4_dirent_csum(inode, dirent,
+ (void *)t - (void *)dirent))
+ return 0;
+
+ return 1;
+}
+
+static void ext4_dirent_csum_set(struct inode *inode,
+ struct ext4_dir_entry *dirent)
+{
+ struct ext4_dir_entry_tail *t;
+
+ if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
+ EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+ return;
+
+ t = get_dirent_tail(inode, dirent);
+ if (!t) {
+ warn_no_space_for_csum(inode);
+ return;
+ }
+
+ t->det_checksum = ext4_dirent_csum(inode, dirent,
+ (void *)t - (void *)dirent);
+}
+
+int ext4_handle_dirty_dirent_node(handle_t *handle,
+ struct inode *inode,
+ struct buffer_head *bh)
+{
+ ext4_dirent_csum_set(inode, (struct ext4_dir_entry *)bh->b_data);
+ return ext4_handle_dirty_metadata(handle, inode, bh);
+}
+
+static struct dx_countlimit *get_dx_countlimit(struct inode *inode,
+ struct ext4_dir_entry *dirent,
+ int *offset)
+{
+ struct ext4_dir_entry *dp;
+ struct dx_root_info *root;
+ int count_offset;
+
+ if (le16_to_cpu(dirent->rec_len) == EXT4_BLOCK_SIZE(inode->i_sb))
+ count_offset = 8;
+ else if (le16_to_cpu(dirent->rec_len) == 12) {
+ dp = (struct ext4_dir_entry *)(((void *)dirent) + 12);
+ if (le16_to_cpu(dp->rec_len) !=
+ EXT4_BLOCK_SIZE(inode->i_sb) - 12)
+ return NULL;
+ root = (struct dx_root_info *)(((void *)dp + 12));
+ if (root->reserved_zero ||
+ root->info_length != sizeof(struct dx_root_info))
+ return NULL;
+ count_offset = 32;
+ } else
+ return NULL;
+
+ if (offset)
+ *offset = count_offset;
+ return (struct dx_countlimit *)(((void *)dirent) + count_offset);
+}
+
+static __le32 ext4_dx_csum(struct inode *inode, struct ext4_dir_entry *dirent,
+ int count_offset, int count, struct dx_tail *t)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ __u32 csum;
+ __le32 save_csum;
+ int size;
+
+ size = count_offset + (count * sizeof(struct dx_entry));
+ save_csum = t->dt_checksum;
+ t->dt_checksum = 0;
+ csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)dirent, size);
+ csum = ext4_chksum(sbi, csum, (__u8 *)t, sizeof(struct dx_tail));
+ t->dt_checksum = save_csum;
+
+ return cpu_to_le32(csum);
+}
+
+static int ext4_dx_csum_verify(struct inode *inode,
+ struct ext4_dir_entry *dirent)
+{
+ struct dx_countlimit *c;
+ struct dx_tail *t;
+ int count_offset, limit, count;
+
+ if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
+ EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+ return 1;
+
+ c = get_dx_countlimit(inode, dirent, &count_offset);
+ if (!c) {
+ EXT4_ERROR_INODE(inode, "dir seems corrupt? Run e2fsck -D.");
+ return 1;
+ }
+ limit = le16_to_cpu(c->limit);
+ count = le16_to_cpu(c->count);
+ if (count_offset + (limit * sizeof(struct dx_entry)) >
+ EXT4_BLOCK_SIZE(inode->i_sb) - sizeof(struct dx_tail)) {
+ warn_no_space_for_csum(inode);
+ return 1;
+ }
+ t = (struct dx_tail *)(((struct dx_entry *)c) + limit);
+
+ if (t->dt_checksum != ext4_dx_csum(inode, dirent, count_offset,
+ count, t))
+ return 0;
+ return 1;
+}
+
+static void ext4_dx_csum_set(struct inode *inode, struct ext4_dir_entry *dirent)
+{
+ struct dx_countlimit *c;
+ struct dx_tail *t;
+ int count_offset, limit, count;
+
+ if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
+ EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+ return;
+
+ c = get_dx_countlimit(inode, dirent, &count_offset);
+ if (!c) {
+ EXT4_ERROR_INODE(inode, "dir seems corrupt? Run e2fsck -D.");
+ return;
+ }
+ limit = le16_to_cpu(c->limit);
+ count = le16_to_cpu(c->count);
+ if (count_offset + (limit * sizeof(struct dx_entry)) >
+ EXT4_BLOCK_SIZE(inode->i_sb) - sizeof(struct dx_tail)) {
+ warn_no_space_for_csum(inode);
+ return;
+ }
+ t = (struct dx_tail *)(((struct dx_entry *)c) + limit);
+
+ t->dt_checksum = ext4_dx_csum(inode, dirent, count_offset, count, t);
+}
+
+static inline int ext4_handle_dirty_dx_node(handle_t *handle,
+ struct inode *inode,
+ struct buffer_head *bh)
+{
+ ext4_dx_csum_set(inode, (struct ext4_dir_entry *)bh->b_data);
+ return ext4_handle_dirty_metadata(handle, inode, bh);
+}
+
/*
* p is at least 6 bytes before the end of page
*/
@@ -238,12 +556,20 @@ static inline unsigned dx_root_limit(struct inode *dir, unsigned infosize)
{
unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(1) -
EXT4_DIR_REC_LEN(2) - infosize;
+
+ if (EXT4_HAS_RO_COMPAT_FEATURE(dir->i_sb,
+ EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+ entry_space -= sizeof(struct dx_tail);
return entry_space / sizeof(struct dx_entry);
}
static inline unsigned dx_node_limit(struct inode *dir)
{
unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(0);
+
+ if (EXT4_HAS_RO_COMPAT_FEATURE(dir->i_sb,
+ EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+ entry_space -= sizeof(struct dx_tail);
return entry_space / sizeof(struct dx_entry);
}
@@ -288,7 +614,7 @@ static struct stats dx_show_leaf(struct dx_hash_info *hinfo, struct ext4_dir_ent
while (len--) printk("%c", *name++);
ext4fs_dirhash(de->name, de->name_len, &h);
printk(":%x.%u ", h.hash,
- ((char *) de - base));
+ (unsigned) ((char *) de - base));
}
space += EXT4_DIR_REC_LEN(de->name_len);
names++;
@@ -353,8 +679,11 @@ dx_probe(const struct qstr *d_name, struct inode *dir,
u32 hash;
frame->bh = NULL;
- if (!(bh = ext4_bread (NULL,dir, 0, 0, err)))
+ bh = ext4_read_dirblock(dir, 0, INDEX);
+ if (IS_ERR(bh)) {
+ *err = PTR_ERR(bh);
goto fail;
+ }
root = (struct dx_root *) bh->b_data;
if (root->info.hash_version != DX_HASH_TEA &&
root->info.hash_version != DX_HASH_HALF_MD4 &&
@@ -446,9 +775,13 @@ dx_probe(const struct qstr *d_name, struct inode *dir,
frame->entries = entries;
frame->at = at;
if (!indirect--) return frame;
- if (!(bh = ext4_bread (NULL,dir, dx_get_block(at), 0, err)))
+ bh = ext4_read_dirblock(dir, dx_get_block(at), INDEX);
+ if (IS_ERR(bh)) {
+ *err = PTR_ERR(bh);
goto fail2;
- at = entries = ((struct dx_node *) bh->b_data)->entries;
+ }
+ entries = ((struct dx_node *) bh->b_data)->entries;
+
if (dx_get_limit(entries) != dx_node_limit (dir)) {
ext4_warning(dir->i_sb,
"dx entry: limit != node limit");
@@ -467,7 +800,7 @@ fail2:
fail:
if (*err == ERR_BAD_DX_DIR)
ext4_warning(dir->i_sb,
- "Corrupt dir inode %ld, running e2fsck is "
+ "Corrupt dir inode %lu, running e2fsck is "
"recommended.", dir->i_ino);
return NULL;
}
@@ -506,7 +839,7 @@ static int ext4_htree_next_block(struct inode *dir, __u32 hash,
{
struct dx_frame *p;
struct buffer_head *bh;
- int err, num_frames = 0;
+ int num_frames = 0;
__u32 bhash;
p = frame;
@@ -545,9 +878,9 @@ static int ext4_htree_next_block(struct inode *dir, __u32 hash,
* block so no check is necessary
*/
while (num_frames--) {
- if (!(bh = ext4_bread(NULL, dir, dx_get_block(p->at),
- 0, &err)))
- return err; /* Failure */
+ bh = ext4_read_dirblock(dir, dx_get_block(p->at), INDEX);
+ if (IS_ERR(bh))
+ return PTR_ERR(bh);
p++;
brelse(p->bh);
p->bh = bh;
@@ -569,26 +902,25 @@ static int htree_dirblock_to_tree(struct file *dir_file,
{
struct buffer_head *bh;
struct ext4_dir_entry_2 *de, *top;
- int err, count = 0;
+ int err = 0, count = 0;
dxtrace(printk(KERN_INFO "In htree dirblock_to_tree: block %lu\n",
(unsigned long)block));
- if (!(bh = ext4_bread (NULL, dir, block, 0, &err)))
- return err;
+ bh = ext4_read_dirblock(dir, block, DIRENT);
+ if (IS_ERR(bh))
+ return PTR_ERR(bh);
de = (struct ext4_dir_entry_2 *) bh->b_data;
top = (struct ext4_dir_entry_2 *) ((char *) de +
dir->i_sb->s_blocksize -
EXT4_DIR_REC_LEN(0));
for (; de < top; de = ext4_next_entry(de, dir->i_sb->s_blocksize)) {
- if (!ext4_check_dir_entry(dir, de, bh,
- (block<<EXT4_BLOCK_SIZE_BITS(dir->i_sb))
- +((char *)de - bh->b_data))) {
- /* On error, skip the f_pos to the next block. */
- dir_file->f_pos = (dir_file->f_pos |
- (dir->i_sb->s_blocksize - 1)) + 1;
- brelse(bh);
- return count;
+ if (ext4_check_dir_entry(dir, NULL, de, bh,
+ bh->b_data, bh->b_size,
+ (block<<EXT4_BLOCK_SIZE_BITS(dir->i_sb))
+ + ((char *)de - bh->b_data))) {
+ /* silently ignore the rest of the block */
+ break;
}
ext4fs_dirhash(de->name, de->name_len, hinfo);
if ((hinfo->hash < start_hash) ||
@@ -631,13 +963,24 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
dxtrace(printk(KERN_DEBUG "In htree_fill_tree, start hash: %x:%x\n",
start_hash, start_minor_hash));
- dir = dir_file->f_path.dentry->d_inode;
+ dir = file_inode(dir_file);
if (!(ext4_test_inode_flag(dir, EXT4_INODE_INDEX))) {
hinfo.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version;
if (hinfo.hash_version <= DX_HASH_TEA)
hinfo.hash_version +=
EXT4_SB(dir->i_sb)->s_hash_unsigned;
hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed;
+ if (ext4_has_inline_data(dir)) {
+ int has_inline_data = 1;
+ count = htree_inlinedir_to_tree(dir_file, dir, 0,
+ &hinfo, start_hash,
+ start_minor_hash,
+ &has_inline_data);
+ if (has_inline_data) {
+ *next_hash = ~0;
+ return count;
+ }
+ }
count = htree_dirblock_to_tree(dir_file, dir, 0, &hinfo,
start_hash, start_minor_hash);
*next_hash = ~0;
@@ -699,6 +1042,15 @@ errout:
return (err);
}
+static inline int search_dirblock(struct buffer_head *bh,
+ struct inode *dir,
+ const struct qstr *d_name,
+ unsigned int offset,
+ struct ext4_dir_entry_2 **res_dir)
+{
+ return search_dir(bh, bh->b_data, dir->i_sb->s_blocksize, dir,
+ d_name, offset, res_dir);
+}
/*
* Directory block splitting, compacting
@@ -773,13 +1125,6 @@ static void dx_insert_block(struct dx_frame *frame, u32 hash, ext4_lblk_t block)
dx_set_count(entries, count + 1);
}
-static void ext4_update_dx_flag(struct inode *inode)
-{
- if (!EXT4_HAS_COMPAT_FEATURE(inode->i_sb,
- EXT4_FEATURE_COMPAT_DIR_INDEX))
- ext4_clear_inode_flag(inode, EXT4_INODE_INDEX);
-}
-
/*
* NOTE! unlike strncmp, ext4_match returns 1 for success, 0 for failure.
*
@@ -799,11 +1144,13 @@ static inline int ext4_match (int len, const char * const name,
/*
* Returns 0 if not found, -1 on failure, and 1 on success
*/
-static inline int search_dirblock(struct buffer_head *bh,
- struct inode *dir,
- const struct qstr *d_name,
- unsigned int offset,
- struct ext4_dir_entry_2 ** res_dir)
+int search_dir(struct buffer_head *bh,
+ char *search_buf,
+ int buf_size,
+ struct inode *dir,
+ const struct qstr *d_name,
+ unsigned int offset,
+ struct ext4_dir_entry_2 **res_dir)
{
struct ext4_dir_entry_2 * de;
char * dlimit;
@@ -811,8 +1158,8 @@ static inline int search_dirblock(struct buffer_head *bh,
const char *name = d_name->name;
int namelen = d_name->len;
- de = (struct ext4_dir_entry_2 *) bh->b_data;
- dlimit = bh->b_data + dir->i_sb->s_blocksize;
+ de = (struct ext4_dir_entry_2 *)search_buf;
+ dlimit = search_buf + buf_size;
while ((char *) de < dlimit) {
/* this code is executed quadratically often */
/* do minimal checking `by hand' */
@@ -820,7 +1167,8 @@ static inline int search_dirblock(struct buffer_head *bh,
if ((char *) de + namelen <= dlimit &&
ext4_match (namelen, name, de)) {
/* found a match - just to be sure, do a full check */
- if (!ext4_check_dir_entry(dir, de, bh, offset))
+ if (ext4_check_dir_entry(dir, NULL, de, bh, bh->b_data,
+ bh->b_size, offset))
return -1;
*res_dir = de;
return 1;
@@ -836,6 +1184,21 @@ static inline int search_dirblock(struct buffer_head *bh,
return 0;
}
+static int is_dx_internal_node(struct inode *dir, ext4_lblk_t block,
+ struct ext4_dir_entry *de)
+{
+ struct super_block *sb = dir->i_sb;
+
+ if (!is_dx(dir))
+ return 0;
+ if (block == 0)
+ return 1;
+ if (de->inode == 0 &&
+ ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize) ==
+ sb->s_blocksize)
+ return 1;
+ return 0;
+}
/*
* ext4_find_entry()
@@ -850,7 +1213,8 @@ static inline int search_dirblock(struct buffer_head *bh,
*/
static struct buffer_head * ext4_find_entry (struct inode *dir,
const struct qstr *d_name,
- struct ext4_dir_entry_2 ** res_dir)
+ struct ext4_dir_entry_2 **res_dir,
+ int *inlined)
{
struct super_block *sb;
struct buffer_head *bh_use[NAMEI_RA_SIZE];
@@ -871,8 +1235,20 @@ static struct buffer_head * ext4_find_entry (struct inode *dir,
namelen = d_name->len;
if (namelen > EXT4_NAME_LEN)
return NULL;
+
+ if (ext4_has_inline_data(dir)) {
+ int has_inline_data = 1;
+ ret = ext4_find_inline_entry(dir, d_name, res_dir,
+ &has_inline_data);
+ if (has_inline_data) {
+ if (inlined)
+ *inlined = 1;
+ return ret;
+ }
+ }
+
if ((namelen <= 2) && (name[0] == '.') &&
- (name[1] == '.' || name[1] == '0')) {
+ (name[1] == '.' || name[1] == '\0')) {
/*
* "." or ".." will only be in the first block
* NFS may look up ".."; "." should be handled by the VFS
@@ -921,7 +1297,8 @@ restart:
bh = ext4_getblk(NULL, dir, b++, 0, &err);
bh_use[ra_max] = bh;
if (bh)
- ll_rw_block(READ_META, 1, &bh);
+ ll_rw_block(READ | REQ_META | REQ_PRIO,
+ 1, &bh);
}
}
if ((bh = bh_use[ra_ptr++]) == NULL)
@@ -934,6 +1311,17 @@ restart:
brelse(bh);
goto next;
}
+ if (!buffer_verified(bh) &&
+ !is_dx_internal_node(dir, block,
+ (struct ext4_dir_entry *)bh->b_data) &&
+ !ext4_dirent_csum_verify(dir,
+ (struct ext4_dir_entry *)bh->b_data)) {
+ EXT4_ERROR_INODE(dir, "checksumming directory "
+ "block %lu", (unsigned long)block);
+ brelse(bh);
+ goto next;
+ }
+ set_buffer_verified(bh);
i = search_dirblock(bh, dir, d_name,
block << EXT4_BLOCK_SIZE_BITS(sb), res_dir);
if (i == 1) {
@@ -982,9 +1370,11 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct q
return NULL;
do {
block = dx_get_block(frame->at);
- if (!(bh = ext4_bread(NULL, dir, block, 0, err)))
+ bh = ext4_read_dirblock(dir, block, DIRENT);
+ if (IS_ERR(bh)) {
+ *err = PTR_ERR(bh);
goto errout;
-
+ }
retval = search_dirblock(bh, dir, d_name,
block << EXT4_BLOCK_SIZE_BITS(sb),
res_dir);
@@ -1012,12 +1402,12 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct q
*err = -ENOENT;
errout:
- dxtrace(printk(KERN_DEBUG "%s not found\n", name));
+ dxtrace(printk(KERN_DEBUG "%s not found\n", d_name->name));
dx_release (frames);
return NULL;
}
-static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
+static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
{
struct inode *inode;
struct ext4_dir_entry_2 *de;
@@ -1026,7 +1416,7 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, stru
if (dentry->d_name.len > EXT4_NAME_LEN)
return ERR_PTR(-ENAMETOOLONG);
- bh = ext4_find_entry(dir, &dentry->d_name, &de);
+ bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL);
inode = NULL;
if (bh) {
__u32 ino = le32_to_cpu(de->inode);
@@ -1035,16 +1425,17 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, stru
EXT4_ERROR_INODE(dir, "bad inode number: %u", ino);
return ERR_PTR(-EIO);
}
+ if (unlikely(ino == dir->i_ino)) {
+ EXT4_ERROR_INODE(dir, "'%pd' linked to parent dir",
+ dentry);
+ return ERR_PTR(-EIO);
+ }
inode = ext4_iget(dir->i_sb, ino);
- if (unlikely(IS_ERR(inode))) {
- if (PTR_ERR(inode) == -ESTALE) {
- EXT4_ERROR_INODE(dir,
- "deleted inode referenced: %u",
- ino);
- return ERR_PTR(-EIO);
- } else {
- return ERR_CAST(inode);
- }
+ if (inode == ERR_PTR(-ESTALE)) {
+ EXT4_ERROR_INODE(dir,
+ "deleted inode referenced: %u",
+ ino);
+ return ERR_PTR(-EIO);
}
}
return d_splice_alias(inode, dentry);
@@ -1054,14 +1445,11 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, stru
struct dentry *ext4_get_parent(struct dentry *child)
{
__u32 ino;
- static const struct qstr dotdot = {
- .name = "..",
- .len = 2,
- };
+ static const struct qstr dotdot = QSTR_INIT("..", 2);
struct ext4_dir_entry_2 * de;
struct buffer_head *bh;
- bh = ext4_find_entry(child->d_inode, &dotdot, &de);
+ bh = ext4_find_entry(child->d_inode, &dotdot, &de, NULL);
if (!bh)
return ERR_PTR(-ENOENT);
ino = le32_to_cpu(de->inode);
@@ -1076,24 +1464,6 @@ struct dentry *ext4_get_parent(struct dentry *child)
return d_obtain_alias(ext4_iget(child->d_inode->i_sb, ino));
}
-#define S_SHIFT 12
-static unsigned char ext4_type_by_mode[S_IFMT >> S_SHIFT] = {
- [S_IFREG >> S_SHIFT] = EXT4_FT_REG_FILE,
- [S_IFDIR >> S_SHIFT] = EXT4_FT_DIR,
- [S_IFCHR >> S_SHIFT] = EXT4_FT_CHRDEV,
- [S_IFBLK >> S_SHIFT] = EXT4_FT_BLKDEV,
- [S_IFIFO >> S_SHIFT] = EXT4_FT_FIFO,
- [S_IFSOCK >> S_SHIFT] = EXT4_FT_SOCK,
- [S_IFLNK >> S_SHIFT] = EXT4_FT_SYMLINK,
-};
-
-static inline void ext4_set_de_type(struct super_block *sb,
- struct ext4_dir_entry_2 *de,
- umode_t mode) {
- if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FILETYPE))
- de->file_type = ext4_type_by_mode[(mode & S_IFMT)>>S_SHIFT];
-}
-
/*
* Move count entries from end of map between two memory locations.
* Returns pointer to last entry moved.
@@ -1161,13 +1531,20 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
char *data1 = (*bh)->b_data, *data2;
unsigned split, move, size;
struct ext4_dir_entry_2 *de = NULL, *de2;
+ struct ext4_dir_entry_tail *t;
+ int csum_size = 0;
int err = 0, i;
- bh2 = ext4_append (handle, dir, &newblock, &err);
- if (!(bh2)) {
+ if (EXT4_HAS_RO_COMPAT_FEATURE(dir->i_sb,
+ EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+ csum_size = sizeof(struct ext4_dir_entry_tail);
+
+ bh2 = ext4_append(handle, dir, &newblock);
+ if (IS_ERR(bh2)) {
brelse(*bh);
*bh = NULL;
- goto errout;
+ *error = PTR_ERR(bh2);
+ return NULL;
}
BUFFER_TRACE(*bh, "get_write_access");
@@ -1209,10 +1586,20 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
/* Fancy dance to stay within two buffers */
de2 = dx_move_dirents(data1, data2, map + split, count - split, blocksize);
de = dx_pack_dirents(data1, blocksize);
- de->rec_len = ext4_rec_len_to_disk(data1 + blocksize - (char *) de,
+ de->rec_len = ext4_rec_len_to_disk(data1 + (blocksize - csum_size) -
+ (char *) de,
blocksize);
- de2->rec_len = ext4_rec_len_to_disk(data2 + blocksize - (char *) de2,
+ de2->rec_len = ext4_rec_len_to_disk(data2 + (blocksize - csum_size) -
+ (char *) de2,
blocksize);
+ if (csum_size) {
+ t = EXT4_DIRENT_TAIL(data2, blocksize);
+ initialize_dirent_tail(t, blocksize);
+
+ t = EXT4_DIRENT_TAIL(data1, blocksize);
+ initialize_dirent_tail(t, blocksize);
+ }
+
dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data1, blocksize, 1));
dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data2, blocksize, 1));
@@ -1223,10 +1610,10 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
de = de2;
}
dx_insert_block(frame, hash2 + continued, newblock);
- err = ext4_handle_dirty_metadata(handle, dir, bh2);
+ err = ext4_handle_dirty_dirent_node(handle, dir, bh2);
if (err)
goto journal_error;
- err = ext4_handle_dirty_metadata(handle, dir, frame->bh);
+ err = ext4_handle_dirty_dx_node(handle, dir, frame->bh);
if (err)
goto journal_error;
brelse(bh2);
@@ -1238,11 +1625,67 @@ journal_error:
brelse(bh2);
*bh = NULL;
ext4_std_error(dir->i_sb, err);
-errout:
*error = err;
return NULL;
}
+int ext4_find_dest_de(struct inode *dir, struct inode *inode,
+ struct buffer_head *bh,
+ void *buf, int buf_size,
+ const char *name, int namelen,
+ struct ext4_dir_entry_2 **dest_de)
+{
+ struct ext4_dir_entry_2 *de;
+ unsigned short reclen = EXT4_DIR_REC_LEN(namelen);
+ int nlen, rlen;
+ unsigned int offset = 0;
+ char *top;
+
+ de = (struct ext4_dir_entry_2 *)buf;
+ top = buf + buf_size - reclen;
+ while ((char *) de <= top) {
+ if (ext4_check_dir_entry(dir, NULL, de, bh,
+ buf, buf_size, offset))
+ return -EIO;
+ if (ext4_match(namelen, name, de))
+ return -EEXIST;
+ nlen = EXT4_DIR_REC_LEN(de->name_len);
+ rlen = ext4_rec_len_from_disk(de->rec_len, buf_size);
+ if ((de->inode ? rlen - nlen : rlen) >= reclen)
+ break;
+ de = (struct ext4_dir_entry_2 *)((char *)de + rlen);
+ offset += rlen;
+ }
+ if ((char *) de > top)
+ return -ENOSPC;
+
+ *dest_de = de;
+ return 0;
+}
+
+void ext4_insert_dentry(struct inode *inode,
+ struct ext4_dir_entry_2 *de,
+ int buf_size,
+ const char *name, int namelen)
+{
+
+ int nlen, rlen;
+
+ nlen = EXT4_DIR_REC_LEN(de->name_len);
+ rlen = ext4_rec_len_from_disk(de->rec_len, buf_size);
+ if (de->inode) {
+ struct ext4_dir_entry_2 *de1 =
+ (struct ext4_dir_entry_2 *)((char *)de + nlen);
+ de1->rec_len = ext4_rec_len_to_disk(rlen - nlen, buf_size);
+ de->rec_len = ext4_rec_len_to_disk(nlen, buf_size);
+ de = de1;
+ }
+ de->file_type = EXT4_FT_UNKNOWN;
+ de->inode = cpu_to_le32(inode->i_ino);
+ ext4_set_de_type(inode->i_sb, de, inode->i_mode);
+ de->name_len = namelen;
+ memcpy(de->name, name, namelen);
+}
/*
* Add a new entry into a directory (leaf) block. If de is non-NULL,
* it points to a directory entry which is guaranteed to be large
@@ -1258,30 +1701,20 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
struct inode *dir = dentry->d_parent->d_inode;
const char *name = dentry->d_name.name;
int namelen = dentry->d_name.len;
- unsigned int offset = 0;
unsigned int blocksize = dir->i_sb->s_blocksize;
- unsigned short reclen;
- int nlen, rlen, err;
- char *top;
+ int csum_size = 0;
+ int err;
+
+ if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
+ EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+ csum_size = sizeof(struct ext4_dir_entry_tail);
- reclen = EXT4_DIR_REC_LEN(namelen);
if (!de) {
- de = (struct ext4_dir_entry_2 *)bh->b_data;
- top = bh->b_data + blocksize - reclen;
- while ((char *) de <= top) {
- if (!ext4_check_dir_entry(dir, de, bh, offset))
- return -EIO;
- if (ext4_match(namelen, name, de))
- return -EEXIST;
- nlen = EXT4_DIR_REC_LEN(de->name_len);
- rlen = ext4_rec_len_from_disk(de->rec_len, blocksize);
- if ((de->inode? rlen - nlen: rlen) >= reclen)
- break;
- de = (struct ext4_dir_entry_2 *)((char *)de + rlen);
- offset += rlen;
- }
- if ((char *) de > top)
- return -ENOSPC;
+ err = ext4_find_dest_de(dir, inode,
+ bh, bh->b_data, blocksize - csum_size,
+ name, namelen, &de);
+ if (err)
+ return err;
}
BUFFER_TRACE(bh, "get_write_access");
err = ext4_journal_get_write_access(handle, bh);
@@ -1291,22 +1724,8 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
}
/* By now the buffer is marked for journaling */
- nlen = EXT4_DIR_REC_LEN(de->name_len);
- rlen = ext4_rec_len_from_disk(de->rec_len, blocksize);
- if (de->inode) {
- struct ext4_dir_entry_2 *de1 = (struct ext4_dir_entry_2 *)((char *)de + nlen);
- de1->rec_len = ext4_rec_len_to_disk(rlen - nlen, blocksize);
- de->rec_len = ext4_rec_len_to_disk(nlen, blocksize);
- de = de1;
- }
- de->file_type = EXT4_FT_UNKNOWN;
- if (inode) {
- de->inode = cpu_to_le32(inode->i_ino);
- ext4_set_de_type(dir->i_sb, de, inode->i_mode);
- } else
- de->inode = 0;
- de->name_len = namelen;
- memcpy(de->name, name, namelen);
+ ext4_insert_dentry(inode, de, blocksize, name, namelen);
+
/*
* XXX shouldn't update any times until successful
* completion of syscall, but too many callers depend
@@ -1323,7 +1742,7 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
dir->i_version++;
ext4_mark_inode_dirty(handle, dir);
BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
- err = ext4_handle_dirty_metadata(handle, dir, bh);
+ err = ext4_handle_dirty_dirent_node(handle, dir, bh);
if (err)
ext4_std_error(dir->i_sb, err);
return 0;
@@ -1344,6 +1763,7 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
struct dx_frame frames[2], *frame;
struct dx_entry *entries;
struct ext4_dir_entry_2 *de, *de2;
+ struct ext4_dir_entry_tail *t;
char *data1, *top;
unsigned len;
int retval;
@@ -1351,9 +1771,15 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
struct dx_hash_info hinfo;
ext4_lblk_t block;
struct fake_dirent *fde;
+ int csum_size = 0;
+
+ if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
+ EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+ csum_size = sizeof(struct ext4_dir_entry_tail);
blocksize = dir->i_sb->s_blocksize;
dxtrace(printk(KERN_DEBUG "Creating index: inode %lu\n", dir->i_ino));
+ BUFFER_TRACE(bh, "get_write_access");
retval = ext4_journal_get_write_access(handle, bh);
if (retval) {
ext4_std_error(dir->i_sb, retval);
@@ -1371,13 +1797,13 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
brelse(bh);
return -EIO;
}
- len = ((char *) root) + blocksize - (char *) de;
+ len = ((char *) root) + (blocksize - csum_size) - (char *) de;
/* Allocate new block for the 0th block's dirents */
- bh2 = ext4_append(handle, dir, &block, &retval);
- if (!(bh2)) {
+ bh2 = ext4_append(handle, dir, &block);
+ if (IS_ERR(bh2)) {
brelse(bh);
- return retval;
+ return PTR_ERR(bh2);
}
ext4_set_inode_flag(dir, EXT4_INODE_INDEX);
data1 = bh2->b_data;
@@ -1387,8 +1813,15 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
top = data1 + len;
while ((char *)(de2 = ext4_next_entry(de, blocksize)) < top)
de = de2;
- de->rec_len = ext4_rec_len_to_disk(data1 + blocksize - (char *) de,
+ de->rec_len = ext4_rec_len_to_disk(data1 + (blocksize - csum_size) -
+ (char *) de,
blocksize);
+
+ if (csum_size) {
+ t = EXT4_DIRENT_TAIL(data1, blocksize);
+ initialize_dirent_tail(t, blocksize);
+ }
+
/* Initialize the root; the dot dirents already exist */
de = (struct ext4_dir_entry_2 *) (&root->dotdot);
de->rec_len = ext4_rec_len_to_disk(blocksize - EXT4_DIR_REC_LEN(2),
@@ -1412,10 +1845,22 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
frame->at = entries;
frame->bh = bh;
bh = bh2;
+
+ ext4_handle_dirty_dx_node(handle, dir, frame->bh);
+ ext4_handle_dirty_dirent_node(handle, dir, bh);
+
de = do_split(handle,dir, &bh, frame, &hinfo, &retval);
- dx_release (frames);
- if (!(de))
+ if (!de) {
+ /*
+ * Even if the block split failed, we have to properly write
+ * out all the changes we did so far. Otherwise we can end up
+ * with corrupted filesystem.
+ */
+ ext4_mark_inode_dirty(handle, dir);
+ dx_release(frames);
return retval;
+ }
+ dx_release(frames);
retval = add_dirent_to_buf(handle, dentry, inode, de, bh);
brelse(bh);
@@ -1438,16 +1883,33 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
struct inode *dir = dentry->d_parent->d_inode;
struct buffer_head *bh;
struct ext4_dir_entry_2 *de;
+ struct ext4_dir_entry_tail *t;
struct super_block *sb;
int retval;
int dx_fallback=0;
unsigned blocksize;
ext4_lblk_t block, blocks;
+ int csum_size = 0;
+
+ if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
+ EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+ csum_size = sizeof(struct ext4_dir_entry_tail);
sb = dir->i_sb;
blocksize = sb->s_blocksize;
if (!dentry->d_name.len)
return -EINVAL;
+
+ if (ext4_has_inline_data(dir)) {
+ retval = ext4_try_add_inline_entry(handle, dentry, inode);
+ if (retval < 0)
+ return retval;
+ if (retval == 1) {
+ retval = 0;
+ return retval;
+ }
+ }
+
if (is_dx(dir)) {
retval = ext4_dx_add_entry(handle, dentry, inode);
if (!retval || (retval != ERR_BAD_DX_DIR))
@@ -1458,9 +1920,10 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
}
blocks = dir->i_size >> sb->s_blocksize_bits;
for (block = 0; block < blocks; block++) {
- bh = ext4_bread(handle, dir, block, 0, &retval);
- if(!bh)
- return retval;
+ bh = ext4_read_dirblock(dir, block, DIRENT);
+ if (IS_ERR(bh))
+ return PTR_ERR(bh);
+
retval = add_dirent_to_buf(handle, dentry, inode, NULL, bh);
if (retval != -ENOSPC) {
brelse(bh);
@@ -1472,12 +1935,18 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
return make_indexed_dir(handle, dentry, inode, bh);
brelse(bh);
}
- bh = ext4_append(handle, dir, &block, &retval);
- if (!bh)
- return retval;
+ bh = ext4_append(handle, dir, &block);
+ if (IS_ERR(bh))
+ return PTR_ERR(bh);
de = (struct ext4_dir_entry_2 *) bh->b_data;
de->inode = 0;
- de->rec_len = ext4_rec_len_to_disk(blocksize, blocksize);
+ de->rec_len = ext4_rec_len_to_disk(blocksize - csum_size, blocksize);
+
+ if (csum_size) {
+ t = EXT4_DIRENT_TAIL(bh->b_data, blocksize);
+ initialize_dirent_tail(t, blocksize);
+ }
+
retval = add_dirent_to_buf(handle, dentry, inode, de, bh);
brelse(bh);
if (retval == 0)
@@ -1505,9 +1974,12 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
return err;
entries = frame->entries;
at = frame->at;
-
- if (!(bh = ext4_bread(handle,dir, dx_get_block(frame->at), 0, &err)))
+ bh = ext4_read_dirblock(dir, dx_get_block(frame->at), DIRENT);
+ if (IS_ERR(bh)) {
+ err = PTR_ERR(bh);
+ bh = NULL;
goto cleanup;
+ }
BUFFER_TRACE(bh, "get_write_access");
err = ext4_journal_get_write_access(handle, bh);
@@ -1536,9 +2008,11 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
err = -ENOSPC;
goto cleanup;
}
- bh2 = ext4_append (handle, dir, &newblock, &err);
- if (!(bh2))
+ bh2 = ext4_append(handle, dir, &newblock);
+ if (IS_ERR(bh2)) {
+ err = PTR_ERR(bh2);
goto cleanup;
+ }
node2 = (struct dx_node *)(bh2->b_data);
entries2 = node2->entries;
memset(&node2->fake, 0, sizeof(struct fake_dirent));
@@ -1576,7 +2050,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
dxtrace(dx_show_index("node", frames[1].entries));
dxtrace(dx_show_index("node",
((struct dx_node *) bh2->b_data)->entries));
- err = ext4_handle_dirty_metadata(handle, inode, bh2);
+ err = ext4_handle_dirty_dx_node(handle, dir, bh2);
if (err)
goto journal_error;
brelse (bh2);
@@ -1602,7 +2076,11 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
if (err)
goto journal_error;
}
- ext4_handle_dirty_metadata(handle, inode, frames[0].bh);
+ err = ext4_handle_dirty_dx_node(handle, dir, frames[0].bh);
+ if (err) {
+ ext4_std_error(inode->i_sb, err);
+ goto cleanup;
+ }
}
de = do_split(handle, dir, &bh, frame, &hinfo, &err);
if (!de)
@@ -1613,20 +2091,22 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
journal_error:
ext4_std_error(dir->i_sb, err);
cleanup:
- if (bh)
- brelse(bh);
+ brelse(bh);
dx_release(frames);
return err;
}
/*
- * ext4_delete_entry deletes a directory entry by merging it with the
- * previous entry
+ * ext4_generic_delete_entry deletes a directory entry by merging it
+ * with the previous entry
*/
-static int ext4_delete_entry(handle_t *handle,
- struct inode *dir,
- struct ext4_dir_entry_2 *de_del,
- struct buffer_head *bh)
+int ext4_generic_delete_entry(handle_t *handle,
+ struct inode *dir,
+ struct ext4_dir_entry_2 *de_del,
+ struct buffer_head *bh,
+ void *entry_buf,
+ int buf_size,
+ int csum_size)
{
struct ext4_dir_entry_2 *de, *pde;
unsigned int blocksize = dir->i_sb->s_blocksize;
@@ -1634,13 +2114,12 @@ static int ext4_delete_entry(handle_t *handle,
i = 0;
pde = NULL;
- de = (struct ext4_dir_entry_2 *) bh->b_data;
- while (i < bh->b_size) {
- if (!ext4_check_dir_entry(dir, de, bh, i))
+ de = (struct ext4_dir_entry_2 *)entry_buf;
+ while (i < buf_size - csum_size) {
+ if (ext4_check_dir_entry(dir, NULL, de, bh,
+ bh->b_data, bh->b_size, i))
return -EIO;
if (de == de_del) {
- BUFFER_TRACE(bh, "get_write_access");
- ext4_journal_get_write_access(handle, bh);
if (pde)
pde->rec_len = ext4_rec_len_to_disk(
ext4_rec_len_from_disk(pde->rec_len,
@@ -1651,8 +2130,6 @@ static int ext4_delete_entry(handle_t *handle,
else
de->inode = 0;
dir->i_version++;
- BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
- ext4_handle_dirty_metadata(handle, dir, bh);
return 0;
}
i += ext4_rec_len_from_disk(de->rec_len, blocksize);
@@ -1662,6 +2139,48 @@ static int ext4_delete_entry(handle_t *handle,
return -ENOENT;
}
+static int ext4_delete_entry(handle_t *handle,
+ struct inode *dir,
+ struct ext4_dir_entry_2 *de_del,
+ struct buffer_head *bh)
+{
+ int err, csum_size = 0;
+
+ if (ext4_has_inline_data(dir)) {
+ int has_inline_data = 1;
+ err = ext4_delete_inline_entry(handle, dir, de_del, bh,
+ &has_inline_data);
+ if (has_inline_data)
+ return err;
+ }
+
+ if (EXT4_HAS_RO_COMPAT_FEATURE(dir->i_sb,
+ EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+ csum_size = sizeof(struct ext4_dir_entry_tail);
+
+ BUFFER_TRACE(bh, "get_write_access");
+ err = ext4_journal_get_write_access(handle, bh);
+ if (unlikely(err))
+ goto out;
+
+ err = ext4_generic_delete_entry(handle, dir, de_del,
+ bh, bh->b_data,
+ dir->i_sb->s_blocksize, csum_size);
+ if (err)
+ goto out;
+
+ BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
+ err = ext4_handle_dirty_dirent_node(handle, dir, bh);
+ if (unlikely(err))
+ goto out;
+
+ return 0;
+out:
+ if (err != -ENOENT)
+ ext4_std_error(dir->i_sb, err);
+ return err;
+}
+
/*
* DIR_NLINK feature is set if 1) nlinks > EXT4_LINK_MAX or 2) nlinks == 2,
* since this indicates that nlinks count was previously 1.
@@ -1672,7 +2191,7 @@ static void ext4_inc_count(handle_t *handle, struct inode *inode)
if (is_dx(inode) && inode->i_nlink > 1) {
/* limit is 16-bit i_links_count */
if (inode->i_nlink >= EXT4_LINK_MAX || inode->i_nlink == 2) {
- inode->i_nlink = 1;
+ set_nlink(inode, 1);
EXT4_SET_RO_COMPAT_FEATURE(inode->i_sb,
EXT4_FEATURE_RO_COMPAT_DIR_NLINK);
}
@@ -1685,9 +2204,8 @@ static void ext4_inc_count(handle_t *handle, struct inode *inode)
*/
static void ext4_dec_count(handle_t *handle, struct inode *inode)
{
- drop_nlink(inode);
- if (S_ISDIR(inode->i_mode) && inode->i_nlink == 0)
- inc_nlink(inode);
+ if (!S_ISDIR(inode->i_mode) || inode->i_nlink > 2)
+ drop_nlink(inode);
}
@@ -1697,8 +2215,8 @@ static int ext4_add_nondir(handle_t *handle,
int err = ext4_add_entry(handle, dentry, inode);
if (!err) {
ext4_mark_inode_dirty(handle, inode);
- d_instantiate(dentry, inode);
unlock_new_inode(inode);
+ d_instantiate(dentry, inode);
return 0;
}
drop_nlink(inode);
@@ -1715,134 +2233,215 @@ static int ext4_add_nondir(handle_t *handle,
* If the create succeeds, we fill in the inode information
* with d_instantiate().
*/
-static int ext4_create(struct inode *dir, struct dentry *dentry, int mode,
- struct nameidata *nd)
+static int ext4_create(struct inode *dir, struct dentry *dentry, umode_t mode,
+ bool excl)
{
handle_t *handle;
struct inode *inode;
- int err, retries = 0;
+ int err, credits, retries = 0;
dquot_initialize(dir);
+ credits = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
+ EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3);
retry:
- handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
- EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
- EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
- if (IS_ERR(handle))
- return PTR_ERR(handle);
-
- if (IS_DIRSYNC(dir))
- ext4_handle_sync(handle);
-
- inode = ext4_new_inode(handle, dir, mode, &dentry->d_name, 0);
+ inode = ext4_new_inode_start_handle(dir, mode, &dentry->d_name, 0,
+ NULL, EXT4_HT_DIR, credits);
+ handle = ext4_journal_current_handle();
err = PTR_ERR(inode);
if (!IS_ERR(inode)) {
inode->i_op = &ext4_file_inode_operations;
inode->i_fop = &ext4_file_operations;
ext4_set_aops(inode);
err = ext4_add_nondir(handle, dentry, inode);
+ if (!err && IS_DIRSYNC(dir))
+ ext4_handle_sync(handle);
}
- ext4_journal_stop(handle);
+ if (handle)
+ ext4_journal_stop(handle);
if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
goto retry;
return err;
}
static int ext4_mknod(struct inode *dir, struct dentry *dentry,
- int mode, dev_t rdev)
+ umode_t mode, dev_t rdev)
{
handle_t *handle;
struct inode *inode;
- int err, retries = 0;
+ int err, credits, retries = 0;
if (!new_valid_dev(rdev))
return -EINVAL;
dquot_initialize(dir);
+ credits = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
+ EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3);
retry:
- handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
- EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
- EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
- if (IS_ERR(handle))
- return PTR_ERR(handle);
-
- if (IS_DIRSYNC(dir))
- ext4_handle_sync(handle);
-
- inode = ext4_new_inode(handle, dir, mode, &dentry->d_name, 0);
+ inode = ext4_new_inode_start_handle(dir, mode, &dentry->d_name, 0,
+ NULL, EXT4_HT_DIR, credits);
+ handle = ext4_journal_current_handle();
err = PTR_ERR(inode);
if (!IS_ERR(inode)) {
init_special_inode(inode, inode->i_mode, rdev);
-#ifdef CONFIG_EXT4_FS_XATTR
inode->i_op = &ext4_special_inode_operations;
-#endif
err = ext4_add_nondir(handle, dentry, inode);
+ if (!err && IS_DIRSYNC(dir))
+ ext4_handle_sync(handle);
}
- ext4_journal_stop(handle);
+ if (handle)
+ ext4_journal_stop(handle);
if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
goto retry;
return err;
}
-static int ext4_mkdir(struct inode *dir, struct dentry *dentry, int mode)
+static int ext4_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
{
handle_t *handle;
struct inode *inode;
- struct buffer_head *dir_block;
- struct ext4_dir_entry_2 *de;
- unsigned int blocksize = dir->i_sb->s_blocksize;
int err, retries = 0;
- if (EXT4_DIR_LINK_MAX(dir))
- return -EMLINK;
-
dquot_initialize(dir);
retry:
- handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
- EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
- EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
- if (IS_ERR(handle))
- return PTR_ERR(handle);
-
- if (IS_DIRSYNC(dir))
- ext4_handle_sync(handle);
-
- inode = ext4_new_inode(handle, dir, S_IFDIR | mode,
- &dentry->d_name, 0);
+ inode = ext4_new_inode_start_handle(dir, mode,
+ NULL, 0, NULL,
+ EXT4_HT_DIR,
+ EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb) +
+ 4 + EXT4_XATTR_TRANS_BLOCKS);
+ handle = ext4_journal_current_handle();
err = PTR_ERR(inode);
- if (IS_ERR(inode))
- goto out_stop;
+ if (!IS_ERR(inode)) {
+ inode->i_op = &ext4_file_inode_operations;
+ inode->i_fop = &ext4_file_operations;
+ ext4_set_aops(inode);
+ d_tmpfile(dentry, inode);
+ err = ext4_orphan_add(handle, inode);
+ if (err)
+ goto err_unlock_inode;
+ mark_inode_dirty(inode);
+ unlock_new_inode(inode);
+ }
+ if (handle)
+ ext4_journal_stop(handle);
+ if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
+ goto retry;
+ return err;
+err_unlock_inode:
+ ext4_journal_stop(handle);
+ unlock_new_inode(inode);
+ return err;
+}
- inode->i_op = &ext4_dir_inode_operations;
- inode->i_fop = &ext4_dir_operations;
- inode->i_size = EXT4_I(inode)->i_disksize = inode->i_sb->s_blocksize;
- dir_block = ext4_bread(handle, inode, 0, 1, &err);
- if (!dir_block)
- goto out_clear_inode;
- BUFFER_TRACE(dir_block, "get_write_access");
- ext4_journal_get_write_access(handle, dir_block);
- de = (struct ext4_dir_entry_2 *) dir_block->b_data;
+struct ext4_dir_entry_2 *ext4_init_dot_dotdot(struct inode *inode,
+ struct ext4_dir_entry_2 *de,
+ int blocksize, int csum_size,
+ unsigned int parent_ino, int dotdot_real_len)
+{
de->inode = cpu_to_le32(inode->i_ino);
de->name_len = 1;
de->rec_len = ext4_rec_len_to_disk(EXT4_DIR_REC_LEN(de->name_len),
blocksize);
strcpy(de->name, ".");
- ext4_set_de_type(dir->i_sb, de, S_IFDIR);
+ ext4_set_de_type(inode->i_sb, de, S_IFDIR);
+
de = ext4_next_entry(de, blocksize);
- de->inode = cpu_to_le32(dir->i_ino);
- de->rec_len = ext4_rec_len_to_disk(blocksize - EXT4_DIR_REC_LEN(1),
- blocksize);
+ de->inode = cpu_to_le32(parent_ino);
de->name_len = 2;
+ if (!dotdot_real_len)
+ de->rec_len = ext4_rec_len_to_disk(blocksize -
+ (csum_size + EXT4_DIR_REC_LEN(1)),
+ blocksize);
+ else
+ de->rec_len = ext4_rec_len_to_disk(
+ EXT4_DIR_REC_LEN(de->name_len), blocksize);
strcpy(de->name, "..");
- ext4_set_de_type(dir->i_sb, de, S_IFDIR);
- inode->i_nlink = 2;
+ ext4_set_de_type(inode->i_sb, de, S_IFDIR);
+
+ return ext4_next_entry(de, blocksize);
+}
+
+static int ext4_init_new_dir(handle_t *handle, struct inode *dir,
+ struct inode *inode)
+{
+ struct buffer_head *dir_block = NULL;
+ struct ext4_dir_entry_2 *de;
+ struct ext4_dir_entry_tail *t;
+ ext4_lblk_t block = 0;
+ unsigned int blocksize = dir->i_sb->s_blocksize;
+ int csum_size = 0;
+ int err;
+
+ if (EXT4_HAS_RO_COMPAT_FEATURE(dir->i_sb,
+ EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+ csum_size = sizeof(struct ext4_dir_entry_tail);
+
+ if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) {
+ err = ext4_try_create_inline_dir(handle, dir, inode);
+ if (err < 0 && err != -ENOSPC)
+ goto out;
+ if (!err)
+ goto out;
+ }
+
+ inode->i_size = 0;
+ dir_block = ext4_append(handle, inode, &block);
+ if (IS_ERR(dir_block))
+ return PTR_ERR(dir_block);
+ BUFFER_TRACE(dir_block, "get_write_access");
+ err = ext4_journal_get_write_access(handle, dir_block);
+ if (err)
+ goto out;
+ de = (struct ext4_dir_entry_2 *)dir_block->b_data;
+ ext4_init_dot_dotdot(inode, de, blocksize, csum_size, dir->i_ino, 0);
+ set_nlink(inode, 2);
+ if (csum_size) {
+ t = EXT4_DIRENT_TAIL(dir_block->b_data, blocksize);
+ initialize_dirent_tail(t, blocksize);
+ }
+
BUFFER_TRACE(dir_block, "call ext4_handle_dirty_metadata");
- ext4_handle_dirty_metadata(handle, dir, dir_block);
+ err = ext4_handle_dirty_dirent_node(handle, inode, dir_block);
+ if (err)
+ goto out;
+ set_buffer_verified(dir_block);
+out:
brelse(dir_block);
- ext4_mark_inode_dirty(handle, inode);
- err = ext4_add_entry(handle, dentry, inode);
+ return err;
+}
+
+static int ext4_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+{
+ handle_t *handle;
+ struct inode *inode;
+ int err, credits, retries = 0;
+
+ if (EXT4_DIR_LINK_MAX(dir))
+ return -EMLINK;
+
+ dquot_initialize(dir);
+
+ credits = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
+ EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3);
+retry:
+ inode = ext4_new_inode_start_handle(dir, S_IFDIR | mode,
+ &dentry->d_name,
+ 0, NULL, EXT4_HT_DIR, credits);
+ handle = ext4_journal_current_handle();
+ err = PTR_ERR(inode);
+ if (IS_ERR(inode))
+ goto out_stop;
+
+ inode->i_op = &ext4_dir_inode_operations;
+ inode->i_fop = &ext4_dir_operations;
+ err = ext4_init_new_dir(handle, dir, inode);
+ if (err)
+ goto out_clear_inode;
+ err = ext4_mark_inode_dirty(handle, inode);
+ if (!err)
+ err = ext4_add_entry(handle, dentry, inode);
if (err) {
out_clear_inode:
clear_nlink(inode);
@@ -1853,11 +2452,17 @@ out_clear_inode:
}
ext4_inc_count(handle, dir);
ext4_update_dx_flag(dir);
- ext4_mark_inode_dirty(handle, dir);
- d_instantiate(dentry, inode);
+ err = ext4_mark_inode_dirty(handle, dir);
+ if (err)
+ goto out_clear_inode;
unlock_new_inode(inode);
+ d_instantiate(dentry, inode);
+ if (IS_DIRSYNC(dir))
+ ext4_handle_sync(handle);
+
out_stop:
- ext4_journal_stop(handle);
+ if (handle)
+ ext4_journal_stop(handle);
if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
goto retry;
return err;
@@ -1874,18 +2479,23 @@ static int empty_dir(struct inode *inode)
struct super_block *sb;
int err = 0;
+ if (ext4_has_inline_data(inode)) {
+ int has_inline_data = 1;
+
+ err = empty_inline_dir(inode, &has_inline_data);
+ if (has_inline_data)
+ return err;
+ }
+
sb = inode->i_sb;
- if (inode->i_size < EXT4_DIR_REC_LEN(1) + EXT4_DIR_REC_LEN(2) ||
- !(bh = ext4_bread(NULL, inode, 0, 0, &err))) {
- if (err)
- EXT4_ERROR_INODE(inode,
- "error %d reading directory lblock 0", err);
- else
- ext4_warning(inode->i_sb,
- "bad directory (dir #%lu) - no data block",
- inode->i_ino);
+ if (inode->i_size < EXT4_DIR_REC_LEN(1) + EXT4_DIR_REC_LEN(2)) {
+ EXT4_ERROR_INODE(inode, "invalid size");
return 1;
}
+ bh = ext4_read_dirblock(inode, 0, EITHER);
+ if (IS_ERR(bh))
+ return 1;
+
de = (struct ext4_dir_entry_2 *) bh->b_data;
de1 = ext4_next_entry(de, sb->s_blocksize);
if (le32_to_cpu(de->inode) != inode->i_ino ||
@@ -1902,24 +2512,18 @@ static int empty_dir(struct inode *inode)
ext4_rec_len_from_disk(de1->rec_len, sb->s_blocksize);
de = ext4_next_entry(de1, sb->s_blocksize);
while (offset < inode->i_size) {
- if (!bh ||
- (void *) de >= (void *) (bh->b_data+sb->s_blocksize)) {
+ if ((void *) de >= (void *) (bh->b_data+sb->s_blocksize)) {
unsigned int lblock;
err = 0;
brelse(bh);
lblock = offset >> EXT4_BLOCK_SIZE_BITS(sb);
- bh = ext4_bread(NULL, inode, lblock, 0, &err);
- if (!bh) {
- if (err)
- EXT4_ERROR_INODE(inode,
- "error %d reading directory "
- "lblock %u", err, lblock);
- offset += sb->s_blocksize;
- continue;
- }
+ bh = ext4_read_dirblock(inode, lblock, EITHER);
+ if (IS_ERR(bh))
+ return 1;
de = (struct ext4_dir_entry_2 *) bh->b_data;
}
- if (!ext4_check_dir_entry(inode, de, bh, offset)) {
+ if (ext4_check_dir_entry(inode, NULL, de, bh,
+ bh->b_data, bh->b_size, offset)) {
de = (struct ext4_dir_entry_2 *)(bh->b_data +
sb->s_blocksize);
offset = (offset | (sb->s_blocksize - 1)) + 1;
@@ -1936,85 +2540,92 @@ static int empty_dir(struct inode *inode)
return 1;
}
-/* ext4_orphan_add() links an unlinked or truncated inode into a list of
+/*
+ * ext4_orphan_add() links an unlinked or truncated inode into a list of
* such inodes, starting at the superblock, in case we crash before the
* file is closed/deleted, or in case the inode truncate spans multiple
* transactions and the last transaction is not recovered after a crash.
*
* At filesystem recovery time, we walk this list deleting unlinked
* inodes and truncating linked inodes in ext4_orphan_cleanup().
+ *
+ * Orphan list manipulation functions must be called under i_mutex unless
+ * we are just creating the inode or deleting it.
*/
int ext4_orphan_add(handle_t *handle, struct inode *inode)
{
struct super_block *sb = inode->i_sb;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
struct ext4_iloc iloc;
int err = 0, rc;
+ bool dirty = false;
- if (!ext4_handle_valid(handle))
+ if (!sbi->s_journal)
return 0;
- mutex_lock(&EXT4_SB(sb)->s_orphan_lock);
+ WARN_ON_ONCE(!(inode->i_state & (I_NEW | I_FREEING)) &&
+ !mutex_is_locked(&inode->i_mutex));
+ /*
+ * Exit early if inode already is on orphan list. This is a big speedup
+ * since we don't have to contend on the global s_orphan_lock.
+ */
if (!list_empty(&EXT4_I(inode)->i_orphan))
- goto out_unlock;
-
- /* Orphan handling is only valid for files with data blocks
- * being truncated, or files being unlinked. */
+ return 0;
- /* @@@ FIXME: Observation from aviro:
- * I think I can trigger J_ASSERT in ext4_orphan_add(). We block
- * here (on s_orphan_lock), so race with ext4_link() which might bump
- * ->i_nlink. For, say it, character device. Not a regular file,
- * not a directory, not a symlink and ->i_nlink > 0.
- *
- * tytso, 4/25/2009: I'm not sure how that could happen;
- * shouldn't the fs core protect us from these sort of
- * unlink()/link() races?
+ /*
+ * Orphan handling is only valid for files with data blocks
+ * being truncated, or files being unlinked. Note that we either
+ * hold i_mutex, or the inode can not be referenced from outside,
+ * so i_nlink should not be bumped due to race
*/
J_ASSERT((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
S_ISLNK(inode->i_mode)) || inode->i_nlink == 0);
- BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get_write_access");
- err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh);
+ BUFFER_TRACE(sbi->s_sbh, "get_write_access");
+ err = ext4_journal_get_write_access(handle, sbi->s_sbh);
if (err)
- goto out_unlock;
+ goto out;
err = ext4_reserve_inode_write(handle, inode, &iloc);
if (err)
- goto out_unlock;
+ goto out;
+
+ mutex_lock(&sbi->s_orphan_lock);
/*
* Due to previous errors inode may be already a part of on-disk
* orphan list. If so skip on-disk list modification.
*/
- if (NEXT_ORPHAN(inode) && NEXT_ORPHAN(inode) <=
- (le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count)))
- goto mem_insert;
-
- /* Insert this inode at the head of the on-disk orphan list... */
- NEXT_ORPHAN(inode) = le32_to_cpu(EXT4_SB(sb)->s_es->s_last_orphan);
- EXT4_SB(sb)->s_es->s_last_orphan = cpu_to_le32(inode->i_ino);
- err = ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh);
- rc = ext4_mark_iloc_dirty(handle, inode, &iloc);
- if (!err)
- err = rc;
-
- /* Only add to the head of the in-memory list if all the
- * previous operations succeeded. If the orphan_add is going to
- * fail (possibly taking the journal offline), we can't risk
- * leaving the inode on the orphan list: stray orphan-list
- * entries can cause panics at unmount time.
- *
- * This is safe: on error we're going to ignore the orphan list
- * anyway on the next recovery. */
-mem_insert:
- if (!err)
- list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan);
-
+ if (!NEXT_ORPHAN(inode) || NEXT_ORPHAN(inode) >
+ (le32_to_cpu(sbi->s_es->s_inodes_count))) {
+ /* Insert this inode at the head of the on-disk orphan list */
+ NEXT_ORPHAN(inode) = le32_to_cpu(sbi->s_es->s_last_orphan);
+ sbi->s_es->s_last_orphan = cpu_to_le32(inode->i_ino);
+ dirty = true;
+ }
+ list_add(&EXT4_I(inode)->i_orphan, &sbi->s_orphan);
+ mutex_unlock(&sbi->s_orphan_lock);
+
+ if (dirty) {
+ err = ext4_handle_dirty_super(handle, sb);
+ rc = ext4_mark_iloc_dirty(handle, inode, &iloc);
+ if (!err)
+ err = rc;
+ if (err) {
+ /*
+ * We have to remove inode from in-memory list if
+ * addition to on disk orphan list failed. Stray orphan
+ * list entries can cause panics at unmount time.
+ */
+ mutex_lock(&sbi->s_orphan_lock);
+ list_del(&EXT4_I(inode)->i_orphan);
+ mutex_unlock(&sbi->s_orphan_lock);
+ }
+ }
jbd_debug(4, "superblock will point to %lu\n", inode->i_ino);
jbd_debug(4, "orphan inode %lu will point to %d\n",
inode->i_ino, NEXT_ORPHAN(inode));
-out_unlock:
- mutex_unlock(&EXT4_SB(sb)->s_orphan_lock);
- ext4_std_error(inode->i_sb, err);
+out:
+ ext4_std_error(sb, err);
return err;
}
@@ -2026,46 +2637,52 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode)
{
struct list_head *prev;
struct ext4_inode_info *ei = EXT4_I(inode);
- struct ext4_sb_info *sbi;
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
__u32 ino_next;
struct ext4_iloc iloc;
int err = 0;
- /* ext4_handle_valid() assumes a valid handle_t pointer */
- if (handle && !ext4_handle_valid(handle))
+ if (!sbi->s_journal && !(sbi->s_mount_state & EXT4_ORPHAN_FS))
return 0;
- mutex_lock(&EXT4_SB(inode->i_sb)->s_orphan_lock);
+ WARN_ON_ONCE(!(inode->i_state & (I_NEW | I_FREEING)) &&
+ !mutex_is_locked(&inode->i_mutex));
+ /* Do this quick check before taking global s_orphan_lock. */
if (list_empty(&ei->i_orphan))
- goto out;
+ return 0;
- ino_next = NEXT_ORPHAN(inode);
- prev = ei->i_orphan.prev;
- sbi = EXT4_SB(inode->i_sb);
+ if (handle) {
+ /* Grab inode buffer early before taking global s_orphan_lock */
+ err = ext4_reserve_inode_write(handle, inode, &iloc);
+ }
+ mutex_lock(&sbi->s_orphan_lock);
jbd_debug(4, "remove inode %lu from orphan list\n", inode->i_ino);
+ prev = ei->i_orphan.prev;
list_del_init(&ei->i_orphan);
/* If we're on an error path, we may not have a valid
* transaction handle with which to update the orphan list on
* disk, but we still need to remove the inode from the linked
* list in memory. */
- if (sbi->s_journal && !handle)
- goto out;
-
- err = ext4_reserve_inode_write(handle, inode, &iloc);
- if (err)
+ if (!handle || err) {
+ mutex_unlock(&sbi->s_orphan_lock);
goto out_err;
+ }
+ ino_next = NEXT_ORPHAN(inode);
if (prev == &sbi->s_orphan) {
jbd_debug(4, "superblock will point to %u\n", ino_next);
BUFFER_TRACE(sbi->s_sbh, "get_write_access");
err = ext4_journal_get_write_access(handle, sbi->s_sbh);
- if (err)
+ if (err) {
+ mutex_unlock(&sbi->s_orphan_lock);
goto out_brelse;
+ }
sbi->s_es->s_last_orphan = cpu_to_le32(ino_next);
- err = ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh);
+ mutex_unlock(&sbi->s_orphan_lock);
+ err = ext4_handle_dirty_super(handle, inode->i_sb);
} else {
struct ext4_iloc iloc2;
struct inode *i_prev =
@@ -2074,20 +2691,20 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode)
jbd_debug(4, "orphan inode %lu will point to %u\n",
i_prev->i_ino, ino_next);
err = ext4_reserve_inode_write(handle, i_prev, &iloc2);
- if (err)
+ if (err) {
+ mutex_unlock(&sbi->s_orphan_lock);
goto out_brelse;
+ }
NEXT_ORPHAN(i_prev) = ino_next;
err = ext4_mark_iloc_dirty(handle, i_prev, &iloc2);
+ mutex_unlock(&sbi->s_orphan_lock);
}
if (err)
goto out_brelse;
NEXT_ORPHAN(inode) = 0;
err = ext4_mark_iloc_dirty(handle, inode, &iloc);
-
out_err:
ext4_std_error(inode->i_sb, err);
-out:
- mutex_unlock(&EXT4_SB(inode->i_sb)->s_orphan_lock);
return err;
out_brelse:
@@ -2101,25 +2718,18 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry)
struct inode *inode;
struct buffer_head *bh;
struct ext4_dir_entry_2 *de;
- handle_t *handle;
+ handle_t *handle = NULL;
/* Initialize quotas before so that eventual writes go in
* separate transaction */
dquot_initialize(dir);
dquot_initialize(dentry->d_inode);
- handle = ext4_journal_start(dir, EXT4_DELETE_TRANS_BLOCKS(dir->i_sb));
- if (IS_ERR(handle))
- return PTR_ERR(handle);
-
retval = -ENOENT;
- bh = ext4_find_entry(dir, &dentry->d_name, &de);
+ bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL);
if (!bh)
goto end_rmdir;
- if (IS_DIRSYNC(dir))
- ext4_handle_sync(handle);
-
inode = dentry->d_inode;
retval = -EIO;
@@ -2130,6 +2740,17 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry)
if (!empty_dir(inode))
goto end_rmdir;
+ handle = ext4_journal_start(dir, EXT4_HT_DIR,
+ EXT4_DATA_TRANS_BLOCKS(dir->i_sb));
+ if (IS_ERR(handle)) {
+ retval = PTR_ERR(handle);
+ handle = NULL;
+ goto end_rmdir;
+ }
+
+ if (IS_DIRSYNC(dir))
+ ext4_handle_sync(handle);
+
retval = ext4_delete_entry(handle, dir, de, bh);
if (retval)
goto end_rmdir;
@@ -2151,8 +2772,9 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry)
ext4_mark_inode_dirty(handle, dir);
end_rmdir:
- ext4_journal_stop(handle);
brelse(bh);
+ if (handle)
+ ext4_journal_stop(handle);
return retval;
}
@@ -2162,22 +2784,16 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry)
struct inode *inode;
struct buffer_head *bh;
struct ext4_dir_entry_2 *de;
- handle_t *handle;
+ handle_t *handle = NULL;
+ trace_ext4_unlink_enter(dir, dentry);
/* Initialize quotas before so that eventual writes go
* in separate transaction */
dquot_initialize(dir);
dquot_initialize(dentry->d_inode);
- handle = ext4_journal_start(dir, EXT4_DELETE_TRANS_BLOCKS(dir->i_sb));
- if (IS_ERR(handle))
- return PTR_ERR(handle);
-
- if (IS_DIRSYNC(dir))
- ext4_handle_sync(handle);
-
retval = -ENOENT;
- bh = ext4_find_entry(dir, &dentry->d_name, &de);
+ bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL);
if (!bh)
goto end_unlink;
@@ -2187,11 +2803,22 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry)
if (le32_to_cpu(de->inode) != inode->i_ino)
goto end_unlink;
+ handle = ext4_journal_start(dir, EXT4_HT_DIR,
+ EXT4_DATA_TRANS_BLOCKS(dir->i_sb));
+ if (IS_ERR(handle)) {
+ retval = PTR_ERR(handle);
+ handle = NULL;
+ goto end_unlink;
+ }
+
+ if (IS_DIRSYNC(dir))
+ ext4_handle_sync(handle);
+
if (!inode->i_nlink) {
ext4_warning(inode->i_sb,
"Deleting nonexistent file (%lu), %d",
inode->i_ino, inode->i_nlink);
- inode->i_nlink = 1;
+ set_nlink(inode, 1);
}
retval = ext4_delete_entry(handle, dir, de, bh);
if (retval)
@@ -2207,8 +2834,10 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry)
retval = 0;
end_unlink:
- ext4_journal_stop(handle);
brelse(bh);
+ if (handle)
+ ext4_journal_stop(handle);
+ trace_ext4_unlink_exit(dentry, retval);
return retval;
}
@@ -2218,6 +2847,7 @@ static int ext4_symlink(struct inode *dir,
handle_t *handle;
struct inode *inode;
int l, err, retries = 0;
+ int credits;
l = strlen(symname)+1;
if (l > dir->i_sb->s_blocksize)
@@ -2225,37 +2855,72 @@ static int ext4_symlink(struct inode *dir,
dquot_initialize(dir);
+ if (l > EXT4_N_BLOCKS * 4) {
+ /*
+ * For non-fast symlinks, we just allocate inode and put it on
+ * orphan list in the first transaction => we need bitmap,
+ * group descriptor, sb, inode block, quota blocks, and
+ * possibly selinux xattr blocks.
+ */
+ credits = 4 + EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb) +
+ EXT4_XATTR_TRANS_BLOCKS;
+ } else {
+ /*
+ * Fast symlink. We have to add entry to directory
+ * (EXT4_DATA_TRANS_BLOCKS + EXT4_INDEX_EXTRA_TRANS_BLOCKS),
+ * allocate new inode (bitmap, group descriptor, inode block,
+ * quota blocks, sb is already counted in previous macros).
+ */
+ credits = EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
+ EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3;
+ }
retry:
- handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
- EXT4_INDEX_EXTRA_TRANS_BLOCKS + 5 +
- EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
- if (IS_ERR(handle))
- return PTR_ERR(handle);
-
- if (IS_DIRSYNC(dir))
- ext4_handle_sync(handle);
-
- inode = ext4_new_inode(handle, dir, S_IFLNK|S_IRWXUGO,
- &dentry->d_name, 0);
+ inode = ext4_new_inode_start_handle(dir, S_IFLNK|S_IRWXUGO,
+ &dentry->d_name, 0, NULL,
+ EXT4_HT_DIR, credits);
+ handle = ext4_journal_current_handle();
err = PTR_ERR(inode);
if (IS_ERR(inode))
goto out_stop;
- if (l > sizeof(EXT4_I(inode)->i_data)) {
+ if (l > EXT4_N_BLOCKS * 4) {
inode->i_op = &ext4_symlink_inode_operations;
ext4_set_aops(inode);
/*
- * page_symlink() calls into ext4_prepare/commit_write.
- * We have a transaction open. All is sweetness. It also sets
- * i_size in generic_commit_write().
+ * We cannot call page_symlink() with transaction started
+ * because it calls into ext4_write_begin() which can wait
+ * for transaction commit if we are running out of space
+ * and thus we deadlock. So we have to stop transaction now
+ * and restart it when symlink contents is written.
+ *
+ * To keep fs consistent in case of crash, we have to put inode
+ * to orphan list in the mean time.
*/
+ drop_nlink(inode);
+ err = ext4_orphan_add(handle, inode);
+ ext4_journal_stop(handle);
+ if (err)
+ goto err_drop_inode;
err = __page_symlink(inode, symname, l, 1);
+ if (err)
+ goto err_drop_inode;
+ /*
+ * Now inode is being linked into dir (EXT4_DATA_TRANS_BLOCKS
+ * + EXT4_INDEX_EXTRA_TRANS_BLOCKS), inode is also modified
+ */
+ handle = ext4_journal_start(dir, EXT4_HT_DIR,
+ EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
+ EXT4_INDEX_EXTRA_TRANS_BLOCKS + 1);
+ if (IS_ERR(handle)) {
+ err = PTR_ERR(handle);
+ goto err_drop_inode;
+ }
+ set_nlink(inode, 1);
+ err = ext4_orphan_del(handle, inode);
if (err) {
+ ext4_journal_stop(handle);
clear_nlink(inode);
- unlock_new_inode(inode);
- ext4_mark_inode_dirty(handle, inode);
- iput(inode);
- goto out_stop;
+ goto err_drop_inode;
}
} else {
/* clear the extent format for fast symlink */
@@ -2266,11 +2931,19 @@ retry:
}
EXT4_I(inode)->i_disksize = inode->i_size;
err = ext4_add_nondir(handle, dentry, inode);
+ if (!err && IS_DIRSYNC(dir))
+ ext4_handle_sync(handle);
+
out_stop:
- ext4_journal_stop(handle);
+ if (handle)
+ ext4_journal_stop(handle);
if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
goto retry;
return err;
+err_drop_inode:
+ unlock_new_inode(inode);
+ iput(inode);
+ return err;
}
static int ext4_link(struct dentry *old_dentry,
@@ -2285,16 +2958,10 @@ static int ext4_link(struct dentry *old_dentry,
dquot_initialize(dir);
- /*
- * Return -ENOENT if we've raced with unlink and i_nlink is 0. Doing
- * otherwise has the potential to corrupt the orphan inode list.
- */
- if (inode->i_nlink == 0)
- return -ENOENT;
-
retry:
- handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
- EXT4_INDEX_EXTRA_TRANS_BLOCKS);
+ handle = ext4_journal_start(dir, EXT4_HT_DIR,
+ (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
+ EXT4_INDEX_EXTRA_TRANS_BLOCKS) + 1);
if (IS_ERR(handle))
return PTR_ERR(handle);
@@ -2308,6 +2975,11 @@ retry:
err = ext4_add_entry(handle, dentry, inode);
if (!err) {
ext4_mark_inode_dirty(handle, inode);
+ /* this can happen only for tmpfile being
+ * linked the first time
+ */
+ if (inode->i_nlink == 1)
+ ext4_orphan_del(handle, inode);
d_instantiate(dentry, inode);
} else {
drop_nlink(inode);
@@ -2319,178 +2991,457 @@ retry:
return err;
}
-#define PARENT_INO(buffer, size) \
- (ext4_next_entry((struct ext4_dir_entry_2 *)(buffer), size)->inode)
+
+/*
+ * Try to find buffer head where contains the parent block.
+ * It should be the inode block if it is inlined or the 1st block
+ * if it is a normal dir.
+ */
+static struct buffer_head *ext4_get_first_dir_block(handle_t *handle,
+ struct inode *inode,
+ int *retval,
+ struct ext4_dir_entry_2 **parent_de,
+ int *inlined)
+{
+ struct buffer_head *bh;
+
+ if (!ext4_has_inline_data(inode)) {
+ bh = ext4_read_dirblock(inode, 0, EITHER);
+ if (IS_ERR(bh)) {
+ *retval = PTR_ERR(bh);
+ return NULL;
+ }
+ *parent_de = ext4_next_entry(
+ (struct ext4_dir_entry_2 *)bh->b_data,
+ inode->i_sb->s_blocksize);
+ return bh;
+ }
+
+ *inlined = 1;
+ return ext4_get_first_inline_block(inode, parent_de, retval);
+}
+
+struct ext4_renament {
+ struct inode *dir;
+ struct dentry *dentry;
+ struct inode *inode;
+ bool is_dir;
+ int dir_nlink_delta;
+
+ /* entry for "dentry" */
+ struct buffer_head *bh;
+ struct ext4_dir_entry_2 *de;
+ int inlined;
+
+ /* entry for ".." in inode if it's a directory */
+ struct buffer_head *dir_bh;
+ struct ext4_dir_entry_2 *parent_de;
+ int dir_inlined;
+};
+
+static int ext4_rename_dir_prepare(handle_t *handle, struct ext4_renament *ent)
+{
+ int retval;
+
+ ent->dir_bh = ext4_get_first_dir_block(handle, ent->inode,
+ &retval, &ent->parent_de,
+ &ent->dir_inlined);
+ if (!ent->dir_bh)
+ return retval;
+ if (le32_to_cpu(ent->parent_de->inode) != ent->dir->i_ino)
+ return -EIO;
+ BUFFER_TRACE(ent->dir_bh, "get_write_access");
+ return ext4_journal_get_write_access(handle, ent->dir_bh);
+}
+
+static int ext4_rename_dir_finish(handle_t *handle, struct ext4_renament *ent,
+ unsigned dir_ino)
+{
+ int retval;
+
+ ent->parent_de->inode = cpu_to_le32(dir_ino);
+ BUFFER_TRACE(ent->dir_bh, "call ext4_handle_dirty_metadata");
+ if (!ent->dir_inlined) {
+ if (is_dx(ent->inode)) {
+ retval = ext4_handle_dirty_dx_node(handle,
+ ent->inode,
+ ent->dir_bh);
+ } else {
+ retval = ext4_handle_dirty_dirent_node(handle,
+ ent->inode,
+ ent->dir_bh);
+ }
+ } else {
+ retval = ext4_mark_inode_dirty(handle, ent->inode);
+ }
+ if (retval) {
+ ext4_std_error(ent->dir->i_sb, retval);
+ return retval;
+ }
+ return 0;
+}
+
+static int ext4_setent(handle_t *handle, struct ext4_renament *ent,
+ unsigned ino, unsigned file_type)
+{
+ int retval;
+
+ BUFFER_TRACE(ent->bh, "get write access");
+ retval = ext4_journal_get_write_access(handle, ent->bh);
+ if (retval)
+ return retval;
+ ent->de->inode = cpu_to_le32(ino);
+ if (EXT4_HAS_INCOMPAT_FEATURE(ent->dir->i_sb,
+ EXT4_FEATURE_INCOMPAT_FILETYPE))
+ ent->de->file_type = file_type;
+ ent->dir->i_version++;
+ ent->dir->i_ctime = ent->dir->i_mtime =
+ ext4_current_time(ent->dir);
+ ext4_mark_inode_dirty(handle, ent->dir);
+ BUFFER_TRACE(ent->bh, "call ext4_handle_dirty_metadata");
+ if (!ent->inlined) {
+ retval = ext4_handle_dirty_dirent_node(handle,
+ ent->dir, ent->bh);
+ if (unlikely(retval)) {
+ ext4_std_error(ent->dir->i_sb, retval);
+ return retval;
+ }
+ }
+ brelse(ent->bh);
+ ent->bh = NULL;
+
+ return 0;
+}
+
+static int ext4_find_delete_entry(handle_t *handle, struct inode *dir,
+ const struct qstr *d_name)
+{
+ int retval = -ENOENT;
+ struct buffer_head *bh;
+ struct ext4_dir_entry_2 *de;
+
+ bh = ext4_find_entry(dir, d_name, &de, NULL);
+ if (bh) {
+ retval = ext4_delete_entry(handle, dir, de, bh);
+ brelse(bh);
+ }
+ return retval;
+}
+
+static void ext4_rename_delete(handle_t *handle, struct ext4_renament *ent)
+{
+ int retval;
+ /*
+ * ent->de could have moved from under us during htree split, so make
+ * sure that we are deleting the right entry. We might also be pointing
+ * to a stale entry in the unused part of ent->bh so just checking inum
+ * and the name isn't enough.
+ */
+ if (le32_to_cpu(ent->de->inode) != ent->inode->i_ino ||
+ ent->de->name_len != ent->dentry->d_name.len ||
+ strncmp(ent->de->name, ent->dentry->d_name.name,
+ ent->de->name_len)) {
+ retval = ext4_find_delete_entry(handle, ent->dir,
+ &ent->dentry->d_name);
+ } else {
+ retval = ext4_delete_entry(handle, ent->dir, ent->de, ent->bh);
+ if (retval == -ENOENT) {
+ retval = ext4_find_delete_entry(handle, ent->dir,
+ &ent->dentry->d_name);
+ }
+ }
+
+ if (retval) {
+ ext4_warning(ent->dir->i_sb,
+ "Deleting old file (%lu), %d, error=%d",
+ ent->dir->i_ino, ent->dir->i_nlink, retval);
+ }
+}
+
+static void ext4_update_dir_count(handle_t *handle, struct ext4_renament *ent)
+{
+ if (ent->dir_nlink_delta) {
+ if (ent->dir_nlink_delta == -1)
+ ext4_dec_count(handle, ent->dir);
+ else
+ ext4_inc_count(handle, ent->dir);
+ ext4_mark_inode_dirty(handle, ent->dir);
+ }
+}
/*
* Anybody can rename anything with this: the permission checks are left to the
* higher-level routines.
+ *
+ * n.b. old_{dentry,inode) refers to the source dentry/inode
+ * while new_{dentry,inode) refers to the destination dentry/inode
+ * This comes from rename(const char *oldpath, const char *newpath)
*/
static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
struct inode *new_dir, struct dentry *new_dentry)
{
- handle_t *handle;
- struct inode *old_inode, *new_inode;
- struct buffer_head *old_bh, *new_bh, *dir_bh;
- struct ext4_dir_entry_2 *old_de, *new_de;
- int retval, force_da_alloc = 0;
-
- dquot_initialize(old_dir);
- dquot_initialize(new_dir);
+ handle_t *handle = NULL;
+ struct ext4_renament old = {
+ .dir = old_dir,
+ .dentry = old_dentry,
+ .inode = old_dentry->d_inode,
+ };
+ struct ext4_renament new = {
+ .dir = new_dir,
+ .dentry = new_dentry,
+ .inode = new_dentry->d_inode,
+ };
+ int retval;
- old_bh = new_bh = dir_bh = NULL;
+ dquot_initialize(old.dir);
+ dquot_initialize(new.dir);
/* Initialize quotas before so that eventual writes go
* in separate transaction */
- if (new_dentry->d_inode)
- dquot_initialize(new_dentry->d_inode);
- handle = ext4_journal_start(old_dir, 2 *
- EXT4_DATA_TRANS_BLOCKS(old_dir->i_sb) +
- EXT4_INDEX_EXTRA_TRANS_BLOCKS + 2);
- if (IS_ERR(handle))
- return PTR_ERR(handle);
-
- if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir))
- ext4_handle_sync(handle);
+ if (new.inode)
+ dquot_initialize(new.inode);
- old_bh = ext4_find_entry(old_dir, &old_dentry->d_name, &old_de);
+ old.bh = ext4_find_entry(old.dir, &old.dentry->d_name, &old.de, NULL);
/*
* Check for inode number is _not_ due to possible IO errors.
* We might rmdir the source, keep it as pwd of some process
* and merrily kill the link to whatever was created under the
* same name. Goodbye sticky bit ;-<
*/
- old_inode = old_dentry->d_inode;
retval = -ENOENT;
- if (!old_bh || le32_to_cpu(old_de->inode) != old_inode->i_ino)
+ if (!old.bh || le32_to_cpu(old.de->inode) != old.inode->i_ino)
goto end_rename;
- new_inode = new_dentry->d_inode;
- new_bh = ext4_find_entry(new_dir, &new_dentry->d_name, &new_de);
- if (new_bh) {
- if (!new_inode) {
- brelse(new_bh);
- new_bh = NULL;
+ new.bh = ext4_find_entry(new.dir, &new.dentry->d_name,
+ &new.de, &new.inlined);
+ if (new.bh) {
+ if (!new.inode) {
+ brelse(new.bh);
+ new.bh = NULL;
}
}
- if (S_ISDIR(old_inode->i_mode)) {
- if (new_inode) {
+ if (new.inode && !test_opt(new.dir->i_sb, NO_AUTO_DA_ALLOC))
+ ext4_alloc_da_blocks(old.inode);
+
+ handle = ext4_journal_start(old.dir, EXT4_HT_DIR,
+ (2 * EXT4_DATA_TRANS_BLOCKS(old.dir->i_sb) +
+ EXT4_INDEX_EXTRA_TRANS_BLOCKS + 2));
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
+
+ if (IS_DIRSYNC(old.dir) || IS_DIRSYNC(new.dir))
+ ext4_handle_sync(handle);
+
+ if (S_ISDIR(old.inode->i_mode)) {
+ if (new.inode) {
retval = -ENOTEMPTY;
- if (!empty_dir(new_inode))
+ if (!empty_dir(new.inode))
+ goto end_rename;
+ } else {
+ retval = -EMLINK;
+ if (new.dir != old.dir && EXT4_DIR_LINK_MAX(new.dir))
goto end_rename;
}
- retval = -EIO;
- dir_bh = ext4_bread(handle, old_inode, 0, 0, &retval);
- if (!dir_bh)
- goto end_rename;
- if (le32_to_cpu(PARENT_INO(dir_bh->b_data,
- old_dir->i_sb->s_blocksize)) != old_dir->i_ino)
- goto end_rename;
- retval = -EMLINK;
- if (!new_inode && new_dir != old_dir &&
- EXT4_DIR_LINK_MAX(new_dir))
+ retval = ext4_rename_dir_prepare(handle, &old);
+ if (retval)
goto end_rename;
}
- if (!new_bh) {
- retval = ext4_add_entry(handle, new_dentry, old_inode);
+ if (!new.bh) {
+ retval = ext4_add_entry(handle, new.dentry, old.inode);
if (retval)
goto end_rename;
} else {
- BUFFER_TRACE(new_bh, "get write access");
- ext4_journal_get_write_access(handle, new_bh);
- new_de->inode = cpu_to_le32(old_inode->i_ino);
- if (EXT4_HAS_INCOMPAT_FEATURE(new_dir->i_sb,
- EXT4_FEATURE_INCOMPAT_FILETYPE))
- new_de->file_type = old_de->file_type;
- new_dir->i_version++;
- new_dir->i_ctime = new_dir->i_mtime =
- ext4_current_time(new_dir);
- ext4_mark_inode_dirty(handle, new_dir);
- BUFFER_TRACE(new_bh, "call ext4_handle_dirty_metadata");
- ext4_handle_dirty_metadata(handle, new_dir, new_bh);
- brelse(new_bh);
- new_bh = NULL;
+ retval = ext4_setent(handle, &new,
+ old.inode->i_ino, old.de->file_type);
+ if (retval)
+ goto end_rename;
}
/*
* Like most other Unix systems, set the ctime for inodes on a
* rename.
*/
- old_inode->i_ctime = ext4_current_time(old_inode);
- ext4_mark_inode_dirty(handle, old_inode);
+ old.inode->i_ctime = ext4_current_time(old.inode);
+ ext4_mark_inode_dirty(handle, old.inode);
/*
* ok, that's it
*/
- if (le32_to_cpu(old_de->inode) != old_inode->i_ino ||
- old_de->name_len != old_dentry->d_name.len ||
- strncmp(old_de->name, old_dentry->d_name.name, old_de->name_len) ||
- (retval = ext4_delete_entry(handle, old_dir,
- old_de, old_bh)) == -ENOENT) {
- /* old_de could have moved from under us during htree split, so
- * make sure that we are deleting the right entry. We might
- * also be pointing to a stale entry in the unused part of
- * old_bh so just checking inum and the name isn't enough. */
- struct buffer_head *old_bh2;
- struct ext4_dir_entry_2 *old_de2;
-
- old_bh2 = ext4_find_entry(old_dir, &old_dentry->d_name, &old_de2);
- if (old_bh2) {
- retval = ext4_delete_entry(handle, old_dir,
- old_de2, old_bh2);
- brelse(old_bh2);
- }
+ ext4_rename_delete(handle, &old);
+
+ if (new.inode) {
+ ext4_dec_count(handle, new.inode);
+ new.inode->i_ctime = ext4_current_time(new.inode);
}
- if (retval) {
- ext4_warning(old_dir->i_sb,
- "Deleting old file (%lu), %d, error=%d",
- old_dir->i_ino, old_dir->i_nlink, retval);
- }
-
- if (new_inode) {
- ext4_dec_count(handle, new_inode);
- new_inode->i_ctime = ext4_current_time(new_inode);
- }
- old_dir->i_ctime = old_dir->i_mtime = ext4_current_time(old_dir);
- ext4_update_dx_flag(old_dir);
- if (dir_bh) {
- BUFFER_TRACE(dir_bh, "get_write_access");
- ext4_journal_get_write_access(handle, dir_bh);
- PARENT_INO(dir_bh->b_data, new_dir->i_sb->s_blocksize) =
- cpu_to_le32(new_dir->i_ino);
- BUFFER_TRACE(dir_bh, "call ext4_handle_dirty_metadata");
- ext4_handle_dirty_metadata(handle, old_dir, dir_bh);
- ext4_dec_count(handle, old_dir);
- if (new_inode) {
+ old.dir->i_ctime = old.dir->i_mtime = ext4_current_time(old.dir);
+ ext4_update_dx_flag(old.dir);
+ if (old.dir_bh) {
+ retval = ext4_rename_dir_finish(handle, &old, new.dir->i_ino);
+ if (retval)
+ goto end_rename;
+
+ ext4_dec_count(handle, old.dir);
+ if (new.inode) {
/* checked empty_dir above, can't have another parent,
* ext4_dec_count() won't work for many-linked dirs */
- new_inode->i_nlink = 0;
+ clear_nlink(new.inode);
} else {
- ext4_inc_count(handle, new_dir);
- ext4_update_dx_flag(new_dir);
- ext4_mark_inode_dirty(handle, new_dir);
+ ext4_inc_count(handle, new.dir);
+ ext4_update_dx_flag(new.dir);
+ ext4_mark_inode_dirty(handle, new.dir);
}
}
- ext4_mark_inode_dirty(handle, old_dir);
- if (new_inode) {
- ext4_mark_inode_dirty(handle, new_inode);
- if (!new_inode->i_nlink)
- ext4_orphan_add(handle, new_inode);
- if (!test_opt(new_dir->i_sb, NO_AUTO_DA_ALLOC))
- force_da_alloc = 1;
+ ext4_mark_inode_dirty(handle, old.dir);
+ if (new.inode) {
+ ext4_mark_inode_dirty(handle, new.inode);
+ if (!new.inode->i_nlink)
+ ext4_orphan_add(handle, new.inode);
}
retval = 0;
end_rename:
- brelse(dir_bh);
- brelse(old_bh);
- brelse(new_bh);
- ext4_journal_stop(handle);
- if (retval == 0 && force_da_alloc)
- ext4_alloc_da_blocks(old_inode);
+ brelse(old.dir_bh);
+ brelse(old.bh);
+ brelse(new.bh);
+ if (handle)
+ ext4_journal_stop(handle);
+ return retval;
+}
+
+static int ext4_cross_rename(struct inode *old_dir, struct dentry *old_dentry,
+ struct inode *new_dir, struct dentry *new_dentry)
+{
+ handle_t *handle = NULL;
+ struct ext4_renament old = {
+ .dir = old_dir,
+ .dentry = old_dentry,
+ .inode = old_dentry->d_inode,
+ };
+ struct ext4_renament new = {
+ .dir = new_dir,
+ .dentry = new_dentry,
+ .inode = new_dentry->d_inode,
+ };
+ u8 new_file_type;
+ int retval;
+
+ dquot_initialize(old.dir);
+ dquot_initialize(new.dir);
+
+ old.bh = ext4_find_entry(old.dir, &old.dentry->d_name,
+ &old.de, &old.inlined);
+ /*
+ * Check for inode number is _not_ due to possible IO errors.
+ * We might rmdir the source, keep it as pwd of some process
+ * and merrily kill the link to whatever was created under the
+ * same name. Goodbye sticky bit ;-<
+ */
+ retval = -ENOENT;
+ if (!old.bh || le32_to_cpu(old.de->inode) != old.inode->i_ino)
+ goto end_rename;
+
+ new.bh = ext4_find_entry(new.dir, &new.dentry->d_name,
+ &new.de, &new.inlined);
+
+ /* RENAME_EXCHANGE case: old *and* new must both exist */
+ if (!new.bh || le32_to_cpu(new.de->inode) != new.inode->i_ino)
+ goto end_rename;
+
+ handle = ext4_journal_start(old.dir, EXT4_HT_DIR,
+ (2 * EXT4_DATA_TRANS_BLOCKS(old.dir->i_sb) +
+ 2 * EXT4_INDEX_EXTRA_TRANS_BLOCKS + 2));
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
+
+ if (IS_DIRSYNC(old.dir) || IS_DIRSYNC(new.dir))
+ ext4_handle_sync(handle);
+
+ if (S_ISDIR(old.inode->i_mode)) {
+ old.is_dir = true;
+ retval = ext4_rename_dir_prepare(handle, &old);
+ if (retval)
+ goto end_rename;
+ }
+ if (S_ISDIR(new.inode->i_mode)) {
+ new.is_dir = true;
+ retval = ext4_rename_dir_prepare(handle, &new);
+ if (retval)
+ goto end_rename;
+ }
+
+ /*
+ * Other than the special case of overwriting a directory, parents'
+ * nlink only needs to be modified if this is a cross directory rename.
+ */
+ if (old.dir != new.dir && old.is_dir != new.is_dir) {
+ old.dir_nlink_delta = old.is_dir ? -1 : 1;
+ new.dir_nlink_delta = -old.dir_nlink_delta;
+ retval = -EMLINK;
+ if ((old.dir_nlink_delta > 0 && EXT4_DIR_LINK_MAX(old.dir)) ||
+ (new.dir_nlink_delta > 0 && EXT4_DIR_LINK_MAX(new.dir)))
+ goto end_rename;
+ }
+
+ new_file_type = new.de->file_type;
+ retval = ext4_setent(handle, &new, old.inode->i_ino, old.de->file_type);
+ if (retval)
+ goto end_rename;
+
+ retval = ext4_setent(handle, &old, new.inode->i_ino, new_file_type);
+ if (retval)
+ goto end_rename;
+
+ /*
+ * Like most other Unix systems, set the ctime for inodes on a
+ * rename.
+ */
+ old.inode->i_ctime = ext4_current_time(old.inode);
+ new.inode->i_ctime = ext4_current_time(new.inode);
+ ext4_mark_inode_dirty(handle, old.inode);
+ ext4_mark_inode_dirty(handle, new.inode);
+
+ if (old.dir_bh) {
+ retval = ext4_rename_dir_finish(handle, &old, new.dir->i_ino);
+ if (retval)
+ goto end_rename;
+ }
+ if (new.dir_bh) {
+ retval = ext4_rename_dir_finish(handle, &new, old.dir->i_ino);
+ if (retval)
+ goto end_rename;
+ }
+ ext4_update_dir_count(handle, &old);
+ ext4_update_dir_count(handle, &new);
+ retval = 0;
+
+end_rename:
+ brelse(old.dir_bh);
+ brelse(new.dir_bh);
+ brelse(old.bh);
+ brelse(new.bh);
+ if (handle)
+ ext4_journal_stop(handle);
return retval;
}
+static int ext4_rename2(struct inode *old_dir, struct dentry *old_dentry,
+ struct inode *new_dir, struct dentry *new_dentry,
+ unsigned int flags)
+{
+ if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE))
+ return -EINVAL;
+
+ if (flags & RENAME_EXCHANGE) {
+ return ext4_cross_rename(old_dir, old_dentry,
+ new_dir, new_dentry);
+ }
+ /*
+ * Existence checking was done by the VFS, otherwise "RENAME_NOREPLACE"
+ * is equivalent to regular rename.
+ */
+ return ext4_rename(old_dir, old_dentry, new_dir, new_dentry);
+}
+
/*
* directories can handle most operations...
*/
@@ -2503,25 +3454,25 @@ const struct inode_operations ext4_dir_inode_operations = {
.mkdir = ext4_mkdir,
.rmdir = ext4_rmdir,
.mknod = ext4_mknod,
+ .tmpfile = ext4_tmpfile,
.rename = ext4_rename,
+ .rename2 = ext4_rename2,
.setattr = ext4_setattr,
-#ifdef CONFIG_EXT4_FS_XATTR
.setxattr = generic_setxattr,
.getxattr = generic_getxattr,
.listxattr = ext4_listxattr,
.removexattr = generic_removexattr,
-#endif
- .check_acl = ext4_check_acl,
+ .get_acl = ext4_get_acl,
+ .set_acl = ext4_set_acl,
.fiemap = ext4_fiemap,
};
const struct inode_operations ext4_special_inode_operations = {
.setattr = ext4_setattr,
-#ifdef CONFIG_EXT4_FS_XATTR
.setxattr = generic_setxattr,
.getxattr = generic_getxattr,
.listxattr = ext4_listxattr,
.removexattr = generic_removexattr,
-#endif
- .check_acl = ext4_check_acl,
+ .get_acl = ext4_get_acl,
+ .set_acl = ext4_set_acl,
};
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 7f5451cd1d3..b24a2541a9b 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -6,7 +6,6 @@
* Written by Theodore Ts'o, 2010.
*/
-#include <linux/module.h>
#include <linux/fs.h>
#include <linux/time.h>
#include <linux/jbd2.h>
@@ -19,258 +18,323 @@
#include <linux/pagevec.h>
#include <linux/mpage.h>
#include <linux/namei.h>
+#include <linux/aio.h>
#include <linux/uio.h>
#include <linux/bio.h>
#include <linux/workqueue.h>
#include <linux/kernel.h>
#include <linux/slab.h>
+#include <linux/mm.h>
+#include <linux/ratelimit.h>
#include "ext4_jbd2.h"
#include "xattr.h"
#include "acl.h"
-#include "ext4_extents.h"
-static struct kmem_cache *io_page_cachep, *io_end_cachep;
-
-#define WQ_HASH_SZ 37
-#define to_ioend_wq(v) (&ioend_wq[((unsigned long)v) % WQ_HASH_SZ])
-static wait_queue_head_t ioend_wq[WQ_HASH_SZ];
+static struct kmem_cache *io_end_cachep;
int __init ext4_init_pageio(void)
{
- int i;
-
- io_page_cachep = KMEM_CACHE(ext4_io_page, SLAB_RECLAIM_ACCOUNT);
- if (io_page_cachep == NULL)
- return -ENOMEM;
io_end_cachep = KMEM_CACHE(ext4_io_end, SLAB_RECLAIM_ACCOUNT);
- if (io_page_cachep == NULL) {
- kmem_cache_destroy(io_page_cachep);
+ if (io_end_cachep == NULL)
return -ENOMEM;
- }
- for (i = 0; i < WQ_HASH_SZ; i++)
- init_waitqueue_head(&ioend_wq[i]);
-
return 0;
}
void ext4_exit_pageio(void)
{
kmem_cache_destroy(io_end_cachep);
- kmem_cache_destroy(io_page_cachep);
}
-void ext4_ioend_wait(struct inode *inode)
+/*
+ * Print an buffer I/O error compatible with the fs/buffer.c. This
+ * provides compatibility with dmesg scrapers that look for a specific
+ * buffer I/O error message. We really need a unified error reporting
+ * structure to userspace ala Digital Unix's uerf system, but it's
+ * probably not going to happen in my lifetime, due to LKML politics...
+ */
+static void buffer_io_error(struct buffer_head *bh)
+{
+ char b[BDEVNAME_SIZE];
+ printk_ratelimited(KERN_ERR "Buffer I/O error on device %s, logical block %llu\n",
+ bdevname(bh->b_bdev, b),
+ (unsigned long long)bh->b_blocknr);
+}
+
+static void ext4_finish_bio(struct bio *bio)
{
- wait_queue_head_t *wq = to_ioend_wq(inode);
+ int i;
+ int error = !test_bit(BIO_UPTODATE, &bio->bi_flags);
+ struct bio_vec *bvec;
+
+ bio_for_each_segment_all(bvec, bio, i) {
+ struct page *page = bvec->bv_page;
+ struct buffer_head *bh, *head;
+ unsigned bio_start = bvec->bv_offset;
+ unsigned bio_end = bio_start + bvec->bv_len;
+ unsigned under_io = 0;
+ unsigned long flags;
- wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_ioend_count) == 0));
+ if (!page)
+ continue;
+
+ if (error) {
+ SetPageError(page);
+ set_bit(AS_EIO, &page->mapping->flags);
+ }
+ bh = head = page_buffers(page);
+ /*
+ * We check all buffers in the page under BH_Uptodate_Lock
+ * to avoid races with other end io clearing async_write flags
+ */
+ local_irq_save(flags);
+ bit_spin_lock(BH_Uptodate_Lock, &head->b_state);
+ do {
+ if (bh_offset(bh) < bio_start ||
+ bh_offset(bh) + bh->b_size > bio_end) {
+ if (buffer_async_write(bh))
+ under_io++;
+ continue;
+ }
+ clear_buffer_async_write(bh);
+ if (error)
+ buffer_io_error(bh);
+ } while ((bh = bh->b_this_page) != head);
+ bit_spin_unlock(BH_Uptodate_Lock, &head->b_state);
+ local_irq_restore(flags);
+ if (!under_io)
+ end_page_writeback(page);
+ }
}
-static void put_io_page(struct ext4_io_page *io_page)
+static void ext4_release_io_end(ext4_io_end_t *io_end)
{
- if (atomic_dec_and_test(&io_page->p_count)) {
- end_page_writeback(io_page->p_page);
- put_page(io_page->p_page);
- kmem_cache_free(io_page_cachep, io_page);
+ struct bio *bio, *next_bio;
+
+ BUG_ON(!list_empty(&io_end->list));
+ BUG_ON(io_end->flag & EXT4_IO_END_UNWRITTEN);
+ WARN_ON(io_end->handle);
+
+ if (atomic_dec_and_test(&EXT4_I(io_end->inode)->i_ioend_count))
+ wake_up_all(ext4_ioend_wq(io_end->inode));
+
+ for (bio = io_end->bio; bio; bio = next_bio) {
+ next_bio = bio->bi_private;
+ ext4_finish_bio(bio);
+ bio_put(bio);
}
+ kmem_cache_free(io_end_cachep, io_end);
}
-void ext4_free_io_end(ext4_io_end_t *io)
+static void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end)
{
- int i;
- wait_queue_head_t *wq;
-
- BUG_ON(!io);
- if (io->page)
- put_page(io->page);
- for (i = 0; i < io->num_io_pages; i++)
- put_io_page(io->pages[i]);
- io->num_io_pages = 0;
- wq = to_ioend_wq(io->inode);
- if (atomic_dec_and_test(&EXT4_I(io->inode)->i_ioend_count) &&
- waitqueue_active(wq))
- wake_up_all(wq);
- kmem_cache_free(io_end_cachep, io);
+ struct inode *inode = io_end->inode;
+
+ io_end->flag &= ~EXT4_IO_END_UNWRITTEN;
+ /* Wake up anyone waiting on unwritten extent conversion */
+ if (atomic_dec_and_test(&EXT4_I(inode)->i_unwritten))
+ wake_up_all(ext4_ioend_wq(inode));
}
/*
- * check a range of space and convert unwritten extents to written.
+ * Check a range of space and convert unwritten extents to written. Note that
+ * we are protected from truncate touching same part of extent tree by the
+ * fact that truncate code waits for all DIO to finish (thus exclusion from
+ * direct IO is achieved) and also waits for PageWriteback bits. Thus we
+ * cannot get to ext4_ext_truncate() before all IOs overlapping that range are
+ * completed (happens from ext4_free_ioend()).
*/
-int ext4_end_io_nolock(ext4_io_end_t *io)
+static int ext4_end_io(ext4_io_end_t *io)
{
struct inode *inode = io->inode;
loff_t offset = io->offset;
ssize_t size = io->size;
+ handle_t *handle = io->handle;
int ret = 0;
ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu,list->next 0x%p,"
"list->prev 0x%p\n",
io, inode->i_ino, io->list.next, io->list.prev);
- if (list_empty(&io->list))
- return ret;
-
- if (!(io->flag & EXT4_IO_END_UNWRITTEN))
- return ret;
-
- ret = ext4_convert_unwritten_extents(inode, offset, size);
+ io->handle = NULL; /* Following call will use up the handle */
+ ret = ext4_convert_unwritten_extents(handle, inode, offset, size);
if (ret < 0) {
- printk(KERN_EMERG "%s: failed to convert unwritten "
- "extents to written extents, error is %d "
- "io is still on inode %lu aio dio list\n",
- __func__, ret, inode->i_ino);
- return ret;
+ ext4_msg(inode->i_sb, KERN_EMERG,
+ "failed to convert unwritten extents to written "
+ "extents -- potential data loss! "
+ "(inode %lu, offset %llu, size %zd, error %d)",
+ inode->i_ino, offset, size, ret);
}
-
- if (io->iocb)
- aio_complete(io->iocb, io->result, 0);
- /* clear the DIO AIO unwritten flag */
- io->flag &= ~EXT4_IO_END_UNWRITTEN;
+ ext4_clear_io_unwritten_flag(io);
+ ext4_release_io_end(io);
return ret;
}
-/*
- * work on completed aio dio IO, to convert unwritten extents to extents
- */
-static void ext4_end_io_work(struct work_struct *work)
+static void dump_completed_IO(struct inode *inode, struct list_head *head)
{
- ext4_io_end_t *io = container_of(work, ext4_io_end_t, work);
- struct inode *inode = io->inode;
- struct ext4_inode_info *ei = EXT4_I(inode);
- unsigned long flags;
- int ret;
-
- mutex_lock(&inode->i_mutex);
- ret = ext4_end_io_nolock(io);
- if (ret < 0) {
- mutex_unlock(&inode->i_mutex);
+#ifdef EXT4FS_DEBUG
+ struct list_head *cur, *before, *after;
+ ext4_io_end_t *io, *io0, *io1;
+
+ if (list_empty(head))
return;
+
+ ext4_debug("Dump inode %lu completed io list\n", inode->i_ino);
+ list_for_each_entry(io, head, list) {
+ cur = &io->list;
+ before = cur->prev;
+ io0 = container_of(before, ext4_io_end_t, list);
+ after = cur->next;
+ io1 = container_of(after, ext4_io_end_t, list);
+
+ ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n",
+ io, inode->i_ino, io0, io1);
}
+#endif
+}
+/* Add the io_end to per-inode completed end_io list. */
+static void ext4_add_complete_io(ext4_io_end_t *io_end)
+{
+ struct ext4_inode_info *ei = EXT4_I(io_end->inode);
+ struct ext4_sb_info *sbi = EXT4_SB(io_end->inode->i_sb);
+ struct workqueue_struct *wq;
+ unsigned long flags;
+
+ /* Only reserved conversions from writeback should enter here */
+ WARN_ON(!(io_end->flag & EXT4_IO_END_UNWRITTEN));
+ WARN_ON(!io_end->handle && sbi->s_journal);
spin_lock_irqsave(&ei->i_completed_io_lock, flags);
- if (!list_empty(&io->list))
- list_del_init(&io->list);
+ wq = sbi->rsv_conversion_wq;
+ if (list_empty(&ei->i_rsv_conversion_list))
+ queue_work(wq, &ei->i_rsv_conversion_work);
+ list_add_tail(&io_end->list, &ei->i_rsv_conversion_list);
spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
- mutex_unlock(&inode->i_mutex);
- ext4_free_io_end(io);
}
-ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags)
+static int ext4_do_flush_completed_IO(struct inode *inode,
+ struct list_head *head)
{
- ext4_io_end_t *io = NULL;
+ ext4_io_end_t *io;
+ struct list_head unwritten;
+ unsigned long flags;
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ int err, ret = 0;
- io = kmem_cache_alloc(io_end_cachep, flags);
+ spin_lock_irqsave(&ei->i_completed_io_lock, flags);
+ dump_completed_IO(inode, head);
+ list_replace_init(head, &unwritten);
+ spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
+
+ while (!list_empty(&unwritten)) {
+ io = list_entry(unwritten.next, ext4_io_end_t, list);
+ BUG_ON(!(io->flag & EXT4_IO_END_UNWRITTEN));
+ list_del_init(&io->list);
+
+ err = ext4_end_io(io);
+ if (unlikely(!ret && err))
+ ret = err;
+ }
+ return ret;
+}
+
+/*
+ * work on completed IO, to convert unwritten extents to extents
+ */
+void ext4_end_io_rsv_work(struct work_struct *work)
+{
+ struct ext4_inode_info *ei = container_of(work, struct ext4_inode_info,
+ i_rsv_conversion_work);
+ ext4_do_flush_completed_IO(&ei->vfs_inode, &ei->i_rsv_conversion_list);
+}
+
+ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags)
+{
+ ext4_io_end_t *io = kmem_cache_zalloc(io_end_cachep, flags);
if (io) {
- memset(io, 0, sizeof(*io));
atomic_inc(&EXT4_I(inode)->i_ioend_count);
io->inode = inode;
- INIT_WORK(&io->work, ext4_end_io_work);
INIT_LIST_HEAD(&io->list);
+ atomic_set(&io->count, 1);
}
return io;
}
-/*
- * Print an buffer I/O error compatible with the fs/buffer.c. This
- * provides compatibility with dmesg scrapers that look for a specific
- * buffer I/O error message. We really need a unified error reporting
- * structure to userspace ala Digital Unix's uerf system, but it's
- * probably not going to happen in my lifetime, due to LKML politics...
- */
-static void buffer_io_error(struct buffer_head *bh)
+void ext4_put_io_end_defer(ext4_io_end_t *io_end)
{
- char b[BDEVNAME_SIZE];
- printk(KERN_ERR "Buffer I/O error on device %s, logical block %llu\n",
- bdevname(bh->b_bdev, b),
- (unsigned long long)bh->b_blocknr);
+ if (atomic_dec_and_test(&io_end->count)) {
+ if (!(io_end->flag & EXT4_IO_END_UNWRITTEN) || !io_end->size) {
+ ext4_release_io_end(io_end);
+ return;
+ }
+ ext4_add_complete_io(io_end);
+ }
+}
+
+int ext4_put_io_end(ext4_io_end_t *io_end)
+{
+ int err = 0;
+
+ if (atomic_dec_and_test(&io_end->count)) {
+ if (io_end->flag & EXT4_IO_END_UNWRITTEN) {
+ err = ext4_convert_unwritten_extents(io_end->handle,
+ io_end->inode, io_end->offset,
+ io_end->size);
+ io_end->handle = NULL;
+ ext4_clear_io_unwritten_flag(io_end);
+ }
+ ext4_release_io_end(io_end);
+ }
+ return err;
+}
+
+ext4_io_end_t *ext4_get_io_end(ext4_io_end_t *io_end)
+{
+ atomic_inc(&io_end->count);
+ return io_end;
}
+/* BIO completion function for page writeback */
static void ext4_end_bio(struct bio *bio, int error)
{
ext4_io_end_t *io_end = bio->bi_private;
- struct workqueue_struct *wq;
- struct inode *inode;
- unsigned long flags;
- int i;
+ sector_t bi_sector = bio->bi_iter.bi_sector;
BUG_ON(!io_end);
- bio->bi_private = NULL;
bio->bi_end_io = NULL;
if (test_bit(BIO_UPTODATE, &bio->bi_flags))
error = 0;
- bio_put(bio);
-
- for (i = 0; i < io_end->num_io_pages; i++) {
- struct page *page = io_end->pages[i]->p_page;
- struct buffer_head *bh, *head;
- int partial_write = 0;
-
- head = page_buffers(page);
- if (error)
- SetPageError(page);
- BUG_ON(!head);
- if (head->b_size == PAGE_CACHE_SIZE)
- clear_buffer_dirty(head);
- else {
- loff_t offset;
- loff_t io_end_offset = io_end->offset + io_end->size;
-
- offset = (sector_t) page->index << PAGE_CACHE_SHIFT;
- bh = head;
- do {
- if ((offset >= io_end->offset) &&
- (offset+bh->b_size <= io_end_offset)) {
- if (error)
- buffer_io_error(bh);
-
- clear_buffer_dirty(bh);
- }
- if (buffer_delay(bh))
- partial_write = 1;
- else if (!buffer_mapped(bh))
- clear_buffer_dirty(bh);
- else if (buffer_dirty(bh))
- partial_write = 1;
- offset += bh->b_size;
- bh = bh->b_this_page;
- } while (bh != head);
- }
-
- put_io_page(io_end->pages[i]);
-
- /*
- * If this is a partial write which happened to make
- * all buffers uptodate then we can optimize away a
- * bogus readpage() for the next read(). Here we
- * 'discover' whether the page went uptodate as a
- * result of this (potentially partial) write.
- */
- if (!partial_write)
- SetPageUptodate(page);
- }
- io_end->num_io_pages = 0;
- inode = io_end->inode;
if (error) {
- io_end->flag |= EXT4_IO_END_ERROR;
- ext4_warning(inode->i_sb, "I/O error writing to inode %lu "
+ struct inode *inode = io_end->inode;
+
+ ext4_warning(inode->i_sb, "I/O error %d writing to inode %lu "
"(offset %llu size %ld starting block %llu)",
- inode->i_ino,
+ error, inode->i_ino,
(unsigned long long) io_end->offset,
(long) io_end->size,
(unsigned long long)
- bio->bi_sector >> (inode->i_blkbits - 9));
+ bi_sector >> (inode->i_blkbits - 9));
+ mapping_set_error(inode->i_mapping, error);
}
- /* Add the io_end to per-inode completed io list*/
- spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags);
- list_add_tail(&io_end->list, &EXT4_I(inode)->i_completed_io_list);
- spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags);
-
- wq = EXT4_SB(inode->i_sb)->dio_unwritten_wq;
- /* queue the work to convert unwritten extents to written */
- queue_work(wq, &io_end->work);
+ if (io_end->flag & EXT4_IO_END_UNWRITTEN) {
+ /*
+ * Link bio into list hanging from io_end. We have to do it
+ * atomically as bio completions can be racing against each
+ * other.
+ */
+ bio->bi_private = xchg(&io_end->bio, bio);
+ ext4_put_io_end_defer(io_end);
+ } else {
+ /*
+ * Drop io_end reference early. Inode can get freed once
+ * we finish the bio.
+ */
+ ext4_put_io_end_defer(io_end);
+ ext4_finish_bio(bio);
+ bio_put(bio);
+ }
}
void ext4_io_submit(struct ext4_io_submit *io)
@@ -283,149 +347,151 @@ void ext4_io_submit(struct ext4_io_submit *io)
BUG_ON(bio_flagged(io->io_bio, BIO_EOPNOTSUPP));
bio_put(io->io_bio);
}
- io->io_bio = 0;
- io->io_op = 0;
- io->io_end = 0;
+ io->io_bio = NULL;
}
-static int io_submit_init(struct ext4_io_submit *io,
- struct inode *inode,
- struct writeback_control *wbc,
- struct buffer_head *bh)
+void ext4_io_submit_init(struct ext4_io_submit *io,
+ struct writeback_control *wbc)
+{
+ io->io_op = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE);
+ io->io_bio = NULL;
+ io->io_end = NULL;
+}
+
+static int io_submit_init_bio(struct ext4_io_submit *io,
+ struct buffer_head *bh)
{
- ext4_io_end_t *io_end;
- struct page *page = bh->b_page;
int nvecs = bio_get_nr_vecs(bh->b_bdev);
struct bio *bio;
- io_end = ext4_init_io_end(inode, GFP_NOFS);
- if (!io_end)
+ bio = bio_alloc(GFP_NOIO, min(nvecs, BIO_MAX_PAGES));
+ if (!bio)
return -ENOMEM;
- do {
- bio = bio_alloc(GFP_NOIO, nvecs);
- nvecs >>= 1;
- } while (bio == NULL);
-
- bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
+ bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
bio->bi_bdev = bh->b_bdev;
- bio->bi_private = io->io_end = io_end;
bio->bi_end_io = ext4_end_bio;
-
- io_end->offset = (page->index << PAGE_CACHE_SHIFT) + bh_offset(bh);
-
+ bio->bi_private = ext4_get_io_end(io->io_end);
io->io_bio = bio;
- io->io_op = (wbc->sync_mode == WB_SYNC_ALL ?
- WRITE_SYNC_PLUG : WRITE);
io->io_next_block = bh->b_blocknr;
return 0;
}
static int io_submit_add_bh(struct ext4_io_submit *io,
- struct ext4_io_page *io_page,
struct inode *inode,
- struct writeback_control *wbc,
struct buffer_head *bh)
{
- ext4_io_end_t *io_end;
int ret;
- if (buffer_new(bh)) {
- clear_buffer_new(bh);
- unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr);
- }
-
- if (!buffer_mapped(bh) || buffer_delay(bh)) {
- if (!buffer_mapped(bh))
- clear_buffer_dirty(bh);
- if (io->io_bio)
- ext4_io_submit(io);
- return 0;
- }
-
if (io->io_bio && bh->b_blocknr != io->io_next_block) {
submit_and_retry:
ext4_io_submit(io);
}
if (io->io_bio == NULL) {
- ret = io_submit_init(io, inode, wbc, bh);
+ ret = io_submit_init_bio(io, bh);
if (ret)
return ret;
}
- io_end = io->io_end;
- if ((io_end->num_io_pages >= MAX_IO_PAGES) &&
- (io_end->pages[io_end->num_io_pages-1] != io_page))
- goto submit_and_retry;
- if (buffer_uninit(bh))
- io->io_end->flag |= EXT4_IO_END_UNWRITTEN;
- io->io_end->size += bh->b_size;
- io->io_next_block++;
ret = bio_add_page(io->io_bio, bh->b_page, bh->b_size, bh_offset(bh));
if (ret != bh->b_size)
goto submit_and_retry;
- if ((io_end->num_io_pages == 0) ||
- (io_end->pages[io_end->num_io_pages-1] != io_page)) {
- io_end->pages[io_end->num_io_pages++] = io_page;
- atomic_inc(&io_page->p_count);
- }
+ io->io_next_block++;
return 0;
}
int ext4_bio_write_page(struct ext4_io_submit *io,
struct page *page,
int len,
- struct writeback_control *wbc)
+ struct writeback_control *wbc,
+ bool keep_towrite)
{
struct inode *inode = page->mapping->host;
- unsigned block_start, block_end, blocksize;
- struct ext4_io_page *io_page;
+ unsigned block_start, blocksize;
struct buffer_head *bh, *head;
int ret = 0;
+ int nr_submitted = 0;
blocksize = 1 << inode->i_blkbits;
+ BUG_ON(!PageLocked(page));
BUG_ON(PageWriteback(page));
- set_page_writeback(page);
+
+ if (keep_towrite)
+ set_page_writeback_keepwrite(page);
+ else
+ set_page_writeback(page);
ClearPageError(page);
- io_page = kmem_cache_alloc(io_page_cachep, GFP_NOFS);
- if (!io_page) {
- set_page_dirty(page);
- unlock_page(page);
- return -ENOMEM;
- }
- io_page->p_page = page;
- atomic_set(&io_page->p_count, 1);
- get_page(page);
-
- for (bh = head = page_buffers(page), block_start = 0;
- bh != head || !block_start;
- block_start = block_end, bh = bh->b_this_page) {
- block_end = block_start + blocksize;
+ /*
+ * Comments copied from block_write_full_page:
+ *
+ * The page straddles i_size. It must be zeroed out on each and every
+ * writepage invocation because it may be mmapped. "A file is mapped
+ * in multiples of the page size. For a file that is not a multiple of
+ * the page size, the remaining memory is zeroed when mapped, and
+ * writes to that region are not written out to the file."
+ */
+ if (len < PAGE_CACHE_SIZE)
+ zero_user_segment(page, len, PAGE_CACHE_SIZE);
+ /*
+ * In the first loop we prepare and mark buffers to submit. We have to
+ * mark all buffers in the page before submitting so that
+ * end_page_writeback() cannot be called from ext4_bio_end_io() when IO
+ * on the first buffer finishes and we are still working on submitting
+ * the second buffer.
+ */
+ bh = head = page_buffers(page);
+ do {
+ block_start = bh_offset(bh);
if (block_start >= len) {
clear_buffer_dirty(bh);
set_buffer_uptodate(bh);
continue;
}
- ret = io_submit_add_bh(io, io_page, inode, wbc, bh);
+ if (!buffer_dirty(bh) || buffer_delay(bh) ||
+ !buffer_mapped(bh) || buffer_unwritten(bh)) {
+ /* A hole? We can safely clear the dirty bit */
+ if (!buffer_mapped(bh))
+ clear_buffer_dirty(bh);
+ if (io->io_bio)
+ ext4_io_submit(io);
+ continue;
+ }
+ if (buffer_new(bh)) {
+ clear_buffer_new(bh);
+ unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr);
+ }
+ set_buffer_async_write(bh);
+ } while ((bh = bh->b_this_page) != head);
+
+ /* Now submit buffers to write */
+ bh = head = page_buffers(page);
+ do {
+ if (!buffer_async_write(bh))
+ continue;
+ ret = io_submit_add_bh(io, inode, bh);
if (ret) {
/*
* We only get here on ENOMEM. Not much else
* we can do but mark the page as dirty, and
* better luck next time.
*/
- set_page_dirty(page);
+ redirty_page_for_writepage(wbc, page);
break;
}
+ nr_submitted++;
+ clear_buffer_dirty(bh);
+ } while ((bh = bh->b_this_page) != head);
+
+ /* Error stopped previous loop? Clean up buffers... */
+ if (ret) {
+ do {
+ clear_buffer_async_write(bh);
+ bh = bh->b_this_page;
+ } while (bh != head);
}
unlock_page(page);
- /*
- * If the page was truncated before we could do the writeback,
- * or we had a memory allocation error while trying to write
- * the first buffer head, we won't have submitted any pages for
- * I/O. In that case we need to make sure we've cleared the
- * PageWriteback bit from the page to prevent the system from
- * wedging later on.
- */
- put_io_page(io_page);
+ /* Nothing submitted - we have to end page writeback */
+ if (!nr_submitted)
+ end_page_writeback(page);
return ret;
}
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index dc963929de6..bb0e80f03e2 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -16,6 +16,57 @@
#include "ext4_jbd2.h"
+int ext4_resize_begin(struct super_block *sb)
+{
+ int ret = 0;
+
+ if (!capable(CAP_SYS_RESOURCE))
+ return -EPERM;
+
+ /*
+ * We are not allowed to do online-resizing on a filesystem mounted
+ * with error, because it can destroy the filesystem easily.
+ */
+ if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) {
+ ext4_warning(sb, "There are errors in the filesystem, "
+ "so online resizing is not allowed\n");
+ return -EPERM;
+ }
+
+ if (test_and_set_bit_lock(EXT4_RESIZING, &EXT4_SB(sb)->s_resize_flags))
+ ret = -EBUSY;
+
+ return ret;
+}
+
+void ext4_resize_end(struct super_block *sb)
+{
+ clear_bit_unlock(EXT4_RESIZING, &EXT4_SB(sb)->s_resize_flags);
+ smp_mb__after_atomic();
+}
+
+static ext4_group_t ext4_meta_bg_first_group(struct super_block *sb,
+ ext4_group_t group) {
+ return (group >> EXT4_DESC_PER_BLOCK_BITS(sb)) <<
+ EXT4_DESC_PER_BLOCK_BITS(sb);
+}
+
+static ext4_fsblk_t ext4_meta_bg_first_block_no(struct super_block *sb,
+ ext4_group_t group) {
+ group = ext4_meta_bg_first_group(sb, group);
+ return ext4_group_first_block_no(sb, group);
+}
+
+static ext4_grpblk_t ext4_group_overhead_blocks(struct super_block *sb,
+ ext4_group_t group) {
+ ext4_grpblk_t overhead;
+ overhead = ext4_bg_num_gdb(sb, group);
+ if (ext4_bg_has_super(sb, group))
+ overhead += 1 +
+ le16_to_cpu(EXT4_SB(sb)->s_es->s_reserved_gdt_blocks);
+ return overhead;
+}
+
#define outside(b, first, last) ((b) < (first) || (b) >= (last))
#define inside(b, first, last) ((b) >= (first) && (b) < (last))
@@ -28,14 +79,20 @@ static int verify_group_input(struct super_block *sb,
ext4_fsblk_t end = start + input->blocks_count;
ext4_group_t group = input->group;
ext4_fsblk_t itend = input->inode_table + sbi->s_itb_per_group;
- unsigned overhead = ext4_bg_has_super(sb, group) ?
- (1 + ext4_bg_num_gdb(sb, group) +
- le16_to_cpu(es->s_reserved_gdt_blocks)) : 0;
- ext4_fsblk_t metaend = start + overhead;
+ unsigned overhead;
+ ext4_fsblk_t metaend;
struct buffer_head *bh = NULL;
ext4_grpblk_t free_blocks_count, offset;
int err = -EINVAL;
+ if (group != sbi->s_groups_count) {
+ ext4_warning(sb, "Cannot add at group %u (only %u groups)",
+ input->group, sbi->s_groups_count);
+ return -EINVAL;
+ }
+
+ overhead = ext4_group_overhead_blocks(sb, group);
+ metaend = start + overhead;
input->free_blocks_count = free_blocks_count =
input->blocks_count - 2 - overhead - sbi->s_itb_per_group;
@@ -47,10 +104,7 @@ static int verify_group_input(struct super_block *sb,
free_blocks_count, input->reserved_blocks);
ext4_get_group_no_and_offset(sb, start, NULL, &offset);
- if (group != sbi->s_groups_count)
- ext4_warning(sb, "Cannot add at group %u (only %u groups)",
- input->group, sbi->s_groups_count);
- else if (offset != 0)
+ if (offset != 0)
ext4_warning(sb, "Last group not full");
else if (input->reserved_blocks > input->blocks_count / 5)
ext4_warning(sb, "Reserved blocks too high (%u)",
@@ -105,6 +159,186 @@ static int verify_group_input(struct super_block *sb,
return err;
}
+/*
+ * ext4_new_flex_group_data is used by 64bit-resize interface to add a flex
+ * group each time.
+ */
+struct ext4_new_flex_group_data {
+ struct ext4_new_group_data *groups; /* new_group_data for groups
+ in the flex group */
+ __u16 *bg_flags; /* block group flags of groups
+ in @groups */
+ ext4_group_t count; /* number of groups in @groups
+ */
+};
+
+/*
+ * alloc_flex_gd() allocates a ext4_new_flex_group_data with size of
+ * @flexbg_size.
+ *
+ * Returns NULL on failure otherwise address of the allocated structure.
+ */
+static struct ext4_new_flex_group_data *alloc_flex_gd(unsigned long flexbg_size)
+{
+ struct ext4_new_flex_group_data *flex_gd;
+
+ flex_gd = kmalloc(sizeof(*flex_gd), GFP_NOFS);
+ if (flex_gd == NULL)
+ goto out3;
+
+ if (flexbg_size >= UINT_MAX / sizeof(struct ext4_new_flex_group_data))
+ goto out2;
+ flex_gd->count = flexbg_size;
+
+ flex_gd->groups = kmalloc(sizeof(struct ext4_new_group_data) *
+ flexbg_size, GFP_NOFS);
+ if (flex_gd->groups == NULL)
+ goto out2;
+
+ flex_gd->bg_flags = kmalloc(flexbg_size * sizeof(__u16), GFP_NOFS);
+ if (flex_gd->bg_flags == NULL)
+ goto out1;
+
+ return flex_gd;
+
+out1:
+ kfree(flex_gd->groups);
+out2:
+ kfree(flex_gd);
+out3:
+ return NULL;
+}
+
+static void free_flex_gd(struct ext4_new_flex_group_data *flex_gd)
+{
+ kfree(flex_gd->bg_flags);
+ kfree(flex_gd->groups);
+ kfree(flex_gd);
+}
+
+/*
+ * ext4_alloc_group_tables() allocates block bitmaps, inode bitmaps
+ * and inode tables for a flex group.
+ *
+ * This function is used by 64bit-resize. Note that this function allocates
+ * group tables from the 1st group of groups contained by @flexgd, which may
+ * be a partial of a flex group.
+ *
+ * @sb: super block of fs to which the groups belongs
+ *
+ * Returns 0 on a successful allocation of the metadata blocks in the
+ * block group.
+ */
+static int ext4_alloc_group_tables(struct super_block *sb,
+ struct ext4_new_flex_group_data *flex_gd,
+ int flexbg_size)
+{
+ struct ext4_new_group_data *group_data = flex_gd->groups;
+ ext4_fsblk_t start_blk;
+ ext4_fsblk_t last_blk;
+ ext4_group_t src_group;
+ ext4_group_t bb_index = 0;
+ ext4_group_t ib_index = 0;
+ ext4_group_t it_index = 0;
+ ext4_group_t group;
+ ext4_group_t last_group;
+ unsigned overhead;
+ __u16 uninit_mask = (flexbg_size > 1) ? ~EXT4_BG_BLOCK_UNINIT : ~0;
+
+ BUG_ON(flex_gd->count == 0 || group_data == NULL);
+
+ src_group = group_data[0].group;
+ last_group = src_group + flex_gd->count - 1;
+
+ BUG_ON((flexbg_size > 1) && ((src_group & ~(flexbg_size - 1)) !=
+ (last_group & ~(flexbg_size - 1))));
+next_group:
+ group = group_data[0].group;
+ if (src_group >= group_data[0].group + flex_gd->count)
+ return -ENOSPC;
+ start_blk = ext4_group_first_block_no(sb, src_group);
+ last_blk = start_blk + group_data[src_group - group].blocks_count;
+
+ overhead = ext4_group_overhead_blocks(sb, src_group);
+
+ start_blk += overhead;
+
+ /* We collect contiguous blocks as much as possible. */
+ src_group++;
+ for (; src_group <= last_group; src_group++) {
+ overhead = ext4_group_overhead_blocks(sb, src_group);
+ if (overhead == 0)
+ last_blk += group_data[src_group - group].blocks_count;
+ else
+ break;
+ }
+
+ /* Allocate block bitmaps */
+ for (; bb_index < flex_gd->count; bb_index++) {
+ if (start_blk >= last_blk)
+ goto next_group;
+ group_data[bb_index].block_bitmap = start_blk++;
+ group = ext4_get_group_number(sb, start_blk - 1);
+ group -= group_data[0].group;
+ group_data[group].free_blocks_count--;
+ flex_gd->bg_flags[group] &= uninit_mask;
+ }
+
+ /* Allocate inode bitmaps */
+ for (; ib_index < flex_gd->count; ib_index++) {
+ if (start_blk >= last_blk)
+ goto next_group;
+ group_data[ib_index].inode_bitmap = start_blk++;
+ group = ext4_get_group_number(sb, start_blk - 1);
+ group -= group_data[0].group;
+ group_data[group].free_blocks_count--;
+ flex_gd->bg_flags[group] &= uninit_mask;
+ }
+
+ /* Allocate inode tables */
+ for (; it_index < flex_gd->count; it_index++) {
+ unsigned int itb = EXT4_SB(sb)->s_itb_per_group;
+ ext4_fsblk_t next_group_start;
+
+ if (start_blk + itb > last_blk)
+ goto next_group;
+ group_data[it_index].inode_table = start_blk;
+ group = ext4_get_group_number(sb, start_blk);
+ next_group_start = ext4_group_first_block_no(sb, group + 1);
+ group -= group_data[0].group;
+
+ if (start_blk + itb > next_group_start) {
+ flex_gd->bg_flags[group + 1] &= uninit_mask;
+ overhead = start_blk + itb - next_group_start;
+ group_data[group + 1].free_blocks_count -= overhead;
+ itb -= overhead;
+ }
+
+ group_data[group].free_blocks_count -= itb;
+ flex_gd->bg_flags[group] &= uninit_mask;
+ start_blk += EXT4_SB(sb)->s_itb_per_group;
+ }
+
+ if (test_opt(sb, DEBUG)) {
+ int i;
+ group = group_data[0].group;
+
+ printk(KERN_DEBUG "EXT4-fs: adding a flex group with "
+ "%d groups, flexbg size is %d:\n", flex_gd->count,
+ flexbg_size);
+
+ for (i = 0; i < flex_gd->count; i++) {
+ printk(KERN_DEBUG "adding %s group %u: %u "
+ "blocks (%d free)\n",
+ ext4_bg_has_super(sb, group + i) ? "normal" :
+ "no-super", group + i,
+ group_data[i].blocks_count,
+ group_data[i].free_blocks_count);
+ }
+ }
+ return 0;
+}
+
static struct buffer_head *bclean(handle_t *handle, struct super_block *sb,
ext4_fsblk_t blk)
{
@@ -112,16 +346,15 @@ static struct buffer_head *bclean(handle_t *handle, struct super_block *sb,
int err;
bh = sb_getblk(sb, blk);
- if (!bh)
- return ERR_PTR(-EIO);
+ if (unlikely(!bh))
+ return ERR_PTR(-ENOMEM);
+ BUFFER_TRACE(bh, "get_write_access");
if ((err = ext4_journal_get_write_access(handle, bh))) {
brelse(bh);
bh = ERR_PTR(err);
} else {
- lock_buffer(bh);
memset(bh->b_data, 0, sb->s_blocksize);
set_buffer_uptodate(bh);
- unlock_buffer(bh);
}
return bh;
@@ -132,8 +365,7 @@ static struct buffer_head *bclean(handle_t *handle, struct super_block *sb,
* If that fails, restart the transaction & regain write access for the
* buffer head which is used for block_bitmap modifications.
*/
-static int extend_or_restart_transaction(handle_t *handle, int thresh,
- struct buffer_head *bh)
+static int extend_or_restart_transaction(handle_t *handle, int thresh)
{
int err;
@@ -144,134 +376,277 @@ static int extend_or_restart_transaction(handle_t *handle, int thresh,
if (err < 0)
return err;
if (err) {
- if ((err = ext4_journal_restart(handle, EXT4_MAX_TRANS_DATA)))
+ err = ext4_journal_restart(handle, EXT4_MAX_TRANS_DATA);
+ if (err)
return err;
- if ((err = ext4_journal_get_write_access(handle, bh)))
+ }
+
+ return 0;
+}
+
+/*
+ * set_flexbg_block_bitmap() mark @count blocks starting from @block used.
+ *
+ * Helper function for ext4_setup_new_group_blocks() which set .
+ *
+ * @sb: super block
+ * @handle: journal handle
+ * @flex_gd: flex group data
+ */
+static int set_flexbg_block_bitmap(struct super_block *sb, handle_t *handle,
+ struct ext4_new_flex_group_data *flex_gd,
+ ext4_fsblk_t block, ext4_group_t count)
+{
+ ext4_group_t count2;
+
+ ext4_debug("mark blocks [%llu/%u] used\n", block, count);
+ for (count2 = count; count > 0; count -= count2, block += count2) {
+ ext4_fsblk_t start;
+ struct buffer_head *bh;
+ ext4_group_t group;
+ int err;
+
+ group = ext4_get_group_number(sb, block);
+ start = ext4_group_first_block_no(sb, group);
+ group -= flex_gd->groups[0].group;
+
+ count2 = EXT4_BLOCKS_PER_GROUP(sb) - (block - start);
+ if (count2 > count)
+ count2 = count;
+
+ if (flex_gd->bg_flags[group] & EXT4_BG_BLOCK_UNINIT) {
+ BUG_ON(flex_gd->count > 1);
+ continue;
+ }
+
+ err = extend_or_restart_transaction(handle, 1);
+ if (err)
return err;
+
+ bh = sb_getblk(sb, flex_gd->groups[group].block_bitmap);
+ if (unlikely(!bh))
+ return -ENOMEM;
+
+ BUFFER_TRACE(bh, "get_write_access");
+ err = ext4_journal_get_write_access(handle, bh);
+ if (err)
+ return err;
+ ext4_debug("mark block bitmap %#04llx (+%llu/%u)\n", block,
+ block - start, count2);
+ ext4_set_bits(bh->b_data, block - start, count2);
+
+ err = ext4_handle_dirty_metadata(handle, NULL, bh);
+ if (unlikely(err))
+ return err;
+ brelse(bh);
}
return 0;
}
/*
- * Set up the block and inode bitmaps, and the inode table for the new group.
+ * Set up the block and inode bitmaps, and the inode table for the new groups.
* This doesn't need to be part of the main transaction, since we are only
* changing blocks outside the actual filesystem. We still do journaling to
* ensure the recovery is correct in case of a failure just after resize.
* If any part of this fails, we simply abort the resize.
+ *
+ * setup_new_flex_group_blocks handles a flex group as follow:
+ * 1. copy super block and GDT, and initialize group tables if necessary.
+ * In this step, we only set bits in blocks bitmaps for blocks taken by
+ * super block and GDT.
+ * 2. allocate group tables in block bitmaps, that is, set bits in block
+ * bitmap for blocks taken by group tables.
*/
-static int setup_new_group_blocks(struct super_block *sb,
- struct ext4_new_group_data *input)
+static int setup_new_flex_group_blocks(struct super_block *sb,
+ struct ext4_new_flex_group_data *flex_gd)
{
+ int group_table_count[] = {1, 1, EXT4_SB(sb)->s_itb_per_group};
+ ext4_fsblk_t start;
+ ext4_fsblk_t block;
struct ext4_sb_info *sbi = EXT4_SB(sb);
- ext4_fsblk_t start = ext4_group_first_block_no(sb, input->group);
- int reserved_gdb = ext4_bg_has_super(sb, input->group) ?
- le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) : 0;
- unsigned long gdblocks = ext4_bg_num_gdb(sb, input->group);
- struct buffer_head *bh;
+ struct ext4_super_block *es = sbi->s_es;
+ struct ext4_new_group_data *group_data = flex_gd->groups;
+ __u16 *bg_flags = flex_gd->bg_flags;
handle_t *handle;
- ext4_fsblk_t block;
- ext4_grpblk_t bit;
- int i;
- int err = 0, err2;
+ ext4_group_t group, count;
+ struct buffer_head *bh = NULL;
+ int reserved_gdb, i, j, err = 0, err2;
+ int meta_bg;
- /* This transaction may be extended/restarted along the way */
- handle = ext4_journal_start_sb(sb, EXT4_MAX_TRANS_DATA);
+ BUG_ON(!flex_gd->count || !group_data ||
+ group_data[0].group != sbi->s_groups_count);
+ reserved_gdb = le16_to_cpu(es->s_reserved_gdt_blocks);
+ meta_bg = EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_META_BG);
+
+ /* This transaction may be extended/restarted along the way */
+ handle = ext4_journal_start_sb(sb, EXT4_HT_RESIZE, EXT4_MAX_TRANS_DATA);
if (IS_ERR(handle))
return PTR_ERR(handle);
- mutex_lock(&sbi->s_resize_lock);
- if (input->group != sbi->s_groups_count) {
- err = -EBUSY;
- goto exit_journal;
- }
+ group = group_data[0].group;
+ for (i = 0; i < flex_gd->count; i++, group++) {
+ unsigned long gdblocks;
+ ext4_grpblk_t overhead;
- if (IS_ERR(bh = bclean(handle, sb, input->block_bitmap))) {
- err = PTR_ERR(bh);
- goto exit_journal;
- }
+ gdblocks = ext4_bg_num_gdb(sb, group);
+ start = ext4_group_first_block_no(sb, group);
- if (ext4_bg_has_super(sb, input->group)) {
- ext4_debug("mark backup superblock %#04llx (+0)\n", start);
- ext4_set_bit(0, bh->b_data);
- }
+ if (meta_bg == 0 && !ext4_bg_has_super(sb, group))
+ goto handle_itb;
- /* Copy all of the GDT blocks into the backup in this group */
- for (i = 0, bit = 1, block = start + 1;
- i < gdblocks; i++, block++, bit++) {
- struct buffer_head *gdb;
+ if (meta_bg == 1) {
+ ext4_group_t first_group;
+ first_group = ext4_meta_bg_first_group(sb, group);
+ if (first_group != group + 1 &&
+ first_group != group + EXT4_DESC_PER_BLOCK(sb) - 1)
+ goto handle_itb;
+ }
- ext4_debug("update backup group %#04llx (+%d)\n", block, bit);
+ block = start + ext4_bg_has_super(sb, group);
+ /* Copy all of the GDT blocks into the backup in this group */
+ for (j = 0; j < gdblocks; j++, block++) {
+ struct buffer_head *gdb;
+
+ ext4_debug("update backup group %#04llx\n", block);
+ err = extend_or_restart_transaction(handle, 1);
+ if (err)
+ goto out;
+
+ gdb = sb_getblk(sb, block);
+ if (unlikely(!gdb)) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ BUFFER_TRACE(gdb, "get_write_access");
+ err = ext4_journal_get_write_access(handle, gdb);
+ if (err) {
+ brelse(gdb);
+ goto out;
+ }
+ memcpy(gdb->b_data, sbi->s_group_desc[j]->b_data,
+ gdb->b_size);
+ set_buffer_uptodate(gdb);
+
+ err = ext4_handle_dirty_metadata(handle, NULL, gdb);
+ if (unlikely(err)) {
+ brelse(gdb);
+ goto out;
+ }
+ brelse(gdb);
+ }
- if ((err = extend_or_restart_transaction(handle, 1, bh)))
- goto exit_bh;
+ /* Zero out all of the reserved backup group descriptor
+ * table blocks
+ */
+ if (ext4_bg_has_super(sb, group)) {
+ err = sb_issue_zeroout(sb, gdblocks + start + 1,
+ reserved_gdb, GFP_NOFS);
+ if (err)
+ goto out;
+ }
- gdb = sb_getblk(sb, block);
- if (!gdb) {
- err = -EIO;
- goto exit_bh;
+handle_itb:
+ /* Initialize group tables of the grop @group */
+ if (!(bg_flags[i] & EXT4_BG_INODE_ZEROED))
+ goto handle_bb;
+
+ /* Zero out all of the inode table blocks */
+ block = group_data[i].inode_table;
+ ext4_debug("clear inode table blocks %#04llx -> %#04lx\n",
+ block, sbi->s_itb_per_group);
+ err = sb_issue_zeroout(sb, block, sbi->s_itb_per_group,
+ GFP_NOFS);
+ if (err)
+ goto out;
+
+handle_bb:
+ if (bg_flags[i] & EXT4_BG_BLOCK_UNINIT)
+ goto handle_ib;
+
+ /* Initialize block bitmap of the @group */
+ block = group_data[i].block_bitmap;
+ err = extend_or_restart_transaction(handle, 1);
+ if (err)
+ goto out;
+
+ bh = bclean(handle, sb, block);
+ if (IS_ERR(bh)) {
+ err = PTR_ERR(bh);
+ goto out;
}
- if ((err = ext4_journal_get_write_access(handle, gdb))) {
- brelse(gdb);
- goto exit_bh;
+ overhead = ext4_group_overhead_blocks(sb, group);
+ if (overhead != 0) {
+ ext4_debug("mark backup superblock %#04llx (+0)\n",
+ start);
+ ext4_set_bits(bh->b_data, 0, overhead);
}
- lock_buffer(gdb);
- memcpy(gdb->b_data, sbi->s_group_desc[i]->b_data, gdb->b_size);
- set_buffer_uptodate(gdb);
- unlock_buffer(gdb);
- ext4_handle_dirty_metadata(handle, NULL, gdb);
- ext4_set_bit(bit, bh->b_data);
- brelse(gdb);
- }
-
- /* Zero out all of the reserved backup group descriptor table blocks */
- ext4_debug("clear inode table blocks %#04llx -> %#04llx\n",
- block, sbi->s_itb_per_group);
- err = sb_issue_zeroout(sb, gdblocks + start + 1, reserved_gdb,
- GFP_NOFS);
- if (err)
- goto exit_bh;
+ ext4_mark_bitmap_end(group_data[i].blocks_count,
+ sb->s_blocksize * 8, bh->b_data);
+ err = ext4_handle_dirty_metadata(handle, NULL, bh);
+ if (err)
+ goto out;
+ brelse(bh);
- ext4_debug("mark block bitmap %#04llx (+%llu)\n", input->block_bitmap,
- input->block_bitmap - start);
- ext4_set_bit(input->block_bitmap - start, bh->b_data);
- ext4_debug("mark inode bitmap %#04llx (+%llu)\n", input->inode_bitmap,
- input->inode_bitmap - start);
- ext4_set_bit(input->inode_bitmap - start, bh->b_data);
-
- /* Zero out all of the inode table blocks */
- block = input->inode_table;
- ext4_debug("clear inode table blocks %#04llx -> %#04llx\n",
- block, sbi->s_itb_per_group);
- err = sb_issue_zeroout(sb, block, sbi->s_itb_per_group, GFP_NOFS);
- if (err)
- goto exit_bh;
+handle_ib:
+ if (bg_flags[i] & EXT4_BG_INODE_UNINIT)
+ continue;
+
+ /* Initialize inode bitmap of the @group */
+ block = group_data[i].inode_bitmap;
+ err = extend_or_restart_transaction(handle, 1);
+ if (err)
+ goto out;
+ /* Mark unused entries in inode bitmap used */
+ bh = bclean(handle, sb, block);
+ if (IS_ERR(bh)) {
+ err = PTR_ERR(bh);
+ goto out;
+ }
- if ((err = extend_or_restart_transaction(handle, 2, bh)))
- goto exit_bh;
+ ext4_mark_bitmap_end(EXT4_INODES_PER_GROUP(sb),
+ sb->s_blocksize * 8, bh->b_data);
+ err = ext4_handle_dirty_metadata(handle, NULL, bh);
+ if (err)
+ goto out;
+ brelse(bh);
+ }
+ bh = NULL;
+
+ /* Mark group tables in block bitmap */
+ for (j = 0; j < GROUP_TABLE_COUNT; j++) {
+ count = group_table_count[j];
+ start = (&group_data[0].block_bitmap)[j];
+ block = start;
+ for (i = 1; i < flex_gd->count; i++) {
+ block += group_table_count[j];
+ if (block == (&group_data[i].block_bitmap)[j]) {
+ count += group_table_count[j];
+ continue;
+ }
+ err = set_flexbg_block_bitmap(sb, handle,
+ flex_gd, start, count);
+ if (err)
+ goto out;
+ count = group_table_count[j];
+ start = (&group_data[i].block_bitmap)[j];
+ block = start;
+ }
- ext4_mark_bitmap_end(input->blocks_count, sb->s_blocksize * 8,
- bh->b_data);
- ext4_handle_dirty_metadata(handle, NULL, bh);
- brelse(bh);
- /* Mark unused entries in inode bitmap used */
- ext4_debug("clear inode bitmap %#04llx (+%llu)\n",
- input->inode_bitmap, input->inode_bitmap - start);
- if (IS_ERR(bh = bclean(handle, sb, input->inode_bitmap))) {
- err = PTR_ERR(bh);
- goto exit_journal;
+ if (count) {
+ err = set_flexbg_block_bitmap(sb, handle,
+ flex_gd, start, count);
+ if (err)
+ goto out;
+ }
}
- ext4_mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8,
- bh->b_data);
- ext4_handle_dirty_metadata(handle, NULL, bh);
-exit_bh:
+out:
brelse(bh);
-
-exit_journal:
- mutex_unlock(&sbi->s_resize_lock);
- if ((err2 = ext4_journal_stop(handle)) && !err)
+ err2 = ext4_journal_stop(handle);
+ if (err2 && !err)
err = err2;
return err;
@@ -319,10 +694,10 @@ static unsigned ext4_list_backups(struct super_block *sb, unsigned *three,
* groups in current filesystem that have BACKUPS, or -ve error code.
*/
static int verify_reserved_gdb(struct super_block *sb,
+ ext4_group_t end,
struct buffer_head *primary)
{
const ext4_fsblk_t blk = primary->b_blocknr;
- const ext4_group_t end = EXT4_SB(sb)->s_groups_count;
unsigned three = 1;
unsigned five = 5;
unsigned seven = 7;
@@ -362,15 +737,15 @@ static int verify_reserved_gdb(struct super_block *sb,
* fail once we start modifying the data on disk, because JBD has no rollback.
*/
static int add_new_gdb(handle_t *handle, struct inode *inode,
- struct ext4_new_group_data *input,
- struct buffer_head **primary)
+ ext4_group_t group)
{
struct super_block *sb = inode->i_sb;
struct ext4_super_block *es = EXT4_SB(sb)->s_es;
- unsigned long gdb_num = input->group / EXT4_DESC_PER_BLOCK(sb);
+ unsigned long gdb_num = group / EXT4_DESC_PER_BLOCK(sb);
ext4_fsblk_t gdblock = EXT4_SB(sb)->s_sbh->b_blocknr + 1 + gdb_num;
struct buffer_head **o_group_desc, **n_group_desc;
struct buffer_head *dind;
+ struct buffer_head *gdb_bh;
int gdbackups;
struct ext4_iloc iloc;
__le32 *data;
@@ -393,11 +768,12 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
return -EPERM;
}
- *primary = sb_bread(sb, gdblock);
- if (!*primary)
+ gdb_bh = sb_bread(sb, gdblock);
+ if (!gdb_bh)
return -EIO;
- if ((gdbackups = verify_reserved_gdb(sb, *primary)) < 0) {
+ gdbackups = verify_reserved_gdb(sb, group, gdb_bh);
+ if (gdbackups < 0) {
err = gdbackups;
goto exit_bh;
}
@@ -412,30 +788,38 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
data = (__le32 *)dind->b_data;
if (le32_to_cpu(data[gdb_num % EXT4_ADDR_PER_BLOCK(sb)]) != gdblock) {
ext4_warning(sb, "new group %u GDT block %llu not reserved",
- input->group, gdblock);
+ group, gdblock);
err = -EINVAL;
goto exit_dind;
}
- if ((err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh)))
+ BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get_write_access");
+ err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh);
+ if (unlikely(err))
goto exit_dind;
- if ((err = ext4_journal_get_write_access(handle, *primary)))
- goto exit_sbh;
+ BUFFER_TRACE(gdb_bh, "get_write_access");
+ err = ext4_journal_get_write_access(handle, gdb_bh);
+ if (unlikely(err))
+ goto exit_dind;
- if ((err = ext4_journal_get_write_access(handle, dind)))
- goto exit_primary;
+ BUFFER_TRACE(dind, "get_write_access");
+ err = ext4_journal_get_write_access(handle, dind);
+ if (unlikely(err))
+ ext4_std_error(sb, err);
/* ext4_reserve_inode_write() gets a reference on the iloc */
- if ((err = ext4_reserve_inode_write(handle, inode, &iloc)))
- goto exit_dindj;
+ err = ext4_reserve_inode_write(handle, inode, &iloc);
+ if (unlikely(err))
+ goto exit_dind;
- n_group_desc = kmalloc((gdb_num + 1) * sizeof(struct buffer_head *),
- GFP_NOFS);
+ n_group_desc = ext4_kvmalloc((gdb_num + 1) *
+ sizeof(struct buffer_head *),
+ GFP_NOFS);
if (!n_group_desc) {
err = -ENOMEM;
- ext4_warning(sb,
- "not enough memory for %lu groups", gdb_num + 1);
+ ext4_warning(sb, "not enough memory for %lu groups",
+ gdb_num + 1);
goto exit_inode;
}
@@ -449,45 +833,89 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
* reserved inode, and will become GDT blocks (primary and backup).
*/
data[gdb_num % EXT4_ADDR_PER_BLOCK(sb)] = 0;
- ext4_handle_dirty_metadata(handle, NULL, dind);
- brelse(dind);
+ err = ext4_handle_dirty_metadata(handle, NULL, dind);
+ if (unlikely(err)) {
+ ext4_std_error(sb, err);
+ goto exit_inode;
+ }
inode->i_blocks -= (gdbackups + 1) * sb->s_blocksize >> 9;
ext4_mark_iloc_dirty(handle, inode, &iloc);
- memset((*primary)->b_data, 0, sb->s_blocksize);
- ext4_handle_dirty_metadata(handle, NULL, *primary);
+ memset(gdb_bh->b_data, 0, sb->s_blocksize);
+ err = ext4_handle_dirty_metadata(handle, NULL, gdb_bh);
+ if (unlikely(err)) {
+ ext4_std_error(sb, err);
+ goto exit_inode;
+ }
+ brelse(dind);
o_group_desc = EXT4_SB(sb)->s_group_desc;
memcpy(n_group_desc, o_group_desc,
EXT4_SB(sb)->s_gdb_count * sizeof(struct buffer_head *));
- n_group_desc[gdb_num] = *primary;
+ n_group_desc[gdb_num] = gdb_bh;
EXT4_SB(sb)->s_group_desc = n_group_desc;
EXT4_SB(sb)->s_gdb_count++;
- kfree(o_group_desc);
+ ext4_kvfree(o_group_desc);
le16_add_cpu(&es->s_reserved_gdt_blocks, -1);
- ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh);
+ err = ext4_handle_dirty_super(handle, sb);
+ if (err)
+ ext4_std_error(sb, err);
- return 0;
+ return err;
exit_inode:
- /* ext4_journal_release_buffer(handle, iloc.bh); */
+ ext4_kvfree(n_group_desc);
brelse(iloc.bh);
-exit_dindj:
- /* ext4_journal_release_buffer(handle, dind); */
-exit_primary:
- /* ext4_journal_release_buffer(handle, *primary); */
-exit_sbh:
- /* ext4_journal_release_buffer(handle, *primary); */
exit_dind:
brelse(dind);
exit_bh:
- brelse(*primary);
+ brelse(gdb_bh);
ext4_debug("leaving with error %d\n", err);
return err;
}
/*
+ * add_new_gdb_meta_bg is the sister of add_new_gdb.
+ */
+static int add_new_gdb_meta_bg(struct super_block *sb,
+ handle_t *handle, ext4_group_t group) {
+ ext4_fsblk_t gdblock;
+ struct buffer_head *gdb_bh;
+ struct buffer_head **o_group_desc, **n_group_desc;
+ unsigned long gdb_num = group / EXT4_DESC_PER_BLOCK(sb);
+ int err;
+
+ gdblock = ext4_meta_bg_first_block_no(sb, group) +
+ ext4_bg_has_super(sb, group);
+ gdb_bh = sb_bread(sb, gdblock);
+ if (!gdb_bh)
+ return -EIO;
+ n_group_desc = ext4_kvmalloc((gdb_num + 1) *
+ sizeof(struct buffer_head *),
+ GFP_NOFS);
+ if (!n_group_desc) {
+ err = -ENOMEM;
+ ext4_warning(sb, "not enough memory for %lu groups",
+ gdb_num + 1);
+ return err;
+ }
+
+ o_group_desc = EXT4_SB(sb)->s_group_desc;
+ memcpy(n_group_desc, o_group_desc,
+ EXT4_SB(sb)->s_gdb_count * sizeof(struct buffer_head *));
+ n_group_desc[gdb_num] = gdb_bh;
+ EXT4_SB(sb)->s_group_desc = n_group_desc;
+ EXT4_SB(sb)->s_gdb_count++;
+ ext4_kvfree(o_group_desc);
+ BUFFER_TRACE(gdb_bh, "get_write_access");
+ err = ext4_journal_get_write_access(handle, gdb_bh);
+ if (unlikely(err))
+ brelse(gdb_bh);
+ return err;
+}
+
+/*
* Called when we are adding a new group which has a backup copy of each of
* the GDT blocks (i.e. sparse group) and there are reserved GDT blocks.
* We need to add these reserved backup GDT blocks to the resize inode, so
@@ -501,7 +929,7 @@ exit_bh:
* backup GDT blocks are stored in their reserved primary GDT block.
*/
static int reserve_backup_gdb(handle_t *handle, struct inode *inode,
- struct ext4_new_group_data *input)
+ ext4_group_t group)
{
struct super_block *sb = inode->i_sb;
int reserved_gdb =le16_to_cpu(EXT4_SB(sb)->s_es->s_reserved_gdt_blocks);
@@ -545,7 +973,8 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode,
err = -EIO;
goto exit_bh;
}
- if ((gdbackups = verify_reserved_gdb(sb, primary[res])) < 0) {
+ gdbackups = verify_reserved_gdb(sb, group, primary[res]);
+ if (gdbackups < 0) {
brelse(primary[res]);
err = gdbackups;
goto exit_bh;
@@ -555,14 +984,9 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode,
}
for (i = 0; i < reserved_gdb; i++) {
- if ((err = ext4_journal_get_write_access(handle, primary[i]))) {
- /*
- int j;
- for (j = 0; j < i; j++)
- ext4_journal_release_buffer(handle, primary[j]);
- */
+ BUFFER_TRACE(primary[i], "get_write_access");
+ if ((err = ext4_journal_get_write_access(handle, primary[i])))
goto exit_bh;
- }
}
if ((err = ext4_reserve_inode_write(handle, inode, &iloc)))
@@ -572,7 +996,7 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode,
* Finally we can add each of the reserved backup GDT blocks from
* the new group to its reserved primary GDT block.
*/
- blk = input->group * EXT4_BLOCKS_PER_GROUP(sb);
+ blk = group * EXT4_BLOCKS_PER_GROUP(sb);
for (i = 0; i < reserved_gdb; i++) {
int err2;
data = (__le32 *)primary[i]->b_data;
@@ -614,29 +1038,38 @@ exit_free:
* do not copy the full number of backups at this time. The resize
* which changed s_groups_count will backup again.
*/
-static void update_backups(struct super_block *sb,
- int blk_off, char *data, int size)
+static void update_backups(struct super_block *sb, int blk_off, char *data,
+ int size, int meta_bg)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
- const ext4_group_t last = sbi->s_groups_count;
+ ext4_group_t last;
const int bpg = EXT4_BLOCKS_PER_GROUP(sb);
unsigned three = 1;
unsigned five = 5;
unsigned seven = 7;
- ext4_group_t group;
+ ext4_group_t group = 0;
int rest = sb->s_blocksize - size;
handle_t *handle;
int err = 0, err2;
- handle = ext4_journal_start_sb(sb, EXT4_MAX_TRANS_DATA);
+ handle = ext4_journal_start_sb(sb, EXT4_HT_RESIZE, EXT4_MAX_TRANS_DATA);
if (IS_ERR(handle)) {
group = 1;
err = PTR_ERR(handle);
goto exit_err;
}
- while ((group = ext4_list_backups(sb, &three, &five, &seven)) < last) {
+ if (meta_bg == 0) {
+ group = ext4_list_backups(sb, &three, &five, &seven);
+ last = sbi->s_groups_count;
+ } else {
+ group = ext4_meta_bg_first_group(sb, group) + 1;
+ last = (ext4_group_t)(group + EXT4_DESC_PER_BLOCK(sb) - 2);
+ }
+
+ while (group < sbi->s_groups_count) {
struct buffer_head *bh;
+ ext4_fsblk_t backup_block;
/* Out of journal space, and can't get more - abort - so sad */
if (ext4_handle_valid(handle) &&
@@ -645,13 +1078,21 @@ static void update_backups(struct super_block *sb,
(err = ext4_journal_restart(handle, EXT4_MAX_TRANS_DATA)))
break;
- bh = sb_getblk(sb, group * bpg + blk_off);
- if (!bh) {
- err = -EIO;
+ if (meta_bg == 0)
+ backup_block = group * bpg + blk_off;
+ else
+ backup_block = (ext4_group_first_block_no(sb, group) +
+ ext4_bg_has_super(sb, group));
+
+ bh = sb_getblk(sb, backup_block);
+ if (unlikely(!bh)) {
+ err = -ENOMEM;
break;
}
- ext4_debug("update metadata backup %#04lx\n",
- (unsigned long)bh->b_blocknr);
+ ext4_debug("update metadata backup %llu(+%llu)\n",
+ backup_block, backup_block -
+ ext4_group_first_block_no(sb, group));
+ BUFFER_TRACE(bh, "get_write_access");
if ((err = ext4_journal_get_write_access(handle, bh)))
break;
lock_buffer(bh);
@@ -660,8 +1101,17 @@ static void update_backups(struct super_block *sb,
memset(bh->b_data + size, 0, rest);
set_buffer_uptodate(bh);
unlock_buffer(bh);
- ext4_handle_dirty_metadata(handle, NULL, bh);
+ err = ext4_handle_dirty_metadata(handle, NULL, bh);
+ if (unlikely(err))
+ ext4_std_error(sb, err);
brelse(bh);
+
+ if (meta_bg == 0)
+ group = ext4_list_backups(sb, &three, &five, &seven);
+ else if (group == last)
+ break;
+ else
+ group = last;
}
if ((err2 = ext4_journal_stop(handle)) && !err)
err = err2;
@@ -686,6 +1136,424 @@ exit_err:
}
}
+/*
+ * ext4_add_new_descs() adds @count group descriptor of groups
+ * starting at @group
+ *
+ * @handle: journal handle
+ * @sb: super block
+ * @group: the group no. of the first group desc to be added
+ * @resize_inode: the resize inode
+ * @count: number of group descriptors to be added
+ */
+static int ext4_add_new_descs(handle_t *handle, struct super_block *sb,
+ ext4_group_t group, struct inode *resize_inode,
+ ext4_group_t count)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_super_block *es = sbi->s_es;
+ struct buffer_head *gdb_bh;
+ int i, gdb_off, gdb_num, err = 0;
+ int meta_bg;
+
+ meta_bg = EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_META_BG);
+ for (i = 0; i < count; i++, group++) {
+ int reserved_gdb = ext4_bg_has_super(sb, group) ?
+ le16_to_cpu(es->s_reserved_gdt_blocks) : 0;
+
+ gdb_off = group % EXT4_DESC_PER_BLOCK(sb);
+ gdb_num = group / EXT4_DESC_PER_BLOCK(sb);
+
+ /*
+ * We will only either add reserved group blocks to a backup group
+ * or remove reserved blocks for the first group in a new group block.
+ * Doing both would be mean more complex code, and sane people don't
+ * use non-sparse filesystems anymore. This is already checked above.
+ */
+ if (gdb_off) {
+ gdb_bh = sbi->s_group_desc[gdb_num];
+ BUFFER_TRACE(gdb_bh, "get_write_access");
+ err = ext4_journal_get_write_access(handle, gdb_bh);
+
+ if (!err && reserved_gdb && ext4_bg_num_gdb(sb, group))
+ err = reserve_backup_gdb(handle, resize_inode, group);
+ } else if (meta_bg != 0) {
+ err = add_new_gdb_meta_bg(sb, handle, group);
+ } else {
+ err = add_new_gdb(handle, resize_inode, group);
+ }
+ if (err)
+ break;
+ }
+ return err;
+}
+
+static struct buffer_head *ext4_get_bitmap(struct super_block *sb, __u64 block)
+{
+ struct buffer_head *bh = sb_getblk(sb, block);
+ if (unlikely(!bh))
+ return NULL;
+ if (!bh_uptodate_or_lock(bh)) {
+ if (bh_submit_read(bh) < 0) {
+ brelse(bh);
+ return NULL;
+ }
+ }
+
+ return bh;
+}
+
+static int ext4_set_bitmap_checksums(struct super_block *sb,
+ ext4_group_t group,
+ struct ext4_group_desc *gdp,
+ struct ext4_new_group_data *group_data)
+{
+ struct buffer_head *bh;
+
+ if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
+ EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+ return 0;
+
+ bh = ext4_get_bitmap(sb, group_data->inode_bitmap);
+ if (!bh)
+ return -EIO;
+ ext4_inode_bitmap_csum_set(sb, group, gdp, bh,
+ EXT4_INODES_PER_GROUP(sb) / 8);
+ brelse(bh);
+
+ bh = ext4_get_bitmap(sb, group_data->block_bitmap);
+ if (!bh)
+ return -EIO;
+ ext4_block_bitmap_csum_set(sb, group, gdp, bh);
+ brelse(bh);
+
+ return 0;
+}
+
+/*
+ * ext4_setup_new_descs() will set up the group descriptor descriptors of a flex bg
+ */
+static int ext4_setup_new_descs(handle_t *handle, struct super_block *sb,
+ struct ext4_new_flex_group_data *flex_gd)
+{
+ struct ext4_new_group_data *group_data = flex_gd->groups;
+ struct ext4_group_desc *gdp;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct buffer_head *gdb_bh;
+ ext4_group_t group;
+ __u16 *bg_flags = flex_gd->bg_flags;
+ int i, gdb_off, gdb_num, err = 0;
+
+
+ for (i = 0; i < flex_gd->count; i++, group_data++, bg_flags++) {
+ group = group_data->group;
+
+ gdb_off = group % EXT4_DESC_PER_BLOCK(sb);
+ gdb_num = group / EXT4_DESC_PER_BLOCK(sb);
+
+ /*
+ * get_write_access() has been called on gdb_bh by ext4_add_new_desc().
+ */
+ gdb_bh = sbi->s_group_desc[gdb_num];
+ /* Update group descriptor block for new group */
+ gdp = (struct ext4_group_desc *)(gdb_bh->b_data +
+ gdb_off * EXT4_DESC_SIZE(sb));
+
+ memset(gdp, 0, EXT4_DESC_SIZE(sb));
+ ext4_block_bitmap_set(sb, gdp, group_data->block_bitmap);
+ ext4_inode_bitmap_set(sb, gdp, group_data->inode_bitmap);
+ err = ext4_set_bitmap_checksums(sb, group, gdp, group_data);
+ if (err) {
+ ext4_std_error(sb, err);
+ break;
+ }
+
+ ext4_inode_table_set(sb, gdp, group_data->inode_table);
+ ext4_free_group_clusters_set(sb, gdp,
+ EXT4_NUM_B2C(sbi, group_data->free_blocks_count));
+ ext4_free_inodes_set(sb, gdp, EXT4_INODES_PER_GROUP(sb));
+ if (ext4_has_group_desc_csum(sb))
+ ext4_itable_unused_set(sb, gdp,
+ EXT4_INODES_PER_GROUP(sb));
+ gdp->bg_flags = cpu_to_le16(*bg_flags);
+ ext4_group_desc_csum_set(sb, group, gdp);
+
+ err = ext4_handle_dirty_metadata(handle, NULL, gdb_bh);
+ if (unlikely(err)) {
+ ext4_std_error(sb, err);
+ break;
+ }
+
+ /*
+ * We can allocate memory for mb_alloc based on the new group
+ * descriptor
+ */
+ err = ext4_mb_add_groupinfo(sb, group, gdp);
+ if (err)
+ break;
+ }
+ return err;
+}
+
+/*
+ * ext4_update_super() updates the super block so that the newly added
+ * groups can be seen by the filesystem.
+ *
+ * @sb: super block
+ * @flex_gd: new added groups
+ */
+static void ext4_update_super(struct super_block *sb,
+ struct ext4_new_flex_group_data *flex_gd)
+{
+ ext4_fsblk_t blocks_count = 0;
+ ext4_fsblk_t free_blocks = 0;
+ ext4_fsblk_t reserved_blocks = 0;
+ struct ext4_new_group_data *group_data = flex_gd->groups;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_super_block *es = sbi->s_es;
+ int i;
+
+ BUG_ON(flex_gd->count == 0 || group_data == NULL);
+ /*
+ * Make the new blocks and inodes valid next. We do this before
+ * increasing the group count so that once the group is enabled,
+ * all of its blocks and inodes are already valid.
+ *
+ * We always allocate group-by-group, then block-by-block or
+ * inode-by-inode within a group, so enabling these
+ * blocks/inodes before the group is live won't actually let us
+ * allocate the new space yet.
+ */
+ for (i = 0; i < flex_gd->count; i++) {
+ blocks_count += group_data[i].blocks_count;
+ free_blocks += group_data[i].free_blocks_count;
+ }
+
+ reserved_blocks = ext4_r_blocks_count(es) * 100;
+ reserved_blocks = div64_u64(reserved_blocks, ext4_blocks_count(es));
+ reserved_blocks *= blocks_count;
+ do_div(reserved_blocks, 100);
+
+ ext4_blocks_count_set(es, ext4_blocks_count(es) + blocks_count);
+ ext4_free_blocks_count_set(es, ext4_free_blocks_count(es) + free_blocks);
+ le32_add_cpu(&es->s_inodes_count, EXT4_INODES_PER_GROUP(sb) *
+ flex_gd->count);
+ le32_add_cpu(&es->s_free_inodes_count, EXT4_INODES_PER_GROUP(sb) *
+ flex_gd->count);
+
+ ext4_debug("free blocks count %llu", ext4_free_blocks_count(es));
+ /*
+ * We need to protect s_groups_count against other CPUs seeing
+ * inconsistent state in the superblock.
+ *
+ * The precise rules we use are:
+ *
+ * * Writers must perform a smp_wmb() after updating all
+ * dependent data and before modifying the groups count
+ *
+ * * Readers must perform an smp_rmb() after reading the groups
+ * count and before reading any dependent data.
+ *
+ * NB. These rules can be relaxed when checking the group count
+ * while freeing data, as we can only allocate from a block
+ * group after serialising against the group count, and we can
+ * only then free after serialising in turn against that
+ * allocation.
+ */
+ smp_wmb();
+
+ /* Update the global fs size fields */
+ sbi->s_groups_count += flex_gd->count;
+ sbi->s_blockfile_groups = min_t(ext4_group_t, sbi->s_groups_count,
+ (EXT4_MAX_BLOCK_FILE_PHYS / EXT4_BLOCKS_PER_GROUP(sb)));
+
+ /* Update the reserved block counts only once the new group is
+ * active. */
+ ext4_r_blocks_count_set(es, ext4_r_blocks_count(es) +
+ reserved_blocks);
+
+ /* Update the free space counts */
+ percpu_counter_add(&sbi->s_freeclusters_counter,
+ EXT4_NUM_B2C(sbi, free_blocks));
+ percpu_counter_add(&sbi->s_freeinodes_counter,
+ EXT4_INODES_PER_GROUP(sb) * flex_gd->count);
+
+ ext4_debug("free blocks count %llu",
+ percpu_counter_read(&sbi->s_freeclusters_counter));
+ if (EXT4_HAS_INCOMPAT_FEATURE(sb,
+ EXT4_FEATURE_INCOMPAT_FLEX_BG) &&
+ sbi->s_log_groups_per_flex) {
+ ext4_group_t flex_group;
+ flex_group = ext4_flex_group(sbi, group_data[0].group);
+ atomic64_add(EXT4_NUM_B2C(sbi, free_blocks),
+ &sbi->s_flex_groups[flex_group].free_clusters);
+ atomic_add(EXT4_INODES_PER_GROUP(sb) * flex_gd->count,
+ &sbi->s_flex_groups[flex_group].free_inodes);
+ }
+
+ /*
+ * Update the fs overhead information
+ */
+ ext4_calculate_overhead(sb);
+
+ if (test_opt(sb, DEBUG))
+ printk(KERN_DEBUG "EXT4-fs: added group %u:"
+ "%llu blocks(%llu free %llu reserved)\n", flex_gd->count,
+ blocks_count, free_blocks, reserved_blocks);
+}
+
+/* Add a flex group to an fs. Ensure we handle all possible error conditions
+ * _before_ we start modifying the filesystem, because we cannot abort the
+ * transaction and not have it write the data to disk.
+ */
+static int ext4_flex_group_add(struct super_block *sb,
+ struct inode *resize_inode,
+ struct ext4_new_flex_group_data *flex_gd)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_super_block *es = sbi->s_es;
+ ext4_fsblk_t o_blocks_count;
+ ext4_grpblk_t last;
+ ext4_group_t group;
+ handle_t *handle;
+ unsigned reserved_gdb;
+ int err = 0, err2 = 0, credit;
+
+ BUG_ON(!flex_gd->count || !flex_gd->groups || !flex_gd->bg_flags);
+
+ reserved_gdb = le16_to_cpu(es->s_reserved_gdt_blocks);
+ o_blocks_count = ext4_blocks_count(es);
+ ext4_get_group_no_and_offset(sb, o_blocks_count, &group, &last);
+ BUG_ON(last);
+
+ err = setup_new_flex_group_blocks(sb, flex_gd);
+ if (err)
+ goto exit;
+ /*
+ * We will always be modifying at least the superblock and GDT
+ * block. If we are adding a group past the last current GDT block,
+ * we will also modify the inode and the dindirect block. If we
+ * are adding a group with superblock/GDT backups we will also
+ * modify each of the reserved GDT dindirect blocks.
+ */
+ credit = flex_gd->count * 4 + reserved_gdb;
+ handle = ext4_journal_start_sb(sb, EXT4_HT_RESIZE, credit);
+ if (IS_ERR(handle)) {
+ err = PTR_ERR(handle);
+ goto exit;
+ }
+
+ BUFFER_TRACE(sbi->s_sbh, "get_write_access");
+ err = ext4_journal_get_write_access(handle, sbi->s_sbh);
+ if (err)
+ goto exit_journal;
+
+ group = flex_gd->groups[0].group;
+ BUG_ON(group != EXT4_SB(sb)->s_groups_count);
+ err = ext4_add_new_descs(handle, sb, group,
+ resize_inode, flex_gd->count);
+ if (err)
+ goto exit_journal;
+
+ err = ext4_setup_new_descs(handle, sb, flex_gd);
+ if (err)
+ goto exit_journal;
+
+ ext4_update_super(sb, flex_gd);
+
+ err = ext4_handle_dirty_super(handle, sb);
+
+exit_journal:
+ err2 = ext4_journal_stop(handle);
+ if (!err)
+ err = err2;
+
+ if (!err) {
+ int gdb_num = group / EXT4_DESC_PER_BLOCK(sb);
+ int gdb_num_end = ((group + flex_gd->count - 1) /
+ EXT4_DESC_PER_BLOCK(sb));
+ int meta_bg = EXT4_HAS_INCOMPAT_FEATURE(sb,
+ EXT4_FEATURE_INCOMPAT_META_BG);
+ sector_t old_gdb = 0;
+
+ update_backups(sb, sbi->s_sbh->b_blocknr, (char *)es,
+ sizeof(struct ext4_super_block), 0);
+ for (; gdb_num <= gdb_num_end; gdb_num++) {
+ struct buffer_head *gdb_bh;
+
+ gdb_bh = sbi->s_group_desc[gdb_num];
+ if (old_gdb == gdb_bh->b_blocknr)
+ continue;
+ update_backups(sb, gdb_bh->b_blocknr, gdb_bh->b_data,
+ gdb_bh->b_size, meta_bg);
+ old_gdb = gdb_bh->b_blocknr;
+ }
+ }
+exit:
+ return err;
+}
+
+static int ext4_setup_next_flex_gd(struct super_block *sb,
+ struct ext4_new_flex_group_data *flex_gd,
+ ext4_fsblk_t n_blocks_count,
+ unsigned long flexbg_size)
+{
+ struct ext4_super_block *es = EXT4_SB(sb)->s_es;
+ struct ext4_new_group_data *group_data = flex_gd->groups;
+ ext4_fsblk_t o_blocks_count;
+ ext4_group_t n_group;
+ ext4_group_t group;
+ ext4_group_t last_group;
+ ext4_grpblk_t last;
+ ext4_grpblk_t blocks_per_group;
+ unsigned long i;
+
+ blocks_per_group = EXT4_BLOCKS_PER_GROUP(sb);
+
+ o_blocks_count = ext4_blocks_count(es);
+
+ if (o_blocks_count == n_blocks_count)
+ return 0;
+
+ ext4_get_group_no_and_offset(sb, o_blocks_count, &group, &last);
+ BUG_ON(last);
+ ext4_get_group_no_and_offset(sb, n_blocks_count - 1, &n_group, &last);
+
+ last_group = group | (flexbg_size - 1);
+ if (last_group > n_group)
+ last_group = n_group;
+
+ flex_gd->count = last_group - group + 1;
+
+ for (i = 0; i < flex_gd->count; i++) {
+ int overhead;
+
+ group_data[i].group = group + i;
+ group_data[i].blocks_count = blocks_per_group;
+ overhead = ext4_group_overhead_blocks(sb, group + i);
+ group_data[i].free_blocks_count = blocks_per_group - overhead;
+ if (ext4_has_group_desc_csum(sb)) {
+ flex_gd->bg_flags[i] = EXT4_BG_BLOCK_UNINIT |
+ EXT4_BG_INODE_UNINIT;
+ if (!test_opt(sb, INIT_INODE_TABLE))
+ flex_gd->bg_flags[i] |= EXT4_BG_INODE_ZEROED;
+ } else
+ flex_gd->bg_flags[i] = EXT4_BG_INODE_ZEROED;
+ }
+
+ if (last_group == n_group && ext4_has_group_desc_csum(sb))
+ /* We need to initialize block bitmap of last group. */
+ flex_gd->bg_flags[i - 1] &= ~EXT4_BG_BLOCK_UNINIT;
+
+ if ((last_group == n_group) && (last != blocks_per_group - 1)) {
+ group_data[i - 1].blocks_count = last + 1;
+ group_data[i - 1].free_blocks_count -= blocks_per_group-
+ last - 1;
+ }
+
+ return 1;
+}
+
/* Add group descriptor data to an existing or new group descriptor block.
* Ensure we handle all possible error conditions _before_ we start modifying
* the filesystem, because we cannot abort the transaction and not have it
@@ -701,18 +1569,16 @@ exit_err:
*/
int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
{
+ struct ext4_new_flex_group_data flex_gd;
struct ext4_sb_info *sbi = EXT4_SB(sb);
struct ext4_super_block *es = sbi->s_es;
int reserved_gdb = ext4_bg_has_super(sb, input->group) ?
le16_to_cpu(es->s_reserved_gdt_blocks) : 0;
- struct buffer_head *primary = NULL;
- struct ext4_group_desc *gdp;
struct inode *inode = NULL;
- handle_t *handle;
- int gdb_off, gdb_num;
- int err, err2;
+ int gdb_off;
+ int err;
+ __u16 bg_flags = 0;
- gdb_num = input->group / EXT4_DESC_PER_BLOCK(sb);
gdb_off = input->group % EXT4_DESC_PER_BLOCK(sb);
if (gdb_off == 0 && !EXT4_HAS_RO_COMPAT_FEATURE(sb,
@@ -749,174 +1615,79 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
}
- if ((err = verify_group_input(sb, input)))
- goto exit_put;
+ err = verify_group_input(sb, input);
+ if (err)
+ goto out;
+
+ err = ext4_alloc_flex_bg_array(sb, input->group + 1);
+ if (err)
+ goto out;
- if ((err = setup_new_group_blocks(sb, input)))
- goto exit_put;
+ err = ext4_mb_alloc_groupinfo(sb, input->group + 1);
+ if (err)
+ goto out;
- /*
- * We will always be modifying at least the superblock and a GDT
- * block. If we are adding a group past the last current GDT block,
- * we will also modify the inode and the dindirect block. If we
- * are adding a group with superblock/GDT backups we will also
- * modify each of the reserved GDT dindirect blocks.
+ flex_gd.count = 1;
+ flex_gd.groups = input;
+ flex_gd.bg_flags = &bg_flags;
+ err = ext4_flex_group_add(sb, inode, &flex_gd);
+out:
+ iput(inode);
+ return err;
+} /* ext4_group_add */
+
+/*
+ * extend a group without checking assuming that checking has been done.
+ */
+static int ext4_group_extend_no_check(struct super_block *sb,
+ ext4_fsblk_t o_blocks_count, ext4_grpblk_t add)
+{
+ struct ext4_super_block *es = EXT4_SB(sb)->s_es;
+ handle_t *handle;
+ int err = 0, err2;
+
+ /* We will update the superblock, one block bitmap, and
+ * one group descriptor via ext4_group_add_blocks().
*/
- handle = ext4_journal_start_sb(sb,
- ext4_bg_has_super(sb, input->group) ?
- 3 + reserved_gdb : 4);
+ handle = ext4_journal_start_sb(sb, EXT4_HT_RESIZE, 3);
if (IS_ERR(handle)) {
err = PTR_ERR(handle);
- goto exit_put;
+ ext4_warning(sb, "error %d on journal start", err);
+ return err;
}
- mutex_lock(&sbi->s_resize_lock);
- if (input->group != sbi->s_groups_count) {
- ext4_warning(sb, "multiple resizers run on filesystem!");
- err = -EBUSY;
- goto exit_journal;
+ BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get_write_access");
+ err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh);
+ if (err) {
+ ext4_warning(sb, "error %d on journal write access", err);
+ goto errout;
}
- if ((err = ext4_journal_get_write_access(handle, sbi->s_sbh)))
- goto exit_journal;
-
- /*
- * We will only either add reserved group blocks to a backup group
- * or remove reserved blocks for the first group in a new group block.
- * Doing both would be mean more complex code, and sane people don't
- * use non-sparse filesystems anymore. This is already checked above.
- */
- if (gdb_off) {
- primary = sbi->s_group_desc[gdb_num];
- if ((err = ext4_journal_get_write_access(handle, primary)))
- goto exit_journal;
-
- if (reserved_gdb && ext4_bg_num_gdb(sb, input->group) &&
- (err = reserve_backup_gdb(handle, inode, input)))
- goto exit_journal;
- } else if ((err = add_new_gdb(handle, inode, input, &primary)))
- goto exit_journal;
-
- /*
- * OK, now we've set up the new group. Time to make it active.
- *
- * We do not lock all allocations via s_resize_lock
- * so we have to be safe wrt. concurrent accesses the group
- * data. So we need to be careful to set all of the relevant
- * group descriptor data etc. *before* we enable the group.
- *
- * The key field here is sbi->s_groups_count: as long as
- * that retains its old value, nobody is going to access the new
- * group.
- *
- * So first we update all the descriptor metadata for the new
- * group; then we update the total disk blocks count; then we
- * update the groups count to enable the group; then finally we
- * update the free space counts so that the system can start
- * using the new disk blocks.
- */
-
- /* Update group descriptor block for new group */
- gdp = (struct ext4_group_desc *)((char *)primary->b_data +
- gdb_off * EXT4_DESC_SIZE(sb));
-
- memset(gdp, 0, EXT4_DESC_SIZE(sb));
- ext4_block_bitmap_set(sb, gdp, input->block_bitmap); /* LV FIXME */
- ext4_inode_bitmap_set(sb, gdp, input->inode_bitmap); /* LV FIXME */
- ext4_inode_table_set(sb, gdp, input->inode_table); /* LV FIXME */
- ext4_free_blks_set(sb, gdp, input->free_blocks_count);
- ext4_free_inodes_set(sb, gdp, EXT4_INODES_PER_GROUP(sb));
- gdp->bg_flags = cpu_to_le16(EXT4_BG_INODE_ZEROED);
- gdp->bg_checksum = ext4_group_desc_csum(sbi, input->group, gdp);
-
- /*
- * We can allocate memory for mb_alloc based on the new group
- * descriptor
- */
- err = ext4_mb_add_groupinfo(sb, input->group, gdp);
+ ext4_blocks_count_set(es, o_blocks_count + add);
+ ext4_free_blocks_count_set(es, ext4_free_blocks_count(es) + add);
+ ext4_debug("freeing blocks %llu through %llu\n", o_blocks_count,
+ o_blocks_count + add);
+ /* We add the blocks to the bitmap and set the group need init bit */
+ err = ext4_group_add_blocks(handle, sb, o_blocks_count, add);
if (err)
- goto exit_journal;
-
- /*
- * Make the new blocks and inodes valid next. We do this before
- * increasing the group count so that once the group is enabled,
- * all of its blocks and inodes are already valid.
- *
- * We always allocate group-by-group, then block-by-block or
- * inode-by-inode within a group, so enabling these
- * blocks/inodes before the group is live won't actually let us
- * allocate the new space yet.
- */
- ext4_blocks_count_set(es, ext4_blocks_count(es) +
- input->blocks_count);
- le32_add_cpu(&es->s_inodes_count, EXT4_INODES_PER_GROUP(sb));
-
- /*
- * We need to protect s_groups_count against other CPUs seeing
- * inconsistent state in the superblock.
- *
- * The precise rules we use are:
- *
- * * Writers of s_groups_count *must* hold s_resize_lock
- * AND
- * * Writers must perform a smp_wmb() after updating all dependent
- * data and before modifying the groups count
- *
- * * Readers must hold s_resize_lock over the access
- * OR
- * * Readers must perform an smp_rmb() after reading the groups count
- * and before reading any dependent data.
- *
- * NB. These rules can be relaxed when checking the group count
- * while freeing data, as we can only allocate from a block
- * group after serialising against the group count, and we can
- * only then free after serialising in turn against that
- * allocation.
- */
- smp_wmb();
-
- /* Update the global fs size fields */
- sbi->s_groups_count++;
-
- ext4_handle_dirty_metadata(handle, NULL, primary);
-
- /* Update the reserved block counts only once the new group is
- * active. */
- ext4_r_blocks_count_set(es, ext4_r_blocks_count(es) +
- input->reserved_blocks);
-
- /* Update the free space counts */
- percpu_counter_add(&sbi->s_freeblocks_counter,
- input->free_blocks_count);
- percpu_counter_add(&sbi->s_freeinodes_counter,
- EXT4_INODES_PER_GROUP(sb));
-
- if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG) &&
- sbi->s_log_groups_per_flex) {
- ext4_group_t flex_group;
- flex_group = ext4_flex_group(sbi, input->group);
- atomic_add(input->free_blocks_count,
- &sbi->s_flex_groups[flex_group].free_blocks);
- atomic_add(EXT4_INODES_PER_GROUP(sb),
- &sbi->s_flex_groups[flex_group].free_inodes);
- }
-
+ goto errout;
ext4_handle_dirty_super(handle, sb);
-
-exit_journal:
- mutex_unlock(&sbi->s_resize_lock);
- if ((err2 = ext4_journal_stop(handle)) && !err)
+ ext4_debug("freed blocks %llu through %llu\n", o_blocks_count,
+ o_blocks_count + add);
+errout:
+ err2 = ext4_journal_stop(handle);
+ if (err2 && !err)
err = err2;
+
if (!err) {
- update_backups(sb, sbi->s_sbh->b_blocknr, (char *)es,
- sizeof(struct ext4_super_block));
- update_backups(sb, primary->b_blocknr, primary->b_data,
- primary->b_size);
+ if (test_opt(sb, DEBUG))
+ printk(KERN_DEBUG "EXT4-fs: extended group to %llu "
+ "blocks\n", ext4_blocks_count(es));
+ update_backups(sb, EXT4_SB(sb)->s_sbh->b_blocknr,
+ (char *)es, sizeof(struct ext4_super_block), 0);
}
-exit_put:
- iput(inode);
return err;
-} /* ext4_group_add */
+}
/*
* Extend the filesystem to the new number of blocks specified. This entry
@@ -935,26 +1706,23 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
ext4_grpblk_t last;
ext4_grpblk_t add;
struct buffer_head *bh;
- handle_t *handle;
int err;
ext4_group_t group;
- /* We don't need to worry about locking wrt other resizers just
- * yet: we're going to revalidate es->s_blocks_count after
- * taking the s_resize_lock below. */
o_blocks_count = ext4_blocks_count(es);
if (test_opt(sb, DEBUG))
- printk(KERN_DEBUG "EXT4-fs: extending last group from %llu uto %llu blocks\n",
- o_blocks_count, n_blocks_count);
+ ext4_msg(sb, KERN_DEBUG,
+ "extending last group from %llu to %llu blocks",
+ o_blocks_count, n_blocks_count);
if (n_blocks_count == 0 || n_blocks_count == o_blocks_count)
return 0;
if (n_blocks_count > (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) {
- printk(KERN_ERR "EXT4-fs: filesystem on %s:"
- " too large to resize to %llu blocks safely\n",
- sb->s_id, n_blocks_count);
+ ext4_msg(sb, KERN_ERR,
+ "filesystem too large to resize to %llu blocks safely",
+ n_blocks_count);
if (sizeof(sector_t) < 8)
ext4_warning(sb, "CONFIG_LBDAF not enabled");
return -EINVAL;
@@ -962,7 +1730,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
if (n_blocks_count < o_blocks_count) {
ext4_warning(sb, "can't shrink FS - resize aborted");
- return -EBUSY;
+ return -EINVAL;
}
/* Handle the remaining blocks in the last group only. */
@@ -995,49 +1763,258 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
}
brelse(bh);
- /* We will update the superblock, one block bitmap, and
- * one group descriptor via ext4_free_blocks().
- */
- handle = ext4_journal_start_sb(sb, 3);
- if (IS_ERR(handle)) {
- err = PTR_ERR(handle);
- ext4_warning(sb, "error %d on journal start", err);
- goto exit_put;
+ err = ext4_group_extend_no_check(sb, o_blocks_count, add);
+ return err;
+} /* ext4_group_extend */
+
+
+static int num_desc_blocks(struct super_block *sb, ext4_group_t groups)
+{
+ return (groups + EXT4_DESC_PER_BLOCK(sb) - 1) / EXT4_DESC_PER_BLOCK(sb);
+}
+
+/*
+ * Release the resize inode and drop the resize_inode feature if there
+ * are no more reserved gdt blocks, and then convert the file system
+ * to enable meta_bg
+ */
+static int ext4_convert_meta_bg(struct super_block *sb, struct inode *inode)
+{
+ handle_t *handle;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_super_block *es = sbi->s_es;
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ ext4_fsblk_t nr;
+ int i, ret, err = 0;
+ int credits = 1;
+
+ ext4_msg(sb, KERN_INFO, "Converting file system to meta_bg");
+ if (inode) {
+ if (es->s_reserved_gdt_blocks) {
+ ext4_error(sb, "Unexpected non-zero "
+ "s_reserved_gdt_blocks");
+ return -EPERM;
+ }
+
+ /* Do a quick sanity check of the resize inode */
+ if (inode->i_blocks != 1 << (inode->i_blkbits - 9))
+ goto invalid_resize_inode;
+ for (i = 0; i < EXT4_N_BLOCKS; i++) {
+ if (i == EXT4_DIND_BLOCK) {
+ if (ei->i_data[i])
+ continue;
+ else
+ goto invalid_resize_inode;
+ }
+ if (ei->i_data[i])
+ goto invalid_resize_inode;
+ }
+ credits += 3; /* block bitmap, bg descriptor, resize inode */
}
- mutex_lock(&EXT4_SB(sb)->s_resize_lock);
- if (o_blocks_count != ext4_blocks_count(es)) {
- ext4_warning(sb, "multiple resizers run on filesystem!");
- mutex_unlock(&EXT4_SB(sb)->s_resize_lock);
- ext4_journal_stop(handle);
- err = -EBUSY;
- goto exit_put;
+ handle = ext4_journal_start_sb(sb, EXT4_HT_RESIZE, credits);
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
+
+ BUFFER_TRACE(sbi->s_sbh, "get_write_access");
+ err = ext4_journal_get_write_access(handle, sbi->s_sbh);
+ if (err)
+ goto errout;
+
+ EXT4_CLEAR_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_RESIZE_INODE);
+ EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_META_BG);
+ sbi->s_es->s_first_meta_bg =
+ cpu_to_le32(num_desc_blocks(sb, sbi->s_groups_count));
+
+ err = ext4_handle_dirty_super(handle, sb);
+ if (err) {
+ ext4_std_error(sb, err);
+ goto errout;
}
- if ((err = ext4_journal_get_write_access(handle,
- EXT4_SB(sb)->s_sbh))) {
- ext4_warning(sb, "error %d on journal write access", err);
- mutex_unlock(&EXT4_SB(sb)->s_resize_lock);
- ext4_journal_stop(handle);
- goto exit_put;
+ if (inode) {
+ nr = le32_to_cpu(ei->i_data[EXT4_DIND_BLOCK]);
+ ext4_free_blocks(handle, inode, NULL, nr, 1,
+ EXT4_FREE_BLOCKS_METADATA |
+ EXT4_FREE_BLOCKS_FORGET);
+ ei->i_data[EXT4_DIND_BLOCK] = 0;
+ inode->i_blocks = 0;
+
+ err = ext4_mark_inode_dirty(handle, inode);
+ if (err)
+ ext4_std_error(sb, err);
}
- ext4_blocks_count_set(es, o_blocks_count + add);
- mutex_unlock(&EXT4_SB(sb)->s_resize_lock);
- ext4_debug("freeing blocks %llu through %llu\n", o_blocks_count,
- o_blocks_count + add);
- /* We add the blocks to the bitmap and set the group need init bit */
- ext4_add_groupblocks(handle, sb, o_blocks_count, add);
- ext4_handle_dirty_super(handle, sb);
- ext4_debug("freed blocks %llu through %llu\n", o_blocks_count,
- o_blocks_count + add);
- if ((err = ext4_journal_stop(handle)))
- goto exit_put;
- if (test_opt(sb, DEBUG))
- printk(KERN_DEBUG "EXT4-fs: extended group to %llu blocks\n",
- ext4_blocks_count(es));
- update_backups(sb, EXT4_SB(sb)->s_sbh->b_blocknr, (char *)es,
- sizeof(struct ext4_super_block));
-exit_put:
+errout:
+ ret = ext4_journal_stop(handle);
+ if (!err)
+ err = ret;
+ return ret;
+
+invalid_resize_inode:
+ ext4_error(sb, "corrupted/inconsistent resize inode");
+ return -EINVAL;
+}
+
+/*
+ * ext4_resize_fs() resizes a fs to new size specified by @n_blocks_count
+ *
+ * @sb: super block of the fs to be resized
+ * @n_blocks_count: the number of blocks resides in the resized fs
+ */
+int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count)
+{
+ struct ext4_new_flex_group_data *flex_gd = NULL;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_super_block *es = sbi->s_es;
+ struct buffer_head *bh;
+ struct inode *resize_inode = NULL;
+ ext4_grpblk_t add, offset;
+ unsigned long n_desc_blocks;
+ unsigned long o_desc_blocks;
+ ext4_group_t o_group;
+ ext4_group_t n_group;
+ ext4_fsblk_t o_blocks_count;
+ ext4_fsblk_t n_blocks_count_retry = 0;
+ unsigned long last_update_time = 0;
+ int err = 0, flexbg_size = 1 << sbi->s_log_groups_per_flex;
+ int meta_bg;
+
+ /* See if the device is actually as big as what was requested */
+ bh = sb_bread(sb, n_blocks_count - 1);
+ if (!bh) {
+ ext4_warning(sb, "can't read last block, resize aborted");
+ return -ENOSPC;
+ }
+ brelse(bh);
+
+retry:
+ o_blocks_count = ext4_blocks_count(es);
+
+ ext4_msg(sb, KERN_INFO, "resizing filesystem from %llu "
+ "to %llu blocks", o_blocks_count, n_blocks_count);
+
+ if (n_blocks_count < o_blocks_count) {
+ /* On-line shrinking not supported */
+ ext4_warning(sb, "can't shrink FS - resize aborted");
+ return -EINVAL;
+ }
+
+ if (n_blocks_count == o_blocks_count)
+ /* Nothing need to do */
+ return 0;
+
+ n_group = ext4_get_group_number(sb, n_blocks_count - 1);
+ if (n_group > (0xFFFFFFFFUL / EXT4_INODES_PER_GROUP(sb))) {
+ ext4_warning(sb, "resize would cause inodes_count overflow");
+ return -EINVAL;
+ }
+ ext4_get_group_no_and_offset(sb, o_blocks_count - 1, &o_group, &offset);
+
+ n_desc_blocks = num_desc_blocks(sb, n_group + 1);
+ o_desc_blocks = num_desc_blocks(sb, sbi->s_groups_count);
+
+ meta_bg = EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_META_BG);
+
+ if (EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_RESIZE_INODE)) {
+ if (meta_bg) {
+ ext4_error(sb, "resize_inode and meta_bg enabled "
+ "simultaneously");
+ return -EINVAL;
+ }
+ if (n_desc_blocks > o_desc_blocks +
+ le16_to_cpu(es->s_reserved_gdt_blocks)) {
+ n_blocks_count_retry = n_blocks_count;
+ n_desc_blocks = o_desc_blocks +
+ le16_to_cpu(es->s_reserved_gdt_blocks);
+ n_group = n_desc_blocks * EXT4_DESC_PER_BLOCK(sb);
+ n_blocks_count = n_group * EXT4_BLOCKS_PER_GROUP(sb);
+ n_group--; /* set to last group number */
+ }
+
+ if (!resize_inode)
+ resize_inode = ext4_iget(sb, EXT4_RESIZE_INO);
+ if (IS_ERR(resize_inode)) {
+ ext4_warning(sb, "Error opening resize inode");
+ return PTR_ERR(resize_inode);
+ }
+ }
+
+ if ((!resize_inode && !meta_bg) || n_blocks_count == o_blocks_count) {
+ err = ext4_convert_meta_bg(sb, resize_inode);
+ if (err)
+ goto out;
+ if (resize_inode) {
+ iput(resize_inode);
+ resize_inode = NULL;
+ }
+ if (n_blocks_count_retry) {
+ n_blocks_count = n_blocks_count_retry;
+ n_blocks_count_retry = 0;
+ goto retry;
+ }
+ }
+
+ /* extend the last group */
+ if (n_group == o_group)
+ add = n_blocks_count - o_blocks_count;
+ else
+ add = EXT4_BLOCKS_PER_GROUP(sb) - (offset + 1);
+ if (add > 0) {
+ err = ext4_group_extend_no_check(sb, o_blocks_count, add);
+ if (err)
+ goto out;
+ }
+
+ if (ext4_blocks_count(es) == n_blocks_count)
+ goto out;
+
+ err = ext4_alloc_flex_bg_array(sb, n_group + 1);
+ if (err)
+ return err;
+
+ err = ext4_mb_alloc_groupinfo(sb, n_group + 1);
+ if (err)
+ goto out;
+
+ flex_gd = alloc_flex_gd(flexbg_size);
+ if (flex_gd == NULL) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ /* Add flex groups. Note that a regular group is a
+ * flex group with 1 group.
+ */
+ while (ext4_setup_next_flex_gd(sb, flex_gd, n_blocks_count,
+ flexbg_size)) {
+ if (jiffies - last_update_time > HZ * 10) {
+ if (last_update_time)
+ ext4_msg(sb, KERN_INFO,
+ "resized to %llu blocks",
+ ext4_blocks_count(es));
+ last_update_time = jiffies;
+ }
+ if (ext4_alloc_group_tables(sb, flex_gd, flexbg_size) != 0)
+ break;
+ err = ext4_flex_group_add(sb, resize_inode, flex_gd);
+ if (unlikely(err))
+ break;
+ }
+
+ if (!err && n_blocks_count_retry) {
+ n_blocks_count = n_blocks_count_retry;
+ n_blocks_count_retry = 0;
+ free_flex_gd(flex_gd);
+ flex_gd = NULL;
+ goto retry;
+ }
+
+out:
+ if (flex_gd)
+ free_flex_gd(flex_gd);
+ if (resize_inode != NULL)
+ iput(resize_inode);
+ ext4_msg(sb, KERN_INFO, "resized filesystem to %llu", n_blocks_count);
return err;
-} /* ext4_group_extend */
+}
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 61182fe6254..6df7bc611db 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -38,12 +38,14 @@
#include <linux/ctype.h>
#include <linux/log2.h>
#include <linux/crc16.h>
+#include <linux/cleancache.h>
#include <asm/uaccess.h>
#include <linux/kthread.h>
#include <linux/freezer.h>
#include "ext4.h"
+#include "ext4_extents.h" /* Needed for trace points definition */
#include "ext4_jbd2.h"
#include "xattr.h"
#include "acl.h"
@@ -54,29 +56,50 @@
static struct proc_dir_entry *ext4_proc_root;
static struct kset *ext4_kset;
-struct ext4_lazy_init *ext4_li_info;
-struct mutex ext4_li_mtx;
-struct ext4_features *ext4_feat;
+static struct ext4_lazy_init *ext4_li_info;
+static struct mutex ext4_li_mtx;
+static struct ext4_features *ext4_feat;
+static int ext4_mballoc_ready;
static int ext4_load_journal(struct super_block *, struct ext4_super_block *,
unsigned long journal_devnum);
+static int ext4_show_options(struct seq_file *seq, struct dentry *root);
static int ext4_commit_super(struct super_block *sb, int sync);
static void ext4_mark_recovery_complete(struct super_block *sb,
struct ext4_super_block *es);
static void ext4_clear_journal_err(struct super_block *sb,
struct ext4_super_block *es);
static int ext4_sync_fs(struct super_block *sb, int wait);
-static const char *ext4_decode_error(struct super_block *sb, int errno,
- char nbuf[16]);
+static int ext4_sync_fs_nojournal(struct super_block *sb, int wait);
static int ext4_remount(struct super_block *sb, int *flags, char *data);
static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf);
static int ext4_unfreeze(struct super_block *sb);
-static void ext4_write_super(struct super_block *sb);
static int ext4_freeze(struct super_block *sb);
static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags,
const char *dev_name, void *data);
+static inline int ext2_feature_set_ok(struct super_block *sb);
+static inline int ext3_feature_set_ok(struct super_block *sb);
+static int ext4_feature_set_ok(struct super_block *sb, int readonly);
static void ext4_destroy_lazyinit_thread(void);
static void ext4_unregister_li_request(struct super_block *sb);
+static void ext4_clear_request_list(void);
+static int ext4_reserve_clusters(struct ext4_sb_info *, ext4_fsblk_t);
+
+#if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
+static struct file_system_type ext2_fs_type = {
+ .owner = THIS_MODULE,
+ .name = "ext2",
+ .mount = ext4_mount,
+ .kill_sb = kill_block_super,
+ .fs_flags = FS_REQUIRES_DEV,
+};
+MODULE_ALIAS_FS("ext2");
+MODULE_ALIAS("ext2");
+#define IS_EXT2_SB(sb) ((sb)->s_bdev->bd_holder == &ext2_fs_type)
+#else
+#define IS_EXT2_SB(sb) (0)
+#endif
+
#if !defined(CONFIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
static struct file_system_type ext3_fs_type = {
@@ -86,11 +109,85 @@ static struct file_system_type ext3_fs_type = {
.kill_sb = kill_block_super,
.fs_flags = FS_REQUIRES_DEV,
};
+MODULE_ALIAS_FS("ext3");
+MODULE_ALIAS("ext3");
#define IS_EXT3_SB(sb) ((sb)->s_bdev->bd_holder == &ext3_fs_type)
#else
#define IS_EXT3_SB(sb) (0)
#endif
+static int ext4_verify_csum_type(struct super_block *sb,
+ struct ext4_super_block *es)
+{
+ if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
+ EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+ return 1;
+
+ return es->s_checksum_type == EXT4_CRC32C_CHKSUM;
+}
+
+static __le32 ext4_superblock_csum(struct super_block *sb,
+ struct ext4_super_block *es)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ int offset = offsetof(struct ext4_super_block, s_checksum);
+ __u32 csum;
+
+ csum = ext4_chksum(sbi, ~0, (char *)es, offset);
+
+ return cpu_to_le32(csum);
+}
+
+static int ext4_superblock_csum_verify(struct super_block *sb,
+ struct ext4_super_block *es)
+{
+ if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
+ EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+ return 1;
+
+ return es->s_checksum == ext4_superblock_csum(sb, es);
+}
+
+void ext4_superblock_csum_set(struct super_block *sb)
+{
+ struct ext4_super_block *es = EXT4_SB(sb)->s_es;
+
+ if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
+ EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+ return;
+
+ es->s_checksum = ext4_superblock_csum(sb, es);
+}
+
+void *ext4_kvmalloc(size_t size, gfp_t flags)
+{
+ void *ret;
+
+ ret = kmalloc(size, flags | __GFP_NOWARN);
+ if (!ret)
+ ret = __vmalloc(size, flags, PAGE_KERNEL);
+ return ret;
+}
+
+void *ext4_kvzalloc(size_t size, gfp_t flags)
+{
+ void *ret;
+
+ ret = kzalloc(size, flags | __GFP_NOWARN);
+ if (!ret)
+ ret = __vmalloc(size, flags | __GFP_ZERO, PAGE_KERNEL);
+ return ret;
+}
+
+void ext4_kvfree(void *ptr)
+{
+ if (is_vmalloc_addr(ptr))
+ vfree(ptr);
+ else
+ kfree(ptr);
+
+}
+
ext4_fsblk_t ext4_block_bitmap(struct super_block *sb,
struct ext4_group_desc *bg)
{
@@ -115,8 +212,8 @@ ext4_fsblk_t ext4_inode_table(struct super_block *sb,
(ext4_fsblk_t)le32_to_cpu(bg->bg_inode_table_hi) << 32 : 0);
}
-__u32 ext4_free_blks_count(struct super_block *sb,
- struct ext4_group_desc *bg)
+__u32 ext4_free_group_clusters(struct super_block *sb,
+ struct ext4_group_desc *bg)
{
return le16_to_cpu(bg->bg_free_blocks_count_lo) |
(EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
@@ -171,8 +268,8 @@ void ext4_inode_table_set(struct super_block *sb,
bg->bg_inode_table_hi = cpu_to_le32(blk >> 32);
}
-void ext4_free_blks_set(struct super_block *sb,
- struct ext4_group_desc *bg, __u32 count)
+void ext4_free_group_clusters_set(struct super_block *sb,
+ struct ext4_group_desc *bg, __u32 count)
{
bg->bg_free_blocks_count_lo = cpu_to_le16((__u16)count);
if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
@@ -204,116 +301,6 @@ void ext4_itable_unused_set(struct super_block *sb,
}
-/* Just increment the non-pointer handle value */
-static handle_t *ext4_get_nojournal(void)
-{
- handle_t *handle = current->journal_info;
- unsigned long ref_cnt = (unsigned long)handle;
-
- BUG_ON(ref_cnt >= EXT4_NOJOURNAL_MAX_REF_COUNT);
-
- ref_cnt++;
- handle = (handle_t *)ref_cnt;
-
- current->journal_info = handle;
- return handle;
-}
-
-
-/* Decrement the non-pointer handle value */
-static void ext4_put_nojournal(handle_t *handle)
-{
- unsigned long ref_cnt = (unsigned long)handle;
-
- BUG_ON(ref_cnt == 0);
-
- ref_cnt--;
- handle = (handle_t *)ref_cnt;
-
- current->journal_info = handle;
-}
-
-/*
- * Wrappers for jbd2_journal_start/end.
- *
- * The only special thing we need to do here is to make sure that all
- * journal_end calls result in the superblock being marked dirty, so
- * that sync() will call the filesystem's write_super callback if
- * appropriate.
- */
-handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks)
-{
- journal_t *journal;
-
- if (sb->s_flags & MS_RDONLY)
- return ERR_PTR(-EROFS);
-
- vfs_check_frozen(sb, SB_FREEZE_TRANS);
- /* Special case here: if the journal has aborted behind our
- * backs (eg. EIO in the commit thread), then we still need to
- * take the FS itself readonly cleanly. */
- journal = EXT4_SB(sb)->s_journal;
- if (journal) {
- if (is_journal_aborted(journal)) {
- ext4_abort(sb, "Detected aborted journal");
- return ERR_PTR(-EROFS);
- }
- return jbd2_journal_start(journal, nblocks);
- }
- return ext4_get_nojournal();
-}
-
-/*
- * The only special thing we need to do here is to make sure that all
- * jbd2_journal_stop calls result in the superblock being marked dirty, so
- * that sync() will call the filesystem's write_super callback if
- * appropriate.
- */
-int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle)
-{
- struct super_block *sb;
- int err;
- int rc;
-
- if (!ext4_handle_valid(handle)) {
- ext4_put_nojournal(handle);
- return 0;
- }
- sb = handle->h_transaction->t_journal->j_private;
- err = handle->h_err;
- rc = jbd2_journal_stop(handle);
-
- if (!err)
- err = rc;
- if (err)
- __ext4_std_error(sb, where, line, err);
- return err;
-}
-
-void ext4_journal_abort_handle(const char *caller, unsigned int line,
- const char *err_fn, struct buffer_head *bh,
- handle_t *handle, int err)
-{
- char nbuf[16];
- const char *errstr = ext4_decode_error(NULL, err, nbuf);
-
- BUG_ON(!ext4_handle_valid(handle));
-
- if (bh)
- BUFFER_TRACE(bh, "abort");
-
- if (!handle->h_err)
- handle->h_err = err;
-
- if (is_handle_aborted(handle))
- return;
-
- printk(KERN_ERR "%s:%d: aborting transaction: %s in %s\n",
- caller, line, errstr, err_fn);
-
- jbd2_journal_abort_handle(handle);
-}
-
static void __save_error_info(struct super_block *sb, const char *func,
unsigned int line)
{
@@ -338,7 +325,7 @@ static void __save_error_info(struct super_block *sb, const char *func,
*/
if (!es->s_error_count)
mod_timer(&EXT4_SB(sb)->s_err_report, jiffies + 24*60*60*HZ);
- es->s_error_count = cpu_to_le32(le32_to_cpu(es->s_error_count) + 1);
+ le32_add_cpu(&es->s_error_count, 1);
}
static void save_error_info(struct super_block *sb, const char *func,
@@ -348,6 +335,41 @@ static void save_error_info(struct super_block *sb, const char *func,
ext4_commit_super(sb, 1);
}
+/*
+ * The del_gendisk() function uninitializes the disk-specific data
+ * structures, including the bdi structure, without telling anyone
+ * else. Once this happens, any attempt to call mark_buffer_dirty()
+ * (for example, by ext4_commit_super), will cause a kernel OOPS.
+ * This is a kludge to prevent these oops until we can put in a proper
+ * hook in del_gendisk() to inform the VFS and file system layers.
+ */
+static int block_device_ejected(struct super_block *sb)
+{
+ struct inode *bd_inode = sb->s_bdev->bd_inode;
+ struct backing_dev_info *bdi = bd_inode->i_mapping->backing_dev_info;
+
+ return bdi->dev == NULL;
+}
+
+static void ext4_journal_commit_callback(journal_t *journal, transaction_t *txn)
+{
+ struct super_block *sb = journal->j_private;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ int error = is_journal_aborted(journal);
+ struct ext4_journal_cb_entry *jce;
+
+ BUG_ON(txn->t_state == T_FINISHED);
+ spin_lock(&sbi->s_md_lock);
+ while (!list_empty(&txn->t_private_list)) {
+ jce = list_entry(txn->t_private_list.next,
+ struct ext4_journal_cb_entry, jce_list);
+ list_del_init(&jce->jce_list);
+ spin_unlock(&sbi->s_md_lock);
+ jce->jce_func(sb, jce, error);
+ spin_lock(&sbi->s_md_lock);
+ }
+ spin_unlock(&sbi->s_md_lock);
+}
/* Deal with the reporting of failure conditions on a filesystem such as
* inconsistencies detected or read IO failures.
@@ -378,6 +400,11 @@ static void ext4_handle_error(struct super_block *sb)
}
if (test_opt(sb, ERRORS_RO)) {
ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only");
+ /*
+ * Make sure updated value of ->s_mount_flags will be visible
+ * before ->s_flags update
+ */
+ smp_wmb();
sb->s_flags |= MS_RDONLY;
}
if (test_opt(sb, ERRORS_PANIC))
@@ -385,73 +412,98 @@ static void ext4_handle_error(struct super_block *sb)
sb->s_id);
}
+#define ext4_error_ratelimit(sb) \
+ ___ratelimit(&(EXT4_SB(sb)->s_err_ratelimit_state), \
+ "EXT4-fs error")
+
void __ext4_error(struct super_block *sb, const char *function,
unsigned int line, const char *fmt, ...)
{
+ struct va_format vaf;
va_list args;
- va_start(args, fmt);
- printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: comm %s: ",
- sb->s_id, function, line, current->comm);
- vprintk(fmt, args);
- printk("\n");
- va_end(args);
-
+ if (ext4_error_ratelimit(sb)) {
+ va_start(args, fmt);
+ vaf.fmt = fmt;
+ vaf.va = &args;
+ printk(KERN_CRIT
+ "EXT4-fs error (device %s): %s:%d: comm %s: %pV\n",
+ sb->s_id, function, line, current->comm, &vaf);
+ va_end(args);
+ }
+ save_error_info(sb, function, line);
ext4_handle_error(sb);
}
-void ext4_error_inode(struct inode *inode, const char *function,
- unsigned int line, ext4_fsblk_t block,
- const char *fmt, ...)
+void __ext4_error_inode(struct inode *inode, const char *function,
+ unsigned int line, ext4_fsblk_t block,
+ const char *fmt, ...)
{
va_list args;
+ struct va_format vaf;
struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
es->s_last_error_ino = cpu_to_le32(inode->i_ino);
es->s_last_error_block = cpu_to_le64(block);
+ if (ext4_error_ratelimit(inode->i_sb)) {
+ va_start(args, fmt);
+ vaf.fmt = fmt;
+ vaf.va = &args;
+ if (block)
+ printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: "
+ "inode #%lu: block %llu: comm %s: %pV\n",
+ inode->i_sb->s_id, function, line, inode->i_ino,
+ block, current->comm, &vaf);
+ else
+ printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: "
+ "inode #%lu: comm %s: %pV\n",
+ inode->i_sb->s_id, function, line, inode->i_ino,
+ current->comm, &vaf);
+ va_end(args);
+ }
save_error_info(inode->i_sb, function, line);
- va_start(args, fmt);
- printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: inode #%lu: ",
- inode->i_sb->s_id, function, line, inode->i_ino);
- if (block)
- printk("block %llu: ", block);
- printk("comm %s: ", current->comm);
- vprintk(fmt, args);
- printk("\n");
- va_end(args);
-
ext4_handle_error(inode->i_sb);
}
-void ext4_error_file(struct file *file, const char *function,
- unsigned int line, const char *fmt, ...)
+void __ext4_error_file(struct file *file, const char *function,
+ unsigned int line, ext4_fsblk_t block,
+ const char *fmt, ...)
{
va_list args;
+ struct va_format vaf;
struct ext4_super_block *es;
- struct inode *inode = file->f_dentry->d_inode;
+ struct inode *inode = file_inode(file);
char pathname[80], *path;
es = EXT4_SB(inode->i_sb)->s_es;
es->s_last_error_ino = cpu_to_le32(inode->i_ino);
+ if (ext4_error_ratelimit(inode->i_sb)) {
+ path = d_path(&(file->f_path), pathname, sizeof(pathname));
+ if (IS_ERR(path))
+ path = "(unknown)";
+ va_start(args, fmt);
+ vaf.fmt = fmt;
+ vaf.va = &args;
+ if (block)
+ printk(KERN_CRIT
+ "EXT4-fs error (device %s): %s:%d: inode #%lu: "
+ "block %llu: comm %s: path %s: %pV\n",
+ inode->i_sb->s_id, function, line, inode->i_ino,
+ block, current->comm, path, &vaf);
+ else
+ printk(KERN_CRIT
+ "EXT4-fs error (device %s): %s:%d: inode #%lu: "
+ "comm %s: path %s: %pV\n",
+ inode->i_sb->s_id, function, line, inode->i_ino,
+ current->comm, path, &vaf);
+ va_end(args);
+ }
save_error_info(inode->i_sb, function, line);
- va_start(args, fmt);
- path = d_path(&(file->f_path), pathname, sizeof(pathname));
- if (!path)
- path = "(unknown)";
- printk(KERN_CRIT
- "EXT4-fs error (device %s): %s:%d: inode #%lu "
- "(comm %s path %s): ",
- inode->i_sb->s_id, function, line, inode->i_ino,
- current->comm, path);
- vprintk(fmt, args);
- printk("\n");
- va_end(args);
-
ext4_handle_error(inode->i_sb);
}
-static const char *ext4_decode_error(struct super_block *sb, int errno,
- char nbuf[16])
+const char *ext4_decode_error(struct super_block *sb, int errno,
+ char nbuf[16])
{
char *errstr = NULL;
@@ -500,11 +552,13 @@ void __ext4_std_error(struct super_block *sb, const char *function,
(sb->s_flags & MS_RDONLY))
return;
- errstr = ext4_decode_error(sb, errno, nbuf);
- printk(KERN_CRIT "EXT4-fs error (device %s) in %s:%d: %s\n",
- sb->s_id, function, line, errstr);
- save_error_info(sb, function, line);
+ if (ext4_error_ratelimit(sb)) {
+ errstr = ext4_decode_error(sb, errno, nbuf);
+ printk(KERN_CRIT "EXT4-fs error (device %s) in %s:%d: %s\n",
+ sb->s_id, function, line, errstr);
+ }
+ save_error_info(sb, function, line);
ext4_handle_error(sb);
}
@@ -533,8 +587,13 @@ void __ext4_abort(struct super_block *sb, const char *function,
if ((sb->s_flags & MS_RDONLY) == 0) {
ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only");
- sb->s_flags |= MS_RDONLY;
EXT4_SB(sb)->s_mount_flags |= EXT4_MF_FS_ABORTED;
+ /*
+ * Make sure updated value of ->s_mount_flags will be visible
+ * before ->s_flags update
+ */
+ smp_wmb();
+ sb->s_flags |= MS_RDONLY;
if (EXT4_SB(sb)->s_journal)
jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO);
save_error_info(sb, function, line);
@@ -543,28 +602,37 @@ void __ext4_abort(struct super_block *sb, const char *function,
panic("EXT4-fs panic from previous error\n");
}
-void ext4_msg (struct super_block * sb, const char *prefix,
- const char *fmt, ...)
+void __ext4_msg(struct super_block *sb,
+ const char *prefix, const char *fmt, ...)
{
+ struct va_format vaf;
va_list args;
+ if (!___ratelimit(&(EXT4_SB(sb)->s_msg_ratelimit_state), "EXT4-fs"))
+ return;
+
va_start(args, fmt);
- printk("%sEXT4-fs (%s): ", prefix, sb->s_id);
- vprintk(fmt, args);
- printk("\n");
+ vaf.fmt = fmt;
+ vaf.va = &args;
+ printk("%sEXT4-fs (%s): %pV\n", prefix, sb->s_id, &vaf);
va_end(args);
}
void __ext4_warning(struct super_block *sb, const char *function,
unsigned int line, const char *fmt, ...)
{
+ struct va_format vaf;
va_list args;
+ if (!___ratelimit(&(EXT4_SB(sb)->s_warning_ratelimit_state),
+ "EXT4-fs warning"))
+ return;
+
va_start(args, fmt);
- printk(KERN_WARNING "EXT4-fs warning (device %s): %s:%d: ",
- sb->s_id, function, line);
- vprintk(fmt, args);
- printk("\n");
+ vaf.fmt = fmt;
+ vaf.va = &args;
+ printk(KERN_WARNING "EXT4-fs warning (device %s): %s:%d: %pV\n",
+ sb->s_id, function, line, &vaf);
va_end(args);
}
@@ -575,22 +643,28 @@ void __ext4_grp_locked_error(const char *function, unsigned int line,
__releases(bitlock)
__acquires(bitlock)
{
+ struct va_format vaf;
va_list args;
struct ext4_super_block *es = EXT4_SB(sb)->s_es;
es->s_last_error_ino = cpu_to_le32(ino);
es->s_last_error_block = cpu_to_le64(block);
__save_error_info(sb, function, line);
- va_start(args, fmt);
- printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: group %u",
- sb->s_id, function, line, grp);
- if (ino)
- printk("inode %lu: ", ino);
- if (block)
- printk("block %llu:", (unsigned long long) block);
- vprintk(fmt, args);
- printk("\n");
- va_end(args);
+
+ if (ext4_error_ratelimit(sb)) {
+ va_start(args, fmt);
+ vaf.fmt = fmt;
+ vaf.va = &args;
+ printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: group %u, ",
+ sb->s_id, function, line, grp);
+ if (ino)
+ printk(KERN_CONT "inode %lu: ", ino);
+ if (block)
+ printk(KERN_CONT "block %llu:",
+ (unsigned long long) block);
+ printk(KERN_CONT "%pV\n", &vaf);
+ va_end(args);
+ }
if (test_opt(sb, ERRORS_CONT)) {
ext4_commit_super(sb, 0);
@@ -605,7 +679,7 @@ __acquires(bitlock)
* filesystem will have already been marked read/only and the
* journal has been aborted. We return 1 as a hint to callers
* who might what to use the return value from
- * ext4_grp_locked_error() to distinguish beween the
+ * ext4_grp_locked_error() to distinguish between the
* ERRORS_CONT and ERRORS_RO case, and perhaps return more
* aggressively from the ext4 function in question, with a
* more appropriate error code.
@@ -647,7 +721,7 @@ static struct block_device *ext4_blkdev_get(dev_t dev, struct super_block *sb)
struct block_device *bdev;
char b[BDEVNAME_SIZE];
- bdev = open_by_devnum(dev, FMODE_READ|FMODE_WRITE);
+ bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL, sb);
if (IS_ERR(bdev))
goto fail;
return bdev;
@@ -661,23 +735,19 @@ fail:
/*
* Release the journal device
*/
-static int ext4_blkdev_put(struct block_device *bdev)
+static void ext4_blkdev_put(struct block_device *bdev)
{
- bd_release(bdev);
- return blkdev_put(bdev, FMODE_READ|FMODE_WRITE);
+ blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
}
-static int ext4_blkdev_remove(struct ext4_sb_info *sbi)
+static void ext4_blkdev_remove(struct ext4_sb_info *sbi)
{
struct block_device *bdev;
- int ret = -ENODEV;
-
bdev = sbi->journal_bdev;
if (bdev) {
- ret = ext4_blkdev_put(bdev);
+ ext4_blkdev_put(bdev);
sbi->journal_bdev = NULL;
}
- return ret;
}
static inline struct inode *orphan_list_entry(struct list_head *l)
@@ -712,12 +782,8 @@ static void ext4_put_super(struct super_block *sb)
ext4_unregister_li_request(sb);
dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
- flush_workqueue(sbi->dio_unwritten_wq);
- destroy_workqueue(sbi->dio_unwritten_wq);
-
- lock_super(sb);
- if (sb->s_dirt)
- ext4_commit_super(sb, 1);
+ flush_workqueue(sbi->rsv_conversion_wq);
+ destroy_workqueue(sbi->rsv_conversion_wq);
if (sbi->s_journal) {
err = jbd2_journal_destroy(sbi->s_journal);
@@ -726,7 +792,8 @@ static void ext4_put_super(struct super_block *sb)
ext4_abort(sb, "Couldn't clean up the journal");
}
- del_timer(&sbi->s_err_report);
+ ext4_es_unregister_shrinker(sbi);
+ del_timer_sync(&sbi->s_err_report);
ext4_release_system_zone(sb);
ext4_mb_release(sb);
ext4_ext_release(sb);
@@ -735,24 +802,25 @@ static void ext4_put_super(struct super_block *sb)
if (!(sb->s_flags & MS_RDONLY)) {
EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
es->s_state = cpu_to_le16(sbi->s_mount_state);
- ext4_commit_super(sb, 1);
}
+ if (!(sb->s_flags & MS_RDONLY))
+ ext4_commit_super(sb, 1);
+
if (sbi->s_proc) {
+ remove_proc_entry("options", sbi->s_proc);
remove_proc_entry(sb->s_id, ext4_proc_root);
}
kobject_del(&sbi->s_kobj);
for (i = 0; i < sbi->s_gdb_count; i++)
brelse(sbi->s_group_desc[i]);
- kfree(sbi->s_group_desc);
- if (is_vmalloc_addr(sbi->s_flex_groups))
- vfree(sbi->s_flex_groups);
- else
- kfree(sbi->s_flex_groups);
- percpu_counter_destroy(&sbi->s_freeblocks_counter);
+ ext4_kvfree(sbi->s_group_desc);
+ ext4_kvfree(sbi->s_flex_groups);
+ percpu_counter_destroy(&sbi->s_freeclusters_counter);
percpu_counter_destroy(&sbi->s_freeinodes_counter);
percpu_counter_destroy(&sbi->s_dirs_counter);
- percpu_counter_destroy(&sbi->s_dirtyblocks_counter);
+ percpu_counter_destroy(&sbi->s_dirtyclusters_counter);
+ percpu_counter_destroy(&sbi->s_extent_cache_cnt);
brelse(sbi->s_sbh);
#ifdef CONFIG_QUOTA
for (i = 0; i < MAXQUOTAS; i++)
@@ -778,14 +846,21 @@ static void ext4_put_super(struct super_block *sb)
invalidate_bdev(sbi->journal_bdev);
ext4_blkdev_remove(sbi);
}
+ if (sbi->s_mb_cache) {
+ ext4_xattr_destroy_cache(sbi->s_mb_cache);
+ sbi->s_mb_cache = NULL;
+ }
+ if (sbi->s_mmp_tsk)
+ kthread_stop(sbi->s_mmp_tsk);
sb->s_fs_info = NULL;
/*
* Now that we are completely done shutting down the
* superblock, we need to actually destroy the kobject.
*/
- unlock_super(sb);
kobject_put(&sbi->s_kobj);
wait_for_completion(&sbi->s_kobj_unregister);
+ if (sbi->s_chksum_driver)
+ crypto_free_shash(sbi->s_chksum_driver);
kfree(sbi->s_blockgroup_lock);
kfree(sbi);
}
@@ -804,31 +879,31 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
return NULL;
ei->vfs_inode.i_version = 1;
- ei->vfs_inode.i_data.writeback_index = 0;
- memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache));
+ spin_lock_init(&ei->i_raw_lock);
INIT_LIST_HEAD(&ei->i_prealloc_list);
spin_lock_init(&ei->i_prealloc_lock);
- /*
- * Note: We can be called before EXT4_SB(sb)->s_journal is set,
- * therefore it can be null here. Don't check it, just initialize
- * jinode.
- */
- jbd2_journal_init_jbd_inode(&ei->jinode, &ei->vfs_inode);
+ ext4_es_init_tree(&ei->i_es_tree);
+ rwlock_init(&ei->i_es_lock);
+ INIT_LIST_HEAD(&ei->i_es_lru);
+ ei->i_es_lru_nr = 0;
+ ei->i_touch_when = 0;
ei->i_reserved_data_blocks = 0;
ei->i_reserved_meta_blocks = 0;
ei->i_allocated_meta_blocks = 0;
ei->i_da_metadata_calc_len = 0;
- ei->i_delalloc_reserved_flag = 0;
+ ei->i_da_metadata_calc_last_lblock = 0;
spin_lock_init(&(ei->i_block_reservation_lock));
#ifdef CONFIG_QUOTA
ei->i_reserved_quota = 0;
#endif
- INIT_LIST_HEAD(&ei->i_completed_io_list);
+ ei->jinode = NULL;
+ INIT_LIST_HEAD(&ei->i_rsv_conversion_list);
spin_lock_init(&ei->i_completed_io_lock);
- ei->cur_aio_dio = NULL;
ei->i_sync_tid = 0;
ei->i_datasync_tid = 0;
atomic_set(&ei->i_ioend_count, 0);
+ atomic_set(&ei->i_unwritten, 0);
+ INIT_WORK(&ei->i_rsv_conversion_work, ext4_end_io_rsv_work);
return &ei->vfs_inode;
}
@@ -841,9 +916,14 @@ static int ext4_drop_inode(struct inode *inode)
return drop;
}
+static void ext4_i_callback(struct rcu_head *head)
+{
+ struct inode *inode = container_of(head, struct inode, i_rcu);
+ kmem_cache_free(ext4_inode_cachep, EXT4_I(inode));
+}
+
static void ext4_destroy_inode(struct inode *inode)
{
- ext4_ioend_wait(inode);
if (!list_empty(&(EXT4_I(inode)->i_orphan))) {
ext4_msg(inode->i_sb, KERN_ERR,
"Inode %lu (%p): orphan list check failed!",
@@ -853,7 +933,7 @@ static void ext4_destroy_inode(struct inode *inode)
true);
dump_stack();
}
- kmem_cache_free(ext4_inode_cachep, EXT4_I(inode));
+ call_rcu(&inode->i_rcu, ext4_i_callback);
}
static void init_once(void *foo)
@@ -861,14 +941,12 @@ static void init_once(void *foo)
struct ext4_inode_info *ei = (struct ext4_inode_info *) foo;
INIT_LIST_HEAD(&ei->i_orphan);
-#ifdef CONFIG_EXT4_FS_XATTR
init_rwsem(&ei->xattr_sem);
-#endif
init_rwsem(&ei->i_data_sem);
inode_init_once(&ei->vfs_inode);
}
-static int init_inodecache(void)
+static int __init init_inodecache(void)
{
ext4_inode_cachep = kmem_cache_create("ext4_inode_cache",
sizeof(struct ext4_inode_info),
@@ -882,195 +960,28 @@ static int init_inodecache(void)
static void destroy_inodecache(void)
{
+ /*
+ * Make sure all delayed rcu free inodes are flushed before we
+ * destroy cache.
+ */
+ rcu_barrier();
kmem_cache_destroy(ext4_inode_cachep);
}
void ext4_clear_inode(struct inode *inode)
{
invalidate_inode_buffers(inode);
- end_writeback(inode);
+ clear_inode(inode);
dquot_drop(inode);
ext4_discard_preallocations(inode);
- if (EXT4_JOURNAL(inode))
- jbd2_journal_release_jbd_inode(EXT4_SB(inode->i_sb)->s_journal,
- &EXT4_I(inode)->jinode);
-}
-
-static inline void ext4_show_quota_options(struct seq_file *seq,
- struct super_block *sb)
-{
-#if defined(CONFIG_QUOTA)
- struct ext4_sb_info *sbi = EXT4_SB(sb);
-
- if (sbi->s_jquota_fmt) {
- char *fmtname = "";
-
- switch (sbi->s_jquota_fmt) {
- case QFMT_VFS_OLD:
- fmtname = "vfsold";
- break;
- case QFMT_VFS_V0:
- fmtname = "vfsv0";
- break;
- case QFMT_VFS_V1:
- fmtname = "vfsv1";
- break;
- }
- seq_printf(seq, ",jqfmt=%s", fmtname);
- }
-
- if (sbi->s_qf_names[USRQUOTA])
- seq_printf(seq, ",usrjquota=%s", sbi->s_qf_names[USRQUOTA]);
-
- if (sbi->s_qf_names[GRPQUOTA])
- seq_printf(seq, ",grpjquota=%s", sbi->s_qf_names[GRPQUOTA]);
-
- if (test_opt(sb, USRQUOTA))
- seq_puts(seq, ",usrquota");
-
- if (test_opt(sb, GRPQUOTA))
- seq_puts(seq, ",grpquota");
-#endif
-}
-
-/*
- * Show an option if
- * - it's set to a non-default value OR
- * - if the per-sb default is different from the global default
- */
-static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
-{
- int def_errors;
- unsigned long def_mount_opts;
- struct super_block *sb = vfs->mnt_sb;
- struct ext4_sb_info *sbi = EXT4_SB(sb);
- struct ext4_super_block *es = sbi->s_es;
-
- def_mount_opts = le32_to_cpu(es->s_default_mount_opts);
- def_errors = le16_to_cpu(es->s_errors);
-
- if (sbi->s_sb_block != 1)
- seq_printf(seq, ",sb=%llu", sbi->s_sb_block);
- if (test_opt(sb, MINIX_DF))
- seq_puts(seq, ",minixdf");
- if (test_opt(sb, GRPID) && !(def_mount_opts & EXT4_DEFM_BSDGROUPS))
- seq_puts(seq, ",grpid");
- if (!test_opt(sb, GRPID) && (def_mount_opts & EXT4_DEFM_BSDGROUPS))
- seq_puts(seq, ",nogrpid");
- if (sbi->s_resuid != EXT4_DEF_RESUID ||
- le16_to_cpu(es->s_def_resuid) != EXT4_DEF_RESUID) {
- seq_printf(seq, ",resuid=%u", sbi->s_resuid);
- }
- if (sbi->s_resgid != EXT4_DEF_RESGID ||
- le16_to_cpu(es->s_def_resgid) != EXT4_DEF_RESGID) {
- seq_printf(seq, ",resgid=%u", sbi->s_resgid);
- }
- if (test_opt(sb, ERRORS_RO)) {
- if (def_errors == EXT4_ERRORS_PANIC ||
- def_errors == EXT4_ERRORS_CONTINUE) {
- seq_puts(seq, ",errors=remount-ro");
- }
- }
- if (test_opt(sb, ERRORS_CONT) && def_errors != EXT4_ERRORS_CONTINUE)
- seq_puts(seq, ",errors=continue");
- if (test_opt(sb, ERRORS_PANIC) && def_errors != EXT4_ERRORS_PANIC)
- seq_puts(seq, ",errors=panic");
- if (test_opt(sb, NO_UID32) && !(def_mount_opts & EXT4_DEFM_UID16))
- seq_puts(seq, ",nouid32");
- if (test_opt(sb, DEBUG) && !(def_mount_opts & EXT4_DEFM_DEBUG))
- seq_puts(seq, ",debug");
- if (test_opt(sb, OLDALLOC))
- seq_puts(seq, ",oldalloc");
-#ifdef CONFIG_EXT4_FS_XATTR
- if (test_opt(sb, XATTR_USER) &&
- !(def_mount_opts & EXT4_DEFM_XATTR_USER))
- seq_puts(seq, ",user_xattr");
- if (!test_opt(sb, XATTR_USER) &&
- (def_mount_opts & EXT4_DEFM_XATTR_USER)) {
- seq_puts(seq, ",nouser_xattr");
- }
-#endif
-#ifdef CONFIG_EXT4_FS_POSIX_ACL
- if (test_opt(sb, POSIX_ACL) && !(def_mount_opts & EXT4_DEFM_ACL))
- seq_puts(seq, ",acl");
- if (!test_opt(sb, POSIX_ACL) && (def_mount_opts & EXT4_DEFM_ACL))
- seq_puts(seq, ",noacl");
-#endif
- if (sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ) {
- seq_printf(seq, ",commit=%u",
- (unsigned) (sbi->s_commit_interval / HZ));
- }
- if (sbi->s_min_batch_time != EXT4_DEF_MIN_BATCH_TIME) {
- seq_printf(seq, ",min_batch_time=%u",
- (unsigned) sbi->s_min_batch_time);
+ ext4_es_remove_extent(inode, 0, EXT_MAX_BLOCKS);
+ ext4_es_lru_del(inode);
+ if (EXT4_I(inode)->jinode) {
+ jbd2_journal_release_jbd_inode(EXT4_JOURNAL(inode),
+ EXT4_I(inode)->jinode);
+ jbd2_free_inode(EXT4_I(inode)->jinode);
+ EXT4_I(inode)->jinode = NULL;
}
- if (sbi->s_max_batch_time != EXT4_DEF_MAX_BATCH_TIME) {
- seq_printf(seq, ",max_batch_time=%u",
- (unsigned) sbi->s_min_batch_time);
- }
-
- /*
- * We're changing the default of barrier mount option, so
- * let's always display its mount state so it's clear what its
- * status is.
- */
- seq_puts(seq, ",barrier=");
- seq_puts(seq, test_opt(sb, BARRIER) ? "1" : "0");
- if (test_opt(sb, JOURNAL_ASYNC_COMMIT))
- seq_puts(seq, ",journal_async_commit");
- else if (test_opt(sb, JOURNAL_CHECKSUM))
- seq_puts(seq, ",journal_checksum");
- if (test_opt(sb, I_VERSION))
- seq_puts(seq, ",i_version");
- if (!test_opt(sb, DELALLOC) &&
- !(def_mount_opts & EXT4_DEFM_NODELALLOC))
- seq_puts(seq, ",nodelalloc");
-
- if (sbi->s_stripe)
- seq_printf(seq, ",stripe=%lu", sbi->s_stripe);
- /*
- * journal mode get enabled in different ways
- * So just print the value even if we didn't specify it
- */
- if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)
- seq_puts(seq, ",data=journal");
- else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA)
- seq_puts(seq, ",data=ordered");
- else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)
- seq_puts(seq, ",data=writeback");
-
- if (sbi->s_inode_readahead_blks != EXT4_DEF_INODE_READAHEAD_BLKS)
- seq_printf(seq, ",inode_readahead_blks=%u",
- sbi->s_inode_readahead_blks);
-
- if (test_opt(sb, DATA_ERR_ABORT))
- seq_puts(seq, ",data_err=abort");
-
- if (test_opt(sb, NO_AUTO_DA_ALLOC))
- seq_puts(seq, ",noauto_da_alloc");
-
- if (test_opt(sb, DISCARD) && !(def_mount_opts & EXT4_DEFM_DISCARD))
- seq_puts(seq, ",discard");
-
- if (test_opt(sb, NOLOAD))
- seq_puts(seq, ",norecovery");
-
- if (test_opt(sb, DIOREAD_NOLOCK))
- seq_puts(seq, ",dioread_nolock");
-
- if (test_opt(sb, BLOCK_VALIDITY) &&
- !(def_mount_opts & EXT4_DEFM_BLOCK_VALIDITY))
- seq_puts(seq, ",block_validity");
-
- if (!test_opt(sb, INIT_INODE_TABLE))
- seq_puts(seq, ",noinit_inode_table");
- else if (sbi->s_li_wait_mult)
- seq_printf(seq, ",init_inode_table=%u",
- (unsigned) sbi->s_li_wait_mult);
-
- ext4_show_quota_options(seq, sb);
-
- return 0;
}
static struct inode *ext4_nfs_get_inode(struct super_block *sb,
@@ -1146,18 +1057,22 @@ static int ext4_release_dquot(struct dquot *dquot);
static int ext4_mark_dquot_dirty(struct dquot *dquot);
static int ext4_write_info(struct super_block *sb, int type);
static int ext4_quota_on(struct super_block *sb, int type, int format_id,
- char *path);
+ struct path *path);
+static int ext4_quota_on_sysfile(struct super_block *sb, int type,
+ int format_id);
static int ext4_quota_off(struct super_block *sb, int type);
+static int ext4_quota_off_sysfile(struct super_block *sb, int type);
static int ext4_quota_on_mount(struct super_block *sb, int type);
static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data,
size_t len, loff_t off);
static ssize_t ext4_quota_write(struct super_block *sb, int type,
const char *data, size_t len, loff_t off);
+static int ext4_quota_enable(struct super_block *sb, int type, int format_id,
+ unsigned int flags);
+static int ext4_enable_quotas(struct super_block *sb);
static const struct dquot_operations ext4_quota_operations = {
-#ifdef CONFIG_QUOTA
.get_reserved_space = ext4_get_reserved_space,
-#endif
.write_dquot = ext4_write_dquot,
.acquire_dquot = ext4_acquire_dquot,
.release_dquot = ext4_release_dquot,
@@ -1176,6 +1091,16 @@ static const struct quotactl_ops ext4_qctl_operations = {
.get_dqblk = dquot_get_dqblk,
.set_dqblk = dquot_set_dqblk
};
+
+static const struct quotactl_ops ext4_qctl_sysfile_operations = {
+ .quota_on_meta = ext4_quota_on_sysfile,
+ .quota_off = ext4_quota_off_sysfile,
+ .quota_sync = dquot_quota_sync,
+ .get_info = dquot_get_dqinfo,
+ .set_info = dquot_set_dqinfo,
+ .get_dqblk = dquot_get_dqblk,
+ .set_dqblk = dquot_set_dqblk
+};
#endif
static const struct super_operations ext4_sops = {
@@ -1197,7 +1122,6 @@ static const struct super_operations ext4_sops = {
.quota_write = ext4_quota_write,
#endif
.bdev_try_to_free_page = bdev_try_to_free_page,
- .trim_fs = ext4_trim_fs
};
static const struct super_operations ext4_nojournal_sops = {
@@ -1207,7 +1131,7 @@ static const struct super_operations ext4_nojournal_sops = {
.dirty_inode = ext4_dirty_inode,
.drop_inode = ext4_drop_inode,
.evict_inode = ext4_evict_inode,
- .write_super = ext4_write_super,
+ .sync_fs = ext4_sync_fs_nojournal,
.put_super = ext4_put_super,
.statfs = ext4_statfs,
.remount_fs = ext4_remount,
@@ -1228,24 +1152,23 @@ static const struct export_operations ext4_export_ops = {
enum {
Opt_bsd_df, Opt_minix_df, Opt_grpid, Opt_nogrpid,
Opt_resgid, Opt_resuid, Opt_sb, Opt_err_cont, Opt_err_panic, Opt_err_ro,
- Opt_nouid32, Opt_debug, Opt_oldalloc, Opt_orlov,
+ Opt_nouid32, Opt_debug, Opt_removed,
Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl,
- Opt_auto_da_alloc, Opt_noauto_da_alloc, Opt_noload, Opt_nobh, Opt_bh,
- Opt_commit, Opt_min_batch_time, Opt_max_batch_time,
- Opt_journal_update, Opt_journal_dev,
- Opt_journal_checksum, Opt_journal_async_commit,
+ Opt_auto_da_alloc, Opt_noauto_da_alloc, Opt_noload,
+ Opt_commit, Opt_min_batch_time, Opt_max_batch_time, Opt_journal_dev,
+ Opt_journal_path, Opt_journal_checksum, Opt_journal_async_commit,
Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
Opt_data_err_abort, Opt_data_err_ignore,
Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota,
- Opt_noquota, Opt_ignore, Opt_barrier, Opt_nobarrier, Opt_err,
- Opt_resize, Opt_usrquota, Opt_grpquota, Opt_i_version,
- Opt_stripe, Opt_delalloc, Opt_nodelalloc,
- Opt_block_validity, Opt_noblock_validity,
+ Opt_noquota, Opt_barrier, Opt_nobarrier, Opt_err,
+ Opt_usrquota, Opt_grpquota, Opt_i_version,
+ Opt_stripe, Opt_delalloc, Opt_nodelalloc, Opt_mblk_io_submit,
+ Opt_nomblk_io_submit, Opt_block_validity, Opt_noblock_validity,
Opt_inode_readahead_blks, Opt_journal_ioprio,
Opt_dioread_nolock, Opt_dioread_lock,
- Opt_discard, Opt_nodiscard,
- Opt_init_inode_table, Opt_noinit_inode_table,
+ Opt_discard, Opt_nodiscard, Opt_init_itable, Opt_noinit_itable,
+ Opt_max_dir_size_kb,
};
static const match_table_t tokens = {
@@ -1263,21 +1186,21 @@ static const match_table_t tokens = {
{Opt_err_ro, "errors=remount-ro"},
{Opt_nouid32, "nouid32"},
{Opt_debug, "debug"},
- {Opt_oldalloc, "oldalloc"},
- {Opt_orlov, "orlov"},
+ {Opt_removed, "oldalloc"},
+ {Opt_removed, "orlov"},
{Opt_user_xattr, "user_xattr"},
{Opt_nouser_xattr, "nouser_xattr"},
{Opt_acl, "acl"},
{Opt_noacl, "noacl"},
- {Opt_noload, "noload"},
{Opt_noload, "norecovery"},
- {Opt_nobh, "nobh"},
- {Opt_bh, "bh"},
+ {Opt_noload, "noload"},
+ {Opt_removed, "nobh"},
+ {Opt_removed, "bh"},
{Opt_commit, "commit=%u"},
{Opt_min_batch_time, "min_batch_time=%u"},
{Opt_max_batch_time, "max_batch_time=%u"},
- {Opt_journal_update, "journal=update"},
{Opt_journal_dev, "journal_dev=%u"},
+ {Opt_journal_path, "journal_path=%s"},
{Opt_journal_checksum, "journal_checksum"},
{Opt_journal_async_commit, "journal_async_commit"},
{Opt_abort, "abort"},
@@ -1302,9 +1225,10 @@ static const match_table_t tokens = {
{Opt_nobarrier, "nobarrier"},
{Opt_i_version, "i_version"},
{Opt_stripe, "stripe=%u"},
- {Opt_resize, "resize"},
{Opt_delalloc, "delalloc"},
{Opt_nodelalloc, "nodelalloc"},
+ {Opt_removed, "mblk_io_submit"},
+ {Opt_removed, "nomblk_io_submit"},
{Opt_block_validity, "block_validity"},
{Opt_noblock_validity, "noblock_validity"},
{Opt_inode_readahead_blks, "inode_readahead_blks=%u"},
@@ -1316,9 +1240,15 @@ static const match_table_t tokens = {
{Opt_dioread_lock, "dioread_lock"},
{Opt_discard, "discard"},
{Opt_nodiscard, "nodiscard"},
- {Opt_init_inode_table, "init_itable=%u"},
- {Opt_init_inode_table, "init_itable"},
- {Opt_noinit_inode_table, "noinit_itable"},
+ {Opt_init_itable, "init_itable=%u"},
+ {Opt_init_itable, "init_itable"},
+ {Opt_noinit_itable, "noinit_itable"},
+ {Opt_max_dir_size_kb, "max_dir_size_kb=%u"},
+ {Opt_removed, "check=none"}, /* mount option from ext2/3 */
+ {Opt_removed, "nocheck"}, /* mount option from ext2/3 */
+ {Opt_removed, "reservation"}, /* mount option from ext2/3 */
+ {Opt_removed, "noreservation"}, /* mount option from ext2/3 */
+ {Opt_removed, "journal=%u"}, /* mount option from ext2/3 */
{Opt_err, NULL},
};
@@ -1354,37 +1284,46 @@ static int set_qf_name(struct super_block *sb, int qtype, substring_t *args)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
char *qname;
+ int ret = -1;
if (sb_any_quota_loaded(sb) &&
!sbi->s_qf_names[qtype]) {
ext4_msg(sb, KERN_ERR,
"Cannot change journaled "
"quota options when quota turned on");
- return 0;
+ return -1;
+ }
+ if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA)) {
+ ext4_msg(sb, KERN_ERR, "Cannot set journaled quota options "
+ "when QUOTA feature is enabled");
+ return -1;
}
qname = match_strdup(args);
if (!qname) {
ext4_msg(sb, KERN_ERR,
"Not enough memory for storing quotafile name");
- return 0;
+ return -1;
}
- if (sbi->s_qf_names[qtype] &&
- strcmp(sbi->s_qf_names[qtype], qname)) {
- ext4_msg(sb, KERN_ERR,
- "%s quota file already specified", QTYPE2NAME(qtype));
- kfree(qname);
- return 0;
+ if (sbi->s_qf_names[qtype]) {
+ if (strcmp(sbi->s_qf_names[qtype], qname) == 0)
+ ret = 1;
+ else
+ ext4_msg(sb, KERN_ERR,
+ "%s quota file already specified",
+ QTYPE2NAME(qtype));
+ goto errout;
}
- sbi->s_qf_names[qtype] = qname;
- if (strchr(sbi->s_qf_names[qtype], '/')) {
+ if (strchr(qname, '/')) {
ext4_msg(sb, KERN_ERR,
"quotafile must be on filesystem root");
- kfree(sbi->s_qf_names[qtype]);
- sbi->s_qf_names[qtype] = NULL;
- return 0;
+ goto errout;
}
- set_opt(sbi->s_mount_opt, QUOTA);
+ sbi->s_qf_names[qtype] = qname;
+ set_opt(sb, QUOTA);
return 1;
+errout:
+ kfree(qname);
+ return ret;
}
static int clear_qf_name(struct super_block *sb, int qtype)
@@ -1396,427 +1335,371 @@ static int clear_qf_name(struct super_block *sb, int qtype)
sbi->s_qf_names[qtype]) {
ext4_msg(sb, KERN_ERR, "Cannot change journaled quota options"
" when quota turned on");
- return 0;
+ return -1;
}
- /*
- * The space will be released later when all options are confirmed
- * to be correct
- */
+ kfree(sbi->s_qf_names[qtype]);
sbi->s_qf_names[qtype] = NULL;
return 1;
}
#endif
+#define MOPT_SET 0x0001
+#define MOPT_CLEAR 0x0002
+#define MOPT_NOSUPPORT 0x0004
+#define MOPT_EXPLICIT 0x0008
+#define MOPT_CLEAR_ERR 0x0010
+#define MOPT_GTE0 0x0020
+#ifdef CONFIG_QUOTA
+#define MOPT_Q 0
+#define MOPT_QFMT 0x0040
+#else
+#define MOPT_Q MOPT_NOSUPPORT
+#define MOPT_QFMT MOPT_NOSUPPORT
+#endif
+#define MOPT_DATAJ 0x0080
+#define MOPT_NO_EXT2 0x0100
+#define MOPT_NO_EXT3 0x0200
+#define MOPT_EXT4_ONLY (MOPT_NO_EXT2 | MOPT_NO_EXT3)
+#define MOPT_STRING 0x0400
+
+static const struct mount_opts {
+ int token;
+ int mount_opt;
+ int flags;
+} ext4_mount_opts[] = {
+ {Opt_minix_df, EXT4_MOUNT_MINIX_DF, MOPT_SET},
+ {Opt_bsd_df, EXT4_MOUNT_MINIX_DF, MOPT_CLEAR},
+ {Opt_grpid, EXT4_MOUNT_GRPID, MOPT_SET},
+ {Opt_nogrpid, EXT4_MOUNT_GRPID, MOPT_CLEAR},
+ {Opt_block_validity, EXT4_MOUNT_BLOCK_VALIDITY, MOPT_SET},
+ {Opt_noblock_validity, EXT4_MOUNT_BLOCK_VALIDITY, MOPT_CLEAR},
+ {Opt_dioread_nolock, EXT4_MOUNT_DIOREAD_NOLOCK,
+ MOPT_EXT4_ONLY | MOPT_SET},
+ {Opt_dioread_lock, EXT4_MOUNT_DIOREAD_NOLOCK,
+ MOPT_EXT4_ONLY | MOPT_CLEAR},
+ {Opt_discard, EXT4_MOUNT_DISCARD, MOPT_SET},
+ {Opt_nodiscard, EXT4_MOUNT_DISCARD, MOPT_CLEAR},
+ {Opt_delalloc, EXT4_MOUNT_DELALLOC,
+ MOPT_EXT4_ONLY | MOPT_SET | MOPT_EXPLICIT},
+ {Opt_nodelalloc, EXT4_MOUNT_DELALLOC,
+ MOPT_EXT4_ONLY | MOPT_CLEAR},
+ {Opt_journal_checksum, EXT4_MOUNT_JOURNAL_CHECKSUM,
+ MOPT_EXT4_ONLY | MOPT_SET},
+ {Opt_journal_async_commit, (EXT4_MOUNT_JOURNAL_ASYNC_COMMIT |
+ EXT4_MOUNT_JOURNAL_CHECKSUM),
+ MOPT_EXT4_ONLY | MOPT_SET},
+ {Opt_noload, EXT4_MOUNT_NOLOAD, MOPT_NO_EXT2 | MOPT_SET},
+ {Opt_err_panic, EXT4_MOUNT_ERRORS_PANIC, MOPT_SET | MOPT_CLEAR_ERR},
+ {Opt_err_ro, EXT4_MOUNT_ERRORS_RO, MOPT_SET | MOPT_CLEAR_ERR},
+ {Opt_err_cont, EXT4_MOUNT_ERRORS_CONT, MOPT_SET | MOPT_CLEAR_ERR},
+ {Opt_data_err_abort, EXT4_MOUNT_DATA_ERR_ABORT,
+ MOPT_NO_EXT2 | MOPT_SET},
+ {Opt_data_err_ignore, EXT4_MOUNT_DATA_ERR_ABORT,
+ MOPT_NO_EXT2 | MOPT_CLEAR},
+ {Opt_barrier, EXT4_MOUNT_BARRIER, MOPT_SET},
+ {Opt_nobarrier, EXT4_MOUNT_BARRIER, MOPT_CLEAR},
+ {Opt_noauto_da_alloc, EXT4_MOUNT_NO_AUTO_DA_ALLOC, MOPT_SET},
+ {Opt_auto_da_alloc, EXT4_MOUNT_NO_AUTO_DA_ALLOC, MOPT_CLEAR},
+ {Opt_noinit_itable, EXT4_MOUNT_INIT_INODE_TABLE, MOPT_CLEAR},
+ {Opt_commit, 0, MOPT_GTE0},
+ {Opt_max_batch_time, 0, MOPT_GTE0},
+ {Opt_min_batch_time, 0, MOPT_GTE0},
+ {Opt_inode_readahead_blks, 0, MOPT_GTE0},
+ {Opt_init_itable, 0, MOPT_GTE0},
+ {Opt_stripe, 0, MOPT_GTE0},
+ {Opt_resuid, 0, MOPT_GTE0},
+ {Opt_resgid, 0, MOPT_GTE0},
+ {Opt_journal_dev, 0, MOPT_GTE0},
+ {Opt_journal_path, 0, MOPT_STRING},
+ {Opt_journal_ioprio, 0, MOPT_GTE0},
+ {Opt_data_journal, EXT4_MOUNT_JOURNAL_DATA, MOPT_NO_EXT2 | MOPT_DATAJ},
+ {Opt_data_ordered, EXT4_MOUNT_ORDERED_DATA, MOPT_NO_EXT2 | MOPT_DATAJ},
+ {Opt_data_writeback, EXT4_MOUNT_WRITEBACK_DATA,
+ MOPT_NO_EXT2 | MOPT_DATAJ},
+ {Opt_user_xattr, EXT4_MOUNT_XATTR_USER, MOPT_SET},
+ {Opt_nouser_xattr, EXT4_MOUNT_XATTR_USER, MOPT_CLEAR},
+#ifdef CONFIG_EXT4_FS_POSIX_ACL
+ {Opt_acl, EXT4_MOUNT_POSIX_ACL, MOPT_SET},
+ {Opt_noacl, EXT4_MOUNT_POSIX_ACL, MOPT_CLEAR},
+#else
+ {Opt_acl, 0, MOPT_NOSUPPORT},
+ {Opt_noacl, 0, MOPT_NOSUPPORT},
+#endif
+ {Opt_nouid32, EXT4_MOUNT_NO_UID32, MOPT_SET},
+ {Opt_debug, EXT4_MOUNT_DEBUG, MOPT_SET},
+ {Opt_quota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA, MOPT_SET | MOPT_Q},
+ {Opt_usrquota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA,
+ MOPT_SET | MOPT_Q},
+ {Opt_grpquota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_GRPQUOTA,
+ MOPT_SET | MOPT_Q},
+ {Opt_noquota, (EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA |
+ EXT4_MOUNT_GRPQUOTA), MOPT_CLEAR | MOPT_Q},
+ {Opt_usrjquota, 0, MOPT_Q},
+ {Opt_grpjquota, 0, MOPT_Q},
+ {Opt_offusrjquota, 0, MOPT_Q},
+ {Opt_offgrpjquota, 0, MOPT_Q},
+ {Opt_jqfmt_vfsold, QFMT_VFS_OLD, MOPT_QFMT},
+ {Opt_jqfmt_vfsv0, QFMT_VFS_V0, MOPT_QFMT},
+ {Opt_jqfmt_vfsv1, QFMT_VFS_V1, MOPT_QFMT},
+ {Opt_max_dir_size_kb, 0, MOPT_GTE0},
+ {Opt_err, 0, 0}
+};
+
+static int handle_mount_opt(struct super_block *sb, char *opt, int token,
+ substring_t *args, unsigned long *journal_devnum,
+ unsigned int *journal_ioprio, int is_remount)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ const struct mount_opts *m;
+ kuid_t uid;
+ kgid_t gid;
+ int arg = 0;
+
+#ifdef CONFIG_QUOTA
+ if (token == Opt_usrjquota)
+ return set_qf_name(sb, USRQUOTA, &args[0]);
+ else if (token == Opt_grpjquota)
+ return set_qf_name(sb, GRPQUOTA, &args[0]);
+ else if (token == Opt_offusrjquota)
+ return clear_qf_name(sb, USRQUOTA);
+ else if (token == Opt_offgrpjquota)
+ return clear_qf_name(sb, GRPQUOTA);
+#endif
+ switch (token) {
+ case Opt_noacl:
+ case Opt_nouser_xattr:
+ ext4_msg(sb, KERN_WARNING, deprecated_msg, opt, "3.5");
+ break;
+ case Opt_sb:
+ return 1; /* handled by get_sb_block() */
+ case Opt_removed:
+ ext4_msg(sb, KERN_WARNING, "Ignoring removed %s option", opt);
+ return 1;
+ case Opt_abort:
+ sbi->s_mount_flags |= EXT4_MF_FS_ABORTED;
+ return 1;
+ case Opt_i_version:
+ sb->s_flags |= MS_I_VERSION;
+ return 1;
+ }
+
+ for (m = ext4_mount_opts; m->token != Opt_err; m++)
+ if (token == m->token)
+ break;
+
+ if (m->token == Opt_err) {
+ ext4_msg(sb, KERN_ERR, "Unrecognized mount option \"%s\" "
+ "or missing value", opt);
+ return -1;
+ }
+
+ if ((m->flags & MOPT_NO_EXT2) && IS_EXT2_SB(sb)) {
+ ext4_msg(sb, KERN_ERR,
+ "Mount option \"%s\" incompatible with ext2", opt);
+ return -1;
+ }
+ if ((m->flags & MOPT_NO_EXT3) && IS_EXT3_SB(sb)) {
+ ext4_msg(sb, KERN_ERR,
+ "Mount option \"%s\" incompatible with ext3", opt);
+ return -1;
+ }
+
+ if (args->from && !(m->flags & MOPT_STRING) && match_int(args, &arg))
+ return -1;
+ if (args->from && (m->flags & MOPT_GTE0) && (arg < 0))
+ return -1;
+ if (m->flags & MOPT_EXPLICIT)
+ set_opt2(sb, EXPLICIT_DELALLOC);
+ if (m->flags & MOPT_CLEAR_ERR)
+ clear_opt(sb, ERRORS_MASK);
+ if (token == Opt_noquota && sb_any_quota_loaded(sb)) {
+ ext4_msg(sb, KERN_ERR, "Cannot change quota "
+ "options when quota turned on");
+ return -1;
+ }
+
+ if (m->flags & MOPT_NOSUPPORT) {
+ ext4_msg(sb, KERN_ERR, "%s option not supported", opt);
+ } else if (token == Opt_commit) {
+ if (arg == 0)
+ arg = JBD2_DEFAULT_MAX_COMMIT_AGE;
+ sbi->s_commit_interval = HZ * arg;
+ } else if (token == Opt_max_batch_time) {
+ sbi->s_max_batch_time = arg;
+ } else if (token == Opt_min_batch_time) {
+ sbi->s_min_batch_time = arg;
+ } else if (token == Opt_inode_readahead_blks) {
+ if (arg && (arg > (1 << 30) || !is_power_of_2(arg))) {
+ ext4_msg(sb, KERN_ERR,
+ "EXT4-fs: inode_readahead_blks must be "
+ "0 or a power of 2 smaller than 2^31");
+ return -1;
+ }
+ sbi->s_inode_readahead_blks = arg;
+ } else if (token == Opt_init_itable) {
+ set_opt(sb, INIT_INODE_TABLE);
+ if (!args->from)
+ arg = EXT4_DEF_LI_WAIT_MULT;
+ sbi->s_li_wait_mult = arg;
+ } else if (token == Opt_max_dir_size_kb) {
+ sbi->s_max_dir_size_kb = arg;
+ } else if (token == Opt_stripe) {
+ sbi->s_stripe = arg;
+ } else if (token == Opt_resuid) {
+ uid = make_kuid(current_user_ns(), arg);
+ if (!uid_valid(uid)) {
+ ext4_msg(sb, KERN_ERR, "Invalid uid value %d", arg);
+ return -1;
+ }
+ sbi->s_resuid = uid;
+ } else if (token == Opt_resgid) {
+ gid = make_kgid(current_user_ns(), arg);
+ if (!gid_valid(gid)) {
+ ext4_msg(sb, KERN_ERR, "Invalid gid value %d", arg);
+ return -1;
+ }
+ sbi->s_resgid = gid;
+ } else if (token == Opt_journal_dev) {
+ if (is_remount) {
+ ext4_msg(sb, KERN_ERR,
+ "Cannot specify journal on remount");
+ return -1;
+ }
+ *journal_devnum = arg;
+ } else if (token == Opt_journal_path) {
+ char *journal_path;
+ struct inode *journal_inode;
+ struct path path;
+ int error;
+
+ if (is_remount) {
+ ext4_msg(sb, KERN_ERR,
+ "Cannot specify journal on remount");
+ return -1;
+ }
+ journal_path = match_strdup(&args[0]);
+ if (!journal_path) {
+ ext4_msg(sb, KERN_ERR, "error: could not dup "
+ "journal device string");
+ return -1;
+ }
+
+ error = kern_path(journal_path, LOOKUP_FOLLOW, &path);
+ if (error) {
+ ext4_msg(sb, KERN_ERR, "error: could not find "
+ "journal device path: error %d", error);
+ kfree(journal_path);
+ return -1;
+ }
+
+ journal_inode = path.dentry->d_inode;
+ if (!S_ISBLK(journal_inode->i_mode)) {
+ ext4_msg(sb, KERN_ERR, "error: journal path %s "
+ "is not a block device", journal_path);
+ path_put(&path);
+ kfree(journal_path);
+ return -1;
+ }
+
+ *journal_devnum = new_encode_dev(journal_inode->i_rdev);
+ path_put(&path);
+ kfree(journal_path);
+ } else if (token == Opt_journal_ioprio) {
+ if (arg > 7) {
+ ext4_msg(sb, KERN_ERR, "Invalid journal IO priority"
+ " (must be 0-7)");
+ return -1;
+ }
+ *journal_ioprio =
+ IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, arg);
+ } else if (m->flags & MOPT_DATAJ) {
+ if (is_remount) {
+ if (!sbi->s_journal)
+ ext4_msg(sb, KERN_WARNING, "Remounting file system with no journal so ignoring journalled data option");
+ else if (test_opt(sb, DATA_FLAGS) != m->mount_opt) {
+ ext4_msg(sb, KERN_ERR,
+ "Cannot change data mode on remount");
+ return -1;
+ }
+ } else {
+ clear_opt(sb, DATA_FLAGS);
+ sbi->s_mount_opt |= m->mount_opt;
+ }
+#ifdef CONFIG_QUOTA
+ } else if (m->flags & MOPT_QFMT) {
+ if (sb_any_quota_loaded(sb) &&
+ sbi->s_jquota_fmt != m->mount_opt) {
+ ext4_msg(sb, KERN_ERR, "Cannot change journaled "
+ "quota options when quota turned on");
+ return -1;
+ }
+ if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
+ EXT4_FEATURE_RO_COMPAT_QUOTA)) {
+ ext4_msg(sb, KERN_ERR,
+ "Cannot set journaled quota options "
+ "when QUOTA feature is enabled");
+ return -1;
+ }
+ sbi->s_jquota_fmt = m->mount_opt;
+#endif
+ } else {
+ if (!args->from)
+ arg = 1;
+ if (m->flags & MOPT_CLEAR)
+ arg = !arg;
+ else if (unlikely(!(m->flags & MOPT_SET))) {
+ ext4_msg(sb, KERN_WARNING,
+ "buggy handling of option %s", opt);
+ WARN_ON(1);
+ return -1;
+ }
+ if (arg != 0)
+ sbi->s_mount_opt |= m->mount_opt;
+ else
+ sbi->s_mount_opt &= ~m->mount_opt;
+ }
+ return 1;
+}
+
static int parse_options(char *options, struct super_block *sb,
unsigned long *journal_devnum,
unsigned int *journal_ioprio,
- ext4_fsblk_t *n_blocks_count, int is_remount)
+ int is_remount)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
char *p;
substring_t args[MAX_OPT_ARGS];
- int data_opt = 0;
- int option;
-#ifdef CONFIG_QUOTA
- int qfmt;
-#endif
+ int token;
if (!options)
return 1;
while ((p = strsep(&options, ",")) != NULL) {
- int token;
if (!*p)
continue;
-
/*
* Initialize args struct so we know whether arg was
* found; some options take optional arguments.
*/
- args[0].to = args[0].from = 0;
+ args[0].to = args[0].from = NULL;
token = match_token(p, tokens, args);
- switch (token) {
- case Opt_bsd_df:
- ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38");
- clear_opt(sbi->s_mount_opt, MINIX_DF);
- break;
- case Opt_minix_df:
- ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38");
- set_opt(sbi->s_mount_opt, MINIX_DF);
-
- break;
- case Opt_grpid:
- ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38");
- set_opt(sbi->s_mount_opt, GRPID);
-
- break;
- case Opt_nogrpid:
- ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38");
- clear_opt(sbi->s_mount_opt, GRPID);
-
- break;
- case Opt_resuid:
- if (match_int(&args[0], &option))
- return 0;
- sbi->s_resuid = option;
- break;
- case Opt_resgid:
- if (match_int(&args[0], &option))
- return 0;
- sbi->s_resgid = option;
- break;
- case Opt_sb:
- /* handled by get_sb_block() instead of here */
- /* *sb_block = match_int(&args[0]); */
- break;
- case Opt_err_panic:
- clear_opt(sbi->s_mount_opt, ERRORS_CONT);
- clear_opt(sbi->s_mount_opt, ERRORS_RO);
- set_opt(sbi->s_mount_opt, ERRORS_PANIC);
- break;
- case Opt_err_ro:
- clear_opt(sbi->s_mount_opt, ERRORS_CONT);
- clear_opt(sbi->s_mount_opt, ERRORS_PANIC);
- set_opt(sbi->s_mount_opt, ERRORS_RO);
- break;
- case Opt_err_cont:
- clear_opt(sbi->s_mount_opt, ERRORS_RO);
- clear_opt(sbi->s_mount_opt, ERRORS_PANIC);
- set_opt(sbi->s_mount_opt, ERRORS_CONT);
- break;
- case Opt_nouid32:
- set_opt(sbi->s_mount_opt, NO_UID32);
- break;
- case Opt_debug:
- set_opt(sbi->s_mount_opt, DEBUG);
- break;
- case Opt_oldalloc:
- set_opt(sbi->s_mount_opt, OLDALLOC);
- break;
- case Opt_orlov:
- clear_opt(sbi->s_mount_opt, OLDALLOC);
- break;
-#ifdef CONFIG_EXT4_FS_XATTR
- case Opt_user_xattr:
- set_opt(sbi->s_mount_opt, XATTR_USER);
- break;
- case Opt_nouser_xattr:
- clear_opt(sbi->s_mount_opt, XATTR_USER);
- break;
-#else
- case Opt_user_xattr:
- case Opt_nouser_xattr:
- ext4_msg(sb, KERN_ERR, "(no)user_xattr options not supported");
- break;
-#endif
-#ifdef CONFIG_EXT4_FS_POSIX_ACL
- case Opt_acl:
- set_opt(sbi->s_mount_opt, POSIX_ACL);
- break;
- case Opt_noacl:
- clear_opt(sbi->s_mount_opt, POSIX_ACL);
- break;
-#else
- case Opt_acl:
- case Opt_noacl:
- ext4_msg(sb, KERN_ERR, "(no)acl options not supported");
- break;
-#endif
- case Opt_journal_update:
- /* @@@ FIXME */
- /* Eventually we will want to be able to create
- a journal file here. For now, only allow the
- user to specify an existing inode to be the
- journal file. */
- if (is_remount) {
- ext4_msg(sb, KERN_ERR,
- "Cannot specify journal on remount");
- return 0;
- }
- set_opt(sbi->s_mount_opt, UPDATE_JOURNAL);
- break;
- case Opt_journal_dev:
- if (is_remount) {
- ext4_msg(sb, KERN_ERR,
- "Cannot specify journal on remount");
- return 0;
- }
- if (match_int(&args[0], &option))
- return 0;
- *journal_devnum = option;
- break;
- case Opt_journal_checksum:
- set_opt(sbi->s_mount_opt, JOURNAL_CHECKSUM);
- break;
- case Opt_journal_async_commit:
- set_opt(sbi->s_mount_opt, JOURNAL_ASYNC_COMMIT);
- set_opt(sbi->s_mount_opt, JOURNAL_CHECKSUM);
- break;
- case Opt_noload:
- set_opt(sbi->s_mount_opt, NOLOAD);
- break;
- case Opt_commit:
- if (match_int(&args[0], &option))
- return 0;
- if (option < 0)
- return 0;
- if (option == 0)
- option = JBD2_DEFAULT_MAX_COMMIT_AGE;
- sbi->s_commit_interval = HZ * option;
- break;
- case Opt_max_batch_time:
- if (match_int(&args[0], &option))
- return 0;
- if (option < 0)
- return 0;
- if (option == 0)
- option = EXT4_DEF_MAX_BATCH_TIME;
- sbi->s_max_batch_time = option;
- break;
- case Opt_min_batch_time:
- if (match_int(&args[0], &option))
- return 0;
- if (option < 0)
- return 0;
- sbi->s_min_batch_time = option;
- break;
- case Opt_data_journal:
- data_opt = EXT4_MOUNT_JOURNAL_DATA;
- goto datacheck;
- case Opt_data_ordered:
- data_opt = EXT4_MOUNT_ORDERED_DATA;
- goto datacheck;
- case Opt_data_writeback:
- data_opt = EXT4_MOUNT_WRITEBACK_DATA;
- datacheck:
- if (is_remount) {
- if (test_opt(sb, DATA_FLAGS) != data_opt) {
- ext4_msg(sb, KERN_ERR,
- "Cannot change data mode on remount");
- return 0;
- }
- } else {
- clear_opt(sbi->s_mount_opt, DATA_FLAGS);
- sbi->s_mount_opt |= data_opt;
- }
- break;
- case Opt_data_err_abort:
- set_opt(sbi->s_mount_opt, DATA_ERR_ABORT);
- break;
- case Opt_data_err_ignore:
- clear_opt(sbi->s_mount_opt, DATA_ERR_ABORT);
- break;
-#ifdef CONFIG_QUOTA
- case Opt_usrjquota:
- if (!set_qf_name(sb, USRQUOTA, &args[0]))
- return 0;
- break;
- case Opt_grpjquota:
- if (!set_qf_name(sb, GRPQUOTA, &args[0]))
- return 0;
- break;
- case Opt_offusrjquota:
- if (!clear_qf_name(sb, USRQUOTA))
- return 0;
- break;
- case Opt_offgrpjquota:
- if (!clear_qf_name(sb, GRPQUOTA))
- return 0;
- break;
-
- case Opt_jqfmt_vfsold:
- qfmt = QFMT_VFS_OLD;
- goto set_qf_format;
- case Opt_jqfmt_vfsv0:
- qfmt = QFMT_VFS_V0;
- goto set_qf_format;
- case Opt_jqfmt_vfsv1:
- qfmt = QFMT_VFS_V1;
-set_qf_format:
- if (sb_any_quota_loaded(sb) &&
- sbi->s_jquota_fmt != qfmt) {
- ext4_msg(sb, KERN_ERR, "Cannot change "
- "journaled quota options when "
- "quota turned on");
- return 0;
- }
- sbi->s_jquota_fmt = qfmt;
- break;
- case Opt_quota:
- case Opt_usrquota:
- set_opt(sbi->s_mount_opt, QUOTA);
- set_opt(sbi->s_mount_opt, USRQUOTA);
- break;
- case Opt_grpquota:
- set_opt(sbi->s_mount_opt, QUOTA);
- set_opt(sbi->s_mount_opt, GRPQUOTA);
- break;
- case Opt_noquota:
- if (sb_any_quota_loaded(sb)) {
- ext4_msg(sb, KERN_ERR, "Cannot change quota "
- "options when quota turned on");
- return 0;
- }
- clear_opt(sbi->s_mount_opt, QUOTA);
- clear_opt(sbi->s_mount_opt, USRQUOTA);
- clear_opt(sbi->s_mount_opt, GRPQUOTA);
- break;
-#else
- case Opt_quota:
- case Opt_usrquota:
- case Opt_grpquota:
- ext4_msg(sb, KERN_ERR,
- "quota options not supported");
- break;
- case Opt_usrjquota:
- case Opt_grpjquota:
- case Opt_offusrjquota:
- case Opt_offgrpjquota:
- case Opt_jqfmt_vfsold:
- case Opt_jqfmt_vfsv0:
- case Opt_jqfmt_vfsv1:
- ext4_msg(sb, KERN_ERR,
- "journaled quota options not supported");
- break;
- case Opt_noquota:
- break;
-#endif
- case Opt_abort:
- sbi->s_mount_flags |= EXT4_MF_FS_ABORTED;
- break;
- case Opt_nobarrier:
- clear_opt(sbi->s_mount_opt, BARRIER);
- break;
- case Opt_barrier:
- if (args[0].from) {
- if (match_int(&args[0], &option))
- return 0;
- } else
- option = 1; /* No argument, default to 1 */
- if (option)
- set_opt(sbi->s_mount_opt, BARRIER);
- else
- clear_opt(sbi->s_mount_opt, BARRIER);
- break;
- case Opt_ignore:
- break;
- case Opt_resize:
- if (!is_remount) {
- ext4_msg(sb, KERN_ERR,
- "resize option only available "
- "for remount");
- return 0;
- }
- if (match_int(&args[0], &option) != 0)
- return 0;
- *n_blocks_count = option;
- break;
- case Opt_nobh:
- ext4_msg(sb, KERN_WARNING,
- "Ignoring deprecated nobh option");
- break;
- case Opt_bh:
- ext4_msg(sb, KERN_WARNING,
- "Ignoring deprecated bh option");
- break;
- case Opt_i_version:
- set_opt(sbi->s_mount_opt, I_VERSION);
- sb->s_flags |= MS_I_VERSION;
- break;
- case Opt_nodelalloc:
- clear_opt(sbi->s_mount_opt, DELALLOC);
- break;
- case Opt_stripe:
- if (match_int(&args[0], &option))
- return 0;
- if (option < 0)
- return 0;
- sbi->s_stripe = option;
- break;
- case Opt_delalloc:
- set_opt(sbi->s_mount_opt, DELALLOC);
- break;
- case Opt_block_validity:
- set_opt(sbi->s_mount_opt, BLOCK_VALIDITY);
- break;
- case Opt_noblock_validity:
- clear_opt(sbi->s_mount_opt, BLOCK_VALIDITY);
- break;
- case Opt_inode_readahead_blks:
- if (match_int(&args[0], &option))
- return 0;
- if (option < 0 || option > (1 << 30))
- return 0;
- if (!is_power_of_2(option)) {
- ext4_msg(sb, KERN_ERR,
- "EXT4-fs: inode_readahead_blks"
- " must be a power of 2");
- return 0;
- }
- sbi->s_inode_readahead_blks = option;
- break;
- case Opt_journal_ioprio:
- if (match_int(&args[0], &option))
- return 0;
- if (option < 0 || option > 7)
- break;
- *journal_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE,
- option);
- break;
- case Opt_noauto_da_alloc:
- set_opt(sbi->s_mount_opt,NO_AUTO_DA_ALLOC);
- break;
- case Opt_auto_da_alloc:
- if (args[0].from) {
- if (match_int(&args[0], &option))
- return 0;
- } else
- option = 1; /* No argument, default to 1 */
- if (option)
- clear_opt(sbi->s_mount_opt, NO_AUTO_DA_ALLOC);
- else
- set_opt(sbi->s_mount_opt,NO_AUTO_DA_ALLOC);
- break;
- case Opt_discard:
- set_opt(sbi->s_mount_opt, DISCARD);
- break;
- case Opt_nodiscard:
- clear_opt(sbi->s_mount_opt, DISCARD);
- break;
- case Opt_dioread_nolock:
- set_opt(sbi->s_mount_opt, DIOREAD_NOLOCK);
- break;
- case Opt_dioread_lock:
- clear_opt(sbi->s_mount_opt, DIOREAD_NOLOCK);
- break;
- case Opt_init_inode_table:
- set_opt(sbi->s_mount_opt, INIT_INODE_TABLE);
- if (args[0].from) {
- if (match_int(&args[0], &option))
- return 0;
- } else
- option = EXT4_DEF_LI_WAIT_MULT;
- if (option < 0)
- return 0;
- sbi->s_li_wait_mult = option;
- break;
- case Opt_noinit_inode_table:
- clear_opt(sbi->s_mount_opt, INIT_INODE_TABLE);
- break;
- default:
- ext4_msg(sb, KERN_ERR,
- "Unrecognized mount option \"%s\" "
- "or missing value", p);
+ if (handle_mount_opt(sb, p, token, args, journal_devnum,
+ journal_ioprio, is_remount) < 0)
return 0;
- }
}
#ifdef CONFIG_QUOTA
+ if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA) &&
+ (test_opt(sb, USRQUOTA) || test_opt(sb, GRPQUOTA))) {
+ ext4_msg(sb, KERN_ERR, "Cannot set quota options when QUOTA "
+ "feature is enabled");
+ return 0;
+ }
if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) {
if (test_opt(sb, USRQUOTA) && sbi->s_qf_names[USRQUOTA])
- clear_opt(sbi->s_mount_opt, USRQUOTA);
+ clear_opt(sb, USRQUOTA);
if (test_opt(sb, GRPQUOTA) && sbi->s_qf_names[GRPQUOTA])
- clear_opt(sbi->s_mount_opt, GRPQUOTA);
+ clear_opt(sb, GRPQUOTA);
if (test_opt(sb, GRPQUOTA) || test_opt(sb, USRQUOTA)) {
ext4_msg(sb, KERN_ERR, "old and new quota "
@@ -1838,9 +1721,171 @@ set_qf_format:
}
}
#endif
+ if (test_opt(sb, DIOREAD_NOLOCK)) {
+ int blocksize =
+ BLOCK_SIZE << le32_to_cpu(sbi->s_es->s_log_block_size);
+
+ if (blocksize < PAGE_CACHE_SIZE) {
+ ext4_msg(sb, KERN_ERR, "can't mount with "
+ "dioread_nolock if block size != PAGE_SIZE");
+ return 0;
+ }
+ }
return 1;
}
+static inline void ext4_show_quota_options(struct seq_file *seq,
+ struct super_block *sb)
+{
+#if defined(CONFIG_QUOTA)
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+ if (sbi->s_jquota_fmt) {
+ char *fmtname = "";
+
+ switch (sbi->s_jquota_fmt) {
+ case QFMT_VFS_OLD:
+ fmtname = "vfsold";
+ break;
+ case QFMT_VFS_V0:
+ fmtname = "vfsv0";
+ break;
+ case QFMT_VFS_V1:
+ fmtname = "vfsv1";
+ break;
+ }
+ seq_printf(seq, ",jqfmt=%s", fmtname);
+ }
+
+ if (sbi->s_qf_names[USRQUOTA])
+ seq_printf(seq, ",usrjquota=%s", sbi->s_qf_names[USRQUOTA]);
+
+ if (sbi->s_qf_names[GRPQUOTA])
+ seq_printf(seq, ",grpjquota=%s", sbi->s_qf_names[GRPQUOTA]);
+#endif
+}
+
+static const char *token2str(int token)
+{
+ const struct match_token *t;
+
+ for (t = tokens; t->token != Opt_err; t++)
+ if (t->token == token && !strchr(t->pattern, '='))
+ break;
+ return t->pattern;
+}
+
+/*
+ * Show an option if
+ * - it's set to a non-default value OR
+ * - if the per-sb default is different from the global default
+ */
+static int _ext4_show_options(struct seq_file *seq, struct super_block *sb,
+ int nodefs)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_super_block *es = sbi->s_es;
+ int def_errors, def_mount_opt = nodefs ? 0 : sbi->s_def_mount_opt;
+ const struct mount_opts *m;
+ char sep = nodefs ? '\n' : ',';
+
+#define SEQ_OPTS_PUTS(str) seq_printf(seq, "%c" str, sep)
+#define SEQ_OPTS_PRINT(str, arg) seq_printf(seq, "%c" str, sep, arg)
+
+ if (sbi->s_sb_block != 1)
+ SEQ_OPTS_PRINT("sb=%llu", sbi->s_sb_block);
+
+ for (m = ext4_mount_opts; m->token != Opt_err; m++) {
+ int want_set = m->flags & MOPT_SET;
+ if (((m->flags & (MOPT_SET|MOPT_CLEAR)) == 0) ||
+ (m->flags & MOPT_CLEAR_ERR))
+ continue;
+ if (!(m->mount_opt & (sbi->s_mount_opt ^ def_mount_opt)))
+ continue; /* skip if same as the default */
+ if ((want_set &&
+ (sbi->s_mount_opt & m->mount_opt) != m->mount_opt) ||
+ (!want_set && (sbi->s_mount_opt & m->mount_opt)))
+ continue; /* select Opt_noFoo vs Opt_Foo */
+ SEQ_OPTS_PRINT("%s", token2str(m->token));
+ }
+
+ if (nodefs || !uid_eq(sbi->s_resuid, make_kuid(&init_user_ns, EXT4_DEF_RESUID)) ||
+ le16_to_cpu(es->s_def_resuid) != EXT4_DEF_RESUID)
+ SEQ_OPTS_PRINT("resuid=%u",
+ from_kuid_munged(&init_user_ns, sbi->s_resuid));
+ if (nodefs || !gid_eq(sbi->s_resgid, make_kgid(&init_user_ns, EXT4_DEF_RESGID)) ||
+ le16_to_cpu(es->s_def_resgid) != EXT4_DEF_RESGID)
+ SEQ_OPTS_PRINT("resgid=%u",
+ from_kgid_munged(&init_user_ns, sbi->s_resgid));
+ def_errors = nodefs ? -1 : le16_to_cpu(es->s_errors);
+ if (test_opt(sb, ERRORS_RO) && def_errors != EXT4_ERRORS_RO)
+ SEQ_OPTS_PUTS("errors=remount-ro");
+ if (test_opt(sb, ERRORS_CONT) && def_errors != EXT4_ERRORS_CONTINUE)
+ SEQ_OPTS_PUTS("errors=continue");
+ if (test_opt(sb, ERRORS_PANIC) && def_errors != EXT4_ERRORS_PANIC)
+ SEQ_OPTS_PUTS("errors=panic");
+ if (nodefs || sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ)
+ SEQ_OPTS_PRINT("commit=%lu", sbi->s_commit_interval / HZ);
+ if (nodefs || sbi->s_min_batch_time != EXT4_DEF_MIN_BATCH_TIME)
+ SEQ_OPTS_PRINT("min_batch_time=%u", sbi->s_min_batch_time);
+ if (nodefs || sbi->s_max_batch_time != EXT4_DEF_MAX_BATCH_TIME)
+ SEQ_OPTS_PRINT("max_batch_time=%u", sbi->s_max_batch_time);
+ if (sb->s_flags & MS_I_VERSION)
+ SEQ_OPTS_PUTS("i_version");
+ if (nodefs || sbi->s_stripe)
+ SEQ_OPTS_PRINT("stripe=%lu", sbi->s_stripe);
+ if (EXT4_MOUNT_DATA_FLAGS & (sbi->s_mount_opt ^ def_mount_opt)) {
+ if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)
+ SEQ_OPTS_PUTS("data=journal");
+ else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA)
+ SEQ_OPTS_PUTS("data=ordered");
+ else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)
+ SEQ_OPTS_PUTS("data=writeback");
+ }
+ if (nodefs ||
+ sbi->s_inode_readahead_blks != EXT4_DEF_INODE_READAHEAD_BLKS)
+ SEQ_OPTS_PRINT("inode_readahead_blks=%u",
+ sbi->s_inode_readahead_blks);
+
+ if (nodefs || (test_opt(sb, INIT_INODE_TABLE) &&
+ (sbi->s_li_wait_mult != EXT4_DEF_LI_WAIT_MULT)))
+ SEQ_OPTS_PRINT("init_itable=%u", sbi->s_li_wait_mult);
+ if (nodefs || sbi->s_max_dir_size_kb)
+ SEQ_OPTS_PRINT("max_dir_size_kb=%u", sbi->s_max_dir_size_kb);
+
+ ext4_show_quota_options(seq, sb);
+ return 0;
+}
+
+static int ext4_show_options(struct seq_file *seq, struct dentry *root)
+{
+ return _ext4_show_options(seq, root->d_sb, 0);
+}
+
+static int options_seq_show(struct seq_file *seq, void *offset)
+{
+ struct super_block *sb = seq->private;
+ int rc;
+
+ seq_puts(seq, (sb->s_flags & MS_RDONLY) ? "ro" : "rw");
+ rc = _ext4_show_options(seq, sb, 1);
+ seq_puts(seq, "\n");
+ return rc;
+}
+
+static int options_open_fs(struct inode *inode, struct file *file)
+{
+ return single_open(file, options_seq_show, PDE_DATA(inode));
+}
+
+static const struct file_operations ext4_seq_options_fops = {
+ .owner = THIS_MODULE,
+ .open = options_open_fs,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
int read_only)
{
@@ -1853,15 +1898,15 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
res = MS_RDONLY;
}
if (read_only)
- return res;
+ goto done;
if (!(sbi->s_mount_state & EXT4_VALID_FS))
ext4_msg(sb, KERN_WARNING, "warning: mounting unchecked fs, "
"running e2fsck is recommended");
- else if ((sbi->s_mount_state & EXT4_ERROR_FS))
+ else if (sbi->s_mount_state & EXT4_ERROR_FS)
ext4_msg(sb, KERN_WARNING,
"warning: mounting fs with errors, "
"running e2fsck is recommended");
- else if ((__s16) le16_to_cpu(es->s_max_mnt_count) >= 0 &&
+ else if ((__s16) le16_to_cpu(es->s_max_mnt_count) > 0 &&
le16_to_cpu(es->s_mnt_count) >=
(unsigned short) (__s16) le16_to_cpu(es->s_max_mnt_count))
ext4_msg(sb, KERN_WARNING,
@@ -1884,52 +1929,68 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
ext4_commit_super(sb, 1);
+done:
if (test_opt(sb, DEBUG))
printk(KERN_INFO "[EXT4 FS bs=%lu, gc=%u, "
- "bpg=%lu, ipg=%lu, mo=%04x]\n",
+ "bpg=%lu, ipg=%lu, mo=%04x, mo2=%04x]\n",
sb->s_blocksize,
sbi->s_groups_count,
EXT4_BLOCKS_PER_GROUP(sb),
EXT4_INODES_PER_GROUP(sb),
- sbi->s_mount_opt);
+ sbi->s_mount_opt, sbi->s_mount_opt2);
+ cleancache_init_fs(sb);
return res;
}
+int ext4_alloc_flex_bg_array(struct super_block *sb, ext4_group_t ngroup)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct flex_groups *new_groups;
+ int size;
+
+ if (!sbi->s_log_groups_per_flex)
+ return 0;
+
+ size = ext4_flex_group(sbi, ngroup - 1) + 1;
+ if (size <= sbi->s_flex_groups_allocated)
+ return 0;
+
+ size = roundup_pow_of_two(size * sizeof(struct flex_groups));
+ new_groups = ext4_kvzalloc(size, GFP_KERNEL);
+ if (!new_groups) {
+ ext4_msg(sb, KERN_ERR, "not enough memory for %d flex groups",
+ size / (int) sizeof(struct flex_groups));
+ return -ENOMEM;
+ }
+
+ if (sbi->s_flex_groups) {
+ memcpy(new_groups, sbi->s_flex_groups,
+ (sbi->s_flex_groups_allocated *
+ sizeof(struct flex_groups)));
+ ext4_kvfree(sbi->s_flex_groups);
+ }
+ sbi->s_flex_groups = new_groups;
+ sbi->s_flex_groups_allocated = size / sizeof(struct flex_groups);
+ return 0;
+}
+
static int ext4_fill_flex_info(struct super_block *sb)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
struct ext4_group_desc *gdp = NULL;
- ext4_group_t flex_group_count;
ext4_group_t flex_group;
- int groups_per_flex = 0;
- size_t size;
- int i;
+ int i, err;
sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex;
- groups_per_flex = 1 << sbi->s_log_groups_per_flex;
-
- if (groups_per_flex < 2) {
+ if (sbi->s_log_groups_per_flex < 1 || sbi->s_log_groups_per_flex > 31) {
sbi->s_log_groups_per_flex = 0;
return 1;
}
- /* We allocate both existing and potentially added groups */
- flex_group_count = ((sbi->s_groups_count + groups_per_flex - 1) +
- ((le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) + 1) <<
- EXT4_DESC_PER_BLOCK_BITS(sb))) / groups_per_flex;
- size = flex_group_count * sizeof(struct flex_groups);
- sbi->s_flex_groups = kzalloc(size, GFP_KERNEL);
- if (sbi->s_flex_groups == NULL) {
- sbi->s_flex_groups = vmalloc(size);
- if (sbi->s_flex_groups)
- memset(sbi->s_flex_groups, 0, size);
- }
- if (sbi->s_flex_groups == NULL) {
- ext4_msg(sb, KERN_ERR, "not enough memory for "
- "%u flex groups", flex_group_count);
+ err = ext4_alloc_flex_bg_array(sb, sbi->s_groups_count);
+ if (err)
goto failed;
- }
for (i = 0; i < sbi->s_groups_count; i++) {
gdp = ext4_get_group_desc(sb, i, NULL);
@@ -1937,8 +1998,8 @@ static int ext4_fill_flex_info(struct super_block *sb)
flex_group = ext4_flex_group(sbi, i);
atomic_add(ext4_free_inodes_count(sb, gdp),
&sbi->s_flex_groups[flex_group].free_inodes);
- atomic_add(ext4_free_blks_count(sb, gdp),
- &sbi->s_flex_groups[flex_group].free_blocks);
+ atomic64_add(ext4_free_group_clusters(sb, gdp),
+ &sbi->s_flex_groups[flex_group].free_clusters);
atomic_add(ext4_used_dirs_count(sb, gdp),
&sbi->s_flex_groups[flex_group].used_dirs);
}
@@ -1948,43 +2009,69 @@ failed:
return 0;
}
-__le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 block_group,
- struct ext4_group_desc *gdp)
+static __le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 block_group,
+ struct ext4_group_desc *gdp)
{
+ int offset;
__u16 crc = 0;
+ __le32 le_group = cpu_to_le32(block_group);
- if (sbi->s_es->s_feature_ro_compat &
- cpu_to_le32(EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) {
- int offset = offsetof(struct ext4_group_desc, bg_checksum);
- __le32 le_group = cpu_to_le32(block_group);
-
- crc = crc16(~0, sbi->s_es->s_uuid, sizeof(sbi->s_es->s_uuid));
- crc = crc16(crc, (__u8 *)&le_group, sizeof(le_group));
- crc = crc16(crc, (__u8 *)gdp, offset);
- offset += sizeof(gdp->bg_checksum); /* skip checksum */
- /* for checksum of struct ext4_group_desc do the rest...*/
- if ((sbi->s_es->s_feature_incompat &
- cpu_to_le32(EXT4_FEATURE_INCOMPAT_64BIT)) &&
- offset < le16_to_cpu(sbi->s_es->s_desc_size))
- crc = crc16(crc, (__u8 *)gdp + offset,
- le16_to_cpu(sbi->s_es->s_desc_size) -
- offset);
+ if ((sbi->s_es->s_feature_ro_compat &
+ cpu_to_le32(EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))) {
+ /* Use new metadata_csum algorithm */
+ __le16 save_csum;
+ __u32 csum32;
+
+ save_csum = gdp->bg_checksum;
+ gdp->bg_checksum = 0;
+ csum32 = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&le_group,
+ sizeof(le_group));
+ csum32 = ext4_chksum(sbi, csum32, (__u8 *)gdp,
+ sbi->s_desc_size);
+ gdp->bg_checksum = save_csum;
+
+ crc = csum32 & 0xFFFF;
+ goto out;
}
+ /* old crc16 code */
+ offset = offsetof(struct ext4_group_desc, bg_checksum);
+
+ crc = crc16(~0, sbi->s_es->s_uuid, sizeof(sbi->s_es->s_uuid));
+ crc = crc16(crc, (__u8 *)&le_group, sizeof(le_group));
+ crc = crc16(crc, (__u8 *)gdp, offset);
+ offset += sizeof(gdp->bg_checksum); /* skip checksum */
+ /* for checksum of struct ext4_group_desc do the rest...*/
+ if ((sbi->s_es->s_feature_incompat &
+ cpu_to_le32(EXT4_FEATURE_INCOMPAT_64BIT)) &&
+ offset < le16_to_cpu(sbi->s_es->s_desc_size))
+ crc = crc16(crc, (__u8 *)gdp + offset,
+ le16_to_cpu(sbi->s_es->s_desc_size) -
+ offset);
+
+out:
return cpu_to_le16(crc);
}
-int ext4_group_desc_csum_verify(struct ext4_sb_info *sbi, __u32 block_group,
+int ext4_group_desc_csum_verify(struct super_block *sb, __u32 block_group,
struct ext4_group_desc *gdp)
{
- if ((sbi->s_es->s_feature_ro_compat &
- cpu_to_le32(EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) &&
- (gdp->bg_checksum != ext4_group_desc_csum(sbi, block_group, gdp)))
+ if (ext4_has_group_desc_csum(sb) &&
+ (gdp->bg_checksum != ext4_group_desc_csum(EXT4_SB(sb),
+ block_group, gdp)))
return 0;
return 1;
}
+void ext4_group_desc_csum_set(struct super_block *sb, __u32 block_group,
+ struct ext4_group_desc *gdp)
+{
+ if (!ext4_has_group_desc_csum(sb))
+ return;
+ gdp->bg_checksum = ext4_group_desc_csum(EXT4_SB(sb), block_group, gdp);
+}
+
/* Called at mount-time, super-block is locked */
static int ext4_check_descriptors(struct super_block *sb,
ext4_group_t *first_not_zeroed)
@@ -2039,7 +2126,7 @@ static int ext4_check_descriptors(struct super_block *sb,
return 0;
}
ext4_lock_group(sb, i);
- if (!ext4_group_desc_csum_verify(sbi, i, gdp)) {
+ if (!ext4_group_desc_csum_verify(sb, i, gdp)) {
ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
"Checksum for group %u failed (%u!=%u)",
i, le16_to_cpu(ext4_group_desc_csum(sbi, i,
@@ -2056,7 +2143,8 @@ static int ext4_check_descriptors(struct super_block *sb,
if (NULL != first_not_zeroed)
*first_not_zeroed = grp;
- ext4_free_blocks_count_set(sbi->s_es, ext4_count_free_blocks(sb));
+ ext4_free_blocks_count_set(sbi->s_es,
+ EXT4_C2B(sbi, ext4_count_free_clusters(sb)));
sbi->s_es->s_free_inodes_count =cpu_to_le32(ext4_count_free_inodes(sb));
return 1;
}
@@ -2097,11 +2185,20 @@ static void ext4_orphan_cleanup(struct super_block *sb,
return;
}
+ /* Check if feature set would not allow a r/w mount */
+ if (!ext4_feature_set_ok(sb, 0)) {
+ ext4_msg(sb, KERN_INFO, "Skipping orphan cleanup due to "
+ "unknown ROCOMPAT features");
+ return;
+ }
+
if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) {
- if (es->s_last_orphan)
+ /* don't clear list on RO mount w/ errors */
+ if (es->s_last_orphan && !(s_flags & MS_RDONLY)) {
jbd_debug(1, "Errors on filesystem, "
"clearing orphan list.\n");
- es->s_last_orphan = 0;
+ es->s_last_orphan = 0;
+ }
jbd_debug(1, "Skipping orphan recovery on fs with errors.\n");
return;
}
@@ -2137,17 +2234,22 @@ static void ext4_orphan_cleanup(struct super_block *sb,
list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan);
dquot_initialize(inode);
if (inode->i_nlink) {
- ext4_msg(sb, KERN_DEBUG,
- "%s: truncating inode %lu to %lld bytes",
- __func__, inode->i_ino, inode->i_size);
+ if (test_opt(sb, DEBUG))
+ ext4_msg(sb, KERN_DEBUG,
+ "%s: truncating inode %lu to %lld bytes",
+ __func__, inode->i_ino, inode->i_size);
jbd_debug(2, "truncating inode %lu to %lld bytes\n",
inode->i_ino, inode->i_size);
+ mutex_lock(&inode->i_mutex);
+ truncate_inode_pages(inode->i_mapping, inode->i_size);
ext4_truncate(inode);
+ mutex_unlock(&inode->i_mutex);
nr_truncates++;
} else {
- ext4_msg(sb, KERN_DEBUG,
- "%s: deleting unreferenced inode %lu",
- __func__, inode->i_ino);
+ if (test_opt(sb, DEBUG))
+ ext4_msg(sb, KERN_DEBUG,
+ "%s: deleting unreferenced inode %lu",
+ __func__, inode->i_ino);
jbd_debug(2, "deleting unreferenced inode %lu\n",
inode->i_ino);
nr_orphans++;
@@ -2180,6 +2282,12 @@ static void ext4_orphan_cleanup(struct super_block *sb,
* in the vfs. ext4 inode has 48 bits of i_block in fsblock units,
* so that won't be a limiting factor.
*
+ * However there is other limiting factor. We do store extents in the form
+ * of starting block and length, hence the resulting length of the extent
+ * covering maximum file size must fit into on-disk format containers as
+ * well. Given that length is always by 1 unit bigger than max unit (because
+ * we count 0 as well) we have to lower the s_maxbytes by one fs block.
+ *
* Note, this does *not* consider any metadata overhead for vfs i_blocks.
*/
static loff_t ext4_max_size(int blkbits, int has_huge_files)
@@ -2201,10 +2309,13 @@ static loff_t ext4_max_size(int blkbits, int has_huge_files)
upper_limit <<= blkbits;
}
- /* 32-bit extent-start container, ee_block */
- res = 1LL << 32;
+ /*
+ * 32-bit extent-start container, ee_block. We lower the maxbytes
+ * by one fs block, so ee_len can cover the extent of maximum file
+ * size
+ */
+ res = (1LL << 32) - 1;
res <<= blkbits;
- res -= 1;
/* Sanity check against vm- & vfs- imposed limits */
if (res > upper_limit)
@@ -2292,6 +2403,16 @@ static ext4_fsblk_t descriptor_loc(struct super_block *sb,
if (ext4_bg_has_super(sb, bg))
has_super = 1;
+ /*
+ * If we have a meta_bg fs with 1k blocks, group 0's GDT is at
+ * block 2, not 1. If s_first_data_block == 0 (bigalloc is enabled
+ * on modern mke2fs or blksize > 1k on older mke2fs) then we must
+ * compensate.
+ */
+ if (sb->s_blocksize == 1024 && nr == 0 &&
+ le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block) == 0)
+ has_super++;
+
return (has_super + ext4_group_first_block_no(sb, bg));
}
@@ -2311,17 +2432,25 @@ static unsigned long ext4_get_stripe_size(struct ext4_sb_info *sbi)
unsigned long stride = le16_to_cpu(sbi->s_es->s_raid_stride);
unsigned long stripe_width =
le32_to_cpu(sbi->s_es->s_raid_stripe_width);
+ int ret;
if (sbi->s_stripe && sbi->s_stripe <= sbi->s_blocks_per_group)
- return sbi->s_stripe;
-
- if (stripe_width <= sbi->s_blocks_per_group)
- return stripe_width;
+ ret = sbi->s_stripe;
+ else if (stripe_width <= sbi->s_blocks_per_group)
+ ret = stripe_width;
+ else if (stride <= sbi->s_blocks_per_group)
+ ret = stride;
+ else
+ ret = 0;
- if (stride <= sbi->s_blocks_per_group)
- return stride;
+ /*
+ * If the stripe width is 1, this makes no sense and
+ * we set it to 0 to turn off stripe handling code.
+ */
+ if (ret <= 1)
+ ret = 0;
- return 0;
+ return ret;
}
/* sysfs supprt */
@@ -2331,20 +2460,21 @@ struct ext4_attr {
ssize_t (*show)(struct ext4_attr *, struct ext4_sb_info *, char *);
ssize_t (*store)(struct ext4_attr *, struct ext4_sb_info *,
const char *, size_t);
- int offset;
+ union {
+ int offset;
+ int deprecated_val;
+ } u;
};
-static int parse_strtoul(const char *buf,
- unsigned long max, unsigned long *value)
+static int parse_strtoull(const char *buf,
+ unsigned long long max, unsigned long long *value)
{
- char *endp;
-
- *value = simple_strtoul(skip_spaces(buf), &endp, 0);
- endp = skip_spaces(endp);
- if (*endp || *value > max)
- return -EINVAL;
+ int ret;
- return 0;
+ ret = kstrtoull(skip_spaces(buf), 0, value);
+ if (!ret && *value > max)
+ ret = -EINVAL;
+ return ret;
}
static ssize_t delayed_allocation_blocks_show(struct ext4_attr *a,
@@ -2352,7 +2482,8 @@ static ssize_t delayed_allocation_blocks_show(struct ext4_attr *a,
char *buf)
{
return snprintf(buf, PAGE_SIZE, "%llu\n",
- (s64) percpu_counter_sum(&sbi->s_dirtyblocks_counter));
+ (s64) EXT4_C2B(sbi,
+ percpu_counter_sum(&sbi->s_dirtyclusters_counter)));
}
static ssize_t session_write_kbytes_show(struct ext4_attr *a,
@@ -2385,11 +2516,13 @@ static ssize_t inode_readahead_blks_store(struct ext4_attr *a,
const char *buf, size_t count)
{
unsigned long t;
+ int ret;
- if (parse_strtoul(buf, 0x40000000, &t))
- return -EINVAL;
+ ret = kstrtoul(skip_spaces(buf), 0, &t);
+ if (ret)
+ return ret;
- if (!is_power_of_2(t))
+ if (t && (!is_power_of_2(t) || t > 0x40000000))
return -EINVAL;
sbi->s_inode_readahead_blks = t;
@@ -2399,7 +2532,7 @@ static ssize_t inode_readahead_blks_store(struct ext4_attr *a,
static ssize_t sbi_ui_show(struct ext4_attr *a,
struct ext4_sb_info *sbi, char *buf)
{
- unsigned int *ui = (unsigned int *) (((char *) sbi) + a->offset);
+ unsigned int *ui = (unsigned int *) (((char *) sbi) + a->u.offset);
return snprintf(buf, PAGE_SIZE, "%u\n", *ui);
}
@@ -2408,21 +2541,69 @@ static ssize_t sbi_ui_store(struct ext4_attr *a,
struct ext4_sb_info *sbi,
const char *buf, size_t count)
{
- unsigned int *ui = (unsigned int *) (((char *) sbi) + a->offset);
+ unsigned int *ui = (unsigned int *) (((char *) sbi) + a->u.offset);
unsigned long t;
+ int ret;
- if (parse_strtoul(buf, 0xffffffff, &t))
- return -EINVAL;
+ ret = kstrtoul(skip_spaces(buf), 0, &t);
+ if (ret)
+ return ret;
*ui = t;
return count;
}
+static ssize_t reserved_clusters_show(struct ext4_attr *a,
+ struct ext4_sb_info *sbi, char *buf)
+{
+ return snprintf(buf, PAGE_SIZE, "%llu\n",
+ (unsigned long long) atomic64_read(&sbi->s_resv_clusters));
+}
+
+static ssize_t reserved_clusters_store(struct ext4_attr *a,
+ struct ext4_sb_info *sbi,
+ const char *buf, size_t count)
+{
+ unsigned long long val;
+ int ret;
+
+ if (parse_strtoull(buf, -1ULL, &val))
+ return -EINVAL;
+ ret = ext4_reserve_clusters(sbi, val);
+
+ return ret ? ret : count;
+}
+
+static ssize_t trigger_test_error(struct ext4_attr *a,
+ struct ext4_sb_info *sbi,
+ const char *buf, size_t count)
+{
+ int len = count;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (len && buf[len-1] == '\n')
+ len--;
+
+ if (len)
+ ext4_error(sbi->s_sb, "%.*s", len, buf);
+ return count;
+}
+
+static ssize_t sbi_deprecated_show(struct ext4_attr *a,
+ struct ext4_sb_info *sbi, char *buf)
+{
+ return snprintf(buf, PAGE_SIZE, "%d\n", a->u.deprecated_val);
+}
+
#define EXT4_ATTR_OFFSET(_name,_mode,_show,_store,_elname) \
static struct ext4_attr ext4_attr_##_name = { \
.attr = {.name = __stringify(_name), .mode = _mode }, \
.show = _show, \
.store = _store, \
- .offset = offsetof(struct ext4_sb_info, _elname), \
+ .u = { \
+ .offset = offsetof(struct ext4_sb_info, _elname),\
+ }, \
}
#define EXT4_ATTR(name, mode, show, store) \
static struct ext4_attr ext4_attr_##name = __ATTR(name, mode, show, store)
@@ -2433,10 +2614,19 @@ static struct ext4_attr ext4_attr_##name = __ATTR(name, mode, show, store)
#define EXT4_RW_ATTR_SBI_UI(name, elname) \
EXT4_ATTR_OFFSET(name, 0644, sbi_ui_show, sbi_ui_store, elname)
#define ATTR_LIST(name) &ext4_attr_##name.attr
+#define EXT4_DEPRECATED_ATTR(_name, _val) \
+static struct ext4_attr ext4_attr_##_name = { \
+ .attr = {.name = __stringify(_name), .mode = 0444 }, \
+ .show = sbi_deprecated_show, \
+ .u = { \
+ .deprecated_val = _val, \
+ }, \
+}
EXT4_RO_ATTR(delayed_allocation_blocks);
EXT4_RO_ATTR(session_write_kbytes);
EXT4_RO_ATTR(lifetime_write_kbytes);
+EXT4_RW_ATTR(reserved_clusters);
EXT4_ATTR_OFFSET(inode_readahead_blks, 0644, sbi_ui_show,
inode_readahead_blks_store, s_inode_readahead_blks);
EXT4_RW_ATTR_SBI_UI(inode_goal, s_inode_goal);
@@ -2446,12 +2636,21 @@ EXT4_RW_ATTR_SBI_UI(mb_min_to_scan, s_mb_min_to_scan);
EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs);
EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request);
EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc);
-EXT4_RW_ATTR_SBI_UI(max_writeback_mb_bump, s_max_writeback_mb_bump);
+EXT4_DEPRECATED_ATTR(max_writeback_mb_bump, 128);
+EXT4_RW_ATTR_SBI_UI(extent_max_zeroout_kb, s_extent_max_zeroout_kb);
+EXT4_ATTR(trigger_fs_error, 0200, NULL, trigger_test_error);
+EXT4_RW_ATTR_SBI_UI(err_ratelimit_interval_ms, s_err_ratelimit_state.interval);
+EXT4_RW_ATTR_SBI_UI(err_ratelimit_burst, s_err_ratelimit_state.burst);
+EXT4_RW_ATTR_SBI_UI(warning_ratelimit_interval_ms, s_warning_ratelimit_state.interval);
+EXT4_RW_ATTR_SBI_UI(warning_ratelimit_burst, s_warning_ratelimit_state.burst);
+EXT4_RW_ATTR_SBI_UI(msg_ratelimit_interval_ms, s_msg_ratelimit_state.interval);
+EXT4_RW_ATTR_SBI_UI(msg_ratelimit_burst, s_msg_ratelimit_state.burst);
static struct attribute *ext4_attrs[] = {
ATTR_LIST(delayed_allocation_blocks),
ATTR_LIST(session_write_kbytes),
ATTR_LIST(lifetime_write_kbytes),
+ ATTR_LIST(reserved_clusters),
ATTR_LIST(inode_readahead_blks),
ATTR_LIST(inode_goal),
ATTR_LIST(mb_stats),
@@ -2461,16 +2660,26 @@ static struct attribute *ext4_attrs[] = {
ATTR_LIST(mb_stream_req),
ATTR_LIST(mb_group_prealloc),
ATTR_LIST(max_writeback_mb_bump),
+ ATTR_LIST(extent_max_zeroout_kb),
+ ATTR_LIST(trigger_fs_error),
+ ATTR_LIST(err_ratelimit_interval_ms),
+ ATTR_LIST(err_ratelimit_burst),
+ ATTR_LIST(warning_ratelimit_interval_ms),
+ ATTR_LIST(warning_ratelimit_burst),
+ ATTR_LIST(msg_ratelimit_interval_ms),
+ ATTR_LIST(msg_ratelimit_burst),
NULL,
};
/* Features this copy of ext4 supports */
EXT4_INFO_ATTR(lazy_itable_init);
EXT4_INFO_ATTR(batched_discard);
+EXT4_INFO_ATTR(meta_bg_resize);
static struct attribute *ext4_feat_attrs[] = {
ATTR_LIST(lazy_itable_init),
ATTR_LIST(batched_discard),
+ ATTR_LIST(meta_bg_resize),
NULL,
};
@@ -2564,6 +2773,23 @@ static int ext4_feature_set_ok(struct super_block *sb, int readonly)
return 0;
}
}
+ if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_BIGALLOC) &&
+ !EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) {
+ ext4_msg(sb, KERN_ERR,
+ "Can't support bigalloc feature without "
+ "extents feature\n");
+ return 0;
+ }
+
+#ifndef CONFIG_QUOTA
+ if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA) &&
+ !readonly) {
+ ext4_msg(sb, KERN_ERR,
+ "Filesystem with quota feature cannot be mounted RDWR "
+ "without CONFIG_QUOTA");
+ return 0;
+ }
+#endif /* CONFIG_QUOTA */
return 1;
}
@@ -2581,10 +2807,11 @@ static void print_daily_error_info(unsigned long arg)
es = sbi->s_es;
if (es->s_error_count)
- ext4_msg(sb, KERN_NOTICE, "error count: %u",
+ /* fsck newer than v1.41.13 is needed to clean this condition. */
+ ext4_msg(sb, KERN_NOTICE, "error count since last fsck: %u",
le32_to_cpu(es->s_error_count));
if (es->s_first_error_time) {
- printk(KERN_NOTICE "EXT4-fs (%s): initial error at %u: %.*s:%d",
+ printk(KERN_NOTICE "EXT4-fs (%s): initial error at time %u: %.*s:%d",
sb->s_id, le32_to_cpu(es->s_first_error_time),
(int) sizeof(es->s_first_error_func),
es->s_first_error_func,
@@ -2598,7 +2825,7 @@ static void print_daily_error_info(unsigned long arg)
printk("\n");
}
if (es->s_last_error_time) {
- printk(KERN_NOTICE "EXT4-fs (%s): last error at %u: %.*s:%d",
+ printk(KERN_NOTICE "EXT4-fs (%s): last error at time %u: %.*s:%d",
sb->s_id, le32_to_cpu(es->s_last_error_time),
(int) sizeof(es->s_last_error_func),
es->s_last_error_func,
@@ -2614,12 +2841,6 @@ static void print_daily_error_info(unsigned long arg)
mod_timer(&sbi->s_err_report, jiffies + 24*60*60*HZ); /* Once a day */
}
-static void ext4_lazyinode_timeout(unsigned long data)
-{
- struct task_struct *p = (struct task_struct *)data;
- wake_up_process(p);
-}
-
/* Find next suitable group and run ext4_init_inode_table */
static int ext4_run_li_request(struct ext4_li_request *elr)
{
@@ -2632,6 +2853,7 @@ static int ext4_run_li_request(struct ext4_li_request *elr)
sb = elr->lr_super;
ngroups = EXT4_SB(sb)->s_groups_count;
+ sb_start_write(sb);
for (group = elr->lr_next_group; group < ngroups; group++) {
gdp = ext4_get_group_desc(sb, group, NULL);
if (!gdp) {
@@ -2643,7 +2865,7 @@ static int ext4_run_li_request(struct ext4_li_request *elr)
break;
}
- if (group == ngroups)
+ if (group >= ngroups)
ret = 1;
if (!ret) {
@@ -2651,23 +2873,21 @@ static int ext4_run_li_request(struct ext4_li_request *elr)
ret = ext4_init_inode_table(sb, group,
elr->lr_timeout ? 0 : 1);
if (elr->lr_timeout == 0) {
- timeout = jiffies - timeout;
- if (elr->lr_sbi->s_li_wait_mult)
- timeout *= elr->lr_sbi->s_li_wait_mult;
- else
- timeout *= 20;
+ timeout = (jiffies - timeout) *
+ elr->lr_sbi->s_li_wait_mult;
elr->lr_timeout = timeout;
}
elr->lr_next_sched = jiffies + elr->lr_timeout;
elr->lr_next_group = group + 1;
}
+ sb_end_write(sb);
return ret;
}
/*
* Remove lr_request from the list_request and free the
- * request tructure. Should be called with li_list_mtx held
+ * request structure. Should be called with li_list_mtx held
*/
static void ext4_remove_li_request(struct ext4_li_request *elr)
{
@@ -2685,16 +2905,20 @@ static void ext4_remove_li_request(struct ext4_li_request *elr)
static void ext4_unregister_li_request(struct super_block *sb)
{
- struct ext4_li_request *elr = EXT4_SB(sb)->s_li_request;
-
- if (!ext4_li_info)
+ mutex_lock(&ext4_li_mtx);
+ if (!ext4_li_info) {
+ mutex_unlock(&ext4_li_mtx);
return;
+ }
mutex_lock(&ext4_li_info->li_list_mtx);
- ext4_remove_li_request(elr);
+ ext4_remove_li_request(EXT4_SB(sb)->s_li_request);
mutex_unlock(&ext4_li_info->li_list_mtx);
+ mutex_unlock(&ext4_li_mtx);
}
+static struct task_struct *ext4_lazyinit_task;
+
/*
* This is the function where ext4lazyinit thread lives. It walks
* through the request list searching for next scheduled filesystem.
@@ -2709,17 +2933,10 @@ static int ext4_lazyinit_thread(void *arg)
struct ext4_lazy_init *eli = (struct ext4_lazy_init *)arg;
struct list_head *pos, *n;
struct ext4_li_request *elr;
- unsigned long next_wakeup;
- DEFINE_WAIT(wait);
+ unsigned long next_wakeup, cur;
BUG_ON(NULL == eli);
- eli->li_timer.data = (unsigned long)current;
- eli->li_timer.function = ext4_lazyinode_timeout;
-
- eli->li_task = current;
- wake_up(&eli->li_wait_task);
-
cont_thread:
while (true) {
next_wakeup = MAX_JIFFY_OFFSET;
@@ -2747,22 +2964,21 @@ cont_thread:
}
mutex_unlock(&eli->li_list_mtx);
- if (freezing(current))
- refrigerator();
+ try_to_freeze();
- if ((time_after_eq(jiffies, next_wakeup)) ||
+ cur = jiffies;
+ if ((time_after_eq(cur, next_wakeup)) ||
(MAX_JIFFY_OFFSET == next_wakeup)) {
cond_resched();
continue;
}
- eli->li_timer.expires = next_wakeup;
- add_timer(&eli->li_timer);
- prepare_to_wait(&eli->li_wait_daemon, &wait,
- TASK_INTERRUPTIBLE);
- if (time_before(jiffies, next_wakeup))
- schedule();
- finish_wait(&eli->li_wait_daemon, &wait);
+ schedule_timeout_interruptible(next_wakeup - cur);
+
+ if (kthread_should_stop()) {
+ ext4_clear_request_list();
+ goto exit_thread;
+ }
}
exit_thread:
@@ -2782,10 +2998,6 @@ exit_thread:
goto cont_thread;
}
mutex_unlock(&eli->li_list_mtx);
- del_timer_sync(&ext4_li_info->li_timer);
- eli->li_task = NULL;
- wake_up(&eli->li_wait_task);
-
kfree(ext4_li_info);
ext4_li_info = NULL;
mutex_unlock(&ext4_li_mtx);
@@ -2799,9 +3011,6 @@ static void ext4_clear_request_list(void)
struct ext4_li_request *elr;
mutex_lock(&ext4_li_info->li_list_mtx);
- if (list_empty(&ext4_li_info->li_request_list))
- return;
-
list_for_each_safe(pos, n, &ext4_li_info->li_request_list) {
elr = list_entry(pos, struct ext4_li_request,
lr_request);
@@ -2812,23 +3021,19 @@ static void ext4_clear_request_list(void)
static int ext4_run_lazyinit_thread(void)
{
- struct task_struct *t;
-
- t = kthread_run(ext4_lazyinit_thread, ext4_li_info, "ext4lazyinit");
- if (IS_ERR(t)) {
- int err = PTR_ERR(t);
+ ext4_lazyinit_task = kthread_run(ext4_lazyinit_thread,
+ ext4_li_info, "ext4lazyinit");
+ if (IS_ERR(ext4_lazyinit_task)) {
+ int err = PTR_ERR(ext4_lazyinit_task);
ext4_clear_request_list();
- del_timer_sync(&ext4_li_info->li_timer);
kfree(ext4_li_info);
ext4_li_info = NULL;
- printk(KERN_CRIT "EXT4: error %d creating inode table "
+ printk(KERN_CRIT "EXT4-fs: error %d creating inode table "
"initialization thread\n",
err);
return err;
}
ext4_li_info->li_state |= EXT4_LAZYINIT_RUNNING;
-
- wait_event(ext4_li_info->li_wait_task, ext4_li_info->li_task != NULL);
return 0;
}
@@ -2863,13 +3068,9 @@ static int ext4_li_info_new(void)
if (!eli)
return -ENOMEM;
- eli->li_task = NULL;
INIT_LIST_HEAD(&eli->li_request_list);
mutex_init(&eli->li_list_mtx);
- init_waitqueue_head(&eli->li_wait_daemon);
- init_waitqueue_head(&eli->li_wait_task);
- init_timer(&eli->li_timer);
eli->li_state |= EXT4_LAZYINIT_QUIT;
ext4_li_info = eli;
@@ -2882,7 +3083,6 @@ static struct ext4_li_request *ext4_li_request_new(struct super_block *sb,
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
struct ext4_li_request *elr;
- unsigned long rnd;
elr = kzalloc(sizeof(*elr), GFP_KERNEL);
if (!elr)
@@ -2897,41 +3097,39 @@ static struct ext4_li_request *ext4_li_request_new(struct super_block *sb,
* spread the inode table initialization requests
* better.
*/
- get_random_bytes(&rnd, sizeof(rnd));
- elr->lr_next_sched = jiffies + (unsigned long)rnd %
- (EXT4_DEF_LI_MAX_START_DELAY * HZ);
-
+ elr->lr_next_sched = jiffies + (prandom_u32() %
+ (EXT4_DEF_LI_MAX_START_DELAY * HZ));
return elr;
}
-static int ext4_register_li_request(struct super_block *sb,
- ext4_group_t first_not_zeroed)
+int ext4_register_li_request(struct super_block *sb,
+ ext4_group_t first_not_zeroed)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
- struct ext4_li_request *elr;
+ struct ext4_li_request *elr = NULL;
ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
- int ret;
+ int ret = 0;
- if (sbi->s_li_request != NULL)
- return 0;
+ mutex_lock(&ext4_li_mtx);
+ if (sbi->s_li_request != NULL) {
+ /*
+ * Reset timeout so it can be computed again, because
+ * s_li_wait_mult might have changed.
+ */
+ sbi->s_li_request->lr_timeout = 0;
+ goto out;
+ }
if (first_not_zeroed == ngroups ||
(sb->s_flags & MS_RDONLY) ||
- !test_opt(sb, INIT_INODE_TABLE)) {
- sbi->s_li_request = NULL;
- return 0;
- }
-
- if (first_not_zeroed == ngroups) {
- sbi->s_li_request = NULL;
- return 0;
- }
+ !test_opt(sb, INIT_INODE_TABLE))
+ goto out;
elr = ext4_li_request_new(sb, first_not_zeroed);
- if (!elr)
- return -ENOMEM;
-
- mutex_lock(&ext4_li_mtx);
+ if (!elr) {
+ ret = -ENOMEM;
+ goto out;
+ }
if (NULL == ext4_li_info) {
ret = ext4_li_info_new();
@@ -2944,6 +3142,12 @@ static int ext4_register_li_request(struct super_block *sb,
mutex_unlock(&ext4_li_info->li_list_mtx);
sbi->s_li_request = elr;
+ /*
+ * set elr to NULL here since it has been inserted to
+ * the request_list and the removal and free of it is
+ * handled by ext4_clear_request_list from now on.
+ */
+ elr = NULL;
if (!(ext4_li_info->li_state & EXT4_LAZYINIT_RUNNING)) {
ret = ext4_run_lazyinit_thread();
@@ -2967,21 +3171,209 @@ static void ext4_destroy_lazyinit_thread(void)
* If thread exited earlier
* there's nothing to be done.
*/
- if (!ext4_li_info)
+ if (!ext4_li_info || !ext4_lazyinit_task)
return;
- ext4_clear_request_list();
+ kthread_stop(ext4_lazyinit_task);
+}
- while (ext4_li_info->li_task) {
- wake_up(&ext4_li_info->li_wait_daemon);
- wait_event(ext4_li_info->li_wait_task,
- ext4_li_info->li_task == NULL);
+static int set_journal_csum_feature_set(struct super_block *sb)
+{
+ int ret = 1;
+ int compat, incompat;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+ if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
+ EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) {
+ /* journal checksum v2 */
+ compat = 0;
+ incompat = JBD2_FEATURE_INCOMPAT_CSUM_V2;
+ } else {
+ /* journal checksum v1 */
+ compat = JBD2_FEATURE_COMPAT_CHECKSUM;
+ incompat = 0;
+ }
+
+ if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) {
+ ret = jbd2_journal_set_features(sbi->s_journal,
+ compat, 0,
+ JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT |
+ incompat);
+ } else if (test_opt(sb, JOURNAL_CHECKSUM)) {
+ ret = jbd2_journal_set_features(sbi->s_journal,
+ compat, 0,
+ incompat);
+ jbd2_journal_clear_features(sbi->s_journal, 0, 0,
+ JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT);
+ } else {
+ jbd2_journal_clear_features(sbi->s_journal,
+ JBD2_FEATURE_COMPAT_CHECKSUM, 0,
+ JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT |
+ JBD2_FEATURE_INCOMPAT_CSUM_V2);
}
+
+ return ret;
+}
+
+/*
+ * Note: calculating the overhead so we can be compatible with
+ * historical BSD practice is quite difficult in the face of
+ * clusters/bigalloc. This is because multiple metadata blocks from
+ * different block group can end up in the same allocation cluster.
+ * Calculating the exact overhead in the face of clustered allocation
+ * requires either O(all block bitmaps) in memory or O(number of block
+ * groups**2) in time. We will still calculate the superblock for
+ * older file systems --- and if we come across with a bigalloc file
+ * system with zero in s_overhead_clusters the estimate will be close to
+ * correct especially for very large cluster sizes --- but for newer
+ * file systems, it's better to calculate this figure once at mkfs
+ * time, and store it in the superblock. If the superblock value is
+ * present (even for non-bigalloc file systems), we will use it.
+ */
+static int count_overhead(struct super_block *sb, ext4_group_t grp,
+ char *buf)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_group_desc *gdp;
+ ext4_fsblk_t first_block, last_block, b;
+ ext4_group_t i, ngroups = ext4_get_groups_count(sb);
+ int s, j, count = 0;
+
+ if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_BIGALLOC))
+ return (ext4_bg_has_super(sb, grp) + ext4_bg_num_gdb(sb, grp) +
+ sbi->s_itb_per_group + 2);
+
+ first_block = le32_to_cpu(sbi->s_es->s_first_data_block) +
+ (grp * EXT4_BLOCKS_PER_GROUP(sb));
+ last_block = first_block + EXT4_BLOCKS_PER_GROUP(sb) - 1;
+ for (i = 0; i < ngroups; i++) {
+ gdp = ext4_get_group_desc(sb, i, NULL);
+ b = ext4_block_bitmap(sb, gdp);
+ if (b >= first_block && b <= last_block) {
+ ext4_set_bit(EXT4_B2C(sbi, b - first_block), buf);
+ count++;
+ }
+ b = ext4_inode_bitmap(sb, gdp);
+ if (b >= first_block && b <= last_block) {
+ ext4_set_bit(EXT4_B2C(sbi, b - first_block), buf);
+ count++;
+ }
+ b = ext4_inode_table(sb, gdp);
+ if (b >= first_block && b + sbi->s_itb_per_group <= last_block)
+ for (j = 0; j < sbi->s_itb_per_group; j++, b++) {
+ int c = EXT4_B2C(sbi, b - first_block);
+ ext4_set_bit(c, buf);
+ count++;
+ }
+ if (i != grp)
+ continue;
+ s = 0;
+ if (ext4_bg_has_super(sb, grp)) {
+ ext4_set_bit(s++, buf);
+ count++;
+ }
+ for (j = ext4_bg_num_gdb(sb, grp); j > 0; j--) {
+ ext4_set_bit(EXT4_B2C(sbi, s++), buf);
+ count++;
+ }
+ }
+ if (!count)
+ return 0;
+ return EXT4_CLUSTERS_PER_GROUP(sb) -
+ ext4_count_free(buf, EXT4_CLUSTERS_PER_GROUP(sb) / 8);
+}
+
+/*
+ * Compute the overhead and stash it in sbi->s_overhead
+ */
+int ext4_calculate_overhead(struct super_block *sb)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_super_block *es = sbi->s_es;
+ ext4_group_t i, ngroups = ext4_get_groups_count(sb);
+ ext4_fsblk_t overhead = 0;
+ char *buf = (char *) get_zeroed_page(GFP_KERNEL);
+
+ if (!buf)
+ return -ENOMEM;
+
+ /*
+ * Compute the overhead (FS structures). This is constant
+ * for a given filesystem unless the number of block groups
+ * changes so we cache the previous value until it does.
+ */
+
+ /*
+ * All of the blocks before first_data_block are overhead
+ */
+ overhead = EXT4_B2C(sbi, le32_to_cpu(es->s_first_data_block));
+
+ /*
+ * Add the overhead found in each block group
+ */
+ for (i = 0; i < ngroups; i++) {
+ int blks;
+
+ blks = count_overhead(sb, i, buf);
+ overhead += blks;
+ if (blks)
+ memset(buf, 0, PAGE_SIZE);
+ cond_resched();
+ }
+ /* Add the journal blocks as well */
+ if (sbi->s_journal)
+ overhead += EXT4_NUM_B2C(sbi, sbi->s_journal->j_maxlen);
+
+ sbi->s_overhead = overhead;
+ smp_wmb();
+ free_page((unsigned long) buf);
+ return 0;
+}
+
+
+static ext4_fsblk_t ext4_calculate_resv_clusters(struct super_block *sb)
+{
+ ext4_fsblk_t resv_clusters;
+
+ /*
+ * There's no need to reserve anything when we aren't using extents.
+ * The space estimates are exact, there are no unwritten extents,
+ * hole punching doesn't need new metadata... This is needed especially
+ * to keep ext2/3 backward compatibility.
+ */
+ if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS))
+ return 0;
+ /*
+ * By default we reserve 2% or 4096 clusters, whichever is smaller.
+ * This should cover the situations where we can not afford to run
+ * out of space like for example punch hole, or converting
+ * unwritten extents in delalloc path. In most cases such
+ * allocation would require 1, or 2 blocks, higher numbers are
+ * very rare.
+ */
+ resv_clusters = ext4_blocks_count(EXT4_SB(sb)->s_es) >>
+ EXT4_SB(sb)->s_cluster_bits;
+
+ do_div(resv_clusters, 50);
+ resv_clusters = min_t(ext4_fsblk_t, resv_clusters, 4096);
+
+ return resv_clusters;
+}
+
+
+static int ext4_reserve_clusters(struct ext4_sb_info *sbi, ext4_fsblk_t count)
+{
+ ext4_fsblk_t clusters = ext4_blocks_count(sbi->s_es) >>
+ sbi->s_cluster_bits;
+
+ if (count >= clusters)
+ return -EINVAL;
+
+ atomic64_set(&sbi->s_resv_clusters, count);
+ return 0;
}
static int ext4_fill_super(struct super_block *sb, void *data, int silent)
- __releases(kernel_lock)
- __acquires(kernel_lock)
{
char *orig_data = kstrdup(data, GFP_KERNEL);
struct buffer_head *bh;
@@ -2997,12 +3389,12 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
char *cp;
const char *descr;
int ret = -ENOMEM;
- int blocksize;
+ int blocksize, clustersize;
unsigned int db_count;
unsigned int i;
- int needs_recovery, has_huge_files;
+ int needs_recovery, has_huge_files, has_bigalloc;
__u64 blocks_count;
- int err;
+ int err = 0;
unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
ext4_group_t first_not_zeroed;
@@ -3017,9 +3409,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
goto out_free_orig;
}
sb->s_fs_info = sbi;
- sbi->s_mount_opt = 0;
- sbi->s_resuid = EXT4_DEF_RESUID;
- sbi->s_resgid = EXT4_DEF_RESGID;
+ sbi->s_sb = sb;
sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS;
sbi->s_sb_block = sb_block;
if (sb->s_bdev->bd_part)
@@ -3030,6 +3420,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
for (cp = sb->s_id; (cp = strchr(cp, '/'));)
*cp = '!';
+ /* -EINVAL is default */
ret = -EINVAL;
blocksize = sb_min_blocksize(sb, EXT4_MIN_BLOCK_SIZE);
if (!blocksize) {
@@ -3056,78 +3447,138 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
* Note: s_es must be initialized as soon as possible because
* some ext4 macro-instructions depend on its value
*/
- es = (struct ext4_super_block *) (((char *)bh->b_data) + offset);
+ es = (struct ext4_super_block *) (bh->b_data + offset);
sbi->s_es = es;
sb->s_magic = le16_to_cpu(es->s_magic);
if (sb->s_magic != EXT4_SUPER_MAGIC)
goto cantfind_ext4;
sbi->s_kbytes_written = le64_to_cpu(es->s_kbytes_written);
+ /* Warn if metadata_csum and gdt_csum are both set. */
+ if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
+ EXT4_FEATURE_RO_COMPAT_METADATA_CSUM) &&
+ EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM))
+ ext4_warning(sb, KERN_INFO "metadata_csum and uninit_bg are "
+ "redundant flags; please run fsck.");
+
+ /* Check for a known checksum algorithm */
+ if (!ext4_verify_csum_type(sb, es)) {
+ ext4_msg(sb, KERN_ERR, "VFS: Found ext4 filesystem with "
+ "unknown checksum algorithm.");
+ silent = 1;
+ goto cantfind_ext4;
+ }
+
+ /* Load the checksum driver */
+ if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
+ EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) {
+ sbi->s_chksum_driver = crypto_alloc_shash("crc32c", 0, 0);
+ if (IS_ERR(sbi->s_chksum_driver)) {
+ ext4_msg(sb, KERN_ERR, "Cannot load crc32c driver.");
+ ret = PTR_ERR(sbi->s_chksum_driver);
+ sbi->s_chksum_driver = NULL;
+ goto failed_mount;
+ }
+ }
+
+ /* Check superblock checksum */
+ if (!ext4_superblock_csum_verify(sb, es)) {
+ ext4_msg(sb, KERN_ERR, "VFS: Found ext4 filesystem with "
+ "invalid superblock checksum. Run e2fsck?");
+ silent = 1;
+ goto cantfind_ext4;
+ }
+
+ /* Precompute checksum seed for all metadata */
+ if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
+ EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+ sbi->s_csum_seed = ext4_chksum(sbi, ~0, es->s_uuid,
+ sizeof(es->s_uuid));
+
/* Set defaults before we parse the mount options */
def_mount_opts = le32_to_cpu(es->s_default_mount_opts);
- set_opt(sbi->s_mount_opt, INIT_INODE_TABLE);
+ set_opt(sb, INIT_INODE_TABLE);
if (def_mount_opts & EXT4_DEFM_DEBUG)
- set_opt(sbi->s_mount_opt, DEBUG);
- if (def_mount_opts & EXT4_DEFM_BSDGROUPS) {
- ext4_msg(sb, KERN_WARNING, deprecated_msg, "bsdgroups",
- "2.6.38");
- set_opt(sbi->s_mount_opt, GRPID);
- }
+ set_opt(sb, DEBUG);
+ if (def_mount_opts & EXT4_DEFM_BSDGROUPS)
+ set_opt(sb, GRPID);
if (def_mount_opts & EXT4_DEFM_UID16)
- set_opt(sbi->s_mount_opt, NO_UID32);
-#ifdef CONFIG_EXT4_FS_XATTR
- if (def_mount_opts & EXT4_DEFM_XATTR_USER)
- set_opt(sbi->s_mount_opt, XATTR_USER);
-#endif
+ set_opt(sb, NO_UID32);
+ /* xattr user namespace & acls are now defaulted on */
+ set_opt(sb, XATTR_USER);
#ifdef CONFIG_EXT4_FS_POSIX_ACL
- if (def_mount_opts & EXT4_DEFM_ACL)
- set_opt(sbi->s_mount_opt, POSIX_ACL);
+ set_opt(sb, POSIX_ACL);
#endif
if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_DATA)
- set_opt(sbi->s_mount_opt, JOURNAL_DATA);
+ set_opt(sb, JOURNAL_DATA);
else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_ORDERED)
- set_opt(sbi->s_mount_opt, ORDERED_DATA);
+ set_opt(sb, ORDERED_DATA);
else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_WBACK)
- set_opt(sbi->s_mount_opt, WRITEBACK_DATA);
+ set_opt(sb, WRITEBACK_DATA);
if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_PANIC)
- set_opt(sbi->s_mount_opt, ERRORS_PANIC);
+ set_opt(sb, ERRORS_PANIC);
else if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_CONTINUE)
- set_opt(sbi->s_mount_opt, ERRORS_CONT);
+ set_opt(sb, ERRORS_CONT);
else
- set_opt(sbi->s_mount_opt, ERRORS_RO);
+ set_opt(sb, ERRORS_RO);
if (def_mount_opts & EXT4_DEFM_BLOCK_VALIDITY)
- set_opt(sbi->s_mount_opt, BLOCK_VALIDITY);
+ set_opt(sb, BLOCK_VALIDITY);
if (def_mount_opts & EXT4_DEFM_DISCARD)
- set_opt(sbi->s_mount_opt, DISCARD);
+ set_opt(sb, DISCARD);
- sbi->s_resuid = le16_to_cpu(es->s_def_resuid);
- sbi->s_resgid = le16_to_cpu(es->s_def_resgid);
+ sbi->s_resuid = make_kuid(&init_user_ns, le16_to_cpu(es->s_def_resuid));
+ sbi->s_resgid = make_kgid(&init_user_ns, le16_to_cpu(es->s_def_resgid));
sbi->s_commit_interval = JBD2_DEFAULT_MAX_COMMIT_AGE * HZ;
sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME;
sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME;
if ((def_mount_opts & EXT4_DEFM_NOBARRIER) == 0)
- set_opt(sbi->s_mount_opt, BARRIER);
+ set_opt(sb, BARRIER);
/*
* enable delayed allocation by default
* Use -o nodelalloc to turn it off
*/
- if (!IS_EXT3_SB(sb) &&
+ if (!IS_EXT3_SB(sb) && !IS_EXT2_SB(sb) &&
((def_mount_opts & EXT4_DEFM_NODELALLOC) == 0))
- set_opt(sbi->s_mount_opt, DELALLOC);
+ set_opt(sb, DELALLOC);
+
+ /*
+ * set default s_li_wait_mult for lazyinit, for the case there is
+ * no mount option specified.
+ */
+ sbi->s_li_wait_mult = EXT4_DEF_LI_WAIT_MULT;
if (!parse_options((char *) sbi->s_es->s_mount_opts, sb,
- &journal_devnum, &journal_ioprio, NULL, 0)) {
+ &journal_devnum, &journal_ioprio, 0)) {
ext4_msg(sb, KERN_WARNING,
"failed to parse options in superblock: %s",
sbi->s_es->s_mount_opts);
}
+ sbi->s_def_mount_opt = sbi->s_mount_opt;
if (!parse_options((char *) data, sb, &journal_devnum,
- &journal_ioprio, NULL, 0))
+ &journal_ioprio, 0))
goto failed_mount;
+ if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
+ printk_once(KERN_WARNING "EXT4-fs: Warning: mounting "
+ "with data=journal disables delayed "
+ "allocation and O_DIRECT support!\n");
+ if (test_opt2(sb, EXPLICIT_DELALLOC)) {
+ ext4_msg(sb, KERN_ERR, "can't mount with "
+ "both data=journal and delalloc");
+ goto failed_mount;
+ }
+ if (test_opt(sb, DIOREAD_NOLOCK)) {
+ ext4_msg(sb, KERN_ERR, "can't mount with "
+ "both data=journal and dioread_nolock");
+ goto failed_mount;
+ }
+ if (test_opt(sb, DELALLOC))
+ clear_opt(sb, DELALLOC);
+ }
+
sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
(test_opt(sb, POSIX_ACL) ? MS_POSIXACL : 0);
@@ -3139,6 +3590,38 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
"feature flags set on rev 0 fs, "
"running e2fsck is recommended");
+ if (es->s_creator_os == cpu_to_le32(EXT4_OS_HURD)) {
+ set_opt2(sb, HURD_COMPAT);
+ if (EXT4_HAS_INCOMPAT_FEATURE(sb,
+ EXT4_FEATURE_INCOMPAT_64BIT)) {
+ ext4_msg(sb, KERN_ERR,
+ "The Hurd can't support 64-bit file systems");
+ goto failed_mount;
+ }
+ }
+
+ if (IS_EXT2_SB(sb)) {
+ if (ext2_feature_set_ok(sb))
+ ext4_msg(sb, KERN_INFO, "mounting ext2 file system "
+ "using the ext4 subsystem");
+ else {
+ ext4_msg(sb, KERN_ERR, "couldn't mount as ext2 due "
+ "to feature incompatibilities");
+ goto failed_mount;
+ }
+ }
+
+ if (IS_EXT3_SB(sb)) {
+ if (ext3_feature_set_ok(sb))
+ ext4_msg(sb, KERN_INFO, "mounting ext3 file system "
+ "using the ext4 subsystem");
+ else {
+ ext4_msg(sb, KERN_ERR, "couldn't mount as ext3 due "
+ "to feature incompatibilities");
+ goto failed_mount;
+ }
+ }
+
/*
* Check feature flags regardless of the revision level, since we
* previously didn't change the revision level when setting the flags,
@@ -3148,7 +3631,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
goto failed_mount;
blocksize = BLOCK_SIZE << le32_to_cpu(es->s_log_block_size);
-
if (blocksize < EXT4_MIN_BLOCK_SIZE ||
blocksize > EXT4_MAX_BLOCK_SIZE) {
ext4_msg(sb, KERN_ERR,
@@ -3173,7 +3655,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
"Can't read superblock on 2nd try");
goto failed_mount;
}
- es = (struct ext4_super_block *)(((char *)bh->b_data) + offset);
+ es = (struct ext4_super_block *)(bh->b_data + offset);
sbi->s_es = es;
if (es->s_magic != cpu_to_le16(EXT4_SUPER_MAGIC)) {
ext4_msg(sb, KERN_ERR,
@@ -3238,25 +3720,71 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
for (i = 0; i < 4; i++)
sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]);
sbi->s_def_hash_version = es->s_def_hash_version;
- i = le32_to_cpu(es->s_flags);
- if (i & EXT2_FLAGS_UNSIGNED_HASH)
- sbi->s_hash_unsigned = 3;
- else if ((i & EXT2_FLAGS_SIGNED_HASH) == 0) {
+ if (EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_DIR_INDEX)) {
+ i = le32_to_cpu(es->s_flags);
+ if (i & EXT2_FLAGS_UNSIGNED_HASH)
+ sbi->s_hash_unsigned = 3;
+ else if ((i & EXT2_FLAGS_SIGNED_HASH) == 0) {
#ifdef __CHAR_UNSIGNED__
- es->s_flags |= cpu_to_le32(EXT2_FLAGS_UNSIGNED_HASH);
- sbi->s_hash_unsigned = 3;
+ if (!(sb->s_flags & MS_RDONLY))
+ es->s_flags |=
+ cpu_to_le32(EXT2_FLAGS_UNSIGNED_HASH);
+ sbi->s_hash_unsigned = 3;
#else
- es->s_flags |= cpu_to_le32(EXT2_FLAGS_SIGNED_HASH);
+ if (!(sb->s_flags & MS_RDONLY))
+ es->s_flags |=
+ cpu_to_le32(EXT2_FLAGS_SIGNED_HASH);
#endif
- sb->s_dirt = 1;
+ }
}
- if (sbi->s_blocks_per_group > blocksize * 8) {
- ext4_msg(sb, KERN_ERR,
- "#blocks per group too big: %lu",
- sbi->s_blocks_per_group);
- goto failed_mount;
+ /* Handle clustersize */
+ clustersize = BLOCK_SIZE << le32_to_cpu(es->s_log_cluster_size);
+ has_bigalloc = EXT4_HAS_RO_COMPAT_FEATURE(sb,
+ EXT4_FEATURE_RO_COMPAT_BIGALLOC);
+ if (has_bigalloc) {
+ if (clustersize < blocksize) {
+ ext4_msg(sb, KERN_ERR,
+ "cluster size (%d) smaller than "
+ "block size (%d)", clustersize, blocksize);
+ goto failed_mount;
+ }
+ sbi->s_cluster_bits = le32_to_cpu(es->s_log_cluster_size) -
+ le32_to_cpu(es->s_log_block_size);
+ sbi->s_clusters_per_group =
+ le32_to_cpu(es->s_clusters_per_group);
+ if (sbi->s_clusters_per_group > blocksize * 8) {
+ ext4_msg(sb, KERN_ERR,
+ "#clusters per group too big: %lu",
+ sbi->s_clusters_per_group);
+ goto failed_mount;
+ }
+ if (sbi->s_blocks_per_group !=
+ (sbi->s_clusters_per_group * (clustersize / blocksize))) {
+ ext4_msg(sb, KERN_ERR, "blocks per group (%lu) and "
+ "clusters per group (%lu) inconsistent",
+ sbi->s_blocks_per_group,
+ sbi->s_clusters_per_group);
+ goto failed_mount;
+ }
+ } else {
+ if (clustersize != blocksize) {
+ ext4_warning(sb, "fragment/cluster size (%d) != "
+ "block size (%d)", clustersize,
+ blocksize);
+ clustersize = blocksize;
+ }
+ if (sbi->s_blocks_per_group > blocksize * 8) {
+ ext4_msg(sb, KERN_ERR,
+ "#blocks per group too big: %lu",
+ sbi->s_blocks_per_group);
+ goto failed_mount;
+ }
+ sbi->s_clusters_per_group = sbi->s_blocks_per_group;
+ sbi->s_cluster_bits = 0;
}
+ sbi->s_cluster_ratio = clustersize / blocksize;
+
if (sbi->s_inodes_per_group > blocksize * 8) {
ext4_msg(sb, KERN_ERR,
"#inodes per group too big: %lu",
@@ -3264,13 +3792,17 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
goto failed_mount;
}
+ /* Do we have standard group size of clustersize * 8 blocks ? */
+ if (sbi->s_blocks_per_group == clustersize << 3)
+ set_opt2(sb, STD_GROUP_SIZE);
+
/*
* Test whether we have more sectors than will fit in sector_t,
* and whether the max offset is addressable by the page cache.
*/
- ret = generic_check_addressable(sb->s_blocksize_bits,
+ err = generic_check_addressable(sb->s_blocksize_bits,
ext4_blocks_count(es));
- if (ret) {
+ if (err) {
ext4_msg(sb, KERN_ERR, "filesystem"
" too large to mount safely on this system");
if (sizeof(sector_t) < 8)
@@ -3295,7 +3827,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
* of the filesystem.
*/
if (le32_to_cpu(es->s_first_data_block) >= ext4_blocks_count(es)) {
- ext4_msg(sb, KERN_WARNING, "bad geometry: first data"
+ ext4_msg(sb, KERN_WARNING, "bad geometry: first data "
"block %u is beyond end of filesystem (%llu)",
le32_to_cpu(es->s_first_data_block),
ext4_blocks_count(es));
@@ -3319,17 +3851,21 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
(EXT4_MAX_BLOCK_FILE_PHYS / EXT4_BLOCKS_PER_GROUP(sb)));
db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) /
EXT4_DESC_PER_BLOCK(sb);
- sbi->s_group_desc = kmalloc(db_count * sizeof(struct buffer_head *),
- GFP_KERNEL);
+ sbi->s_group_desc = ext4_kvmalloc(db_count *
+ sizeof(struct buffer_head *),
+ GFP_KERNEL);
if (sbi->s_group_desc == NULL) {
ext4_msg(sb, KERN_ERR, "not enough memory");
+ ret = -ENOMEM;
goto failed_mount;
}
-#ifdef CONFIG_PROC_FS
if (ext4_proc_root)
sbi->s_proc = proc_mkdir(sb->s_id, ext4_proc_root);
-#endif
+
+ if (sbi->s_proc)
+ proc_create_data("options", S_IRUGO, sbi->s_proc,
+ &ext4_seq_options_fops, sb);
bgl_lock_init(sbi->s_blockgroup_lock);
@@ -3359,8 +3895,15 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
get_random_bytes(&sbi->s_next_generation, sizeof(u32));
spin_lock_init(&sbi->s_next_gen_lock);
- err = percpu_counter_init(&sbi->s_freeblocks_counter,
- ext4_count_free_blocks(sb));
+ init_timer(&sbi->s_err_report);
+ sbi->s_err_report.function = print_daily_error_info;
+ sbi->s_err_report.data = (unsigned long) sb;
+
+ /* Register extent status tree shrinker */
+ ext4_es_register_shrinker(sbi);
+
+ err = percpu_counter_init(&sbi->s_freeclusters_counter,
+ ext4_count_free_clusters(sb));
if (!err) {
err = percpu_counter_init(&sbi->s_freeinodes_counter,
ext4_count_free_inodes(sb));
@@ -3370,7 +3913,10 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
ext4_count_dirs(sb));
}
if (!err) {
- err = percpu_counter_init(&sbi->s_dirtyblocks_counter, 0);
+ err = percpu_counter_init(&sbi->s_dirtyclusters_counter, 0);
+ }
+ if (!err) {
+ err = percpu_counter_init(&sbi->s_extent_cache_cnt, 0);
}
if (err) {
ext4_msg(sb, KERN_ERR, "insufficient memory");
@@ -3378,7 +3924,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
}
sbi->s_stripe = ext4_get_stripe_size(sbi);
- sbi->s_max_writeback_mb_bump = 128;
+ sbi->s_extent_max_zeroout_kb = 32;
/*
* set up enough so that it can read an inode
@@ -3391,12 +3937,16 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
sb->s_export_op = &ext4_export_ops;
sb->s_xattr = ext4_xattr_handlers;
#ifdef CONFIG_QUOTA
- sb->s_qcop = &ext4_qctl_operations;
sb->dq_op = &ext4_quota_operations;
+ if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA))
+ sb->s_qcop = &ext4_qctl_sysfile_operations;
+ else
+ sb->s_qcop = &ext4_qctl_operations;
#endif
+ memcpy(sb->s_uuid, es->s_uuid, sizeof(es->s_uuid));
+
INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */
mutex_init(&sbi->s_orphan_lock);
- mutex_init(&sbi->s_resize_lock);
sb->s_root = NULL;
@@ -3404,6 +3954,11 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
EXT4_HAS_INCOMPAT_FEATURE(sb,
EXT4_FEATURE_INCOMPAT_RECOVER));
+ if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_MMP) &&
+ !(sb->s_flags & MS_RDONLY))
+ if (ext4_multi_mount_protect(sb, le64_to_cpu(es->s_mmp_block)))
+ goto failed_mount3;
+
/*
* The first inode we look at is the journal inode. Don't try
* root first: it may be modified in the journal!
@@ -3418,33 +3973,23 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
"suppressed and not mounted read-only");
goto failed_mount_wq;
} else {
- clear_opt(sbi->s_mount_opt, DATA_FLAGS);
- set_opt(sbi->s_mount_opt, WRITEBACK_DATA);
+ clear_opt(sb, DATA_FLAGS);
sbi->s_journal = NULL;
needs_recovery = 0;
goto no_journal;
}
- if (ext4_blocks_count(es) > 0xffffffffULL &&
+ if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_64BIT) &&
!jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0,
JBD2_FEATURE_INCOMPAT_64BIT)) {
ext4_msg(sb, KERN_ERR, "Failed to set 64-bit journal feature");
goto failed_mount_wq;
}
- if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) {
- jbd2_journal_set_features(sbi->s_journal,
- JBD2_FEATURE_COMPAT_CHECKSUM, 0,
- JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT);
- } else if (test_opt(sb, JOURNAL_CHECKSUM)) {
- jbd2_journal_set_features(sbi->s_journal,
- JBD2_FEATURE_COMPAT_CHECKSUM, 0, 0);
- jbd2_journal_clear_features(sbi->s_journal, 0, 0,
- JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT);
- } else {
- jbd2_journal_clear_features(sbi->s_journal,
- JBD2_FEATURE_COMPAT_CHECKSUM, 0,
- JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT);
+ if (!set_journal_csum_feature_set(sb)) {
+ ext4_msg(sb, KERN_ERR, "Failed to set journal checksum "
+ "feature set");
+ goto failed_mount_wq;
}
/* We have now updated the journal if required, so we can
@@ -3457,9 +4002,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
*/
if (jbd2_journal_check_available_features
(sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE))
- set_opt(sbi->s_mount_opt, ORDERED_DATA);
+ set_opt(sb, ORDERED_DATA);
else
- set_opt(sbi->s_mount_opt, JOURNAL_DATA);
+ set_opt(sb, JOURNAL_DATA);
break;
case EXT4_MOUNT_ORDERED_DATA:
@@ -3475,23 +4020,51 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
}
set_task_ioprio(sbi->s_journal->j_task, journal_ioprio);
+ sbi->s_journal->j_commit_callback = ext4_journal_commit_callback;
+
/*
* The journal may have updated the bg summary counts, so we
* need to update the global counters.
*/
- percpu_counter_set(&sbi->s_freeblocks_counter,
- ext4_count_free_blocks(sb));
+ percpu_counter_set(&sbi->s_freeclusters_counter,
+ ext4_count_free_clusters(sb));
percpu_counter_set(&sbi->s_freeinodes_counter,
ext4_count_free_inodes(sb));
percpu_counter_set(&sbi->s_dirs_counter,
ext4_count_dirs(sb));
- percpu_counter_set(&sbi->s_dirtyblocks_counter, 0);
+ percpu_counter_set(&sbi->s_dirtyclusters_counter, 0);
no_journal:
- EXT4_SB(sb)->dio_unwritten_wq = create_workqueue("ext4-dio-unwritten");
- if (!EXT4_SB(sb)->dio_unwritten_wq) {
- printk(KERN_ERR "EXT4-fs: failed to create DIO workqueue\n");
- goto failed_mount_wq;
+ if (ext4_mballoc_ready) {
+ sbi->s_mb_cache = ext4_xattr_create_cache(sb->s_id);
+ if (!sbi->s_mb_cache) {
+ ext4_msg(sb, KERN_ERR, "Failed to create an mb_cache");
+ goto failed_mount_wq;
+ }
+ }
+
+ /*
+ * Get the # of file system overhead blocks from the
+ * superblock if present.
+ */
+ if (es->s_overhead_clusters)
+ sbi->s_overhead = le32_to_cpu(es->s_overhead_clusters);
+ else {
+ err = ext4_calculate_overhead(sb);
+ if (err)
+ goto failed_mount_wq;
+ }
+
+ /*
+ * The maximum number of concurrent works can be high and
+ * concurrency isn't really necessary. Limit it to 1.
+ */
+ EXT4_SB(sb)->rsv_conversion_wq =
+ alloc_workqueue("ext4-rsv-conversion", WQ_MEM_RECLAIM | WQ_UNBOUND, 1);
+ if (!EXT4_SB(sb)->rsv_conversion_wq) {
+ printk(KERN_ERR "EXT4-fs: failed to create workqueue\n");
+ ret = -ENOMEM;
+ goto failed_mount4;
}
/*
@@ -3503,22 +4076,23 @@ no_journal:
if (IS_ERR(root)) {
ext4_msg(sb, KERN_ERR, "get root inode failed");
ret = PTR_ERR(root);
+ root = NULL;
goto failed_mount4;
}
if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) {
- iput(root);
ext4_msg(sb, KERN_ERR, "corrupt root inode, run e2fsck");
+ iput(root);
goto failed_mount4;
}
- sb->s_root = d_alloc_root(root);
+ sb->s_root = d_make_root(root);
if (!sb->s_root) {
ext4_msg(sb, KERN_ERR, "get root dentry failed");
- iput(root);
ret = -ENOMEM;
goto failed_mount4;
}
- ext4_setup_super(sb, es, sb->s_flags & MS_RDONLY);
+ if (ext4_setup_super(sb, es, sb->s_flags & MS_RDONLY))
+ sb->s_flags |= MS_RDONLY;
/* determine the minimum size of new large inodes, if present */
if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE) {
@@ -3545,53 +4119,48 @@ no_journal:
"available");
}
- if (test_opt(sb, DELALLOC) &&
- (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)) {
- ext4_msg(sb, KERN_WARNING, "Ignoring delalloc option - "
- "requested data journaling mode");
- clear_opt(sbi->s_mount_opt, DELALLOC);
- }
- if (test_opt(sb, DIOREAD_NOLOCK)) {
- if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
- ext4_msg(sb, KERN_WARNING, "Ignoring dioread_nolock "
- "option - requested data journaling mode");
- clear_opt(sbi->s_mount_opt, DIOREAD_NOLOCK);
- }
- if (sb->s_blocksize < PAGE_SIZE) {
- ext4_msg(sb, KERN_WARNING, "Ignoring dioread_nolock "
- "option - block size is too small");
- clear_opt(sbi->s_mount_opt, DIOREAD_NOLOCK);
- }
+ err = ext4_reserve_clusters(sbi, ext4_calculate_resv_clusters(sb));
+ if (err) {
+ ext4_msg(sb, KERN_ERR, "failed to reserve %llu clusters for "
+ "reserved pool", ext4_calculate_resv_clusters(sb));
+ goto failed_mount4a;
}
err = ext4_setup_system_zone(sb);
if (err) {
ext4_msg(sb, KERN_ERR, "failed to initialize system "
"zone (%d)", err);
- goto failed_mount4;
+ goto failed_mount4a;
}
ext4_ext_init(sb);
- err = ext4_mb_init(sb, needs_recovery);
+ err = ext4_mb_init(sb);
if (err) {
ext4_msg(sb, KERN_ERR, "failed to initialize mballoc (%d)",
err);
- goto failed_mount4;
+ goto failed_mount5;
}
err = ext4_register_li_request(sb, first_not_zeroed);
if (err)
- goto failed_mount4;
+ goto failed_mount6;
sbi->s_kobj.kset = ext4_kset;
init_completion(&sbi->s_kobj_unregister);
err = kobject_init_and_add(&sbi->s_kobj, &ext4_ktype, NULL,
"%s", sb->s_id);
- if (err) {
- ext4_mb_release(sb);
- ext4_ext_release(sb);
- goto failed_mount4;
- };
+ if (err)
+ goto failed_mount7;
+
+#ifdef CONFIG_QUOTA
+ /* Enable quota usage during mount. */
+ if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA) &&
+ !(sb->s_flags & MS_RDONLY)) {
+ err = ext4_enable_quotas(sb);
+ if (err)
+ goto failed_mount8;
+ }
+#endif /* CONFIG_QUOTA */
EXT4_SB(sb)->s_mount_state |= EXT4_ORPHAN_FS;
ext4_orphan_cleanup(sb, es);
@@ -3610,16 +4179,26 @@ no_journal:
} else
descr = "out journal";
+ if (test_opt(sb, DISCARD)) {
+ struct request_queue *q = bdev_get_queue(sb->s_bdev);
+ if (!blk_queue_discard(q))
+ ext4_msg(sb, KERN_WARNING,
+ "mounting with \"discard\" option, but "
+ "the device does not support discard");
+ }
+
ext4_msg(sb, KERN_INFO, "mounted filesystem with%s. "
"Opts: %s%s%s", descr, sbi->s_es->s_mount_opts,
*sbi->s_es->s_mount_opts ? "; " : "", orig_data);
- init_timer(&sbi->s_err_report);
- sbi->s_err_report.function = print_daily_error_info;
- sbi->s_err_report.data = (unsigned long) sb;
if (es->s_error_count)
mod_timer(&sbi->s_err_report, jiffies + 300*HZ); /* 5 minutes */
+ /* Enable message ratelimiting. Default is 10 messages per 5 secs. */
+ ratelimit_state_init(&sbi->s_err_ratelimit_state, 5 * HZ, 10);
+ ratelimit_state_init(&sbi->s_warning_ratelimit_state, 5 * HZ, 10);
+ ratelimit_state_init(&sbi->s_msg_ratelimit_state, 5 * HZ, 10);
+
kfree(orig_data);
return 0;
@@ -3628,32 +4207,50 @@ cantfind_ext4:
ext4_msg(sb, KERN_ERR, "VFS: Can't find ext4 filesystem");
goto failed_mount;
+#ifdef CONFIG_QUOTA
+failed_mount8:
+ kobject_del(&sbi->s_kobj);
+#endif
+failed_mount7:
+ ext4_unregister_li_request(sb);
+failed_mount6:
+ ext4_mb_release(sb);
+failed_mount5:
+ ext4_ext_release(sb);
+ ext4_release_system_zone(sb);
+failed_mount4a:
+ dput(sb->s_root);
+ sb->s_root = NULL;
failed_mount4:
ext4_msg(sb, KERN_ERR, "mount failed");
- destroy_workqueue(EXT4_SB(sb)->dio_unwritten_wq);
+ if (EXT4_SB(sb)->rsv_conversion_wq)
+ destroy_workqueue(EXT4_SB(sb)->rsv_conversion_wq);
failed_mount_wq:
- ext4_release_system_zone(sb);
if (sbi->s_journal) {
jbd2_journal_destroy(sbi->s_journal);
sbi->s_journal = NULL;
}
failed_mount3:
- if (sbi->s_flex_groups) {
- if (is_vmalloc_addr(sbi->s_flex_groups))
- vfree(sbi->s_flex_groups);
- else
- kfree(sbi->s_flex_groups);
- }
- percpu_counter_destroy(&sbi->s_freeblocks_counter);
+ ext4_es_unregister_shrinker(sbi);
+ del_timer_sync(&sbi->s_err_report);
+ if (sbi->s_flex_groups)
+ ext4_kvfree(sbi->s_flex_groups);
+ percpu_counter_destroy(&sbi->s_freeclusters_counter);
percpu_counter_destroy(&sbi->s_freeinodes_counter);
percpu_counter_destroy(&sbi->s_dirs_counter);
- percpu_counter_destroy(&sbi->s_dirtyblocks_counter);
+ percpu_counter_destroy(&sbi->s_dirtyclusters_counter);
+ percpu_counter_destroy(&sbi->s_extent_cache_cnt);
+ if (sbi->s_mmp_tsk)
+ kthread_stop(sbi->s_mmp_tsk);
failed_mount2:
for (i = 0; i < db_count; i++)
brelse(sbi->s_group_desc[i]);
- kfree(sbi->s_group_desc);
+ ext4_kvfree(sbi->s_group_desc);
failed_mount:
+ if (sbi->s_chksum_driver)
+ crypto_free_shash(sbi->s_chksum_driver);
if (sbi->s_proc) {
+ remove_proc_entry("options", sbi->s_proc);
remove_proc_entry(sb->s_id, ext4_proc_root);
}
#ifdef CONFIG_QUOTA
@@ -3668,7 +4265,7 @@ out_fail:
kfree(sbi);
out_free_orig:
kfree(orig_data);
- return ret;
+ return err ? err : ret;
}
/*
@@ -3758,13 +4355,6 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb,
if (bdev == NULL)
return NULL;
- if (bd_claim(bdev, sb)) {
- ext4_msg(sb, KERN_ERR,
- "failed to claim external journal device");
- blkdev_put(bdev, FMODE_READ|FMODE_WRITE);
- return NULL;
- }
-
blocksize = sb->s_blocksize;
hblock = bdev_logical_block_size(bdev);
if (blocksize < hblock) {
@@ -3782,7 +4372,7 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb,
goto out_bdev;
}
- es = (struct ext4_super_block *) (((char *)bh->b_data) + offset);
+ es = (struct ext4_super_block *) (bh->b_data + offset);
if ((le16_to_cpu(es->s_magic) != EXT4_SUPER_MAGIC) ||
!(le32_to_cpu(es->s_feature_incompat) &
EXT4_FEATURE_INCOMPAT_JOURNAL_DEV)) {
@@ -3809,7 +4399,7 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb,
goto out_bdev;
}
journal->j_private = sb;
- ll_rw_block(READ, 1, &journal->j_sb_buffer);
+ ll_rw_block(READ | REQ_META | REQ_PRIO, 1, &journal->j_sb_buffer);
wait_on_buffer(journal->j_sb_buffer);
if (!buffer_uptodate(journal->j_sb_buffer)) {
ext4_msg(sb, KERN_ERR, "I/O error on journal device");
@@ -3890,15 +4480,6 @@ static int ext4_load_journal(struct super_block *sb,
if (!(journal->j_flags & JBD2_BARRIER))
ext4_msg(sb, KERN_INFO, "barriers disabled");
- if (!really_read_only && test_opt(sb, UPDATE_JOURNAL)) {
- err = jbd2_journal_update_format(journal);
- if (err) {
- ext4_msg(sb, KERN_ERR, "error updating journal");
- jbd2_journal_destroy(journal);
- return err;
- }
- }
-
if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER))
err = jbd2_journal_wipe(journal, !really_read_only);
if (!err) {
@@ -3939,7 +4520,7 @@ static int ext4_commit_super(struct super_block *sb, int sync)
struct buffer_head *sbh = EXT4_SB(sb)->s_sbh;
int error = 0;
- if (!sbh)
+ if (!sbh || block_device_ejected(sb))
return error;
if (buffer_write_io_error(sbh)) {
/*
@@ -3975,13 +4556,14 @@ static int ext4_commit_super(struct super_block *sb, int sync)
else
es->s_kbytes_written =
cpu_to_le64(EXT4_SB(sb)->s_kbytes_written);
- ext4_free_blocks_count_set(es, percpu_counter_sum_positive(
- &EXT4_SB(sb)->s_freeblocks_counter));
+ ext4_free_blocks_count_set(es,
+ EXT4_C2B(EXT4_SB(sb), percpu_counter_sum_positive(
+ &EXT4_SB(sb)->s_freeclusters_counter)));
es->s_free_inodes_count =
cpu_to_le32(percpu_counter_sum_positive(
&EXT4_SB(sb)->s_freeinodes_counter));
- sb->s_dirt = 0;
BUFFER_TRACE(sbh, "marking dirty");
+ ext4_superblock_csum_set(sb);
mark_buffer_dirty(sbh);
if (sync) {
error = sync_dirty_buffer(sbh);
@@ -4062,6 +4644,7 @@ static void ext4_clear_journal_err(struct super_block *sb,
ext4_commit_super(sb, 1);
jbd2_journal_clear_err(journal);
+ jbd2_journal_update_sb_errno(journal);
}
}
@@ -4072,45 +4655,72 @@ static void ext4_clear_journal_err(struct super_block *sb,
int ext4_force_commit(struct super_block *sb)
{
journal_t *journal;
- int ret = 0;
if (sb->s_flags & MS_RDONLY)
return 0;
journal = EXT4_SB(sb)->s_journal;
- if (journal) {
- vfs_check_frozen(sb, SB_FREEZE_TRANS);
- ret = ext4_journal_force_commit(journal);
- }
-
- return ret;
-}
-
-static void ext4_write_super(struct super_block *sb)
-{
- lock_super(sb);
- ext4_commit_super(sb, 1);
- unlock_super(sb);
+ return ext4_journal_force_commit(journal);
}
static int ext4_sync_fs(struct super_block *sb, int wait)
{
int ret = 0;
tid_t target;
+ bool needs_barrier = false;
struct ext4_sb_info *sbi = EXT4_SB(sb);
trace_ext4_sync_fs(sb, wait);
- flush_workqueue(sbi->dio_unwritten_wq);
+ flush_workqueue(sbi->rsv_conversion_wq);
+ /*
+ * Writeback quota in non-journalled quota case - journalled quota has
+ * no dirty dquots
+ */
+ dquot_writeback_dquots(sb, -1);
+ /*
+ * Data writeback is possible w/o journal transaction, so barrier must
+ * being sent at the end of the function. But we can skip it if
+ * transaction_commit will do it for us.
+ */
+ target = jbd2_get_latest_transaction(sbi->s_journal);
+ if (wait && sbi->s_journal->j_flags & JBD2_BARRIER &&
+ !jbd2_trans_will_send_data_barrier(sbi->s_journal, target))
+ needs_barrier = true;
+
if (jbd2_journal_start_commit(sbi->s_journal, &target)) {
if (wait)
- jbd2_log_wait_commit(sbi->s_journal, target);
+ ret = jbd2_log_wait_commit(sbi->s_journal, target);
}
+ if (needs_barrier) {
+ int err;
+ err = blkdev_issue_flush(sb->s_bdev, GFP_KERNEL, NULL);
+ if (!ret)
+ ret = err;
+ }
+
+ return ret;
+}
+
+static int ext4_sync_fs_nojournal(struct super_block *sb, int wait)
+{
+ int ret = 0;
+
+ trace_ext4_sync_fs(sb, wait);
+ flush_workqueue(EXT4_SB(sb)->rsv_conversion_wq);
+ dquot_writeback_dquots(sb, -1);
+ if (wait && test_opt(sb, BARRIER))
+ ret = blkdev_issue_flush(sb->s_bdev, GFP_KERNEL, NULL);
+
return ret;
}
/*
* LVM calls this function before a (read-only) snapshot is created. This
* gives us a chance to flush the journal completely and mark the fs clean.
+ *
+ * Note that only this function cannot bring a filesystem to be in a clean
+ * state independently. It relies on upper layer to stop all data & metadata
+ * modifications.
*/
static int ext4_freeze(struct super_block *sb)
{
@@ -4137,7 +4747,7 @@ static int ext4_freeze(struct super_block *sb)
EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
error = ext4_commit_super(sb, 1);
out:
- /* we rely on s_frozen to stop further updates */
+ /* we rely on upper layer to stop further updates */
jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
return error;
}
@@ -4151,34 +4761,47 @@ static int ext4_unfreeze(struct super_block *sb)
if (sb->s_flags & MS_RDONLY)
return 0;
- lock_super(sb);
/* Reset the needs_recovery flag before the fs is unlocked. */
EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
ext4_commit_super(sb, 1);
- unlock_super(sb);
return 0;
}
+/*
+ * Structure to save mount options for ext4_remount's benefit
+ */
+struct ext4_mount_options {
+ unsigned long s_mount_opt;
+ unsigned long s_mount_opt2;
+ kuid_t s_resuid;
+ kgid_t s_resgid;
+ unsigned long s_commit_interval;
+ u32 s_min_batch_time, s_max_batch_time;
+#ifdef CONFIG_QUOTA
+ int s_jquota_fmt;
+ char *s_qf_names[MAXQUOTAS];
+#endif
+};
+
static int ext4_remount(struct super_block *sb, int *flags, char *data)
{
struct ext4_super_block *es;
struct ext4_sb_info *sbi = EXT4_SB(sb);
- ext4_fsblk_t n_blocks_count = 0;
unsigned long old_sb_flags;
struct ext4_mount_options old_opts;
int enable_quota = 0;
ext4_group_t g;
unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
- int err;
+ int err = 0;
#ifdef CONFIG_QUOTA
- int i;
+ int i, j;
#endif
char *orig_data = kstrdup(data, GFP_KERNEL);
/* Store the original options */
- lock_super(sb);
old_sb_flags = sb->s_flags;
old_opts.s_mount_opt = sbi->s_mount_opt;
+ old_opts.s_mount_opt2 = sbi->s_mount_opt2;
old_opts.s_resuid = sbi->s_resuid;
old_opts.s_resgid = sbi->s_resgid;
old_opts.s_commit_interval = sbi->s_commit_interval;
@@ -4187,7 +4810,17 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
#ifdef CONFIG_QUOTA
old_opts.s_jquota_fmt = sbi->s_jquota_fmt;
for (i = 0; i < MAXQUOTAS; i++)
- old_opts.s_qf_names[i] = sbi->s_qf_names[i];
+ if (sbi->s_qf_names[i]) {
+ old_opts.s_qf_names[i] = kstrdup(sbi->s_qf_names[i],
+ GFP_KERNEL);
+ if (!old_opts.s_qf_names[i]) {
+ for (j = 0; j < i; j++)
+ kfree(old_opts.s_qf_names[j]);
+ kfree(orig_data);
+ return -ENOMEM;
+ }
+ } else
+ old_opts.s_qf_names[i] = NULL;
#endif
if (sbi->s_journal && sbi->s_journal->j_task->io_context)
journal_ioprio = sbi->s_journal->j_task->io_context->ioprio;
@@ -4195,12 +4828,26 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
/*
* Allow the "check" option to be passed as a remount option.
*/
- if (!parse_options(data, sb, NULL, &journal_ioprio,
- &n_blocks_count, 1)) {
+ if (!parse_options(data, sb, NULL, &journal_ioprio, 1)) {
err = -EINVAL;
goto restore_opts;
}
+ if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
+ if (test_opt2(sb, EXPLICIT_DELALLOC)) {
+ ext4_msg(sb, KERN_ERR, "can't mount with "
+ "both data=journal and delalloc");
+ err = -EINVAL;
+ goto restore_opts;
+ }
+ if (test_opt(sb, DIOREAD_NOLOCK)) {
+ ext4_msg(sb, KERN_ERR, "can't mount with "
+ "both data=journal and dioread_nolock");
+ err = -EINVAL;
+ goto restore_opts;
+ }
+ }
+
if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED)
ext4_abort(sb, "Abort forced by user");
@@ -4214,14 +4861,16 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
set_task_ioprio(sbi->s_journal->j_task, journal_ioprio);
}
- if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY) ||
- n_blocks_count > ext4_blocks_count(es)) {
+ if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY)) {
if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED) {
err = -EROFS;
goto restore_opts;
}
if (*flags & MS_RDONLY) {
+ err = sync_filesystem(sb);
+ if (err < 0)
+ goto restore_opts;
err = dquot_suspend(sb, -1);
if (err < 0)
goto restore_opts;
@@ -4257,7 +4906,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
struct ext4_group_desc *gdp =
ext4_get_group_desc(sb, g, NULL);
- if (!ext4_group_desc_csum_verify(sbi, g, gdp)) {
+ if (!ext4_group_desc_csum_verify(sb, g, gdp)) {
ext4_msg(sb, KERN_ERR,
"ext4_remount: Checksum for group %u failed (%u!=%u)",
g, le16_to_cpu(ext4_group_desc_csum(sbi, g, gdp)),
@@ -4290,10 +4939,15 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
if (sbi->s_journal)
ext4_clear_journal_err(sb, es);
sbi->s_mount_state = le16_to_cpu(es->s_state);
- if ((err = ext4_group_extend(sb, es, n_blocks_count)))
- goto restore_opts;
if (!ext4_setup_super(sb, es, 0))
sb->s_flags &= ~MS_RDONLY;
+ if (EXT4_HAS_INCOMPAT_FEATURE(sb,
+ EXT4_FEATURE_INCOMPAT_MMP))
+ if (ext4_multi_mount_protect(sb,
+ le64_to_cpu(es->s_mmp_block))) {
+ err = -EROFS;
+ goto restore_opts;
+ }
enable_quota = 1;
}
}
@@ -4311,19 +4965,24 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
}
ext4_setup_system_zone(sb);
- if (sbi->s_journal == NULL)
+ if (sbi->s_journal == NULL && !(old_sb_flags & MS_RDONLY))
ext4_commit_super(sb, 1);
#ifdef CONFIG_QUOTA
/* Release old quota file names */
for (i = 0; i < MAXQUOTAS; i++)
- if (old_opts.s_qf_names[i] &&
- old_opts.s_qf_names[i] != sbi->s_qf_names[i])
- kfree(old_opts.s_qf_names[i]);
+ kfree(old_opts.s_qf_names[i]);
+ if (enable_quota) {
+ if (sb_any_quota_suspended(sb))
+ dquot_resume(sb, -1);
+ else if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
+ EXT4_FEATURE_RO_COMPAT_QUOTA)) {
+ err = ext4_enable_quotas(sb);
+ if (err)
+ goto restore_opts;
+ }
+ }
#endif
- unlock_super(sb);
- if (enable_quota)
- dquot_resume(sb, -1);
ext4_msg(sb, KERN_INFO, "re-mounted. Opts: %s", orig_data);
kfree(orig_data);
@@ -4332,6 +4991,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
restore_opts:
sb->s_flags = old_sb_flags;
sbi->s_mount_opt = old_opts.s_mount_opt;
+ sbi->s_mount_opt2 = old_opts.s_mount_opt2;
sbi->s_resuid = old_opts.s_resuid;
sbi->s_resgid = old_opts.s_resgid;
sbi->s_commit_interval = old_opts.s_commit_interval;
@@ -4340,13 +5000,10 @@ restore_opts:
#ifdef CONFIG_QUOTA
sbi->s_jquota_fmt = old_opts.s_jquota_fmt;
for (i = 0; i < MAXQUOTAS; i++) {
- if (sbi->s_qf_names[i] &&
- old_opts.s_qf_names[i] != sbi->s_qf_names[i])
- kfree(sbi->s_qf_names[i]);
+ kfree(sbi->s_qf_names[i]);
sbi->s_qf_names[i] = old_opts.s_qf_names[i];
}
#endif
- unlock_super(sb);
kfree(orig_data);
return err;
}
@@ -4356,54 +5013,24 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
struct super_block *sb = dentry->d_sb;
struct ext4_sb_info *sbi = EXT4_SB(sb);
struct ext4_super_block *es = sbi->s_es;
+ ext4_fsblk_t overhead = 0, resv_blocks;
u64 fsid;
+ s64 bfree;
+ resv_blocks = EXT4_C2B(sbi, atomic64_read(&sbi->s_resv_clusters));
- if (test_opt(sb, MINIX_DF)) {
- sbi->s_overhead_last = 0;
- } else if (sbi->s_blocks_last != ext4_blocks_count(es)) {
- ext4_group_t i, ngroups = ext4_get_groups_count(sb);
- ext4_fsblk_t overhead = 0;
-
- /*
- * Compute the overhead (FS structures). This is constant
- * for a given filesystem unless the number of block groups
- * changes so we cache the previous value until it does.
- */
-
- /*
- * All of the blocks before first_data_block are
- * overhead
- */
- overhead = le32_to_cpu(es->s_first_data_block);
-
- /*
- * Add the overhead attributed to the superblock and
- * block group descriptors. If the sparse superblocks
- * feature is turned on, then not all groups have this.
- */
- for (i = 0; i < ngroups; i++) {
- overhead += ext4_bg_has_super(sb, i) +
- ext4_bg_num_gdb(sb, i);
- cond_resched();
- }
-
- /*
- * Every block group has an inode bitmap, a block
- * bitmap, and an inode table.
- */
- overhead += ngroups * (2 + sbi->s_itb_per_group);
- sbi->s_overhead_last = overhead;
- smp_wmb();
- sbi->s_blocks_last = ext4_blocks_count(es);
- }
+ if (!test_opt(sb, MINIX_DF))
+ overhead = sbi->s_overhead;
buf->f_type = EXT4_SUPER_MAGIC;
buf->f_bsize = sb->s_blocksize;
- buf->f_blocks = ext4_blocks_count(es) - sbi->s_overhead_last;
- buf->f_bfree = percpu_counter_sum_positive(&sbi->s_freeblocks_counter) -
- percpu_counter_sum_positive(&sbi->s_dirtyblocks_counter);
- buf->f_bavail = buf->f_bfree - ext4_r_blocks_count(es);
- if (buf->f_bfree < ext4_r_blocks_count(es))
+ buf->f_blocks = ext4_blocks_count(es) - EXT4_C2B(sbi, overhead);
+ bfree = percpu_counter_sum_positive(&sbi->s_freeclusters_counter) -
+ percpu_counter_sum_positive(&sbi->s_dirtyclusters_counter);
+ /* prevent underflow in case that few free space is available */
+ buf->f_bfree = EXT4_C2B(sbi, max_t(s64, bfree, 0));
+ buf->f_bavail = buf->f_bfree -
+ (ext4_r_blocks_count(es) + resv_blocks);
+ if (buf->f_bfree < (ext4_r_blocks_count(es) + resv_blocks))
buf->f_bavail = 0;
buf->f_files = le32_to_cpu(es->s_inodes_count);
buf->f_ffree = percpu_counter_sum_positive(&sbi->s_freeinodes_counter);
@@ -4430,7 +5057,7 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
static inline struct inode *dquot_to_inode(struct dquot *dquot)
{
- return sb_dqopt(dquot->dq_sb)->files[dquot->dq_type];
+ return sb_dqopt(dquot->dq_sb)->files[dquot->dq_id.type];
}
static int ext4_write_dquot(struct dquot *dquot)
@@ -4440,7 +5067,7 @@ static int ext4_write_dquot(struct dquot *dquot)
struct inode *inode;
inode = dquot_to_inode(dquot);
- handle = ext4_journal_start(inode,
+ handle = ext4_journal_start(inode, EXT4_HT_QUOTA,
EXT4_QUOTA_TRANS_BLOCKS(dquot->dq_sb));
if (IS_ERR(handle))
return PTR_ERR(handle);
@@ -4456,7 +5083,7 @@ static int ext4_acquire_dquot(struct dquot *dquot)
int ret, err;
handle_t *handle;
- handle = ext4_journal_start(dquot_to_inode(dquot),
+ handle = ext4_journal_start(dquot_to_inode(dquot), EXT4_HT_QUOTA,
EXT4_QUOTA_INIT_BLOCKS(dquot->dq_sb));
if (IS_ERR(handle))
return PTR_ERR(handle);
@@ -4472,7 +5099,7 @@ static int ext4_release_dquot(struct dquot *dquot)
int ret, err;
handle_t *handle;
- handle = ext4_journal_start(dquot_to_inode(dquot),
+ handle = ext4_journal_start(dquot_to_inode(dquot), EXT4_HT_QUOTA,
EXT4_QUOTA_DEL_BLOCKS(dquot->dq_sb));
if (IS_ERR(handle)) {
/* Release dquot anyway to avoid endless cycle in dqput() */
@@ -4488,9 +5115,12 @@ static int ext4_release_dquot(struct dquot *dquot)
static int ext4_mark_dquot_dirty(struct dquot *dquot)
{
+ struct super_block *sb = dquot->dq_sb;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+
/* Are we journaling quotas? */
- if (EXT4_SB(dquot->dq_sb)->s_qf_names[USRQUOTA] ||
- EXT4_SB(dquot->dq_sb)->s_qf_names[GRPQUOTA]) {
+ if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA) ||
+ sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) {
dquot_mark_dquot_dirty(dquot);
return ext4_write_dquot(dquot);
} else {
@@ -4504,7 +5134,7 @@ static int ext4_write_info(struct super_block *sb, int type)
handle_t *handle;
/* Data block + inode block */
- handle = ext4_journal_start(sb->s_root->d_inode, 2);
+ handle = ext4_journal_start(sb->s_root->d_inode, EXT4_HT_QUOTA, 2);
if (IS_ERR(handle))
return PTR_ERR(handle);
ret = dquot_commit_info(sb, type);
@@ -4528,27 +5158,20 @@ static int ext4_quota_on_mount(struct super_block *sb, int type)
* Standard function to be called on quota_on
*/
static int ext4_quota_on(struct super_block *sb, int type, int format_id,
- char *name)
+ struct path *path)
{
int err;
- struct path path;
if (!test_opt(sb, QUOTA))
return -EINVAL;
- err = kern_path(name, LOOKUP_FOLLOW, &path);
- if (err)
- return err;
-
/* Quotafile not on the same filesystem? */
- if (path.mnt->mnt_sb != sb) {
- path_put(&path);
+ if (path->dentry->d_sb != sb)
return -EXDEV;
- }
/* Journaling quota? */
if (EXT4_SB(sb)->s_qf_names[type]) {
/* Quotafile not in fs root? */
- if (path.dentry->d_parent != sb->s_root)
+ if (path->dentry->d_parent != sb->s_root)
ext4_msg(sb, KERN_WARNING,
"Quota file not on filesystem root. "
"Journaled quota will not work");
@@ -4559,7 +5182,7 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id,
* all updates to the file when we bypass pagecache...
*/
if (EXT4_SB(sb)->s_journal &&
- ext4_should_journal_data(path.dentry->d_inode)) {
+ ext4_should_journal_data(path->dentry->d_inode)) {
/*
* We don't need to lock updates but journal_flush() could
* otherwise be livelocked...
@@ -4567,30 +5190,124 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id,
jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
err = jbd2_journal_flush(EXT4_SB(sb)->s_journal);
jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
- if (err) {
- path_put(&path);
+ if (err)
return err;
- }
}
- err = dquot_quota_on_path(sb, type, format_id, &path);
- path_put(&path);
+ return dquot_quota_on(sb, type, format_id, path);
+}
+
+static int ext4_quota_enable(struct super_block *sb, int type, int format_id,
+ unsigned int flags)
+{
+ int err;
+ struct inode *qf_inode;
+ unsigned long qf_inums[MAXQUOTAS] = {
+ le32_to_cpu(EXT4_SB(sb)->s_es->s_usr_quota_inum),
+ le32_to_cpu(EXT4_SB(sb)->s_es->s_grp_quota_inum)
+ };
+
+ BUG_ON(!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA));
+
+ if (!qf_inums[type])
+ return -EPERM;
+
+ qf_inode = ext4_iget(sb, qf_inums[type]);
+ if (IS_ERR(qf_inode)) {
+ ext4_error(sb, "Bad quota inode # %lu", qf_inums[type]);
+ return PTR_ERR(qf_inode);
+ }
+
+ /* Don't account quota for quota files to avoid recursion */
+ qf_inode->i_flags |= S_NOQUOTA;
+ err = dquot_enable(qf_inode, type, format_id, flags);
+ iput(qf_inode);
+
return err;
}
+/* Enable usage tracking for all quota types. */
+static int ext4_enable_quotas(struct super_block *sb)
+{
+ int type, err = 0;
+ unsigned long qf_inums[MAXQUOTAS] = {
+ le32_to_cpu(EXT4_SB(sb)->s_es->s_usr_quota_inum),
+ le32_to_cpu(EXT4_SB(sb)->s_es->s_grp_quota_inum)
+ };
+
+ sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE;
+ for (type = 0; type < MAXQUOTAS; type++) {
+ if (qf_inums[type]) {
+ err = ext4_quota_enable(sb, type, QFMT_VFS_V1,
+ DQUOT_USAGE_ENABLED);
+ if (err) {
+ ext4_warning(sb,
+ "Failed to enable quota tracking "
+ "(type=%d, err=%d). Please run "
+ "e2fsck to fix.", type, err);
+ return err;
+ }
+ }
+ }
+ return 0;
+}
+
+/*
+ * quota_on function that is used when QUOTA feature is set.
+ */
+static int ext4_quota_on_sysfile(struct super_block *sb, int type,
+ int format_id)
+{
+ if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA))
+ return -EINVAL;
+
+ /*
+ * USAGE was enabled at mount time. Only need to enable LIMITS now.
+ */
+ return ext4_quota_enable(sb, type, format_id, DQUOT_LIMITS_ENABLED);
+}
+
static int ext4_quota_off(struct super_block *sb, int type)
{
+ struct inode *inode = sb_dqopt(sb)->files[type];
+ handle_t *handle;
+
/* Force all delayed allocation blocks to be allocated.
* Caller already holds s_umount sem */
if (test_opt(sb, DELALLOC))
sync_filesystem(sb);
+ if (!inode)
+ goto out;
+
+ /* Update modification times of quota files when userspace can
+ * start looking at them */
+ handle = ext4_journal_start(inode, EXT4_HT_QUOTA, 1);
+ if (IS_ERR(handle))
+ goto out;
+ inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+ ext4_mark_inode_dirty(handle, inode);
+ ext4_journal_stop(handle);
+
+out:
return dquot_quota_off(sb, type);
}
+/*
+ * quota_off function that is used when QUOTA feature is set.
+ */
+static int ext4_quota_off_sysfile(struct super_block *sb, int type)
+{
+ if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA))
+ return -EINVAL;
+
+ /* Disable only the limits. */
+ return dquot_disable(sb, type, DQUOT_LIMITS_ENABLED);
+}
+
/* Read data from quotafile - avoid pagecache and such because we cannot afford
* acquiring the locks... As quota files are never truncated and quota code
- * itself serializes the operations (and noone else should touch the files)
+ * itself serializes the operations (and no one else should touch the files)
* we don't have to be afraid of races */
static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data,
size_t len, loff_t off)
@@ -4657,10 +5374,10 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
return -EIO;
}
- mutex_lock_nested(&inode->i_mutex, I_MUTEX_QUOTA);
bh = ext4_bread(handle, inode, blk, 1, &err);
if (!bh)
goto out;
+ BUFFER_TRACE(bh, "get write access");
err = ext4_journal_get_write_access(handle, bh);
if (err) {
brelse(bh);
@@ -4673,17 +5390,13 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
err = ext4_handle_dirty_metadata(handle, NULL, bh);
brelse(bh);
out:
- if (err) {
- mutex_unlock(&inode->i_mutex);
+ if (err)
return err;
- }
if (inode->i_size < off + len) {
i_size_write(inode, off + len);
EXT4_I(inode)->i_disksize = inode->i_size;
+ ext4_mark_inode_dirty(handle, inode);
}
- inode->i_mtime = inode->i_ctime = CURRENT_TIME;
- ext4_mark_inode_dirty(handle, inode);
- mutex_unlock(&inode->i_mutex);
return len;
}
@@ -4696,14 +5409,6 @@ static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags,
}
#if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
-static struct file_system_type ext2_fs_type = {
- .owner = THIS_MODULE,
- .name = "ext2",
- .mount = ext4_mount,
- .kill_sb = kill_block_super,
- .fs_flags = FS_REQUIRES_DEV,
-};
-
static inline void register_as_ext2(void)
{
int err = register_filesystem(&ext2_fs_type);
@@ -4716,10 +5421,21 @@ static inline void unregister_as_ext2(void)
{
unregister_filesystem(&ext2_fs_type);
}
-MODULE_ALIAS("ext2");
+
+static inline int ext2_feature_set_ok(struct super_block *sb)
+{
+ if (EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT2_FEATURE_INCOMPAT_SUPP))
+ return 0;
+ if (sb->s_flags & MS_RDONLY)
+ return 1;
+ if (EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT2_FEATURE_RO_COMPAT_SUPP))
+ return 0;
+ return 1;
+}
#else
static inline void register_as_ext2(void) { }
static inline void unregister_as_ext2(void) { }
+static inline int ext2_feature_set_ok(struct super_block *sb) { return 0; }
#endif
#if !defined(CONFIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
@@ -4735,10 +5451,23 @@ static inline void unregister_as_ext3(void)
{
unregister_filesystem(&ext3_fs_type);
}
-MODULE_ALIAS("ext3");
+
+static inline int ext3_feature_set_ok(struct super_block *sb)
+{
+ if (EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT3_FEATURE_INCOMPAT_SUPP))
+ return 0;
+ if (!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL))
+ return 0;
+ if (sb->s_flags & MS_RDONLY)
+ return 1;
+ if (EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT3_FEATURE_RO_COMPAT_SUPP))
+ return 0;
+ return 1;
+}
#else
static inline void register_as_ext3(void) { }
static inline void unregister_as_ext3(void) { }
+static inline int ext3_feature_set_ok(struct super_block *sb) { return 0; }
#endif
static struct file_system_type ext4_fs_type = {
@@ -4748,8 +5477,9 @@ static struct file_system_type ext4_fs_type = {
.kill_sb = kill_block_super,
.fs_flags = FS_REQUIRES_DEV,
};
+MODULE_ALIAS_FS("ext4");
-int __init ext4_init_feat_adverts(void)
+static int __init ext4_init_feat_adverts(void)
{
struct ext4_features *ef;
int ret = -ENOMEM;
@@ -4773,59 +5503,89 @@ out:
return ret;
}
+static void ext4_exit_feat_adverts(void)
+{
+ kobject_put(&ext4_feat->f_kobj);
+ wait_for_completion(&ext4_feat->f_kobj_unregister);
+ kfree(ext4_feat);
+}
+
+/* Shared across all ext4 file systems */
+wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ];
+struct mutex ext4__aio_mutex[EXT4_WQ_HASH_SZ];
+
static int __init ext4_init_fs(void)
{
- int err;
+ int i, err;
+ ext4_li_info = NULL;
+ mutex_init(&ext4_li_mtx);
+
+ /* Build-time check for flags consistency */
ext4_check_flag_values();
- err = ext4_init_pageio();
+
+ for (i = 0; i < EXT4_WQ_HASH_SZ; i++) {
+ mutex_init(&ext4__aio_mutex[i]);
+ init_waitqueue_head(&ext4__ioend_wq[i]);
+ }
+
+ err = ext4_init_es();
if (err)
return err;
+
+ err = ext4_init_pageio();
+ if (err)
+ goto out7;
+
err = ext4_init_system_zone();
if (err)
- goto out5;
+ goto out6;
ext4_kset = kset_create_and_add("ext4", NULL, fs_kobj);
- if (!ext4_kset)
- goto out4;
+ if (!ext4_kset) {
+ err = -ENOMEM;
+ goto out5;
+ }
ext4_proc_root = proc_mkdir("fs/ext4", NULL);
err = ext4_init_feat_adverts();
-
- err = ext4_init_mballoc();
if (err)
- goto out3;
+ goto out4;
- err = ext4_init_xattr();
+ err = ext4_init_mballoc();
if (err)
goto out2;
+ else
+ ext4_mballoc_ready = 1;
err = init_inodecache();
if (err)
goto out1;
- register_as_ext2();
register_as_ext3();
+ register_as_ext2();
err = register_filesystem(&ext4_fs_type);
if (err)
goto out;
- ext4_li_info = NULL;
- mutex_init(&ext4_li_mtx);
return 0;
out:
unregister_as_ext2();
unregister_as_ext3();
destroy_inodecache();
out1:
- ext4_exit_xattr();
-out2:
+ ext4_mballoc_ready = 0;
ext4_exit_mballoc();
-out3:
- kfree(ext4_feat);
- remove_proc_entry("fs/ext4", NULL);
- kset_unregister(ext4_kset);
+out2:
+ ext4_exit_feat_adverts();
out4:
- ext4_exit_system_zone();
+ if (ext4_proc_root)
+ remove_proc_entry("fs/ext4", NULL);
+ kset_unregister(ext4_kset);
out5:
+ ext4_exit_system_zone();
+out6:
ext4_exit_pageio();
+out7:
+ ext4_exit_es();
+
return err;
}
@@ -4836,12 +5596,13 @@ static void __exit ext4_exit_fs(void)
unregister_as_ext3();
unregister_filesystem(&ext4_fs_type);
destroy_inodecache();
- ext4_exit_xattr();
ext4_exit_mballoc();
+ ext4_exit_feat_adverts();
remove_proc_entry("fs/ext4", NULL);
kset_unregister(ext4_kset);
ext4_exit_system_zone();
ext4_exit_pageio();
+ ext4_exit_es();
}
MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others");
diff --git a/fs/ext4/symlink.c b/fs/ext4/symlink.c
index ed9354aff27..ff371193201 100644
--- a/fs/ext4/symlink.c
+++ b/fs/ext4/symlink.c
@@ -35,22 +35,18 @@ const struct inode_operations ext4_symlink_inode_operations = {
.follow_link = page_follow_link_light,
.put_link = page_put_link,
.setattr = ext4_setattr,
-#ifdef CONFIG_EXT4_FS_XATTR
.setxattr = generic_setxattr,
.getxattr = generic_getxattr,
.listxattr = ext4_listxattr,
.removexattr = generic_removexattr,
-#endif
};
const struct inode_operations ext4_fast_symlink_inode_operations = {
.readlink = generic_readlink,
.follow_link = ext4_follow_link,
.setattr = ext4_setattr,
-#ifdef CONFIG_EXT4_FS_XATTR
.setxattr = generic_setxattr,
.getxattr = generic_getxattr,
.listxattr = ext4_listxattr,
.removexattr = generic_removexattr,
-#endif
};
diff --git a/fs/ext4/truncate.h b/fs/ext4/truncate.h
new file mode 100644
index 00000000000..011ba6670d9
--- /dev/null
+++ b/fs/ext4/truncate.h
@@ -0,0 +1,43 @@
+/*
+ * linux/fs/ext4/truncate.h
+ *
+ * Common inline functions needed for truncate support
+ */
+
+/*
+ * Truncate blocks that were not used by write. We have to truncate the
+ * pagecache as well so that corresponding buffers get properly unmapped.
+ */
+static inline void ext4_truncate_failed_write(struct inode *inode)
+{
+ truncate_inode_pages(inode->i_mapping, inode->i_size);
+ ext4_truncate(inode);
+}
+
+/*
+ * Work out how many blocks we need to proceed with the next chunk of a
+ * truncate transaction.
+ */
+static inline unsigned long ext4_blocks_for_truncate(struct inode *inode)
+{
+ ext4_lblk_t needed;
+
+ needed = inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9);
+
+ /* Give ourselves just enough room to cope with inodes in which
+ * i_blocks is corrupt: we've seen disk corruptions in the past
+ * which resulted in random data in an inode which looked enough
+ * like a regular file for ext4 to try to delete it. Things
+ * will go a bit crazy if that happens, but at least we should
+ * try not to panic the whole kernel. */
+ if (needed < 2)
+ needed = 2;
+
+ /* But we need to bound the transaction so we don't overflow the
+ * journal. */
+ if (needed > EXT4_MAX_TRANS_DATA)
+ needed = EXT4_MAX_TRANS_DATA;
+
+ return EXT4_DATA_TRANS_BLOCKS(inode->i_sb) + needed;
+}
+
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index fa4b899da4b..e7387337060 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -61,11 +61,6 @@
#include "xattr.h"
#include "acl.h"
-#define BHDR(bh) ((struct ext4_xattr_header *)((bh)->b_data))
-#define ENTRY(ptr) ((struct ext4_xattr_entry *)(ptr))
-#define BFIRST(bh) ENTRY(BHDR(bh)+1)
-#define IS_LAST_ENTRY(entry) (*(__u32 *)(entry) == 0)
-
#ifdef EXT4_XATTR_DEBUG
# define ea_idebug(inode, f...) do { \
printk(KERN_DEBUG "inode %s:%lu: ", \
@@ -82,11 +77,11 @@
printk("\n"); \
} while (0)
#else
-# define ea_idebug(f...)
-# define ea_bdebug(f...)
+# define ea_idebug(inode, fmt, ...) no_printk(fmt, ##__VA_ARGS__)
+# define ea_bdebug(bh, fmt, ...) no_printk(fmt, ##__VA_ARGS__)
#endif
-static void ext4_xattr_cache_insert(struct buffer_head *);
+static void ext4_xattr_cache_insert(struct mb_cache *, struct buffer_head *);
static struct buffer_head *ext4_xattr_cache_find(struct inode *,
struct ext4_xattr_header *,
struct mb_cache_entry **);
@@ -95,13 +90,11 @@ static void ext4_xattr_rehash(struct ext4_xattr_header *,
static int ext4_xattr_list(struct dentry *dentry, char *buffer,
size_t buffer_size);
-static struct mb_cache *ext4_xattr_cache;
-
static const struct xattr_handler *ext4_xattr_handler_map[] = {
[EXT4_XATTR_INDEX_USER] = &ext4_xattr_user_handler,
#ifdef CONFIG_EXT4_FS_POSIX_ACL
- [EXT4_XATTR_INDEX_POSIX_ACL_ACCESS] = &ext4_xattr_acl_access_handler,
- [EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT] = &ext4_xattr_acl_default_handler,
+ [EXT4_XATTR_INDEX_POSIX_ACL_ACCESS] = &posix_acl_access_xattr_handler,
+ [EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT] = &posix_acl_default_xattr_handler,
#endif
[EXT4_XATTR_INDEX_TRUSTED] = &ext4_xattr_trusted_handler,
#ifdef CONFIG_EXT4_FS_SECURITY
@@ -113,8 +106,8 @@ const struct xattr_handler *ext4_xattr_handlers[] = {
&ext4_xattr_user_handler,
&ext4_xattr_trusted_handler,
#ifdef CONFIG_EXT4_FS_POSIX_ACL
- &ext4_xattr_acl_access_handler,
- &ext4_xattr_acl_default_handler,
+ &posix_acl_access_xattr_handler,
+ &posix_acl_default_xattr_handler,
#endif
#ifdef CONFIG_EXT4_FS_SECURITY
&ext4_xattr_security_handler,
@@ -122,6 +115,59 @@ const struct xattr_handler *ext4_xattr_handlers[] = {
NULL
};
+#define EXT4_GET_MB_CACHE(inode) (((struct ext4_sb_info *) \
+ inode->i_sb->s_fs_info)->s_mb_cache)
+
+static __le32 ext4_xattr_block_csum(struct inode *inode,
+ sector_t block_nr,
+ struct ext4_xattr_header *hdr)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ __u32 csum;
+ __le32 save_csum;
+ __le64 dsk_block_nr = cpu_to_le64(block_nr);
+
+ save_csum = hdr->h_checksum;
+ hdr->h_checksum = 0;
+ csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&dsk_block_nr,
+ sizeof(dsk_block_nr));
+ csum = ext4_chksum(sbi, csum, (__u8 *)hdr,
+ EXT4_BLOCK_SIZE(inode->i_sb));
+
+ hdr->h_checksum = save_csum;
+ return cpu_to_le32(csum);
+}
+
+static int ext4_xattr_block_csum_verify(struct inode *inode,
+ sector_t block_nr,
+ struct ext4_xattr_header *hdr)
+{
+ if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
+ EXT4_FEATURE_RO_COMPAT_METADATA_CSUM) &&
+ (hdr->h_checksum != ext4_xattr_block_csum(inode, block_nr, hdr)))
+ return 0;
+ return 1;
+}
+
+static void ext4_xattr_block_csum_set(struct inode *inode,
+ sector_t block_nr,
+ struct ext4_xattr_header *hdr)
+{
+ if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
+ EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+ return;
+
+ hdr->h_checksum = ext4_xattr_block_csum(inode, block_nr, hdr);
+}
+
+static inline int ext4_handle_dirty_xattr_block(handle_t *handle,
+ struct inode *inode,
+ struct buffer_head *bh)
+{
+ ext4_xattr_block_csum_set(inode, bh->b_blocknr, BHDR(bh));
+ return ext4_handle_dirty_metadata(handle, inode, bh);
+}
+
static inline const struct xattr_handler *
ext4_xattr_handler(int name_index)
{
@@ -156,14 +202,21 @@ ext4_xattr_check_names(struct ext4_xattr_entry *entry, void *end)
}
static inline int
-ext4_xattr_check_block(struct buffer_head *bh)
+ext4_xattr_check_block(struct inode *inode, struct buffer_head *bh)
{
int error;
+ if (buffer_verified(bh))
+ return 0;
+
if (BHDR(bh)->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC) ||
BHDR(bh)->h_blocks != cpu_to_le32(1))
return -EIO;
+ if (!ext4_xattr_block_csum_verify(inode, bh->b_blocknr, BHDR(bh)))
+ return -EIO;
error = ext4_xattr_check_names(BFIRST(bh), bh->b_data + bh->b_size);
+ if (!error)
+ set_buffer_verified(bh);
return error;
}
@@ -213,6 +266,7 @@ ext4_xattr_block_get(struct inode *inode, int name_index, const char *name,
struct ext4_xattr_entry *entry;
size_t size;
int error;
+ struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode);
ea_idebug(inode, "name=%d.%s, buffer=%p, buffer_size=%ld",
name_index, name, buffer, (long)buffer_size);
@@ -220,20 +274,21 @@ ext4_xattr_block_get(struct inode *inode, int name_index, const char *name,
error = -ENODATA;
if (!EXT4_I(inode)->i_file_acl)
goto cleanup;
- ea_idebug(inode, "reading block %u", EXT4_I(inode)->i_file_acl);
+ ea_idebug(inode, "reading block %llu",
+ (unsigned long long)EXT4_I(inode)->i_file_acl);
bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
if (!bh)
goto cleanup;
ea_bdebug(bh, "b_count=%d, refcount=%d",
atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount));
- if (ext4_xattr_check_block(bh)) {
+ if (ext4_xattr_check_block(inode, bh)) {
bad_block:
EXT4_ERROR_INODE(inode, "bad block %llu",
EXT4_I(inode)->i_file_acl);
error = -EIO;
goto cleanup;
}
- ext4_xattr_cache_insert(bh);
+ ext4_xattr_cache_insert(ext4_mb_cache, bh);
entry = BFIRST(bh);
error = ext4_xattr_find_entry(&entry, name_index, name, bh->b_size, 1);
if (error == -EIO)
@@ -255,7 +310,7 @@ cleanup:
return error;
}
-static int
+int
ext4_xattr_ibody_get(struct inode *inode, int name_index, const char *name,
void *buffer, size_t buffer_size)
{
@@ -314,6 +369,9 @@ ext4_xattr_get(struct inode *inode, int name_index, const char *name,
{
int error;
+ if (strlen(name) > 255)
+ return -ERANGE;
+
down_read(&EXT4_I(inode)->xattr_sem);
error = ext4_xattr_ibody_get(inode, name_index, name, buffer,
buffer_size);
@@ -356,6 +414,7 @@ ext4_xattr_block_list(struct dentry *dentry, char *buffer, size_t buffer_size)
struct inode *inode = dentry->d_inode;
struct buffer_head *bh = NULL;
int error;
+ struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode);
ea_idebug(inode, "buffer=%p, buffer_size=%ld",
buffer, (long)buffer_size);
@@ -363,20 +422,21 @@ ext4_xattr_block_list(struct dentry *dentry, char *buffer, size_t buffer_size)
error = 0;
if (!EXT4_I(inode)->i_file_acl)
goto cleanup;
- ea_idebug(inode, "reading block %u", EXT4_I(inode)->i_file_acl);
+ ea_idebug(inode, "reading block %llu",
+ (unsigned long long)EXT4_I(inode)->i_file_acl);
bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
error = -EIO;
if (!bh)
goto cleanup;
ea_bdebug(bh, "b_count=%d, refcount=%d",
atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount));
- if (ext4_xattr_check_block(bh)) {
+ if (ext4_xattr_check_block(inode, bh)) {
EXT4_ERROR_INODE(inode, "bad block %llu",
EXT4_I(inode)->i_file_acl);
error = -EIO;
goto cleanup;
}
- ext4_xattr_cache_insert(bh);
+ ext4_xattr_cache_insert(ext4_mb_cache, bh);
error = ext4_xattr_list_entries(dentry, BFIRST(bh), buffer, buffer_size);
cleanup:
@@ -427,23 +487,23 @@ cleanup:
static int
ext4_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size)
{
- int i_error, b_error;
+ int ret, ret2;
down_read(&EXT4_I(dentry->d_inode)->xattr_sem);
- i_error = ext4_xattr_ibody_list(dentry, buffer, buffer_size);
- if (i_error < 0) {
- b_error = 0;
- } else {
- if (buffer) {
- buffer += i_error;
- buffer_size -= i_error;
- }
- b_error = ext4_xattr_block_list(dentry, buffer, buffer_size);
- if (b_error < 0)
- i_error = 0;
+ ret = ret2 = ext4_xattr_ibody_list(dentry, buffer, buffer_size);
+ if (ret < 0)
+ goto errout;
+ if (buffer) {
+ buffer += ret;
+ buffer_size -= ret;
}
+ ret = ext4_xattr_block_list(dentry, buffer, buffer_size);
+ if (ret < 0)
+ goto errout;
+ ret += ret2;
+errout:
up_read(&EXT4_I(dentry->d_inode)->xattr_sem);
- return i_error + b_error;
+ return ret;
}
/*
@@ -456,6 +516,7 @@ static void ext4_xattr_update_super_block(handle_t *handle,
if (EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_EXT_ATTR))
return;
+ BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get_write_access");
if (ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh) == 0) {
EXT4_SET_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_EXT_ATTR);
ext4_handle_dirty_super(handle, sb);
@@ -463,8 +524,8 @@ static void ext4_xattr_update_super_block(handle_t *handle,
}
/*
- * Release the xattr block BH: If the reference count is > 1, decrement
- * it; otherwise free the block.
+ * Release the xattr block BH: If the reference count is > 1, decrement it;
+ * otherwise free the block.
*/
static void
ext4_xattr_release_block(handle_t *handle, struct inode *inode,
@@ -472,8 +533,10 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode,
{
struct mb_cache_entry *ce = NULL;
int error = 0;
+ struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode);
- ce = mb_cache_entry_get(ext4_xattr_cache, bh->b_bdev, bh->b_blocknr);
+ ce = mb_cache_entry_get(ext4_mb_cache, bh->b_bdev, bh->b_blocknr);
+ BUFFER_TRACE(bh, "get_write_access");
error = ext4_journal_get_write_access(handle, bh);
if (error)
goto out;
@@ -484,21 +547,37 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode,
if (ce)
mb_cache_entry_free(ce);
get_bh(bh);
+ unlock_buffer(bh);
ext4_free_blocks(handle, inode, bh, 0, 1,
EXT4_FREE_BLOCKS_METADATA |
EXT4_FREE_BLOCKS_FORGET);
} else {
le32_add_cpu(&BHDR(bh)->h_refcount, -1);
- error = ext4_handle_dirty_metadata(handle, inode, bh);
+ if (ce)
+ mb_cache_entry_release(ce);
+ /*
+ * Beware of this ugliness: Releasing of xattr block references
+ * from different inodes can race and so we have to protect
+ * from a race where someone else frees the block (and releases
+ * its journal_head) before we are done dirtying the buffer. In
+ * nojournal mode this race is harmless and we actually cannot
+ * call ext4_handle_dirty_xattr_block() with locked buffer as
+ * that function can call sync_dirty_buffer() so for that case
+ * we handle the dirtying after unlocking the buffer.
+ */
+ if (ext4_handle_valid(handle))
+ error = ext4_handle_dirty_xattr_block(handle, inode,
+ bh);
+ unlock_buffer(bh);
+ if (!ext4_handle_valid(handle))
+ error = ext4_handle_dirty_xattr_block(handle, inode,
+ bh);
if (IS_SYNC(inode))
ext4_handle_sync(handle);
- dquot_free_block(inode, 1);
+ dquot_free_block(inode, EXT4_C2B(EXT4_SB(inode->i_sb), 1));
ea_bdebug(bh, "refcount now=%d; releasing",
le32_to_cpu(BHDR(bh)->h_refcount));
- if (ce)
- mb_cache_entry_release(ce);
}
- unlock_buffer(bh);
out:
ext4_std_error(inode->i_sb, error);
return;
@@ -512,31 +591,17 @@ static size_t ext4_xattr_free_space(struct ext4_xattr_entry *last,
size_t *min_offs, void *base, int *total)
{
for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) {
- *total += EXT4_XATTR_LEN(last->e_name_len);
if (!last->e_value_block && last->e_value_size) {
size_t offs = le16_to_cpu(last->e_value_offs);
if (offs < *min_offs)
*min_offs = offs;
}
+ if (total)
+ *total += EXT4_XATTR_LEN(last->e_name_len);
}
return (*min_offs - ((void *)last - base) - sizeof(__u32));
}
-struct ext4_xattr_info {
- int name_index;
- const char *name;
- const void *value;
- size_t value_len;
-};
-
-struct ext4_xattr_search {
- struct ext4_xattr_entry *first;
- void *base;
- void *end;
- struct ext4_xattr_entry *here;
- int not_found;
-};
-
static int
ext4_xattr_set_entry(struct ext4_xattr_info *i, struct ext4_xattr_search *s)
{
@@ -589,9 +654,14 @@ ext4_xattr_set_entry(struct ext4_xattr_info *i, struct ext4_xattr_search *s)
size. Just replace. */
s->here->e_value_size =
cpu_to_le32(i->value_len);
- memset(val + size - EXT4_XATTR_PAD, 0,
- EXT4_XATTR_PAD); /* Clear pad bytes. */
- memcpy(val, i->value, i->value_len);
+ if (i->value == EXT4_ZERO_XATTR_VALUE) {
+ memset(val, 0, size);
+ } else {
+ /* Clear pad bytes first. */
+ memset(val + size - EXT4_XATTR_PAD, 0,
+ EXT4_XATTR_PAD);
+ memcpy(val, i->value, i->value_len);
+ }
return 0;
}
@@ -630,9 +700,14 @@ ext4_xattr_set_entry(struct ext4_xattr_info *i, struct ext4_xattr_search *s)
size_t size = EXT4_XATTR_SIZE(i->value_len);
void *val = s->base + min_offs - size;
s->here->e_value_offs = cpu_to_le16(min_offs - size);
- memset(val + size - EXT4_XATTR_PAD, 0,
- EXT4_XATTR_PAD); /* Clear the pad bytes. */
- memcpy(val, i->value, i->value_len);
+ if (i->value == EXT4_ZERO_XATTR_VALUE) {
+ memset(val, 0, size);
+ } else {
+ /* Clear the pad bytes first. */
+ memset(val + size - EXT4_XATTR_PAD, 0,
+ EXT4_XATTR_PAD);
+ memcpy(val, i->value, i->value_len);
+ }
}
}
return 0;
@@ -662,7 +737,7 @@ ext4_xattr_block_find(struct inode *inode, struct ext4_xattr_info *i,
ea_bdebug(bs->bh, "b_count=%d, refcount=%d",
atomic_read(&(bs->bh->b_count)),
le32_to_cpu(BHDR(bs->bh)->h_refcount));
- if (ext4_xattr_check_block(bs->bh)) {
+ if (ext4_xattr_check_block(inode, bs->bh)) {
EXT4_ERROR_INODE(inode, "bad block %llu",
EXT4_I(inode)->i_file_acl);
error = -EIO;
@@ -695,14 +770,16 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
struct ext4_xattr_search *s = &bs->s;
struct mb_cache_entry *ce = NULL;
int error = 0;
+ struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode);
#define header(x) ((struct ext4_xattr_header *)(x))
if (i->value && i->value_len > sb->s_blocksize)
return -ENOSPC;
if (s->base) {
- ce = mb_cache_entry_get(ext4_xattr_cache, bs->bh->b_bdev,
+ ce = mb_cache_entry_get(ext4_mb_cache, bs->bh->b_bdev,
bs->bh->b_blocknr);
+ BUFFER_TRACE(bs->bh, "get_write_access");
error = ext4_journal_get_write_access(handle, bs->bh);
if (error)
goto cleanup;
@@ -719,15 +796,16 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
if (!IS_LAST_ENTRY(s->first))
ext4_xattr_rehash(header(s->base),
s->here);
- ext4_xattr_cache_insert(bs->bh);
+ ext4_xattr_cache_insert(ext4_mb_cache,
+ bs->bh);
}
unlock_buffer(bs->bh);
if (error == -EIO)
goto bad_block;
if (!error)
- error = ext4_handle_dirty_metadata(handle,
- inode,
- bs->bh);
+ error = ext4_handle_dirty_xattr_block(handle,
+ inode,
+ bs->bh);
if (error)
goto cleanup;
goto inserted;
@@ -735,7 +813,6 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
int offset = (char *)s->here - bs->bh->b_data;
unlock_buffer(bs->bh);
- jbd2_journal_release_buffer(handle, bs->bh);
if (ce) {
mb_cache_entry_release(ce);
ce = NULL;
@@ -784,9 +861,11 @@ inserted:
else {
/* The old block is released after updating
the inode. */
- error = dquot_alloc_block(inode, 1);
+ error = dquot_alloc_block(inode,
+ EXT4_C2B(EXT4_SB(sb), 1));
if (error)
goto cleanup;
+ BUFFER_TRACE(new_bh, "get_write_access");
error = ext4_journal_get_write_access(handle,
new_bh);
if (error)
@@ -796,9 +875,9 @@ inserted:
ea_bdebug(new_bh, "reusing; refcount now=%d",
le32_to_cpu(BHDR(new_bh)->h_refcount));
unlock_buffer(new_bh);
- error = ext4_handle_dirty_metadata(handle,
- inode,
- new_bh);
+ error = ext4_handle_dirty_xattr_block(handle,
+ inode,
+ new_bh);
if (error)
goto cleanup_dquot;
}
@@ -820,36 +899,44 @@ inserted:
if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
goal = goal & EXT4_MAX_BLOCK_FILE_PHYS;
- block = ext4_new_meta_blocks(handle, inode,
- goal, NULL, &error);
+ /*
+ * take i_data_sem because we will test
+ * i_delalloc_reserved_flag in ext4_mb_new_blocks
+ */
+ down_read(&EXT4_I(inode)->i_data_sem);
+ block = ext4_new_meta_blocks(handle, inode, goal, 0,
+ NULL, &error);
+ up_read((&EXT4_I(inode)->i_data_sem));
if (error)
goto cleanup;
if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
BUG_ON(block > EXT4_MAX_BLOCK_FILE_PHYS);
- ea_idebug(inode, "creating block %d", block);
+ ea_idebug(inode, "creating block %llu",
+ (unsigned long long)block);
new_bh = sb_getblk(sb, block);
- if (!new_bh) {
+ if (unlikely(!new_bh)) {
+ error = -ENOMEM;
getblk_failed:
- ext4_free_blocks(handle, inode, 0, block, 1,
+ ext4_free_blocks(handle, inode, NULL, block, 1,
EXT4_FREE_BLOCKS_METADATA);
- error = -EIO;
goto cleanup;
}
lock_buffer(new_bh);
error = ext4_journal_get_create_access(handle, new_bh);
if (error) {
unlock_buffer(new_bh);
+ error = -EIO;
goto getblk_failed;
}
memcpy(new_bh->b_data, s->base, new_bh->b_size);
set_buffer_uptodate(new_bh);
unlock_buffer(new_bh);
- ext4_xattr_cache_insert(new_bh);
- error = ext4_handle_dirty_metadata(handle,
- inode, new_bh);
+ ext4_xattr_cache_insert(ext4_mb_cache, new_bh);
+ error = ext4_handle_dirty_xattr_block(handle,
+ inode, new_bh);
if (error)
goto cleanup;
}
@@ -873,7 +960,7 @@ cleanup:
return error;
cleanup_dquot:
- dquot_free_block(inode, 1);
+ dquot_free_block(inode, EXT4_C2B(EXT4_SB(sb), 1));
goto cleanup;
bad_block:
@@ -884,14 +971,8 @@ bad_block:
#undef header
}
-struct ext4_xattr_ibody_find {
- struct ext4_xattr_search s;
- struct ext4_iloc iloc;
-};
-
-static int
-ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i,
- struct ext4_xattr_ibody_find *is)
+int ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i,
+ struct ext4_xattr_ibody_find *is)
{
struct ext4_xattr_ibody_header *header;
struct ext4_inode *raw_inode;
@@ -919,10 +1000,47 @@ ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i,
return 0;
}
-static int
-ext4_xattr_ibody_set(handle_t *handle, struct inode *inode,
- struct ext4_xattr_info *i,
- struct ext4_xattr_ibody_find *is)
+int ext4_xattr_ibody_inline_set(handle_t *handle, struct inode *inode,
+ struct ext4_xattr_info *i,
+ struct ext4_xattr_ibody_find *is)
+{
+ struct ext4_xattr_ibody_header *header;
+ struct ext4_xattr_search *s = &is->s;
+ int error;
+
+ if (EXT4_I(inode)->i_extra_isize == 0)
+ return -ENOSPC;
+ error = ext4_xattr_set_entry(i, s);
+ if (error) {
+ if (error == -ENOSPC &&
+ ext4_has_inline_data(inode)) {
+ error = ext4_try_to_evict_inline_data(handle, inode,
+ EXT4_XATTR_LEN(strlen(i->name) +
+ EXT4_XATTR_SIZE(i->value_len)));
+ if (error)
+ return error;
+ error = ext4_xattr_ibody_find(inode, i, is);
+ if (error)
+ return error;
+ error = ext4_xattr_set_entry(i, s);
+ }
+ if (error)
+ return error;
+ }
+ header = IHDR(inode, ext4_raw_inode(&is->iloc));
+ if (!IS_LAST_ENTRY(s->first)) {
+ header->h_magic = cpu_to_le32(EXT4_XATTR_MAGIC);
+ ext4_set_inode_state(inode, EXT4_STATE_XATTR);
+ } else {
+ header->h_magic = cpu_to_le32(0);
+ ext4_clear_inode_state(inode, EXT4_STATE_XATTR);
+ }
+ return 0;
+}
+
+static int ext4_xattr_ibody_set(handle_t *handle, struct inode *inode,
+ struct ext4_xattr_info *i,
+ struct ext4_xattr_ibody_find *is)
{
struct ext4_xattr_ibody_header *header;
struct ext4_xattr_search *s = &is->s;
@@ -947,7 +1065,7 @@ ext4_xattr_ibody_set(handle_t *handle, struct inode *inode,
/*
* ext4_xattr_set_handle()
*
- * Create, replace or remove an extended attribute for this inode. Buffer
+ * Create, replace or remove an extended attribute for this inode. Value
* is NULL to remove an existing extended attribute, and non-NULL to
* either replace an existing extended attribute, or create a new extended
* attribute. The flags XATTR_REPLACE and XATTR_CREATE
@@ -985,11 +1103,7 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
no_expand = ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND);
ext4_set_inode_state(inode, EXT4_STATE_NO_EXPAND);
- error = ext4_get_inode_loc(inode, &is.iloc);
- if (error)
- goto cleanup;
-
- error = ext4_journal_get_write_access(handle, is.iloc.bh);
+ error = ext4_reserve_inode_write(handle, inode, &is.iloc);
if (error)
goto cleanup;
@@ -1082,9 +1196,10 @@ ext4_xattr_set(struct inode *inode, int name_index, const char *name,
{
handle_t *handle;
int error, retries = 0;
+ int credits = ext4_jbd2_credits_xattr(inode);
retry:
- handle = ext4_journal_start(inode, EXT4_DATA_TRANS_BLOCKS(inode->i_sb));
+ handle = ext4_journal_start(inode, EXT4_HT_XATTR, credits);
if (IS_ERR(handle)) {
error = PTR_ERR(handle);
} else {
@@ -1142,7 +1257,7 @@ int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
struct ext4_xattr_block_find *bs = NULL;
char *buffer = NULL, *b_entry_name = NULL;
size_t min_offs, free;
- int total_ino, total_blk;
+ int total_ino;
void *base, *start, *end;
int extra_isize = 0, error = 0, tried_min_extra_isize = 0;
int s_min_extra_isize = le16_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_min_extra_isize);
@@ -1190,7 +1305,7 @@ retry:
error = -EIO;
if (!bh)
goto cleanup;
- if (ext4_xattr_check_block(bh)) {
+ if (ext4_xattr_check_block(inode, bh)) {
EXT4_ERROR_INODE(inode, "bad block %llu",
EXT4_I(inode)->i_file_acl);
error = -EIO;
@@ -1200,8 +1315,7 @@ retry:
first = BFIRST(bh);
end = bh->b_data + bh->b_size;
min_offs = end - base;
- free = ext4_xattr_free_space(first, &min_offs, base,
- &total_blk);
+ free = ext4_xattr_free_space(first, &min_offs, base, NULL);
if (free < new_extra_isize) {
if (!tried_min_extra_isize && s_min_extra_isize) {
tried_min_extra_isize++;
@@ -1264,6 +1378,9 @@ retry:
s_min_extra_isize) {
tried_min_extra_isize++;
new_extra_isize = s_min_extra_isize;
+ kfree(is); is = NULL;
+ kfree(bs); bs = NULL;
+ brelse(bh);
goto retry;
}
error = -1;
@@ -1406,13 +1523,13 @@ ext4_xattr_put_super(struct super_block *sb)
* Returns 0, or a negative error number on failure.
*/
static void
-ext4_xattr_cache_insert(struct buffer_head *bh)
+ext4_xattr_cache_insert(struct mb_cache *ext4_mb_cache, struct buffer_head *bh)
{
__u32 hash = le32_to_cpu(BHDR(bh)->h_hash);
struct mb_cache_entry *ce;
int error;
- ce = mb_cache_entry_alloc(ext4_xattr_cache, GFP_NOFS);
+ ce = mb_cache_entry_alloc(ext4_mb_cache, GFP_NOFS);
if (!ce) {
ea_bdebug(bh, "out of memory");
return;
@@ -1484,12 +1601,13 @@ ext4_xattr_cache_find(struct inode *inode, struct ext4_xattr_header *header,
{
__u32 hash = le32_to_cpu(header->h_hash);
struct mb_cache_entry *ce;
+ struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode);
if (!header->h_hash)
return NULL; /* never share */
ea_idebug(inode, "looking for cached blocks [%x]", (int)hash);
again:
- ce = mb_cache_entry_find_first(ext4_xattr_cache, inode->i_sb->s_bdev,
+ ce = mb_cache_entry_find_first(ext4_mb_cache, inode->i_sb->s_bdev,
hash);
while (ce) {
struct buffer_head *bh;
@@ -1587,19 +1705,17 @@ static void ext4_xattr_rehash(struct ext4_xattr_header *header,
#undef BLOCK_HASH_SHIFT
-int __init
-ext4_init_xattr(void)
+#define HASH_BUCKET_BITS 10
+
+struct mb_cache *
+ext4_xattr_create_cache(char *name)
{
- ext4_xattr_cache = mb_cache_create("ext4_xattr", 6);
- if (!ext4_xattr_cache)
- return -ENOMEM;
- return 0;
+ return mb_cache_create(name, HASH_BUCKET_BITS);
}
-void
-ext4_exit_xattr(void)
+void ext4_xattr_destroy_cache(struct mb_cache *cache)
{
- if (ext4_xattr_cache)
- mb_cache_destroy(ext4_xattr_cache);
- ext4_xattr_cache = NULL;
+ if (cache)
+ mb_cache_destroy(cache);
}
+
diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
index 1ef16520b95..29bedf5589f 100644
--- a/fs/ext4/xattr.h
+++ b/fs/ext4/xattr.h
@@ -21,13 +21,17 @@
#define EXT4_XATTR_INDEX_TRUSTED 4
#define EXT4_XATTR_INDEX_LUSTRE 5
#define EXT4_XATTR_INDEX_SECURITY 6
+#define EXT4_XATTR_INDEX_SYSTEM 7
+#define EXT4_XATTR_INDEX_RICHACL 8
struct ext4_xattr_header {
__le32 h_magic; /* magic number for identification */
__le32 h_refcount; /* reference count */
__le32 h_blocks; /* number of disk blocks used */
__le32 h_hash; /* hash value of all attributes */
- __u32 h_reserved[4]; /* zero right now */
+ __le32 h_checksum; /* crc32c(uuid+id+xattrblock) */
+ /* id = inum if refcount=1, blknum otherwise */
+ __u32 h_reserved[3]; /* zero right now */
};
struct ext4_xattr_ibody_header {
@@ -63,12 +67,35 @@ struct ext4_xattr_entry {
EXT4_I(inode)->i_extra_isize))
#define IFIRST(hdr) ((struct ext4_xattr_entry *)((hdr)+1))
-# ifdef CONFIG_EXT4_FS_XATTR
+#define BHDR(bh) ((struct ext4_xattr_header *)((bh)->b_data))
+#define ENTRY(ptr) ((struct ext4_xattr_entry *)(ptr))
+#define BFIRST(bh) ENTRY(BHDR(bh)+1)
+#define IS_LAST_ENTRY(entry) (*(__u32 *)(entry) == 0)
+
+#define EXT4_ZERO_XATTR_VALUE ((void *)-1)
+
+struct ext4_xattr_info {
+ int name_index;
+ const char *name;
+ const void *value;
+ size_t value_len;
+};
+
+struct ext4_xattr_search {
+ struct ext4_xattr_entry *first;
+ void *base;
+ void *end;
+ struct ext4_xattr_entry *here;
+ int not_found;
+};
+
+struct ext4_xattr_ibody_find {
+ struct ext4_xattr_search s;
+ struct ext4_iloc iloc;
+};
extern const struct xattr_handler ext4_xattr_user_handler;
extern const struct xattr_handler ext4_xattr_trusted_handler;
-extern const struct xattr_handler ext4_xattr_acl_access_handler;
-extern const struct xattr_handler ext4_xattr_acl_default_handler;
extern const struct xattr_handler ext4_xattr_security_handler;
extern ssize_t ext4_listxattr(struct dentry *, char *, size_t);
@@ -83,72 +110,26 @@ extern void ext4_xattr_put_super(struct super_block *);
extern int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
struct ext4_inode *raw_inode, handle_t *handle);
-extern int __init ext4_init_xattr(void);
-extern void ext4_exit_xattr(void);
-
extern const struct xattr_handler *ext4_xattr_handlers[];
-# else /* CONFIG_EXT4_FS_XATTR */
-
-static inline int
-ext4_xattr_get(struct inode *inode, int name_index, const char *name,
- void *buffer, size_t size, int flags)
-{
- return -EOPNOTSUPP;
-}
-
-static inline int
-ext4_xattr_set(struct inode *inode, int name_index, const char *name,
- const void *value, size_t size, int flags)
-{
- return -EOPNOTSUPP;
-}
-
-static inline int
-ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
- const char *name, const void *value, size_t size, int flags)
-{
- return -EOPNOTSUPP;
-}
-
-static inline void
-ext4_xattr_delete_inode(handle_t *handle, struct inode *inode)
-{
-}
-
-static inline void
-ext4_xattr_put_super(struct super_block *sb)
-{
-}
-
-static __init inline int
-ext4_init_xattr(void)
-{
- return 0;
-}
-
-static inline void
-ext4_exit_xattr(void)
-{
-}
-
-static inline int
-ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
- struct ext4_inode *raw_inode, handle_t *handle)
-{
- return -EOPNOTSUPP;
-}
-
-#define ext4_xattr_handlers NULL
+extern int ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i,
+ struct ext4_xattr_ibody_find *is);
+extern int ext4_xattr_ibody_get(struct inode *inode, int name_index,
+ const char *name,
+ void *buffer, size_t buffer_size);
+extern int ext4_xattr_ibody_inline_set(handle_t *handle, struct inode *inode,
+ struct ext4_xattr_info *i,
+ struct ext4_xattr_ibody_find *is);
-# endif /* CONFIG_EXT4_FS_XATTR */
+extern struct mb_cache *ext4_xattr_create_cache(char *name);
+extern void ext4_xattr_destroy_cache(struct mb_cache *);
#ifdef CONFIG_EXT4_FS_SECURITY
extern int ext4_init_security(handle_t *handle, struct inode *inode,
- struct inode *dir);
+ struct inode *dir, const struct qstr *qstr);
#else
static inline int ext4_init_security(handle_t *handle, struct inode *inode,
- struct inode *dir)
+ struct inode *dir, const struct qstr *qstr)
{
return 0;
}
diff --git a/fs/ext4/xattr_security.c b/fs/ext4/xattr_security.c
index 9b21268e121..d2a200624af 100644
--- a/fs/ext4/xattr_security.c
+++ b/fs/ext4/xattr_security.c
@@ -3,7 +3,6 @@
* Handler for storing security labels as extended attributes.
*/
-#include <linux/module.h>
#include <linux/string.h>
#include <linux/fs.h>
#include <linux/security.h>
@@ -48,27 +47,33 @@ ext4_xattr_security_set(struct dentry *dentry, const char *name,
name, value, size, flags);
}
-int
-ext4_init_security(handle_t *handle, struct inode *inode, struct inode *dir)
+static int
+ext4_initxattrs(struct inode *inode, const struct xattr *xattr_array,
+ void *fs_info)
{
- int err;
- size_t len;
- void *value;
- char *name;
+ const struct xattr *xattr;
+ handle_t *handle = fs_info;
+ int err = 0;
- err = security_inode_init_security(inode, dir, &name, &value, &len);
- if (err) {
- if (err == -EOPNOTSUPP)
- return 0;
- return err;
+ for (xattr = xattr_array; xattr->name != NULL; xattr++) {
+ err = ext4_xattr_set_handle(handle, inode,
+ EXT4_XATTR_INDEX_SECURITY,
+ xattr->name, xattr->value,
+ xattr->value_len, 0);
+ if (err < 0)
+ break;
}
- err = ext4_xattr_set_handle(handle, inode, EXT4_XATTR_INDEX_SECURITY,
- name, value, len, 0);
- kfree(name);
- kfree(value);
return err;
}
+int
+ext4_init_security(handle_t *handle, struct inode *inode, struct inode *dir,
+ const struct qstr *qstr)
+{
+ return security_inode_init_security(inode, dir, qstr,
+ &ext4_initxattrs, handle);
+}
+
const struct xattr_handler ext4_xattr_security_handler = {
.prefix = XATTR_SECURITY_PREFIX,
.list = ext4_xattr_security_list,
diff --git a/fs/ext4/xattr_trusted.c b/fs/ext4/xattr_trusted.c
index 37e6ebca2cc..95f1f4ab59a 100644
--- a/fs/ext4/xattr_trusted.c
+++ b/fs/ext4/xattr_trusted.c
@@ -5,7 +5,6 @@
* Copyright (C) 2003 by Andreas Gruenbacher, <a.gruenbacher@computer.org>
*/
-#include <linux/module.h>
#include <linux/string.h>
#include <linux/capability.h>
#include <linux/fs.h>
diff --git a/fs/ext4/xattr_user.c b/fs/ext4/xattr_user.c
index 98c375352d0..0edb7611ffb 100644
--- a/fs/ext4/xattr_user.c
+++ b/fs/ext4/xattr_user.c
@@ -5,7 +5,6 @@
* Copyright (C) 2001 by Andreas Gruenbacher, <a.gruenbacher@computer.org>
*/
-#include <linux/module.h>
#include <linux/string.h>
#include <linux/fs.h>
#include "ext4_jbd2.h"