aboutsummaryrefslogtreecommitdiff
path: root/fs/ext4
diff options
context:
space:
mode:
Diffstat (limited to 'fs/ext4')
-rw-r--r--fs/ext4/Kconfig75
-rw-r--r--fs/ext4/Makefile15
-rw-r--r--fs/ext4/acl.c501
-rw-r--r--fs/ext4/acl.h25
-rw-r--r--fs/ext4/balloc.c2263
-rw-r--r--fs/ext4/bitmap.c98
-rw-r--r--fs/ext4/block_validity.c243
-rw-r--r--fs/ext4/dir.c511
-rw-r--r--fs/ext4/ext4.h2818
-rw-r--r--fs/ext4/ext4_extents.h273
-rw-r--r--fs/ext4/ext4_jbd2.c322
-rw-r--r--fs/ext4/ext4_jbd2.h450
-rw-r--r--fs/ext4/extents.c4705
-rw-r--r--fs/ext4/extents_status.c1127
-rw-r--r--fs/ext4/extents_status.h146
-rw-r--r--fs/ext4/file.c598
-rw-r--r--fs/ext4/fsync.c107
-rw-r--r--fs/ext4/group.h29
-rw-r--r--fs/ext4/hash.c95
-rw-r--r--fs/ext4/ialloc.c1192
-rw-r--r--fs/ext4/indirect.c1391
-rw-r--r--fs/ext4/inline.c2000
-rw-r--r--fs/ext4/inode.c5935
-rw-r--r--fs/ext4/ioctl.c651
-rw-r--r--fs/ext4/mballoc.c4116
-rw-r--r--fs/ext4/mballoc.h215
-rw-r--r--fs/ext4/migrate.c338
-rw-r--r--fs/ext4/mmp.c395
-rw-r--r--fs/ext4/move_extent.c1505
-rw-r--r--fs/ext4/namei.c2538
-rw-r--r--fs/ext4/namei.h8
-rw-r--r--fs/ext4/page-io.c497
-rw-r--r--fs/ext4/resize.c1869
-rw-r--r--fs/ext4/super.c5304
-rw-r--r--fs/ext4/symlink.c12
-rw-r--r--fs/ext4/truncate.h43
-rw-r--r--fs/ext4/xattr.c533
-rw-r--r--fs/ext4/xattr.h126
-rw-r--r--fs/ext4/xattr_security.c64
-rw-r--r--fs/ext4/xattr_trusted.c31
-rw-r--r--fs/ext4/xattr_user.c36
41 files changed, 32088 insertions, 11112 deletions
diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig
new file mode 100644
index 00000000000..efea5d5c44c
--- /dev/null
+++ b/fs/ext4/Kconfig
@@ -0,0 +1,75 @@
+config EXT4_FS
+ tristate "The Extended 4 (ext4) filesystem"
+ select JBD2
+ select CRC16
+ select CRYPTO
+ select CRYPTO_CRC32C
+ help
+ This is the next generation of the ext3 filesystem.
+
+ Unlike the change from ext2 filesystem to ext3 filesystem,
+ the on-disk format of ext4 is not forwards compatible with
+ ext3; it is based on extent maps and it supports 48-bit
+ physical block numbers. The ext4 filesystem also supports delayed
+ allocation, persistent preallocation, high resolution time stamps,
+ and a number of other features to improve performance and speed
+ up fsck time. For more information, please see the web pages at
+ http://ext4.wiki.kernel.org.
+
+ The ext4 filesystem will support mounting an ext3
+ filesystem; while there will be some performance gains from
+ the delayed allocation and inode table readahead, the best
+ performance gains will require enabling ext4 features in the
+ filesystem, or formatting a new filesystem as an ext4
+ filesystem initially.
+
+ To compile this file system support as a module, choose M here. The
+ module will be called ext4.
+
+ If unsure, say N.
+
+config EXT4_USE_FOR_EXT23
+ bool "Use ext4 for ext2/ext3 file systems"
+ depends on EXT4_FS
+ depends on EXT3_FS=n || EXT2_FS=n
+ default y
+ help
+ Allow the ext4 file system driver code to be used for ext2 or
+ ext3 file system mounts. This allows users to reduce their
+ compiled kernel size by using one file system driver for
+ ext2, ext3, and ext4 file systems.
+
+config EXT4_FS_POSIX_ACL
+ bool "Ext4 POSIX Access Control Lists"
+ depends on EXT4_FS
+ select FS_POSIX_ACL
+ help
+ POSIX Access Control Lists (ACLs) support permissions for users and
+ groups beyond the owner/group/world scheme.
+
+ To learn more about Access Control Lists, visit the POSIX ACLs for
+ Linux website <http://acl.bestbits.at/>.
+
+ If you don't know what Access Control Lists are, say N
+
+config EXT4_FS_SECURITY
+ bool "Ext4 Security Labels"
+ depends on EXT4_FS
+ help
+ Security labels support alternative access control models
+ implemented by security modules like SELinux. This option
+ enables an extended attribute handler for file security
+ labels in the ext4 filesystem.
+
+ If you are not using a security module that requires using
+ extended attributes for file security labels, say N.
+
+config EXT4_DEBUG
+ bool "EXT4 debugging support"
+ depends on EXT4_FS
+ help
+ Enables run-time debugging support for the ext4 filesystem.
+
+ If you select Y here, then you will be able to turn on debugging
+ with a command such as:
+ echo 1 > /sys/module/ext4/parameters/mballoc_debug
diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile
index ac6fa8ca0a2..0310fec2ee3 100644
--- a/fs/ext4/Makefile
+++ b/fs/ext4/Makefile
@@ -2,12 +2,13 @@
# Makefile for the linux ext4-filesystem routines.
#
-obj-$(CONFIG_EXT4DEV_FS) += ext4dev.o
+obj-$(CONFIG_EXT4_FS) += ext4.o
-ext4dev-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \
- ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \
- ext4_jbd2.o migrate.o mballoc.o
+ext4-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o page-io.o \
+ ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \
+ ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o \
+ mmp.o indirect.o extents_status.o xattr.o xattr_user.o \
+ xattr_trusted.o inline.o
-ext4dev-$(CONFIG_EXT4DEV_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o
-ext4dev-$(CONFIG_EXT4DEV_FS_POSIX_ACL) += acl.o
-ext4dev-$(CONFIG_EXT4DEV_FS_SECURITY) += xattr_security.o
+ext4-$(CONFIG_EXT4_FS_POSIX_ACL) += acl.o
+ext4-$(CONFIG_EXT4_FS_SECURITY) += xattr_security.o
diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c
index a8bae8cd1d5..d40c8dbbb0d 100644
--- a/fs/ext4/acl.c
+++ b/fs/ext4/acl.c
@@ -9,8 +9,8 @@
#include <linux/slab.h>
#include <linux/capability.h>
#include <linux/fs.h>
-#include <linux/ext4_jbd2.h>
-#include <linux/ext4_fs.h>
+#include "ext4_jbd2.h"
+#include "ext4.h"
#include "xattr.h"
#include "acl.h"
@@ -37,37 +37,45 @@ ext4_acl_from_disk(const void *value, size_t size)
return ERR_PTR(-EINVAL);
if (count == 0)
return NULL;
- acl = posix_acl_alloc(count, GFP_KERNEL);
+ acl = posix_acl_alloc(count, GFP_NOFS);
if (!acl)
return ERR_PTR(-ENOMEM);
- for (n=0; n < count; n++) {
+ for (n = 0; n < count; n++) {
ext4_acl_entry *entry =
(ext4_acl_entry *)value;
if ((char *)value + sizeof(ext4_acl_entry_short) > end)
goto fail;
acl->a_entries[n].e_tag = le16_to_cpu(entry->e_tag);
acl->a_entries[n].e_perm = le16_to_cpu(entry->e_perm);
- switch(acl->a_entries[n].e_tag) {
- case ACL_USER_OBJ:
- case ACL_GROUP_OBJ:
- case ACL_MASK:
- case ACL_OTHER:
- value = (char *)value +
- sizeof(ext4_acl_entry_short);
- acl->a_entries[n].e_id = ACL_UNDEFINED_ID;
- break;
- case ACL_USER:
- case ACL_GROUP:
- value = (char *)value + sizeof(ext4_acl_entry);
- if ((char *)value > end)
- goto fail;
- acl->a_entries[n].e_id =
- le32_to_cpu(entry->e_id);
- break;
+ switch (acl->a_entries[n].e_tag) {
+ case ACL_USER_OBJ:
+ case ACL_GROUP_OBJ:
+ case ACL_MASK:
+ case ACL_OTHER:
+ value = (char *)value +
+ sizeof(ext4_acl_entry_short);
+ break;
- default:
+ case ACL_USER:
+ value = (char *)value + sizeof(ext4_acl_entry);
+ if ((char *)value > end)
+ goto fail;
+ acl->a_entries[n].e_uid =
+ make_kuid(&init_user_ns,
+ le32_to_cpu(entry->e_id));
+ break;
+ case ACL_GROUP:
+ value = (char *)value + sizeof(ext4_acl_entry);
+ if ((char *)value > end)
goto fail;
+ acl->a_entries[n].e_gid =
+ make_kgid(&init_user_ns,
+ le32_to_cpu(entry->e_id));
+ break;
+
+ default:
+ goto fail;
}
}
if (value != end)
@@ -91,32 +99,37 @@ ext4_acl_to_disk(const struct posix_acl *acl, size_t *size)
*size = ext4_acl_size(acl->a_count);
ext_acl = kmalloc(sizeof(ext4_acl_header) + acl->a_count *
- sizeof(ext4_acl_entry), GFP_KERNEL);
+ sizeof(ext4_acl_entry), GFP_NOFS);
if (!ext_acl)
return ERR_PTR(-ENOMEM);
ext_acl->a_version = cpu_to_le32(EXT4_ACL_VERSION);
e = (char *)ext_acl + sizeof(ext4_acl_header);
- for (n=0; n < acl->a_count; n++) {
+ for (n = 0; n < acl->a_count; n++) {
+ const struct posix_acl_entry *acl_e = &acl->a_entries[n];
ext4_acl_entry *entry = (ext4_acl_entry *)e;
- entry->e_tag = cpu_to_le16(acl->a_entries[n].e_tag);
- entry->e_perm = cpu_to_le16(acl->a_entries[n].e_perm);
- switch(acl->a_entries[n].e_tag) {
- case ACL_USER:
- case ACL_GROUP:
- entry->e_id =
- cpu_to_le32(acl->a_entries[n].e_id);
- e += sizeof(ext4_acl_entry);
- break;
+ entry->e_tag = cpu_to_le16(acl_e->e_tag);
+ entry->e_perm = cpu_to_le16(acl_e->e_perm);
+ switch (acl_e->e_tag) {
+ case ACL_USER:
+ entry->e_id = cpu_to_le32(
+ from_kuid(&init_user_ns, acl_e->e_uid));
+ e += sizeof(ext4_acl_entry);
+ break;
+ case ACL_GROUP:
+ entry->e_id = cpu_to_le32(
+ from_kgid(&init_user_ns, acl_e->e_gid));
+ e += sizeof(ext4_acl_entry);
+ break;
- case ACL_USER_OBJ:
- case ACL_GROUP_OBJ:
- case ACL_MASK:
- case ACL_OTHER:
- e += sizeof(ext4_acl_entry_short);
- break;
+ case ACL_USER_OBJ:
+ case ACL_GROUP_OBJ:
+ case ACL_MASK:
+ case ACL_OTHER:
+ e += sizeof(ext4_acl_entry_short);
+ break;
- default:
- goto fail;
+ default:
+ goto fail;
}
}
return (char *)ext_acl;
@@ -126,68 +139,32 @@ fail:
return ERR_PTR(-EINVAL);
}
-static inline struct posix_acl *
-ext4_iget_acl(struct inode *inode, struct posix_acl **i_acl)
-{
- struct posix_acl *acl = EXT4_ACL_NOT_CACHED;
-
- spin_lock(&inode->i_lock);
- if (*i_acl != EXT4_ACL_NOT_CACHED)
- acl = posix_acl_dup(*i_acl);
- spin_unlock(&inode->i_lock);
-
- return acl;
-}
-
-static inline void
-ext4_iset_acl(struct inode *inode, struct posix_acl **i_acl,
- struct posix_acl *acl)
-{
- spin_lock(&inode->i_lock);
- if (*i_acl != EXT4_ACL_NOT_CACHED)
- posix_acl_release(*i_acl);
- *i_acl = posix_acl_dup(acl);
- spin_unlock(&inode->i_lock);
-}
-
/*
* Inode operation get_posix_acl().
*
* inode->i_mutex: don't care
*/
-static struct posix_acl *
+struct posix_acl *
ext4_get_acl(struct inode *inode, int type)
{
- struct ext4_inode_info *ei = EXT4_I(inode);
int name_index;
char *value = NULL;
struct posix_acl *acl;
int retval;
- if (!test_opt(inode->i_sb, POSIX_ACL))
- return NULL;
-
- switch(type) {
- case ACL_TYPE_ACCESS:
- acl = ext4_iget_acl(inode, &ei->i_acl);
- if (acl != EXT4_ACL_NOT_CACHED)
- return acl;
- name_index = EXT4_XATTR_INDEX_POSIX_ACL_ACCESS;
- break;
-
- case ACL_TYPE_DEFAULT:
- acl = ext4_iget_acl(inode, &ei->i_default_acl);
- if (acl != EXT4_ACL_NOT_CACHED)
- return acl;
- name_index = EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT;
- break;
-
- default:
- return ERR_PTR(-EINVAL);
+ switch (type) {
+ case ACL_TYPE_ACCESS:
+ name_index = EXT4_XATTR_INDEX_POSIX_ACL_ACCESS;
+ break;
+ case ACL_TYPE_DEFAULT:
+ name_index = EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT;
+ break;
+ default:
+ BUG();
}
retval = ext4_xattr_get(inode, name_index, "", NULL, 0);
if (retval > 0) {
- value = kmalloc(retval, GFP_KERNEL);
+ value = kmalloc(retval, GFP_NOFS);
if (!value)
return ERR_PTR(-ENOMEM);
retval = ext4_xattr_get(inode, name_index, "", value, retval);
@@ -200,17 +177,9 @@ ext4_get_acl(struct inode *inode, int type)
acl = ERR_PTR(retval);
kfree(value);
- if (!IS_ERR(acl)) {
- switch(type) {
- case ACL_TYPE_ACCESS:
- ext4_iset_acl(inode, &ei->i_acl, acl);
- break;
+ if (!IS_ERR(acl))
+ set_cached_acl(inode, type, acl);
- case ACL_TYPE_DEFAULT:
- ext4_iset_acl(inode, &ei->i_default_acl, acl);
- break;
- }
- }
return acl;
}
@@ -220,43 +189,38 @@ ext4_get_acl(struct inode *inode, int type)
* inode->i_mutex: down unless called from ext4_new_inode
*/
static int
-ext4_set_acl(handle_t *handle, struct inode *inode, int type,
+__ext4_set_acl(handle_t *handle, struct inode *inode, int type,
struct posix_acl *acl)
{
- struct ext4_inode_info *ei = EXT4_I(inode);
int name_index;
void *value = NULL;
size_t size = 0;
int error;
- if (S_ISLNK(inode->i_mode))
- return -EOPNOTSUPP;
-
- switch(type) {
- case ACL_TYPE_ACCESS:
- name_index = EXT4_XATTR_INDEX_POSIX_ACL_ACCESS;
- if (acl) {
- mode_t mode = inode->i_mode;
- error = posix_acl_equiv_mode(acl, &mode);
- if (error < 0)
- return error;
- else {
- inode->i_mode = mode;
- ext4_mark_inode_dirty(handle, inode);
- if (error == 0)
- acl = NULL;
- }
+ switch (type) {
+ case ACL_TYPE_ACCESS:
+ name_index = EXT4_XATTR_INDEX_POSIX_ACL_ACCESS;
+ if (acl) {
+ error = posix_acl_equiv_mode(acl, &inode->i_mode);
+ if (error < 0)
+ return error;
+ else {
+ inode->i_ctime = ext4_current_time(inode);
+ ext4_mark_inode_dirty(handle, inode);
+ if (error == 0)
+ acl = NULL;
}
- break;
+ }
+ break;
- case ACL_TYPE_DEFAULT:
- name_index = EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT;
- if (!S_ISDIR(inode->i_mode))
- return acl ? -EACCES : 0;
- break;
+ case ACL_TYPE_DEFAULT:
+ name_index = EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT;
+ if (!S_ISDIR(inode->i_mode))
+ return acl ? -EACCES : 0;
+ break;
- default:
- return -EINVAL;
+ default:
+ return -EINVAL;
}
if (acl) {
value = ext4_acl_to_disk(acl, &size);
@@ -268,40 +232,29 @@ ext4_set_acl(handle_t *handle, struct inode *inode, int type,
value, size, 0);
kfree(value);
- if (!error) {
- switch(type) {
- case ACL_TYPE_ACCESS:
- ext4_iset_acl(inode, &ei->i_acl, acl);
- break;
+ if (!error)
+ set_cached_acl(inode, type, acl);
- case ACL_TYPE_DEFAULT:
- ext4_iset_acl(inode, &ei->i_default_acl, acl);
- break;
- }
- }
return error;
}
-static int
-ext4_check_acl(struct inode *inode, int mask)
+int
+ext4_set_acl(struct inode *inode, struct posix_acl *acl, int type)
{
- struct posix_acl *acl = ext4_get_acl(inode, ACL_TYPE_ACCESS);
-
- if (IS_ERR(acl))
- return PTR_ERR(acl);
- if (acl) {
- int error = posix_acl_permission(inode, acl, mask);
- posix_acl_release(acl);
- return error;
- }
+ handle_t *handle;
+ int error, retries = 0;
- return -EAGAIN;
-}
+retry:
+ handle = ext4_journal_start(inode, EXT4_HT_XATTR,
+ ext4_jbd2_credits_xattr(inode));
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
-int
-ext4_permission(struct inode *inode, int mask, struct nameidata *nd)
-{
- return generic_permission(inode, mask, ext4_check_acl);
+ error = __ext4_set_acl(handle, inode, type, acl);
+ ext4_journal_stop(handle);
+ if (error == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
+ goto retry;
+ return error;
}
/*
@@ -313,239 +266,23 @@ ext4_permission(struct inode *inode, int mask, struct nameidata *nd)
int
ext4_init_acl(handle_t *handle, struct inode *inode, struct inode *dir)
{
- struct posix_acl *acl = NULL;
- int error = 0;
-
- if (!S_ISLNK(inode->i_mode)) {
- if (test_opt(dir->i_sb, POSIX_ACL)) {
- acl = ext4_get_acl(dir, ACL_TYPE_DEFAULT);
- if (IS_ERR(acl))
- return PTR_ERR(acl);
- }
- if (!acl)
- inode->i_mode &= ~current->fs->umask;
- }
- if (test_opt(inode->i_sb, POSIX_ACL) && acl) {
- struct posix_acl *clone;
- mode_t mode;
-
- if (S_ISDIR(inode->i_mode)) {
- error = ext4_set_acl(handle, inode,
- ACL_TYPE_DEFAULT, acl);
- if (error)
- goto cleanup;
- }
- clone = posix_acl_clone(acl, GFP_KERNEL);
- error = -ENOMEM;
- if (!clone)
- goto cleanup;
-
- mode = inode->i_mode;
- error = posix_acl_create_masq(clone, &mode);
- if (error >= 0) {
- inode->i_mode = mode;
- if (error > 0) {
- /* This is an extended ACL */
- error = ext4_set_acl(handle, inode,
- ACL_TYPE_ACCESS, clone);
- }
- }
- posix_acl_release(clone);
- }
-cleanup:
- posix_acl_release(acl);
- return error;
-}
-
-/*
- * Does chmod for an inode that may have an Access Control List. The
- * inode->i_mode field must be updated to the desired value by the caller
- * before calling this function.
- * Returns 0 on success, or a negative error number.
- *
- * We change the ACL rather than storing some ACL entries in the file
- * mode permission bits (which would be more efficient), because that
- * would break once additional permissions (like ACL_APPEND, ACL_DELETE
- * for directories) are added. There are no more bits available in the
- * file mode.
- *
- * inode->i_mutex: down
- */
-int
-ext4_acl_chmod(struct inode *inode)
-{
- struct posix_acl *acl, *clone;
+ struct posix_acl *default_acl, *acl;
int error;
- if (S_ISLNK(inode->i_mode))
- return -EOPNOTSUPP;
- if (!test_opt(inode->i_sb, POSIX_ACL))
- return 0;
- acl = ext4_get_acl(inode, ACL_TYPE_ACCESS);
- if (IS_ERR(acl) || !acl)
- return PTR_ERR(acl);
- clone = posix_acl_clone(acl, GFP_KERNEL);
- posix_acl_release(acl);
- if (!clone)
- return -ENOMEM;
- error = posix_acl_chmod_masq(clone, inode->i_mode);
- if (!error) {
- handle_t *handle;
- int retries = 0;
+ error = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl);
+ if (error)
+ return error;
- retry:
- handle = ext4_journal_start(inode,
- EXT4_DATA_TRANS_BLOCKS(inode->i_sb));
- if (IS_ERR(handle)) {
- error = PTR_ERR(handle);
- ext4_std_error(inode->i_sb, error);
- goto out;
- }
- error = ext4_set_acl(handle, inode, ACL_TYPE_ACCESS, clone);
- ext4_journal_stop(handle);
- if (error == -ENOSPC &&
- ext4_should_retry_alloc(inode->i_sb, &retries))
- goto retry;
+ if (default_acl) {
+ error = __ext4_set_acl(handle, inode, ACL_TYPE_DEFAULT,
+ default_acl);
+ posix_acl_release(default_acl);
+ }
+ if (acl) {
+ if (!error)
+ error = __ext4_set_acl(handle, inode, ACL_TYPE_ACCESS,
+ acl);
+ posix_acl_release(acl);
}
-out:
- posix_acl_release(clone);
- return error;
-}
-
-/*
- * Extended attribute handlers
- */
-static size_t
-ext4_xattr_list_acl_access(struct inode *inode, char *list, size_t list_len,
- const char *name, size_t name_len)
-{
- const size_t size = sizeof(POSIX_ACL_XATTR_ACCESS);
-
- if (!test_opt(inode->i_sb, POSIX_ACL))
- return 0;
- if (list && size <= list_len)
- memcpy(list, POSIX_ACL_XATTR_ACCESS, size);
- return size;
-}
-
-static size_t
-ext4_xattr_list_acl_default(struct inode *inode, char *list, size_t list_len,
- const char *name, size_t name_len)
-{
- const size_t size = sizeof(POSIX_ACL_XATTR_DEFAULT);
-
- if (!test_opt(inode->i_sb, POSIX_ACL))
- return 0;
- if (list && size <= list_len)
- memcpy(list, POSIX_ACL_XATTR_DEFAULT, size);
- return size;
-}
-
-static int
-ext4_xattr_get_acl(struct inode *inode, int type, void *buffer, size_t size)
-{
- struct posix_acl *acl;
- int error;
-
- if (!test_opt(inode->i_sb, POSIX_ACL))
- return -EOPNOTSUPP;
-
- acl = ext4_get_acl(inode, type);
- if (IS_ERR(acl))
- return PTR_ERR(acl);
- if (acl == NULL)
- return -ENODATA;
- error = posix_acl_to_xattr(acl, buffer, size);
- posix_acl_release(acl);
-
- return error;
-}
-
-static int
-ext4_xattr_get_acl_access(struct inode *inode, const char *name,
- void *buffer, size_t size)
-{
- if (strcmp(name, "") != 0)
- return -EINVAL;
- return ext4_xattr_get_acl(inode, ACL_TYPE_ACCESS, buffer, size);
-}
-
-static int
-ext4_xattr_get_acl_default(struct inode *inode, const char *name,
- void *buffer, size_t size)
-{
- if (strcmp(name, "") != 0)
- return -EINVAL;
- return ext4_xattr_get_acl(inode, ACL_TYPE_DEFAULT, buffer, size);
-}
-
-static int
-ext4_xattr_set_acl(struct inode *inode, int type, const void *value,
- size_t size)
-{
- handle_t *handle;
- struct posix_acl *acl;
- int error, retries = 0;
-
- if (!test_opt(inode->i_sb, POSIX_ACL))
- return -EOPNOTSUPP;
- if (!is_owner_or_cap(inode))
- return -EPERM;
-
- if (value) {
- acl = posix_acl_from_xattr(value, size);
- if (IS_ERR(acl))
- return PTR_ERR(acl);
- else if (acl) {
- error = posix_acl_valid(acl);
- if (error)
- goto release_and_out;
- }
- } else
- acl = NULL;
-
-retry:
- handle = ext4_journal_start(inode, EXT4_DATA_TRANS_BLOCKS(inode->i_sb));
- if (IS_ERR(handle))
- return PTR_ERR(handle);
- error = ext4_set_acl(handle, inode, type, acl);
- ext4_journal_stop(handle);
- if (error == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
- goto retry;
-
-release_and_out:
- posix_acl_release(acl);
return error;
}
-
-static int
-ext4_xattr_set_acl_access(struct inode *inode, const char *name,
- const void *value, size_t size, int flags)
-{
- if (strcmp(name, "") != 0)
- return -EINVAL;
- return ext4_xattr_set_acl(inode, ACL_TYPE_ACCESS, value, size);
-}
-
-static int
-ext4_xattr_set_acl_default(struct inode *inode, const char *name,
- const void *value, size_t size, int flags)
-{
- if (strcmp(name, "") != 0)
- return -EINVAL;
- return ext4_xattr_set_acl(inode, ACL_TYPE_DEFAULT, value, size);
-}
-
-struct xattr_handler ext4_xattr_acl_access_handler = {
- .prefix = POSIX_ACL_XATTR_ACCESS,
- .list = ext4_xattr_list_acl_access,
- .get = ext4_xattr_get_acl_access,
- .set = ext4_xattr_set_acl_access,
-};
-
-struct xattr_handler ext4_xattr_acl_default_handler = {
- .prefix = POSIX_ACL_XATTR_DEFAULT,
- .list = ext4_xattr_list_acl_default,
- .get = ext4_xattr_get_acl_default,
- .set = ext4_xattr_set_acl_default,
-};
diff --git a/fs/ext4/acl.h b/fs/ext4/acl.h
index 26a5c1abf14..da2c79577d7 100644
--- a/fs/ext4/acl.h
+++ b/fs/ext4/acl.h
@@ -51,31 +51,22 @@ static inline int ext4_acl_count(size_t size)
}
}
-#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL
-
-/* Value for inode->u.ext4_i.i_acl and inode->u.ext4_i.i_default_acl
- if the ACL has not been cached */
-#define EXT4_ACL_NOT_CACHED ((void *)-1)
+#ifdef CONFIG_EXT4_FS_POSIX_ACL
/* acl.c */
-extern int ext4_permission (struct inode *, int, struct nameidata *);
-extern int ext4_acl_chmod (struct inode *);
-extern int ext4_init_acl (handle_t *, struct inode *, struct inode *);
+struct posix_acl *ext4_get_acl(struct inode *inode, int type);
+int ext4_set_acl(struct inode *inode, struct posix_acl *acl, int type);
+extern int ext4_init_acl(handle_t *, struct inode *, struct inode *);
-#else /* CONFIG_EXT4DEV_FS_POSIX_ACL */
+#else /* CONFIG_EXT4_FS_POSIX_ACL */
#include <linux/sched.h>
-#define ext4_permission NULL
-
-static inline int
-ext4_acl_chmod(struct inode *inode)
-{
- return 0;
-}
+#define ext4_get_acl NULL
+#define ext4_set_acl NULL
static inline int
ext4_init_acl(handle_t *handle, struct inode *inode, struct inode *dir)
{
return 0;
}
-#endif /* CONFIG_EXT4DEV_FS_POSIX_ACL */
+#endif /* CONFIG_EXT4_FS_POSIX_ACL */
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 0737e05ba3d..fca382037dd 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -15,18 +15,40 @@
#include <linux/capability.h>
#include <linux/fs.h>
#include <linux/jbd2.h>
-#include <linux/ext4_fs.h>
-#include <linux/ext4_jbd2.h>
#include <linux/quotaops.h>
#include <linux/buffer_head.h>
+#include "ext4.h"
+#include "ext4_jbd2.h"
+#include "mballoc.h"
-#include "group.h"
+#include <trace/events/ext4.h>
+
+static unsigned ext4_num_base_meta_clusters(struct super_block *sb,
+ ext4_group_t block_group);
/*
* balloc.c contains the blocks allocation and deallocation routines
*/
/*
- * Calculate the block group number and offset, given a block number
+ * Calculate block group number for a given block number
+ */
+ext4_group_t ext4_get_group_number(struct super_block *sb,
+ ext4_fsblk_t block)
+{
+ ext4_group_t group;
+
+ if (test_opt2(sb, STD_GROUP_SIZE))
+ group = (block -
+ le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block)) >>
+ (EXT4_BLOCK_SIZE_BITS(sb) + EXT4_CLUSTER_BITS(sb) + 3);
+ else
+ ext4_get_group_no_and_offset(sb, block, &group, NULL);
+ return group;
+}
+
+/*
+ * Calculate the block group number and offset into the block/cluster
+ * allocation bitmap, given a block number
*/
void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr,
ext4_group_t *blockgrpp, ext4_grpblk_t *offsetp)
@@ -35,7 +57,8 @@ void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr,
ext4_grpblk_t offset;
blocknr = blocknr - le32_to_cpu(es->s_first_data_block);
- offset = do_div(blocknr, EXT4_BLOCKS_PER_GROUP(sb));
+ offset = do_div(blocknr, EXT4_BLOCKS_PER_GROUP(sb)) >>
+ EXT4_SB(sb)->s_cluster_bits;
if (offsetp)
*offsetp = offset;
if (blockgrpp)
@@ -43,93 +66,195 @@ void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr,
}
-/* Initializes an uninitialized block bitmap if given, and returns the
- * number of blocks free in the group. */
-unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
- ext4_group_t block_group, struct ext4_group_desc *gdp)
+/*
+ * Check whether the 'block' lives within the 'block_group'. Returns 1 if so
+ * and 0 otherwise.
+ */
+static inline int ext4_block_in_group(struct super_block *sb,
+ ext4_fsblk_t block,
+ ext4_group_t block_group)
+{
+ ext4_group_t actual_group;
+
+ actual_group = ext4_get_group_number(sb, block);
+ return (actual_group == block_group) ? 1 : 0;
+}
+
+/* Return the number of clusters used for file system metadata; this
+ * represents the overhead needed by the file system.
+ */
+static unsigned ext4_num_overhead_clusters(struct super_block *sb,
+ ext4_group_t block_group,
+ struct ext4_group_desc *gdp)
{
- unsigned long start;
- int bit, bit_max;
- unsigned free_blocks, group_blocks;
+ unsigned num_clusters;
+ int block_cluster = -1, inode_cluster = -1, itbl_cluster = -1, i, c;
+ ext4_fsblk_t start = ext4_group_first_block_no(sb, block_group);
+ ext4_fsblk_t itbl_blk;
struct ext4_sb_info *sbi = EXT4_SB(sb);
- if (bh) {
- J_ASSERT_BH(bh, buffer_locked(bh));
-
- /* If checksum is bad mark all blocks used to prevent allocation
- * essentially implementing a per-group read-only flag. */
- if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) {
- ext4_error(sb, __FUNCTION__,
- "Checksum bad for group %lu\n", block_group);
- gdp->bg_free_blocks_count = 0;
- gdp->bg_free_inodes_count = 0;
- gdp->bg_itable_unused = 0;
- memset(bh->b_data, 0xff, sb->s_blocksize);
- return 0;
+ /* This is the number of clusters used by the superblock,
+ * block group descriptors, and reserved block group
+ * descriptor blocks */
+ num_clusters = ext4_num_base_meta_clusters(sb, block_group);
+
+ /*
+ * For the allocation bitmaps and inode table, we first need
+ * to check to see if the block is in the block group. If it
+ * is, then check to see if the cluster is already accounted
+ * for in the clusters used for the base metadata cluster, or
+ * if we can increment the base metadata cluster to include
+ * that block. Otherwise, we will have to track the cluster
+ * used for the allocation bitmap or inode table explicitly.
+ * Normally all of these blocks are contiguous, so the special
+ * case handling shouldn't be necessary except for *very*
+ * unusual file system layouts.
+ */
+ if (ext4_block_in_group(sb, ext4_block_bitmap(sb, gdp), block_group)) {
+ block_cluster = EXT4_B2C(sbi,
+ ext4_block_bitmap(sb, gdp) - start);
+ if (block_cluster < num_clusters)
+ block_cluster = -1;
+ else if (block_cluster == num_clusters) {
+ num_clusters++;
+ block_cluster = -1;
}
- memset(bh->b_data, 0, sb->s_blocksize);
}
- /* Check for superblock and gdt backups in this group */
- bit_max = ext4_bg_has_super(sb, block_group);
+ if (ext4_block_in_group(sb, ext4_inode_bitmap(sb, gdp), block_group)) {
+ inode_cluster = EXT4_B2C(sbi,
+ ext4_inode_bitmap(sb, gdp) - start);
+ if (inode_cluster < num_clusters)
+ inode_cluster = -1;
+ else if (inode_cluster == num_clusters) {
+ num_clusters++;
+ inode_cluster = -1;
+ }
+ }
- if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_META_BG) ||
- block_group < le32_to_cpu(sbi->s_es->s_first_meta_bg) *
- sbi->s_desc_per_block) {
- if (bit_max) {
- bit_max += ext4_bg_num_gdb(sb, block_group);
- bit_max +=
- le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks);
+ itbl_blk = ext4_inode_table(sb, gdp);
+ for (i = 0; i < sbi->s_itb_per_group; i++) {
+ if (ext4_block_in_group(sb, itbl_blk + i, block_group)) {
+ c = EXT4_B2C(sbi, itbl_blk + i - start);
+ if ((c < num_clusters) || (c == inode_cluster) ||
+ (c == block_cluster) || (c == itbl_cluster))
+ continue;
+ if (c == num_clusters) {
+ num_clusters++;
+ continue;
+ }
+ num_clusters++;
+ itbl_cluster = c;
}
- } else { /* For META_BG_BLOCK_GROUPS */
- int group_rel = (block_group -
- le32_to_cpu(sbi->s_es->s_first_meta_bg)) %
- EXT4_DESC_PER_BLOCK(sb);
- if (group_rel == 0 || group_rel == 1 ||
- (group_rel == EXT4_DESC_PER_BLOCK(sb) - 1))
- bit_max += 1;
}
- if (block_group == sbi->s_groups_count - 1) {
+ if (block_cluster != -1)
+ num_clusters++;
+ if (inode_cluster != -1)
+ num_clusters++;
+
+ return num_clusters;
+}
+
+static unsigned int num_clusters_in_group(struct super_block *sb,
+ ext4_group_t block_group)
+{
+ unsigned int blocks;
+
+ if (block_group == ext4_get_groups_count(sb) - 1) {
/*
- * Even though mke2fs always initialize first and last group
- * if some other tool enabled the EXT4_BG_BLOCK_UNINIT we need
- * to make sure we calculate the right free blocks
+ * Even though mke2fs always initializes the first and
+ * last group, just in case some other tool was used,
+ * we need to make sure we calculate the right free
+ * blocks.
*/
- group_blocks = ext4_blocks_count(sbi->s_es) -
- le32_to_cpu(sbi->s_es->s_first_data_block) -
- (EXT4_BLOCKS_PER_GROUP(sb) * (sbi->s_groups_count -1));
- } else {
- group_blocks = EXT4_BLOCKS_PER_GROUP(sb);
+ blocks = ext4_blocks_count(EXT4_SB(sb)->s_es) -
+ ext4_group_first_block_no(sb, block_group);
+ } else
+ blocks = EXT4_BLOCKS_PER_GROUP(sb);
+ return EXT4_NUM_B2C(EXT4_SB(sb), blocks);
+}
+
+/* Initializes an uninitialized block bitmap */
+static void ext4_init_block_bitmap(struct super_block *sb,
+ struct buffer_head *bh,
+ ext4_group_t block_group,
+ struct ext4_group_desc *gdp)
+{
+ unsigned int bit, bit_max;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ ext4_fsblk_t start, tmp;
+ int flex_bg = 0;
+ struct ext4_group_info *grp;
+
+ J_ASSERT_BH(bh, buffer_locked(bh));
+
+ /* If checksum is bad mark all blocks used to prevent allocation
+ * essentially implementing a per-group read-only flag. */
+ if (!ext4_group_desc_csum_verify(sb, block_group, gdp)) {
+ ext4_error(sb, "Checksum bad for group %u", block_group);
+ grp = ext4_get_group_info(sb, block_group);
+ if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp))
+ percpu_counter_sub(&sbi->s_freeclusters_counter,
+ grp->bb_free);
+ set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state);
+ if (!EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) {
+ int count;
+ count = ext4_free_inodes_count(sb, gdp);
+ percpu_counter_sub(&sbi->s_freeinodes_counter,
+ count);
+ }
+ set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state);
+ return;
}
+ memset(bh->b_data, 0, sb->s_blocksize);
- free_blocks = group_blocks - bit_max;
+ bit_max = ext4_num_base_meta_clusters(sb, block_group);
+ for (bit = 0; bit < bit_max; bit++)
+ ext4_set_bit(bit, bh->b_data);
- if (bh) {
- for (bit = 0; bit < bit_max; bit++)
- ext4_set_bit(bit, bh->b_data);
+ start = ext4_group_first_block_no(sb, block_group);
- start = block_group * EXT4_BLOCKS_PER_GROUP(sb) +
- le32_to_cpu(sbi->s_es->s_first_data_block);
+ if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG))
+ flex_bg = 1;
- /* Set bits for block and inode bitmaps, and inode table */
- ext4_set_bit(ext4_block_bitmap(sb, gdp) - start, bh->b_data);
- ext4_set_bit(ext4_inode_bitmap(sb, gdp) - start, bh->b_data);
- for (bit = (ext4_inode_table(sb, gdp) - start),
- bit_max = bit + sbi->s_itb_per_group; bit < bit_max; bit++)
- ext4_set_bit(bit, bh->b_data);
+ /* Set bits for block and inode bitmaps, and inode table */
+ tmp = ext4_block_bitmap(sb, gdp);
+ if (!flex_bg || ext4_block_in_group(sb, tmp, block_group))
+ ext4_set_bit(EXT4_B2C(sbi, tmp - start), bh->b_data);
- /*
- * Also if the number of blocks within the group is
- * less than the blocksize * 8 ( which is the size
- * of bitmap ), set rest of the block bitmap to 1
- */
- mark_bitmap_end(group_blocks, sb->s_blocksize * 8, bh->b_data);
+ tmp = ext4_inode_bitmap(sb, gdp);
+ if (!flex_bg || ext4_block_in_group(sb, tmp, block_group))
+ ext4_set_bit(EXT4_B2C(sbi, tmp - start), bh->b_data);
+
+ tmp = ext4_inode_table(sb, gdp);
+ for (; tmp < ext4_inode_table(sb, gdp) +
+ sbi->s_itb_per_group; tmp++) {
+ if (!flex_bg || ext4_block_in_group(sb, tmp, block_group))
+ ext4_set_bit(EXT4_B2C(sbi, tmp - start), bh->b_data);
}
- return free_blocks - sbi->s_itb_per_group - 2;
+ /*
+ * Also if the number of blocks within the group is less than
+ * the blocksize * 8 ( which is the size of bitmap ), set rest
+ * of the block bitmap to 1
+ */
+ ext4_mark_bitmap_end(num_clusters_in_group(sb, block_group),
+ sb->s_blocksize * 8, bh->b_data);
+ ext4_block_bitmap_csum_set(sb, block_group, gdp, bh);
+ ext4_group_desc_csum_set(sb, block_group, gdp);
}
+/* Return the number of free blocks in a block group. It is used when
+ * the block bitmap is uninitialized, so we can't just count the bits
+ * in the bitmap. */
+unsigned ext4_free_clusters_after_init(struct super_block *sb,
+ ext4_group_t block_group,
+ struct ext4_group_desc *gdp)
+{
+ return num_clusters_in_group(sb, block_group) -
+ ext4_num_overhead_clusters(sb, block_group, gdp);
+}
/*
* The free blocks are managed by bitmaps. A file system contains several
@@ -142,9 +267,6 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
* when a file system is mounted (see ext4_fill_super).
*/
-
-#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1)
-
/**
* ext4_get_group_desc() -- load group descriptor from disk
* @sb: super block
@@ -152,32 +274,29 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
* @bh: pointer to the buffer head to store the block
* group descriptor
*/
-struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb,
+struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb,
ext4_group_t block_group,
- struct buffer_head ** bh)
+ struct buffer_head **bh)
{
- unsigned long group_desc;
- unsigned long offset;
- struct ext4_group_desc * desc;
+ unsigned int group_desc;
+ unsigned int offset;
+ ext4_group_t ngroups = ext4_get_groups_count(sb);
+ struct ext4_group_desc *desc;
struct ext4_sb_info *sbi = EXT4_SB(sb);
- if (block_group >= sbi->s_groups_count) {
- ext4_error (sb, "ext4_get_group_desc",
- "block_group >= groups_count - "
- "block_group = %lu, groups_count = %lu",
- block_group, sbi->s_groups_count);
+ if (block_group >= ngroups) {
+ ext4_error(sb, "block_group >= groups_count - block_group = %u,"
+ " groups_count = %u", block_group, ngroups);
return NULL;
}
- smp_rmb();
group_desc = block_group >> EXT4_DESC_PER_BLOCK_BITS(sb);
offset = block_group & (EXT4_DESC_PER_BLOCK(sb) - 1);
if (!sbi->s_group_desc[group_desc]) {
- ext4_error (sb, "ext4_get_group_desc",
- "Group descriptor not loaded - "
- "block_group = %lu, group_desc = %lu, desc = %lu",
- block_group, group_desc, offset);
+ ext4_error(sb, "Group descriptor not loaded - "
+ "block_group = %u, group_desc = %u, desc = %u",
+ block_group, group_desc, offset);
return NULL;
}
@@ -189,14 +308,19 @@ struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb,
return desc;
}
-static int ext4_valid_block_bitmap(struct super_block *sb,
- struct ext4_group_desc *desc,
- unsigned int block_group,
- struct buffer_head *bh)
+/*
+ * Return the block number which was discovered to be invalid, or 0 if
+ * the block bitmap is valid.
+ */
+static ext4_fsblk_t ext4_valid_block_bitmap(struct super_block *sb,
+ struct ext4_group_desc *desc,
+ ext4_group_t block_group,
+ struct buffer_head *bh)
{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
ext4_grpblk_t offset;
ext4_grpblk_t next_zero_bit;
- ext4_fsblk_t bitmap_blk;
+ ext4_fsblk_t blk;
ext4_fsblk_t group_first_block;
if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) {
@@ -206,43 +330,77 @@ static int ext4_valid_block_bitmap(struct super_block *sb,
* or it has to also read the block group where the bitmaps
* are located to verify they are set.
*/
- return 1;
+ return 0;
}
group_first_block = ext4_group_first_block_no(sb, block_group);
/* check whether block bitmap block number is set */
- bitmap_blk = ext4_block_bitmap(sb, desc);
- offset = bitmap_blk - group_first_block;
- if (!ext4_test_bit(offset, bh->b_data))
+ blk = ext4_block_bitmap(sb, desc);
+ offset = blk - group_first_block;
+ if (!ext4_test_bit(EXT4_B2C(sbi, offset), bh->b_data))
/* bad block bitmap */
- goto err_out;
+ return blk;
/* check whether the inode bitmap block number is set */
- bitmap_blk = ext4_inode_bitmap(sb, desc);
- offset = bitmap_blk - group_first_block;
- if (!ext4_test_bit(offset, bh->b_data))
+ blk = ext4_inode_bitmap(sb, desc);
+ offset = blk - group_first_block;
+ if (!ext4_test_bit(EXT4_B2C(sbi, offset), bh->b_data))
/* bad block bitmap */
- goto err_out;
+ return blk;
/* check whether the inode table block number is set */
- bitmap_blk = ext4_inode_table(sb, desc);
- offset = bitmap_blk - group_first_block;
+ blk = ext4_inode_table(sb, desc);
+ offset = blk - group_first_block;
next_zero_bit = ext4_find_next_zero_bit(bh->b_data,
- offset + EXT4_SB(sb)->s_itb_per_group,
- offset);
- if (next_zero_bit >= offset + EXT4_SB(sb)->s_itb_per_group)
- /* good bitmap for inode tables */
- return 1;
-
-err_out:
- ext4_error(sb, __FUNCTION__,
- "Invalid block bitmap - "
- "block_group = %d, block = %llu",
- block_group, bitmap_blk);
+ EXT4_B2C(sbi, offset + EXT4_SB(sb)->s_itb_per_group),
+ EXT4_B2C(sbi, offset));
+ if (next_zero_bit <
+ EXT4_B2C(sbi, offset + EXT4_SB(sb)->s_itb_per_group))
+ /* bad bitmap for inode tables */
+ return blk;
return 0;
}
+
+static void ext4_validate_block_bitmap(struct super_block *sb,
+ struct ext4_group_desc *desc,
+ ext4_group_t block_group,
+ struct buffer_head *bh)
+{
+ ext4_fsblk_t blk;
+ struct ext4_group_info *grp = ext4_get_group_info(sb, block_group);
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+ if (buffer_verified(bh))
+ return;
+
+ ext4_lock_group(sb, block_group);
+ blk = ext4_valid_block_bitmap(sb, desc, block_group, bh);
+ if (unlikely(blk != 0)) {
+ ext4_unlock_group(sb, block_group);
+ ext4_error(sb, "bg %u: block %llu: invalid block bitmap",
+ block_group, blk);
+ if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp))
+ percpu_counter_sub(&sbi->s_freeclusters_counter,
+ grp->bb_free);
+ set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state);
+ return;
+ }
+ if (unlikely(!ext4_block_bitmap_csum_verify(sb, block_group,
+ desc, bh))) {
+ ext4_unlock_group(sb, block_group);
+ ext4_error(sb, "bg %u: bad block bitmap checksum", block_group);
+ if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp))
+ percpu_counter_sub(&sbi->s_freeclusters_counter,
+ grp->bb_free);
+ set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state);
+ return;
+ }
+ set_buffer_verified(bh);
+ ext4_unlock_group(sb, block_group);
+}
+
/**
- * read_block_bitmap()
+ * ext4_read_block_bitmap_nowait()
* @sb: super block
* @block_group: given block group
*
@@ -252,10 +410,10 @@ err_out:
* Return buffer_head on success or NULL in case of failure.
*/
struct buffer_head *
-read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
+ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group)
{
- struct ext4_group_desc * desc;
- struct buffer_head * bh = NULL;
+ struct ext4_group_desc *desc;
+ struct buffer_head *bh;
ext4_fsblk_t bitmap_blk;
desc = ext4_get_group_desc(sb, block_group, NULL);
@@ -264,1305 +422,161 @@ read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
bitmap_blk = ext4_block_bitmap(sb, desc);
bh = sb_getblk(sb, bitmap_blk);
if (unlikely(!bh)) {
- ext4_error(sb, __FUNCTION__,
- "Cannot read block bitmap - "
- "block_group = %d, block_bitmap = %llu",
- (int)block_group, (unsigned long long)bitmap_blk);
+ ext4_error(sb, "Cannot get buffer for block bitmap - "
+ "block_group = %u, block_bitmap = %llu",
+ block_group, bitmap_blk);
return NULL;
}
- if (bh_uptodate_or_lock(bh))
- return bh;
+ if (bitmap_uptodate(bh))
+ goto verify;
+
+ lock_buffer(bh);
+ if (bitmap_uptodate(bh)) {
+ unlock_buffer(bh);
+ goto verify;
+ }
+ ext4_lock_group(sb, block_group);
if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
ext4_init_block_bitmap(sb, bh, block_group, desc);
+ set_bitmap_uptodate(bh);
set_buffer_uptodate(bh);
+ ext4_unlock_group(sb, block_group);
unlock_buffer(bh);
return bh;
}
- if (bh_submit_read(bh) < 0) {
- put_bh(bh);
- ext4_error(sb, __FUNCTION__,
- "Cannot read block bitmap - "
- "block_group = %d, block_bitmap = %llu",
- (int)block_group, (unsigned long long)bitmap_blk);
- return NULL;
- }
- if (!ext4_valid_block_bitmap(sb, desc, block_group, bh)) {
- put_bh(bh);
- return NULL;
- }
-
- return bh;
-}
-/*
- * The reservation window structure operations
- * --------------------------------------------
- * Operations include:
- * dump, find, add, remove, is_empty, find_next_reservable_window, etc.
- *
- * We use a red-black tree to represent per-filesystem reservation
- * windows.
- *
- */
-
-/**
- * __rsv_window_dump() -- Dump the filesystem block allocation reservation map
- * @rb_root: root of per-filesystem reservation rb tree
- * @verbose: verbose mode
- * @fn: function which wishes to dump the reservation map
- *
- * If verbose is turned on, it will print the whole block reservation
- * windows(start, end). Otherwise, it will only print out the "bad" windows,
- * those windows that overlap with their immediate neighbors.
- */
-#if 1
-static void __rsv_window_dump(struct rb_root *root, int verbose,
- const char *fn)
-{
- struct rb_node *n;
- struct ext4_reserve_window_node *rsv, *prev;
- int bad;
-
-restart:
- n = rb_first(root);
- bad = 0;
- prev = NULL;
-
- printk("Block Allocation Reservation Windows Map (%s):\n", fn);
- while (n) {
- rsv = rb_entry(n, struct ext4_reserve_window_node, rsv_node);
- if (verbose)
- printk("reservation window 0x%p "
- "start: %llu, end: %llu\n",
- rsv, rsv->rsv_start, rsv->rsv_end);
- if (rsv->rsv_start && rsv->rsv_start >= rsv->rsv_end) {
- printk("Bad reservation %p (start >= end)\n",
- rsv);
- bad = 1;
- }
- if (prev && prev->rsv_end >= rsv->rsv_start) {
- printk("Bad reservation %p (prev->end >= start)\n",
- rsv);
- bad = 1;
- }
- if (bad) {
- if (!verbose) {
- printk("Restarting reservation walk in verbose mode\n");
- verbose = 1;
- goto restart;
- }
- }
- n = rb_next(n);
- prev = rsv;
- }
- printk("Window map complete.\n");
- if (bad)
- BUG();
-}
-#define rsv_window_dump(root, verbose) \
- __rsv_window_dump((root), (verbose), __FUNCTION__)
-#else
-#define rsv_window_dump(root, verbose) do {} while (0)
-#endif
-
-/**
- * goal_in_my_reservation()
- * @rsv: inode's reservation window
- * @grp_goal: given goal block relative to the allocation block group
- * @group: the current allocation block group
- * @sb: filesystem super block
- *
- * Test if the given goal block (group relative) is within the file's
- * own block reservation window range.
- *
- * If the reservation window is outside the goal allocation group, return 0;
- * grp_goal (given goal block) could be -1, which means no specific
- * goal block. In this case, always return 1.
- * If the goal block is within the reservation window, return 1;
- * otherwise, return 0;
- */
-static int
-goal_in_my_reservation(struct ext4_reserve_window *rsv, ext4_grpblk_t grp_goal,
- ext4_group_t group, struct super_block *sb)
-{
- ext4_fsblk_t group_first_block, group_last_block;
-
- group_first_block = ext4_group_first_block_no(sb, group);
- group_last_block = group_first_block + (EXT4_BLOCKS_PER_GROUP(sb) - 1);
-
- if ((rsv->_rsv_start > group_last_block) ||
- (rsv->_rsv_end < group_first_block))
- return 0;
- if ((grp_goal >= 0) && ((grp_goal + group_first_block < rsv->_rsv_start)
- || (grp_goal + group_first_block > rsv->_rsv_end)))
- return 0;
- return 1;
-}
-
-/**
- * search_reserve_window()
- * @rb_root: root of reservation tree
- * @goal: target allocation block
- *
- * Find the reserved window which includes the goal, or the previous one
- * if the goal is not in any window.
- * Returns NULL if there are no windows or if all windows start after the goal.
- */
-static struct ext4_reserve_window_node *
-search_reserve_window(struct rb_root *root, ext4_fsblk_t goal)
-{
- struct rb_node *n = root->rb_node;
- struct ext4_reserve_window_node *rsv;
-
- if (!n)
- return NULL;
-
- do {
- rsv = rb_entry(n, struct ext4_reserve_window_node, rsv_node);
-
- if (goal < rsv->rsv_start)
- n = n->rb_left;
- else if (goal > rsv->rsv_end)
- n = n->rb_right;
- else
- return rsv;
- } while (n);
- /*
- * We've fallen off the end of the tree: the goal wasn't inside
- * any particular node. OK, the previous node must be to one
- * side of the interval containing the goal. If it's the RHS,
- * we need to back up one.
- */
- if (rsv->rsv_start > goal) {
- n = rb_prev(&rsv->rsv_node);
- rsv = rb_entry(n, struct ext4_reserve_window_node, rsv_node);
- }
- return rsv;
-}
-
-/**
- * ext4_rsv_window_add() -- Insert a window to the block reservation rb tree.
- * @sb: super block
- * @rsv: reservation window to add
- *
- * Must be called with rsv_lock hold.
- */
-void ext4_rsv_window_add(struct super_block *sb,
- struct ext4_reserve_window_node *rsv)
-{
- struct rb_root *root = &EXT4_SB(sb)->s_rsv_window_root;
- struct rb_node *node = &rsv->rsv_node;
- ext4_fsblk_t start = rsv->rsv_start;
-
- struct rb_node ** p = &root->rb_node;
- struct rb_node * parent = NULL;
- struct ext4_reserve_window_node *this;
-
- while (*p)
- {
- parent = *p;
- this = rb_entry(parent, struct ext4_reserve_window_node, rsv_node);
-
- if (start < this->rsv_start)
- p = &(*p)->rb_left;
- else if (start > this->rsv_end)
- p = &(*p)->rb_right;
- else {
- rsv_window_dump(root, 1);
- BUG();
- }
- }
-
- rb_link_node(node, parent, p);
- rb_insert_color(node, root);
-}
-
-/**
- * ext4_rsv_window_remove() -- unlink a window from the reservation rb tree
- * @sb: super block
- * @rsv: reservation window to remove
- *
- * Mark the block reservation window as not allocated, and unlink it
- * from the filesystem reservation window rb tree. Must be called with
- * rsv_lock hold.
- */
-static void rsv_window_remove(struct super_block *sb,
- struct ext4_reserve_window_node *rsv)
-{
- rsv->rsv_start = EXT4_RESERVE_WINDOW_NOT_ALLOCATED;
- rsv->rsv_end = EXT4_RESERVE_WINDOW_NOT_ALLOCATED;
- rsv->rsv_alloc_hit = 0;
- rb_erase(&rsv->rsv_node, &EXT4_SB(sb)->s_rsv_window_root);
-}
-
-/*
- * rsv_is_empty() -- Check if the reservation window is allocated.
- * @rsv: given reservation window to check
- *
- * returns 1 if the end block is EXT4_RESERVE_WINDOW_NOT_ALLOCATED.
- */
-static inline int rsv_is_empty(struct ext4_reserve_window *rsv)
-{
- /* a valid reservation end block could not be 0 */
- return rsv->_rsv_end == EXT4_RESERVE_WINDOW_NOT_ALLOCATED;
-}
-
-/**
- * ext4_init_block_alloc_info()
- * @inode: file inode structure
- *
- * Allocate and initialize the reservation window structure, and
- * link the window to the ext4 inode structure at last
- *
- * The reservation window structure is only dynamically allocated
- * and linked to ext4 inode the first time the open file
- * needs a new block. So, before every ext4_new_block(s) call, for
- * regular files, we should check whether the reservation window
- * structure exists or not. In the latter case, this function is called.
- * Fail to do so will result in block reservation being turned off for that
- * open file.
- *
- * This function is called from ext4_get_blocks_handle(), also called
- * when setting the reservation window size through ioctl before the file
- * is open for write (needs block allocation).
- *
- * Needs down_write(i_data_sem) protection prior to call this function.
- */
-void ext4_init_block_alloc_info(struct inode *inode)
-{
- struct ext4_inode_info *ei = EXT4_I(inode);
- struct ext4_block_alloc_info *block_i = ei->i_block_alloc_info;
- struct super_block *sb = inode->i_sb;
-
- block_i = kmalloc(sizeof(*block_i), GFP_NOFS);
- if (block_i) {
- struct ext4_reserve_window_node *rsv = &block_i->rsv_window_node;
-
- rsv->rsv_start = EXT4_RESERVE_WINDOW_NOT_ALLOCATED;
- rsv->rsv_end = EXT4_RESERVE_WINDOW_NOT_ALLOCATED;
-
- /*
- * if filesystem is mounted with NORESERVATION, the goal
- * reservation window size is set to zero to indicate
- * block reservation is off
- */
- if (!test_opt(sb, RESERVATION))
- rsv->rsv_goal_size = 0;
- else
- rsv->rsv_goal_size = EXT4_DEFAULT_RESERVE_BLOCKS;
- rsv->rsv_alloc_hit = 0;
- block_i->last_alloc_logical_block = 0;
- block_i->last_alloc_physical_block = 0;
- }
- ei->i_block_alloc_info = block_i;
-}
-
-/**
- * ext4_discard_reservation()
- * @inode: inode
- *
- * Discard(free) block reservation window on last file close, or truncate
- * or at last iput().
- *
- * It is being called in three cases:
- * ext4_release_file(): last writer close the file
- * ext4_clear_inode(): last iput(), when nobody link to this file.
- * ext4_truncate(): when the block indirect map is about to change.
- *
- */
-void ext4_discard_reservation(struct inode *inode)
-{
- struct ext4_inode_info *ei = EXT4_I(inode);
- struct ext4_block_alloc_info *block_i = ei->i_block_alloc_info;
- struct ext4_reserve_window_node *rsv;
- spinlock_t *rsv_lock = &EXT4_SB(inode->i_sb)->s_rsv_window_lock;
-
- ext4_mb_discard_inode_preallocations(inode);
-
- if (!block_i)
- return;
-
- rsv = &block_i->rsv_window_node;
- if (!rsv_is_empty(&rsv->rsv_window)) {
- spin_lock(rsv_lock);
- if (!rsv_is_empty(&rsv->rsv_window))
- rsv_window_remove(inode->i_sb, rsv);
- spin_unlock(rsv_lock);
- }
-}
-
-/**
- * ext4_free_blocks_sb() -- Free given blocks and update quota
- * @handle: handle to this transaction
- * @sb: super block
- * @block: start physcial block to free
- * @count: number of blocks to free
- * @pdquot_freed_blocks: pointer to quota
- */
-void ext4_free_blocks_sb(handle_t *handle, struct super_block *sb,
- ext4_fsblk_t block, unsigned long count,
- unsigned long *pdquot_freed_blocks)
-{
- struct buffer_head *bitmap_bh = NULL;
- struct buffer_head *gd_bh;
- ext4_group_t block_group;
- ext4_grpblk_t bit;
- unsigned long i;
- unsigned long overflow;
- struct ext4_group_desc * desc;
- struct ext4_super_block * es;
- struct ext4_sb_info *sbi;
- int err = 0, ret;
- ext4_grpblk_t group_freed;
-
- *pdquot_freed_blocks = 0;
- sbi = EXT4_SB(sb);
- es = sbi->s_es;
- if (block < le32_to_cpu(es->s_first_data_block) ||
- block + count < block ||
- block + count > ext4_blocks_count(es)) {
- ext4_error (sb, "ext4_free_blocks",
- "Freeing blocks not in datazone - "
- "block = %llu, count = %lu", block, count);
- goto error_return;
- }
-
- ext4_debug ("freeing block(s) %llu-%llu\n", block, block + count - 1);
-
-do_more:
- overflow = 0;
- ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
- /*
- * Check to see if we are freeing blocks across a group
- * boundary.
- */
- if (bit + count > EXT4_BLOCKS_PER_GROUP(sb)) {
- overflow = bit + count - EXT4_BLOCKS_PER_GROUP(sb);
- count -= overflow;
- }
- brelse(bitmap_bh);
- bitmap_bh = read_block_bitmap(sb, block_group);
- if (!bitmap_bh)
- goto error_return;
- desc = ext4_get_group_desc (sb, block_group, &gd_bh);
- if (!desc)
- goto error_return;
-
- if (in_range(ext4_block_bitmap(sb, desc), block, count) ||
- in_range(ext4_inode_bitmap(sb, desc), block, count) ||
- in_range(block, ext4_inode_table(sb, desc), sbi->s_itb_per_group) ||
- in_range(block + count - 1, ext4_inode_table(sb, desc),
- sbi->s_itb_per_group)) {
- ext4_error (sb, "ext4_free_blocks",
- "Freeing blocks in system zones - "
- "Block = %llu, count = %lu",
- block, count);
- goto error_return;
- }
-
- /*
- * We are about to start releasing blocks in the bitmap,
- * so we need undo access.
- */
- /* @@@ check errors */
- BUFFER_TRACE(bitmap_bh, "getting undo access");
- err = ext4_journal_get_undo_access(handle, bitmap_bh);
- if (err)
- goto error_return;
-
- /*
- * We are about to modify some metadata. Call the journal APIs
- * to unshare ->b_data if a currently-committing transaction is
- * using it
- */
- BUFFER_TRACE(gd_bh, "get_write_access");
- err = ext4_journal_get_write_access(handle, gd_bh);
- if (err)
- goto error_return;
-
- jbd_lock_bh_state(bitmap_bh);
-
- for (i = 0, group_freed = 0; i < count; i++) {
+ ext4_unlock_group(sb, block_group);
+ if (buffer_uptodate(bh)) {
/*
- * An HJ special. This is expensive...
+ * if not uninit if bh is uptodate,
+ * bitmap is also uptodate
*/
-#ifdef CONFIG_JBD2_DEBUG
- jbd_unlock_bh_state(bitmap_bh);
- {
- struct buffer_head *debug_bh;
- debug_bh = sb_find_get_block(sb, block + i);
- if (debug_bh) {
- BUFFER_TRACE(debug_bh, "Deleted!");
- if (!bh2jh(bitmap_bh)->b_committed_data)
- BUFFER_TRACE(debug_bh,
- "No commited data in bitmap");
- BUFFER_TRACE2(debug_bh, bitmap_bh, "bitmap");
- __brelse(debug_bh);
- }
- }
- jbd_lock_bh_state(bitmap_bh);
-#endif
- if (need_resched()) {
- jbd_unlock_bh_state(bitmap_bh);
- cond_resched();
- jbd_lock_bh_state(bitmap_bh);
- }
- /* @@@ This prevents newly-allocated data from being
- * freed and then reallocated within the same
- * transaction.
- *
- * Ideally we would want to allow that to happen, but to
- * do so requires making jbd2_journal_forget() capable of
- * revoking the queued write of a data block, which
- * implies blocking on the journal lock. *forget()
- * cannot block due to truncate races.
- *
- * Eventually we can fix this by making jbd2_journal_forget()
- * return a status indicating whether or not it was able
- * to revoke the buffer. On successful revoke, it is
- * safe not to set the allocation bit in the committed
- * bitmap, because we know that there is no outstanding
- * activity on the buffer any more and so it is safe to
- * reallocate it.
- */
- BUFFER_TRACE(bitmap_bh, "set in b_committed_data");
- J_ASSERT_BH(bitmap_bh,
- bh2jh(bitmap_bh)->b_committed_data != NULL);
- ext4_set_bit_atomic(sb_bgl_lock(sbi, block_group), bit + i,
- bh2jh(bitmap_bh)->b_committed_data);
-
- /*
- * We clear the bit in the bitmap after setting the committed
- * data bit, because this is the reverse order to that which
- * the allocator uses.
- */
- BUFFER_TRACE(bitmap_bh, "clear bit");
- if (!ext4_clear_bit_atomic(sb_bgl_lock(sbi, block_group),
- bit + i, bitmap_bh->b_data)) {
- jbd_unlock_bh_state(bitmap_bh);
- ext4_error(sb, __FUNCTION__,
- "bit already cleared for block %llu",
- (ext4_fsblk_t)(block + i));
- jbd_lock_bh_state(bitmap_bh);
- BUFFER_TRACE(bitmap_bh, "bit already cleared");
- } else {
- group_freed++;
- }
- }
- jbd_unlock_bh_state(bitmap_bh);
-
- spin_lock(sb_bgl_lock(sbi, block_group));
- desc->bg_free_blocks_count =
- cpu_to_le16(le16_to_cpu(desc->bg_free_blocks_count) +
- group_freed);
- desc->bg_checksum = ext4_group_desc_csum(sbi, block_group, desc);
- spin_unlock(sb_bgl_lock(sbi, block_group));
- percpu_counter_add(&sbi->s_freeblocks_counter, count);
-
- /* We dirtied the bitmap block */
- BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
- err = ext4_journal_dirty_metadata(handle, bitmap_bh);
-
- /* And the group descriptor block */
- BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
- ret = ext4_journal_dirty_metadata(handle, gd_bh);
- if (!err) err = ret;
- *pdquot_freed_blocks += group_freed;
-
- if (overflow && !err) {
- block += count;
- count = overflow;
- goto do_more;
- }
- sb->s_dirt = 1;
-error_return:
- brelse(bitmap_bh);
- ext4_std_error(sb, err);
- return;
-}
-
-/**
- * ext4_free_blocks() -- Free given blocks and update quota
- * @handle: handle for this transaction
- * @inode: inode
- * @block: start physical block to free
- * @count: number of blocks to count
- * @metadata: Are these metadata blocks
- */
-void ext4_free_blocks(handle_t *handle, struct inode *inode,
- ext4_fsblk_t block, unsigned long count,
- int metadata)
-{
- struct super_block * sb;
- unsigned long dquot_freed_blocks;
-
- /* this isn't the right place to decide whether block is metadata
- * inode.c/extents.c knows better, but for safety ... */
- if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode) ||
- ext4_should_journal_data(inode))
- metadata = 1;
-
- sb = inode->i_sb;
-
- if (!test_opt(sb, MBALLOC) || !EXT4_SB(sb)->s_group_info)
- ext4_free_blocks_sb(handle, sb, block, count,
- &dquot_freed_blocks);
- else
- ext4_mb_free_blocks(handle, inode, block, count,
- metadata, &dquot_freed_blocks);
- if (dquot_freed_blocks)
- DQUOT_FREE_BLOCK(inode, dquot_freed_blocks);
- return;
-}
-
-/**
- * ext4_test_allocatable()
- * @nr: given allocation block group
- * @bh: bufferhead contains the bitmap of the given block group
- *
- * For ext4 allocations, we must not reuse any blocks which are
- * allocated in the bitmap buffer's "last committed data" copy. This
- * prevents deletes from freeing up the page for reuse until we have
- * committed the delete transaction.
- *
- * If we didn't do this, then deleting something and reallocating it as
- * data would allow the old block to be overwritten before the
- * transaction committed (because we force data to disk before commit).
- * This would lead to corruption if we crashed between overwriting the
- * data and committing the delete.
- *
- * @@@ We may want to make this allocation behaviour conditional on
- * data-writes at some point, and disable it for metadata allocations or
- * sync-data inodes.
- */
-static int ext4_test_allocatable(ext4_grpblk_t nr, struct buffer_head *bh)
-{
- int ret;
- struct journal_head *jh = bh2jh(bh);
-
- if (ext4_test_bit(nr, bh->b_data))
- return 0;
-
- jbd_lock_bh_state(bh);
- if (!jh->b_committed_data)
- ret = 1;
- else
- ret = !ext4_test_bit(nr, jh->b_committed_data);
- jbd_unlock_bh_state(bh);
- return ret;
-}
-
-/**
- * bitmap_search_next_usable_block()
- * @start: the starting block (group relative) of the search
- * @bh: bufferhead contains the block group bitmap
- * @maxblocks: the ending block (group relative) of the reservation
- *
- * The bitmap search --- search forward alternately through the actual
- * bitmap on disk and the last-committed copy in journal, until we find a
- * bit free in both bitmaps.
- */
-static ext4_grpblk_t
-bitmap_search_next_usable_block(ext4_grpblk_t start, struct buffer_head *bh,
- ext4_grpblk_t maxblocks)
-{
- ext4_grpblk_t next;
- struct journal_head *jh = bh2jh(bh);
-
- while (start < maxblocks) {
- next = ext4_find_next_zero_bit(bh->b_data, maxblocks, start);
- if (next >= maxblocks)
- return -1;
- if (ext4_test_allocatable(next, bh))
- return next;
- jbd_lock_bh_state(bh);
- if (jh->b_committed_data)
- start = ext4_find_next_zero_bit(jh->b_committed_data,
- maxblocks, next);
- jbd_unlock_bh_state(bh);
- }
- return -1;
-}
-
-/**
- * find_next_usable_block()
- * @start: the starting block (group relative) to find next
- * allocatable block in bitmap.
- * @bh: bufferhead contains the block group bitmap
- * @maxblocks: the ending block (group relative) for the search
- *
- * Find an allocatable block in a bitmap. We honor both the bitmap and
- * its last-committed copy (if that exists), and perform the "most
- * appropriate allocation" algorithm of looking for a free block near
- * the initial goal; then for a free byte somewhere in the bitmap; then
- * for any free bit in the bitmap.
- */
-static ext4_grpblk_t
-find_next_usable_block(ext4_grpblk_t start, struct buffer_head *bh,
- ext4_grpblk_t maxblocks)
-{
- ext4_grpblk_t here, next;
- char *p, *r;
-
- if (start > 0) {
- /*
- * The goal was occupied; search forward for a free
- * block within the next XX blocks.
- *
- * end_goal is more or less random, but it has to be
- * less than EXT4_BLOCKS_PER_GROUP. Aligning up to the
- * next 64-bit boundary is simple..
- */
- ext4_grpblk_t end_goal = (start + 63) & ~63;
- if (end_goal > maxblocks)
- end_goal = maxblocks;
- here = ext4_find_next_zero_bit(bh->b_data, end_goal, start);
- if (here < end_goal && ext4_test_allocatable(here, bh))
- return here;
- ext4_debug("Bit not found near goal\n");
+ set_bitmap_uptodate(bh);
+ unlock_buffer(bh);
+ goto verify;
}
-
- here = start;
- if (here < 0)
- here = 0;
-
- p = ((char *)bh->b_data) + (here >> 3);
- r = memscan(p, 0, ((maxblocks + 7) >> 3) - (here >> 3));
- next = (r - ((char *)bh->b_data)) << 3;
-
- if (next < maxblocks && next >= start && ext4_test_allocatable(next, bh))
- return next;
-
/*
- * The bitmap search --- search forward alternately through the actual
- * bitmap and the last-committed copy until we find a bit free in
- * both
+ * submit the buffer_head for reading
*/
- here = bitmap_search_next_usable_block(here, bh, maxblocks);
- return here;
+ set_buffer_new(bh);
+ trace_ext4_read_block_bitmap_load(sb, block_group);
+ bh->b_end_io = ext4_end_bitmap_read;
+ get_bh(bh);
+ submit_bh(READ | REQ_META | REQ_PRIO, bh);
+ return bh;
+verify:
+ ext4_validate_block_bitmap(sb, desc, block_group, bh);
+ if (buffer_verified(bh))
+ return bh;
+ put_bh(bh);
+ return NULL;
}
-/**
- * claim_block()
- * @block: the free block (group relative) to allocate
- * @bh: the bufferhead containts the block group bitmap
- *
- * We think we can allocate this block in this bitmap. Try to set the bit.
- * If that succeeds then check that nobody has allocated and then freed the
- * block since we saw that is was not marked in b_committed_data. If it _was_
- * allocated and freed then clear the bit in the bitmap again and return
- * zero (failure).
- */
-static inline int
-claim_block(spinlock_t *lock, ext4_grpblk_t block, struct buffer_head *bh)
+/* Returns 0 on success, 1 on error */
+int ext4_wait_block_bitmap(struct super_block *sb, ext4_group_t block_group,
+ struct buffer_head *bh)
{
- struct journal_head *jh = bh2jh(bh);
- int ret;
+ struct ext4_group_desc *desc;
- if (ext4_set_bit_atomic(lock, block, bh->b_data))
+ if (!buffer_new(bh))
return 0;
- jbd_lock_bh_state(bh);
- if (jh->b_committed_data && ext4_test_bit(block,jh->b_committed_data)) {
- ext4_clear_bit_atomic(lock, block, bh->b_data);
- ret = 0;
- } else {
- ret = 1;
- }
- jbd_unlock_bh_state(bh);
- return ret;
-}
-
-/**
- * ext4_try_to_allocate()
- * @sb: superblock
- * @handle: handle to this transaction
- * @group: given allocation block group
- * @bitmap_bh: bufferhead holds the block bitmap
- * @grp_goal: given target block within the group
- * @count: target number of blocks to allocate
- * @my_rsv: reservation window
- *
- * Attempt to allocate blocks within a give range. Set the range of allocation
- * first, then find the first free bit(s) from the bitmap (within the range),
- * and at last, allocate the blocks by claiming the found free bit as allocated.
- *
- * To set the range of this allocation:
- * if there is a reservation window, only try to allocate block(s) from the
- * file's own reservation window;
- * Otherwise, the allocation range starts from the give goal block, ends at
- * the block group's last block.
- *
- * If we failed to allocate the desired block then we may end up crossing to a
- * new bitmap. In that case we must release write access to the old one via
- * ext4_journal_release_buffer(), else we'll run out of credits.
- */
-static ext4_grpblk_t
-ext4_try_to_allocate(struct super_block *sb, handle_t *handle,
- ext4_group_t group, struct buffer_head *bitmap_bh,
- ext4_grpblk_t grp_goal, unsigned long *count,
- struct ext4_reserve_window *my_rsv)
-{
- ext4_fsblk_t group_first_block;
- ext4_grpblk_t start, end;
- unsigned long num = 0;
-
- /* we do allocation within the reservation window if we have a window */
- if (my_rsv) {
- group_first_block = ext4_group_first_block_no(sb, group);
- if (my_rsv->_rsv_start >= group_first_block)
- start = my_rsv->_rsv_start - group_first_block;
- else
- /* reservation window cross group boundary */
- start = 0;
- end = my_rsv->_rsv_end - group_first_block + 1;
- if (end > EXT4_BLOCKS_PER_GROUP(sb))
- /* reservation window crosses group boundary */
- end = EXT4_BLOCKS_PER_GROUP(sb);
- if ((start <= grp_goal) && (grp_goal < end))
- start = grp_goal;
- else
- grp_goal = -1;
- } else {
- if (grp_goal > 0)
- start = grp_goal;
- else
- start = 0;
- end = EXT4_BLOCKS_PER_GROUP(sb);
- }
-
- BUG_ON(start > EXT4_BLOCKS_PER_GROUP(sb));
-
-repeat:
- if (grp_goal < 0 || !ext4_test_allocatable(grp_goal, bitmap_bh)) {
- grp_goal = find_next_usable_block(start, bitmap_bh, end);
- if (grp_goal < 0)
- goto fail_access;
- if (!my_rsv) {
- int i;
-
- for (i = 0; i < 7 && grp_goal > start &&
- ext4_test_allocatable(grp_goal - 1,
- bitmap_bh);
- i++, grp_goal--)
- ;
- }
- }
- start = grp_goal;
-
- if (!claim_block(sb_bgl_lock(EXT4_SB(sb), group),
- grp_goal, bitmap_bh)) {
- /*
- * The block was allocated by another thread, or it was
- * allocated and then freed by another thread
- */
- start++;
- grp_goal++;
- if (start >= end)
- goto fail_access;
- goto repeat;
- }
- num++;
- grp_goal++;
- while (num < *count && grp_goal < end
- && ext4_test_allocatable(grp_goal, bitmap_bh)
- && claim_block(sb_bgl_lock(EXT4_SB(sb), group),
- grp_goal, bitmap_bh)) {
- num++;
- grp_goal++;
- }
- *count = num;
- return grp_goal - num;
-fail_access:
- *count = num;
- return -1;
-}
-
-/**
- * find_next_reservable_window():
- * find a reservable space within the given range.
- * It does not allocate the reservation window for now:
- * alloc_new_reservation() will do the work later.
- *
- * @search_head: the head of the searching list;
- * This is not necessarily the list head of the whole filesystem
- *
- * We have both head and start_block to assist the search
- * for the reservable space. The list starts from head,
- * but we will shift to the place where start_block is,
- * then start from there, when looking for a reservable space.
- *
- * @size: the target new reservation window size
- *
- * @group_first_block: the first block we consider to start
- * the real search from
- *
- * @last_block:
- * the maximum block number that our goal reservable space
- * could start from. This is normally the last block in this
- * group. The search will end when we found the start of next
- * possible reservable space is out of this boundary.
- * This could handle the cross boundary reservation window
- * request.
- *
- * basically we search from the given range, rather than the whole
- * reservation double linked list, (start_block, last_block)
- * to find a free region that is of my size and has not
- * been reserved.
- *
- */
-static int find_next_reservable_window(
- struct ext4_reserve_window_node *search_head,
- struct ext4_reserve_window_node *my_rsv,
- struct super_block * sb,
- ext4_fsblk_t start_block,
- ext4_fsblk_t last_block)
-{
- struct rb_node *next;
- struct ext4_reserve_window_node *rsv, *prev;
- ext4_fsblk_t cur;
- int size = my_rsv->rsv_goal_size;
-
- /* TODO: make the start of the reservation window byte-aligned */
- /* cur = *start_block & ~7;*/
- cur = start_block;
- rsv = search_head;
- if (!rsv)
- return -1;
-
- while (1) {
- if (cur <= rsv->rsv_end)
- cur = rsv->rsv_end + 1;
-
- /* TODO?
- * in the case we could not find a reservable space
- * that is what is expected, during the re-search, we could
- * remember what's the largest reservable space we could have
- * and return that one.
- *
- * For now it will fail if we could not find the reservable
- * space with expected-size (or more)...
- */
- if (cur > last_block)
- return -1; /* fail */
-
- prev = rsv;
- next = rb_next(&rsv->rsv_node);
- rsv = rb_entry(next,struct ext4_reserve_window_node,rsv_node);
-
- /*
- * Reached the last reservation, we can just append to the
- * previous one.
- */
- if (!next)
- break;
-
- if (cur + size <= rsv->rsv_start) {
- /*
- * Found a reserveable space big enough. We could
- * have a reservation across the group boundary here
- */
- break;
- }
- }
- /*
- * we come here either :
- * when we reach the end of the whole list,
- * and there is empty reservable space after last entry in the list.
- * append it to the end of the list.
- *
- * or we found one reservable space in the middle of the list,
- * return the reservation window that we could append to.
- * succeed.
- */
-
- if ((prev != my_rsv) && (!rsv_is_empty(&my_rsv->rsv_window)))
- rsv_window_remove(sb, my_rsv);
-
- /*
- * Let's book the whole avaliable window for now. We will check the
- * disk bitmap later and then, if there are free blocks then we adjust
- * the window size if it's larger than requested.
- * Otherwise, we will remove this node from the tree next time
- * call find_next_reservable_window.
- */
- my_rsv->rsv_start = cur;
- my_rsv->rsv_end = cur + size - 1;
- my_rsv->rsv_alloc_hit = 0;
-
- if (prev != my_rsv)
- ext4_rsv_window_add(sb, my_rsv);
-
- return 0;
-}
-
-/**
- * alloc_new_reservation()--allocate a new reservation window
- *
- * To make a new reservation, we search part of the filesystem
- * reservation list (the list that inside the group). We try to
- * allocate a new reservation window near the allocation goal,
- * or the beginning of the group, if there is no goal.
- *
- * We first find a reservable space after the goal, then from
- * there, we check the bitmap for the first free block after
- * it. If there is no free block until the end of group, then the
- * whole group is full, we failed. Otherwise, check if the free
- * block is inside the expected reservable space, if so, we
- * succeed.
- * If the first free block is outside the reservable space, then
- * start from the first free block, we search for next available
- * space, and go on.
- *
- * on succeed, a new reservation will be found and inserted into the list
- * It contains at least one free block, and it does not overlap with other
- * reservation windows.
- *
- * failed: we failed to find a reservation window in this group
- *
- * @rsv: the reservation
- *
- * @grp_goal: The goal (group-relative). It is where the search for a
- * free reservable space should start from.
- * if we have a grp_goal(grp_goal >0 ), then start from there,
- * no grp_goal(grp_goal = -1), we start from the first block
- * of the group.
- *
- * @sb: the super block
- * @group: the group we are trying to allocate in
- * @bitmap_bh: the block group block bitmap
- *
- */
-static int alloc_new_reservation(struct ext4_reserve_window_node *my_rsv,
- ext4_grpblk_t grp_goal, struct super_block *sb,
- ext4_group_t group, struct buffer_head *bitmap_bh)
-{
- struct ext4_reserve_window_node *search_head;
- ext4_fsblk_t group_first_block, group_end_block, start_block;
- ext4_grpblk_t first_free_block;
- struct rb_root *fs_rsv_root = &EXT4_SB(sb)->s_rsv_window_root;
- unsigned long size;
- int ret;
- spinlock_t *rsv_lock = &EXT4_SB(sb)->s_rsv_window_lock;
-
- group_first_block = ext4_group_first_block_no(sb, group);
- group_end_block = group_first_block + (EXT4_BLOCKS_PER_GROUP(sb) - 1);
-
- if (grp_goal < 0)
- start_block = group_first_block;
- else
- start_block = grp_goal + group_first_block;
-
- size = my_rsv->rsv_goal_size;
-
- if (!rsv_is_empty(&my_rsv->rsv_window)) {
- /*
- * if the old reservation is cross group boundary
- * and if the goal is inside the old reservation window,
- * we will come here when we just failed to allocate from
- * the first part of the window. We still have another part
- * that belongs to the next group. In this case, there is no
- * point to discard our window and try to allocate a new one
- * in this group(which will fail). we should
- * keep the reservation window, just simply move on.
- *
- * Maybe we could shift the start block of the reservation
- * window to the first block of next group.
- */
-
- if ((my_rsv->rsv_start <= group_end_block) &&
- (my_rsv->rsv_end > group_end_block) &&
- (start_block >= my_rsv->rsv_start))
- return -1;
-
- if ((my_rsv->rsv_alloc_hit >
- (my_rsv->rsv_end - my_rsv->rsv_start + 1) / 2)) {
- /*
- * if the previously allocation hit ratio is
- * greater than 1/2, then we double the size of
- * the reservation window the next time,
- * otherwise we keep the same size window
- */
- size = size * 2;
- if (size > EXT4_MAX_RESERVE_BLOCKS)
- size = EXT4_MAX_RESERVE_BLOCKS;
- my_rsv->rsv_goal_size= size;
- }
- }
-
- spin_lock(rsv_lock);
- /*
- * shift the search start to the window near the goal block
- */
- search_head = search_reserve_window(fs_rsv_root, start_block);
-
- /*
- * find_next_reservable_window() simply finds a reservable window
- * inside the given range(start_block, group_end_block).
- *
- * To make sure the reservation window has a free bit inside it, we
- * need to check the bitmap after we found a reservable window.
- */
-retry:
- ret = find_next_reservable_window(search_head, my_rsv, sb,
- start_block, group_end_block);
-
- if (ret == -1) {
- if (!rsv_is_empty(&my_rsv->rsv_window))
- rsv_window_remove(sb, my_rsv);
- spin_unlock(rsv_lock);
- return -1;
- }
-
- /*
- * On success, find_next_reservable_window() returns the
- * reservation window where there is a reservable space after it.
- * Before we reserve this reservable space, we need
- * to make sure there is at least a free block inside this region.
- *
- * searching the first free bit on the block bitmap and copy of
- * last committed bitmap alternatively, until we found a allocatable
- * block. Search start from the start block of the reservable space
- * we just found.
- */
- spin_unlock(rsv_lock);
- first_free_block = bitmap_search_next_usable_block(
- my_rsv->rsv_start - group_first_block,
- bitmap_bh, group_end_block - group_first_block + 1);
-
- if (first_free_block < 0) {
- /*
- * no free block left on the bitmap, no point
- * to reserve the space. return failed.
- */
- spin_lock(rsv_lock);
- if (!rsv_is_empty(&my_rsv->rsv_window))
- rsv_window_remove(sb, my_rsv);
- spin_unlock(rsv_lock);
- return -1; /* failed */
+ desc = ext4_get_group_desc(sb, block_group, NULL);
+ if (!desc)
+ return 1;
+ wait_on_buffer(bh);
+ if (!buffer_uptodate(bh)) {
+ ext4_error(sb, "Cannot read block bitmap - "
+ "block_group = %u, block_bitmap = %llu",
+ block_group, (unsigned long long) bh->b_blocknr);
+ return 1;
}
-
- start_block = first_free_block + group_first_block;
- /*
- * check if the first free block is within the
- * free space we just reserved
- */
- if (start_block >= my_rsv->rsv_start && start_block <= my_rsv->rsv_end)
- return 0; /* success */
- /*
- * if the first free bit we found is out of the reservable space
- * continue search for next reservable space,
- * start from where the free block is,
- * we also shift the list head to where we stopped last time
- */
- search_head = my_rsv;
- spin_lock(rsv_lock);
- goto retry;
+ clear_buffer_new(bh);
+ /* Panic or remount fs read-only if block bitmap is invalid */
+ ext4_validate_block_bitmap(sb, desc, block_group, bh);
+ /* ...but check for error just in case errors=continue. */
+ return !buffer_verified(bh);
}
-/**
- * try_to_extend_reservation()
- * @my_rsv: given reservation window
- * @sb: super block
- * @size: the delta to extend
- *
- * Attempt to expand the reservation window large enough to have
- * required number of free blocks
- *
- * Since ext4_try_to_allocate() will always allocate blocks within
- * the reservation window range, if the window size is too small,
- * multiple blocks allocation has to stop at the end of the reservation
- * window. To make this more efficient, given the total number of
- * blocks needed and the current size of the window, we try to
- * expand the reservation window size if necessary on a best-effort
- * basis before ext4_new_blocks() tries to allocate blocks,
- */
-static void try_to_extend_reservation(struct ext4_reserve_window_node *my_rsv,
- struct super_block *sb, int size)
+struct buffer_head *
+ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
{
- struct ext4_reserve_window_node *next_rsv;
- struct rb_node *next;
- spinlock_t *rsv_lock = &EXT4_SB(sb)->s_rsv_window_lock;
-
- if (!spin_trylock(rsv_lock))
- return;
-
- next = rb_next(&my_rsv->rsv_node);
+ struct buffer_head *bh;
- if (!next)
- my_rsv->rsv_end += size;
- else {
- next_rsv = rb_entry(next, struct ext4_reserve_window_node, rsv_node);
-
- if ((next_rsv->rsv_start - my_rsv->rsv_end - 1) >= size)
- my_rsv->rsv_end += size;
- else
- my_rsv->rsv_end = next_rsv->rsv_start - 1;
+ bh = ext4_read_block_bitmap_nowait(sb, block_group);
+ if (!bh)
+ return NULL;
+ if (ext4_wait_block_bitmap(sb, block_group, bh)) {
+ put_bh(bh);
+ return NULL;
}
- spin_unlock(rsv_lock);
+ return bh;
}
/**
- * ext4_try_to_allocate_with_rsv()
- * @sb: superblock
- * @handle: handle to this transaction
- * @group: given allocation block group
- * @bitmap_bh: bufferhead holds the block bitmap
- * @grp_goal: given target block within the group
- * @count: target number of blocks to allocate
- * @my_rsv: reservation window
- * @errp: pointer to store the error code
- *
- * This is the main function used to allocate a new block and its reservation
- * window.
- *
- * Each time when a new block allocation is need, first try to allocate from
- * its own reservation. If it does not have a reservation window, instead of
- * looking for a free bit on bitmap first, then look up the reservation list to
- * see if it is inside somebody else's reservation window, we try to allocate a
- * reservation window for it starting from the goal first. Then do the block
- * allocation within the reservation window.
- *
- * This will avoid keeping on searching the reservation list again and
- * again when somebody is looking for a free block (without
- * reservation), and there are lots of free blocks, but they are all
- * being reserved.
- *
- * We use a red-black tree for the per-filesystem reservation list.
+ * ext4_has_free_clusters()
+ * @sbi: in-core super block structure.
+ * @nclusters: number of needed blocks
+ * @flags: flags from ext4_mb_new_blocks()
*
+ * Check if filesystem has nclusters free & available for allocation.
+ * On success return 1, return 0 on failure.
*/
-static ext4_grpblk_t
-ext4_try_to_allocate_with_rsv(struct super_block *sb, handle_t *handle,
- ext4_group_t group, struct buffer_head *bitmap_bh,
- ext4_grpblk_t grp_goal,
- struct ext4_reserve_window_node * my_rsv,
- unsigned long *count, int *errp)
+static int ext4_has_free_clusters(struct ext4_sb_info *sbi,
+ s64 nclusters, unsigned int flags)
{
- ext4_fsblk_t group_first_block, group_last_block;
- ext4_grpblk_t ret = 0;
- int fatal;
- unsigned long num = *count;
+ s64 free_clusters, dirty_clusters, rsv, resv_clusters;
+ struct percpu_counter *fcc = &sbi->s_freeclusters_counter;
+ struct percpu_counter *dcc = &sbi->s_dirtyclusters_counter;
- *errp = 0;
+ free_clusters = percpu_counter_read_positive(fcc);
+ dirty_clusters = percpu_counter_read_positive(dcc);
+ resv_clusters = atomic64_read(&sbi->s_resv_clusters);
/*
- * Make sure we use undo access for the bitmap, because it is critical
- * that we do the frozen_data COW on bitmap buffers in all cases even
- * if the buffer is in BJ_Forget state in the committing transaction.
+ * r_blocks_count should always be multiple of the cluster ratio so
+ * we are safe to do a plane bit shift only.
*/
- BUFFER_TRACE(bitmap_bh, "get undo access for new block");
- fatal = ext4_journal_get_undo_access(handle, bitmap_bh);
- if (fatal) {
- *errp = fatal;
- return -1;
- }
+ rsv = (ext4_r_blocks_count(sbi->s_es) >> sbi->s_cluster_bits) +
+ resv_clusters;
- /*
- * we don't deal with reservation when
- * filesystem is mounted without reservation
- * or the file is not a regular file
- * or last attempt to allocate a block with reservation turned on failed
- */
- if (my_rsv == NULL ) {
- ret = ext4_try_to_allocate(sb, handle, group, bitmap_bh,
- grp_goal, count, NULL);
- goto out;
+ if (free_clusters - (nclusters + rsv + dirty_clusters) <
+ EXT4_FREECLUSTERS_WATERMARK) {
+ free_clusters = percpu_counter_sum_positive(fcc);
+ dirty_clusters = percpu_counter_sum_positive(dcc);
}
- /*
- * grp_goal is a group relative block number (if there is a goal)
- * 0 <= grp_goal < EXT4_BLOCKS_PER_GROUP(sb)
- * first block is a filesystem wide block number
- * first block is the block number of the first block in this group
+ /* Check whether we have space after accounting for current
+ * dirty clusters & root reserved clusters.
*/
- group_first_block = ext4_group_first_block_no(sb, group);
- group_last_block = group_first_block + (EXT4_BLOCKS_PER_GROUP(sb) - 1);
+ if (free_clusters >= (rsv + nclusters + dirty_clusters))
+ return 1;
- /*
- * Basically we will allocate a new block from inode's reservation
- * window.
- *
- * We need to allocate a new reservation window, if:
- * a) inode does not have a reservation window; or
- * b) last attempt to allocate a block from existing reservation
- * failed; or
- * c) we come here with a goal and with a reservation window
- *
- * We do not need to allocate a new reservation window if we come here
- * at the beginning with a goal and the goal is inside the window, or
- * we don't have a goal but already have a reservation window.
- * then we could go to allocate from the reservation window directly.
- */
- while (1) {
- if (rsv_is_empty(&my_rsv->rsv_window) || (ret < 0) ||
- !goal_in_my_reservation(&my_rsv->rsv_window,
- grp_goal, group, sb)) {
- if (my_rsv->rsv_goal_size < *count)
- my_rsv->rsv_goal_size = *count;
- ret = alloc_new_reservation(my_rsv, grp_goal, sb,
- group, bitmap_bh);
- if (ret < 0)
- break; /* failed */
-
- if (!goal_in_my_reservation(&my_rsv->rsv_window,
- grp_goal, group, sb))
- grp_goal = -1;
- } else if (grp_goal >= 0) {
- int curr = my_rsv->rsv_end -
- (grp_goal + group_first_block) + 1;
-
- if (curr < *count)
- try_to_extend_reservation(my_rsv, sb,
- *count - curr);
- }
+ /* Hm, nope. Are (enough) root reserved clusters available? */
+ if (uid_eq(sbi->s_resuid, current_fsuid()) ||
+ (!gid_eq(sbi->s_resgid, GLOBAL_ROOT_GID) && in_group_p(sbi->s_resgid)) ||
+ capable(CAP_SYS_RESOURCE) ||
+ (flags & EXT4_MB_USE_ROOT_BLOCKS)) {
- if ((my_rsv->rsv_start > group_last_block) ||
- (my_rsv->rsv_end < group_first_block)) {
- rsv_window_dump(&EXT4_SB(sb)->s_rsv_window_root, 1);
- BUG();
- }
- ret = ext4_try_to_allocate(sb, handle, group, bitmap_bh,
- grp_goal, &num, &my_rsv->rsv_window);
- if (ret >= 0) {
- my_rsv->rsv_alloc_hit += num;
- *count = num;
- break; /* succeed */
- }
- num = *count;
+ if (free_clusters >= (nclusters + dirty_clusters +
+ resv_clusters))
+ return 1;
}
-out:
- if (ret >= 0) {
- BUFFER_TRACE(bitmap_bh, "journal_dirty_metadata for "
- "bitmap block");
- fatal = ext4_journal_dirty_metadata(handle, bitmap_bh);
- if (fatal) {
- *errp = fatal;
- return -1;
- }
- return ret;
+ /* No free blocks. Let's see if we can dip into reserved pool */
+ if (flags & EXT4_MB_USE_RESERVED) {
+ if (free_clusters >= (nclusters + dirty_clusters))
+ return 1;
}
- BUFFER_TRACE(bitmap_bh, "journal_release_buffer");
- ext4_journal_release_buffer(handle, bitmap_bh);
- return ret;
+ return 0;
}
-/**
- * ext4_has_free_blocks()
- * @sbi: in-core super block structure.
- *
- * Check if filesystem has at least 1 free block available for allocation.
- */
-static int ext4_has_free_blocks(struct ext4_sb_info *sbi)
+int ext4_claim_free_clusters(struct ext4_sb_info *sbi,
+ s64 nclusters, unsigned int flags)
{
- ext4_fsblk_t free_blocks, root_blocks;
-
- free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter);
- root_blocks = ext4_r_blocks_count(sbi->s_es);
- if (free_blocks < root_blocks + 1 && !capable(CAP_SYS_RESOURCE) &&
- sbi->s_resuid != current->fsuid &&
- (sbi->s_resgid == 0 || !in_group_p (sbi->s_resgid))) {
+ if (ext4_has_free_clusters(sbi, nclusters, flags)) {
+ percpu_counter_add(&sbi->s_dirtyclusters_counter, nclusters);
return 0;
- }
- return 1;
+ } else
+ return -ENOSPC;
}
/**
@@ -1572,14 +586,16 @@ static int ext4_has_free_blocks(struct ext4_sb_info *sbi)
*
* ext4_should_retry_alloc() is called when ENOSPC is returned, and if
* it is profitable to retry the operation, this function will wait
- * for the current or commiting transaction to complete, and then
+ * for the current or committing transaction to complete, and then
* return TRUE.
*
* if the total number of retries exceed three times, return FALSE.
*/
int ext4_should_retry_alloc(struct super_block *sb, int *retries)
{
- if (!ext4_has_free_blocks(EXT4_SB(sb)) || (*retries)++ > 3)
+ if (!ext4_has_free_clusters(EXT4_SB(sb), 1, 0) ||
+ (*retries)++ > 3 ||
+ !EXT4_SB(sb)->s_journal)
return 0;
jbd_debug(1, "%s: retrying operation after ENOSPC\n", sb->s_id);
@@ -1587,336 +603,67 @@ int ext4_should_retry_alloc(struct super_block *sb, int *retries)
return jbd2_journal_force_commit_nested(EXT4_SB(sb)->s_journal);
}
-/**
- * ext4_new_blocks_old() -- core block(s) allocation function
- * @handle: handle to this transaction
- * @inode: file inode
- * @goal: given target block(filesystem wide)
- * @count: target number of blocks to allocate
- * @errp: error code
+/*
+ * ext4_new_meta_blocks() -- allocate block for meta data (indexing) blocks
*
- * ext4_new_blocks uses a goal block to assist allocation. It tries to
- * allocate block(s) from the block group contains the goal block first. If that
- * fails, it will try to allocate block(s) from other block groups without
- * any specific goal block.
+ * @handle: handle to this transaction
+ * @inode: file inode
+ * @goal: given target block(filesystem wide)
+ * @count: pointer to total number of clusters needed
+ * @errp: error code
*
+ * Return 1st allocated block number on success, *count stores total account
+ * error stores in errp pointer
*/
-ext4_fsblk_t ext4_new_blocks_old(handle_t *handle, struct inode *inode,
- ext4_fsblk_t goal, unsigned long *count, int *errp)
-{
- struct buffer_head *bitmap_bh = NULL;
- struct buffer_head *gdp_bh;
- ext4_group_t group_no;
- ext4_group_t goal_group;
- ext4_grpblk_t grp_target_blk; /* blockgroup relative goal block */
- ext4_grpblk_t grp_alloc_blk; /* blockgroup-relative allocated block*/
- ext4_fsblk_t ret_block; /* filesyetem-wide allocated block */
- ext4_group_t bgi; /* blockgroup iteration index */
- int fatal = 0, err;
- int performed_allocation = 0;
- ext4_grpblk_t free_blocks; /* number of free blocks in a group */
- struct super_block *sb;
- struct ext4_group_desc *gdp;
- struct ext4_super_block *es;
- struct ext4_sb_info *sbi;
- struct ext4_reserve_window_node *my_rsv = NULL;
- struct ext4_block_alloc_info *block_i;
- unsigned short windowsz = 0;
- ext4_group_t ngroups;
- unsigned long num = *count;
-
- *errp = -ENOSPC;
- sb = inode->i_sb;
- if (!sb) {
- printk("ext4_new_block: nonexistent device");
- return 0;
- }
-
- /*
- * Check quota for allocation of this block.
- */
- if (DQUOT_ALLOC_BLOCK(inode, num)) {
- *errp = -EDQUOT;
- return 0;
- }
-
- sbi = EXT4_SB(sb);
- es = EXT4_SB(sb)->s_es;
- ext4_debug("goal=%llu.\n", goal);
- /*
- * Allocate a block from reservation only when
- * filesystem is mounted with reservation(default,-o reservation), and
- * it's a regular file, and
- * the desired window size is greater than 0 (One could use ioctl
- * command EXT4_IOC_SETRSVSZ to set the window size to 0 to turn off
- * reservation on that particular file)
- */
- block_i = EXT4_I(inode)->i_block_alloc_info;
- if (block_i && ((windowsz = block_i->rsv_window_node.rsv_goal_size) > 0))
- my_rsv = &block_i->rsv_window_node;
-
- if (!ext4_has_free_blocks(sbi)) {
- *errp = -ENOSPC;
- goto out;
- }
-
- /*
- * First, test whether the goal block is free.
- */
- if (goal < le32_to_cpu(es->s_first_data_block) ||
- goal >= ext4_blocks_count(es))
- goal = le32_to_cpu(es->s_first_data_block);
- ext4_get_group_no_and_offset(sb, goal, &group_no, &grp_target_blk);
- goal_group = group_no;
-retry_alloc:
- gdp = ext4_get_group_desc(sb, group_no, &gdp_bh);
- if (!gdp)
- goto io_error;
-
- free_blocks = le16_to_cpu(gdp->bg_free_blocks_count);
- /*
- * if there is not enough free blocks to make a new resevation
- * turn off reservation for this allocation
- */
- if (my_rsv && (free_blocks < windowsz)
- && (rsv_is_empty(&my_rsv->rsv_window)))
- my_rsv = NULL;
-
- if (free_blocks > 0) {
- bitmap_bh = read_block_bitmap(sb, group_no);
- if (!bitmap_bh)
- goto io_error;
- grp_alloc_blk = ext4_try_to_allocate_with_rsv(sb, handle,
- group_no, bitmap_bh, grp_target_blk,
- my_rsv, &num, &fatal);
- if (fatal)
- goto out;
- if (grp_alloc_blk >= 0)
- goto allocated;
- }
-
- ngroups = EXT4_SB(sb)->s_groups_count;
- smp_rmb();
-
- /*
- * Now search the rest of the groups. We assume that
- * group_no and gdp correctly point to the last group visited.
- */
- for (bgi = 0; bgi < ngroups; bgi++) {
- group_no++;
- if (group_no >= ngroups)
- group_no = 0;
- gdp = ext4_get_group_desc(sb, group_no, &gdp_bh);
- if (!gdp)
- goto io_error;
- free_blocks = le16_to_cpu(gdp->bg_free_blocks_count);
- /*
- * skip this group if the number of
- * free blocks is less than half of the reservation
- * window size.
- */
- if (free_blocks <= (windowsz/2))
- continue;
-
- brelse(bitmap_bh);
- bitmap_bh = read_block_bitmap(sb, group_no);
- if (!bitmap_bh)
- goto io_error;
- /*
- * try to allocate block(s) from this group, without a goal(-1).
- */
- grp_alloc_blk = ext4_try_to_allocate_with_rsv(sb, handle,
- group_no, bitmap_bh, -1, my_rsv,
- &num, &fatal);
- if (fatal)
- goto out;
- if (grp_alloc_blk >= 0)
- goto allocated;
- }
- /*
- * We may end up a bogus ealier ENOSPC error due to
- * filesystem is "full" of reservations, but
- * there maybe indeed free blocks avaliable on disk
- * In this case, we just forget about the reservations
- * just do block allocation as without reservations.
- */
- if (my_rsv) {
- my_rsv = NULL;
- windowsz = 0;
- group_no = goal_group;
- goto retry_alloc;
- }
- /* No space left on the device */
- *errp = -ENOSPC;
- goto out;
-
-allocated:
-
- ext4_debug("using block group %lu(%d)\n",
- group_no, gdp->bg_free_blocks_count);
-
- BUFFER_TRACE(gdp_bh, "get_write_access");
- fatal = ext4_journal_get_write_access(handle, gdp_bh);
- if (fatal)
- goto out;
-
- ret_block = grp_alloc_blk + ext4_group_first_block_no(sb, group_no);
-
- if (in_range(ext4_block_bitmap(sb, gdp), ret_block, num) ||
- in_range(ext4_inode_bitmap(sb, gdp), ret_block, num) ||
- in_range(ret_block, ext4_inode_table(sb, gdp),
- EXT4_SB(sb)->s_itb_per_group) ||
- in_range(ret_block + num - 1, ext4_inode_table(sb, gdp),
- EXT4_SB(sb)->s_itb_per_group)) {
- ext4_error(sb, "ext4_new_block",
- "Allocating block in system zone - "
- "blocks from %llu, length %lu",
- ret_block, num);
- goto out;
- }
-
- performed_allocation = 1;
-
-#ifdef CONFIG_JBD2_DEBUG
- {
- struct buffer_head *debug_bh;
-
- /* Record bitmap buffer state in the newly allocated block */
- debug_bh = sb_find_get_block(sb, ret_block);
- if (debug_bh) {
- BUFFER_TRACE(debug_bh, "state when allocated");
- BUFFER_TRACE2(debug_bh, bitmap_bh, "bitmap state");
- brelse(debug_bh);
- }
- }
- jbd_lock_bh_state(bitmap_bh);
- spin_lock(sb_bgl_lock(sbi, group_no));
- if (buffer_jbd(bitmap_bh) && bh2jh(bitmap_bh)->b_committed_data) {
- int i;
-
- for (i = 0; i < num; i++) {
- if (ext4_test_bit(grp_alloc_blk+i,
- bh2jh(bitmap_bh)->b_committed_data)) {
- printk("%s: block was unexpectedly set in "
- "b_committed_data\n", __FUNCTION__);
- }
- }
- }
- ext4_debug("found bit %d\n", grp_alloc_blk);
- spin_unlock(sb_bgl_lock(sbi, group_no));
- jbd_unlock_bh_state(bitmap_bh);
-#endif
-
- if (ret_block + num - 1 >= ext4_blocks_count(es)) {
- ext4_error(sb, "ext4_new_block",
- "block(%llu) >= blocks count(%llu) - "
- "block_group = %lu, es == %p ", ret_block,
- ext4_blocks_count(es), group_no, es);
- goto out;
- }
-
- /*
- * It is up to the caller to add the new buffer to a journal
- * list of some description. We don't know in advance whether
- * the caller wants to use it as metadata or data.
- */
- spin_lock(sb_bgl_lock(sbi, group_no));
- if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))
- gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
- gdp->bg_free_blocks_count =
- cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count)-num);
- gdp->bg_checksum = ext4_group_desc_csum(sbi, group_no, gdp);
- spin_unlock(sb_bgl_lock(sbi, group_no));
- percpu_counter_sub(&sbi->s_freeblocks_counter, num);
-
- BUFFER_TRACE(gdp_bh, "journal_dirty_metadata for group descriptor");
- err = ext4_journal_dirty_metadata(handle, gdp_bh);
- if (!fatal)
- fatal = err;
-
- sb->s_dirt = 1;
- if (fatal)
- goto out;
-
- *errp = 0;
- brelse(bitmap_bh);
- DQUOT_FREE_BLOCK(inode, *count-num);
- *count = num;
- return ret_block;
-
-io_error:
- *errp = -EIO;
-out:
- if (fatal) {
- *errp = fatal;
- ext4_std_error(sb, fatal);
- }
- /*
- * Undo the block allocation
- */
- if (!performed_allocation)
- DQUOT_FREE_BLOCK(inode, *count);
- brelse(bitmap_bh);
- return 0;
-}
-
-ext4_fsblk_t ext4_new_block(handle_t *handle, struct inode *inode,
- ext4_fsblk_t goal, int *errp)
+ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
+ ext4_fsblk_t goal, unsigned int flags,
+ unsigned long *count, int *errp)
{
struct ext4_allocation_request ar;
ext4_fsblk_t ret;
- if (!test_opt(inode->i_sb, MBALLOC)) {
- unsigned long count = 1;
- ret = ext4_new_blocks_old(handle, inode, goal, &count, errp);
- return ret;
- }
-
memset(&ar, 0, sizeof(ar));
+ /* Fill with neighbour allocated blocks */
ar.inode = inode;
ar.goal = goal;
- ar.len = 1;
- ret = ext4_mb_new_blocks(handle, &ar, errp);
- return ret;
-}
-
-ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode,
- ext4_fsblk_t goal, unsigned long *count, int *errp)
-{
- struct ext4_allocation_request ar;
- ext4_fsblk_t ret;
+ ar.len = count ? *count : 1;
+ ar.flags = flags;
- if (!test_opt(inode->i_sb, MBALLOC)) {
- ret = ext4_new_blocks_old(handle, inode, goal, count, errp);
- return ret;
- }
-
- memset(&ar, 0, sizeof(ar));
- ar.inode = inode;
- ar.goal = goal;
- ar.len = *count;
ret = ext4_mb_new_blocks(handle, &ar, errp);
- *count = ar.len;
+ if (count)
+ *count = ar.len;
+ /*
+ * Account for the allocated meta blocks. We will never
+ * fail EDQUOT for metdata, but we do account for it.
+ */
+ if (!(*errp) &&
+ ext4_test_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED)) {
+ spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
+ EXT4_I(inode)->i_allocated_meta_blocks += ar.len;
+ spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
+ dquot_alloc_block_nofail(inode,
+ EXT4_C2B(EXT4_SB(inode->i_sb), ar.len));
+ }
return ret;
}
-
/**
- * ext4_count_free_blocks() -- count filesystem free blocks
+ * ext4_count_free_clusters() -- count filesystem free clusters
* @sb: superblock
*
- * Adds up the number of free blocks from each block group.
+ * Adds up the number of free clusters from each block group.
*/
-ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb)
+ext4_fsblk_t ext4_count_free_clusters(struct super_block *sb)
{
ext4_fsblk_t desc_count;
struct ext4_group_desc *gdp;
ext4_group_t i;
- ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
+ ext4_group_t ngroups = ext4_get_groups_count(sb);
+ struct ext4_group_info *grp;
#ifdef EXT4FS_DEBUG
struct ext4_super_block *es;
ext4_fsblk_t bitmap_count;
- unsigned long x;
+ unsigned int x;
struct buffer_head *bitmap_bh = NULL;
es = EXT4_SB(sb)->s_es;
@@ -1924,36 +671,43 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb)
bitmap_count = 0;
gdp = NULL;
- smp_rmb();
for (i = 0; i < ngroups; i++) {
gdp = ext4_get_group_desc(sb, i, NULL);
if (!gdp)
continue;
- desc_count += le16_to_cpu(gdp->bg_free_blocks_count);
+ grp = NULL;
+ if (EXT4_SB(sb)->s_group_info)
+ grp = ext4_get_group_info(sb, i);
+ if (!grp || !EXT4_MB_GRP_BBITMAP_CORRUPT(grp))
+ desc_count += ext4_free_group_clusters(sb, gdp);
brelse(bitmap_bh);
- bitmap_bh = read_block_bitmap(sb, i);
+ bitmap_bh = ext4_read_block_bitmap(sb, i);
if (bitmap_bh == NULL)
continue;
- x = ext4_count_free(bitmap_bh, sb->s_blocksize);
- printk(KERN_DEBUG "group %lu: stored = %d, counted = %lu\n",
- i, le16_to_cpu(gdp->bg_free_blocks_count), x);
+ x = ext4_count_free(bitmap_bh->b_data,
+ EXT4_CLUSTERS_PER_GROUP(sb) / 8);
+ printk(KERN_DEBUG "group %u: stored = %d, counted = %u\n",
+ i, ext4_free_group_clusters(sb, gdp), x);
bitmap_count += x;
}
brelse(bitmap_bh);
- printk("ext4_count_free_blocks: stored = %llu"
- ", computed = %llu, %llu\n",
- ext4_free_blocks_count(es),
- desc_count, bitmap_count);
+ printk(KERN_DEBUG "ext4_count_free_clusters: stored = %llu"
+ ", computed = %llu, %llu\n",
+ EXT4_NUM_B2C(EXT4_SB(sb), ext4_free_blocks_count(es)),
+ desc_count, bitmap_count);
return bitmap_count;
#else
desc_count = 0;
- smp_rmb();
for (i = 0; i < ngroups; i++) {
gdp = ext4_get_group_desc(sb, i, NULL);
if (!gdp)
continue;
- desc_count += le16_to_cpu(gdp->bg_free_blocks_count);
+ grp = NULL;
+ if (EXT4_SB(sb)->s_group_info)
+ grp = ext4_get_group_info(sb, i);
+ if (!grp || !EXT4_MB_GRP_BBITMAP_CORRUPT(grp))
+ desc_count += ext4_free_group_clusters(sb, gdp);
}
return desc_count;
@@ -1962,21 +716,15 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb)
static inline int test_root(ext4_group_t a, int b)
{
- int num = b;
-
- while (a > num)
- num *= b;
- return num == a;
-}
-
-static int ext4_group_sparse(ext4_group_t group)
-{
- if (group <= 1)
- return 1;
- if (!(group & 1))
- return 0;
- return (test_root(group, 7) || test_root(group, 5) ||
- test_root(group, 3));
+ while (1) {
+ if (a < b)
+ return 0;
+ if (a == b)
+ return 1;
+ if ((a % b) != 0)
+ return 0;
+ a = a / b;
+ }
}
/**
@@ -1989,11 +737,26 @@ static int ext4_group_sparse(ext4_group_t group)
*/
int ext4_bg_has_super(struct super_block *sb, ext4_group_t group)
{
- if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
- EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER) &&
- !ext4_group_sparse(group))
+ struct ext4_super_block *es = EXT4_SB(sb)->s_es;
+
+ if (group == 0)
+ return 1;
+ if (EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_SPARSE_SUPER2)) {
+ if (group == le32_to_cpu(es->s_backup_bgs[0]) ||
+ group == le32_to_cpu(es->s_backup_bgs[1]))
+ return 1;
+ return 0;
+ }
+ if ((group <= 1) || !EXT4_HAS_RO_COMPAT_FEATURE(sb,
+ EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER))
+ return 1;
+ if (!(group & 1))
return 0;
- return 1;
+ if (test_root(group, 3) || (test_root(group, 5)) ||
+ test_root(group, 7))
+ return 1;
+
+ return 0;
}
static unsigned long ext4_bg_num_gdb_meta(struct super_block *sb,
@@ -2011,7 +774,13 @@ static unsigned long ext4_bg_num_gdb_meta(struct super_block *sb,
static unsigned long ext4_bg_num_gdb_nometa(struct super_block *sb,
ext4_group_t group)
{
- return ext4_bg_has_super(sb, group) ? EXT4_SB(sb)->s_gdb_count : 0;
+ if (!ext4_bg_has_super(sb, group))
+ return 0;
+
+ if (EXT4_HAS_INCOMPAT_FEATURE(sb,EXT4_FEATURE_INCOMPAT_META_BG))
+ return le32_to_cpu(EXT4_SB(sb)->s_es->s_first_meta_bg);
+ else
+ return EXT4_SB(sb)->s_gdb_count;
}
/**
@@ -2031,8 +800,82 @@ unsigned long ext4_bg_num_gdb(struct super_block *sb, ext4_group_t group)
if (!EXT4_HAS_INCOMPAT_FEATURE(sb,EXT4_FEATURE_INCOMPAT_META_BG) ||
metagroup < first_meta_bg)
- return ext4_bg_num_gdb_nometa(sb,group);
+ return ext4_bg_num_gdb_nometa(sb, group);
return ext4_bg_num_gdb_meta(sb,group);
}
+
+/*
+ * This function returns the number of file system metadata clusters at
+ * the beginning of a block group, including the reserved gdt blocks.
+ */
+static unsigned ext4_num_base_meta_clusters(struct super_block *sb,
+ ext4_group_t block_group)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ unsigned num;
+
+ /* Check for superblock and gdt backups in this group */
+ num = ext4_bg_has_super(sb, block_group);
+
+ if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_META_BG) ||
+ block_group < le32_to_cpu(sbi->s_es->s_first_meta_bg) *
+ sbi->s_desc_per_block) {
+ if (num) {
+ num += ext4_bg_num_gdb(sb, block_group);
+ num += le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks);
+ }
+ } else { /* For META_BG_BLOCK_GROUPS */
+ num += ext4_bg_num_gdb(sb, block_group);
+ }
+ return EXT4_NUM_B2C(sbi, num);
+}
+/**
+ * ext4_inode_to_goal_block - return a hint for block allocation
+ * @inode: inode for block allocation
+ *
+ * Return the ideal location to start allocating blocks for a
+ * newly created inode.
+ */
+ext4_fsblk_t ext4_inode_to_goal_block(struct inode *inode)
+{
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ ext4_group_t block_group;
+ ext4_grpblk_t colour;
+ int flex_size = ext4_flex_bg_size(EXT4_SB(inode->i_sb));
+ ext4_fsblk_t bg_start;
+ ext4_fsblk_t last_block;
+
+ block_group = ei->i_block_group;
+ if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) {
+ /*
+ * If there are at least EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME
+ * block groups per flexgroup, reserve the first block
+ * group for directories and special files. Regular
+ * files will start at the second block group. This
+ * tends to speed up directory access and improves
+ * fsck times.
+ */
+ block_group &= ~(flex_size-1);
+ if (S_ISREG(inode->i_mode))
+ block_group++;
+ }
+ bg_start = ext4_group_first_block_no(inode->i_sb, block_group);
+ last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1;
+
+ /*
+ * If we are doing delayed allocation, we don't need take
+ * colour into account.
+ */
+ if (test_opt(inode->i_sb, DELALLOC))
+ return bg_start;
+
+ if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block)
+ colour = (current->pid % 16) *
+ (EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16);
+ else
+ colour = (current->pid % 16) * ((last_block - bg_start) / 16);
+ return bg_start + colour;
+}
+
diff --git a/fs/ext4/bitmap.c b/fs/ext4/bitmap.c
index 420554f8f79..3285aa5a706 100644
--- a/fs/ext4/bitmap.c
+++ b/fs/ext4/bitmap.c
@@ -9,24 +9,94 @@
#include <linux/buffer_head.h>
#include <linux/jbd2.h>
-#include <linux/ext4_fs.h>
+#include "ext4.h"
-#ifdef EXT4FS_DEBUG
+unsigned int ext4_count_free(char *bitmap, unsigned int numchars)
+{
+ return numchars * BITS_PER_BYTE - memweight(bitmap, numchars);
+}
+
+int ext4_inode_bitmap_csum_verify(struct super_block *sb, ext4_group_t group,
+ struct ext4_group_desc *gdp,
+ struct buffer_head *bh, int sz)
+{
+ __u32 hi;
+ __u32 provided, calculated;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+ if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
+ EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+ return 1;
-static const int nibblemap[] = {4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0};
+ provided = le16_to_cpu(gdp->bg_inode_bitmap_csum_lo);
+ calculated = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)bh->b_data, sz);
+ if (sbi->s_desc_size >= EXT4_BG_INODE_BITMAP_CSUM_HI_END) {
+ hi = le16_to_cpu(gdp->bg_inode_bitmap_csum_hi);
+ provided |= (hi << 16);
+ } else
+ calculated &= 0xFFFF;
-unsigned long ext4_count_free (struct buffer_head * map, unsigned int numchars)
+ return provided == calculated;
+}
+
+void ext4_inode_bitmap_csum_set(struct super_block *sb, ext4_group_t group,
+ struct ext4_group_desc *gdp,
+ struct buffer_head *bh, int sz)
{
- unsigned int i;
- unsigned long sum = 0;
-
- if (!map)
- return (0);
- for (i = 0; i < numchars; i++)
- sum += nibblemap[map->b_data[i] & 0xf] +
- nibblemap[(map->b_data[i] >> 4) & 0xf];
- return (sum);
+ __u32 csum;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+ if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
+ EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+ return;
+
+ csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)bh->b_data, sz);
+ gdp->bg_inode_bitmap_csum_lo = cpu_to_le16(csum & 0xFFFF);
+ if (sbi->s_desc_size >= EXT4_BG_INODE_BITMAP_CSUM_HI_END)
+ gdp->bg_inode_bitmap_csum_hi = cpu_to_le16(csum >> 16);
}
-#endif /* EXT4FS_DEBUG */
+int ext4_block_bitmap_csum_verify(struct super_block *sb, ext4_group_t group,
+ struct ext4_group_desc *gdp,
+ struct buffer_head *bh)
+{
+ __u32 hi;
+ __u32 provided, calculated;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ int sz = EXT4_CLUSTERS_PER_GROUP(sb) / 8;
+ if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
+ EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+ return 1;
+
+ provided = le16_to_cpu(gdp->bg_block_bitmap_csum_lo);
+ calculated = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)bh->b_data, sz);
+ if (sbi->s_desc_size >= EXT4_BG_BLOCK_BITMAP_CSUM_HI_END) {
+ hi = le16_to_cpu(gdp->bg_block_bitmap_csum_hi);
+ provided |= (hi << 16);
+ } else
+ calculated &= 0xFFFF;
+
+ if (provided == calculated)
+ return 1;
+
+ return 0;
+}
+
+void ext4_block_bitmap_csum_set(struct super_block *sb, ext4_group_t group,
+ struct ext4_group_desc *gdp,
+ struct buffer_head *bh)
+{
+ int sz = EXT4_CLUSTERS_PER_GROUP(sb) / 8;
+ __u32 csum;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+ if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
+ EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+ return;
+
+ csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)bh->b_data, sz);
+ gdp->bg_block_bitmap_csum_lo = cpu_to_le16(csum & 0xFFFF);
+ if (sbi->s_desc_size >= EXT4_BG_BLOCK_BITMAP_CSUM_HI_END)
+ gdp->bg_block_bitmap_csum_hi = cpu_to_le16(csum >> 16);
+}
diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c
new file mode 100644
index 00000000000..41eb9dcfac7
--- /dev/null
+++ b/fs/ext4/block_validity.c
@@ -0,0 +1,243 @@
+/*
+ * linux/fs/ext4/block_validity.c
+ *
+ * Copyright (C) 2009
+ * Theodore Ts'o (tytso@mit.edu)
+ *
+ * Track which blocks in the filesystem are metadata blocks that
+ * should never be used as data blocks by files or directories.
+ */
+
+#include <linux/time.h>
+#include <linux/fs.h>
+#include <linux/namei.h>
+#include <linux/quotaops.h>
+#include <linux/buffer_head.h>
+#include <linux/swap.h>
+#include <linux/pagemap.h>
+#include <linux/blkdev.h>
+#include <linux/mutex.h>
+#include <linux/slab.h>
+#include "ext4.h"
+
+struct ext4_system_zone {
+ struct rb_node node;
+ ext4_fsblk_t start_blk;
+ unsigned int count;
+};
+
+static struct kmem_cache *ext4_system_zone_cachep;
+
+int __init ext4_init_system_zone(void)
+{
+ ext4_system_zone_cachep = KMEM_CACHE(ext4_system_zone, 0);
+ if (ext4_system_zone_cachep == NULL)
+ return -ENOMEM;
+ return 0;
+}
+
+void ext4_exit_system_zone(void)
+{
+ kmem_cache_destroy(ext4_system_zone_cachep);
+}
+
+static inline int can_merge(struct ext4_system_zone *entry1,
+ struct ext4_system_zone *entry2)
+{
+ if ((entry1->start_blk + entry1->count) == entry2->start_blk)
+ return 1;
+ return 0;
+}
+
+/*
+ * Mark a range of blocks as belonging to the "system zone" --- that
+ * is, filesystem metadata blocks which should never be used by
+ * inodes.
+ */
+static int add_system_zone(struct ext4_sb_info *sbi,
+ ext4_fsblk_t start_blk,
+ unsigned int count)
+{
+ struct ext4_system_zone *new_entry = NULL, *entry;
+ struct rb_node **n = &sbi->system_blks.rb_node, *node;
+ struct rb_node *parent = NULL, *new_node = NULL;
+
+ while (*n) {
+ parent = *n;
+ entry = rb_entry(parent, struct ext4_system_zone, node);
+ if (start_blk < entry->start_blk)
+ n = &(*n)->rb_left;
+ else if (start_blk >= (entry->start_blk + entry->count))
+ n = &(*n)->rb_right;
+ else {
+ if (start_blk + count > (entry->start_blk +
+ entry->count))
+ entry->count = (start_blk + count -
+ entry->start_blk);
+ new_node = *n;
+ new_entry = rb_entry(new_node, struct ext4_system_zone,
+ node);
+ break;
+ }
+ }
+
+ if (!new_entry) {
+ new_entry = kmem_cache_alloc(ext4_system_zone_cachep,
+ GFP_KERNEL);
+ if (!new_entry)
+ return -ENOMEM;
+ new_entry->start_blk = start_blk;
+ new_entry->count = count;
+ new_node = &new_entry->node;
+
+ rb_link_node(new_node, parent, n);
+ rb_insert_color(new_node, &sbi->system_blks);
+ }
+
+ /* Can we merge to the left? */
+ node = rb_prev(new_node);
+ if (node) {
+ entry = rb_entry(node, struct ext4_system_zone, node);
+ if (can_merge(entry, new_entry)) {
+ new_entry->start_blk = entry->start_blk;
+ new_entry->count += entry->count;
+ rb_erase(node, &sbi->system_blks);
+ kmem_cache_free(ext4_system_zone_cachep, entry);
+ }
+ }
+
+ /* Can we merge to the right? */
+ node = rb_next(new_node);
+ if (node) {
+ entry = rb_entry(node, struct ext4_system_zone, node);
+ if (can_merge(new_entry, entry)) {
+ new_entry->count += entry->count;
+ rb_erase(node, &sbi->system_blks);
+ kmem_cache_free(ext4_system_zone_cachep, entry);
+ }
+ }
+ return 0;
+}
+
+static void debug_print_tree(struct ext4_sb_info *sbi)
+{
+ struct rb_node *node;
+ struct ext4_system_zone *entry;
+ int first = 1;
+
+ printk(KERN_INFO "System zones: ");
+ node = rb_first(&sbi->system_blks);
+ while (node) {
+ entry = rb_entry(node, struct ext4_system_zone, node);
+ printk("%s%llu-%llu", first ? "" : ", ",
+ entry->start_blk, entry->start_blk + entry->count - 1);
+ first = 0;
+ node = rb_next(node);
+ }
+ printk("\n");
+}
+
+int ext4_setup_system_zone(struct super_block *sb)
+{
+ ext4_group_t ngroups = ext4_get_groups_count(sb);
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_group_desc *gdp;
+ ext4_group_t i;
+ int flex_size = ext4_flex_bg_size(sbi);
+ int ret;
+
+ if (!test_opt(sb, BLOCK_VALIDITY)) {
+ if (EXT4_SB(sb)->system_blks.rb_node)
+ ext4_release_system_zone(sb);
+ return 0;
+ }
+ if (EXT4_SB(sb)->system_blks.rb_node)
+ return 0;
+
+ for (i=0; i < ngroups; i++) {
+ if (ext4_bg_has_super(sb, i) &&
+ ((i < 5) || ((i % flex_size) == 0)))
+ add_system_zone(sbi, ext4_group_first_block_no(sb, i),
+ ext4_bg_num_gdb(sb, i) + 1);
+ gdp = ext4_get_group_desc(sb, i, NULL);
+ ret = add_system_zone(sbi, ext4_block_bitmap(sb, gdp), 1);
+ if (ret)
+ return ret;
+ ret = add_system_zone(sbi, ext4_inode_bitmap(sb, gdp), 1);
+ if (ret)
+ return ret;
+ ret = add_system_zone(sbi, ext4_inode_table(sb, gdp),
+ sbi->s_itb_per_group);
+ if (ret)
+ return ret;
+ }
+
+ if (test_opt(sb, DEBUG))
+ debug_print_tree(EXT4_SB(sb));
+ return 0;
+}
+
+/* Called when the filesystem is unmounted */
+void ext4_release_system_zone(struct super_block *sb)
+{
+ struct ext4_system_zone *entry, *n;
+
+ rbtree_postorder_for_each_entry_safe(entry, n,
+ &EXT4_SB(sb)->system_blks, node)
+ kmem_cache_free(ext4_system_zone_cachep, entry);
+
+ EXT4_SB(sb)->system_blks = RB_ROOT;
+}
+
+/*
+ * Returns 1 if the passed-in block region (start_blk,
+ * start_blk+count) is valid; 0 if some part of the block region
+ * overlaps with filesystem metadata blocks.
+ */
+int ext4_data_block_valid(struct ext4_sb_info *sbi, ext4_fsblk_t start_blk,
+ unsigned int count)
+{
+ struct ext4_system_zone *entry;
+ struct rb_node *n = sbi->system_blks.rb_node;
+
+ if ((start_blk <= le32_to_cpu(sbi->s_es->s_first_data_block)) ||
+ (start_blk + count < start_blk) ||
+ (start_blk + count > ext4_blocks_count(sbi->s_es))) {
+ sbi->s_es->s_last_error_block = cpu_to_le64(start_blk);
+ return 0;
+ }
+ while (n) {
+ entry = rb_entry(n, struct ext4_system_zone, node);
+ if (start_blk + count - 1 < entry->start_blk)
+ n = n->rb_left;
+ else if (start_blk >= (entry->start_blk + entry->count))
+ n = n->rb_right;
+ else {
+ sbi->s_es->s_last_error_block = cpu_to_le64(start_blk);
+ return 0;
+ }
+ }
+ return 1;
+}
+
+int ext4_check_blockref(const char *function, unsigned int line,
+ struct inode *inode, __le32 *p, unsigned int max)
+{
+ struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
+ __le32 *bref = p;
+ unsigned int blk;
+
+ while (bref < p+max) {
+ blk = le32_to_cpu(*bref++);
+ if (blk &&
+ unlikely(!ext4_data_block_valid(EXT4_SB(inode->i_sb),
+ blk, 1))) {
+ es->s_last_error_block = cpu_to_le64(blk);
+ ext4_error_inode(inode, function, line, blk,
+ "invalid block");
+ return -EIO;
+ }
+ }
+ return 0;
+}
+
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index 2c23bade9aa..ef1bed66c14 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -23,123 +23,135 @@
#include <linux/fs.h>
#include <linux/jbd2.h>
-#include <linux/ext4_fs.h>
#include <linux/buffer_head.h>
#include <linux/slab.h>
#include <linux/rbtree.h>
+#include "ext4.h"
+#include "xattr.h"
-static unsigned char ext4_filetype_table[] = {
- DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
-};
-
-static int ext4_readdir(struct file *, void *, filldir_t);
-static int ext4_dx_readdir(struct file * filp,
- void * dirent, filldir_t filldir);
-static int ext4_release_dir (struct inode * inode,
- struct file * filp);
-
-const struct file_operations ext4_dir_operations = {
- .llseek = generic_file_llseek,
- .read = generic_read_dir,
- .readdir = ext4_readdir, /* we take BKL. needed?*/
- .ioctl = ext4_ioctl, /* BKL held */
-#ifdef CONFIG_COMPAT
- .compat_ioctl = ext4_compat_ioctl,
-#endif
- .fsync = ext4_sync_file,
- .release = ext4_release_dir,
-};
+static int ext4_dx_readdir(struct file *, struct dir_context *);
-
-static unsigned char get_dtype(struct super_block *sb, int filetype)
+/**
+ * Check if the given dir-inode refers to an htree-indexed directory
+ * (or a directory which could potentially get converted to use htree
+ * indexing).
+ *
+ * Return 1 if it is a dx dir, 0 if not
+ */
+static int is_dx_dir(struct inode *inode)
{
- if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FILETYPE) ||
- (filetype >= EXT4_FT_MAX))
- return DT_UNKNOWN;
+ struct super_block *sb = inode->i_sb;
- return (ext4_filetype_table[filetype]);
-}
+ if (EXT4_HAS_COMPAT_FEATURE(inode->i_sb,
+ EXT4_FEATURE_COMPAT_DIR_INDEX) &&
+ ((ext4_test_inode_flag(inode, EXT4_INODE_INDEX)) ||
+ ((inode->i_size >> sb->s_blocksize_bits) == 1) ||
+ ext4_has_inline_data(inode)))
+ return 1;
+ return 0;
+}
-int ext4_check_dir_entry (const char * function, struct inode * dir,
- struct ext4_dir_entry_2 * de,
- struct buffer_head * bh,
- unsigned long offset)
+/*
+ * Return 0 if the directory entry is OK, and 1 if there is a problem
+ *
+ * Note: this is the opposite of what ext2 and ext3 historically returned...
+ *
+ * bh passed here can be an inode block or a dir data block, depending
+ * on the inode inline data flag.
+ */
+int __ext4_check_dir_entry(const char *function, unsigned int line,
+ struct inode *dir, struct file *filp,
+ struct ext4_dir_entry_2 *de,
+ struct buffer_head *bh, char *buf, int size,
+ unsigned int offset)
{
- const char * error_msg = NULL;
- const int rlen = ext4_rec_len_from_disk(de->rec_len);
+ const char *error_msg = NULL;
+ const int rlen = ext4_rec_len_from_disk(de->rec_len,
+ dir->i_sb->s_blocksize);
- if (rlen < EXT4_DIR_REC_LEN(1))
+ if (unlikely(rlen < EXT4_DIR_REC_LEN(1)))
error_msg = "rec_len is smaller than minimal";
- else if (rlen % 4 != 0)
+ else if (unlikely(rlen % 4 != 0))
error_msg = "rec_len % 4 != 0";
- else if (rlen < EXT4_DIR_REC_LEN(de->name_len))
+ else if (unlikely(rlen < EXT4_DIR_REC_LEN(de->name_len)))
error_msg = "rec_len is too small for name_len";
- else if (((char *) de - bh->b_data) + rlen > dir->i_sb->s_blocksize)
- error_msg = "directory entry across blocks";
- else if (le32_to_cpu(de->inode) >
- le32_to_cpu(EXT4_SB(dir->i_sb)->s_es->s_inodes_count))
+ else if (unlikely(((char *) de - buf) + rlen > size))
+ error_msg = "directory entry across range";
+ else if (unlikely(le32_to_cpu(de->inode) >
+ le32_to_cpu(EXT4_SB(dir->i_sb)->s_es->s_inodes_count)))
error_msg = "inode out of bounds";
+ else
+ return 0;
- if (error_msg != NULL)
- ext4_error (dir->i_sb, function,
- "bad entry in directory #%lu: %s - "
- "offset=%lu, inode=%lu, rec_len=%d, name_len=%d",
- dir->i_ino, error_msg, offset,
- (unsigned long) le32_to_cpu(de->inode),
- rlen, de->name_len);
- return error_msg == NULL ? 1 : 0;
+ if (filp)
+ ext4_error_file(filp, function, line, bh->b_blocknr,
+ "bad entry in directory: %s - offset=%u(%u), "
+ "inode=%u, rec_len=%d, name_len=%d",
+ error_msg, (unsigned) (offset % size),
+ offset, le32_to_cpu(de->inode),
+ rlen, de->name_len);
+ else
+ ext4_error_inode(dir, function, line, bh->b_blocknr,
+ "bad entry in directory: %s - offset=%u(%u), "
+ "inode=%u, rec_len=%d, name_len=%d",
+ error_msg, (unsigned) (offset % size),
+ offset, le32_to_cpu(de->inode),
+ rlen, de->name_len);
+
+ return 1;
}
-static int ext4_readdir(struct file * filp,
- void * dirent, filldir_t filldir)
+static int ext4_readdir(struct file *file, struct dir_context *ctx)
{
- int error = 0;
- unsigned long offset;
- int i, stored;
+ unsigned int offset;
+ int i;
struct ext4_dir_entry_2 *de;
- struct super_block *sb;
int err;
- struct inode *inode = filp->f_path.dentry->d_inode;
- int ret = 0;
-
- sb = inode->i_sb;
+ struct inode *inode = file_inode(file);
+ struct super_block *sb = inode->i_sb;
+ int dir_has_error = 0;
- if (EXT4_HAS_COMPAT_FEATURE(inode->i_sb,
- EXT4_FEATURE_COMPAT_DIR_INDEX) &&
- ((EXT4_I(inode)->i_flags & EXT4_INDEX_FL) ||
- ((inode->i_size >> sb->s_blocksize_bits) == 1))) {
- err = ext4_dx_readdir(filp, dirent, filldir);
+ if (is_dx_dir(inode)) {
+ err = ext4_dx_readdir(file, ctx);
if (err != ERR_BAD_DX_DIR) {
- ret = err;
- goto out;
+ return err;
}
/*
* We don't set the inode dirty flag since it's not
* critical that it get flushed back to the disk.
*/
- EXT4_I(filp->f_path.dentry->d_inode)->i_flags &= ~EXT4_INDEX_FL;
+ ext4_clear_inode_flag(file_inode(file),
+ EXT4_INODE_INDEX);
}
- stored = 0;
- offset = filp->f_pos & (sb->s_blocksize - 1);
- while (!error && !stored && filp->f_pos < inode->i_size) {
- ext4_lblk_t blk = filp->f_pos >> EXT4_BLOCK_SIZE_BITS(sb);
- struct buffer_head map_bh;
+ if (ext4_has_inline_data(inode)) {
+ int has_inline_data = 1;
+ int ret = ext4_read_inline_dir(file, ctx,
+ &has_inline_data);
+ if (has_inline_data)
+ return ret;
+ }
+
+ offset = ctx->pos & (sb->s_blocksize - 1);
+
+ while (ctx->pos < inode->i_size) {
+ struct ext4_map_blocks map;
struct buffer_head *bh = NULL;
- map_bh.b_state = 0;
- err = ext4_get_blocks_wrap(NULL, inode, blk, 1, &map_bh, 0, 0);
+ map.m_lblk = ctx->pos >> EXT4_BLOCK_SIZE_BITS(sb);
+ map.m_len = 1;
+ err = ext4_map_blocks(NULL, inode, &map, 0);
if (err > 0) {
- pgoff_t index = map_bh.b_blocknr >>
+ pgoff_t index = map.m_pblk >>
(PAGE_CACHE_SHIFT - inode->i_blkbits);
- if (!ra_has_index(&filp->f_ra, index))
+ if (!ra_has_index(&file->f_ra, index))
page_cache_sync_readahead(
sb->s_bdev->bd_inode->i_mapping,
- &filp->f_ra, filp,
+ &file->f_ra, file,
index, 1);
- filp->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT;
- bh = ext4_bread(NULL, inode, blk, 0, &err);
+ file->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT;
+ bh = ext4_bread(NULL, inode, map.m_lblk, 0, &err);
}
/*
@@ -147,22 +159,38 @@ static int ext4_readdir(struct file * filp,
* of recovering data when there's a bad sector
*/
if (!bh) {
- ext4_error (sb, "ext4_readdir",
- "directory #%lu contains a hole at offset %lu",
- inode->i_ino, (unsigned long)filp->f_pos);
+ if (!dir_has_error) {
+ EXT4_ERROR_FILE(file, 0,
+ "directory contains a "
+ "hole at offset %llu",
+ (unsigned long long) ctx->pos);
+ dir_has_error = 1;
+ }
/* corrupt size? Maybe no more blocks to read */
- if (filp->f_pos > inode->i_blocks << 9)
+ if (ctx->pos > inode->i_blocks << 9)
break;
- filp->f_pos += sb->s_blocksize - offset;
+ ctx->pos += sb->s_blocksize - offset;
continue;
}
-revalidate:
+ /* Check the checksum */
+ if (!buffer_verified(bh) &&
+ !ext4_dirent_csum_verify(inode,
+ (struct ext4_dir_entry *)bh->b_data)) {
+ EXT4_ERROR_FILE(file, 0, "directory fails checksum "
+ "at offset %llu",
+ (unsigned long long)ctx->pos);
+ ctx->pos += sb->s_blocksize - offset;
+ brelse(bh);
+ continue;
+ }
+ set_buffer_verified(bh);
+
/* If the dir block has changed since the last call to
* readdir(2), then we might be pointing to an invalid
* dirent right now. Scan from the start of the block
* to make sure. */
- if (filp->f_version != inode->i_version) {
+ if (file->f_version != inode->i_version) {
for (i = 0; i < sb->s_blocksize && i < offset; ) {
de = (struct ext4_dir_entry_2 *)
(bh->b_data + i);
@@ -172,78 +200,136 @@ revalidate:
* least that it is non-zero. A
* failure will be detected in the
* dirent test below. */
- if (ext4_rec_len_from_disk(de->rec_len)
- < EXT4_DIR_REC_LEN(1))
+ if (ext4_rec_len_from_disk(de->rec_len,
+ sb->s_blocksize) < EXT4_DIR_REC_LEN(1))
break;
- i += ext4_rec_len_from_disk(de->rec_len);
+ i += ext4_rec_len_from_disk(de->rec_len,
+ sb->s_blocksize);
}
offset = i;
- filp->f_pos = (filp->f_pos & ~(sb->s_blocksize - 1))
+ ctx->pos = (ctx->pos & ~(sb->s_blocksize - 1))
| offset;
- filp->f_version = inode->i_version;
+ file->f_version = inode->i_version;
}
- while (!error && filp->f_pos < inode->i_size
+ while (ctx->pos < inode->i_size
&& offset < sb->s_blocksize) {
de = (struct ext4_dir_entry_2 *) (bh->b_data + offset);
- if (!ext4_check_dir_entry ("ext4_readdir", inode, de,
- bh, offset)) {
+ if (ext4_check_dir_entry(inode, file, de, bh,
+ bh->b_data, bh->b_size,
+ offset)) {
/*
- * On error, skip the f_pos to the next block
+ * On error, skip to the next block
*/
- filp->f_pos = (filp->f_pos |
+ ctx->pos = (ctx->pos |
(sb->s_blocksize - 1)) + 1;
- brelse (bh);
- ret = stored;
- goto out;
+ break;
}
- offset += ext4_rec_len_from_disk(de->rec_len);
+ offset += ext4_rec_len_from_disk(de->rec_len,
+ sb->s_blocksize);
if (le32_to_cpu(de->inode)) {
- /* We might block in the next section
- * if the data destination is
- * currently swapped out. So, use a
- * version stamp to detect whether or
- * not the directory has been modified
- * during the copy operation.
- */
- u64 version = filp->f_version;
-
- error = filldir(dirent, de->name,
+ if (!dir_emit(ctx, de->name,
de->name_len,
- filp->f_pos,
le32_to_cpu(de->inode),
- get_dtype(sb, de->file_type));
- if (error)
- break;
- if (version != filp->f_version)
- goto revalidate;
- stored ++;
+ get_dtype(sb, de->file_type))) {
+ brelse(bh);
+ return 0;
+ }
}
- filp->f_pos += ext4_rec_len_from_disk(de->rec_len);
+ ctx->pos += ext4_rec_len_from_disk(de->rec_len,
+ sb->s_blocksize);
}
offset = 0;
- brelse (bh);
+ brelse(bh);
+ if (ctx->pos < inode->i_size) {
+ if (!dir_relax(inode))
+ return 0;
+ }
}
-out:
- return ret;
+ return 0;
+}
+
+static inline int is_32bit_api(void)
+{
+#ifdef CONFIG_COMPAT
+ return is_compat_task();
+#else
+ return (BITS_PER_LONG == 32);
+#endif
}
/*
* These functions convert from the major/minor hash to an f_pos
- * value.
+ * value for dx directories
*
- * Currently we only use major hash numer. This is unfortunate, but
- * on 32-bit machines, the same VFS interface is used for lseek and
- * llseek, so if we use the 64 bit offset, then the 32-bit versions of
- * lseek/telldir/seekdir will blow out spectacularly, and from within
- * the ext2 low-level routine, we don't know if we're being called by
- * a 64-bit version of the system call or the 32-bit version of the
- * system call. Worse yet, NFSv2 only allows for a 32-bit readdir
- * cookie. Sigh.
+ * Upper layer (for example NFS) should specify FMODE_32BITHASH or
+ * FMODE_64BITHASH explicitly. On the other hand, we allow ext4 to be mounted
+ * directly on both 32-bit and 64-bit nodes, under such case, neither
+ * FMODE_32BITHASH nor FMODE_64BITHASH is specified.
*/
-#define hash2pos(major, minor) (major >> 1)
-#define pos2maj_hash(pos) ((pos << 1) & 0xffffffff)
-#define pos2min_hash(pos) (0)
+static inline loff_t hash2pos(struct file *filp, __u32 major, __u32 minor)
+{
+ if ((filp->f_mode & FMODE_32BITHASH) ||
+ (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api()))
+ return major >> 1;
+ else
+ return ((__u64)(major >> 1) << 32) | (__u64)minor;
+}
+
+static inline __u32 pos2maj_hash(struct file *filp, loff_t pos)
+{
+ if ((filp->f_mode & FMODE_32BITHASH) ||
+ (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api()))
+ return (pos << 1) & 0xffffffff;
+ else
+ return ((pos >> 32) << 1) & 0xffffffff;
+}
+
+static inline __u32 pos2min_hash(struct file *filp, loff_t pos)
+{
+ if ((filp->f_mode & FMODE_32BITHASH) ||
+ (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api()))
+ return 0;
+ else
+ return pos & 0xffffffff;
+}
+
+/*
+ * Return 32- or 64-bit end-of-file for dx directories
+ */
+static inline loff_t ext4_get_htree_eof(struct file *filp)
+{
+ if ((filp->f_mode & FMODE_32BITHASH) ||
+ (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api()))
+ return EXT4_HTREE_EOF_32BIT;
+ else
+ return EXT4_HTREE_EOF_64BIT;
+}
+
+
+/*
+ * ext4_dir_llseek() calls generic_file_llseek_size to handle htree
+ * directories, where the "offset" is in terms of the filename hash
+ * value instead of the byte offset.
+ *
+ * Because we may return a 64-bit hash that is well beyond offset limits,
+ * we need to pass the max hash as the maximum allowable offset in
+ * the htree directory case.
+ *
+ * For non-htree, ext4_llseek already chooses the proper max offset.
+ */
+static loff_t ext4_dir_llseek(struct file *file, loff_t offset, int whence)
+{
+ struct inode *inode = file->f_mapping->host;
+ int dx_dir = is_dx_dir(inode);
+ loff_t htree_max = ext4_get_htree_eof(file);
+
+ if (likely(dx_dir))
+ return generic_file_llseek_size(file, offset, whence,
+ htree_max, htree_max);
+ else
+ return ext4_llseek(file, offset, whence);
+}
/*
* This structure holds the nodes of the red-black tree used to store
@@ -266,59 +352,29 @@ struct fname {
*/
static void free_rb_tree_fname(struct rb_root *root)
{
- struct rb_node *n = root->rb_node;
- struct rb_node *parent;
- struct fname *fname;
-
- while (n) {
- /* Do the node's children first */
- if ((n)->rb_left) {
- n = n->rb_left;
- continue;
- }
- if (n->rb_right) {
- n = n->rb_right;
- continue;
- }
- /*
- * The node has no children; free it, and then zero
- * out parent's link to it. Finally go to the
- * beginning of the loop and try to free the parent
- * node.
- */
- parent = rb_parent(n);
- fname = rb_entry(n, struct fname, rb_hash);
+ struct fname *fname, *next;
+
+ rbtree_postorder_for_each_entry_safe(fname, next, root, rb_hash)
while (fname) {
- struct fname * old = fname;
+ struct fname *old = fname;
fname = fname->next;
- kfree (old);
+ kfree(old);
}
- if (!parent)
- root->rb_node = NULL;
- else if (parent->rb_left == n)
- parent->rb_left = NULL;
- else if (parent->rb_right == n)
- parent->rb_right = NULL;
- n = parent;
- }
- root->rb_node = NULL;
+
+ *root = RB_ROOT;
}
-static struct dir_private_info *create_dir_info(loff_t pos)
+static struct dir_private_info *ext4_htree_create_dir_info(struct file *filp,
+ loff_t pos)
{
struct dir_private_info *p;
- p = kmalloc(sizeof(struct dir_private_info), GFP_KERNEL);
+ p = kzalloc(sizeof(struct dir_private_info), GFP_KERNEL);
if (!p)
return NULL;
- p->root.rb_node = NULL;
- p->curr_node = NULL;
- p->extra_fname = NULL;
- p->last_pos = 0;
- p->curr_hash = pos2maj_hash(pos);
- p->curr_minor_hash = pos2min_hash(pos);
- p->next_hash = 0;
+ p->curr_hash = pos2maj_hash(filp, pos);
+ p->curr_minor_hash = pos2min_hash(filp, pos);
return p;
}
@@ -336,11 +392,11 @@ int ext4_htree_store_dirent(struct file *dir_file, __u32 hash,
struct ext4_dir_entry_2 *dirent)
{
struct rb_node **p, *parent = NULL;
- struct fname * fname, *new_fn;
+ struct fname *fname, *new_fn;
struct dir_private_info *info;
int len;
- info = (struct dir_private_info *) dir_file->private_data;
+ info = dir_file->private_data;
p = &info->root.rb_node;
/* Create and allocate the fname structure */
@@ -393,73 +449,69 @@ int ext4_htree_store_dirent(struct file *dir_file, __u32 hash,
* for all entres on the fname linked list. (Normally there is only
* one entry on the linked list, unless there are 62 bit hash collisions.)
*/
-static int call_filldir(struct file * filp, void * dirent,
- filldir_t filldir, struct fname *fname)
+static int call_filldir(struct file *file, struct dir_context *ctx,
+ struct fname *fname)
{
- struct dir_private_info *info = filp->private_data;
- loff_t curr_pos;
- struct inode *inode = filp->f_path.dentry->d_inode;
- struct super_block * sb;
- int error;
-
- sb = inode->i_sb;
+ struct dir_private_info *info = file->private_data;
+ struct inode *inode = file_inode(file);
+ struct super_block *sb = inode->i_sb;
if (!fname) {
- printk("call_filldir: called with null fname?!?\n");
+ ext4_msg(sb, KERN_ERR, "%s:%d: inode #%lu: comm %s: "
+ "called with null fname?!?", __func__, __LINE__,
+ inode->i_ino, current->comm);
return 0;
}
- curr_pos = hash2pos(fname->hash, fname->minor_hash);
+ ctx->pos = hash2pos(file, fname->hash, fname->minor_hash);
while (fname) {
- error = filldir(dirent, fname->name,
- fname->name_len, curr_pos,
+ if (!dir_emit(ctx, fname->name,
+ fname->name_len,
fname->inode,
- get_dtype(sb, fname->file_type));
- if (error) {
- filp->f_pos = curr_pos;
- info->extra_fname = fname->next;
- return error;
+ get_dtype(sb, fname->file_type))) {
+ info->extra_fname = fname;
+ return 1;
}
fname = fname->next;
}
return 0;
}
-static int ext4_dx_readdir(struct file * filp,
- void * dirent, filldir_t filldir)
+static int ext4_dx_readdir(struct file *file, struct dir_context *ctx)
{
- struct dir_private_info *info = filp->private_data;
- struct inode *inode = filp->f_path.dentry->d_inode;
+ struct dir_private_info *info = file->private_data;
+ struct inode *inode = file_inode(file);
struct fname *fname;
int ret;
if (!info) {
- info = create_dir_info(filp->f_pos);
+ info = ext4_htree_create_dir_info(file, ctx->pos);
if (!info)
return -ENOMEM;
- filp->private_data = info;
+ file->private_data = info;
}
- if (filp->f_pos == EXT4_HTREE_EOF)
+ if (ctx->pos == ext4_get_htree_eof(file))
return 0; /* EOF */
/* Some one has messed with f_pos; reset the world */
- if (info->last_pos != filp->f_pos) {
+ if (info->last_pos != ctx->pos) {
free_rb_tree_fname(&info->root);
info->curr_node = NULL;
info->extra_fname = NULL;
- info->curr_hash = pos2maj_hash(filp->f_pos);
- info->curr_minor_hash = pos2min_hash(filp->f_pos);
+ info->curr_hash = pos2maj_hash(file, ctx->pos);
+ info->curr_minor_hash = pos2min_hash(file, ctx->pos);
}
/*
* If there are any leftover names on the hash collision
* chain, return them first.
*/
- if (info->extra_fname &&
- call_filldir(filp, dirent, filldir, info->extra_fname))
- goto finished;
-
- if (!info->curr_node)
+ if (info->extra_fname) {
+ if (call_filldir(file, ctx, info->extra_fname))
+ goto finished;
+ info->extra_fname = NULL;
+ goto next_node;
+ } else if (!info->curr_node)
info->curr_node = rb_first(&info->root);
while (1) {
@@ -469,17 +521,17 @@ static int ext4_dx_readdir(struct file * filp,
* cached entries.
*/
if ((!info->curr_node) ||
- (filp->f_version != inode->i_version)) {
+ (file->f_version != inode->i_version)) {
info->curr_node = NULL;
free_rb_tree_fname(&info->root);
- filp->f_version = inode->i_version;
- ret = ext4_htree_fill_tree(filp, info->curr_hash,
+ file->f_version = inode->i_version;
+ ret = ext4_htree_fill_tree(file, info->curr_hash,
info->curr_minor_hash,
&info->next_hash);
if (ret < 0)
return ret;
if (ret == 0) {
- filp->f_pos = EXT4_HTREE_EOF;
+ ctx->pos = ext4_get_htree_eof(file);
break;
}
info->curr_node = rb_first(&info->root);
@@ -488,13 +540,18 @@ static int ext4_dx_readdir(struct file * filp,
fname = rb_entry(info->curr_node, struct fname, rb_hash);
info->curr_hash = fname->hash;
info->curr_minor_hash = fname->minor_hash;
- if (call_filldir(filp, dirent, filldir, fname))
+ if (call_filldir(file, ctx, fname))
break;
-
+ next_node:
info->curr_node = rb_next(info->curr_node);
- if (!info->curr_node) {
+ if (info->curr_node) {
+ fname = rb_entry(info->curr_node, struct fname,
+ rb_hash);
+ info->curr_hash = fname->hash;
+ info->curr_minor_hash = fname->minor_hash;
+ } else {
if (info->next_hash == ~0) {
- filp->f_pos = EXT4_HTREE_EOF;
+ ctx->pos = ext4_get_htree_eof(file);
break;
}
info->curr_hash = info->next_hash;
@@ -502,14 +559,26 @@ static int ext4_dx_readdir(struct file * filp,
}
}
finished:
- info->last_pos = filp->f_pos;
+ info->last_pos = ctx->pos;
return 0;
}
-static int ext4_release_dir (struct inode * inode, struct file * filp)
+static int ext4_release_dir(struct inode *inode, struct file *filp)
{
if (filp->private_data)
ext4_htree_free_dir_info(filp->private_data);
return 0;
}
+
+const struct file_operations ext4_dir_operations = {
+ .llseek = ext4_dir_llseek,
+ .read = generic_read_dir,
+ .iterate = ext4_readdir,
+ .unlocked_ioctl = ext4_ioctl,
+#ifdef CONFIG_COMPAT
+ .compat_ioctl = ext4_compat_ioctl,
+#endif
+ .fsync = ext4_sync_file,
+ .release = ext4_release_dir,
+};
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
new file mode 100644
index 00000000000..7cc5a0e2368
--- /dev/null
+++ b/fs/ext4/ext4.h
@@ -0,0 +1,2818 @@
+/*
+ * ext4.h
+ *
+ * Copyright (C) 1992, 1993, 1994, 1995
+ * Remy Card (card@masi.ibp.fr)
+ * Laboratoire MASI - Institut Blaise Pascal
+ * Universite Pierre et Marie Curie (Paris VI)
+ *
+ * from
+ *
+ * linux/include/linux/minix_fs.h
+ *
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ */
+
+#ifndef _EXT4_H
+#define _EXT4_H
+
+#include <linux/types.h>
+#include <linux/blkdev.h>
+#include <linux/magic.h>
+#include <linux/jbd2.h>
+#include <linux/quota.h>
+#include <linux/rwsem.h>
+#include <linux/rbtree.h>
+#include <linux/seqlock.h>
+#include <linux/mutex.h>
+#include <linux/timer.h>
+#include <linux/wait.h>
+#include <linux/blockgroup_lock.h>
+#include <linux/percpu_counter.h>
+#include <linux/ratelimit.h>
+#include <crypto/hash.h>
+#include <linux/falloc.h>
+#ifdef __KERNEL__
+#include <linux/compat.h>
+#endif
+
+/*
+ * The fourth extended filesystem constants/structures
+ */
+
+/*
+ * Define EXT4FS_DEBUG to produce debug messages
+ */
+#undef EXT4FS_DEBUG
+
+/*
+ * Debug code
+ */
+#ifdef EXT4FS_DEBUG
+#define ext4_debug(f, a...) \
+ do { \
+ printk(KERN_DEBUG "EXT4-fs DEBUG (%s, %d): %s:", \
+ __FILE__, __LINE__, __func__); \
+ printk(KERN_DEBUG f, ## a); \
+ } while (0)
+#else
+#define ext4_debug(fmt, ...) no_printk(fmt, ##__VA_ARGS__)
+#endif
+
+/*
+ * Turn on EXT_DEBUG to get lots of info about extents operations.
+ */
+#define EXT_DEBUG__
+#ifdef EXT_DEBUG
+#define ext_debug(fmt, ...) printk(fmt, ##__VA_ARGS__)
+#else
+#define ext_debug(fmt, ...) no_printk(fmt, ##__VA_ARGS__)
+#endif
+
+#define EXT4_ERROR_INODE(inode, fmt, a...) \
+ ext4_error_inode((inode), __func__, __LINE__, 0, (fmt), ## a)
+
+#define EXT4_ERROR_INODE_BLOCK(inode, block, fmt, a...) \
+ ext4_error_inode((inode), __func__, __LINE__, (block), (fmt), ## a)
+
+#define EXT4_ERROR_FILE(file, block, fmt, a...) \
+ ext4_error_file((file), __func__, __LINE__, (block), (fmt), ## a)
+
+/* data type for block offset of block group */
+typedef int ext4_grpblk_t;
+
+/* data type for filesystem-wide blocks number */
+typedef unsigned long long ext4_fsblk_t;
+
+/* data type for file logical block number */
+typedef __u32 ext4_lblk_t;
+
+/* data type for block group number */
+typedef unsigned int ext4_group_t;
+
+/*
+ * Flags used in mballoc's allocation_context flags field.
+ *
+ * Also used to show what's going on for debugging purposes when the
+ * flag field is exported via the traceport interface
+ */
+
+/* prefer goal again. length */
+#define EXT4_MB_HINT_MERGE 0x0001
+/* blocks already reserved */
+#define EXT4_MB_HINT_RESERVED 0x0002
+/* metadata is being allocated */
+#define EXT4_MB_HINT_METADATA 0x0004
+/* first blocks in the file */
+#define EXT4_MB_HINT_FIRST 0x0008
+/* search for the best chunk */
+#define EXT4_MB_HINT_BEST 0x0010
+/* data is being allocated */
+#define EXT4_MB_HINT_DATA 0x0020
+/* don't preallocate (for tails) */
+#define EXT4_MB_HINT_NOPREALLOC 0x0040
+/* allocate for locality group */
+#define EXT4_MB_HINT_GROUP_ALLOC 0x0080
+/* allocate goal blocks or none */
+#define EXT4_MB_HINT_GOAL_ONLY 0x0100
+/* goal is meaningful */
+#define EXT4_MB_HINT_TRY_GOAL 0x0200
+/* blocks already pre-reserved by delayed allocation */
+#define EXT4_MB_DELALLOC_RESERVED 0x0400
+/* We are doing stream allocation */
+#define EXT4_MB_STREAM_ALLOC 0x0800
+/* Use reserved root blocks if needed */
+#define EXT4_MB_USE_ROOT_BLOCKS 0x1000
+/* Use blocks from reserved pool */
+#define EXT4_MB_USE_RESERVED 0x2000
+
+struct ext4_allocation_request {
+ /* target inode for block we're allocating */
+ struct inode *inode;
+ /* how many blocks we want to allocate */
+ unsigned int len;
+ /* logical block in target inode */
+ ext4_lblk_t logical;
+ /* the closest logical allocated block to the left */
+ ext4_lblk_t lleft;
+ /* the closest logical allocated block to the right */
+ ext4_lblk_t lright;
+ /* phys. target (a hint) */
+ ext4_fsblk_t goal;
+ /* phys. block for the closest logical allocated block to the left */
+ ext4_fsblk_t pleft;
+ /* phys. block for the closest logical allocated block to the right */
+ ext4_fsblk_t pright;
+ /* flags. see above EXT4_MB_HINT_* */
+ unsigned int flags;
+};
+
+/*
+ * Logical to physical block mapping, used by ext4_map_blocks()
+ *
+ * This structure is used to pass requests into ext4_map_blocks() as
+ * well as to store the information returned by ext4_map_blocks(). It
+ * takes less room on the stack than a struct buffer_head.
+ */
+#define EXT4_MAP_NEW (1 << BH_New)
+#define EXT4_MAP_MAPPED (1 << BH_Mapped)
+#define EXT4_MAP_UNWRITTEN (1 << BH_Unwritten)
+#define EXT4_MAP_BOUNDARY (1 << BH_Boundary)
+/* Sometimes (in the bigalloc case, from ext4_da_get_block_prep) the caller of
+ * ext4_map_blocks wants to know whether or not the underlying cluster has
+ * already been accounted for. EXT4_MAP_FROM_CLUSTER conveys to the caller that
+ * the requested mapping was from previously mapped (or delayed allocated)
+ * cluster. We use BH_AllocFromCluster only for this flag. BH_AllocFromCluster
+ * should never appear on buffer_head's state flags.
+ */
+#define EXT4_MAP_FROM_CLUSTER (1 << BH_AllocFromCluster)
+#define EXT4_MAP_FLAGS (EXT4_MAP_NEW | EXT4_MAP_MAPPED |\
+ EXT4_MAP_UNWRITTEN | EXT4_MAP_BOUNDARY |\
+ EXT4_MAP_FROM_CLUSTER)
+
+struct ext4_map_blocks {
+ ext4_fsblk_t m_pblk;
+ ext4_lblk_t m_lblk;
+ unsigned int m_len;
+ unsigned int m_flags;
+};
+
+/*
+ * Flags for ext4_io_end->flags
+ */
+#define EXT4_IO_END_UNWRITTEN 0x0001
+
+/*
+ * For converting unwritten extents on a work queue. 'handle' is used for
+ * buffered writeback.
+ */
+typedef struct ext4_io_end {
+ struct list_head list; /* per-file finished IO list */
+ handle_t *handle; /* handle reserved for extent
+ * conversion */
+ struct inode *inode; /* file being written to */
+ struct bio *bio; /* Linked list of completed
+ * bios covering the extent */
+ unsigned int flag; /* unwritten or not */
+ loff_t offset; /* offset in the file */
+ ssize_t size; /* size of the extent */
+ atomic_t count; /* reference counter */
+} ext4_io_end_t;
+
+struct ext4_io_submit {
+ int io_op;
+ struct bio *io_bio;
+ ext4_io_end_t *io_end;
+ sector_t io_next_block;
+};
+
+/*
+ * Special inodes numbers
+ */
+#define EXT4_BAD_INO 1 /* Bad blocks inode */
+#define EXT4_ROOT_INO 2 /* Root inode */
+#define EXT4_USR_QUOTA_INO 3 /* User quota inode */
+#define EXT4_GRP_QUOTA_INO 4 /* Group quota inode */
+#define EXT4_BOOT_LOADER_INO 5 /* Boot loader inode */
+#define EXT4_UNDEL_DIR_INO 6 /* Undelete directory inode */
+#define EXT4_RESIZE_INO 7 /* Reserved group descriptors inode */
+#define EXT4_JOURNAL_INO 8 /* Journal inode */
+
+/* First non-reserved inode for old ext4 filesystems */
+#define EXT4_GOOD_OLD_FIRST_INO 11
+
+/*
+ * Maximal count of links to a file
+ */
+#define EXT4_LINK_MAX 65000
+
+/*
+ * Macro-instructions used to manage several block sizes
+ */
+#define EXT4_MIN_BLOCK_SIZE 1024
+#define EXT4_MAX_BLOCK_SIZE 65536
+#define EXT4_MIN_BLOCK_LOG_SIZE 10
+#define EXT4_MAX_BLOCK_LOG_SIZE 16
+#ifdef __KERNEL__
+# define EXT4_BLOCK_SIZE(s) ((s)->s_blocksize)
+#else
+# define EXT4_BLOCK_SIZE(s) (EXT4_MIN_BLOCK_SIZE << (s)->s_log_block_size)
+#endif
+#define EXT4_ADDR_PER_BLOCK(s) (EXT4_BLOCK_SIZE(s) / sizeof(__u32))
+#define EXT4_CLUSTER_SIZE(s) (EXT4_BLOCK_SIZE(s) << \
+ EXT4_SB(s)->s_cluster_bits)
+#ifdef __KERNEL__
+# define EXT4_BLOCK_SIZE_BITS(s) ((s)->s_blocksize_bits)
+# define EXT4_CLUSTER_BITS(s) (EXT4_SB(s)->s_cluster_bits)
+#else
+# define EXT4_BLOCK_SIZE_BITS(s) ((s)->s_log_block_size + 10)
+#endif
+#ifdef __KERNEL__
+#define EXT4_ADDR_PER_BLOCK_BITS(s) (EXT4_SB(s)->s_addr_per_block_bits)
+#define EXT4_INODE_SIZE(s) (EXT4_SB(s)->s_inode_size)
+#define EXT4_FIRST_INO(s) (EXT4_SB(s)->s_first_ino)
+#else
+#define EXT4_INODE_SIZE(s) (((s)->s_rev_level == EXT4_GOOD_OLD_REV) ? \
+ EXT4_GOOD_OLD_INODE_SIZE : \
+ (s)->s_inode_size)
+#define EXT4_FIRST_INO(s) (((s)->s_rev_level == EXT4_GOOD_OLD_REV) ? \
+ EXT4_GOOD_OLD_FIRST_INO : \
+ (s)->s_first_ino)
+#endif
+#define EXT4_BLOCK_ALIGN(size, blkbits) ALIGN((size), (1 << (blkbits)))
+
+/* Translate a block number to a cluster number */
+#define EXT4_B2C(sbi, blk) ((blk) >> (sbi)->s_cluster_bits)
+/* Translate a cluster number to a block number */
+#define EXT4_C2B(sbi, cluster) ((cluster) << (sbi)->s_cluster_bits)
+/* Translate # of blks to # of clusters */
+#define EXT4_NUM_B2C(sbi, blks) (((blks) + (sbi)->s_cluster_ratio - 1) >> \
+ (sbi)->s_cluster_bits)
+/* Mask out the low bits to get the starting block of the cluster */
+#define EXT4_PBLK_CMASK(s, pblk) ((pblk) & \
+ ~((ext4_fsblk_t) (s)->s_cluster_ratio - 1))
+#define EXT4_LBLK_CMASK(s, lblk) ((lblk) & \
+ ~((ext4_lblk_t) (s)->s_cluster_ratio - 1))
+/* Get the cluster offset */
+#define EXT4_PBLK_COFF(s, pblk) ((pblk) & \
+ ((ext4_fsblk_t) (s)->s_cluster_ratio - 1))
+#define EXT4_LBLK_COFF(s, lblk) ((lblk) & \
+ ((ext4_lblk_t) (s)->s_cluster_ratio - 1))
+
+/*
+ * Structure of a blocks group descriptor
+ */
+struct ext4_group_desc
+{
+ __le32 bg_block_bitmap_lo; /* Blocks bitmap block */
+ __le32 bg_inode_bitmap_lo; /* Inodes bitmap block */
+ __le32 bg_inode_table_lo; /* Inodes table block */
+ __le16 bg_free_blocks_count_lo;/* Free blocks count */
+ __le16 bg_free_inodes_count_lo;/* Free inodes count */
+ __le16 bg_used_dirs_count_lo; /* Directories count */
+ __le16 bg_flags; /* EXT4_BG_flags (INODE_UNINIT, etc) */
+ __le32 bg_exclude_bitmap_lo; /* Exclude bitmap for snapshots */
+ __le16 bg_block_bitmap_csum_lo;/* crc32c(s_uuid+grp_num+bbitmap) LE */
+ __le16 bg_inode_bitmap_csum_lo;/* crc32c(s_uuid+grp_num+ibitmap) LE */
+ __le16 bg_itable_unused_lo; /* Unused inodes count */
+ __le16 bg_checksum; /* crc16(sb_uuid+group+desc) */
+ __le32 bg_block_bitmap_hi; /* Blocks bitmap block MSB */
+ __le32 bg_inode_bitmap_hi; /* Inodes bitmap block MSB */
+ __le32 bg_inode_table_hi; /* Inodes table block MSB */
+ __le16 bg_free_blocks_count_hi;/* Free blocks count MSB */
+ __le16 bg_free_inodes_count_hi;/* Free inodes count MSB */
+ __le16 bg_used_dirs_count_hi; /* Directories count MSB */
+ __le16 bg_itable_unused_hi; /* Unused inodes count MSB */
+ __le32 bg_exclude_bitmap_hi; /* Exclude bitmap block MSB */
+ __le16 bg_block_bitmap_csum_hi;/* crc32c(s_uuid+grp_num+bbitmap) BE */
+ __le16 bg_inode_bitmap_csum_hi;/* crc32c(s_uuid+grp_num+ibitmap) BE */
+ __u32 bg_reserved;
+};
+
+#define EXT4_BG_INODE_BITMAP_CSUM_HI_END \
+ (offsetof(struct ext4_group_desc, bg_inode_bitmap_csum_hi) + \
+ sizeof(__le16))
+#define EXT4_BG_BLOCK_BITMAP_CSUM_HI_END \
+ (offsetof(struct ext4_group_desc, bg_block_bitmap_csum_hi) + \
+ sizeof(__le16))
+
+/*
+ * Structure of a flex block group info
+ */
+
+struct flex_groups {
+ atomic64_t free_clusters;
+ atomic_t free_inodes;
+ atomic_t used_dirs;
+};
+
+#define EXT4_BG_INODE_UNINIT 0x0001 /* Inode table/bitmap not in use */
+#define EXT4_BG_BLOCK_UNINIT 0x0002 /* Block bitmap not in use */
+#define EXT4_BG_INODE_ZEROED 0x0004 /* On-disk itable initialized to zero */
+
+/*
+ * Macro-instructions used to manage group descriptors
+ */
+#define EXT4_MIN_DESC_SIZE 32
+#define EXT4_MIN_DESC_SIZE_64BIT 64
+#define EXT4_MAX_DESC_SIZE EXT4_MIN_BLOCK_SIZE
+#define EXT4_DESC_SIZE(s) (EXT4_SB(s)->s_desc_size)
+#ifdef __KERNEL__
+# define EXT4_BLOCKS_PER_GROUP(s) (EXT4_SB(s)->s_blocks_per_group)
+# define EXT4_CLUSTERS_PER_GROUP(s) (EXT4_SB(s)->s_clusters_per_group)
+# define EXT4_DESC_PER_BLOCK(s) (EXT4_SB(s)->s_desc_per_block)
+# define EXT4_INODES_PER_GROUP(s) (EXT4_SB(s)->s_inodes_per_group)
+# define EXT4_DESC_PER_BLOCK_BITS(s) (EXT4_SB(s)->s_desc_per_block_bits)
+#else
+# define EXT4_BLOCKS_PER_GROUP(s) ((s)->s_blocks_per_group)
+# define EXT4_DESC_PER_BLOCK(s) (EXT4_BLOCK_SIZE(s) / EXT4_DESC_SIZE(s))
+# define EXT4_INODES_PER_GROUP(s) ((s)->s_inodes_per_group)
+#endif
+
+/*
+ * Constants relative to the data blocks
+ */
+#define EXT4_NDIR_BLOCKS 12
+#define EXT4_IND_BLOCK EXT4_NDIR_BLOCKS
+#define EXT4_DIND_BLOCK (EXT4_IND_BLOCK + 1)
+#define EXT4_TIND_BLOCK (EXT4_DIND_BLOCK + 1)
+#define EXT4_N_BLOCKS (EXT4_TIND_BLOCK + 1)
+
+/*
+ * Inode flags
+ */
+#define EXT4_SECRM_FL 0x00000001 /* Secure deletion */
+#define EXT4_UNRM_FL 0x00000002 /* Undelete */
+#define EXT4_COMPR_FL 0x00000004 /* Compress file */
+#define EXT4_SYNC_FL 0x00000008 /* Synchronous updates */
+#define EXT4_IMMUTABLE_FL 0x00000010 /* Immutable file */
+#define EXT4_APPEND_FL 0x00000020 /* writes to file may only append */
+#define EXT4_NODUMP_FL 0x00000040 /* do not dump file */
+#define EXT4_NOATIME_FL 0x00000080 /* do not update atime */
+/* Reserved for compression usage... */
+#define EXT4_DIRTY_FL 0x00000100
+#define EXT4_COMPRBLK_FL 0x00000200 /* One or more compressed clusters */
+#define EXT4_NOCOMPR_FL 0x00000400 /* Don't compress */
+#define EXT4_ECOMPR_FL 0x00000800 /* Compression error */
+/* End compression flags --- maybe not all used */
+#define EXT4_INDEX_FL 0x00001000 /* hash-indexed directory */
+#define EXT4_IMAGIC_FL 0x00002000 /* AFS directory */
+#define EXT4_JOURNAL_DATA_FL 0x00004000 /* file data should be journaled */
+#define EXT4_NOTAIL_FL 0x00008000 /* file tail should not be merged */
+#define EXT4_DIRSYNC_FL 0x00010000 /* dirsync behaviour (directories only) */
+#define EXT4_TOPDIR_FL 0x00020000 /* Top of directory hierarchies*/
+#define EXT4_HUGE_FILE_FL 0x00040000 /* Set to each huge file */
+#define EXT4_EXTENTS_FL 0x00080000 /* Inode uses extents */
+#define EXT4_EA_INODE_FL 0x00200000 /* Inode used for large EA */
+#define EXT4_EOFBLOCKS_FL 0x00400000 /* Blocks allocated beyond EOF */
+#define EXT4_INLINE_DATA_FL 0x10000000 /* Inode has inline data. */
+#define EXT4_RESERVED_FL 0x80000000 /* reserved for ext4 lib */
+
+#define EXT4_FL_USER_VISIBLE 0x004BDFFF /* User visible flags */
+#define EXT4_FL_USER_MODIFIABLE 0x004380FF /* User modifiable flags */
+
+/* Flags that should be inherited by new inodes from their parent. */
+#define EXT4_FL_INHERITED (EXT4_SECRM_FL | EXT4_UNRM_FL | EXT4_COMPR_FL |\
+ EXT4_SYNC_FL | EXT4_NODUMP_FL | EXT4_NOATIME_FL |\
+ EXT4_NOCOMPR_FL | EXT4_JOURNAL_DATA_FL |\
+ EXT4_NOTAIL_FL | EXT4_DIRSYNC_FL)
+
+/* Flags that are appropriate for regular files (all but dir-specific ones). */
+#define EXT4_REG_FLMASK (~(EXT4_DIRSYNC_FL | EXT4_TOPDIR_FL))
+
+/* Flags that are appropriate for non-directories/regular files. */
+#define EXT4_OTHER_FLMASK (EXT4_NODUMP_FL | EXT4_NOATIME_FL)
+
+/* Mask out flags that are inappropriate for the given type of inode. */
+static inline __u32 ext4_mask_flags(umode_t mode, __u32 flags)
+{
+ if (S_ISDIR(mode))
+ return flags;
+ else if (S_ISREG(mode))
+ return flags & EXT4_REG_FLMASK;
+ else
+ return flags & EXT4_OTHER_FLMASK;
+}
+
+/*
+ * Inode flags used for atomic set/get
+ */
+enum {
+ EXT4_INODE_SECRM = 0, /* Secure deletion */
+ EXT4_INODE_UNRM = 1, /* Undelete */
+ EXT4_INODE_COMPR = 2, /* Compress file */
+ EXT4_INODE_SYNC = 3, /* Synchronous updates */
+ EXT4_INODE_IMMUTABLE = 4, /* Immutable file */
+ EXT4_INODE_APPEND = 5, /* writes to file may only append */
+ EXT4_INODE_NODUMP = 6, /* do not dump file */
+ EXT4_INODE_NOATIME = 7, /* do not update atime */
+/* Reserved for compression usage... */
+ EXT4_INODE_DIRTY = 8,
+ EXT4_INODE_COMPRBLK = 9, /* One or more compressed clusters */
+ EXT4_INODE_NOCOMPR = 10, /* Don't compress */
+ EXT4_INODE_ECOMPR = 11, /* Compression error */
+/* End compression flags --- maybe not all used */
+ EXT4_INODE_INDEX = 12, /* hash-indexed directory */
+ EXT4_INODE_IMAGIC = 13, /* AFS directory */
+ EXT4_INODE_JOURNAL_DATA = 14, /* file data should be journaled */
+ EXT4_INODE_NOTAIL = 15, /* file tail should not be merged */
+ EXT4_INODE_DIRSYNC = 16, /* dirsync behaviour (directories only) */
+ EXT4_INODE_TOPDIR = 17, /* Top of directory hierarchies*/
+ EXT4_INODE_HUGE_FILE = 18, /* Set to each huge file */
+ EXT4_INODE_EXTENTS = 19, /* Inode uses extents */
+ EXT4_INODE_EA_INODE = 21, /* Inode used for large EA */
+ EXT4_INODE_EOFBLOCKS = 22, /* Blocks allocated beyond EOF */
+ EXT4_INODE_INLINE_DATA = 28, /* Data in inode. */
+ EXT4_INODE_RESERVED = 31, /* reserved for ext4 lib */
+};
+
+/*
+ * Since it's pretty easy to mix up bit numbers and hex values, we use a
+ * build-time check to make sure that EXT4_XXX_FL is consistent with respect to
+ * EXT4_INODE_XXX. If all is well, the macros will be dropped, so, it won't cost
+ * any extra space in the compiled kernel image, otherwise, the build will fail.
+ * It's important that these values are the same, since we are using
+ * EXT4_INODE_XXX to test for flag values, but EXT4_XXX_FL must be consistent
+ * with the values of FS_XXX_FL defined in include/linux/fs.h and the on-disk
+ * values found in ext2, ext3 and ext4 filesystems, and of course the values
+ * defined in e2fsprogs.
+ *
+ * It's not paranoia if the Murphy's Law really *is* out to get you. :-)
+ */
+#define TEST_FLAG_VALUE(FLAG) (EXT4_##FLAG##_FL == (1 << EXT4_INODE_##FLAG))
+#define CHECK_FLAG_VALUE(FLAG) BUILD_BUG_ON(!TEST_FLAG_VALUE(FLAG))
+
+static inline void ext4_check_flag_values(void)
+{
+ CHECK_FLAG_VALUE(SECRM);
+ CHECK_FLAG_VALUE(UNRM);
+ CHECK_FLAG_VALUE(COMPR);
+ CHECK_FLAG_VALUE(SYNC);
+ CHECK_FLAG_VALUE(IMMUTABLE);
+ CHECK_FLAG_VALUE(APPEND);
+ CHECK_FLAG_VALUE(NODUMP);
+ CHECK_FLAG_VALUE(NOATIME);
+ CHECK_FLAG_VALUE(DIRTY);
+ CHECK_FLAG_VALUE(COMPRBLK);
+ CHECK_FLAG_VALUE(NOCOMPR);
+ CHECK_FLAG_VALUE(ECOMPR);
+ CHECK_FLAG_VALUE(INDEX);
+ CHECK_FLAG_VALUE(IMAGIC);
+ CHECK_FLAG_VALUE(JOURNAL_DATA);
+ CHECK_FLAG_VALUE(NOTAIL);
+ CHECK_FLAG_VALUE(DIRSYNC);
+ CHECK_FLAG_VALUE(TOPDIR);
+ CHECK_FLAG_VALUE(HUGE_FILE);
+ CHECK_FLAG_VALUE(EXTENTS);
+ CHECK_FLAG_VALUE(EA_INODE);
+ CHECK_FLAG_VALUE(EOFBLOCKS);
+ CHECK_FLAG_VALUE(INLINE_DATA);
+ CHECK_FLAG_VALUE(RESERVED);
+}
+
+/* Used to pass group descriptor data when online resize is done */
+struct ext4_new_group_input {
+ __u32 group; /* Group number for this data */
+ __u64 block_bitmap; /* Absolute block number of block bitmap */
+ __u64 inode_bitmap; /* Absolute block number of inode bitmap */
+ __u64 inode_table; /* Absolute block number of inode table start */
+ __u32 blocks_count; /* Total number of blocks in this group */
+ __u16 reserved_blocks; /* Number of reserved blocks in this group */
+ __u16 unused;
+};
+
+#if defined(__KERNEL__) && defined(CONFIG_COMPAT)
+struct compat_ext4_new_group_input {
+ u32 group;
+ compat_u64 block_bitmap;
+ compat_u64 inode_bitmap;
+ compat_u64 inode_table;
+ u32 blocks_count;
+ u16 reserved_blocks;
+ u16 unused;
+};
+#endif
+
+/* The struct ext4_new_group_input in kernel space, with free_blocks_count */
+struct ext4_new_group_data {
+ __u32 group;
+ __u64 block_bitmap;
+ __u64 inode_bitmap;
+ __u64 inode_table;
+ __u32 blocks_count;
+ __u16 reserved_blocks;
+ __u16 unused;
+ __u32 free_blocks_count;
+};
+
+/* Indexes used to index group tables in ext4_new_group_data */
+enum {
+ BLOCK_BITMAP = 0, /* block bitmap */
+ INODE_BITMAP, /* inode bitmap */
+ INODE_TABLE, /* inode tables */
+ GROUP_TABLE_COUNT,
+};
+
+/*
+ * Flags used by ext4_map_blocks()
+ */
+ /* Allocate any needed blocks and/or convert an unwritten
+ extent to be an initialized ext4 */
+#define EXT4_GET_BLOCKS_CREATE 0x0001
+ /* Request the creation of an unwritten extent */
+#define EXT4_GET_BLOCKS_UNWRIT_EXT 0x0002
+#define EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT (EXT4_GET_BLOCKS_UNWRIT_EXT|\
+ EXT4_GET_BLOCKS_CREATE)
+ /* Caller is from the delayed allocation writeout path
+ * finally doing the actual allocation of delayed blocks */
+#define EXT4_GET_BLOCKS_DELALLOC_RESERVE 0x0004
+ /* caller is from the direct IO path, request to creation of an
+ unwritten extents if not allocated, split the unwritten
+ extent if blocks has been preallocated already*/
+#define EXT4_GET_BLOCKS_PRE_IO 0x0008
+#define EXT4_GET_BLOCKS_CONVERT 0x0010
+#define EXT4_GET_BLOCKS_IO_CREATE_EXT (EXT4_GET_BLOCKS_PRE_IO|\
+ EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT)
+ /* Convert extent to initialized after IO complete */
+#define EXT4_GET_BLOCKS_IO_CONVERT_EXT (EXT4_GET_BLOCKS_CONVERT|\
+ EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT)
+ /* Eventual metadata allocation (due to growing extent tree)
+ * should not fail, so try to use reserved blocks for that.*/
+#define EXT4_GET_BLOCKS_METADATA_NOFAIL 0x0020
+ /* Don't normalize allocation size (used for fallocate) */
+#define EXT4_GET_BLOCKS_NO_NORMALIZE 0x0040
+ /* Request will not result in inode size update (user for fallocate) */
+#define EXT4_GET_BLOCKS_KEEP_SIZE 0x0080
+ /* Do not take i_data_sem locking in ext4_map_blocks */
+#define EXT4_GET_BLOCKS_NO_LOCK 0x0100
+ /* Do not put hole in extent cache */
+#define EXT4_GET_BLOCKS_NO_PUT_HOLE 0x0200
+ /* Convert written extents to unwritten */
+#define EXT4_GET_BLOCKS_CONVERT_UNWRITTEN 0x0400
+
+/*
+ * The bit position of these flags must not overlap with any of the
+ * EXT4_GET_BLOCKS_*. They are used by ext4_ext_find_extent(),
+ * read_extent_tree_block(), ext4_split_extent_at(),
+ * ext4_ext_insert_extent(), and ext4_ext_create_new_leaf().
+ * EXT4_EX_NOCACHE is used to indicate that the we shouldn't be
+ * caching the extents when reading from the extent tree while a
+ * truncate or punch hole operation is in progress.
+ */
+#define EXT4_EX_NOCACHE 0x0400
+#define EXT4_EX_FORCE_CACHE 0x0800
+
+/*
+ * Flags used by ext4_free_blocks
+ */
+#define EXT4_FREE_BLOCKS_METADATA 0x0001
+#define EXT4_FREE_BLOCKS_FORGET 0x0002
+#define EXT4_FREE_BLOCKS_VALIDATED 0x0004
+#define EXT4_FREE_BLOCKS_NO_QUOT_UPDATE 0x0008
+#define EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER 0x0010
+#define EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER 0x0020
+#define EXT4_FREE_BLOCKS_RESERVE 0x0040
+
+/*
+ * ioctl commands
+ */
+#define EXT4_IOC_GETFLAGS FS_IOC_GETFLAGS
+#define EXT4_IOC_SETFLAGS FS_IOC_SETFLAGS
+#define EXT4_IOC_GETVERSION _IOR('f', 3, long)
+#define EXT4_IOC_SETVERSION _IOW('f', 4, long)
+#define EXT4_IOC_GETVERSION_OLD FS_IOC_GETVERSION
+#define EXT4_IOC_SETVERSION_OLD FS_IOC_SETVERSION
+#define EXT4_IOC_GETRSVSZ _IOR('f', 5, long)
+#define EXT4_IOC_SETRSVSZ _IOW('f', 6, long)
+#define EXT4_IOC_GROUP_EXTEND _IOW('f', 7, unsigned long)
+#define EXT4_IOC_GROUP_ADD _IOW('f', 8, struct ext4_new_group_input)
+#define EXT4_IOC_MIGRATE _IO('f', 9)
+ /* note ioctl 10 reserved for an early version of the FIEMAP ioctl */
+ /* note ioctl 11 reserved for filesystem-independent FIEMAP ioctl */
+#define EXT4_IOC_ALLOC_DA_BLKS _IO('f', 12)
+#define EXT4_IOC_MOVE_EXT _IOWR('f', 15, struct move_extent)
+#define EXT4_IOC_RESIZE_FS _IOW('f', 16, __u64)
+#define EXT4_IOC_SWAP_BOOT _IO('f', 17)
+#define EXT4_IOC_PRECACHE_EXTENTS _IO('f', 18)
+
+#if defined(__KERNEL__) && defined(CONFIG_COMPAT)
+/*
+ * ioctl commands in 32 bit emulation
+ */
+#define EXT4_IOC32_GETFLAGS FS_IOC32_GETFLAGS
+#define EXT4_IOC32_SETFLAGS FS_IOC32_SETFLAGS
+#define EXT4_IOC32_GETVERSION _IOR('f', 3, int)
+#define EXT4_IOC32_SETVERSION _IOW('f', 4, int)
+#define EXT4_IOC32_GETRSVSZ _IOR('f', 5, int)
+#define EXT4_IOC32_SETRSVSZ _IOW('f', 6, int)
+#define EXT4_IOC32_GROUP_EXTEND _IOW('f', 7, unsigned int)
+#define EXT4_IOC32_GROUP_ADD _IOW('f', 8, struct compat_ext4_new_group_input)
+#define EXT4_IOC32_GETVERSION_OLD FS_IOC32_GETVERSION
+#define EXT4_IOC32_SETVERSION_OLD FS_IOC32_SETVERSION
+#endif
+
+/* Max physical block we can address w/o extents */
+#define EXT4_MAX_BLOCK_FILE_PHYS 0xFFFFFFFF
+
+/*
+ * Structure of an inode on the disk
+ */
+struct ext4_inode {
+ __le16 i_mode; /* File mode */
+ __le16 i_uid; /* Low 16 bits of Owner Uid */
+ __le32 i_size_lo; /* Size in bytes */
+ __le32 i_atime; /* Access time */
+ __le32 i_ctime; /* Inode Change time */
+ __le32 i_mtime; /* Modification time */
+ __le32 i_dtime; /* Deletion Time */
+ __le16 i_gid; /* Low 16 bits of Group Id */
+ __le16 i_links_count; /* Links count */
+ __le32 i_blocks_lo; /* Blocks count */
+ __le32 i_flags; /* File flags */
+ union {
+ struct {
+ __le32 l_i_version;
+ } linux1;
+ struct {
+ __u32 h_i_translator;
+ } hurd1;
+ struct {
+ __u32 m_i_reserved1;
+ } masix1;
+ } osd1; /* OS dependent 1 */
+ __le32 i_block[EXT4_N_BLOCKS];/* Pointers to blocks */
+ __le32 i_generation; /* File version (for NFS) */
+ __le32 i_file_acl_lo; /* File ACL */
+ __le32 i_size_high;
+ __le32 i_obso_faddr; /* Obsoleted fragment address */
+ union {
+ struct {
+ __le16 l_i_blocks_high; /* were l_i_reserved1 */
+ __le16 l_i_file_acl_high;
+ __le16 l_i_uid_high; /* these 2 fields */
+ __le16 l_i_gid_high; /* were reserved2[0] */
+ __le16 l_i_checksum_lo;/* crc32c(uuid+inum+inode) LE */
+ __le16 l_i_reserved;
+ } linux2;
+ struct {
+ __le16 h_i_reserved1; /* Obsoleted fragment number/size which are removed in ext4 */
+ __u16 h_i_mode_high;
+ __u16 h_i_uid_high;
+ __u16 h_i_gid_high;
+ __u32 h_i_author;
+ } hurd2;
+ struct {
+ __le16 h_i_reserved1; /* Obsoleted fragment number/size which are removed in ext4 */
+ __le16 m_i_file_acl_high;
+ __u32 m_i_reserved2[2];
+ } masix2;
+ } osd2; /* OS dependent 2 */
+ __le16 i_extra_isize;
+ __le16 i_checksum_hi; /* crc32c(uuid+inum+inode) BE */
+ __le32 i_ctime_extra; /* extra Change time (nsec << 2 | epoch) */
+ __le32 i_mtime_extra; /* extra Modification time(nsec << 2 | epoch) */
+ __le32 i_atime_extra; /* extra Access time (nsec << 2 | epoch) */
+ __le32 i_crtime; /* File Creation time */
+ __le32 i_crtime_extra; /* extra FileCreationtime (nsec << 2 | epoch) */
+ __le32 i_version_hi; /* high 32 bits for 64-bit version */
+};
+
+struct move_extent {
+ __u32 reserved; /* should be zero */
+ __u32 donor_fd; /* donor file descriptor */
+ __u64 orig_start; /* logical start offset in block for orig */
+ __u64 donor_start; /* logical start offset in block for donor */
+ __u64 len; /* block length to be moved */
+ __u64 moved_len; /* moved block length */
+};
+
+#define EXT4_EPOCH_BITS 2
+#define EXT4_EPOCH_MASK ((1 << EXT4_EPOCH_BITS) - 1)
+#define EXT4_NSEC_MASK (~0UL << EXT4_EPOCH_BITS)
+
+/*
+ * Extended fields will fit into an inode if the filesystem was formatted
+ * with large inodes (-I 256 or larger) and there are not currently any EAs
+ * consuming all of the available space. For new inodes we always reserve
+ * enough space for the kernel's known extended fields, but for inodes
+ * created with an old kernel this might not have been the case. None of
+ * the extended inode fields is critical for correct filesystem operation.
+ * This macro checks if a certain field fits in the inode. Note that
+ * inode-size = GOOD_OLD_INODE_SIZE + i_extra_isize
+ */
+#define EXT4_FITS_IN_INODE(ext4_inode, einode, field) \
+ ((offsetof(typeof(*ext4_inode), field) + \
+ sizeof((ext4_inode)->field)) \
+ <= (EXT4_GOOD_OLD_INODE_SIZE + \
+ (einode)->i_extra_isize)) \
+
+static inline __le32 ext4_encode_extra_time(struct timespec *time)
+{
+ return cpu_to_le32((sizeof(time->tv_sec) > 4 ?
+ (time->tv_sec >> 32) & EXT4_EPOCH_MASK : 0) |
+ ((time->tv_nsec << EXT4_EPOCH_BITS) & EXT4_NSEC_MASK));
+}
+
+static inline void ext4_decode_extra_time(struct timespec *time, __le32 extra)
+{
+ if (sizeof(time->tv_sec) > 4)
+ time->tv_sec |= (__u64)(le32_to_cpu(extra) & EXT4_EPOCH_MASK)
+ << 32;
+ time->tv_nsec = (le32_to_cpu(extra) & EXT4_NSEC_MASK) >> EXT4_EPOCH_BITS;
+}
+
+#define EXT4_INODE_SET_XTIME(xtime, inode, raw_inode) \
+do { \
+ (raw_inode)->xtime = cpu_to_le32((inode)->xtime.tv_sec); \
+ if (EXT4_FITS_IN_INODE(raw_inode, EXT4_I(inode), xtime ## _extra)) \
+ (raw_inode)->xtime ## _extra = \
+ ext4_encode_extra_time(&(inode)->xtime); \
+} while (0)
+
+#define EXT4_EINODE_SET_XTIME(xtime, einode, raw_inode) \
+do { \
+ if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime)) \
+ (raw_inode)->xtime = cpu_to_le32((einode)->xtime.tv_sec); \
+ if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime ## _extra)) \
+ (raw_inode)->xtime ## _extra = \
+ ext4_encode_extra_time(&(einode)->xtime); \
+} while (0)
+
+#define EXT4_INODE_GET_XTIME(xtime, inode, raw_inode) \
+do { \
+ (inode)->xtime.tv_sec = (signed)le32_to_cpu((raw_inode)->xtime); \
+ if (EXT4_FITS_IN_INODE(raw_inode, EXT4_I(inode), xtime ## _extra)) \
+ ext4_decode_extra_time(&(inode)->xtime, \
+ raw_inode->xtime ## _extra); \
+ else \
+ (inode)->xtime.tv_nsec = 0; \
+} while (0)
+
+#define EXT4_EINODE_GET_XTIME(xtime, einode, raw_inode) \
+do { \
+ if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime)) \
+ (einode)->xtime.tv_sec = \
+ (signed)le32_to_cpu((raw_inode)->xtime); \
+ else \
+ (einode)->xtime.tv_sec = 0; \
+ if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime ## _extra)) \
+ ext4_decode_extra_time(&(einode)->xtime, \
+ raw_inode->xtime ## _extra); \
+ else \
+ (einode)->xtime.tv_nsec = 0; \
+} while (0)
+
+#define i_disk_version osd1.linux1.l_i_version
+
+#if defined(__KERNEL__) || defined(__linux__)
+#define i_reserved1 osd1.linux1.l_i_reserved1
+#define i_file_acl_high osd2.linux2.l_i_file_acl_high
+#define i_blocks_high osd2.linux2.l_i_blocks_high
+#define i_uid_low i_uid
+#define i_gid_low i_gid
+#define i_uid_high osd2.linux2.l_i_uid_high
+#define i_gid_high osd2.linux2.l_i_gid_high
+#define i_checksum_lo osd2.linux2.l_i_checksum_lo
+
+#elif defined(__GNU__)
+
+#define i_translator osd1.hurd1.h_i_translator
+#define i_uid_high osd2.hurd2.h_i_uid_high
+#define i_gid_high osd2.hurd2.h_i_gid_high
+#define i_author osd2.hurd2.h_i_author
+
+#elif defined(__masix__)
+
+#define i_reserved1 osd1.masix1.m_i_reserved1
+#define i_file_acl_high osd2.masix2.m_i_file_acl_high
+#define i_reserved2 osd2.masix2.m_i_reserved2
+
+#endif /* defined(__KERNEL__) || defined(__linux__) */
+
+#include "extents_status.h"
+
+/*
+ * fourth extended file system inode data in memory
+ */
+struct ext4_inode_info {
+ __le32 i_data[15]; /* unconverted */
+ __u32 i_dtime;
+ ext4_fsblk_t i_file_acl;
+
+ /*
+ * i_block_group is the number of the block group which contains
+ * this file's inode. Constant across the lifetime of the inode,
+ * it is ued for making block allocation decisions - we try to
+ * place a file's data blocks near its inode block, and new inodes
+ * near to their parent directory's inode.
+ */
+ ext4_group_t i_block_group;
+ ext4_lblk_t i_dir_start_lookup;
+#if (BITS_PER_LONG < 64)
+ unsigned long i_state_flags; /* Dynamic state flags */
+#endif
+ unsigned long i_flags;
+
+ /*
+ * Extended attributes can be read independently of the main file
+ * data. Taking i_mutex even when reading would cause contention
+ * between readers of EAs and writers of regular file data, so
+ * instead we synchronize on xattr_sem when reading or changing
+ * EAs.
+ */
+ struct rw_semaphore xattr_sem;
+
+ struct list_head i_orphan; /* unlinked but open inodes */
+
+ /*
+ * i_disksize keeps track of what the inode size is ON DISK, not
+ * in memory. During truncate, i_size is set to the new size by
+ * the VFS prior to calling ext4_truncate(), but the filesystem won't
+ * set i_disksize to 0 until the truncate is actually under way.
+ *
+ * The intent is that i_disksize always represents the blocks which
+ * are used by this file. This allows recovery to restart truncate
+ * on orphans if we crash during truncate. We actually write i_disksize
+ * into the on-disk inode when writing inodes out, instead of i_size.
+ *
+ * The only time when i_disksize and i_size may be different is when
+ * a truncate is in progress. The only things which change i_disksize
+ * are ext4_get_block (growth) and ext4_truncate (shrinkth).
+ */
+ loff_t i_disksize;
+
+ /*
+ * i_data_sem is for serialising ext4_truncate() against
+ * ext4_getblock(). In the 2.4 ext2 design, great chunks of inode's
+ * data tree are chopped off during truncate. We can't do that in
+ * ext4 because whenever we perform intermediate commits during
+ * truncate, the inode and all the metadata blocks *must* be in a
+ * consistent state which allows truncation of the orphans to restart
+ * during recovery. Hence we must fix the get_block-vs-truncate race
+ * by other means, so we have i_data_sem.
+ */
+ struct rw_semaphore i_data_sem;
+ struct inode vfs_inode;
+ struct jbd2_inode *jinode;
+
+ spinlock_t i_raw_lock; /* protects updates to the raw inode */
+
+ /*
+ * File creation time. Its function is same as that of
+ * struct timespec i_{a,c,m}time in the generic inode.
+ */
+ struct timespec i_crtime;
+
+ /* mballoc */
+ struct list_head i_prealloc_list;
+ spinlock_t i_prealloc_lock;
+
+ /* extents status tree */
+ struct ext4_es_tree i_es_tree;
+ rwlock_t i_es_lock;
+ struct list_head i_es_lru;
+ unsigned int i_es_lru_nr; /* protected by i_es_lock */
+ unsigned long i_touch_when; /* jiffies of last accessing */
+
+ /* ialloc */
+ ext4_group_t i_last_alloc_group;
+
+ /* allocation reservation info for delalloc */
+ /* In case of bigalloc, these refer to clusters rather than blocks */
+ unsigned int i_reserved_data_blocks;
+ unsigned int i_reserved_meta_blocks;
+ unsigned int i_allocated_meta_blocks;
+ ext4_lblk_t i_da_metadata_calc_last_lblock;
+ int i_da_metadata_calc_len;
+
+ /* on-disk additional length */
+ __u16 i_extra_isize;
+
+ /* Indicate the inline data space. */
+ u16 i_inline_off;
+ u16 i_inline_size;
+
+#ifdef CONFIG_QUOTA
+ /* quota space reservation, managed internally by quota code */
+ qsize_t i_reserved_quota;
+#endif
+
+ /* Lock protecting lists below */
+ spinlock_t i_completed_io_lock;
+ /*
+ * Completed IOs that need unwritten extents handling and have
+ * transaction reserved
+ */
+ struct list_head i_rsv_conversion_list;
+ /*
+ * Completed IOs that need unwritten extents handling and don't have
+ * transaction reserved
+ */
+ atomic_t i_ioend_count; /* Number of outstanding io_end structs */
+ atomic_t i_unwritten; /* Nr. of inflight conversions pending */
+ struct work_struct i_rsv_conversion_work;
+
+ spinlock_t i_block_reservation_lock;
+
+ /*
+ * Transactions that contain inode's metadata needed to complete
+ * fsync and fdatasync, respectively.
+ */
+ tid_t i_sync_tid;
+ tid_t i_datasync_tid;
+
+ /* Precomputed uuid+inum+igen checksum for seeding inode checksums */
+ __u32 i_csum_seed;
+};
+
+/*
+ * File system states
+ */
+#define EXT4_VALID_FS 0x0001 /* Unmounted cleanly */
+#define EXT4_ERROR_FS 0x0002 /* Errors detected */
+#define EXT4_ORPHAN_FS 0x0004 /* Orphans being recovered */
+
+/*
+ * Misc. filesystem flags
+ */
+#define EXT2_FLAGS_SIGNED_HASH 0x0001 /* Signed dirhash in use */
+#define EXT2_FLAGS_UNSIGNED_HASH 0x0002 /* Unsigned dirhash in use */
+#define EXT2_FLAGS_TEST_FILESYS 0x0004 /* to test development code */
+
+/*
+ * Mount flags set via mount options or defaults
+ */
+#define EXT4_MOUNT_GRPID 0x00004 /* Create files with directory's group */
+#define EXT4_MOUNT_DEBUG 0x00008 /* Some debugging messages */
+#define EXT4_MOUNT_ERRORS_CONT 0x00010 /* Continue on errors */
+#define EXT4_MOUNT_ERRORS_RO 0x00020 /* Remount fs ro on errors */
+#define EXT4_MOUNT_ERRORS_PANIC 0x00040 /* Panic on errors */
+#define EXT4_MOUNT_ERRORS_MASK 0x00070
+#define EXT4_MOUNT_MINIX_DF 0x00080 /* Mimics the Minix statfs */
+#define EXT4_MOUNT_NOLOAD 0x00100 /* Don't use existing journal*/
+#define EXT4_MOUNT_DATA_FLAGS 0x00C00 /* Mode for data writes: */
+#define EXT4_MOUNT_JOURNAL_DATA 0x00400 /* Write data to journal */
+#define EXT4_MOUNT_ORDERED_DATA 0x00800 /* Flush data before commit */
+#define EXT4_MOUNT_WRITEBACK_DATA 0x00C00 /* No data ordering */
+#define EXT4_MOUNT_UPDATE_JOURNAL 0x01000 /* Update the journal format */
+#define EXT4_MOUNT_NO_UID32 0x02000 /* Disable 32-bit UIDs */
+#define EXT4_MOUNT_XATTR_USER 0x04000 /* Extended user attributes */
+#define EXT4_MOUNT_POSIX_ACL 0x08000 /* POSIX Access Control Lists */
+#define EXT4_MOUNT_NO_AUTO_DA_ALLOC 0x10000 /* No auto delalloc mapping */
+#define EXT4_MOUNT_BARRIER 0x20000 /* Use block barriers */
+#define EXT4_MOUNT_QUOTA 0x80000 /* Some quota option set */
+#define EXT4_MOUNT_USRQUOTA 0x100000 /* "old" user quota */
+#define EXT4_MOUNT_GRPQUOTA 0x200000 /* "old" group quota */
+#define EXT4_MOUNT_DIOREAD_NOLOCK 0x400000 /* Enable support for dio read nolocking */
+#define EXT4_MOUNT_JOURNAL_CHECKSUM 0x800000 /* Journal checksums */
+#define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */
+#define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */
+#define EXT4_MOUNT_DATA_ERR_ABORT 0x10000000 /* Abort on file data write */
+#define EXT4_MOUNT_BLOCK_VALIDITY 0x20000000 /* Block validity checking */
+#define EXT4_MOUNT_DISCARD 0x40000000 /* Issue DISCARD requests */
+#define EXT4_MOUNT_INIT_INODE_TABLE 0x80000000 /* Initialize uninitialized itables */
+
+/*
+ * Mount flags set either automatically (could not be set by mount option)
+ * based on per file system feature or property or in special cases such as
+ * distinguishing between explicit mount option definition and default.
+ */
+#define EXT4_MOUNT2_EXPLICIT_DELALLOC 0x00000001 /* User explicitly
+ specified delalloc */
+#define EXT4_MOUNT2_STD_GROUP_SIZE 0x00000002 /* We have standard group
+ size of blocksize * 8
+ blocks */
+#define EXT4_MOUNT2_HURD_COMPAT 0x00000004 /* Support HURD-castrated
+ file systems */
+
+#define clear_opt(sb, opt) EXT4_SB(sb)->s_mount_opt &= \
+ ~EXT4_MOUNT_##opt
+#define set_opt(sb, opt) EXT4_SB(sb)->s_mount_opt |= \
+ EXT4_MOUNT_##opt
+#define test_opt(sb, opt) (EXT4_SB(sb)->s_mount_opt & \
+ EXT4_MOUNT_##opt)
+
+#define clear_opt2(sb, opt) EXT4_SB(sb)->s_mount_opt2 &= \
+ ~EXT4_MOUNT2_##opt
+#define set_opt2(sb, opt) EXT4_SB(sb)->s_mount_opt2 |= \
+ EXT4_MOUNT2_##opt
+#define test_opt2(sb, opt) (EXT4_SB(sb)->s_mount_opt2 & \
+ EXT4_MOUNT2_##opt)
+
+#define ext4_test_and_set_bit __test_and_set_bit_le
+#define ext4_set_bit __set_bit_le
+#define ext4_set_bit_atomic ext2_set_bit_atomic
+#define ext4_test_and_clear_bit __test_and_clear_bit_le
+#define ext4_clear_bit __clear_bit_le
+#define ext4_clear_bit_atomic ext2_clear_bit_atomic
+#define ext4_test_bit test_bit_le
+#define ext4_find_next_zero_bit find_next_zero_bit_le
+#define ext4_find_next_bit find_next_bit_le
+
+extern void ext4_set_bits(void *bm, int cur, int len);
+
+/*
+ * Maximal mount counts between two filesystem checks
+ */
+#define EXT4_DFL_MAX_MNT_COUNT 20 /* Allow 20 mounts */
+#define EXT4_DFL_CHECKINTERVAL 0 /* Don't use interval check */
+
+/*
+ * Behaviour when detecting errors
+ */
+#define EXT4_ERRORS_CONTINUE 1 /* Continue execution */
+#define EXT4_ERRORS_RO 2 /* Remount fs read-only */
+#define EXT4_ERRORS_PANIC 3 /* Panic */
+#define EXT4_ERRORS_DEFAULT EXT4_ERRORS_CONTINUE
+
+/* Metadata checksum algorithm codes */
+#define EXT4_CRC32C_CHKSUM 1
+
+/*
+ * Structure of the super block
+ */
+struct ext4_super_block {
+/*00*/ __le32 s_inodes_count; /* Inodes count */
+ __le32 s_blocks_count_lo; /* Blocks count */
+ __le32 s_r_blocks_count_lo; /* Reserved blocks count */
+ __le32 s_free_blocks_count_lo; /* Free blocks count */
+/*10*/ __le32 s_free_inodes_count; /* Free inodes count */
+ __le32 s_first_data_block; /* First Data Block */
+ __le32 s_log_block_size; /* Block size */
+ __le32 s_log_cluster_size; /* Allocation cluster size */
+/*20*/ __le32 s_blocks_per_group; /* # Blocks per group */
+ __le32 s_clusters_per_group; /* # Clusters per group */
+ __le32 s_inodes_per_group; /* # Inodes per group */
+ __le32 s_mtime; /* Mount time */
+/*30*/ __le32 s_wtime; /* Write time */
+ __le16 s_mnt_count; /* Mount count */
+ __le16 s_max_mnt_count; /* Maximal mount count */
+ __le16 s_magic; /* Magic signature */
+ __le16 s_state; /* File system state */
+ __le16 s_errors; /* Behaviour when detecting errors */
+ __le16 s_minor_rev_level; /* minor revision level */
+/*40*/ __le32 s_lastcheck; /* time of last check */
+ __le32 s_checkinterval; /* max. time between checks */
+ __le32 s_creator_os; /* OS */
+ __le32 s_rev_level; /* Revision level */
+/*50*/ __le16 s_def_resuid; /* Default uid for reserved blocks */
+ __le16 s_def_resgid; /* Default gid for reserved blocks */
+ /*
+ * These fields are for EXT4_DYNAMIC_REV superblocks only.
+ *
+ * Note: the difference between the compatible feature set and
+ * the incompatible feature set is that if there is a bit set
+ * in the incompatible feature set that the kernel doesn't
+ * know about, it should refuse to mount the filesystem.
+ *
+ * e2fsck's requirements are more strict; if it doesn't know
+ * about a feature in either the compatible or incompatible
+ * feature set, it must abort and not try to meddle with
+ * things it doesn't understand...
+ */
+ __le32 s_first_ino; /* First non-reserved inode */
+ __le16 s_inode_size; /* size of inode structure */
+ __le16 s_block_group_nr; /* block group # of this superblock */
+ __le32 s_feature_compat; /* compatible feature set */
+/*60*/ __le32 s_feature_incompat; /* incompatible feature set */
+ __le32 s_feature_ro_compat; /* readonly-compatible feature set */
+/*68*/ __u8 s_uuid[16]; /* 128-bit uuid for volume */
+/*78*/ char s_volume_name[16]; /* volume name */
+/*88*/ char s_last_mounted[64]; /* directory where last mounted */
+/*C8*/ __le32 s_algorithm_usage_bitmap; /* For compression */
+ /*
+ * Performance hints. Directory preallocation should only
+ * happen if the EXT4_FEATURE_COMPAT_DIR_PREALLOC flag is on.
+ */
+ __u8 s_prealloc_blocks; /* Nr of blocks to try to preallocate*/
+ __u8 s_prealloc_dir_blocks; /* Nr to preallocate for dirs */
+ __le16 s_reserved_gdt_blocks; /* Per group desc for online growth */
+ /*
+ * Journaling support valid if EXT4_FEATURE_COMPAT_HAS_JOURNAL set.
+ */
+/*D0*/ __u8 s_journal_uuid[16]; /* uuid of journal superblock */
+/*E0*/ __le32 s_journal_inum; /* inode number of journal file */
+ __le32 s_journal_dev; /* device number of journal file */
+ __le32 s_last_orphan; /* start of list of inodes to delete */
+ __le32 s_hash_seed[4]; /* HTREE hash seed */
+ __u8 s_def_hash_version; /* Default hash version to use */
+ __u8 s_jnl_backup_type;
+ __le16 s_desc_size; /* size of group descriptor */
+/*100*/ __le32 s_default_mount_opts;
+ __le32 s_first_meta_bg; /* First metablock block group */
+ __le32 s_mkfs_time; /* When the filesystem was created */
+ __le32 s_jnl_blocks[17]; /* Backup of the journal inode */
+ /* 64bit support valid if EXT4_FEATURE_COMPAT_64BIT */
+/*150*/ __le32 s_blocks_count_hi; /* Blocks count */
+ __le32 s_r_blocks_count_hi; /* Reserved blocks count */
+ __le32 s_free_blocks_count_hi; /* Free blocks count */
+ __le16 s_min_extra_isize; /* All inodes have at least # bytes */
+ __le16 s_want_extra_isize; /* New inodes should reserve # bytes */
+ __le32 s_flags; /* Miscellaneous flags */
+ __le16 s_raid_stride; /* RAID stride */
+ __le16 s_mmp_update_interval; /* # seconds to wait in MMP checking */
+ __le64 s_mmp_block; /* Block for multi-mount protection */
+ __le32 s_raid_stripe_width; /* blocks on all data disks (N*stride)*/
+ __u8 s_log_groups_per_flex; /* FLEX_BG group size */
+ __u8 s_checksum_type; /* metadata checksum algorithm used */
+ __le16 s_reserved_pad;
+ __le64 s_kbytes_written; /* nr of lifetime kilobytes written */
+ __le32 s_snapshot_inum; /* Inode number of active snapshot */
+ __le32 s_snapshot_id; /* sequential ID of active snapshot */
+ __le64 s_snapshot_r_blocks_count; /* reserved blocks for active
+ snapshot's future use */
+ __le32 s_snapshot_list; /* inode number of the head of the
+ on-disk snapshot list */
+#define EXT4_S_ERR_START offsetof(struct ext4_super_block, s_error_count)
+ __le32 s_error_count; /* number of fs errors */
+ __le32 s_first_error_time; /* first time an error happened */
+ __le32 s_first_error_ino; /* inode involved in first error */
+ __le64 s_first_error_block; /* block involved of first error */
+ __u8 s_first_error_func[32]; /* function where the error happened */
+ __le32 s_first_error_line; /* line number where error happened */
+ __le32 s_last_error_time; /* most recent time of an error */
+ __le32 s_last_error_ino; /* inode involved in last error */
+ __le32 s_last_error_line; /* line number where error happened */
+ __le64 s_last_error_block; /* block involved of last error */
+ __u8 s_last_error_func[32]; /* function where the error happened */
+#define EXT4_S_ERR_END offsetof(struct ext4_super_block, s_mount_opts)
+ __u8 s_mount_opts[64];
+ __le32 s_usr_quota_inum; /* inode for tracking user quota */
+ __le32 s_grp_quota_inum; /* inode for tracking group quota */
+ __le32 s_overhead_clusters; /* overhead blocks/clusters in fs */
+ __le32 s_backup_bgs[2]; /* groups with sparse_super2 SBs */
+ __le32 s_reserved[106]; /* Padding to the end of the block */
+ __le32 s_checksum; /* crc32c(superblock) */
+};
+
+#define EXT4_S_ERR_LEN (EXT4_S_ERR_END - EXT4_S_ERR_START)
+
+#ifdef __KERNEL__
+
+/*
+ * run-time mount flags
+ */
+#define EXT4_MF_MNTDIR_SAMPLED 0x0001
+#define EXT4_MF_FS_ABORTED 0x0002 /* Fatal error detected */
+
+/*
+ * fourth extended-fs super-block data in memory
+ */
+struct ext4_sb_info {
+ unsigned long s_desc_size; /* Size of a group descriptor in bytes */
+ unsigned long s_inodes_per_block;/* Number of inodes per block */
+ unsigned long s_blocks_per_group;/* Number of blocks in a group */
+ unsigned long s_clusters_per_group; /* Number of clusters in a group */
+ unsigned long s_inodes_per_group;/* Number of inodes in a group */
+ unsigned long s_itb_per_group; /* Number of inode table blocks per group */
+ unsigned long s_gdb_count; /* Number of group descriptor blocks */
+ unsigned long s_desc_per_block; /* Number of group descriptors per block */
+ ext4_group_t s_groups_count; /* Number of groups in the fs */
+ ext4_group_t s_blockfile_groups;/* Groups acceptable for non-extent files */
+ unsigned long s_overhead; /* # of fs overhead clusters */
+ unsigned int s_cluster_ratio; /* Number of blocks per cluster */
+ unsigned int s_cluster_bits; /* log2 of s_cluster_ratio */
+ loff_t s_bitmap_maxbytes; /* max bytes for bitmap files */
+ struct buffer_head * s_sbh; /* Buffer containing the super block */
+ struct ext4_super_block *s_es; /* Pointer to the super block in the buffer */
+ struct buffer_head **s_group_desc;
+ unsigned int s_mount_opt;
+ unsigned int s_mount_opt2;
+ unsigned int s_mount_flags;
+ unsigned int s_def_mount_opt;
+ ext4_fsblk_t s_sb_block;
+ atomic64_t s_resv_clusters;
+ kuid_t s_resuid;
+ kgid_t s_resgid;
+ unsigned short s_mount_state;
+ unsigned short s_pad;
+ int s_addr_per_block_bits;
+ int s_desc_per_block_bits;
+ int s_inode_size;
+ int s_first_ino;
+ unsigned int s_inode_readahead_blks;
+ unsigned int s_inode_goal;
+ spinlock_t s_next_gen_lock;
+ u32 s_next_generation;
+ u32 s_hash_seed[4];
+ int s_def_hash_version;
+ int s_hash_unsigned; /* 3 if hash should be signed, 0 if not */
+ struct percpu_counter s_freeclusters_counter;
+ struct percpu_counter s_freeinodes_counter;
+ struct percpu_counter s_dirs_counter;
+ struct percpu_counter s_dirtyclusters_counter;
+ struct blockgroup_lock *s_blockgroup_lock;
+ struct proc_dir_entry *s_proc;
+ struct kobject s_kobj;
+ struct completion s_kobj_unregister;
+ struct super_block *s_sb;
+
+ /* Journaling */
+ struct journal_s *s_journal;
+ struct list_head s_orphan;
+ struct mutex s_orphan_lock;
+ unsigned long s_resize_flags; /* Flags indicating if there
+ is a resizer */
+ unsigned long s_commit_interval;
+ u32 s_max_batch_time;
+ u32 s_min_batch_time;
+ struct block_device *journal_bdev;
+#ifdef CONFIG_QUOTA
+ char *s_qf_names[MAXQUOTAS]; /* Names of quota files with journalled quota */
+ int s_jquota_fmt; /* Format of quota to use */
+#endif
+ unsigned int s_want_extra_isize; /* New inodes should reserve # bytes */
+ struct rb_root system_blks;
+
+#ifdef EXTENTS_STATS
+ /* ext4 extents stats */
+ unsigned long s_ext_min;
+ unsigned long s_ext_max;
+ unsigned long s_depth_max;
+ spinlock_t s_ext_stats_lock;
+ unsigned long s_ext_blocks;
+ unsigned long s_ext_extents;
+#endif
+
+ /* for buddy allocator */
+ struct ext4_group_info ***s_group_info;
+ struct inode *s_buddy_cache;
+ spinlock_t s_md_lock;
+ unsigned short *s_mb_offsets;
+ unsigned int *s_mb_maxs;
+ unsigned int s_group_info_size;
+
+ /* tunables */
+ unsigned long s_stripe;
+ unsigned int s_mb_stream_request;
+ unsigned int s_mb_max_to_scan;
+ unsigned int s_mb_min_to_scan;
+ unsigned int s_mb_stats;
+ unsigned int s_mb_order2_reqs;
+ unsigned int s_mb_group_prealloc;
+ unsigned int s_max_dir_size_kb;
+ /* where last allocation was done - for stream allocation */
+ unsigned long s_mb_last_group;
+ unsigned long s_mb_last_start;
+
+ /* stats for buddy allocator */
+ atomic_t s_bal_reqs; /* number of reqs with len > 1 */
+ atomic_t s_bal_success; /* we found long enough chunks */
+ atomic_t s_bal_allocated; /* in blocks */
+ atomic_t s_bal_ex_scanned; /* total extents scanned */
+ atomic_t s_bal_goals; /* goal hits */
+ atomic_t s_bal_breaks; /* too long searches */
+ atomic_t s_bal_2orders; /* 2^order hits */
+ spinlock_t s_bal_lock;
+ unsigned long s_mb_buddies_generated;
+ unsigned long long s_mb_generation_time;
+ atomic_t s_mb_lost_chunks;
+ atomic_t s_mb_preallocated;
+ atomic_t s_mb_discarded;
+ atomic_t s_lock_busy;
+
+ /* locality groups */
+ struct ext4_locality_group __percpu *s_locality_groups;
+
+ /* for write statistics */
+ unsigned long s_sectors_written_start;
+ u64 s_kbytes_written;
+
+ /* the size of zero-out chunk */
+ unsigned int s_extent_max_zeroout_kb;
+
+ unsigned int s_log_groups_per_flex;
+ struct flex_groups *s_flex_groups;
+ ext4_group_t s_flex_groups_allocated;
+
+ /* workqueue for reserved extent conversions (buffered io) */
+ struct workqueue_struct *rsv_conversion_wq;
+
+ /* timer for periodic error stats printing */
+ struct timer_list s_err_report;
+
+ /* Lazy inode table initialization info */
+ struct ext4_li_request *s_li_request;
+ /* Wait multiplier for lazy initialization thread */
+ unsigned int s_li_wait_mult;
+
+ /* Kernel thread for multiple mount protection */
+ struct task_struct *s_mmp_tsk;
+
+ /* record the last minlen when FITRIM is called. */
+ atomic_t s_last_trim_minblks;
+
+ /* Reference to checksum algorithm driver via cryptoapi */
+ struct crypto_shash *s_chksum_driver;
+
+ /* Precomputed FS UUID checksum for seeding other checksums */
+ __u32 s_csum_seed;
+
+ /* Reclaim extents from extent status tree */
+ struct shrinker s_es_shrinker;
+ struct list_head s_es_lru;
+ unsigned long s_es_last_sorted;
+ struct percpu_counter s_extent_cache_cnt;
+ struct mb_cache *s_mb_cache;
+ spinlock_t s_es_lru_lock ____cacheline_aligned_in_smp;
+
+ /* Ratelimit ext4 messages. */
+ struct ratelimit_state s_err_ratelimit_state;
+ struct ratelimit_state s_warning_ratelimit_state;
+ struct ratelimit_state s_msg_ratelimit_state;
+};
+
+static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb)
+{
+ return sb->s_fs_info;
+}
+static inline struct ext4_inode_info *EXT4_I(struct inode *inode)
+{
+ return container_of(inode, struct ext4_inode_info, vfs_inode);
+}
+
+static inline struct timespec ext4_current_time(struct inode *inode)
+{
+ return (inode->i_sb->s_time_gran < NSEC_PER_SEC) ?
+ current_fs_time(inode->i_sb) : CURRENT_TIME_SEC;
+}
+
+static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
+{
+ return ino == EXT4_ROOT_INO ||
+ ino == EXT4_USR_QUOTA_INO ||
+ ino == EXT4_GRP_QUOTA_INO ||
+ ino == EXT4_BOOT_LOADER_INO ||
+ ino == EXT4_JOURNAL_INO ||
+ ino == EXT4_RESIZE_INO ||
+ (ino >= EXT4_FIRST_INO(sb) &&
+ ino <= le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count));
+}
+
+static inline void ext4_set_io_unwritten_flag(struct inode *inode,
+ struct ext4_io_end *io_end)
+{
+ if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
+ io_end->flag |= EXT4_IO_END_UNWRITTEN;
+ atomic_inc(&EXT4_I(inode)->i_unwritten);
+ }
+}
+
+static inline ext4_io_end_t *ext4_inode_aio(struct inode *inode)
+{
+ return inode->i_private;
+}
+
+static inline void ext4_inode_aio_set(struct inode *inode, ext4_io_end_t *io)
+{
+ inode->i_private = io;
+}
+
+/*
+ * Inode dynamic state flags
+ */
+enum {
+ EXT4_STATE_JDATA, /* journaled data exists */
+ EXT4_STATE_NEW, /* inode is newly created */
+ EXT4_STATE_XATTR, /* has in-inode xattrs */
+ EXT4_STATE_NO_EXPAND, /* No space for expansion */
+ EXT4_STATE_DA_ALLOC_CLOSE, /* Alloc DA blks on close */
+ EXT4_STATE_EXT_MIGRATE, /* Inode is migrating */
+ EXT4_STATE_DIO_UNWRITTEN, /* need convert on dio done*/
+ EXT4_STATE_NEWENTRY, /* File just added to dir */
+ EXT4_STATE_DELALLOC_RESERVED, /* blks already reserved for delalloc */
+ EXT4_STATE_DIOREAD_LOCK, /* Disable support for dio read
+ nolocking */
+ EXT4_STATE_MAY_INLINE_DATA, /* may have in-inode data */
+ EXT4_STATE_ORDERED_MODE, /* data=ordered mode */
+ EXT4_STATE_EXT_PRECACHED, /* extents have been precached */
+};
+
+#define EXT4_INODE_BIT_FNS(name, field, offset) \
+static inline int ext4_test_inode_##name(struct inode *inode, int bit) \
+{ \
+ return test_bit(bit + (offset), &EXT4_I(inode)->i_##field); \
+} \
+static inline void ext4_set_inode_##name(struct inode *inode, int bit) \
+{ \
+ set_bit(bit + (offset), &EXT4_I(inode)->i_##field); \
+} \
+static inline void ext4_clear_inode_##name(struct inode *inode, int bit) \
+{ \
+ clear_bit(bit + (offset), &EXT4_I(inode)->i_##field); \
+}
+
+/* Add these declarations here only so that these functions can be
+ * found by name. Otherwise, they are very hard to locate. */
+static inline int ext4_test_inode_flag(struct inode *inode, int bit);
+static inline void ext4_set_inode_flag(struct inode *inode, int bit);
+static inline void ext4_clear_inode_flag(struct inode *inode, int bit);
+EXT4_INODE_BIT_FNS(flag, flags, 0)
+
+/* Add these declarations here only so that these functions can be
+ * found by name. Otherwise, they are very hard to locate. */
+static inline int ext4_test_inode_state(struct inode *inode, int bit);
+static inline void ext4_set_inode_state(struct inode *inode, int bit);
+static inline void ext4_clear_inode_state(struct inode *inode, int bit);
+#if (BITS_PER_LONG < 64)
+EXT4_INODE_BIT_FNS(state, state_flags, 0)
+
+static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
+{
+ (ei)->i_state_flags = 0;
+}
+#else
+EXT4_INODE_BIT_FNS(state, flags, 32)
+
+static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
+{
+ /* We depend on the fact that callers will set i_flags */
+}
+#endif
+#else
+/* Assume that user mode programs are passing in an ext4fs superblock, not
+ * a kernel struct super_block. This will allow us to call the feature-test
+ * macros from user land. */
+#define EXT4_SB(sb) (sb)
+#endif
+
+#define NEXT_ORPHAN(inode) EXT4_I(inode)->i_dtime
+
+/*
+ * Codes for operating systems
+ */
+#define EXT4_OS_LINUX 0
+#define EXT4_OS_HURD 1
+#define EXT4_OS_MASIX 2
+#define EXT4_OS_FREEBSD 3
+#define EXT4_OS_LITES 4
+
+/*
+ * Revision levels
+ */
+#define EXT4_GOOD_OLD_REV 0 /* The good old (original) format */
+#define EXT4_DYNAMIC_REV 1 /* V2 format w/ dynamic inode sizes */
+
+#define EXT4_CURRENT_REV EXT4_GOOD_OLD_REV
+#define EXT4_MAX_SUPP_REV EXT4_DYNAMIC_REV
+
+#define EXT4_GOOD_OLD_INODE_SIZE 128
+
+/*
+ * Feature set definitions
+ */
+
+#define EXT4_HAS_COMPAT_FEATURE(sb,mask) \
+ ((EXT4_SB(sb)->s_es->s_feature_compat & cpu_to_le32(mask)) != 0)
+#define EXT4_HAS_RO_COMPAT_FEATURE(sb,mask) \
+ ((EXT4_SB(sb)->s_es->s_feature_ro_compat & cpu_to_le32(mask)) != 0)
+#define EXT4_HAS_INCOMPAT_FEATURE(sb,mask) \
+ ((EXT4_SB(sb)->s_es->s_feature_incompat & cpu_to_le32(mask)) != 0)
+#define EXT4_SET_COMPAT_FEATURE(sb,mask) \
+ EXT4_SB(sb)->s_es->s_feature_compat |= cpu_to_le32(mask)
+#define EXT4_SET_RO_COMPAT_FEATURE(sb,mask) \
+ EXT4_SB(sb)->s_es->s_feature_ro_compat |= cpu_to_le32(mask)
+#define EXT4_SET_INCOMPAT_FEATURE(sb,mask) \
+ EXT4_SB(sb)->s_es->s_feature_incompat |= cpu_to_le32(mask)
+#define EXT4_CLEAR_COMPAT_FEATURE(sb,mask) \
+ EXT4_SB(sb)->s_es->s_feature_compat &= ~cpu_to_le32(mask)
+#define EXT4_CLEAR_RO_COMPAT_FEATURE(sb,mask) \
+ EXT4_SB(sb)->s_es->s_feature_ro_compat &= ~cpu_to_le32(mask)
+#define EXT4_CLEAR_INCOMPAT_FEATURE(sb,mask) \
+ EXT4_SB(sb)->s_es->s_feature_incompat &= ~cpu_to_le32(mask)
+
+#define EXT4_FEATURE_COMPAT_DIR_PREALLOC 0x0001
+#define EXT4_FEATURE_COMPAT_IMAGIC_INODES 0x0002
+#define EXT4_FEATURE_COMPAT_HAS_JOURNAL 0x0004
+#define EXT4_FEATURE_COMPAT_EXT_ATTR 0x0008
+#define EXT4_FEATURE_COMPAT_RESIZE_INODE 0x0010
+#define EXT4_FEATURE_COMPAT_DIR_INDEX 0x0020
+#define EXT4_FEATURE_COMPAT_SPARSE_SUPER2 0x0200
+
+#define EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER 0x0001
+#define EXT4_FEATURE_RO_COMPAT_LARGE_FILE 0x0002
+#define EXT4_FEATURE_RO_COMPAT_BTREE_DIR 0x0004
+#define EXT4_FEATURE_RO_COMPAT_HUGE_FILE 0x0008
+#define EXT4_FEATURE_RO_COMPAT_GDT_CSUM 0x0010
+#define EXT4_FEATURE_RO_COMPAT_DIR_NLINK 0x0020
+#define EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE 0x0040
+#define EXT4_FEATURE_RO_COMPAT_QUOTA 0x0100
+#define EXT4_FEATURE_RO_COMPAT_BIGALLOC 0x0200
+/*
+ * METADATA_CSUM also enables group descriptor checksums (GDT_CSUM). When
+ * METADATA_CSUM is set, group descriptor checksums use the same algorithm as
+ * all other data structures' checksums. However, the METADATA_CSUM and
+ * GDT_CSUM bits are mutually exclusive.
+ */
+#define EXT4_FEATURE_RO_COMPAT_METADATA_CSUM 0x0400
+
+#define EXT4_FEATURE_INCOMPAT_COMPRESSION 0x0001
+#define EXT4_FEATURE_INCOMPAT_FILETYPE 0x0002
+#define EXT4_FEATURE_INCOMPAT_RECOVER 0x0004 /* Needs recovery */
+#define EXT4_FEATURE_INCOMPAT_JOURNAL_DEV 0x0008 /* Journal device */
+#define EXT4_FEATURE_INCOMPAT_META_BG 0x0010
+#define EXT4_FEATURE_INCOMPAT_EXTENTS 0x0040 /* extents support */
+#define EXT4_FEATURE_INCOMPAT_64BIT 0x0080
+#define EXT4_FEATURE_INCOMPAT_MMP 0x0100
+#define EXT4_FEATURE_INCOMPAT_FLEX_BG 0x0200
+#define EXT4_FEATURE_INCOMPAT_EA_INODE 0x0400 /* EA in inode */
+#define EXT4_FEATURE_INCOMPAT_DIRDATA 0x1000 /* data in dirent */
+#define EXT4_FEATURE_INCOMPAT_BG_USE_META_CSUM 0x2000 /* use crc32c for bg */
+#define EXT4_FEATURE_INCOMPAT_LARGEDIR 0x4000 /* >2GB or 3-lvl htree */
+#define EXT4_FEATURE_INCOMPAT_INLINE_DATA 0x8000 /* data in inode */
+
+#define EXT2_FEATURE_COMPAT_SUPP EXT4_FEATURE_COMPAT_EXT_ATTR
+#define EXT2_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \
+ EXT4_FEATURE_INCOMPAT_META_BG)
+#define EXT2_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \
+ EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \
+ EXT4_FEATURE_RO_COMPAT_BTREE_DIR)
+
+#define EXT3_FEATURE_COMPAT_SUPP EXT4_FEATURE_COMPAT_EXT_ATTR
+#define EXT3_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \
+ EXT4_FEATURE_INCOMPAT_RECOVER| \
+ EXT4_FEATURE_INCOMPAT_META_BG)
+#define EXT3_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \
+ EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \
+ EXT4_FEATURE_RO_COMPAT_BTREE_DIR)
+
+#define EXT4_FEATURE_COMPAT_SUPP EXT2_FEATURE_COMPAT_EXT_ATTR
+#define EXT4_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \
+ EXT4_FEATURE_INCOMPAT_RECOVER| \
+ EXT4_FEATURE_INCOMPAT_META_BG| \
+ EXT4_FEATURE_INCOMPAT_EXTENTS| \
+ EXT4_FEATURE_INCOMPAT_64BIT| \
+ EXT4_FEATURE_INCOMPAT_FLEX_BG| \
+ EXT4_FEATURE_INCOMPAT_MMP | \
+ EXT4_FEATURE_INCOMPAT_INLINE_DATA)
+#define EXT4_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \
+ EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \
+ EXT4_FEATURE_RO_COMPAT_GDT_CSUM| \
+ EXT4_FEATURE_RO_COMPAT_DIR_NLINK | \
+ EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE | \
+ EXT4_FEATURE_RO_COMPAT_BTREE_DIR |\
+ EXT4_FEATURE_RO_COMPAT_HUGE_FILE |\
+ EXT4_FEATURE_RO_COMPAT_BIGALLOC |\
+ EXT4_FEATURE_RO_COMPAT_METADATA_CSUM|\
+ EXT4_FEATURE_RO_COMPAT_QUOTA)
+
+/*
+ * Default values for user and/or group using reserved blocks
+ */
+#define EXT4_DEF_RESUID 0
+#define EXT4_DEF_RESGID 0
+
+#define EXT4_DEF_INODE_READAHEAD_BLKS 32
+
+/*
+ * Default mount options
+ */
+#define EXT4_DEFM_DEBUG 0x0001
+#define EXT4_DEFM_BSDGROUPS 0x0002
+#define EXT4_DEFM_XATTR_USER 0x0004
+#define EXT4_DEFM_ACL 0x0008
+#define EXT4_DEFM_UID16 0x0010
+#define EXT4_DEFM_JMODE 0x0060
+#define EXT4_DEFM_JMODE_DATA 0x0020
+#define EXT4_DEFM_JMODE_ORDERED 0x0040
+#define EXT4_DEFM_JMODE_WBACK 0x0060
+#define EXT4_DEFM_NOBARRIER 0x0100
+#define EXT4_DEFM_BLOCK_VALIDITY 0x0200
+#define EXT4_DEFM_DISCARD 0x0400
+#define EXT4_DEFM_NODELALLOC 0x0800
+
+/*
+ * Default journal batch times
+ */
+#define EXT4_DEF_MIN_BATCH_TIME 0
+#define EXT4_DEF_MAX_BATCH_TIME 15000 /* 15ms */
+
+/*
+ * Minimum number of groups in a flexgroup before we separate out
+ * directories into the first block group of a flexgroup
+ */
+#define EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME 4
+
+/*
+ * Structure of a directory entry
+ */
+#define EXT4_NAME_LEN 255
+
+struct ext4_dir_entry {
+ __le32 inode; /* Inode number */
+ __le16 rec_len; /* Directory entry length */
+ __le16 name_len; /* Name length */
+ char name[EXT4_NAME_LEN]; /* File name */
+};
+
+/*
+ * The new version of the directory entry. Since EXT4 structures are
+ * stored in intel byte order, and the name_len field could never be
+ * bigger than 255 chars, it's safe to reclaim the extra byte for the
+ * file_type field.
+ */
+struct ext4_dir_entry_2 {
+ __le32 inode; /* Inode number */
+ __le16 rec_len; /* Directory entry length */
+ __u8 name_len; /* Name length */
+ __u8 file_type;
+ char name[EXT4_NAME_LEN]; /* File name */
+};
+
+/*
+ * This is a bogus directory entry at the end of each leaf block that
+ * records checksums.
+ */
+struct ext4_dir_entry_tail {
+ __le32 det_reserved_zero1; /* Pretend to be unused */
+ __le16 det_rec_len; /* 12 */
+ __u8 det_reserved_zero2; /* Zero name length */
+ __u8 det_reserved_ft; /* 0xDE, fake file type */
+ __le32 det_checksum; /* crc32c(uuid+inum+dirblock) */
+};
+
+#define EXT4_DIRENT_TAIL(block, blocksize) \
+ ((struct ext4_dir_entry_tail *)(((void *)(block)) + \
+ ((blocksize) - \
+ sizeof(struct ext4_dir_entry_tail))))
+
+/*
+ * Ext4 directory file types. Only the low 3 bits are used. The
+ * other bits are reserved for now.
+ */
+#define EXT4_FT_UNKNOWN 0
+#define EXT4_FT_REG_FILE 1
+#define EXT4_FT_DIR 2
+#define EXT4_FT_CHRDEV 3
+#define EXT4_FT_BLKDEV 4
+#define EXT4_FT_FIFO 5
+#define EXT4_FT_SOCK 6
+#define EXT4_FT_SYMLINK 7
+
+#define EXT4_FT_MAX 8
+
+#define EXT4_FT_DIR_CSUM 0xDE
+
+/*
+ * EXT4_DIR_PAD defines the directory entries boundaries
+ *
+ * NOTE: It must be a multiple of 4
+ */
+#define EXT4_DIR_PAD 4
+#define EXT4_DIR_ROUND (EXT4_DIR_PAD - 1)
+#define EXT4_DIR_REC_LEN(name_len) (((name_len) + 8 + EXT4_DIR_ROUND) & \
+ ~EXT4_DIR_ROUND)
+#define EXT4_MAX_REC_LEN ((1<<16)-1)
+
+/*
+ * If we ever get support for fs block sizes > page_size, we'll need
+ * to remove the #if statements in the next two functions...
+ */
+static inline unsigned int
+ext4_rec_len_from_disk(__le16 dlen, unsigned blocksize)
+{
+ unsigned len = le16_to_cpu(dlen);
+
+#if (PAGE_CACHE_SIZE >= 65536)
+ if (len == EXT4_MAX_REC_LEN || len == 0)
+ return blocksize;
+ return (len & 65532) | ((len & 3) << 16);
+#else
+ return len;
+#endif
+}
+
+static inline __le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize)
+{
+ if ((len > blocksize) || (blocksize > (1 << 18)) || (len & 3))
+ BUG();
+#if (PAGE_CACHE_SIZE >= 65536)
+ if (len < 65536)
+ return cpu_to_le16(len);
+ if (len == blocksize) {
+ if (blocksize == 65536)
+ return cpu_to_le16(EXT4_MAX_REC_LEN);
+ else
+ return cpu_to_le16(0);
+ }
+ return cpu_to_le16((len & 65532) | ((len >> 16) & 3));
+#else
+ return cpu_to_le16(len);
+#endif
+}
+
+/*
+ * Hash Tree Directory indexing
+ * (c) Daniel Phillips, 2001
+ */
+
+#define is_dx(dir) (EXT4_HAS_COMPAT_FEATURE(dir->i_sb, \
+ EXT4_FEATURE_COMPAT_DIR_INDEX) && \
+ ext4_test_inode_flag((dir), EXT4_INODE_INDEX))
+#define EXT4_DIR_LINK_MAX(dir) (!is_dx(dir) && (dir)->i_nlink >= EXT4_LINK_MAX)
+#define EXT4_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || (dir)->i_nlink == 1)
+
+/* Legal values for the dx_root hash_version field: */
+
+#define DX_HASH_LEGACY 0
+#define DX_HASH_HALF_MD4 1
+#define DX_HASH_TEA 2
+#define DX_HASH_LEGACY_UNSIGNED 3
+#define DX_HASH_HALF_MD4_UNSIGNED 4
+#define DX_HASH_TEA_UNSIGNED 5
+
+static inline u32 ext4_chksum(struct ext4_sb_info *sbi, u32 crc,
+ const void *address, unsigned int length)
+{
+ struct {
+ struct shash_desc shash;
+ char ctx[4];
+ } desc;
+ int err;
+
+ BUG_ON(crypto_shash_descsize(sbi->s_chksum_driver)!=sizeof(desc.ctx));
+
+ desc.shash.tfm = sbi->s_chksum_driver;
+ desc.shash.flags = 0;
+ *(u32 *)desc.ctx = crc;
+
+ err = crypto_shash_update(&desc.shash, address, length);
+ BUG_ON(err);
+
+ return *(u32 *)desc.ctx;
+}
+
+#ifdef __KERNEL__
+
+/* hash info structure used by the directory hash */
+struct dx_hash_info
+{
+ u32 hash;
+ u32 minor_hash;
+ int hash_version;
+ u32 *seed;
+};
+
+
+/* 32 and 64 bit signed EOF for dx directories */
+#define EXT4_HTREE_EOF_32BIT ((1UL << (32 - 1)) - 1)
+#define EXT4_HTREE_EOF_64BIT ((1ULL << (64 - 1)) - 1)
+
+
+/*
+ * Control parameters used by ext4_htree_next_block
+ */
+#define HASH_NB_ALWAYS 1
+
+
+/*
+ * Describe an inode's exact location on disk and in memory
+ */
+struct ext4_iloc
+{
+ struct buffer_head *bh;
+ unsigned long offset;
+ ext4_group_t block_group;
+};
+
+static inline struct ext4_inode *ext4_raw_inode(struct ext4_iloc *iloc)
+{
+ return (struct ext4_inode *) (iloc->bh->b_data + iloc->offset);
+}
+
+/*
+ * This structure is stuffed into the struct file's private_data field
+ * for directories. It is where we put information so that we can do
+ * readdir operations in hash tree order.
+ */
+struct dir_private_info {
+ struct rb_root root;
+ struct rb_node *curr_node;
+ struct fname *extra_fname;
+ loff_t last_pos;
+ __u32 curr_hash;
+ __u32 curr_minor_hash;
+ __u32 next_hash;
+};
+
+/* calculate the first block number of the group */
+static inline ext4_fsblk_t
+ext4_group_first_block_no(struct super_block *sb, ext4_group_t group_no)
+{
+ return group_no * (ext4_fsblk_t)EXT4_BLOCKS_PER_GROUP(sb) +
+ le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
+}
+
+/*
+ * Special error return code only used by dx_probe() and its callers.
+ */
+#define ERR_BAD_DX_DIR -75000
+
+/*
+ * Timeout and state flag for lazy initialization inode thread.
+ */
+#define EXT4_DEF_LI_WAIT_MULT 10
+#define EXT4_DEF_LI_MAX_START_DELAY 5
+#define EXT4_LAZYINIT_QUIT 0x0001
+#define EXT4_LAZYINIT_RUNNING 0x0002
+
+/*
+ * Lazy inode table initialization info
+ */
+struct ext4_lazy_init {
+ unsigned long li_state;
+ struct list_head li_request_list;
+ struct mutex li_list_mtx;
+};
+
+struct ext4_li_request {
+ struct super_block *lr_super;
+ struct ext4_sb_info *lr_sbi;
+ ext4_group_t lr_next_group;
+ struct list_head lr_request;
+ unsigned long lr_next_sched;
+ unsigned long lr_timeout;
+};
+
+struct ext4_features {
+ struct kobject f_kobj;
+ struct completion f_kobj_unregister;
+};
+
+/*
+ * This structure will be used for multiple mount protection. It will be
+ * written into the block number saved in the s_mmp_block field in the
+ * superblock. Programs that check MMP should assume that if
+ * SEQ_FSCK (or any unknown code above SEQ_MAX) is present then it is NOT safe
+ * to use the filesystem, regardless of how old the timestamp is.
+ */
+#define EXT4_MMP_MAGIC 0x004D4D50U /* ASCII for MMP */
+#define EXT4_MMP_SEQ_CLEAN 0xFF4D4D50U /* mmp_seq value for clean unmount */
+#define EXT4_MMP_SEQ_FSCK 0xE24D4D50U /* mmp_seq value when being fscked */
+#define EXT4_MMP_SEQ_MAX 0xE24D4D4FU /* maximum valid mmp_seq value */
+
+struct mmp_struct {
+ __le32 mmp_magic; /* Magic number for MMP */
+ __le32 mmp_seq; /* Sequence no. updated periodically */
+
+ /*
+ * mmp_time, mmp_nodename & mmp_bdevname are only used for information
+ * purposes and do not affect the correctness of the algorithm
+ */
+ __le64 mmp_time; /* Time last updated */
+ char mmp_nodename[64]; /* Node which last updated MMP block */
+ char mmp_bdevname[32]; /* Bdev which last updated MMP block */
+
+ /*
+ * mmp_check_interval is used to verify if the MMP block has been
+ * updated on the block device. The value is updated based on the
+ * maximum time to write the MMP block during an update cycle.
+ */
+ __le16 mmp_check_interval;
+
+ __le16 mmp_pad1;
+ __le32 mmp_pad2[226];
+ __le32 mmp_checksum; /* crc32c(uuid+mmp_block) */
+};
+
+/* arguments passed to the mmp thread */
+struct mmpd_data {
+ struct buffer_head *bh; /* bh from initial read_mmp_block() */
+ struct super_block *sb; /* super block of the fs */
+};
+
+/*
+ * Check interval multiplier
+ * The MMP block is written every update interval and initially checked every
+ * update interval x the multiplier (the value is then adapted based on the
+ * write latency). The reason is that writes can be delayed under load and we
+ * don't want readers to incorrectly assume that the filesystem is no longer
+ * in use.
+ */
+#define EXT4_MMP_CHECK_MULT 2UL
+
+/*
+ * Minimum interval for MMP checking in seconds.
+ */
+#define EXT4_MMP_MIN_CHECK_INTERVAL 5UL
+
+/*
+ * Maximum interval for MMP checking in seconds.
+ */
+#define EXT4_MMP_MAX_CHECK_INTERVAL 300UL
+
+/*
+ * Function prototypes
+ */
+
+/*
+ * Ok, these declarations are also in <linux/kernel.h> but none of the
+ * ext4 source programs needs to include it so they are duplicated here.
+ */
+# define NORET_TYPE /**/
+# define ATTRIB_NORET __attribute__((noreturn))
+# define NORET_AND noreturn,
+
+/* bitmap.c */
+extern unsigned int ext4_count_free(char *bitmap, unsigned numchars);
+void ext4_inode_bitmap_csum_set(struct super_block *sb, ext4_group_t group,
+ struct ext4_group_desc *gdp,
+ struct buffer_head *bh, int sz);
+int ext4_inode_bitmap_csum_verify(struct super_block *sb, ext4_group_t group,
+ struct ext4_group_desc *gdp,
+ struct buffer_head *bh, int sz);
+void ext4_block_bitmap_csum_set(struct super_block *sb, ext4_group_t group,
+ struct ext4_group_desc *gdp,
+ struct buffer_head *bh);
+int ext4_block_bitmap_csum_verify(struct super_block *sb, ext4_group_t group,
+ struct ext4_group_desc *gdp,
+ struct buffer_head *bh);
+
+/* balloc.c */
+extern void ext4_get_group_no_and_offset(struct super_block *sb,
+ ext4_fsblk_t blocknr,
+ ext4_group_t *blockgrpp,
+ ext4_grpblk_t *offsetp);
+extern ext4_group_t ext4_get_group_number(struct super_block *sb,
+ ext4_fsblk_t block);
+
+extern unsigned int ext4_block_group(struct super_block *sb,
+ ext4_fsblk_t blocknr);
+extern ext4_grpblk_t ext4_block_group_offset(struct super_block *sb,
+ ext4_fsblk_t blocknr);
+extern int ext4_bg_has_super(struct super_block *sb, ext4_group_t group);
+extern unsigned long ext4_bg_num_gdb(struct super_block *sb,
+ ext4_group_t group);
+extern ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
+ ext4_fsblk_t goal,
+ unsigned int flags,
+ unsigned long *count,
+ int *errp);
+extern int ext4_claim_free_clusters(struct ext4_sb_info *sbi,
+ s64 nclusters, unsigned int flags);
+extern ext4_fsblk_t ext4_count_free_clusters(struct super_block *);
+extern void ext4_check_blocks_bitmap(struct super_block *);
+extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb,
+ ext4_group_t block_group,
+ struct buffer_head ** bh);
+extern int ext4_should_retry_alloc(struct super_block *sb, int *retries);
+
+extern struct buffer_head *ext4_read_block_bitmap_nowait(struct super_block *sb,
+ ext4_group_t block_group);
+extern int ext4_wait_block_bitmap(struct super_block *sb,
+ ext4_group_t block_group,
+ struct buffer_head *bh);
+extern struct buffer_head *ext4_read_block_bitmap(struct super_block *sb,
+ ext4_group_t block_group);
+extern unsigned ext4_free_clusters_after_init(struct super_block *sb,
+ ext4_group_t block_group,
+ struct ext4_group_desc *gdp);
+ext4_fsblk_t ext4_inode_to_goal_block(struct inode *);
+
+/* dir.c */
+extern int __ext4_check_dir_entry(const char *, unsigned int, struct inode *,
+ struct file *,
+ struct ext4_dir_entry_2 *,
+ struct buffer_head *, char *, int,
+ unsigned int);
+#define ext4_check_dir_entry(dir, filp, de, bh, buf, size, offset) \
+ unlikely(__ext4_check_dir_entry(__func__, __LINE__, (dir), (filp), \
+ (de), (bh), (buf), (size), (offset)))
+extern int ext4_htree_store_dirent(struct file *dir_file, __u32 hash,
+ __u32 minor_hash,
+ struct ext4_dir_entry_2 *dirent);
+extern void ext4_htree_free_dir_info(struct dir_private_info *p);
+extern int ext4_find_dest_de(struct inode *dir, struct inode *inode,
+ struct buffer_head *bh,
+ void *buf, int buf_size,
+ const char *name, int namelen,
+ struct ext4_dir_entry_2 **dest_de);
+void ext4_insert_dentry(struct inode *inode,
+ struct ext4_dir_entry_2 *de,
+ int buf_size,
+ const char *name, int namelen);
+static inline void ext4_update_dx_flag(struct inode *inode)
+{
+ if (!EXT4_HAS_COMPAT_FEATURE(inode->i_sb,
+ EXT4_FEATURE_COMPAT_DIR_INDEX))
+ ext4_clear_inode_flag(inode, EXT4_INODE_INDEX);
+}
+static unsigned char ext4_filetype_table[] = {
+ DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
+};
+
+static inline unsigned char get_dtype(struct super_block *sb, int filetype)
+{
+ if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FILETYPE) ||
+ (filetype >= EXT4_FT_MAX))
+ return DT_UNKNOWN;
+
+ return ext4_filetype_table[filetype];
+}
+
+/* fsync.c */
+extern int ext4_sync_file(struct file *, loff_t, loff_t, int);
+
+/* hash.c */
+extern int ext4fs_dirhash(const char *name, int len, struct
+ dx_hash_info *hinfo);
+
+/* ialloc.c */
+extern struct inode *__ext4_new_inode(handle_t *, struct inode *, umode_t,
+ const struct qstr *qstr, __u32 goal,
+ uid_t *owner, int handle_type,
+ unsigned int line_no, int nblocks);
+
+#define ext4_new_inode(handle, dir, mode, qstr, goal, owner) \
+ __ext4_new_inode((handle), (dir), (mode), (qstr), (goal), (owner), \
+ 0, 0, 0)
+#define ext4_new_inode_start_handle(dir, mode, qstr, goal, owner, \
+ type, nblocks) \
+ __ext4_new_inode(NULL, (dir), (mode), (qstr), (goal), (owner), \
+ (type), __LINE__, (nblocks))
+
+
+extern void ext4_free_inode(handle_t *, struct inode *);
+extern struct inode * ext4_orphan_get(struct super_block *, unsigned long);
+extern unsigned long ext4_count_free_inodes(struct super_block *);
+extern unsigned long ext4_count_dirs(struct super_block *);
+extern void ext4_check_inodes_bitmap(struct super_block *);
+extern void ext4_mark_bitmap_end(int start_bit, int end_bit, char *bitmap);
+extern int ext4_init_inode_table(struct super_block *sb,
+ ext4_group_t group, int barrier);
+extern void ext4_end_bitmap_read(struct buffer_head *bh, int uptodate);
+
+/* mballoc.c */
+extern long ext4_mb_stats;
+extern long ext4_mb_max_to_scan;
+extern int ext4_mb_init(struct super_block *);
+extern int ext4_mb_release(struct super_block *);
+extern ext4_fsblk_t ext4_mb_new_blocks(handle_t *,
+ struct ext4_allocation_request *, int *);
+extern int ext4_mb_reserve_blocks(struct super_block *, int);
+extern void ext4_discard_preallocations(struct inode *);
+extern int __init ext4_init_mballoc(void);
+extern void ext4_exit_mballoc(void);
+extern void ext4_free_blocks(handle_t *handle, struct inode *inode,
+ struct buffer_head *bh, ext4_fsblk_t block,
+ unsigned long count, int flags);
+extern int ext4_mb_alloc_groupinfo(struct super_block *sb,
+ ext4_group_t ngroups);
+extern int ext4_mb_add_groupinfo(struct super_block *sb,
+ ext4_group_t i, struct ext4_group_desc *desc);
+extern int ext4_group_add_blocks(handle_t *handle, struct super_block *sb,
+ ext4_fsblk_t block, unsigned long count);
+extern int ext4_trim_fs(struct super_block *, struct fstrim_range *);
+
+/* inode.c */
+struct buffer_head *ext4_getblk(handle_t *, struct inode *,
+ ext4_lblk_t, int, int *);
+struct buffer_head *ext4_bread(handle_t *, struct inode *,
+ ext4_lblk_t, int, int *);
+int ext4_get_block_write(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh_result, int create);
+int ext4_get_block(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh_result, int create);
+int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh, int create);
+int ext4_walk_page_buffers(handle_t *handle,
+ struct buffer_head *head,
+ unsigned from,
+ unsigned to,
+ int *partial,
+ int (*fn)(handle_t *handle,
+ struct buffer_head *bh));
+int do_journal_get_write_access(handle_t *handle,
+ struct buffer_head *bh);
+#define FALL_BACK_TO_NONDELALLOC 1
+#define CONVERT_INLINE_DATA 2
+
+extern struct inode *ext4_iget(struct super_block *, unsigned long);
+extern int ext4_write_inode(struct inode *, struct writeback_control *);
+extern int ext4_setattr(struct dentry *, struct iattr *);
+extern int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,
+ struct kstat *stat);
+extern void ext4_evict_inode(struct inode *);
+extern void ext4_clear_inode(struct inode *);
+extern int ext4_sync_inode(handle_t *, struct inode *);
+extern void ext4_dirty_inode(struct inode *, int);
+extern int ext4_change_inode_journal_flag(struct inode *, int);
+extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *);
+extern int ext4_inode_attach_jinode(struct inode *inode);
+extern int ext4_can_truncate(struct inode *inode);
+extern void ext4_truncate(struct inode *);
+extern int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length);
+extern int ext4_truncate_restart_trans(handle_t *, struct inode *, int nblocks);
+extern void ext4_set_inode_flags(struct inode *);
+extern void ext4_get_inode_flags(struct ext4_inode_info *);
+extern int ext4_alloc_da_blocks(struct inode *inode);
+extern void ext4_set_aops(struct inode *inode);
+extern int ext4_writepage_trans_blocks(struct inode *);
+extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks);
+extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode,
+ loff_t lstart, loff_t lend);
+extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
+extern qsize_t *ext4_get_reserved_space(struct inode *inode);
+extern void ext4_da_update_reserve_space(struct inode *inode,
+ int used, int quota_claim);
+
+/* indirect.c */
+extern int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
+ struct ext4_map_blocks *map, int flags);
+extern ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,
+ struct iov_iter *iter, loff_t offset);
+extern int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock);
+extern int ext4_ind_trans_blocks(struct inode *inode, int nrblocks);
+extern void ext4_ind_truncate(handle_t *, struct inode *inode);
+extern int ext4_free_hole_blocks(handle_t *handle, struct inode *inode,
+ ext4_lblk_t first, ext4_lblk_t stop);
+
+/* ioctl.c */
+extern long ext4_ioctl(struct file *, unsigned int, unsigned long);
+extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long);
+
+/* migrate.c */
+extern int ext4_ext_migrate(struct inode *);
+extern int ext4_ind_migrate(struct inode *inode);
+
+/* namei.c */
+extern int ext4_dirent_csum_verify(struct inode *inode,
+ struct ext4_dir_entry *dirent);
+extern int ext4_orphan_add(handle_t *, struct inode *);
+extern int ext4_orphan_del(handle_t *, struct inode *);
+extern int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
+ __u32 start_minor_hash, __u32 *next_hash);
+extern int search_dir(struct buffer_head *bh,
+ char *search_buf,
+ int buf_size,
+ struct inode *dir,
+ const struct qstr *d_name,
+ unsigned int offset,
+ struct ext4_dir_entry_2 **res_dir);
+extern int ext4_generic_delete_entry(handle_t *handle,
+ struct inode *dir,
+ struct ext4_dir_entry_2 *de_del,
+ struct buffer_head *bh,
+ void *entry_buf,
+ int buf_size,
+ int csum_size);
+
+/* resize.c */
+extern int ext4_group_add(struct super_block *sb,
+ struct ext4_new_group_data *input);
+extern int ext4_group_extend(struct super_block *sb,
+ struct ext4_super_block *es,
+ ext4_fsblk_t n_blocks_count);
+extern int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count);
+
+/* super.c */
+extern int ext4_calculate_overhead(struct super_block *sb);
+extern void ext4_superblock_csum_set(struct super_block *sb);
+extern void *ext4_kvmalloc(size_t size, gfp_t flags);
+extern void *ext4_kvzalloc(size_t size, gfp_t flags);
+extern void ext4_kvfree(void *ptr);
+extern int ext4_alloc_flex_bg_array(struct super_block *sb,
+ ext4_group_t ngroup);
+extern const char *ext4_decode_error(struct super_block *sb, int errno,
+ char nbuf[16]);
+
+extern __printf(4, 5)
+void __ext4_error(struct super_block *, const char *, unsigned int,
+ const char *, ...);
+extern __printf(5, 6)
+void __ext4_error_inode(struct inode *, const char *, unsigned int, ext4_fsblk_t,
+ const char *, ...);
+extern __printf(5, 6)
+void __ext4_error_file(struct file *, const char *, unsigned int, ext4_fsblk_t,
+ const char *, ...);
+extern void __ext4_std_error(struct super_block *, const char *,
+ unsigned int, int);
+extern __printf(4, 5)
+void __ext4_abort(struct super_block *, const char *, unsigned int,
+ const char *, ...);
+extern __printf(4, 5)
+void __ext4_warning(struct super_block *, const char *, unsigned int,
+ const char *, ...);
+extern __printf(3, 4)
+void __ext4_msg(struct super_block *, const char *, const char *, ...);
+extern void __dump_mmp_msg(struct super_block *, struct mmp_struct *mmp,
+ const char *, unsigned int, const char *);
+extern __printf(7, 8)
+void __ext4_grp_locked_error(const char *, unsigned int,
+ struct super_block *, ext4_group_t,
+ unsigned long, ext4_fsblk_t,
+ const char *, ...);
+
+#ifdef CONFIG_PRINTK
+
+#define ext4_error_inode(inode, func, line, block, fmt, ...) \
+ __ext4_error_inode(inode, func, line, block, fmt, ##__VA_ARGS__)
+#define ext4_error_file(file, func, line, block, fmt, ...) \
+ __ext4_error_file(file, func, line, block, fmt, ##__VA_ARGS__)
+#define ext4_error(sb, fmt, ...) \
+ __ext4_error(sb, __func__, __LINE__, fmt, ##__VA_ARGS__)
+#define ext4_abort(sb, fmt, ...) \
+ __ext4_abort(sb, __func__, __LINE__, fmt, ##__VA_ARGS__)
+#define ext4_warning(sb, fmt, ...) \
+ __ext4_warning(sb, __func__, __LINE__, fmt, ##__VA_ARGS__)
+#define ext4_msg(sb, level, fmt, ...) \
+ __ext4_msg(sb, level, fmt, ##__VA_ARGS__)
+#define dump_mmp_msg(sb, mmp, msg) \
+ __dump_mmp_msg(sb, mmp, __func__, __LINE__, msg)
+#define ext4_grp_locked_error(sb, grp, ino, block, fmt, ...) \
+ __ext4_grp_locked_error(__func__, __LINE__, sb, grp, ino, block, \
+ fmt, ##__VA_ARGS__)
+
+#else
+
+#define ext4_error_inode(inode, func, line, block, fmt, ...) \
+do { \
+ no_printk(fmt, ##__VA_ARGS__); \
+ __ext4_error_inode(inode, "", 0, block, " "); \
+} while (0)
+#define ext4_error_file(file, func, line, block, fmt, ...) \
+do { \
+ no_printk(fmt, ##__VA_ARGS__); \
+ __ext4_error_file(file, "", 0, block, " "); \
+} while (0)
+#define ext4_error(sb, fmt, ...) \
+do { \
+ no_printk(fmt, ##__VA_ARGS__); \
+ __ext4_error(sb, "", 0, " "); \
+} while (0)
+#define ext4_abort(sb, fmt, ...) \
+do { \
+ no_printk(fmt, ##__VA_ARGS__); \
+ __ext4_abort(sb, "", 0, " "); \
+} while (0)
+#define ext4_warning(sb, fmt, ...) \
+do { \
+ no_printk(fmt, ##__VA_ARGS__); \
+ __ext4_warning(sb, "", 0, " "); \
+} while (0)
+#define ext4_msg(sb, level, fmt, ...) \
+do { \
+ no_printk(fmt, ##__VA_ARGS__); \
+ __ext4_msg(sb, "", " "); \
+} while (0)
+#define dump_mmp_msg(sb, mmp, msg) \
+ __dump_mmp_msg(sb, mmp, "", 0, "")
+#define ext4_grp_locked_error(sb, grp, ino, block, fmt, ...) \
+do { \
+ no_printk(fmt, ##__VA_ARGS__); \
+ __ext4_grp_locked_error("", 0, sb, grp, ino, block, " "); \
+} while (0)
+
+#endif
+
+extern void ext4_update_dynamic_rev(struct super_block *sb);
+extern int ext4_update_compat_feature(handle_t *handle, struct super_block *sb,
+ __u32 compat);
+extern int ext4_update_rocompat_feature(handle_t *handle,
+ struct super_block *sb, __u32 rocompat);
+extern int ext4_update_incompat_feature(handle_t *handle,
+ struct super_block *sb, __u32 incompat);
+extern ext4_fsblk_t ext4_block_bitmap(struct super_block *sb,
+ struct ext4_group_desc *bg);
+extern ext4_fsblk_t ext4_inode_bitmap(struct super_block *sb,
+ struct ext4_group_desc *bg);
+extern ext4_fsblk_t ext4_inode_table(struct super_block *sb,
+ struct ext4_group_desc *bg);
+extern __u32 ext4_free_group_clusters(struct super_block *sb,
+ struct ext4_group_desc *bg);
+extern __u32 ext4_free_inodes_count(struct super_block *sb,
+ struct ext4_group_desc *bg);
+extern __u32 ext4_used_dirs_count(struct super_block *sb,
+ struct ext4_group_desc *bg);
+extern __u32 ext4_itable_unused_count(struct super_block *sb,
+ struct ext4_group_desc *bg);
+extern void ext4_block_bitmap_set(struct super_block *sb,
+ struct ext4_group_desc *bg, ext4_fsblk_t blk);
+extern void ext4_inode_bitmap_set(struct super_block *sb,
+ struct ext4_group_desc *bg, ext4_fsblk_t blk);
+extern void ext4_inode_table_set(struct super_block *sb,
+ struct ext4_group_desc *bg, ext4_fsblk_t blk);
+extern void ext4_free_group_clusters_set(struct super_block *sb,
+ struct ext4_group_desc *bg,
+ __u32 count);
+extern void ext4_free_inodes_set(struct super_block *sb,
+ struct ext4_group_desc *bg, __u32 count);
+extern void ext4_used_dirs_set(struct super_block *sb,
+ struct ext4_group_desc *bg, __u32 count);
+extern void ext4_itable_unused_set(struct super_block *sb,
+ struct ext4_group_desc *bg, __u32 count);
+extern int ext4_group_desc_csum_verify(struct super_block *sb, __u32 group,
+ struct ext4_group_desc *gdp);
+extern void ext4_group_desc_csum_set(struct super_block *sb, __u32 group,
+ struct ext4_group_desc *gdp);
+extern int ext4_register_li_request(struct super_block *sb,
+ ext4_group_t first_not_zeroed);
+
+static inline int ext4_has_group_desc_csum(struct super_block *sb)
+{
+ return EXT4_HAS_RO_COMPAT_FEATURE(sb,
+ EXT4_FEATURE_RO_COMPAT_GDT_CSUM |
+ EXT4_FEATURE_RO_COMPAT_METADATA_CSUM);
+}
+
+static inline ext4_fsblk_t ext4_blocks_count(struct ext4_super_block *es)
+{
+ return ((ext4_fsblk_t)le32_to_cpu(es->s_blocks_count_hi) << 32) |
+ le32_to_cpu(es->s_blocks_count_lo);
+}
+
+static inline ext4_fsblk_t ext4_r_blocks_count(struct ext4_super_block *es)
+{
+ return ((ext4_fsblk_t)le32_to_cpu(es->s_r_blocks_count_hi) << 32) |
+ le32_to_cpu(es->s_r_blocks_count_lo);
+}
+
+static inline ext4_fsblk_t ext4_free_blocks_count(struct ext4_super_block *es)
+{
+ return ((ext4_fsblk_t)le32_to_cpu(es->s_free_blocks_count_hi) << 32) |
+ le32_to_cpu(es->s_free_blocks_count_lo);
+}
+
+static inline void ext4_blocks_count_set(struct ext4_super_block *es,
+ ext4_fsblk_t blk)
+{
+ es->s_blocks_count_lo = cpu_to_le32((u32)blk);
+ es->s_blocks_count_hi = cpu_to_le32(blk >> 32);
+}
+
+static inline void ext4_free_blocks_count_set(struct ext4_super_block *es,
+ ext4_fsblk_t blk)
+{
+ es->s_free_blocks_count_lo = cpu_to_le32((u32)blk);
+ es->s_free_blocks_count_hi = cpu_to_le32(blk >> 32);
+}
+
+static inline void ext4_r_blocks_count_set(struct ext4_super_block *es,
+ ext4_fsblk_t blk)
+{
+ es->s_r_blocks_count_lo = cpu_to_le32((u32)blk);
+ es->s_r_blocks_count_hi = cpu_to_le32(blk >> 32);
+}
+
+static inline loff_t ext4_isize(struct ext4_inode *raw_inode)
+{
+ if (S_ISREG(le16_to_cpu(raw_inode->i_mode)))
+ return ((loff_t)le32_to_cpu(raw_inode->i_size_high) << 32) |
+ le32_to_cpu(raw_inode->i_size_lo);
+ else
+ return (loff_t) le32_to_cpu(raw_inode->i_size_lo);
+}
+
+static inline void ext4_isize_set(struct ext4_inode *raw_inode, loff_t i_size)
+{
+ raw_inode->i_size_lo = cpu_to_le32(i_size);
+ raw_inode->i_size_high = cpu_to_le32(i_size >> 32);
+}
+
+static inline
+struct ext4_group_info *ext4_get_group_info(struct super_block *sb,
+ ext4_group_t group)
+{
+ struct ext4_group_info ***grp_info;
+ long indexv, indexh;
+ BUG_ON(group >= EXT4_SB(sb)->s_groups_count);
+ grp_info = EXT4_SB(sb)->s_group_info;
+ indexv = group >> (EXT4_DESC_PER_BLOCK_BITS(sb));
+ indexh = group & ((EXT4_DESC_PER_BLOCK(sb)) - 1);
+ return grp_info[indexv][indexh];
+}
+
+/*
+ * Reading s_groups_count requires using smp_rmb() afterwards. See
+ * the locking protocol documented in the comments of ext4_group_add()
+ * in resize.c
+ */
+static inline ext4_group_t ext4_get_groups_count(struct super_block *sb)
+{
+ ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
+
+ smp_rmb();
+ return ngroups;
+}
+
+static inline ext4_group_t ext4_flex_group(struct ext4_sb_info *sbi,
+ ext4_group_t block_group)
+{
+ return block_group >> sbi->s_log_groups_per_flex;
+}
+
+static inline unsigned int ext4_flex_bg_size(struct ext4_sb_info *sbi)
+{
+ return 1 << sbi->s_log_groups_per_flex;
+}
+
+#define ext4_std_error(sb, errno) \
+do { \
+ if ((errno)) \
+ __ext4_std_error((sb), __func__, __LINE__, (errno)); \
+} while (0)
+
+#ifdef CONFIG_SMP
+/* Each CPU can accumulate percpu_counter_batch clusters in their local
+ * counters. So we need to make sure we have free clusters more
+ * than percpu_counter_batch * nr_cpu_ids. Also add a window of 4 times.
+ */
+#define EXT4_FREECLUSTERS_WATERMARK (4 * (percpu_counter_batch * nr_cpu_ids))
+#else
+#define EXT4_FREECLUSTERS_WATERMARK 0
+#endif
+
+/* Update i_disksize. Requires i_mutex to avoid races with truncate */
+static inline void ext4_update_i_disksize(struct inode *inode, loff_t newsize)
+{
+ WARN_ON_ONCE(S_ISREG(inode->i_mode) &&
+ !mutex_is_locked(&inode->i_mutex));
+ down_write(&EXT4_I(inode)->i_data_sem);
+ if (newsize > EXT4_I(inode)->i_disksize)
+ EXT4_I(inode)->i_disksize = newsize;
+ up_write(&EXT4_I(inode)->i_data_sem);
+}
+
+struct ext4_group_info {
+ unsigned long bb_state;
+ struct rb_root bb_free_root;
+ ext4_grpblk_t bb_first_free; /* first free block */
+ ext4_grpblk_t bb_free; /* total free blocks */
+ ext4_grpblk_t bb_fragments; /* nr of freespace fragments */
+ ext4_grpblk_t bb_largest_free_order;/* order of largest frag in BG */
+ struct list_head bb_prealloc_list;
+#ifdef DOUBLE_CHECK
+ void *bb_bitmap;
+#endif
+ struct rw_semaphore alloc_sem;
+ ext4_grpblk_t bb_counters[]; /* Nr of free power-of-two-block
+ * regions, index is order.
+ * bb_counters[3] = 5 means
+ * 5 free 8-block regions. */
+};
+
+#define EXT4_GROUP_INFO_NEED_INIT_BIT 0
+#define EXT4_GROUP_INFO_WAS_TRIMMED_BIT 1
+#define EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT 2
+#define EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT 3
+
+#define EXT4_MB_GRP_NEED_INIT(grp) \
+ (test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state)))
+#define EXT4_MB_GRP_BBITMAP_CORRUPT(grp) \
+ (test_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &((grp)->bb_state)))
+#define EXT4_MB_GRP_IBITMAP_CORRUPT(grp) \
+ (test_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &((grp)->bb_state)))
+
+#define EXT4_MB_GRP_WAS_TRIMMED(grp) \
+ (test_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state)))
+#define EXT4_MB_GRP_SET_TRIMMED(grp) \
+ (set_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state)))
+#define EXT4_MB_GRP_CLEAR_TRIMMED(grp) \
+ (clear_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state)))
+
+#define EXT4_MAX_CONTENTION 8
+#define EXT4_CONTENTION_THRESHOLD 2
+
+static inline spinlock_t *ext4_group_lock_ptr(struct super_block *sb,
+ ext4_group_t group)
+{
+ return bgl_lock_ptr(EXT4_SB(sb)->s_blockgroup_lock, group);
+}
+
+/*
+ * Returns true if the filesystem is busy enough that attempts to
+ * access the block group locks has run into contention.
+ */
+static inline int ext4_fs_is_busy(struct ext4_sb_info *sbi)
+{
+ return (atomic_read(&sbi->s_lock_busy) > EXT4_CONTENTION_THRESHOLD);
+}
+
+static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group)
+{
+ spinlock_t *lock = ext4_group_lock_ptr(sb, group);
+ if (spin_trylock(lock))
+ /*
+ * We're able to grab the lock right away, so drop the
+ * lock contention counter.
+ */
+ atomic_add_unless(&EXT4_SB(sb)->s_lock_busy, -1, 0);
+ else {
+ /*
+ * The lock is busy, so bump the contention counter,
+ * and then wait on the spin lock.
+ */
+ atomic_add_unless(&EXT4_SB(sb)->s_lock_busy, 1,
+ EXT4_MAX_CONTENTION);
+ spin_lock(lock);
+ }
+}
+
+static inline void ext4_unlock_group(struct super_block *sb,
+ ext4_group_t group)
+{
+ spin_unlock(ext4_group_lock_ptr(sb, group));
+}
+
+/*
+ * Block validity checking
+ */
+#define ext4_check_indirect_blockref(inode, bh) \
+ ext4_check_blockref(__func__, __LINE__, inode, \
+ (__le32 *)(bh)->b_data, \
+ EXT4_ADDR_PER_BLOCK((inode)->i_sb))
+
+#define ext4_ind_check_inode(inode) \
+ ext4_check_blockref(__func__, __LINE__, inode, \
+ EXT4_I(inode)->i_data, \
+ EXT4_NDIR_BLOCKS)
+
+/*
+ * Inodes and files operations
+ */
+
+/* dir.c */
+extern const struct file_operations ext4_dir_operations;
+
+/* file.c */
+extern const struct inode_operations ext4_file_inode_operations;
+extern const struct file_operations ext4_file_operations;
+extern loff_t ext4_llseek(struct file *file, loff_t offset, int origin);
+
+/* inline.c */
+extern int ext4_has_inline_data(struct inode *inode);
+extern int ext4_get_max_inline_size(struct inode *inode);
+extern int ext4_find_inline_data_nolock(struct inode *inode);
+extern int ext4_init_inline_data(handle_t *handle, struct inode *inode,
+ unsigned int len);
+extern int ext4_destroy_inline_data(handle_t *handle, struct inode *inode);
+
+extern int ext4_readpage_inline(struct inode *inode, struct page *page);
+extern int ext4_try_to_write_inline_data(struct address_space *mapping,
+ struct inode *inode,
+ loff_t pos, unsigned len,
+ unsigned flags,
+ struct page **pagep);
+extern int ext4_write_inline_data_end(struct inode *inode,
+ loff_t pos, unsigned len,
+ unsigned copied,
+ struct page *page);
+extern struct buffer_head *
+ext4_journalled_write_inline_data(struct inode *inode,
+ unsigned len,
+ struct page *page);
+extern int ext4_da_write_inline_data_begin(struct address_space *mapping,
+ struct inode *inode,
+ loff_t pos, unsigned len,
+ unsigned flags,
+ struct page **pagep,
+ void **fsdata);
+extern int ext4_da_write_inline_data_end(struct inode *inode, loff_t pos,
+ unsigned len, unsigned copied,
+ struct page *page);
+extern int ext4_try_add_inline_entry(handle_t *handle, struct dentry *dentry,
+ struct inode *inode);
+extern int ext4_try_create_inline_dir(handle_t *handle,
+ struct inode *parent,
+ struct inode *inode);
+extern int ext4_read_inline_dir(struct file *filp,
+ struct dir_context *ctx,
+ int *has_inline_data);
+extern int htree_inlinedir_to_tree(struct file *dir_file,
+ struct inode *dir, ext4_lblk_t block,
+ struct dx_hash_info *hinfo,
+ __u32 start_hash, __u32 start_minor_hash,
+ int *has_inline_data);
+extern struct buffer_head *ext4_find_inline_entry(struct inode *dir,
+ const struct qstr *d_name,
+ struct ext4_dir_entry_2 **res_dir,
+ int *has_inline_data);
+extern int ext4_delete_inline_entry(handle_t *handle,
+ struct inode *dir,
+ struct ext4_dir_entry_2 *de_del,
+ struct buffer_head *bh,
+ int *has_inline_data);
+extern int empty_inline_dir(struct inode *dir, int *has_inline_data);
+extern struct buffer_head *ext4_get_first_inline_block(struct inode *inode,
+ struct ext4_dir_entry_2 **parent_de,
+ int *retval);
+extern int ext4_inline_data_fiemap(struct inode *inode,
+ struct fiemap_extent_info *fieinfo,
+ int *has_inline);
+extern int ext4_try_to_evict_inline_data(handle_t *handle,
+ struct inode *inode,
+ int needed);
+extern void ext4_inline_data_truncate(struct inode *inode, int *has_inline);
+
+extern int ext4_convert_inline_data(struct inode *inode);
+
+/* namei.c */
+extern const struct inode_operations ext4_dir_inode_operations;
+extern const struct inode_operations ext4_special_inode_operations;
+extern struct dentry *ext4_get_parent(struct dentry *child);
+extern struct ext4_dir_entry_2 *ext4_init_dot_dotdot(struct inode *inode,
+ struct ext4_dir_entry_2 *de,
+ int blocksize, int csum_size,
+ unsigned int parent_ino, int dotdot_real_len);
+extern void initialize_dirent_tail(struct ext4_dir_entry_tail *t,
+ unsigned int blocksize);
+extern int ext4_handle_dirty_dirent_node(handle_t *handle,
+ struct inode *inode,
+ struct buffer_head *bh);
+#define S_SHIFT 12
+static unsigned char ext4_type_by_mode[S_IFMT >> S_SHIFT] = {
+ [S_IFREG >> S_SHIFT] = EXT4_FT_REG_FILE,
+ [S_IFDIR >> S_SHIFT] = EXT4_FT_DIR,
+ [S_IFCHR >> S_SHIFT] = EXT4_FT_CHRDEV,
+ [S_IFBLK >> S_SHIFT] = EXT4_FT_BLKDEV,
+ [S_IFIFO >> S_SHIFT] = EXT4_FT_FIFO,
+ [S_IFSOCK >> S_SHIFT] = EXT4_FT_SOCK,
+ [S_IFLNK >> S_SHIFT] = EXT4_FT_SYMLINK,
+};
+
+static inline void ext4_set_de_type(struct super_block *sb,
+ struct ext4_dir_entry_2 *de,
+ umode_t mode) {
+ if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FILETYPE))
+ de->file_type = ext4_type_by_mode[(mode & S_IFMT)>>S_SHIFT];
+}
+
+
+/* symlink.c */
+extern const struct inode_operations ext4_symlink_inode_operations;
+extern const struct inode_operations ext4_fast_symlink_inode_operations;
+
+/* block_validity */
+extern void ext4_release_system_zone(struct super_block *sb);
+extern int ext4_setup_system_zone(struct super_block *sb);
+extern int __init ext4_init_system_zone(void);
+extern void ext4_exit_system_zone(void);
+extern int ext4_data_block_valid(struct ext4_sb_info *sbi,
+ ext4_fsblk_t start_blk,
+ unsigned int count);
+extern int ext4_check_blockref(const char *, unsigned int,
+ struct inode *, __le32 *, unsigned int);
+
+/* extents.c */
+struct ext4_ext_path;
+struct ext4_extent;
+
+/*
+ * Maximum number of logical blocks in a file; ext4_extent's ee_block is
+ * __le32.
+ */
+#define EXT_MAX_BLOCKS 0xffffffff
+
+extern int ext4_ext_tree_init(handle_t *handle, struct inode *);
+extern int ext4_ext_writepage_trans_blocks(struct inode *, int);
+extern int ext4_ext_index_trans_blocks(struct inode *inode, int extents);
+extern int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
+ struct ext4_map_blocks *map, int flags);
+extern void ext4_ext_truncate(handle_t *, struct inode *);
+extern int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
+ ext4_lblk_t end);
+extern void ext4_ext_init(struct super_block *);
+extern void ext4_ext_release(struct super_block *);
+extern long ext4_fallocate(struct file *file, int mode, loff_t offset,
+ loff_t len);
+extern int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode,
+ loff_t offset, ssize_t len);
+extern int ext4_map_blocks(handle_t *handle, struct inode *inode,
+ struct ext4_map_blocks *map, int flags);
+extern int ext4_ext_calc_metadata_amount(struct inode *inode,
+ ext4_lblk_t lblocks);
+extern int ext4_extent_tree_init(handle_t *, struct inode *);
+extern int ext4_ext_calc_credits_for_single_extent(struct inode *inode,
+ int num,
+ struct ext4_ext_path *path);
+extern int ext4_can_extents_be_merged(struct inode *inode,
+ struct ext4_extent *ex1,
+ struct ext4_extent *ex2);
+extern int ext4_ext_insert_extent(handle_t *, struct inode *,
+ struct ext4_ext_path *,
+ struct ext4_extent *, int);
+extern struct ext4_ext_path *ext4_ext_find_extent(struct inode *, ext4_lblk_t,
+ struct ext4_ext_path *,
+ int flags);
+extern void ext4_ext_drop_refs(struct ext4_ext_path *);
+extern int ext4_ext_check_inode(struct inode *inode);
+extern int ext4_find_delalloc_range(struct inode *inode,
+ ext4_lblk_t lblk_start,
+ ext4_lblk_t lblk_end);
+extern int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk);
+extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
+ __u64 start, __u64 len);
+extern int ext4_ext_precache(struct inode *inode);
+extern int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len);
+
+/* move_extent.c */
+extern void ext4_double_down_write_data_sem(struct inode *first,
+ struct inode *second);
+extern void ext4_double_up_write_data_sem(struct inode *orig_inode,
+ struct inode *donor_inode);
+extern int ext4_move_extents(struct file *o_filp, struct file *d_filp,
+ __u64 start_orig, __u64 start_donor,
+ __u64 len, __u64 *moved_len);
+extern int mext_next_extent(struct inode *inode, struct ext4_ext_path *path,
+ struct ext4_extent **extent);
+
+/* page-io.c */
+extern int __init ext4_init_pageio(void);
+extern void ext4_exit_pageio(void);
+extern ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags);
+extern ext4_io_end_t *ext4_get_io_end(ext4_io_end_t *io_end);
+extern int ext4_put_io_end(ext4_io_end_t *io_end);
+extern void ext4_put_io_end_defer(ext4_io_end_t *io_end);
+extern void ext4_io_submit_init(struct ext4_io_submit *io,
+ struct writeback_control *wbc);
+extern void ext4_end_io_rsv_work(struct work_struct *work);
+extern void ext4_io_submit(struct ext4_io_submit *io);
+extern int ext4_bio_write_page(struct ext4_io_submit *io,
+ struct page *page,
+ int len,
+ struct writeback_control *wbc,
+ bool keep_towrite);
+
+/* mmp.c */
+extern int ext4_multi_mount_protect(struct super_block *, ext4_fsblk_t);
+
+/*
+ * Note that these flags will never ever appear in a buffer_head's state flag.
+ * See EXT4_MAP_... to see where this is used.
+ */
+enum ext4_state_bits {
+ BH_AllocFromCluster /* allocated blocks were part of already
+ * allocated cluster. */
+ = BH_JBDPrivateStart
+};
+
+/*
+ * Add new method to test whether block and inode bitmaps are properly
+ * initialized. With uninit_bg reading the block from disk is not enough
+ * to mark the bitmap uptodate. We need to also zero-out the bitmap
+ */
+#define BH_BITMAP_UPTODATE BH_JBDPrivateStart
+
+static inline int bitmap_uptodate(struct buffer_head *bh)
+{
+ return (buffer_uptodate(bh) &&
+ test_bit(BH_BITMAP_UPTODATE, &(bh)->b_state));
+}
+static inline void set_bitmap_uptodate(struct buffer_head *bh)
+{
+ set_bit(BH_BITMAP_UPTODATE, &(bh)->b_state);
+}
+
+/*
+ * Disable DIO read nolock optimization, so new dioreaders will be forced
+ * to grab i_mutex
+ */
+static inline void ext4_inode_block_unlocked_dio(struct inode *inode)
+{
+ ext4_set_inode_state(inode, EXT4_STATE_DIOREAD_LOCK);
+ smp_mb();
+}
+static inline void ext4_inode_resume_unlocked_dio(struct inode *inode)
+{
+ smp_mb();
+ ext4_clear_inode_state(inode, EXT4_STATE_DIOREAD_LOCK);
+}
+
+#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1)
+
+/* For ioend & aio unwritten conversion wait queues */
+#define EXT4_WQ_HASH_SZ 37
+#define ext4_ioend_wq(v) (&ext4__ioend_wq[((unsigned long)(v)) %\
+ EXT4_WQ_HASH_SZ])
+#define ext4_aio_mutex(v) (&ext4__aio_mutex[((unsigned long)(v)) %\
+ EXT4_WQ_HASH_SZ])
+extern wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ];
+extern struct mutex ext4__aio_mutex[EXT4_WQ_HASH_SZ];
+
+#define EXT4_RESIZING 0
+extern int ext4_resize_begin(struct super_block *sb);
+extern void ext4_resize_end(struct super_block *sb);
+
+#endif /* __KERNEL__ */
+
+#endif /* _EXT4_H */
diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
new file mode 100644
index 00000000000..a867f5ca999
--- /dev/null
+++ b/fs/ext4/ext4_extents.h
@@ -0,0 +1,273 @@
+/*
+ * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com
+ * Written by Alex Tomas <alex@clusterfs.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public Licens
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-
+ */
+
+#ifndef _EXT4_EXTENTS
+#define _EXT4_EXTENTS
+
+#include "ext4.h"
+
+/*
+ * With AGGRESSIVE_TEST defined, the capacity of index/leaf blocks
+ * becomes very small, so index split, in-depth growing and
+ * other hard changes happen much more often.
+ * This is for debug purposes only.
+ */
+#define AGGRESSIVE_TEST_
+
+/*
+ * With EXTENTS_STATS defined, the number of blocks and extents
+ * are collected in the truncate path. They'll be shown at
+ * umount time.
+ */
+#define EXTENTS_STATS__
+
+/*
+ * If CHECK_BINSEARCH is defined, then the results of the binary search
+ * will also be checked by linear search.
+ */
+#define CHECK_BINSEARCH__
+
+/*
+ * If EXT_STATS is defined then stats numbers are collected.
+ * These number will be displayed at umount time.
+ */
+#define EXT_STATS_
+
+
+/*
+ * ext4_inode has i_block array (60 bytes total).
+ * The first 12 bytes store ext4_extent_header;
+ * the remainder stores an array of ext4_extent.
+ * For non-inode extent blocks, ext4_extent_tail
+ * follows the array.
+ */
+
+/*
+ * This is the extent tail on-disk structure.
+ * All other extent structures are 12 bytes long. It turns out that
+ * block_size % 12 >= 4 for at least all powers of 2 greater than 512, which
+ * covers all valid ext4 block sizes. Therefore, this tail structure can be
+ * crammed into the end of the block without having to rebalance the tree.
+ */
+struct ext4_extent_tail {
+ __le32 et_checksum; /* crc32c(uuid+inum+extent_block) */
+};
+
+/*
+ * This is the extent on-disk structure.
+ * It's used at the bottom of the tree.
+ */
+struct ext4_extent {
+ __le32 ee_block; /* first logical block extent covers */
+ __le16 ee_len; /* number of blocks covered by extent */
+ __le16 ee_start_hi; /* high 16 bits of physical block */
+ __le32 ee_start_lo; /* low 32 bits of physical block */
+};
+
+/*
+ * This is index on-disk structure.
+ * It's used at all the levels except the bottom.
+ */
+struct ext4_extent_idx {
+ __le32 ei_block; /* index covers logical blocks from 'block' */
+ __le32 ei_leaf_lo; /* pointer to the physical block of the next *
+ * level. leaf or next index could be there */
+ __le16 ei_leaf_hi; /* high 16 bits of physical block */
+ __u16 ei_unused;
+};
+
+/*
+ * Each block (leaves and indexes), even inode-stored has header.
+ */
+struct ext4_extent_header {
+ __le16 eh_magic; /* probably will support different formats */
+ __le16 eh_entries; /* number of valid entries */
+ __le16 eh_max; /* capacity of store in entries */
+ __le16 eh_depth; /* has tree real underlying blocks? */
+ __le32 eh_generation; /* generation of the tree */
+};
+
+#define EXT4_EXT_MAGIC cpu_to_le16(0xf30a)
+
+#define EXT4_EXTENT_TAIL_OFFSET(hdr) \
+ (sizeof(struct ext4_extent_header) + \
+ (sizeof(struct ext4_extent) * le16_to_cpu((hdr)->eh_max)))
+
+static inline struct ext4_extent_tail *
+find_ext4_extent_tail(struct ext4_extent_header *eh)
+{
+ return (struct ext4_extent_tail *)(((void *)eh) +
+ EXT4_EXTENT_TAIL_OFFSET(eh));
+}
+
+/*
+ * Array of ext4_ext_path contains path to some extent.
+ * Creation/lookup routines use it for traversal/splitting/etc.
+ * Truncate uses it to simulate recursive walking.
+ */
+struct ext4_ext_path {
+ ext4_fsblk_t p_block;
+ __u16 p_depth;
+ struct ext4_extent *p_ext;
+ struct ext4_extent_idx *p_idx;
+ struct ext4_extent_header *p_hdr;
+ struct buffer_head *p_bh;
+};
+
+/*
+ * structure for external API
+ */
+
+/*
+ * EXT_INIT_MAX_LEN is the maximum number of blocks we can have in an
+ * initialized extent. This is 2^15 and not (2^16 - 1), since we use the
+ * MSB of ee_len field in the extent datastructure to signify if this
+ * particular extent is an initialized extent or an unwritten (i.e.
+ * preallocated).
+ * EXT_UNWRITTEN_MAX_LEN is the maximum number of blocks we can have in an
+ * unwritten extent.
+ * If ee_len is <= 0x8000, it is an initialized extent. Otherwise, it is an
+ * unwritten one. In other words, if MSB of ee_len is set, it is an
+ * unwritten extent with only one special scenario when ee_len = 0x8000.
+ * In this case we can not have an unwritten extent of zero length and
+ * thus we make it as a special case of initialized extent with 0x8000 length.
+ * This way we get better extent-to-group alignment for initialized extents.
+ * Hence, the maximum number of blocks we can have in an *initialized*
+ * extent is 2^15 (32768) and in an *unwritten* extent is 2^15-1 (32767).
+ */
+#define EXT_INIT_MAX_LEN (1UL << 15)
+#define EXT_UNWRITTEN_MAX_LEN (EXT_INIT_MAX_LEN - 1)
+
+
+#define EXT_FIRST_EXTENT(__hdr__) \
+ ((struct ext4_extent *) (((char *) (__hdr__)) + \
+ sizeof(struct ext4_extent_header)))
+#define EXT_FIRST_INDEX(__hdr__) \
+ ((struct ext4_extent_idx *) (((char *) (__hdr__)) + \
+ sizeof(struct ext4_extent_header)))
+#define EXT_HAS_FREE_INDEX(__path__) \
+ (le16_to_cpu((__path__)->p_hdr->eh_entries) \
+ < le16_to_cpu((__path__)->p_hdr->eh_max))
+#define EXT_LAST_EXTENT(__hdr__) \
+ (EXT_FIRST_EXTENT((__hdr__)) + le16_to_cpu((__hdr__)->eh_entries) - 1)
+#define EXT_LAST_INDEX(__hdr__) \
+ (EXT_FIRST_INDEX((__hdr__)) + le16_to_cpu((__hdr__)->eh_entries) - 1)
+#define EXT_MAX_EXTENT(__hdr__) \
+ (EXT_FIRST_EXTENT((__hdr__)) + le16_to_cpu((__hdr__)->eh_max) - 1)
+#define EXT_MAX_INDEX(__hdr__) \
+ (EXT_FIRST_INDEX((__hdr__)) + le16_to_cpu((__hdr__)->eh_max) - 1)
+
+static inline struct ext4_extent_header *ext_inode_hdr(struct inode *inode)
+{
+ return (struct ext4_extent_header *) EXT4_I(inode)->i_data;
+}
+
+static inline struct ext4_extent_header *ext_block_hdr(struct buffer_head *bh)
+{
+ return (struct ext4_extent_header *) bh->b_data;
+}
+
+static inline unsigned short ext_depth(struct inode *inode)
+{
+ return le16_to_cpu(ext_inode_hdr(inode)->eh_depth);
+}
+
+static inline void ext4_ext_mark_unwritten(struct ext4_extent *ext)
+{
+ /* We can not have an unwritten extent of zero length! */
+ BUG_ON((le16_to_cpu(ext->ee_len) & ~EXT_INIT_MAX_LEN) == 0);
+ ext->ee_len |= cpu_to_le16(EXT_INIT_MAX_LEN);
+}
+
+static inline int ext4_ext_is_unwritten(struct ext4_extent *ext)
+{
+ /* Extent with ee_len of 0x8000 is treated as an initialized extent */
+ return (le16_to_cpu(ext->ee_len) > EXT_INIT_MAX_LEN);
+}
+
+static inline int ext4_ext_get_actual_len(struct ext4_extent *ext)
+{
+ return (le16_to_cpu(ext->ee_len) <= EXT_INIT_MAX_LEN ?
+ le16_to_cpu(ext->ee_len) :
+ (le16_to_cpu(ext->ee_len) - EXT_INIT_MAX_LEN));
+}
+
+static inline void ext4_ext_mark_initialized(struct ext4_extent *ext)
+{
+ ext->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ext));
+}
+
+/*
+ * ext4_ext_pblock:
+ * combine low and high parts of physical block number into ext4_fsblk_t
+ */
+static inline ext4_fsblk_t ext4_ext_pblock(struct ext4_extent *ex)
+{
+ ext4_fsblk_t block;
+
+ block = le32_to_cpu(ex->ee_start_lo);
+ block |= ((ext4_fsblk_t) le16_to_cpu(ex->ee_start_hi) << 31) << 1;
+ return block;
+}
+
+/*
+ * ext4_idx_pblock:
+ * combine low and high parts of a leaf physical block number into ext4_fsblk_t
+ */
+static inline ext4_fsblk_t ext4_idx_pblock(struct ext4_extent_idx *ix)
+{
+ ext4_fsblk_t block;
+
+ block = le32_to_cpu(ix->ei_leaf_lo);
+ block |= ((ext4_fsblk_t) le16_to_cpu(ix->ei_leaf_hi) << 31) << 1;
+ return block;
+}
+
+/*
+ * ext4_ext_store_pblock:
+ * stores a large physical block number into an extent struct,
+ * breaking it into parts
+ */
+static inline void ext4_ext_store_pblock(struct ext4_extent *ex,
+ ext4_fsblk_t pb)
+{
+ ex->ee_start_lo = cpu_to_le32((unsigned long) (pb & 0xffffffff));
+ ex->ee_start_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) &
+ 0xffff);
+}
+
+/*
+ * ext4_idx_store_pblock:
+ * stores a large physical block number into an index struct,
+ * breaking it into parts
+ */
+static inline void ext4_idx_store_pblock(struct ext4_extent_idx *ix,
+ ext4_fsblk_t pb)
+{
+ ix->ei_leaf_lo = cpu_to_le32((unsigned long) (pb & 0xffffffff));
+ ix->ei_leaf_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) &
+ 0xffff);
+}
+
+#define ext4_ext_dirty(handle, inode, path) \
+ __ext4_ext_dirty(__func__, __LINE__, (handle), (inode), (path))
+int __ext4_ext_dirty(const char *where, unsigned int line, handle_t *handle,
+ struct inode *inode, struct ext4_ext_path *path);
+
+#endif /* _EXT4_EXTENTS */
+
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index d6afe4e2734..0074e0d23d6 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -2,58 +2,320 @@
* Interface between ext4 and JBD
*/
-#include <linux/ext4_jbd2.h>
+#include "ext4_jbd2.h"
-int __ext4_journal_get_undo_access(const char *where, handle_t *handle,
- struct buffer_head *bh)
+#include <trace/events/ext4.h>
+
+/* Just increment the non-pointer handle value */
+static handle_t *ext4_get_nojournal(void)
{
- int err = jbd2_journal_get_undo_access(handle, bh);
- if (err)
- ext4_journal_abort_handle(where, __FUNCTION__, bh, handle,err);
- return err;
+ handle_t *handle = current->journal_info;
+ unsigned long ref_cnt = (unsigned long)handle;
+
+ BUG_ON(ref_cnt >= EXT4_NOJOURNAL_MAX_REF_COUNT);
+
+ ref_cnt++;
+ handle = (handle_t *)ref_cnt;
+
+ current->journal_info = handle;
+ return handle;
}
-int __ext4_journal_get_write_access(const char *where, handle_t *handle,
- struct buffer_head *bh)
+
+/* Decrement the non-pointer handle value */
+static void ext4_put_nojournal(handle_t *handle)
{
- int err = jbd2_journal_get_write_access(handle, bh);
+ unsigned long ref_cnt = (unsigned long)handle;
+
+ BUG_ON(ref_cnt == 0);
+
+ ref_cnt--;
+ handle = (handle_t *)ref_cnt;
+
+ current->journal_info = handle;
+}
+
+/*
+ * Wrappers for jbd2_journal_start/end.
+ */
+static int ext4_journal_check_start(struct super_block *sb)
+{
+ journal_t *journal;
+
+ might_sleep();
+ if (sb->s_flags & MS_RDONLY)
+ return -EROFS;
+ WARN_ON(sb->s_writers.frozen == SB_FREEZE_COMPLETE);
+ journal = EXT4_SB(sb)->s_journal;
+ /*
+ * Special case here: if the journal has aborted behind our
+ * backs (eg. EIO in the commit thread), then we still need to
+ * take the FS itself readonly cleanly.
+ */
+ if (journal && is_journal_aborted(journal)) {
+ ext4_abort(sb, "Detected aborted journal");
+ return -EROFS;
+ }
+ return 0;
+}
+
+handle_t *__ext4_journal_start_sb(struct super_block *sb, unsigned int line,
+ int type, int blocks, int rsv_blocks)
+{
+ journal_t *journal;
+ int err;
+
+ trace_ext4_journal_start(sb, blocks, rsv_blocks, _RET_IP_);
+ err = ext4_journal_check_start(sb);
+ if (err < 0)
+ return ERR_PTR(err);
+
+ journal = EXT4_SB(sb)->s_journal;
+ if (!journal)
+ return ext4_get_nojournal();
+ return jbd2__journal_start(journal, blocks, rsv_blocks, GFP_NOFS,
+ type, line);
+}
+
+int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle)
+{
+ struct super_block *sb;
+ int err;
+ int rc;
+
+ if (!ext4_handle_valid(handle)) {
+ ext4_put_nojournal(handle);
+ return 0;
+ }
+ sb = handle->h_transaction->t_journal->j_private;
+ err = handle->h_err;
+ rc = jbd2_journal_stop(handle);
+
+ if (!err)
+ err = rc;
if (err)
- ext4_journal_abort_handle(where, __FUNCTION__, bh, handle,err);
+ __ext4_std_error(sb, where, line, err);
return err;
}
-int __ext4_journal_forget(const char *where, handle_t *handle,
- struct buffer_head *bh)
+handle_t *__ext4_journal_start_reserved(handle_t *handle, unsigned int line,
+ int type)
{
- int err = jbd2_journal_forget(handle, bh);
- if (err)
- ext4_journal_abort_handle(where, __FUNCTION__, bh, handle,err);
+ struct super_block *sb;
+ int err;
+
+ if (!ext4_handle_valid(handle))
+ return ext4_get_nojournal();
+
+ sb = handle->h_journal->j_private;
+ trace_ext4_journal_start_reserved(sb, handle->h_buffer_credits,
+ _RET_IP_);
+ err = ext4_journal_check_start(sb);
+ if (err < 0) {
+ jbd2_journal_free_reserved(handle);
+ return ERR_PTR(err);
+ }
+
+ err = jbd2_journal_start_reserved(handle, type, line);
+ if (err < 0)
+ return ERR_PTR(err);
+ return handle;
+}
+
+static void ext4_journal_abort_handle(const char *caller, unsigned int line,
+ const char *err_fn,
+ struct buffer_head *bh,
+ handle_t *handle, int err)
+{
+ char nbuf[16];
+ const char *errstr = ext4_decode_error(NULL, err, nbuf);
+
+ BUG_ON(!ext4_handle_valid(handle));
+
+ if (bh)
+ BUFFER_TRACE(bh, "abort");
+
+ if (!handle->h_err)
+ handle->h_err = err;
+
+ if (is_handle_aborted(handle))
+ return;
+
+ printk(KERN_ERR "EXT4-fs: %s:%d: aborting transaction: %s in %s\n",
+ caller, line, errstr, err_fn);
+
+ jbd2_journal_abort_handle(handle);
+}
+
+int __ext4_journal_get_write_access(const char *where, unsigned int line,
+ handle_t *handle, struct buffer_head *bh)
+{
+ int err = 0;
+
+ might_sleep();
+
+ if (ext4_handle_valid(handle)) {
+ err = jbd2_journal_get_write_access(handle, bh);
+ if (err)
+ ext4_journal_abort_handle(where, line, __func__, bh,
+ handle, err);
+ }
return err;
}
-int __ext4_journal_revoke(const char *where, handle_t *handle,
- ext4_fsblk_t blocknr, struct buffer_head *bh)
+/*
+ * The ext4 forget function must perform a revoke if we are freeing data
+ * which has been journaled. Metadata (eg. indirect blocks) must be
+ * revoked in all cases.
+ *
+ * "bh" may be NULL: a metadata block may have been freed from memory
+ * but there may still be a record of it in the journal, and that record
+ * still needs to be revoked.
+ *
+ * If the handle isn't valid we're not journaling, but we still need to
+ * call into ext4_journal_revoke() to put the buffer head.
+ */
+int __ext4_forget(const char *where, unsigned int line, handle_t *handle,
+ int is_metadata, struct inode *inode,
+ struct buffer_head *bh, ext4_fsblk_t blocknr)
{
- int err = jbd2_journal_revoke(handle, blocknr, bh);
- if (err)
- ext4_journal_abort_handle(where, __FUNCTION__, bh, handle,err);
+ int err;
+
+ might_sleep();
+
+ trace_ext4_forget(inode, is_metadata, blocknr);
+ BUFFER_TRACE(bh, "enter");
+
+ jbd_debug(4, "forgetting bh %p: is_metadata = %d, mode %o, "
+ "data mode %x\n",
+ bh, is_metadata, inode->i_mode,
+ test_opt(inode->i_sb, DATA_FLAGS));
+
+ /* In the no journal case, we can just do a bforget and return */
+ if (!ext4_handle_valid(handle)) {
+ bforget(bh);
+ return 0;
+ }
+
+ /* Never use the revoke function if we are doing full data
+ * journaling: there is no need to, and a V1 superblock won't
+ * support it. Otherwise, only skip the revoke on un-journaled
+ * data blocks. */
+
+ if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA ||
+ (!is_metadata && !ext4_should_journal_data(inode))) {
+ if (bh) {
+ BUFFER_TRACE(bh, "call jbd2_journal_forget");
+ err = jbd2_journal_forget(handle, bh);
+ if (err)
+ ext4_journal_abort_handle(where, line, __func__,
+ bh, handle, err);
+ return err;
+ }
+ return 0;
+ }
+
+ /*
+ * data!=journal && (is_metadata || should_journal_data(inode))
+ */
+ BUFFER_TRACE(bh, "call jbd2_journal_revoke");
+ err = jbd2_journal_revoke(handle, blocknr, bh);
+ if (err) {
+ ext4_journal_abort_handle(where, line, __func__,
+ bh, handle, err);
+ __ext4_abort(inode->i_sb, where, line,
+ "error %d when attempting revoke", err);
+ }
+ BUFFER_TRACE(bh, "exit");
return err;
}
-int __ext4_journal_get_create_access(const char *where,
+int __ext4_journal_get_create_access(const char *where, unsigned int line,
handle_t *handle, struct buffer_head *bh)
{
- int err = jbd2_journal_get_create_access(handle, bh);
- if (err)
- ext4_journal_abort_handle(where, __FUNCTION__, bh, handle,err);
+ int err = 0;
+
+ if (ext4_handle_valid(handle)) {
+ err = jbd2_journal_get_create_access(handle, bh);
+ if (err)
+ ext4_journal_abort_handle(where, line, __func__,
+ bh, handle, err);
+ }
return err;
}
-int __ext4_journal_dirty_metadata(const char *where,
- handle_t *handle, struct buffer_head *bh)
+int __ext4_handle_dirty_metadata(const char *where, unsigned int line,
+ handle_t *handle, struct inode *inode,
+ struct buffer_head *bh)
{
- int err = jbd2_journal_dirty_metadata(handle, bh);
- if (err)
- ext4_journal_abort_handle(where, __FUNCTION__, bh, handle,err);
+ int err = 0;
+
+ might_sleep();
+
+ set_buffer_meta(bh);
+ set_buffer_prio(bh);
+ if (ext4_handle_valid(handle)) {
+ err = jbd2_journal_dirty_metadata(handle, bh);
+ /* Errors can only happen if there is a bug */
+ if (WARN_ON_ONCE(err)) {
+ ext4_journal_abort_handle(where, line, __func__, bh,
+ handle, err);
+ if (inode == NULL) {
+ pr_err("EXT4: jbd2_journal_dirty_metadata "
+ "failed: handle type %u started at "
+ "line %u, credits %u/%u, errcode %d",
+ handle->h_type,
+ handle->h_line_no,
+ handle->h_requested_credits,
+ handle->h_buffer_credits, err);
+ return err;
+ }
+ ext4_error_inode(inode, where, line,
+ bh->b_blocknr,
+ "journal_dirty_metadata failed: "
+ "handle type %u started at line %u, "
+ "credits %u/%u, errcode %d",
+ handle->h_type,
+ handle->h_line_no,
+ handle->h_requested_credits,
+ handle->h_buffer_credits, err);
+ }
+ } else {
+ if (inode)
+ mark_buffer_dirty_inode(bh, inode);
+ else
+ mark_buffer_dirty(bh);
+ if (inode && inode_needs_sync(inode)) {
+ sync_dirty_buffer(bh);
+ if (buffer_req(bh) && !buffer_uptodate(bh)) {
+ struct ext4_super_block *es;
+
+ es = EXT4_SB(inode->i_sb)->s_es;
+ es->s_last_error_block =
+ cpu_to_le64(bh->b_blocknr);
+ ext4_error_inode(inode, where, line,
+ bh->b_blocknr,
+ "IO error syncing itable block");
+ err = -EIO;
+ }
+ }
+ }
+ return err;
+}
+
+int __ext4_handle_dirty_super(const char *where, unsigned int line,
+ handle_t *handle, struct super_block *sb)
+{
+ struct buffer_head *bh = EXT4_SB(sb)->s_sbh;
+ int err = 0;
+
+ ext4_superblock_csum_set(sb);
+ if (ext4_handle_valid(handle)) {
+ err = jbd2_journal_dirty_metadata(handle, bh);
+ if (err)
+ ext4_journal_abort_handle(where, line, __func__,
+ bh, handle, err);
+ } else
+ mark_buffer_dirty(bh);
return err;
}
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
new file mode 100644
index 00000000000..17c00ff202f
--- /dev/null
+++ b/fs/ext4/ext4_jbd2.h
@@ -0,0 +1,450 @@
+/*
+ * ext4_jbd2.h
+ *
+ * Written by Stephen C. Tweedie <sct@redhat.com>, 1999
+ *
+ * Copyright 1998--1999 Red Hat corp --- All Rights Reserved
+ *
+ * This file is part of the Linux kernel and is made available under
+ * the terms of the GNU General Public License, version 2, or at your
+ * option, any later version, incorporated herein by reference.
+ *
+ * Ext4-specific journaling extensions.
+ */
+
+#ifndef _EXT4_JBD2_H
+#define _EXT4_JBD2_H
+
+#include <linux/fs.h>
+#include <linux/jbd2.h>
+#include "ext4.h"
+
+#define EXT4_JOURNAL(inode) (EXT4_SB((inode)->i_sb)->s_journal)
+
+/* Define the number of blocks we need to account to a transaction to
+ * modify one block of data.
+ *
+ * We may have to touch one inode, one bitmap buffer, up to three
+ * indirection blocks, the group and superblock summaries, and the data
+ * block to complete the transaction.
+ *
+ * For extents-enabled fs we may have to allocate and modify up to
+ * 5 levels of tree, data block (for each of these we need bitmap + group
+ * summaries), root which is stored in the inode, sb
+ */
+
+#define EXT4_SINGLEDATA_TRANS_BLOCKS(sb) \
+ (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS) \
+ ? 20U : 8U)
+
+/* Extended attribute operations touch at most two data buffers,
+ * two bitmap buffers, and two group summaries, in addition to the inode
+ * and the superblock, which are already accounted for. */
+
+#define EXT4_XATTR_TRANS_BLOCKS 6U
+
+/* Define the minimum size for a transaction which modifies data. This
+ * needs to take into account the fact that we may end up modifying two
+ * quota files too (one for the group, one for the user quota). The
+ * superblock only gets updated once, of course, so don't bother
+ * counting that again for the quota updates. */
+
+#define EXT4_DATA_TRANS_BLOCKS(sb) (EXT4_SINGLEDATA_TRANS_BLOCKS(sb) + \
+ EXT4_XATTR_TRANS_BLOCKS - 2 + \
+ EXT4_MAXQUOTAS_TRANS_BLOCKS(sb))
+
+/*
+ * Define the number of metadata blocks we need to account to modify data.
+ *
+ * This include super block, inode block, quota blocks and xattr blocks
+ */
+#define EXT4_META_TRANS_BLOCKS(sb) (EXT4_XATTR_TRANS_BLOCKS + \
+ EXT4_MAXQUOTAS_TRANS_BLOCKS(sb))
+
+/* Define an arbitrary limit for the amount of data we will anticipate
+ * writing to any given transaction. For unbounded transactions such as
+ * write(2) and truncate(2) we can write more than this, but we always
+ * start off at the maximum transaction size and grow the transaction
+ * optimistically as we go. */
+
+#define EXT4_MAX_TRANS_DATA 64U
+
+/* We break up a large truncate or write transaction once the handle's
+ * buffer credits gets this low, we need either to extend the
+ * transaction or to start a new one. Reserve enough space here for
+ * inode, bitmap, superblock, group and indirection updates for at least
+ * one block, plus two quota updates. Quota allocations are not
+ * needed. */
+
+#define EXT4_RESERVE_TRANS_BLOCKS 12U
+
+#define EXT4_INDEX_EXTRA_TRANS_BLOCKS 8
+
+#ifdef CONFIG_QUOTA
+/* Amount of blocks needed for quota update - we know that the structure was
+ * allocated so we need to update only data block */
+#define EXT4_QUOTA_TRANS_BLOCKS(sb) ((test_opt(sb, QUOTA) ||\
+ EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA)) ?\
+ 1 : 0)
+/* Amount of blocks needed for quota insert/delete - we do some block writes
+ * but inode, sb and group updates are done only once */
+#define EXT4_QUOTA_INIT_BLOCKS(sb) ((test_opt(sb, QUOTA) ||\
+ EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA)) ?\
+ (DQUOT_INIT_ALLOC*(EXT4_SINGLEDATA_TRANS_BLOCKS(sb)-3)\
+ +3+DQUOT_INIT_REWRITE) : 0)
+
+#define EXT4_QUOTA_DEL_BLOCKS(sb) ((test_opt(sb, QUOTA) ||\
+ EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA)) ?\
+ (DQUOT_DEL_ALLOC*(EXT4_SINGLEDATA_TRANS_BLOCKS(sb)-3)\
+ +3+DQUOT_DEL_REWRITE) : 0)
+#else
+#define EXT4_QUOTA_TRANS_BLOCKS(sb) 0
+#define EXT4_QUOTA_INIT_BLOCKS(sb) 0
+#define EXT4_QUOTA_DEL_BLOCKS(sb) 0
+#endif
+#define EXT4_MAXQUOTAS_TRANS_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_TRANS_BLOCKS(sb))
+#define EXT4_MAXQUOTAS_INIT_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_INIT_BLOCKS(sb))
+#define EXT4_MAXQUOTAS_DEL_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_DEL_BLOCKS(sb))
+
+static inline int ext4_jbd2_credits_xattr(struct inode *inode)
+{
+ int credits = EXT4_DATA_TRANS_BLOCKS(inode->i_sb);
+
+ /*
+ * In case of inline data, we may push out the data to a block,
+ * so we need to reserve credits for this eventuality
+ */
+ if (ext4_has_inline_data(inode))
+ credits += ext4_writepage_trans_blocks(inode) + 1;
+ return credits;
+}
+
+
+/*
+ * Ext4 handle operation types -- for logging purposes
+ */
+#define EXT4_HT_MISC 0
+#define EXT4_HT_INODE 1
+#define EXT4_HT_WRITE_PAGE 2
+#define EXT4_HT_MAP_BLOCKS 3
+#define EXT4_HT_DIR 4
+#define EXT4_HT_TRUNCATE 5
+#define EXT4_HT_QUOTA 6
+#define EXT4_HT_RESIZE 7
+#define EXT4_HT_MIGRATE 8
+#define EXT4_HT_MOVE_EXTENTS 9
+#define EXT4_HT_XATTR 10
+#define EXT4_HT_EXT_CONVERT 11
+#define EXT4_HT_MAX 12
+
+/**
+ * struct ext4_journal_cb_entry - Base structure for callback information.
+ *
+ * This struct is a 'seed' structure for a using with your own callback
+ * structs. If you are using callbacks you must allocate one of these
+ * or another struct of your own definition which has this struct
+ * as it's first element and pass it to ext4_journal_callback_add().
+ */
+struct ext4_journal_cb_entry {
+ /* list information for other callbacks attached to the same handle */
+ struct list_head jce_list;
+
+ /* Function to call with this callback structure */
+ void (*jce_func)(struct super_block *sb,
+ struct ext4_journal_cb_entry *jce, int error);
+
+ /* user data goes here */
+};
+
+/**
+ * ext4_journal_callback_add: add a function to call after transaction commit
+ * @handle: active journal transaction handle to register callback on
+ * @func: callback function to call after the transaction has committed:
+ * @sb: superblock of current filesystem for transaction
+ * @jce: returned journal callback data
+ * @rc: journal state at commit (0 = transaction committed properly)
+ * @jce: journal callback data (internal and function private data struct)
+ *
+ * The registered function will be called in the context of the journal thread
+ * after the transaction for which the handle was created has completed.
+ *
+ * No locks are held when the callback function is called, so it is safe to
+ * call blocking functions from within the callback, but the callback should
+ * not block or run for too long, or the filesystem will be blocked waiting for
+ * the next transaction to commit. No journaling functions can be used, or
+ * there is a risk of deadlock.
+ *
+ * There is no guaranteed calling order of multiple registered callbacks on
+ * the same transaction.
+ */
+static inline void ext4_journal_callback_add(handle_t *handle,
+ void (*func)(struct super_block *sb,
+ struct ext4_journal_cb_entry *jce,
+ int rc),
+ struct ext4_journal_cb_entry *jce)
+{
+ struct ext4_sb_info *sbi =
+ EXT4_SB(handle->h_transaction->t_journal->j_private);
+
+ /* Add the jce to transaction's private list */
+ jce->jce_func = func;
+ spin_lock(&sbi->s_md_lock);
+ list_add_tail(&jce->jce_list, &handle->h_transaction->t_private_list);
+ spin_unlock(&sbi->s_md_lock);
+}
+
+/**
+ * ext4_journal_callback_del: delete a registered callback
+ * @handle: active journal transaction handle on which callback was registered
+ * @jce: registered journal callback entry to unregister
+ * Return true if object was successfully removed
+ */
+static inline bool ext4_journal_callback_try_del(handle_t *handle,
+ struct ext4_journal_cb_entry *jce)
+{
+ bool deleted;
+ struct ext4_sb_info *sbi =
+ EXT4_SB(handle->h_transaction->t_journal->j_private);
+
+ spin_lock(&sbi->s_md_lock);
+ deleted = !list_empty(&jce->jce_list);
+ list_del_init(&jce->jce_list);
+ spin_unlock(&sbi->s_md_lock);
+ return deleted;
+}
+
+int
+ext4_mark_iloc_dirty(handle_t *handle,
+ struct inode *inode,
+ struct ext4_iloc *iloc);
+
+/*
+ * On success, We end up with an outstanding reference count against
+ * iloc->bh. This _must_ be cleaned up later.
+ */
+
+int ext4_reserve_inode_write(handle_t *handle, struct inode *inode,
+ struct ext4_iloc *iloc);
+
+int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode);
+
+/*
+ * Wrapper functions with which ext4 calls into JBD.
+ */
+int __ext4_journal_get_write_access(const char *where, unsigned int line,
+ handle_t *handle, struct buffer_head *bh);
+
+int __ext4_forget(const char *where, unsigned int line, handle_t *handle,
+ int is_metadata, struct inode *inode,
+ struct buffer_head *bh, ext4_fsblk_t blocknr);
+
+int __ext4_journal_get_create_access(const char *where, unsigned int line,
+ handle_t *handle, struct buffer_head *bh);
+
+int __ext4_handle_dirty_metadata(const char *where, unsigned int line,
+ handle_t *handle, struct inode *inode,
+ struct buffer_head *bh);
+
+int __ext4_handle_dirty_super(const char *where, unsigned int line,
+ handle_t *handle, struct super_block *sb);
+
+#define ext4_journal_get_write_access(handle, bh) \
+ __ext4_journal_get_write_access(__func__, __LINE__, (handle), (bh))
+#define ext4_forget(handle, is_metadata, inode, bh, block_nr) \
+ __ext4_forget(__func__, __LINE__, (handle), (is_metadata), (inode), \
+ (bh), (block_nr))
+#define ext4_journal_get_create_access(handle, bh) \
+ __ext4_journal_get_create_access(__func__, __LINE__, (handle), (bh))
+#define ext4_handle_dirty_metadata(handle, inode, bh) \
+ __ext4_handle_dirty_metadata(__func__, __LINE__, (handle), (inode), \
+ (bh))
+#define ext4_handle_dirty_super(handle, sb) \
+ __ext4_handle_dirty_super(__func__, __LINE__, (handle), (sb))
+
+handle_t *__ext4_journal_start_sb(struct super_block *sb, unsigned int line,
+ int type, int blocks, int rsv_blocks);
+int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle);
+
+#define EXT4_NOJOURNAL_MAX_REF_COUNT ((unsigned long) 4096)
+
+/* Note: Do not use this for NULL handles. This is only to determine if
+ * a properly allocated handle is using a journal or not. */
+static inline int ext4_handle_valid(handle_t *handle)
+{
+ if ((unsigned long)handle < EXT4_NOJOURNAL_MAX_REF_COUNT)
+ return 0;
+ return 1;
+}
+
+static inline void ext4_handle_sync(handle_t *handle)
+{
+ if (ext4_handle_valid(handle))
+ handle->h_sync = 1;
+}
+
+static inline int ext4_handle_is_aborted(handle_t *handle)
+{
+ if (ext4_handle_valid(handle))
+ return is_handle_aborted(handle);
+ return 0;
+}
+
+static inline int ext4_handle_has_enough_credits(handle_t *handle, int needed)
+{
+ if (ext4_handle_valid(handle) && handle->h_buffer_credits < needed)
+ return 0;
+ return 1;
+}
+
+#define ext4_journal_start_sb(sb, type, nblocks) \
+ __ext4_journal_start_sb((sb), __LINE__, (type), (nblocks), 0)
+
+#define ext4_journal_start(inode, type, nblocks) \
+ __ext4_journal_start((inode), __LINE__, (type), (nblocks), 0)
+
+#define ext4_journal_start_with_reserve(inode, type, blocks, rsv_blocks) \
+ __ext4_journal_start((inode), __LINE__, (type), (blocks), (rsv_blocks))
+
+static inline handle_t *__ext4_journal_start(struct inode *inode,
+ unsigned int line, int type,
+ int blocks, int rsv_blocks)
+{
+ return __ext4_journal_start_sb(inode->i_sb, line, type, blocks,
+ rsv_blocks);
+}
+
+#define ext4_journal_stop(handle) \
+ __ext4_journal_stop(__func__, __LINE__, (handle))
+
+#define ext4_journal_start_reserved(handle, type) \
+ __ext4_journal_start_reserved((handle), __LINE__, (type))
+
+handle_t *__ext4_journal_start_reserved(handle_t *handle, unsigned int line,
+ int type);
+
+static inline void ext4_journal_free_reserved(handle_t *handle)
+{
+ if (ext4_handle_valid(handle))
+ jbd2_journal_free_reserved(handle);
+}
+
+static inline handle_t *ext4_journal_current_handle(void)
+{
+ return journal_current_handle();
+}
+
+static inline int ext4_journal_extend(handle_t *handle, int nblocks)
+{
+ if (ext4_handle_valid(handle))
+ return jbd2_journal_extend(handle, nblocks);
+ return 0;
+}
+
+static inline int ext4_journal_restart(handle_t *handle, int nblocks)
+{
+ if (ext4_handle_valid(handle))
+ return jbd2_journal_restart(handle, nblocks);
+ return 0;
+}
+
+static inline int ext4_journal_blocks_per_page(struct inode *inode)
+{
+ if (EXT4_JOURNAL(inode) != NULL)
+ return jbd2_journal_blocks_per_page(inode);
+ return 0;
+}
+
+static inline int ext4_journal_force_commit(journal_t *journal)
+{
+ if (journal)
+ return jbd2_journal_force_commit(journal);
+ return 0;
+}
+
+static inline int ext4_jbd2_file_inode(handle_t *handle, struct inode *inode)
+{
+ if (ext4_handle_valid(handle))
+ return jbd2_journal_file_inode(handle, EXT4_I(inode)->jinode);
+ return 0;
+}
+
+static inline void ext4_update_inode_fsync_trans(handle_t *handle,
+ struct inode *inode,
+ int datasync)
+{
+ struct ext4_inode_info *ei = EXT4_I(inode);
+
+ if (ext4_handle_valid(handle)) {
+ ei->i_sync_tid = handle->h_transaction->t_tid;
+ if (datasync)
+ ei->i_datasync_tid = handle->h_transaction->t_tid;
+ }
+}
+
+/* super.c */
+int ext4_force_commit(struct super_block *sb);
+
+/*
+ * Ext4 inode journal modes
+ */
+#define EXT4_INODE_JOURNAL_DATA_MODE 0x01 /* journal data mode */
+#define EXT4_INODE_ORDERED_DATA_MODE 0x02 /* ordered data mode */
+#define EXT4_INODE_WRITEBACK_DATA_MODE 0x04 /* writeback data mode */
+
+static inline int ext4_inode_journal_mode(struct inode *inode)
+{
+ if (EXT4_JOURNAL(inode) == NULL)
+ return EXT4_INODE_WRITEBACK_DATA_MODE; /* writeback */
+ /* We do not support data journalling with delayed allocation */
+ if (!S_ISREG(inode->i_mode) ||
+ test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)
+ return EXT4_INODE_JOURNAL_DATA_MODE; /* journal data */
+ if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA) &&
+ !test_opt(inode->i_sb, DELALLOC))
+ return EXT4_INODE_JOURNAL_DATA_MODE; /* journal data */
+ if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA)
+ return EXT4_INODE_ORDERED_DATA_MODE; /* ordered */
+ if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)
+ return EXT4_INODE_WRITEBACK_DATA_MODE; /* writeback */
+ else
+ BUG();
+}
+
+static inline int ext4_should_journal_data(struct inode *inode)
+{
+ return ext4_inode_journal_mode(inode) & EXT4_INODE_JOURNAL_DATA_MODE;
+}
+
+static inline int ext4_should_order_data(struct inode *inode)
+{
+ return ext4_inode_journal_mode(inode) & EXT4_INODE_ORDERED_DATA_MODE;
+}
+
+static inline int ext4_should_writeback_data(struct inode *inode)
+{
+ return ext4_inode_journal_mode(inode) & EXT4_INODE_WRITEBACK_DATA_MODE;
+}
+
+/*
+ * This function controls whether or not we should try to go down the
+ * dioread_nolock code paths, which makes it safe to avoid taking
+ * i_mutex for direct I/O reads. This only works for extent-based
+ * files, and it doesn't work if data journaling is enabled, since the
+ * dioread_nolock code uses b_private to pass information back to the
+ * I/O completion handler, and this conflicts with the jbd's use of
+ * b_private.
+ */
+static inline int ext4_should_dioread_nolock(struct inode *inode)
+{
+ if (!test_opt(inode->i_sb, DIOREAD_NOLOCK))
+ return 0;
+ if (!S_ISREG(inode->i_mode))
+ return 0;
+ if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
+ return 0;
+ if (ext4_should_journal_data(inode))
+ return 0;
+ return 1;
+}
+
+#endif /* _EXT4_JBD2_H */
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 9ae6e67090c..4da228a0e6d 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -29,80 +29,108 @@
* - smart tree reduction
*/
-#include <linux/module.h>
#include <linux/fs.h>
#include <linux/time.h>
-#include <linux/ext4_jbd2.h>
#include <linux/jbd2.h>
#include <linux/highuid.h>
#include <linux/pagemap.h>
#include <linux/quotaops.h>
#include <linux/string.h>
#include <linux/slab.h>
-#include <linux/falloc.h>
-#include <linux/ext4_fs_extents.h>
#include <asm/uaccess.h>
+#include <linux/fiemap.h>
+#include "ext4_jbd2.h"
+#include "ext4_extents.h"
+#include "xattr.h"
+#include <trace/events/ext4.h>
/*
- * ext_pblock:
- * combine low and high parts of physical block number into ext4_fsblk_t
+ * used by extent splitting.
*/
-static ext4_fsblk_t ext_pblock(struct ext4_extent *ex)
-{
- ext4_fsblk_t block;
+#define EXT4_EXT_MAY_ZEROOUT 0x1 /* safe to zeroout if split fails \
+ due to ENOSPC */
+#define EXT4_EXT_MARK_UNWRIT1 0x2 /* mark first half unwritten */
+#define EXT4_EXT_MARK_UNWRIT2 0x4 /* mark second half unwritten */
- block = le32_to_cpu(ex->ee_start_lo);
- block |= ((ext4_fsblk_t) le16_to_cpu(ex->ee_start_hi) << 31) << 1;
- return block;
-}
+#define EXT4_EXT_DATA_VALID1 0x8 /* first half contains valid data */
+#define EXT4_EXT_DATA_VALID2 0x10 /* second half contains valid data */
-/*
- * idx_pblock:
- * combine low and high parts of a leaf physical block number into ext4_fsblk_t
- */
-ext4_fsblk_t idx_pblock(struct ext4_extent_idx *ix)
+static __le32 ext4_extent_block_csum(struct inode *inode,
+ struct ext4_extent_header *eh)
{
- ext4_fsblk_t block;
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ __u32 csum;
- block = le32_to_cpu(ix->ei_leaf_lo);
- block |= ((ext4_fsblk_t) le16_to_cpu(ix->ei_leaf_hi) << 31) << 1;
- return block;
+ csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)eh,
+ EXT4_EXTENT_TAIL_OFFSET(eh));
+ return cpu_to_le32(csum);
}
-/*
- * ext4_ext_store_pblock:
- * stores a large physical block number into an extent struct,
- * breaking it into parts
- */
-void ext4_ext_store_pblock(struct ext4_extent *ex, ext4_fsblk_t pb)
+static int ext4_extent_block_csum_verify(struct inode *inode,
+ struct ext4_extent_header *eh)
{
- ex->ee_start_lo = cpu_to_le32((unsigned long) (pb & 0xffffffff));
- ex->ee_start_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff);
+ struct ext4_extent_tail *et;
+
+ if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
+ EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+ return 1;
+
+ et = find_ext4_extent_tail(eh);
+ if (et->et_checksum != ext4_extent_block_csum(inode, eh))
+ return 0;
+ return 1;
}
-/*
- * ext4_idx_store_pblock:
- * stores a large physical block number into an index struct,
- * breaking it into parts
- */
-static void ext4_idx_store_pblock(struct ext4_extent_idx *ix, ext4_fsblk_t pb)
+static void ext4_extent_block_csum_set(struct inode *inode,
+ struct ext4_extent_header *eh)
{
- ix->ei_leaf_lo = cpu_to_le32((unsigned long) (pb & 0xffffffff));
- ix->ei_leaf_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff);
+ struct ext4_extent_tail *et;
+
+ if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
+ EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+ return;
+
+ et = find_ext4_extent_tail(eh);
+ et->et_checksum = ext4_extent_block_csum(inode, eh);
}
-static handle_t *ext4_ext_journal_restart(handle_t *handle, int needed)
+static int ext4_split_extent(handle_t *handle,
+ struct inode *inode,
+ struct ext4_ext_path *path,
+ struct ext4_map_blocks *map,
+ int split_flag,
+ int flags);
+
+static int ext4_split_extent_at(handle_t *handle,
+ struct inode *inode,
+ struct ext4_ext_path *path,
+ ext4_lblk_t split,
+ int split_flag,
+ int flags);
+
+static int ext4_find_delayed_extent(struct inode *inode,
+ struct extent_status *newes);
+
+static int ext4_ext_truncate_extend_restart(handle_t *handle,
+ struct inode *inode,
+ int needed)
{
int err;
+ if (!ext4_handle_valid(handle))
+ return 0;
if (handle->h_buffer_credits > needed)
- return handle;
- if (!ext4_journal_extend(handle, needed))
- return handle;
- err = ext4_journal_restart(handle, needed);
+ return 0;
+ err = ext4_journal_extend(handle, needed);
+ if (err <= 0)
+ return err;
+ err = ext4_truncate_restart_trans(handle, inode, needed);
+ if (err == 0)
+ err = -EAGAIN;
- return handle;
+ return err;
}
/*
@@ -115,6 +143,7 @@ static int ext4_ext_get_access(handle_t *handle, struct inode *inode,
{
if (path->p_bh) {
/* path points to block */
+ BUFFER_TRACE(path->p_bh, "get_write_access");
return ext4_journal_get_write_access(handle, path->p_bh);
}
/* path points to leaf/index in inode body */
@@ -128,13 +157,15 @@ static int ext4_ext_get_access(handle_t *handle, struct inode *inode,
* - ENOMEM
* - EIO
*/
-static int ext4_ext_dirty(handle_t *handle, struct inode *inode,
- struct ext4_ext_path *path)
+int __ext4_ext_dirty(const char *where, unsigned int line, handle_t *handle,
+ struct inode *inode, struct ext4_ext_path *path)
{
int err;
if (path->p_bh) {
+ ext4_extent_block_csum_set(inode, ext_block_hdr(path->p_bh));
/* path points to block */
- err = ext4_journal_dirty_metadata(handle, path->p_bh);
+ err = __ext4_handle_dirty_metadata(where, line, handle,
+ inode, path->p_bh);
} else {
/* path points to leaf/index in inode body */
err = ext4_mark_inode_dirty(handle, inode);
@@ -146,20 +177,37 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
struct ext4_ext_path *path,
ext4_lblk_t block)
{
- struct ext4_inode_info *ei = EXT4_I(inode);
- ext4_fsblk_t bg_start;
- ext4_fsblk_t last_block;
- ext4_grpblk_t colour;
- int depth;
-
if (path) {
+ int depth = path->p_depth;
struct ext4_extent *ex;
- depth = path->p_depth;
- /* try to predict block placement */
+ /*
+ * Try to predict block placement assuming that we are
+ * filling in a file which will eventually be
+ * non-sparse --- i.e., in the case of libbfd writing
+ * an ELF object sections out-of-order but in a way
+ * the eventually results in a contiguous object or
+ * executable file, or some database extending a table
+ * space file. However, this is actually somewhat
+ * non-ideal if we are writing a sparse file such as
+ * qemu or KVM writing a raw image file that is going
+ * to stay fairly sparse, since it will end up
+ * fragmenting the file system's free space. Maybe we
+ * should have some hueristics or some way to allow
+ * userspace to pass a hint to file system,
+ * especially if the latter case turns out to be
+ * common.
+ */
ex = path[depth].p_ext;
- if (ex)
- return ext_pblock(ex)+(block-le32_to_cpu(ex->ee_block));
+ if (ex) {
+ ext4_fsblk_t ext_pblk = ext4_ext_pblock(ex);
+ ext4_lblk_t ext_block = le32_to_cpu(ex->ee_block);
+
+ if (block > ext_block)
+ return ext_pblk + (block - ext_block);
+ else
+ return ext_pblk - (ext_block - block);
+ }
/* it looks like index is empty;
* try to find starting block from index itself */
@@ -168,57 +216,52 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
}
/* OK. use inode's group */
- bg_start = (ei->i_block_group * EXT4_BLOCKS_PER_GROUP(inode->i_sb)) +
- le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_first_data_block);
- last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1;
-
- if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block)
- colour = (current->pid % 16) *
- (EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16);
- else
- colour = (current->pid % 16) * ((last_block - bg_start) / 16);
- return bg_start + colour + block;
+ return ext4_inode_to_goal_block(inode);
}
+/*
+ * Allocation for a meta data block
+ */
static ext4_fsblk_t
-ext4_ext_new_block(handle_t *handle, struct inode *inode,
+ext4_ext_new_meta_block(handle_t *handle, struct inode *inode,
struct ext4_ext_path *path,
- struct ext4_extent *ex, int *err)
+ struct ext4_extent *ex, int *err, unsigned int flags)
{
ext4_fsblk_t goal, newblock;
goal = ext4_ext_find_goal(inode, path, le32_to_cpu(ex->ee_block));
- newblock = ext4_new_block(handle, inode, goal, err);
+ newblock = ext4_new_meta_blocks(handle, inode, goal, flags,
+ NULL, err);
return newblock;
}
-static int ext4_ext_space_block(struct inode *inode)
+static inline int ext4_ext_space_block(struct inode *inode, int check)
{
int size;
size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
/ sizeof(struct ext4_extent);
#ifdef AGGRESSIVE_TEST
- if (size > 6)
+ if (!check && size > 6)
size = 6;
#endif
return size;
}
-static int ext4_ext_space_block_idx(struct inode *inode)
+static inline int ext4_ext_space_block_idx(struct inode *inode, int check)
{
int size;
size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
/ sizeof(struct ext4_extent_idx);
#ifdef AGGRESSIVE_TEST
- if (size > 5)
+ if (!check && size > 5)
size = 5;
#endif
return size;
}
-static int ext4_ext_space_root(struct inode *inode)
+static inline int ext4_ext_space_root(struct inode *inode, int check)
{
int size;
@@ -226,13 +269,13 @@ static int ext4_ext_space_root(struct inode *inode)
size -= sizeof(struct ext4_extent_header);
size /= sizeof(struct ext4_extent);
#ifdef AGGRESSIVE_TEST
- if (size > 3)
+ if (!check && size > 3)
size = 3;
#endif
return size;
}
-static int ext4_ext_space_root_idx(struct inode *inode)
+static inline int ext4_ext_space_root_idx(struct inode *inode, int check)
{
int size;
@@ -240,12 +283,59 @@ static int ext4_ext_space_root_idx(struct inode *inode)
size -= sizeof(struct ext4_extent_header);
size /= sizeof(struct ext4_extent_idx);
#ifdef AGGRESSIVE_TEST
- if (size > 4)
+ if (!check && size > 4)
size = 4;
#endif
return size;
}
+/*
+ * Calculate the number of metadata blocks needed
+ * to allocate @blocks
+ * Worse case is one block per extent
+ */
+int ext4_ext_calc_metadata_amount(struct inode *inode, ext4_lblk_t lblock)
+{
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ int idxs;
+
+ idxs = ((inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
+ / sizeof(struct ext4_extent_idx));
+
+ /*
+ * If the new delayed allocation block is contiguous with the
+ * previous da block, it can share index blocks with the
+ * previous block, so we only need to allocate a new index
+ * block every idxs leaf blocks. At ldxs**2 blocks, we need
+ * an additional index block, and at ldxs**3 blocks, yet
+ * another index blocks.
+ */
+ if (ei->i_da_metadata_calc_len &&
+ ei->i_da_metadata_calc_last_lblock+1 == lblock) {
+ int num = 0;
+
+ if ((ei->i_da_metadata_calc_len % idxs) == 0)
+ num++;
+ if ((ei->i_da_metadata_calc_len % (idxs*idxs)) == 0)
+ num++;
+ if ((ei->i_da_metadata_calc_len % (idxs*idxs*idxs)) == 0) {
+ num++;
+ ei->i_da_metadata_calc_len = 0;
+ } else
+ ei->i_da_metadata_calc_len++;
+ ei->i_da_metadata_calc_last_lblock++;
+ return num;
+ }
+
+ /*
+ * In the worst case we need a new set of index blocks at
+ * every level of the inode's extent tree.
+ */
+ ei->i_da_metadata_calc_len = 1;
+ ei->i_da_metadata_calc_last_lblock = lblock;
+ return ext_depth(inode) + 1;
+}
+
static int
ext4_ext_max_entries(struct inode *inode, int depth)
{
@@ -253,22 +343,88 @@ ext4_ext_max_entries(struct inode *inode, int depth)
if (depth == ext_depth(inode)) {
if (depth == 0)
- max = ext4_ext_space_root(inode);
+ max = ext4_ext_space_root(inode, 1);
else
- max = ext4_ext_space_root_idx(inode);
+ max = ext4_ext_space_root_idx(inode, 1);
} else {
if (depth == 0)
- max = ext4_ext_space_block(inode);
+ max = ext4_ext_space_block(inode, 1);
else
- max = ext4_ext_space_block_idx(inode);
+ max = ext4_ext_space_block_idx(inode, 1);
}
return max;
}
-static int __ext4_ext_check_header(const char *function, struct inode *inode,
- struct ext4_extent_header *eh,
- int depth)
+static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext)
+{
+ ext4_fsblk_t block = ext4_ext_pblock(ext);
+ int len = ext4_ext_get_actual_len(ext);
+ ext4_lblk_t lblock = le32_to_cpu(ext->ee_block);
+ ext4_lblk_t last = lblock + len - 1;
+
+ if (lblock > last)
+ return 0;
+ return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, len);
+}
+
+static int ext4_valid_extent_idx(struct inode *inode,
+ struct ext4_extent_idx *ext_idx)
+{
+ ext4_fsblk_t block = ext4_idx_pblock(ext_idx);
+
+ return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, 1);
+}
+
+static int ext4_valid_extent_entries(struct inode *inode,
+ struct ext4_extent_header *eh,
+ int depth)
+{
+ unsigned short entries;
+ if (eh->eh_entries == 0)
+ return 1;
+
+ entries = le16_to_cpu(eh->eh_entries);
+
+ if (depth == 0) {
+ /* leaf entries */
+ struct ext4_extent *ext = EXT_FIRST_EXTENT(eh);
+ struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
+ ext4_fsblk_t pblock = 0;
+ ext4_lblk_t lblock = 0;
+ ext4_lblk_t prev = 0;
+ int len = 0;
+ while (entries) {
+ if (!ext4_valid_extent(inode, ext))
+ return 0;
+
+ /* Check for overlapping extents */
+ lblock = le32_to_cpu(ext->ee_block);
+ len = ext4_ext_get_actual_len(ext);
+ if ((lblock <= prev) && prev) {
+ pblock = ext4_ext_pblock(ext);
+ es->s_last_error_block = cpu_to_le64(pblock);
+ return 0;
+ }
+ ext++;
+ entries--;
+ prev = lblock + len - 1;
+ }
+ } else {
+ struct ext4_extent_idx *ext_idx = EXT_FIRST_INDEX(eh);
+ while (entries) {
+ if (!ext4_valid_extent_idx(inode, ext_idx))
+ return 0;
+ ext_idx++;
+ entries--;
+ }
+ }
+ return 1;
+}
+
+static int __ext4_ext_check(const char *function, unsigned int line,
+ struct inode *inode, struct ext4_extent_header *eh,
+ int depth, ext4_fsblk_t pblk)
{
const char *error_msg;
int max = 0;
@@ -294,21 +450,163 @@ static int __ext4_ext_check_header(const char *function, struct inode *inode,
error_msg = "invalid eh_entries";
goto corrupted;
}
+ if (!ext4_valid_extent_entries(inode, eh, depth)) {
+ error_msg = "invalid extent entries";
+ goto corrupted;
+ }
+ /* Verify checksum on non-root extent tree nodes */
+ if (ext_depth(inode) != depth &&
+ !ext4_extent_block_csum_verify(inode, eh)) {
+ error_msg = "extent tree corrupted";
+ goto corrupted;
+ }
return 0;
corrupted:
- ext4_error(inode->i_sb, function,
- "bad header in inode #%lu: %s - magic %x, "
- "entries %u, max %u(%u), depth %u(%u)",
- inode->i_ino, error_msg, le16_to_cpu(eh->eh_magic),
- le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max),
- max, le16_to_cpu(eh->eh_depth), depth);
-
+ ext4_error_inode(inode, function, line, 0,
+ "pblk %llu bad header/extent: %s - magic %x, "
+ "entries %u, max %u(%u), depth %u(%u)",
+ (unsigned long long) pblk, error_msg,
+ le16_to_cpu(eh->eh_magic),
+ le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max),
+ max, le16_to_cpu(eh->eh_depth), depth);
return -EIO;
}
-#define ext4_ext_check_header(inode, eh, depth) \
- __ext4_ext_check_header(__FUNCTION__, inode, eh, depth)
+#define ext4_ext_check(inode, eh, depth, pblk) \
+ __ext4_ext_check(__func__, __LINE__, (inode), (eh), (depth), (pblk))
+
+int ext4_ext_check_inode(struct inode *inode)
+{
+ return ext4_ext_check(inode, ext_inode_hdr(inode), ext_depth(inode), 0);
+}
+
+static struct buffer_head *
+__read_extent_tree_block(const char *function, unsigned int line,
+ struct inode *inode, ext4_fsblk_t pblk, int depth,
+ int flags)
+{
+ struct buffer_head *bh;
+ int err;
+
+ bh = sb_getblk(inode->i_sb, pblk);
+ if (unlikely(!bh))
+ return ERR_PTR(-ENOMEM);
+
+ if (!bh_uptodate_or_lock(bh)) {
+ trace_ext4_ext_load_extent(inode, pblk, _RET_IP_);
+ err = bh_submit_read(bh);
+ if (err < 0)
+ goto errout;
+ }
+ if (buffer_verified(bh) && !(flags & EXT4_EX_FORCE_CACHE))
+ return bh;
+ err = __ext4_ext_check(function, line, inode,
+ ext_block_hdr(bh), depth, pblk);
+ if (err)
+ goto errout;
+ set_buffer_verified(bh);
+ /*
+ * If this is a leaf block, cache all of its entries
+ */
+ if (!(flags & EXT4_EX_NOCACHE) && depth == 0) {
+ struct ext4_extent_header *eh = ext_block_hdr(bh);
+ struct ext4_extent *ex = EXT_FIRST_EXTENT(eh);
+ ext4_lblk_t prev = 0;
+ int i;
+
+ for (i = le16_to_cpu(eh->eh_entries); i > 0; i--, ex++) {
+ unsigned int status = EXTENT_STATUS_WRITTEN;
+ ext4_lblk_t lblk = le32_to_cpu(ex->ee_block);
+ int len = ext4_ext_get_actual_len(ex);
+
+ if (prev && (prev != lblk))
+ ext4_es_cache_extent(inode, prev,
+ lblk - prev, ~0,
+ EXTENT_STATUS_HOLE);
+
+ if (ext4_ext_is_unwritten(ex))
+ status = EXTENT_STATUS_UNWRITTEN;
+ ext4_es_cache_extent(inode, lblk, len,
+ ext4_ext_pblock(ex), status);
+ prev = lblk + len;
+ }
+ }
+ return bh;
+errout:
+ put_bh(bh);
+ return ERR_PTR(err);
+
+}
+
+#define read_extent_tree_block(inode, pblk, depth, flags) \
+ __read_extent_tree_block(__func__, __LINE__, (inode), (pblk), \
+ (depth), (flags))
+
+/*
+ * This function is called to cache a file's extent information in the
+ * extent status tree
+ */
+int ext4_ext_precache(struct inode *inode)
+{
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ struct ext4_ext_path *path = NULL;
+ struct buffer_head *bh;
+ int i = 0, depth, ret = 0;
+
+ if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
+ return 0; /* not an extent-mapped inode */
+
+ down_read(&ei->i_data_sem);
+ depth = ext_depth(inode);
+
+ path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 1),
+ GFP_NOFS);
+ if (path == NULL) {
+ up_read(&ei->i_data_sem);
+ return -ENOMEM;
+ }
+
+ /* Don't cache anything if there are no external extent blocks */
+ if (depth == 0)
+ goto out;
+ path[0].p_hdr = ext_inode_hdr(inode);
+ ret = ext4_ext_check(inode, path[0].p_hdr, depth, 0);
+ if (ret)
+ goto out;
+ path[0].p_idx = EXT_FIRST_INDEX(path[0].p_hdr);
+ while (i >= 0) {
+ /*
+ * If this is a leaf block or we've reached the end of
+ * the index block, go up
+ */
+ if ((i == depth) ||
+ path[i].p_idx > EXT_LAST_INDEX(path[i].p_hdr)) {
+ brelse(path[i].p_bh);
+ path[i].p_bh = NULL;
+ i--;
+ continue;
+ }
+ bh = read_extent_tree_block(inode,
+ ext4_idx_pblock(path[i].p_idx++),
+ depth - i - 1,
+ EXT4_EX_FORCE_CACHE);
+ if (IS_ERR(bh)) {
+ ret = PTR_ERR(bh);
+ break;
+ }
+ i++;
+ path[i].p_bh = bh;
+ path[i].p_hdr = ext_block_hdr(bh);
+ path[i].p_idx = EXT_FIRST_INDEX(path[i].p_hdr);
+ }
+ ext4_set_inode_state(inode, EXT4_STATE_EXT_PRECACHED);
+out:
+ up_read(&ei->i_data_sem);
+ ext4_ext_drop_refs(path);
+ kfree(path);
+ return ret;
+}
#ifdef EXT_DEBUG
static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path)
@@ -319,12 +617,13 @@ static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path)
for (k = 0; k <= l; k++, path++) {
if (path->p_idx) {
ext_debug(" %d->%llu", le32_to_cpu(path->p_idx->ei_block),
- idx_pblock(path->p_idx));
+ ext4_idx_pblock(path->p_idx));
} else if (path->p_ext) {
- ext_debug(" %d:%d:%llu ",
+ ext_debug(" %d:[%d]%d:%llu ",
le32_to_cpu(path->p_ext->ee_block),
+ ext4_ext_is_unwritten(path->p_ext),
ext4_ext_get_actual_len(path->p_ext),
- ext_pblock(path->p_ext));
+ ext4_ext_pblock(path->p_ext));
} else
ext_debug(" []");
}
@@ -344,15 +643,52 @@ static void ext4_ext_show_leaf(struct inode *inode, struct ext4_ext_path *path)
eh = path[depth].p_hdr;
ex = EXT_FIRST_EXTENT(eh);
+ ext_debug("Displaying leaf extents for inode %lu\n", inode->i_ino);
+
for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ex++) {
- ext_debug("%d:%d:%llu ", le32_to_cpu(ex->ee_block),
- ext4_ext_get_actual_len(ex), ext_pblock(ex));
+ ext_debug("%d:[%d]%d:%llu ", le32_to_cpu(ex->ee_block),
+ ext4_ext_is_unwritten(ex),
+ ext4_ext_get_actual_len(ex), ext4_ext_pblock(ex));
}
ext_debug("\n");
}
+
+static void ext4_ext_show_move(struct inode *inode, struct ext4_ext_path *path,
+ ext4_fsblk_t newblock, int level)
+{
+ int depth = ext_depth(inode);
+ struct ext4_extent *ex;
+
+ if (depth != level) {
+ struct ext4_extent_idx *idx;
+ idx = path[level].p_idx;
+ while (idx <= EXT_MAX_INDEX(path[level].p_hdr)) {
+ ext_debug("%d: move %d:%llu in new index %llu\n", level,
+ le32_to_cpu(idx->ei_block),
+ ext4_idx_pblock(idx),
+ newblock);
+ idx++;
+ }
+
+ return;
+ }
+
+ ex = path[depth].p_ext;
+ while (ex <= EXT_MAX_EXTENT(path[depth].p_hdr)) {
+ ext_debug("move %d:%llu:[%d]%d in new leaf %llu\n",
+ le32_to_cpu(ex->ee_block),
+ ext4_ext_pblock(ex),
+ ext4_ext_is_unwritten(ex),
+ ext4_ext_get_actual_len(ex),
+ newblock);
+ ex++;
+ }
+}
+
#else
-#define ext4_ext_show_path(inode,path)
-#define ext4_ext_show_leaf(inode,path)
+#define ext4_ext_show_path(inode, path)
+#define ext4_ext_show_leaf(inode, path)
+#define ext4_ext_show_move(inode, path, newblock, level)
#endif
void ext4_ext_drop_refs(struct ext4_ext_path *path)
@@ -396,8 +732,8 @@ ext4_ext_binsearch_idx(struct inode *inode,
}
path->p_idx = l - 1;
- ext_debug(" -> %d->%lld ", le32_to_cpu(path->p_idx->ei_block),
- idx_pblock(path->p_idx));
+ ext_debug(" -> %u->%lld ", le32_to_cpu(path->p_idx->ei_block),
+ ext4_idx_pblock(path->p_idx));
#ifdef CHECK_BINSEARCH
{
@@ -408,9 +744,10 @@ ext4_ext_binsearch_idx(struct inode *inode,
for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ix++) {
if (k != 0 &&
le32_to_cpu(ix->ei_block) <= le32_to_cpu(ix[-1].ei_block)) {
- printk("k=%d, ix=0x%p, first=0x%p\n", k,
- ix, EXT_FIRST_INDEX(eh));
- printk("%u <= %u\n",
+ printk(KERN_DEBUG "k=%d, ix=0x%p, "
+ "first=0x%p\n", k,
+ ix, EXT_FIRST_INDEX(eh));
+ printk(KERN_DEBUG "%u <= %u\n",
le32_to_cpu(ix->ei_block),
le32_to_cpu(ix[-1].ei_block));
}
@@ -463,9 +800,10 @@ ext4_ext_binsearch(struct inode *inode,
}
path->p_ext = l - 1;
- ext_debug(" -> %d:%llu:%d ",
+ ext_debug(" -> %d:%llu:[%d]%d ",
le32_to_cpu(path->p_ext->ee_block),
- ext_pblock(path->p_ext),
+ ext4_ext_pblock(path->p_ext),
+ ext4_ext_is_unwritten(path->p_ext),
ext4_ext_get_actual_len(path->p_ext));
#ifdef CHECK_BINSEARCH
@@ -495,25 +833,22 @@ int ext4_ext_tree_init(handle_t *handle, struct inode *inode)
eh->eh_depth = 0;
eh->eh_entries = 0;
eh->eh_magic = EXT4_EXT_MAGIC;
- eh->eh_max = cpu_to_le16(ext4_ext_space_root(inode));
+ eh->eh_max = cpu_to_le16(ext4_ext_space_root(inode, 0));
ext4_mark_inode_dirty(handle, inode);
- ext4_ext_invalidate_cache(inode);
return 0;
}
struct ext4_ext_path *
ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
- struct ext4_ext_path *path)
+ struct ext4_ext_path *path, int flags)
{
struct ext4_extent_header *eh;
struct buffer_head *bh;
short int depth, i, ppos = 0, alloc = 0;
+ int ret;
eh = ext_inode_hdr(inode);
depth = ext_depth(inode);
- if (ext4_ext_check_header(inode, eh, depth))
- return ERR_PTR(-EIO);
-
/* account possible depth increase */
if (!path) {
@@ -524,6 +859,7 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
alloc = 1;
}
path[0].p_hdr = eh;
+ path[0].p_bh = NULL;
i = depth;
/* walk through the tree */
@@ -532,32 +868,39 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
ppos, le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max));
ext4_ext_binsearch_idx(inode, path + ppos, block);
- path[ppos].p_block = idx_pblock(path[ppos].p_idx);
+ path[ppos].p_block = ext4_idx_pblock(path[ppos].p_idx);
path[ppos].p_depth = i;
path[ppos].p_ext = NULL;
- bh = sb_bread(inode->i_sb, path[ppos].p_block);
- if (!bh)
+ bh = read_extent_tree_block(inode, path[ppos].p_block, --i,
+ flags);
+ if (IS_ERR(bh)) {
+ ret = PTR_ERR(bh);
goto err;
+ }
eh = ext_block_hdr(bh);
ppos++;
- BUG_ON(ppos > depth);
+ if (unlikely(ppos > depth)) {
+ put_bh(bh);
+ EXT4_ERROR_INODE(inode,
+ "ppos %d > depth %d", ppos, depth);
+ ret = -EIO;
+ goto err;
+ }
path[ppos].p_bh = bh;
path[ppos].p_hdr = eh;
- i--;
-
- if (ext4_ext_check_header(inode, eh, i))
- goto err;
}
path[ppos].p_depth = i;
- path[ppos].p_hdr = eh;
path[ppos].p_ext = NULL;
path[ppos].p_idx = NULL;
/* find extent */
ext4_ext_binsearch(inode, path + ppos, block);
+ /* if not an empty leaf */
+ if (path[ppos].p_ext)
+ path[ppos].p_block = ext4_ext_pblock(path[ppos].p_ext);
ext4_ext_show_path(inode, path);
@@ -567,7 +910,7 @@ err:
ext4_ext_drop_refs(path);
if (alloc)
kfree(path);
- return ERR_PTR(-EIO);
+ return ERR_PTR(ret);
}
/*
@@ -576,8 +919,8 @@ err:
* check where to insert: before @curp or after @curp
*/
static int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
- struct ext4_ext_path *curp,
- int logical, ext4_fsblk_t ptr)
+ struct ext4_ext_path *curp,
+ int logical, ext4_fsblk_t ptr)
{
struct ext4_extent_idx *ix;
int len, err;
@@ -586,39 +929,54 @@ static int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
if (err)
return err;
- BUG_ON(logical == le32_to_cpu(curp->p_idx->ei_block));
- len = EXT_MAX_INDEX(curp->p_hdr) - curp->p_idx;
+ if (unlikely(logical == le32_to_cpu(curp->p_idx->ei_block))) {
+ EXT4_ERROR_INODE(inode,
+ "logical %d == ei_block %d!",
+ logical, le32_to_cpu(curp->p_idx->ei_block));
+ return -EIO;
+ }
+
+ if (unlikely(le16_to_cpu(curp->p_hdr->eh_entries)
+ >= le16_to_cpu(curp->p_hdr->eh_max))) {
+ EXT4_ERROR_INODE(inode,
+ "eh_entries %d >= eh_max %d!",
+ le16_to_cpu(curp->p_hdr->eh_entries),
+ le16_to_cpu(curp->p_hdr->eh_max));
+ return -EIO;
+ }
+
if (logical > le32_to_cpu(curp->p_idx->ei_block)) {
/* insert after */
- if (curp->p_idx != EXT_LAST_INDEX(curp->p_hdr)) {
- len = (len - 1) * sizeof(struct ext4_extent_idx);
- len = len < 0 ? 0 : len;
- ext_debug("insert new index %d after: %llu. "
- "move %d from 0x%p to 0x%p\n",
- logical, ptr, len,
- (curp->p_idx + 1), (curp->p_idx + 2));
- memmove(curp->p_idx + 2, curp->p_idx + 1, len);
- }
+ ext_debug("insert new index %d after: %llu\n", logical, ptr);
ix = curp->p_idx + 1;
} else {
/* insert before */
- len = len * sizeof(struct ext4_extent_idx);
- len = len < 0 ? 0 : len;
- ext_debug("insert new index %d before: %llu. "
- "move %d from 0x%p to 0x%p\n",
- logical, ptr, len,
- curp->p_idx, (curp->p_idx + 1));
- memmove(curp->p_idx + 1, curp->p_idx, len);
+ ext_debug("insert new index %d before: %llu\n", logical, ptr);
ix = curp->p_idx;
}
+ len = EXT_LAST_INDEX(curp->p_hdr) - ix + 1;
+ BUG_ON(len < 0);
+ if (len > 0) {
+ ext_debug("insert new index %d: "
+ "move %d indices from 0x%p to 0x%p\n",
+ logical, len, ix, ix + 1);
+ memmove(ix + 1, ix, len * sizeof(struct ext4_extent_idx));
+ }
+
+ if (unlikely(ix > EXT_MAX_INDEX(curp->p_hdr))) {
+ EXT4_ERROR_INODE(inode, "ix > EXT_MAX_INDEX!");
+ return -EIO;
+ }
+
ix->ei_block = cpu_to_le32(logical);
ext4_idx_store_pblock(ix, ptr);
- curp->p_hdr->eh_entries = cpu_to_le16(le16_to_cpu(curp->p_hdr->eh_entries)+1);
+ le16_add_cpu(&curp->p_hdr->eh_entries, 1);
- BUG_ON(le16_to_cpu(curp->p_hdr->eh_entries)
- > le16_to_cpu(curp->p_hdr->eh_max));
- BUG_ON(ix > EXT_LAST_INDEX(curp->p_hdr));
+ if (unlikely(ix > EXT_LAST_INDEX(curp->p_hdr))) {
+ EXT4_ERROR_INODE(inode, "ix > EXT_LAST_INDEX!");
+ return -EIO;
+ }
err = ext4_ext_dirty(handle, inode, curp);
ext4_std_error(inode->i_sb, err);
@@ -637,14 +995,14 @@ static int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
* - initializes subtree
*/
static int ext4_ext_split(handle_t *handle, struct inode *inode,
- struct ext4_ext_path *path,
- struct ext4_extent *newext, int at)
+ unsigned int flags,
+ struct ext4_ext_path *path,
+ struct ext4_extent *newext, int at)
{
struct buffer_head *bh = NULL;
int depth = ext_depth(inode);
struct ext4_extent_header *neh;
struct ext4_extent_idx *fidx;
- struct ext4_extent *ex;
int i = at, k, m, a;
ext4_fsblk_t newblock, oldblock;
__le32 border;
@@ -656,7 +1014,10 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
/* if current leaf will be split, then we should use
* border from split point */
- BUG_ON(path[depth].p_ext > EXT_MAX_EXTENT(path[depth].p_hdr));
+ if (unlikely(path[depth].p_ext > EXT_MAX_EXTENT(path[depth].p_hdr))) {
+ EXT4_ERROR_INODE(inode, "p_ext > EXT_MAX_EXTENT!");
+ return -EIO;
+ }
if (path[depth].p_ext != EXT_MAX_EXTENT(path[depth].p_hdr)) {
border = path[depth].p_ext[1].ee_block;
ext_debug("leaf will be split."
@@ -688,7 +1049,8 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
/* allocate all needed blocks */
ext_debug("allocate %d blocks for indexes/leaf\n", depth - at);
for (a = 0; a < depth - at; a++) {
- newblock = ext4_ext_new_block(handle, inode, path, newext, &err);
+ newblock = ext4_ext_new_meta_block(handle, inode, path,
+ newext, &err, flags);
if (newblock == 0)
goto cleanup;
ablocks[a] = newblock;
@@ -696,12 +1058,16 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
/* initialize new leaf */
newblock = ablocks[--a];
- BUG_ON(newblock == 0);
- bh = sb_getblk(inode->i_sb, newblock);
- if (!bh) {
+ if (unlikely(newblock == 0)) {
+ EXT4_ERROR_INODE(inode, "newblock == 0!");
err = -EIO;
goto cleanup;
}
+ bh = sb_getblk(inode->i_sb, newblock);
+ if (unlikely(!bh)) {
+ err = -ENOMEM;
+ goto cleanup;
+ }
lock_buffer(bh);
err = ext4_journal_get_create_access(handle, bh);
@@ -710,39 +1076,34 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
neh = ext_block_hdr(bh);
neh->eh_entries = 0;
- neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode));
+ neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0));
neh->eh_magic = EXT4_EXT_MAGIC;
neh->eh_depth = 0;
- ex = EXT_FIRST_EXTENT(neh);
/* move remainder of path[depth] to the new leaf */
- BUG_ON(path[depth].p_hdr->eh_entries != path[depth].p_hdr->eh_max);
- /* start copy from next extent */
- /* TODO: we could do it by single memmove */
- m = 0;
- path[depth].p_ext++;
- while (path[depth].p_ext <=
- EXT_MAX_EXTENT(path[depth].p_hdr)) {
- ext_debug("move %d:%llu:%d in new leaf %llu\n",
- le32_to_cpu(path[depth].p_ext->ee_block),
- ext_pblock(path[depth].p_ext),
- ext4_ext_get_actual_len(path[depth].p_ext),
- newblock);
- /*memmove(ex++, path[depth].p_ext++,
- sizeof(struct ext4_extent));
- neh->eh_entries++;*/
- path[depth].p_ext++;
- m++;
+ if (unlikely(path[depth].p_hdr->eh_entries !=
+ path[depth].p_hdr->eh_max)) {
+ EXT4_ERROR_INODE(inode, "eh_entries %d != eh_max %d!",
+ path[depth].p_hdr->eh_entries,
+ path[depth].p_hdr->eh_max);
+ err = -EIO;
+ goto cleanup;
}
+ /* start copy from next extent */
+ m = EXT_MAX_EXTENT(path[depth].p_hdr) - path[depth].p_ext++;
+ ext4_ext_show_move(inode, path, newblock, depth);
if (m) {
- memmove(ex, path[depth].p_ext-m, sizeof(struct ext4_extent)*m);
- neh->eh_entries = cpu_to_le16(le16_to_cpu(neh->eh_entries)+m);
+ struct ext4_extent *ex;
+ ex = EXT_FIRST_EXTENT(neh);
+ memmove(ex, path[depth].p_ext, sizeof(struct ext4_extent) * m);
+ le16_add_cpu(&neh->eh_entries, m);
}
+ ext4_extent_block_csum_set(inode, neh);
set_buffer_uptodate(bh);
unlock_buffer(bh);
- err = ext4_journal_dirty_metadata(handle, bh);
+ err = ext4_handle_dirty_metadata(handle, inode, bh);
if (err)
goto cleanup;
brelse(bh);
@@ -753,8 +1114,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
err = ext4_ext_get_access(handle, inode, path + depth);
if (err)
goto cleanup;
- path[depth].p_hdr->eh_entries =
- cpu_to_le16(le16_to_cpu(path[depth].p_hdr->eh_entries)-m);
+ le16_add_cpu(&path[depth].p_hdr->eh_entries, -m);
err = ext4_ext_dirty(handle, inode, path + depth);
if (err)
goto cleanup;
@@ -763,7 +1123,11 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
/* create intermediate indexes */
k = depth - at - 1;
- BUG_ON(k < 0);
+ if (unlikely(k < 0)) {
+ EXT4_ERROR_INODE(inode, "k %d < 0!", k);
+ err = -EIO;
+ goto cleanup;
+ }
if (k)
ext_debug("create %d intermediate indices\n", k);
/* insert new index into current index block */
@@ -773,8 +1137,8 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
oldblock = newblock;
newblock = ablocks[--a];
bh = sb_getblk(inode->i_sb, newblock);
- if (!bh) {
- err = -EIO;
+ if (unlikely(!bh)) {
+ err = -ENOMEM;
goto cleanup;
}
lock_buffer(bh);
@@ -786,7 +1150,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
neh = ext_block_hdr(bh);
neh->eh_entries = cpu_to_le16(1);
neh->eh_magic = EXT4_EXT_MAGIC;
- neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode));
+ neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode, 0));
neh->eh_depth = cpu_to_le16(depth - i);
fidx = EXT_FIRST_INDEX(neh);
fidx->ei_block = border;
@@ -794,36 +1158,31 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
ext_debug("int.index at %d (block %llu): %u -> %llu\n",
i, newblock, le32_to_cpu(border), oldblock);
- /* copy indexes */
- m = 0;
- path[i].p_idx++;
+ /* move remainder of path[i] to the new index block */
+ if (unlikely(EXT_MAX_INDEX(path[i].p_hdr) !=
+ EXT_LAST_INDEX(path[i].p_hdr))) {
+ EXT4_ERROR_INODE(inode,
+ "EXT_MAX_INDEX != EXT_LAST_INDEX ee_block %d!",
+ le32_to_cpu(path[i].p_ext->ee_block));
+ err = -EIO;
+ goto cleanup;
+ }
+ /* start copy indexes */
+ m = EXT_MAX_INDEX(path[i].p_hdr) - path[i].p_idx++;
ext_debug("cur 0x%p, last 0x%p\n", path[i].p_idx,
EXT_MAX_INDEX(path[i].p_hdr));
- BUG_ON(EXT_MAX_INDEX(path[i].p_hdr) !=
- EXT_LAST_INDEX(path[i].p_hdr));
- while (path[i].p_idx <= EXT_MAX_INDEX(path[i].p_hdr)) {
- ext_debug("%d: move %d:%llu in new index %llu\n", i,
- le32_to_cpu(path[i].p_idx->ei_block),
- idx_pblock(path[i].p_idx),
- newblock);
- /*memmove(++fidx, path[i].p_idx++,
- sizeof(struct ext4_extent_idx));
- neh->eh_entries++;
- BUG_ON(neh->eh_entries > neh->eh_max);*/
- path[i].p_idx++;
- m++;
- }
+ ext4_ext_show_move(inode, path, newblock, i);
if (m) {
- memmove(++fidx, path[i].p_idx - m,
+ memmove(++fidx, path[i].p_idx,
sizeof(struct ext4_extent_idx) * m);
- neh->eh_entries =
- cpu_to_le16(le16_to_cpu(neh->eh_entries) + m);
+ le16_add_cpu(&neh->eh_entries, m);
}
+ ext4_extent_block_csum_set(inode, neh);
set_buffer_uptodate(bh);
unlock_buffer(bh);
- err = ext4_journal_dirty_metadata(handle, bh);
+ err = ext4_handle_dirty_metadata(handle, inode, bh);
if (err)
goto cleanup;
brelse(bh);
@@ -834,7 +1193,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
err = ext4_ext_get_access(handle, inode, path + i);
if (err)
goto cleanup;
- path[i].p_hdr->eh_entries = cpu_to_le16(le16_to_cpu(path[i].p_hdr->eh_entries)-m);
+ le16_add_cpu(&path[i].p_hdr->eh_entries, -m);
err = ext4_ext_dirty(handle, inode, path + i);
if (err)
goto cleanup;
@@ -859,7 +1218,8 @@ cleanup:
for (i = 0; i < depth; i++) {
if (!ablocks[i])
continue;
- ext4_free_blocks(handle, inode, ablocks[i], 1, 1);
+ ext4_free_blocks(handle, inode, NULL, ablocks[i], 1,
+ EXT4_FREE_BLOCKS_METADATA);
}
}
kfree(ablocks);
@@ -876,26 +1236,22 @@ cleanup:
* just created block
*/
static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
- struct ext4_ext_path *path,
- struct ext4_extent *newext)
+ unsigned int flags,
+ struct ext4_extent *newext)
{
- struct ext4_ext_path *curp = path;
struct ext4_extent_header *neh;
- struct ext4_extent_idx *fidx;
struct buffer_head *bh;
ext4_fsblk_t newblock;
int err = 0;
- newblock = ext4_ext_new_block(handle, inode, path, newext, &err);
+ newblock = ext4_ext_new_meta_block(handle, inode, NULL,
+ newext, &err, flags);
if (newblock == 0)
return err;
bh = sb_getblk(inode->i_sb, newblock);
- if (!bh) {
- err = -EIO;
- ext4_std_error(inode->i_sb, err);
- return err;
- }
+ if (unlikely(!bh))
+ return -ENOMEM;
lock_buffer(bh);
err = ext4_journal_get_create_access(handle, bh);
@@ -905,50 +1261,43 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
}
/* move top-level index/leaf into new block */
- memmove(bh->b_data, curp->p_hdr, sizeof(EXT4_I(inode)->i_data));
+ memmove(bh->b_data, EXT4_I(inode)->i_data,
+ sizeof(EXT4_I(inode)->i_data));
/* set size of new block */
neh = ext_block_hdr(bh);
/* old root could have indexes or leaves
* so calculate e_max right way */
if (ext_depth(inode))
- neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode));
+ neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode, 0));
else
- neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode));
+ neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0));
neh->eh_magic = EXT4_EXT_MAGIC;
+ ext4_extent_block_csum_set(inode, neh);
set_buffer_uptodate(bh);
unlock_buffer(bh);
- err = ext4_journal_dirty_metadata(handle, bh);
+ err = ext4_handle_dirty_metadata(handle, inode, bh);
if (err)
goto out;
- /* create index in new top-level index: num,max,pointer */
- err = ext4_ext_get_access(handle, inode, curp);
- if (err)
- goto out;
-
- curp->p_hdr->eh_magic = EXT4_EXT_MAGIC;
- curp->p_hdr->eh_max = cpu_to_le16(ext4_ext_space_root_idx(inode));
- curp->p_hdr->eh_entries = cpu_to_le16(1);
- curp->p_idx = EXT_FIRST_INDEX(curp->p_hdr);
-
- if (path[0].p_hdr->eh_depth)
- curp->p_idx->ei_block =
- EXT_FIRST_INDEX(path[0].p_hdr)->ei_block;
- else
- curp->p_idx->ei_block =
- EXT_FIRST_EXTENT(path[0].p_hdr)->ee_block;
- ext4_idx_store_pblock(curp->p_idx, newblock);
-
+ /* Update top-level index: num,max,pointer */
neh = ext_inode_hdr(inode);
- fidx = EXT_FIRST_INDEX(neh);
+ neh->eh_entries = cpu_to_le16(1);
+ ext4_idx_store_pblock(EXT_FIRST_INDEX(neh), newblock);
+ if (neh->eh_depth == 0) {
+ /* Root extent block becomes index block */
+ neh->eh_max = cpu_to_le16(ext4_ext_space_root_idx(inode, 0));
+ EXT_FIRST_INDEX(neh)->ei_block =
+ EXT_FIRST_EXTENT(neh)->ee_block;
+ }
ext_debug("new root: num %d(%d), lblock %d, ptr %llu\n",
le16_to_cpu(neh->eh_entries), le16_to_cpu(neh->eh_max),
- le32_to_cpu(fidx->ei_block), idx_pblock(fidx));
+ le32_to_cpu(EXT_FIRST_INDEX(neh)->ei_block),
+ ext4_idx_pblock(EXT_FIRST_INDEX(neh)));
- neh->eh_depth = cpu_to_le16(path->p_depth + 1);
- err = ext4_ext_dirty(handle, inode, curp);
+ le16_add_cpu(&neh->eh_depth, 1);
+ ext4_mark_inode_dirty(handle, inode);
out:
brelse(bh);
@@ -961,8 +1310,10 @@ out:
* if no free index is found, then it requests in-depth growing.
*/
static int ext4_ext_create_new_leaf(handle_t *handle, struct inode *inode,
- struct ext4_ext_path *path,
- struct ext4_extent *newext)
+ unsigned int mb_flags,
+ unsigned int gb_flags,
+ struct ext4_ext_path *path,
+ struct ext4_extent *newext)
{
struct ext4_ext_path *curp;
int depth, i, err = 0;
@@ -982,18 +1333,20 @@ repeat:
if (EXT_HAS_FREE_INDEX(curp)) {
/* if we found index with free entry, then use that
* entry: create all needed subtree and add new leaf */
- err = ext4_ext_split(handle, inode, path, newext, i);
+ err = ext4_ext_split(handle, inode, mb_flags, path, newext, i);
+ if (err)
+ goto out;
/* refill path */
ext4_ext_drop_refs(path);
path = ext4_ext_find_extent(inode,
(ext4_lblk_t)le32_to_cpu(newext->ee_block),
- path);
+ path, gb_flags);
if (IS_ERR(path))
err = PTR_ERR(path);
} else {
/* tree is full, time to grow in depth */
- err = ext4_ext_grow_indepth(handle, inode, path, newext);
+ err = ext4_ext_grow_indepth(handle, inode, mb_flags, newext);
if (err)
goto out;
@@ -1001,7 +1354,7 @@ repeat:
ext4_ext_drop_refs(path);
path = ext4_ext_find_extent(inode,
(ext4_lblk_t)le32_to_cpu(newext->ee_block),
- path);
+ path, gb_flags);
if (IS_ERR(path)) {
err = PTR_ERR(path);
goto out;
@@ -1029,15 +1382,18 @@ out:
* returns 0 at @phys
* return value contains 0 (success) or error code
*/
-int
-ext4_ext_search_left(struct inode *inode, struct ext4_ext_path *path,
- ext4_lblk_t *logical, ext4_fsblk_t *phys)
+static int ext4_ext_search_left(struct inode *inode,
+ struct ext4_ext_path *path,
+ ext4_lblk_t *logical, ext4_fsblk_t *phys)
{
struct ext4_extent_idx *ix;
struct ext4_extent *ex;
int depth, ee_len;
- BUG_ON(path == NULL);
+ if (unlikely(path == NULL)) {
+ EXT4_ERROR_INODE(inode, "path == NULL *logical %d!", *logical);
+ return -EIO;
+ }
depth = path->p_depth;
*phys = 0;
@@ -1051,40 +1407,63 @@ ext4_ext_search_left(struct inode *inode, struct ext4_ext_path *path,
ex = path[depth].p_ext;
ee_len = ext4_ext_get_actual_len(ex);
if (*logical < le32_to_cpu(ex->ee_block)) {
- BUG_ON(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex);
+ if (unlikely(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex)) {
+ EXT4_ERROR_INODE(inode,
+ "EXT_FIRST_EXTENT != ex *logical %d ee_block %d!",
+ *logical, le32_to_cpu(ex->ee_block));
+ return -EIO;
+ }
while (--depth >= 0) {
ix = path[depth].p_idx;
- BUG_ON(ix != EXT_FIRST_INDEX(path[depth].p_hdr));
+ if (unlikely(ix != EXT_FIRST_INDEX(path[depth].p_hdr))) {
+ EXT4_ERROR_INODE(inode,
+ "ix (%d) != EXT_FIRST_INDEX (%d) (depth %d)!",
+ ix != NULL ? le32_to_cpu(ix->ei_block) : 0,
+ EXT_FIRST_INDEX(path[depth].p_hdr) != NULL ?
+ le32_to_cpu(EXT_FIRST_INDEX(path[depth].p_hdr)->ei_block) : 0,
+ depth);
+ return -EIO;
+ }
}
return 0;
}
- BUG_ON(*logical < (le32_to_cpu(ex->ee_block) + ee_len));
+ if (unlikely(*logical < (le32_to_cpu(ex->ee_block) + ee_len))) {
+ EXT4_ERROR_INODE(inode,
+ "logical %d < ee_block %d + ee_len %d!",
+ *logical, le32_to_cpu(ex->ee_block), ee_len);
+ return -EIO;
+ }
*logical = le32_to_cpu(ex->ee_block) + ee_len - 1;
- *phys = ext_pblock(ex) + ee_len - 1;
+ *phys = ext4_ext_pblock(ex) + ee_len - 1;
return 0;
}
/*
* search the closest allocated block to the right for *logical
* and returns it at @logical + it's physical address at @phys
- * if *logical is the smallest allocated block, the function
+ * if *logical is the largest allocated block, the function
* returns 0 at @phys
* return value contains 0 (success) or error code
*/
-int
-ext4_ext_search_right(struct inode *inode, struct ext4_ext_path *path,
- ext4_lblk_t *logical, ext4_fsblk_t *phys)
+static int ext4_ext_search_right(struct inode *inode,
+ struct ext4_ext_path *path,
+ ext4_lblk_t *logical, ext4_fsblk_t *phys,
+ struct ext4_extent **ret_ex)
{
struct buffer_head *bh = NULL;
struct ext4_extent_header *eh;
struct ext4_extent_idx *ix;
struct ext4_extent *ex;
ext4_fsblk_t block;
- int depth, ee_len;
+ int depth; /* Note, NOT eh_depth; depth from top of tree */
+ int ee_len;
- BUG_ON(path == NULL);
+ if (unlikely(path == NULL)) {
+ EXT4_ERROR_INODE(inode, "path == NULL *logical %d!", *logical);
+ return -EIO;
+ }
depth = path->p_depth;
*phys = 0;
@@ -1098,77 +1477,82 @@ ext4_ext_search_right(struct inode *inode, struct ext4_ext_path *path,
ex = path[depth].p_ext;
ee_len = ext4_ext_get_actual_len(ex);
if (*logical < le32_to_cpu(ex->ee_block)) {
- BUG_ON(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex);
+ if (unlikely(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex)) {
+ EXT4_ERROR_INODE(inode,
+ "first_extent(path[%d].p_hdr) != ex",
+ depth);
+ return -EIO;
+ }
while (--depth >= 0) {
ix = path[depth].p_idx;
- BUG_ON(ix != EXT_FIRST_INDEX(path[depth].p_hdr));
+ if (unlikely(ix != EXT_FIRST_INDEX(path[depth].p_hdr))) {
+ EXT4_ERROR_INODE(inode,
+ "ix != EXT_FIRST_INDEX *logical %d!",
+ *logical);
+ return -EIO;
+ }
}
- *logical = le32_to_cpu(ex->ee_block);
- *phys = ext_pblock(ex);
- return 0;
+ goto found_extent;
}
- BUG_ON(*logical < (le32_to_cpu(ex->ee_block) + ee_len));
+ if (unlikely(*logical < (le32_to_cpu(ex->ee_block) + ee_len))) {
+ EXT4_ERROR_INODE(inode,
+ "logical %d < ee_block %d + ee_len %d!",
+ *logical, le32_to_cpu(ex->ee_block), ee_len);
+ return -EIO;
+ }
if (ex != EXT_LAST_EXTENT(path[depth].p_hdr)) {
/* next allocated block in this leaf */
ex++;
- *logical = le32_to_cpu(ex->ee_block);
- *phys = ext_pblock(ex);
- return 0;
+ goto found_extent;
}
/* go up and search for index to the right */
while (--depth >= 0) {
ix = path[depth].p_idx;
if (ix != EXT_LAST_INDEX(path[depth].p_hdr))
- break;
+ goto got_index;
}
- if (depth < 0) {
- /* we've gone up to the root and
- * found no index to the right */
- return 0;
- }
+ /* we've gone up to the root and found no index to the right */
+ return 0;
+got_index:
/* we've found index to the right, let's
* follow it and find the closest allocated
* block to the right */
ix++;
- block = idx_pblock(ix);
+ block = ext4_idx_pblock(ix);
while (++depth < path->p_depth) {
- bh = sb_bread(inode->i_sb, block);
- if (bh == NULL)
- return -EIO;
+ /* subtract from p_depth to get proper eh_depth */
+ bh = read_extent_tree_block(inode, block,
+ path->p_depth - depth, 0);
+ if (IS_ERR(bh))
+ return PTR_ERR(bh);
eh = ext_block_hdr(bh);
- if (ext4_ext_check_header(inode, eh, depth)) {
- put_bh(bh);
- return -EIO;
- }
ix = EXT_FIRST_INDEX(eh);
- block = idx_pblock(ix);
+ block = ext4_idx_pblock(ix);
put_bh(bh);
}
- bh = sb_bread(inode->i_sb, block);
- if (bh == NULL)
- return -EIO;
+ bh = read_extent_tree_block(inode, block, path->p_depth - depth, 0);
+ if (IS_ERR(bh))
+ return PTR_ERR(bh);
eh = ext_block_hdr(bh);
- if (ext4_ext_check_header(inode, eh, path->p_depth - depth)) {
- put_bh(bh);
- return -EIO;
- }
ex = EXT_FIRST_EXTENT(eh);
+found_extent:
*logical = le32_to_cpu(ex->ee_block);
- *phys = ext_pblock(ex);
- put_bh(bh);
+ *phys = ext4_ext_pblock(ex);
+ *ret_ex = ex;
+ if (bh)
+ put_bh(bh);
return 0;
-
}
/*
* ext4_ext_next_allocated_block:
- * returns allocated block in subsequent extent or EXT_MAX_BLOCK.
+ * returns allocated block in subsequent extent or EXT_MAX_BLOCKS.
* NOTE: it considers block number from index entry as
* allocated block. Thus, index entries have to be consistent
* with leaves.
@@ -1182,12 +1566,13 @@ ext4_ext_next_allocated_block(struct ext4_ext_path *path)
depth = path->p_depth;
if (depth == 0 && path->p_ext == NULL)
- return EXT_MAX_BLOCK;
+ return EXT_MAX_BLOCKS;
while (depth >= 0) {
if (depth == path->p_depth) {
/* leaf */
- if (path[depth].p_ext !=
+ if (path[depth].p_ext &&
+ path[depth].p_ext !=
EXT_LAST_EXTENT(path[depth].p_hdr))
return le32_to_cpu(path[depth].p_ext[1].ee_block);
} else {
@@ -1199,15 +1584,14 @@ ext4_ext_next_allocated_block(struct ext4_ext_path *path)
depth--;
}
- return EXT_MAX_BLOCK;
+ return EXT_MAX_BLOCKS;
}
/*
* ext4_ext_next_leaf_block:
- * returns first allocated block from next leaf or EXT_MAX_BLOCK
+ * returns first allocated block from next leaf or EXT_MAX_BLOCKS
*/
-static ext4_lblk_t ext4_ext_next_leaf_block(struct inode *inode,
- struct ext4_ext_path *path)
+static ext4_lblk_t ext4_ext_next_leaf_block(struct ext4_ext_path *path)
{
int depth;
@@ -1216,7 +1600,7 @@ static ext4_lblk_t ext4_ext_next_leaf_block(struct inode *inode,
/* zero-tree has no leaf blocks at all */
if (depth == 0)
- return EXT_MAX_BLOCK;
+ return EXT_MAX_BLOCKS;
/* go to index block */
depth--;
@@ -1229,7 +1613,7 @@ static ext4_lblk_t ext4_ext_next_leaf_block(struct inode *inode,
depth--;
}
- return EXT_MAX_BLOCK;
+ return EXT_MAX_BLOCKS;
}
/*
@@ -1249,8 +1633,12 @@ static int ext4_ext_correct_indexes(handle_t *handle, struct inode *inode,
eh = path[depth].p_hdr;
ex = path[depth].p_ext;
- BUG_ON(ex == NULL);
- BUG_ON(eh == NULL);
+
+ if (unlikely(ex == NULL || eh == NULL)) {
+ EXT4_ERROR_INODE(inode,
+ "ex %p == NULL or eh %p == NULL", ex, eh);
+ return -EIO;
+ }
if (depth == 0) {
/* there is no tree at all */
@@ -1291,24 +1679,21 @@ static int ext4_ext_correct_indexes(handle_t *handle, struct inode *inode,
return err;
}
-static int
+int
ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
struct ext4_extent *ex2)
{
- unsigned short ext1_ee_len, ext2_ee_len, max_len;
+ unsigned short ext1_ee_len, ext2_ee_len;
/*
- * Make sure that either both extents are uninitialized, or
- * both are _not_.
+ * Make sure that both extents are initialized. We don't merge
+ * unwritten extents so that we can be sure that end_io code has
+ * the extent that was written properly split out and conversion to
+ * initialized is trivial.
*/
- if (ext4_ext_is_uninitialized(ex1) ^ ext4_ext_is_uninitialized(ex2))
+ if (ext4_ext_is_unwritten(ex1) != ext4_ext_is_unwritten(ex2))
return 0;
- if (ext4_ext_is_uninitialized(ex1))
- max_len = EXT_UNINIT_MAX_LEN;
- else
- max_len = EXT_INIT_MAX_LEN;
-
ext1_ee_len = ext4_ext_get_actual_len(ex1);
ext2_ee_len = ext4_ext_get_actual_len(ex2);
@@ -1321,14 +1706,19 @@ ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
* as an RO_COMPAT feature, refuse to merge to extents if
* this can result in the top bit of ee_len being set.
*/
- if (ext1_ee_len + ext2_ee_len > max_len)
+ if (ext1_ee_len + ext2_ee_len > EXT_INIT_MAX_LEN)
+ return 0;
+ if (ext4_ext_is_unwritten(ex1) &&
+ (ext4_test_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN) ||
+ atomic_read(&EXT4_I(inode)->i_unwritten) ||
+ (ext1_ee_len + ext2_ee_len > EXT_UNWRITTEN_MAX_LEN)))
return 0;
#ifdef AGGRESSIVE_TEST
if (ext1_ee_len >= 4)
return 0;
#endif
- if (ext_pblock(ex1) + ext1_ee_len == ext_pblock(ex2))
+ if (ext4_ext_pblock(ex1) + ext1_ee_len == ext4_ext_pblock(ex2))
return 1;
return 0;
}
@@ -1340,14 +1730,13 @@ ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
* Returns 0 if the extents (ex and ex+1) were _not_ merged and returns
* 1 if they got merged.
*/
-int ext4_ext_try_to_merge(struct inode *inode,
- struct ext4_ext_path *path,
- struct ext4_extent *ex)
+static int ext4_ext_try_to_merge_right(struct inode *inode,
+ struct ext4_ext_path *path,
+ struct ext4_extent *ex)
{
struct ext4_extent_header *eh;
unsigned int depth, len;
- int merge_done = 0;
- int uninitialized = 0;
+ int merge_done = 0, unwritten;
depth = ext_depth(inode);
BUG_ON(path[depth].p_hdr == NULL);
@@ -1357,30 +1746,98 @@ int ext4_ext_try_to_merge(struct inode *inode,
if (!ext4_can_extents_be_merged(inode, ex, ex + 1))
break;
/* merge with next extent! */
- if (ext4_ext_is_uninitialized(ex))
- uninitialized = 1;
+ unwritten = ext4_ext_is_unwritten(ex);
ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
+ ext4_ext_get_actual_len(ex + 1));
- if (uninitialized)
- ext4_ext_mark_uninitialized(ex);
+ if (unwritten)
+ ext4_ext_mark_unwritten(ex);
if (ex + 1 < EXT_LAST_EXTENT(eh)) {
len = (EXT_LAST_EXTENT(eh) - ex - 1)
* sizeof(struct ext4_extent);
memmove(ex + 1, ex + 2, len);
}
- eh->eh_entries = cpu_to_le16(le16_to_cpu(eh->eh_entries) - 1);
+ le16_add_cpu(&eh->eh_entries, -1);
merge_done = 1;
WARN_ON(eh->eh_entries == 0);
if (!eh->eh_entries)
- ext4_error(inode->i_sb, "ext4_ext_try_to_merge",
- "inode#%lu, eh->eh_entries = 0!", inode->i_ino);
+ EXT4_ERROR_INODE(inode, "eh->eh_entries = 0!");
}
return merge_done;
}
/*
+ * This function does a very simple check to see if we can collapse
+ * an extent tree with a single extent tree leaf block into the inode.
+ */
+static void ext4_ext_try_to_merge_up(handle_t *handle,
+ struct inode *inode,
+ struct ext4_ext_path *path)
+{
+ size_t s;
+ unsigned max_root = ext4_ext_space_root(inode, 0);
+ ext4_fsblk_t blk;
+
+ if ((path[0].p_depth != 1) ||
+ (le16_to_cpu(path[0].p_hdr->eh_entries) != 1) ||
+ (le16_to_cpu(path[1].p_hdr->eh_entries) > max_root))
+ return;
+
+ /*
+ * We need to modify the block allocation bitmap and the block
+ * group descriptor to release the extent tree block. If we
+ * can't get the journal credits, give up.
+ */
+ if (ext4_journal_extend(handle, 2))
+ return;
+
+ /*
+ * Copy the extent data up to the inode
+ */
+ blk = ext4_idx_pblock(path[0].p_idx);
+ s = le16_to_cpu(path[1].p_hdr->eh_entries) *
+ sizeof(struct ext4_extent_idx);
+ s += sizeof(struct ext4_extent_header);
+
+ memcpy(path[0].p_hdr, path[1].p_hdr, s);
+ path[0].p_depth = 0;
+ path[0].p_ext = EXT_FIRST_EXTENT(path[0].p_hdr) +
+ (path[1].p_ext - EXT_FIRST_EXTENT(path[1].p_hdr));
+ path[0].p_hdr->eh_max = cpu_to_le16(max_root);
+
+ brelse(path[1].p_bh);
+ ext4_free_blocks(handle, inode, NULL, blk, 1,
+ EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET |
+ EXT4_FREE_BLOCKS_RESERVE);
+}
+
+/*
+ * This function tries to merge the @ex extent to neighbours in the tree.
+ * return 1 if merge left else 0.
+ */
+static void ext4_ext_try_to_merge(handle_t *handle,
+ struct inode *inode,
+ struct ext4_ext_path *path,
+ struct ext4_extent *ex) {
+ struct ext4_extent_header *eh;
+ unsigned int depth;
+ int merge_done = 0;
+
+ depth = ext_depth(inode);
+ BUG_ON(path[depth].p_hdr == NULL);
+ eh = path[depth].p_hdr;
+
+ if (ex > EXT_FIRST_EXTENT(eh))
+ merge_done = ext4_ext_try_to_merge_right(inode, path, ex - 1);
+
+ if (!merge_done)
+ (void) ext4_ext_try_to_merge_right(inode, path, ex);
+
+ ext4_ext_try_to_merge_up(handle, inode, path);
+}
+
+/*
* check if a portion of the "newext" extent overlaps with an
* existing extent.
*
@@ -1388,9 +1845,10 @@ int ext4_ext_try_to_merge(struct inode *inode,
* such that there will be no overlap, and then returns 1.
* If there is no overlap found, it returns 0.
*/
-unsigned int ext4_ext_check_overlap(struct inode *inode,
- struct ext4_extent *newext,
- struct ext4_ext_path *path)
+static unsigned int ext4_ext_check_overlap(struct ext4_sb_info *sbi,
+ struct inode *inode,
+ struct ext4_extent *newext,
+ struct ext4_ext_path *path)
{
ext4_lblk_t b1, b2;
unsigned int depth, len1;
@@ -1401,21 +1859,22 @@ unsigned int ext4_ext_check_overlap(struct inode *inode,
depth = ext_depth(inode);
if (!path[depth].p_ext)
goto out;
- b2 = le32_to_cpu(path[depth].p_ext->ee_block);
+ b2 = EXT4_LBLK_CMASK(sbi, le32_to_cpu(path[depth].p_ext->ee_block));
/*
* get the next allocated block if the extent in the path
- * is before the requested block(s)
+ * is before the requested block(s)
*/
if (b2 < b1) {
b2 = ext4_ext_next_allocated_block(path);
- if (b2 == EXT_MAX_BLOCK)
+ if (b2 == EXT_MAX_BLOCKS)
goto out;
+ b2 = EXT4_LBLK_CMASK(sbi, b2);
}
/* check for wrap through zero on extent logical start block*/
if (b1 + len1 < b1) {
- len1 = EXT_MAX_BLOCK - b1;
+ len1 = EXT_MAX_BLOCKS - b1;
newext->ee_len = cpu_to_le16(len1);
ret = 1;
}
@@ -1437,48 +1896,104 @@ out:
*/
int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
struct ext4_ext_path *path,
- struct ext4_extent *newext)
+ struct ext4_extent *newext, int gb_flags)
{
- struct ext4_extent_header * eh;
+ struct ext4_extent_header *eh;
struct ext4_extent *ex, *fex;
struct ext4_extent *nearex; /* nearest extent */
struct ext4_ext_path *npath = NULL;
int depth, len, err;
ext4_lblk_t next;
- unsigned uninitialized = 0;
+ int mb_flags = 0, unwritten;
- BUG_ON(ext4_ext_get_actual_len(newext) == 0);
+ if (unlikely(ext4_ext_get_actual_len(newext) == 0)) {
+ EXT4_ERROR_INODE(inode, "ext4_ext_get_actual_len(newext) == 0");
+ return -EIO;
+ }
depth = ext_depth(inode);
ex = path[depth].p_ext;
- BUG_ON(path[depth].p_hdr == NULL);
+ eh = path[depth].p_hdr;
+ if (unlikely(path[depth].p_hdr == NULL)) {
+ EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
+ return -EIO;
+ }
/* try to insert block into found extent and return */
- if (ex && ext4_can_extents_be_merged(inode, ex, newext)) {
- ext_debug("append %d block to %d:%d (from %llu)\n",
- ext4_ext_get_actual_len(newext),
- le32_to_cpu(ex->ee_block),
- ext4_ext_get_actual_len(ex), ext_pblock(ex));
- err = ext4_ext_get_access(handle, inode, path + depth);
- if (err)
- return err;
+ if (ex && !(gb_flags & EXT4_GET_BLOCKS_PRE_IO)) {
/*
- * ext4_can_extents_be_merged should have checked that either
- * both extents are uninitialized, or both aren't. Thus we
- * need to check only one of them here.
+ * Try to see whether we should rather test the extent on
+ * right from ex, or from the left of ex. This is because
+ * ext4_ext_find_extent() can return either extent on the
+ * left, or on the right from the searched position. This
+ * will make merging more effective.
*/
- if (ext4_ext_is_uninitialized(ex))
- uninitialized = 1;
- ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
+ if (ex < EXT_LAST_EXTENT(eh) &&
+ (le32_to_cpu(ex->ee_block) +
+ ext4_ext_get_actual_len(ex) <
+ le32_to_cpu(newext->ee_block))) {
+ ex += 1;
+ goto prepend;
+ } else if ((ex > EXT_FIRST_EXTENT(eh)) &&
+ (le32_to_cpu(newext->ee_block) +
+ ext4_ext_get_actual_len(newext) <
+ le32_to_cpu(ex->ee_block)))
+ ex -= 1;
+
+ /* Try to append newex to the ex */
+ if (ext4_can_extents_be_merged(inode, ex, newext)) {
+ ext_debug("append [%d]%d block to %u:[%d]%d"
+ "(from %llu)\n",
+ ext4_ext_is_unwritten(newext),
+ ext4_ext_get_actual_len(newext),
+ le32_to_cpu(ex->ee_block),
+ ext4_ext_is_unwritten(ex),
+ ext4_ext_get_actual_len(ex),
+ ext4_ext_pblock(ex));
+ err = ext4_ext_get_access(handle, inode,
+ path + depth);
+ if (err)
+ return err;
+ unwritten = ext4_ext_is_unwritten(ex);
+ ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
+ ext4_ext_get_actual_len(newext));
- if (uninitialized)
- ext4_ext_mark_uninitialized(ex);
- eh = path[depth].p_hdr;
- nearex = ex;
- goto merge;
+ if (unwritten)
+ ext4_ext_mark_unwritten(ex);
+ eh = path[depth].p_hdr;
+ nearex = ex;
+ goto merge;
+ }
+
+prepend:
+ /* Try to prepend newex to the ex */
+ if (ext4_can_extents_be_merged(inode, newext, ex)) {
+ ext_debug("prepend %u[%d]%d block to %u:[%d]%d"
+ "(from %llu)\n",
+ le32_to_cpu(newext->ee_block),
+ ext4_ext_is_unwritten(newext),
+ ext4_ext_get_actual_len(newext),
+ le32_to_cpu(ex->ee_block),
+ ext4_ext_is_unwritten(ex),
+ ext4_ext_get_actual_len(ex),
+ ext4_ext_pblock(ex));
+ err = ext4_ext_get_access(handle, inode,
+ path + depth);
+ if (err)
+ return err;
+
+ unwritten = ext4_ext_is_unwritten(ex);
+ ex->ee_block = newext->ee_block;
+ ext4_ext_store_pblock(ex, ext4_ext_pblock(newext));
+ ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
+ + ext4_ext_get_actual_len(newext));
+ if (unwritten)
+ ext4_ext_mark_unwritten(ex);
+ eh = path[depth].p_hdr;
+ nearex = ex;
+ goto merge;
+ }
}
-repeat:
depth = ext_depth(inode);
eh = path[depth].p_hdr;
if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max))
@@ -1486,21 +2001,22 @@ repeat:
/* probably next leaf has space for us? */
fex = EXT_LAST_EXTENT(eh);
- next = ext4_ext_next_leaf_block(inode, path);
- if (le32_to_cpu(newext->ee_block) > le32_to_cpu(fex->ee_block)
- && next != EXT_MAX_BLOCK) {
- ext_debug("next leaf block - %d\n", next);
+ next = EXT_MAX_BLOCKS;
+ if (le32_to_cpu(newext->ee_block) > le32_to_cpu(fex->ee_block))
+ next = ext4_ext_next_leaf_block(path);
+ if (next != EXT_MAX_BLOCKS) {
+ ext_debug("next leaf block - %u\n", next);
BUG_ON(npath != NULL);
- npath = ext4_ext_find_extent(inode, next, NULL);
+ npath = ext4_ext_find_extent(inode, next, NULL, 0);
if (IS_ERR(npath))
return PTR_ERR(npath);
BUG_ON(npath->p_depth != path->p_depth);
eh = npath[depth].p_hdr;
if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max)) {
- ext_debug("next leaf isnt full(%d)\n",
+ ext_debug("next leaf isn't full(%d)\n",
le16_to_cpu(eh->eh_entries));
path = npath;
- goto repeat;
+ goto has_space;
}
ext_debug("next leaf has no free space(%d,%d)\n",
le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max));
@@ -1510,7 +2026,10 @@ repeat:
* There is no free space in the found leaf.
* We're gonna add a new leaf in the tree.
*/
- err = ext4_ext_create_new_leaf(handle, inode, path, newext);
+ if (gb_flags & EXT4_GET_BLOCKS_METADATA_NOFAIL)
+ mb_flags = EXT4_MB_USE_RESERVED;
+ err = ext4_ext_create_new_leaf(handle, inode, mb_flags, gb_flags,
+ path, newext);
if (err)
goto cleanup;
depth = ext_depth(inode);
@@ -1525,81 +2044,233 @@ has_space:
if (!nearex) {
/* there is no extent in this leaf, create first one */
- ext_debug("first extent in the leaf: %d:%llu:%d\n",
+ ext_debug("first extent in the leaf: %u:%llu:[%d]%d\n",
le32_to_cpu(newext->ee_block),
- ext_pblock(newext),
+ ext4_ext_pblock(newext),
+ ext4_ext_is_unwritten(newext),
ext4_ext_get_actual_len(newext));
- path[depth].p_ext = EXT_FIRST_EXTENT(eh);
- } else if (le32_to_cpu(newext->ee_block)
+ nearex = EXT_FIRST_EXTENT(eh);
+ } else {
+ if (le32_to_cpu(newext->ee_block)
> le32_to_cpu(nearex->ee_block)) {
-/* BUG_ON(newext->ee_block == nearex->ee_block); */
- if (nearex != EXT_LAST_EXTENT(eh)) {
- len = EXT_MAX_EXTENT(eh) - nearex;
- len = (len - 1) * sizeof(struct ext4_extent);
- len = len < 0 ? 0 : len;
- ext_debug("insert %d:%llu:%d after: nearest 0x%p, "
- "move %d from 0x%p to 0x%p\n",
+ /* Insert after */
+ ext_debug("insert %u:%llu:[%d]%d before: "
+ "nearest %p\n",
+ le32_to_cpu(newext->ee_block),
+ ext4_ext_pblock(newext),
+ ext4_ext_is_unwritten(newext),
+ ext4_ext_get_actual_len(newext),
+ nearex);
+ nearex++;
+ } else {
+ /* Insert before */
+ BUG_ON(newext->ee_block == nearex->ee_block);
+ ext_debug("insert %u:%llu:[%d]%d after: "
+ "nearest %p\n",
le32_to_cpu(newext->ee_block),
- ext_pblock(newext),
+ ext4_ext_pblock(newext),
+ ext4_ext_is_unwritten(newext),
ext4_ext_get_actual_len(newext),
- nearex, len, nearex + 1, nearex + 2);
- memmove(nearex + 2, nearex + 1, len);
+ nearex);
+ }
+ len = EXT_LAST_EXTENT(eh) - nearex + 1;
+ if (len > 0) {
+ ext_debug("insert %u:%llu:[%d]%d: "
+ "move %d extents from 0x%p to 0x%p\n",
+ le32_to_cpu(newext->ee_block),
+ ext4_ext_pblock(newext),
+ ext4_ext_is_unwritten(newext),
+ ext4_ext_get_actual_len(newext),
+ len, nearex, nearex + 1);
+ memmove(nearex + 1, nearex,
+ len * sizeof(struct ext4_extent));
}
- path[depth].p_ext = nearex + 1;
- } else {
- BUG_ON(newext->ee_block == nearex->ee_block);
- len = (EXT_MAX_EXTENT(eh) - nearex) * sizeof(struct ext4_extent);
- len = len < 0 ? 0 : len;
- ext_debug("insert %d:%llu:%d before: nearest 0x%p, "
- "move %d from 0x%p to 0x%p\n",
- le32_to_cpu(newext->ee_block),
- ext_pblock(newext),
- ext4_ext_get_actual_len(newext),
- nearex, len, nearex + 1, nearex + 2);
- memmove(nearex + 1, nearex, len);
- path[depth].p_ext = nearex;
}
- eh->eh_entries = cpu_to_le16(le16_to_cpu(eh->eh_entries)+1);
- nearex = path[depth].p_ext;
+ le16_add_cpu(&eh->eh_entries, 1);
+ path[depth].p_ext = nearex;
nearex->ee_block = newext->ee_block;
- ext4_ext_store_pblock(nearex, ext_pblock(newext));
+ ext4_ext_store_pblock(nearex, ext4_ext_pblock(newext));
nearex->ee_len = newext->ee_len;
merge:
- /* try to merge extents to the right */
- ext4_ext_try_to_merge(inode, path, nearex);
+ /* try to merge extents */
+ if (!(gb_flags & EXT4_GET_BLOCKS_PRE_IO))
+ ext4_ext_try_to_merge(handle, inode, path, nearex);
- /* try to merge extents to the left */
/* time to correct all indexes above */
err = ext4_ext_correct_indexes(handle, inode, path);
if (err)
goto cleanup;
- err = ext4_ext_dirty(handle, inode, path + depth);
+ err = ext4_ext_dirty(handle, inode, path + path->p_depth);
cleanup:
if (npath) {
ext4_ext_drop_refs(npath);
kfree(npath);
}
- ext4_ext_tree_changed(inode);
- ext4_ext_invalidate_cache(inode);
return err;
}
-static void
-ext4_ext_put_in_cache(struct inode *inode, ext4_lblk_t block,
- __u32 len, ext4_fsblk_t start, int type)
+static int ext4_fill_fiemap_extents(struct inode *inode,
+ ext4_lblk_t block, ext4_lblk_t num,
+ struct fiemap_extent_info *fieinfo)
{
- struct ext4_ext_cache *cex;
- BUG_ON(len == 0);
- cex = &EXT4_I(inode)->i_cached_extent;
- cex->ec_type = type;
- cex->ec_block = block;
- cex->ec_len = len;
- cex->ec_start = start;
+ struct ext4_ext_path *path = NULL;
+ struct ext4_extent *ex;
+ struct extent_status es;
+ ext4_lblk_t next, next_del, start = 0, end = 0;
+ ext4_lblk_t last = block + num;
+ int exists, depth = 0, err = 0;
+ unsigned int flags = 0;
+ unsigned char blksize_bits = inode->i_sb->s_blocksize_bits;
+
+ while (block < last && block != EXT_MAX_BLOCKS) {
+ num = last - block;
+ /* find extent for this block */
+ down_read(&EXT4_I(inode)->i_data_sem);
+
+ if (path && ext_depth(inode) != depth) {
+ /* depth was changed. we have to realloc path */
+ kfree(path);
+ path = NULL;
+ }
+
+ path = ext4_ext_find_extent(inode, block, path, 0);
+ if (IS_ERR(path)) {
+ up_read(&EXT4_I(inode)->i_data_sem);
+ err = PTR_ERR(path);
+ path = NULL;
+ break;
+ }
+
+ depth = ext_depth(inode);
+ if (unlikely(path[depth].p_hdr == NULL)) {
+ up_read(&EXT4_I(inode)->i_data_sem);
+ EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
+ err = -EIO;
+ break;
+ }
+ ex = path[depth].p_ext;
+ next = ext4_ext_next_allocated_block(path);
+ ext4_ext_drop_refs(path);
+
+ flags = 0;
+ exists = 0;
+ if (!ex) {
+ /* there is no extent yet, so try to allocate
+ * all requested space */
+ start = block;
+ end = block + num;
+ } else if (le32_to_cpu(ex->ee_block) > block) {
+ /* need to allocate space before found extent */
+ start = block;
+ end = le32_to_cpu(ex->ee_block);
+ if (block + num < end)
+ end = block + num;
+ } else if (block >= le32_to_cpu(ex->ee_block)
+ + ext4_ext_get_actual_len(ex)) {
+ /* need to allocate space after found extent */
+ start = block;
+ end = block + num;
+ if (end >= next)
+ end = next;
+ } else if (block >= le32_to_cpu(ex->ee_block)) {
+ /*
+ * some part of requested space is covered
+ * by found extent
+ */
+ start = block;
+ end = le32_to_cpu(ex->ee_block)
+ + ext4_ext_get_actual_len(ex);
+ if (block + num < end)
+ end = block + num;
+ exists = 1;
+ } else {
+ BUG();
+ }
+ BUG_ON(end <= start);
+
+ if (!exists) {
+ es.es_lblk = start;
+ es.es_len = end - start;
+ es.es_pblk = 0;
+ } else {
+ es.es_lblk = le32_to_cpu(ex->ee_block);
+ es.es_len = ext4_ext_get_actual_len(ex);
+ es.es_pblk = ext4_ext_pblock(ex);
+ if (ext4_ext_is_unwritten(ex))
+ flags |= FIEMAP_EXTENT_UNWRITTEN;
+ }
+
+ /*
+ * Find delayed extent and update es accordingly. We call
+ * it even in !exists case to find out whether es is the
+ * last existing extent or not.
+ */
+ next_del = ext4_find_delayed_extent(inode, &es);
+ if (!exists && next_del) {
+ exists = 1;
+ flags |= (FIEMAP_EXTENT_DELALLOC |
+ FIEMAP_EXTENT_UNKNOWN);
+ }
+ up_read(&EXT4_I(inode)->i_data_sem);
+
+ if (unlikely(es.es_len == 0)) {
+ EXT4_ERROR_INODE(inode, "es.es_len == 0");
+ err = -EIO;
+ break;
+ }
+
+ /*
+ * This is possible iff next == next_del == EXT_MAX_BLOCKS.
+ * we need to check next == EXT_MAX_BLOCKS because it is
+ * possible that an extent is with unwritten and delayed
+ * status due to when an extent is delayed allocated and
+ * is allocated by fallocate status tree will track both of
+ * them in a extent.
+ *
+ * So we could return a unwritten and delayed extent, and
+ * its block is equal to 'next'.
+ */
+ if (next == next_del && next == EXT_MAX_BLOCKS) {
+ flags |= FIEMAP_EXTENT_LAST;
+ if (unlikely(next_del != EXT_MAX_BLOCKS ||
+ next != EXT_MAX_BLOCKS)) {
+ EXT4_ERROR_INODE(inode,
+ "next extent == %u, next "
+ "delalloc extent = %u",
+ next, next_del);
+ err = -EIO;
+ break;
+ }
+ }
+
+ if (exists) {
+ err = fiemap_fill_next_extent(fieinfo,
+ (__u64)es.es_lblk << blksize_bits,
+ (__u64)es.es_pblk << blksize_bits,
+ (__u64)es.es_len << blksize_bits,
+ flags);
+ if (err < 0)
+ break;
+ if (err == 1) {
+ err = 0;
+ break;
+ }
+ }
+
+ block = es.es_lblk + es.es_len;
+ }
+
+ if (path) {
+ ext4_ext_drop_refs(path);
+ kfree(path);
+ }
+
+ return err;
}
/*
@@ -1612,15 +2283,16 @@ ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path,
ext4_lblk_t block)
{
int depth = ext_depth(inode);
- unsigned long len;
- ext4_lblk_t lblock;
+ unsigned long len = 0;
+ ext4_lblk_t lblock = 0;
struct ext4_extent *ex;
ex = path[depth].p_ext;
if (ex == NULL) {
- /* there is no extent yet, so gap is [0;-] */
- lblock = 0;
- len = EXT_MAX_BLOCK;
+ /*
+ * there is no extent yet, so gap is [0;-] and we
+ * don't cache it
+ */
ext_debug("cache gap(whole file):");
} else if (block < le32_to_cpu(ex->ee_block)) {
lblock = block;
@@ -1629,6 +2301,9 @@ ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path,
block,
le32_to_cpu(ex->ee_block),
ext4_ext_get_actual_len(ex));
+ if (!ext4_find_delalloc_range(inode, lblock, lblock + len - 1))
+ ext4_es_insert_extent(inode, lblock, len, ~0,
+ EXTENT_STATUS_HOLE);
} else if (block >= le32_to_cpu(ex->ee_block)
+ ext4_ext_get_actual_len(ex)) {
ext4_lblk_t next;
@@ -1642,135 +2317,175 @@ ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path,
block);
BUG_ON(next == lblock);
len = next - lblock;
+ if (!ext4_find_delalloc_range(inode, lblock, lblock + len - 1))
+ ext4_es_insert_extent(inode, lblock, len, ~0,
+ EXTENT_STATUS_HOLE);
} else {
- lblock = len = 0;
BUG();
}
ext_debug(" -> %u:%lu\n", lblock, len);
- ext4_ext_put_in_cache(inode, lblock, len, 0, EXT4_EXT_CACHE_GAP);
-}
-
-static int
-ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block,
- struct ext4_extent *ex)
-{
- struct ext4_ext_cache *cex;
-
- cex = &EXT4_I(inode)->i_cached_extent;
-
- /* has cache valid data? */
- if (cex->ec_type == EXT4_EXT_CACHE_NO)
- return EXT4_EXT_CACHE_NO;
-
- BUG_ON(cex->ec_type != EXT4_EXT_CACHE_GAP &&
- cex->ec_type != EXT4_EXT_CACHE_EXTENT);
- if (block >= cex->ec_block && block < cex->ec_block + cex->ec_len) {
- ex->ee_block = cpu_to_le32(cex->ec_block);
- ext4_ext_store_pblock(ex, cex->ec_start);
- ex->ee_len = cpu_to_le16(cex->ec_len);
- ext_debug("%u cached by %u:%u:%llu\n",
- block,
- cex->ec_block, cex->ec_len, cex->ec_start);
- return cex->ec_type;
- }
-
- /* not in cache */
- return EXT4_EXT_CACHE_NO;
}
/*
* ext4_ext_rm_idx:
* removes index from the index block.
- * It's used in truncate case only, thus all requests are for
- * last index in the block only.
*/
static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode,
- struct ext4_ext_path *path)
+ struct ext4_ext_path *path, int depth)
{
- struct buffer_head *bh;
int err;
ext4_fsblk_t leaf;
/* free index block */
- path--;
- leaf = idx_pblock(path->p_idx);
- BUG_ON(path->p_hdr->eh_entries == 0);
+ depth--;
+ path = path + depth;
+ leaf = ext4_idx_pblock(path->p_idx);
+ if (unlikely(path->p_hdr->eh_entries == 0)) {
+ EXT4_ERROR_INODE(inode, "path->p_hdr->eh_entries == 0");
+ return -EIO;
+ }
err = ext4_ext_get_access(handle, inode, path);
if (err)
return err;
- path->p_hdr->eh_entries = cpu_to_le16(le16_to_cpu(path->p_hdr->eh_entries)-1);
+
+ if (path->p_idx != EXT_LAST_INDEX(path->p_hdr)) {
+ int len = EXT_LAST_INDEX(path->p_hdr) - path->p_idx;
+ len *= sizeof(struct ext4_extent_idx);
+ memmove(path->p_idx, path->p_idx + 1, len);
+ }
+
+ le16_add_cpu(&path->p_hdr->eh_entries, -1);
err = ext4_ext_dirty(handle, inode, path);
if (err)
return err;
ext_debug("index is empty, remove it, free block %llu\n", leaf);
- bh = sb_find_get_block(inode->i_sb, leaf);
- ext4_forget(handle, 1, inode, bh, leaf);
- ext4_free_blocks(handle, inode, leaf, 1, 1);
+ trace_ext4_ext_rm_idx(inode, leaf);
+
+ ext4_free_blocks(handle, inode, NULL, leaf, 1,
+ EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET);
+
+ while (--depth >= 0) {
+ if (path->p_idx != EXT_FIRST_INDEX(path->p_hdr))
+ break;
+ path--;
+ err = ext4_ext_get_access(handle, inode, path);
+ if (err)
+ break;
+ path->p_idx->ei_block = (path+1)->p_idx->ei_block;
+ err = ext4_ext_dirty(handle, inode, path);
+ if (err)
+ break;
+ }
return err;
}
/*
- * ext4_ext_calc_credits_for_insert:
- * This routine returns max. credits that the extent tree can consume.
- * It should be OK for low-performance paths like ->writepage()
- * To allow many writing processes to fit into a single transaction,
- * the caller should calculate credits under i_data_sem and
- * pass the actual path.
+ * ext4_ext_calc_credits_for_single_extent:
+ * This routine returns max. credits that needed to insert an extent
+ * to the extent tree.
+ * When pass the actual path, the caller should calculate credits
+ * under i_data_sem.
*/
-int ext4_ext_calc_credits_for_insert(struct inode *inode,
+int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int nrblocks,
struct ext4_ext_path *path)
{
- int depth, needed;
-
if (path) {
+ int depth = ext_depth(inode);
+ int ret = 0;
+
/* probably there is space in leaf? */
- depth = ext_depth(inode);
if (le16_to_cpu(path[depth].p_hdr->eh_entries)
- < le16_to_cpu(path[depth].p_hdr->eh_max))
- return 1;
+ < le16_to_cpu(path[depth].p_hdr->eh_max)) {
+
+ /*
+ * There are some space in the leaf tree, no
+ * need to account for leaf block credit
+ *
+ * bitmaps and block group descriptor blocks
+ * and other metadata blocks still need to be
+ * accounted.
+ */
+ /* 1 bitmap, 1 block group descriptor */
+ ret = 2 + EXT4_META_TRANS_BLOCKS(inode->i_sb);
+ return ret;
+ }
}
- /*
- * given 32-bit logical block (4294967296 blocks), max. tree
- * can be 4 levels in depth -- 4 * 340^4 == 53453440000.
- * Let's also add one more level for imbalance.
- */
- depth = 5;
+ return ext4_chunk_trans_blocks(inode, nrblocks);
+}
- /* allocation of new data block(s) */
- needed = 2;
+/*
+ * How many index/leaf blocks need to change/allocate to add @extents extents?
+ *
+ * If we add a single extent, then in the worse case, each tree level
+ * index/leaf need to be changed in case of the tree split.
+ *
+ * If more extents are inserted, they could cause the whole tree split more
+ * than once, but this is really rare.
+ */
+int ext4_ext_index_trans_blocks(struct inode *inode, int extents)
+{
+ int index;
+ int depth;
- /*
- * tree can be full, so it would need to grow in depth:
- * we need one credit to modify old root, credits for
- * new root will be added in split accounting
- */
- needed += 1;
+ /* If we are converting the inline data, only one is needed here. */
+ if (ext4_has_inline_data(inode))
+ return 1;
- /*
- * Index split can happen, we would need:
- * allocate intermediate indexes (bitmap + group)
- * + change two blocks at each level, but root (already included)
- */
- needed += (depth * 2) + (depth * 2);
+ depth = ext_depth(inode);
- /* any allocation modifies superblock */
- needed += 1;
+ if (extents <= 1)
+ index = depth * 2;
+ else
+ index = depth * 3;
- return needed;
+ return index;
+}
+
+static inline int get_default_free_blocks_flags(struct inode *inode)
+{
+ if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
+ return EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET;
+ else if (ext4_should_journal_data(inode))
+ return EXT4_FREE_BLOCKS_FORGET;
+ return 0;
}
static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
- struct ext4_extent *ex,
- ext4_lblk_t from, ext4_lblk_t to)
+ struct ext4_extent *ex,
+ long long *partial_cluster,
+ ext4_lblk_t from, ext4_lblk_t to)
{
- struct buffer_head *bh;
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
unsigned short ee_len = ext4_ext_get_actual_len(ex);
- int i, metadata = 0;
+ ext4_fsblk_t pblk;
+ int flags = get_default_free_blocks_flags(inode);
+
+ /*
+ * For bigalloc file systems, we never free a partial cluster
+ * at the beginning of the extent. Instead, we make a note
+ * that we tried freeing the cluster, and check to see if we
+ * need to free it on a subsequent call to ext4_remove_blocks,
+ * or at the end of the ext4_truncate() operation.
+ */
+ flags |= EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER;
+
+ trace_ext4_remove_blocks(inode, ex, from, to, *partial_cluster);
+ /*
+ * If we have a partial cluster, and it's different from the
+ * cluster of the last block, we need to explicitly free the
+ * partial cluster here.
+ */
+ pblk = ext4_ext_pblock(ex) + ee_len - 1;
+ if ((*partial_cluster > 0) &&
+ (EXT4_B2C(sbi, pblk) != *partial_cluster)) {
+ ext4_free_blocks(handle, inode, NULL,
+ EXT4_C2B(sbi, *partial_cluster),
+ sbi->s_cluster_ratio, flags);
+ *partial_cluster = 0;
+ }
- if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
- metadata = 1;
#ifdef EXTENTS_STATS
{
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
@@ -1790,136 +2505,236 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
&& to == le32_to_cpu(ex->ee_block) + ee_len - 1) {
/* tail removal */
ext4_lblk_t num;
- ext4_fsblk_t start;
+ unsigned int unaligned;
num = le32_to_cpu(ex->ee_block) + ee_len - from;
- start = ext_pblock(ex) + ee_len - num;
- ext_debug("free last %u blocks starting %llu\n", num, start);
- for (i = 0; i < num; i++) {
- bh = sb_find_get_block(inode->i_sb, start + i);
- ext4_forget(handle, 0, inode, bh, start + i);
- }
- ext4_free_blocks(handle, inode, start, num, metadata);
- } else if (from == le32_to_cpu(ex->ee_block)
- && to <= le32_to_cpu(ex->ee_block) + ee_len - 1) {
- printk(KERN_INFO "strange request: removal %u-%u from %u:%u\n",
- from, to, le32_to_cpu(ex->ee_block), ee_len);
- } else {
- printk(KERN_INFO "strange request: removal(2) "
- "%u-%u from %u:%u\n",
- from, to, le32_to_cpu(ex->ee_block), ee_len);
- }
+ pblk = ext4_ext_pblock(ex) + ee_len - num;
+ /*
+ * Usually we want to free partial cluster at the end of the
+ * extent, except for the situation when the cluster is still
+ * used by any other extent (partial_cluster is negative).
+ */
+ if (*partial_cluster < 0 &&
+ -(*partial_cluster) == EXT4_B2C(sbi, pblk + num - 1))
+ flags |= EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER;
+
+ ext_debug("free last %u blocks starting %llu partial %lld\n",
+ num, pblk, *partial_cluster);
+ ext4_free_blocks(handle, inode, NULL, pblk, num, flags);
+ /*
+ * If the block range to be freed didn't start at the
+ * beginning of a cluster, and we removed the entire
+ * extent and the cluster is not used by any other extent,
+ * save the partial cluster here, since we might need to
+ * delete if we determine that the truncate operation has
+ * removed all of the blocks in the cluster.
+ *
+ * On the other hand, if we did not manage to free the whole
+ * extent, we have to mark the cluster as used (store negative
+ * cluster number in partial_cluster).
+ */
+ unaligned = EXT4_PBLK_COFF(sbi, pblk);
+ if (unaligned && (ee_len == num) &&
+ (*partial_cluster != -((long long)EXT4_B2C(sbi, pblk))))
+ *partial_cluster = EXT4_B2C(sbi, pblk);
+ else if (unaligned)
+ *partial_cluster = -((long long)EXT4_B2C(sbi, pblk));
+ else if (*partial_cluster > 0)
+ *partial_cluster = 0;
+ } else
+ ext4_error(sbi->s_sb, "strange request: removal(2) "
+ "%u-%u from %u:%u\n",
+ from, to, le32_to_cpu(ex->ee_block), ee_len);
return 0;
}
+
+/*
+ * ext4_ext_rm_leaf() Removes the extents associated with the
+ * blocks appearing between "start" and "end", and splits the extents
+ * if "start" and "end" appear in the same extent
+ *
+ * @handle: The journal handle
+ * @inode: The files inode
+ * @path: The path to the leaf
+ * @partial_cluster: The cluster which we'll have to free if all extents
+ * has been released from it. It gets negative in case
+ * that the cluster is still used.
+ * @start: The first block to remove
+ * @end: The last block to remove
+ */
static int
ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
- struct ext4_ext_path *path, ext4_lblk_t start)
+ struct ext4_ext_path *path,
+ long long *partial_cluster,
+ ext4_lblk_t start, ext4_lblk_t end)
{
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
int err = 0, correct_index = 0;
int depth = ext_depth(inode), credits;
struct ext4_extent_header *eh;
- ext4_lblk_t a, b, block;
+ ext4_lblk_t a, b;
unsigned num;
ext4_lblk_t ex_ee_block;
unsigned short ex_ee_len;
- unsigned uninitialized = 0;
+ unsigned unwritten = 0;
struct ext4_extent *ex;
+ ext4_fsblk_t pblk;
/* the header must be checked already in ext4_ext_remove_space() */
- ext_debug("truncate since %u in leaf\n", start);
+ ext_debug("truncate since %u in leaf to %u\n", start, end);
if (!path[depth].p_hdr)
path[depth].p_hdr = ext_block_hdr(path[depth].p_bh);
eh = path[depth].p_hdr;
- BUG_ON(eh == NULL);
-
+ if (unlikely(path[depth].p_hdr == NULL)) {
+ EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
+ return -EIO;
+ }
/* find where to start removing */
- ex = EXT_LAST_EXTENT(eh);
+ ex = path[depth].p_ext;
+ if (!ex)
+ ex = EXT_LAST_EXTENT(eh);
ex_ee_block = le32_to_cpu(ex->ee_block);
- if (ext4_ext_is_uninitialized(ex))
- uninitialized = 1;
ex_ee_len = ext4_ext_get_actual_len(ex);
+ /*
+ * If we're starting with an extent other than the last one in the
+ * node, we need to see if it shares a cluster with the extent to
+ * the right (towards the end of the file). If its leftmost cluster
+ * is this extent's rightmost cluster and it is not cluster aligned,
+ * we'll mark it as a partial that is not to be deallocated.
+ */
+
+ if (ex != EXT_LAST_EXTENT(eh)) {
+ ext4_fsblk_t current_pblk, right_pblk;
+ long long current_cluster, right_cluster;
+
+ current_pblk = ext4_ext_pblock(ex) + ex_ee_len - 1;
+ current_cluster = (long long)EXT4_B2C(sbi, current_pblk);
+ right_pblk = ext4_ext_pblock(ex + 1);
+ right_cluster = (long long)EXT4_B2C(sbi, right_pblk);
+ if (current_cluster == right_cluster &&
+ EXT4_PBLK_COFF(sbi, right_pblk))
+ *partial_cluster = -right_cluster;
+ }
+
+ trace_ext4_ext_rm_leaf(inode, start, ex, *partial_cluster);
+
while (ex >= EXT_FIRST_EXTENT(eh) &&
ex_ee_block + ex_ee_len > start) {
- ext_debug("remove ext %lu:%u\n", ex_ee_block, ex_ee_len);
+
+ if (ext4_ext_is_unwritten(ex))
+ unwritten = 1;
+ else
+ unwritten = 0;
+
+ ext_debug("remove ext %u:[%d]%d\n", ex_ee_block,
+ unwritten, ex_ee_len);
path[depth].p_ext = ex;
a = ex_ee_block > start ? ex_ee_block : start;
- b = ex_ee_block + ex_ee_len - 1 < EXT_MAX_BLOCK ?
- ex_ee_block + ex_ee_len - 1 : EXT_MAX_BLOCK;
+ b = ex_ee_block+ex_ee_len - 1 < end ?
+ ex_ee_block+ex_ee_len - 1 : end;
ext_debug(" border %u:%u\n", a, b);
- if (a != ex_ee_block && b != ex_ee_block + ex_ee_len - 1) {
- block = 0;
- num = 0;
- BUG();
+ /* If this extent is beyond the end of the hole, skip it */
+ if (end < ex_ee_block) {
+ /*
+ * We're going to skip this extent and move to another,
+ * so if this extent is not cluster aligned we have
+ * to mark the current cluster as used to avoid
+ * accidentally freeing it later on
+ */
+ pblk = ext4_ext_pblock(ex);
+ if (EXT4_PBLK_COFF(sbi, pblk))
+ *partial_cluster =
+ -((long long)EXT4_B2C(sbi, pblk));
+ ex--;
+ ex_ee_block = le32_to_cpu(ex->ee_block);
+ ex_ee_len = ext4_ext_get_actual_len(ex);
+ continue;
+ } else if (b != ex_ee_block + ex_ee_len - 1) {
+ EXT4_ERROR_INODE(inode,
+ "can not handle truncate %u:%u "
+ "on extent %u:%u",
+ start, end, ex_ee_block,
+ ex_ee_block + ex_ee_len - 1);
+ err = -EIO;
+ goto out;
} else if (a != ex_ee_block) {
/* remove tail of the extent */
- block = ex_ee_block;
- num = a - block;
- } else if (b != ex_ee_block + ex_ee_len - 1) {
- /* remove head of the extent */
- block = a;
- num = b - a;
- /* there is no "make a hole" API yet */
- BUG();
+ num = a - ex_ee_block;
} else {
/* remove whole extent: excellent! */
- block = ex_ee_block;
num = 0;
- BUG_ON(a != ex_ee_block);
- BUG_ON(b != ex_ee_block + ex_ee_len - 1);
}
-
- /* at present, extent can't cross block group: */
- /* leaf + bitmap + group desc + sb + inode */
- credits = 5;
+ /*
+ * 3 for leaf, sb, and inode plus 2 (bmap and group
+ * descriptor) for each block group; assume two block
+ * groups plus ex_ee_len/blocks_per_block_group for
+ * the worst case
+ */
+ credits = 7 + 2*(ex_ee_len/EXT4_BLOCKS_PER_GROUP(inode->i_sb));
if (ex == EXT_FIRST_EXTENT(eh)) {
correct_index = 1;
credits += (ext_depth(inode)) + 1;
}
-#ifdef CONFIG_QUOTA
- credits += 2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb);
-#endif
+ credits += EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb);
- handle = ext4_ext_journal_restart(handle, credits);
- if (IS_ERR(handle)) {
- err = PTR_ERR(handle);
+ err = ext4_ext_truncate_extend_restart(handle, inode, credits);
+ if (err)
goto out;
- }
err = ext4_ext_get_access(handle, inode, path + depth);
if (err)
goto out;
- err = ext4_remove_blocks(handle, inode, ex, a, b);
+ err = ext4_remove_blocks(handle, inode, ex, partial_cluster,
+ a, b);
if (err)
goto out;
- if (num == 0) {
+ if (num == 0)
/* this extent is removed; mark slot entirely unused */
ext4_ext_store_pblock(ex, 0);
- eh->eh_entries = cpu_to_le16(le16_to_cpu(eh->eh_entries)-1);
- }
- ex->ee_block = cpu_to_le32(block);
ex->ee_len = cpu_to_le16(num);
/*
- * Do not mark uninitialized if all the blocks in the
+ * Do not mark unwritten if all the blocks in the
* extent have been removed.
*/
- if (uninitialized && num)
- ext4_ext_mark_uninitialized(ex);
+ if (unwritten && num)
+ ext4_ext_mark_unwritten(ex);
+ /*
+ * If the extent was completely released,
+ * we need to remove it from the leaf
+ */
+ if (num == 0) {
+ if (end != EXT_MAX_BLOCKS - 1) {
+ /*
+ * For hole punching, we need to scoot all the
+ * extents up when an extent is removed so that
+ * we dont have blank extents in the middle
+ */
+ memmove(ex, ex+1, (EXT_LAST_EXTENT(eh) - ex) *
+ sizeof(struct ext4_extent));
+
+ /* Now get rid of the one at the end */
+ memset(EXT_LAST_EXTENT(eh), 0,
+ sizeof(struct ext4_extent));
+ }
+ le16_add_cpu(&eh->eh_entries, -1);
+ } else if (*partial_cluster > 0)
+ *partial_cluster = 0;
err = ext4_ext_dirty(handle, inode, path + depth);
if (err)
goto out;
- ext_debug("new extent: %u:%u:%llu\n", block, num,
- ext_pblock(ex));
+ ext_debug("new extent: %u:%u:%llu\n", ex_ee_block, num,
+ ext4_ext_pblock(ex));
ex--;
ex_ee_block = le32_to_cpu(ex->ee_block);
ex_ee_len = ext4_ext_get_actual_len(ex);
@@ -1928,10 +2743,30 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
if (correct_index && eh->eh_entries)
err = ext4_ext_correct_indexes(handle, inode, path);
+ /*
+ * If there's a partial cluster and at least one extent remains in
+ * the leaf, free the partial cluster if it isn't shared with the
+ * current extent. If there's a partial cluster and no extents
+ * remain in the leaf, it can't be freed here. It can only be
+ * freed when it's possible to determine if it's not shared with
+ * any other extent - when the next leaf is processed or when space
+ * removal is complete.
+ */
+ if (*partial_cluster > 0 && eh->eh_entries &&
+ (EXT4_B2C(sbi, ext4_ext_pblock(ex) + ex_ee_len - 1) !=
+ *partial_cluster)) {
+ int flags = get_default_free_blocks_flags(inode);
+
+ ext4_free_blocks(handle, inode, NULL,
+ EXT4_C2B(sbi, *partial_cluster),
+ sbi->s_cluster_ratio, flags);
+ *partial_cluster = 0;
+ }
+
/* if this leaf is free, then we should
* remove it from index block above */
if (err == 0 && eh->eh_entries == 0 && path[depth].p_bh != NULL)
- err = ext4_ext_rm_idx(handle, inode, path + depth);
+ err = ext4_ext_rm_idx(handle, inode, path, depth);
out:
return err;
@@ -1958,43 +2793,122 @@ ext4_ext_more_to_rm(struct ext4_ext_path *path)
return 1;
}
-static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
+int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
+ ext4_lblk_t end)
{
struct super_block *sb = inode->i_sb;
int depth = ext_depth(inode);
- struct ext4_ext_path *path;
+ struct ext4_ext_path *path = NULL;
+ long long partial_cluster = 0;
handle_t *handle;
int i = 0, err = 0;
- ext_debug("truncate since %u\n", start);
+ ext_debug("truncate since %u to %u\n", start, end);
/* probably first extent we're gonna free will be last in block */
- handle = ext4_journal_start(inode, depth + 1);
+ handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, depth + 1);
if (IS_ERR(handle))
return PTR_ERR(handle);
- ext4_ext_invalidate_cache(inode);
+again:
+ trace_ext4_ext_remove_space(inode, start, end, depth);
/*
+ * Check if we are removing extents inside the extent tree. If that
+ * is the case, we are going to punch a hole inside the extent tree
+ * so we have to check whether we need to split the extent covering
+ * the last block to remove so we can easily remove the part of it
+ * in ext4_ext_rm_leaf().
+ */
+ if (end < EXT_MAX_BLOCKS - 1) {
+ struct ext4_extent *ex;
+ ext4_lblk_t ee_block;
+
+ /* find extent for this block */
+ path = ext4_ext_find_extent(inode, end, NULL, EXT4_EX_NOCACHE);
+ if (IS_ERR(path)) {
+ ext4_journal_stop(handle);
+ return PTR_ERR(path);
+ }
+ depth = ext_depth(inode);
+ /* Leaf not may not exist only if inode has no blocks at all */
+ ex = path[depth].p_ext;
+ if (!ex) {
+ if (depth) {
+ EXT4_ERROR_INODE(inode,
+ "path[%d].p_hdr == NULL",
+ depth);
+ err = -EIO;
+ }
+ goto out;
+ }
+
+ ee_block = le32_to_cpu(ex->ee_block);
+
+ /*
+ * See if the last block is inside the extent, if so split
+ * the extent at 'end' block so we can easily remove the
+ * tail of the first part of the split extent in
+ * ext4_ext_rm_leaf().
+ */
+ if (end >= ee_block &&
+ end < ee_block + ext4_ext_get_actual_len(ex) - 1) {
+ int split_flag = 0;
+
+ if (ext4_ext_is_unwritten(ex))
+ split_flag = EXT4_EXT_MARK_UNWRIT1 |
+ EXT4_EXT_MARK_UNWRIT2;
+
+ /*
+ * Split the extent in two so that 'end' is the last
+ * block in the first new extent. Also we should not
+ * fail removing space due to ENOSPC so try to use
+ * reserved block if that happens.
+ */
+ err = ext4_split_extent_at(handle, inode, path,
+ end + 1, split_flag,
+ EXT4_EX_NOCACHE |
+ EXT4_GET_BLOCKS_PRE_IO |
+ EXT4_GET_BLOCKS_METADATA_NOFAIL);
+
+ if (err < 0)
+ goto out;
+ }
+ }
+ /*
* We start scanning from right side, freeing all the blocks
* after i_size and walking into the tree depth-wise.
*/
- path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 1), GFP_KERNEL);
- if (path == NULL) {
- ext4_journal_stop(handle);
- return -ENOMEM;
- }
- path[0].p_hdr = ext_inode_hdr(inode);
- if (ext4_ext_check_header(inode, path[0].p_hdr, depth)) {
- err = -EIO;
- goto out;
+ depth = ext_depth(inode);
+ if (path) {
+ int k = i = depth;
+ while (--k > 0)
+ path[k].p_block =
+ le16_to_cpu(path[k].p_hdr->eh_entries)+1;
+ } else {
+ path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 1),
+ GFP_NOFS);
+ if (path == NULL) {
+ ext4_journal_stop(handle);
+ return -ENOMEM;
+ }
+ path[0].p_depth = depth;
+ path[0].p_hdr = ext_inode_hdr(inode);
+ i = 0;
+
+ if (ext4_ext_check(inode, path[0].p_hdr, depth, 0)) {
+ err = -EIO;
+ goto out;
+ }
}
- path[0].p_depth = depth;
+ err = 0;
while (i >= 0 && err == 0) {
if (i == depth) {
/* this is leaf block */
- err = ext4_ext_rm_leaf(handle, inode, path, start);
+ err = ext4_ext_rm_leaf(handle, inode, path,
+ &partial_cluster, start,
+ end);
/* root level has p_bh == NULL, brelse() eats this */
brelse(path[i].p_bh);
path[i].p_bh = NULL;
@@ -2027,23 +2941,23 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
struct buffer_head *bh;
/* go to the next level */
ext_debug("move to level %d (block %llu)\n",
- i + 1, idx_pblock(path[i].p_idx));
+ i + 1, ext4_idx_pblock(path[i].p_idx));
memset(path + i + 1, 0, sizeof(*path));
- bh = sb_bread(sb, idx_pblock(path[i].p_idx));
- if (!bh) {
+ bh = read_extent_tree_block(inode,
+ ext4_idx_pblock(path[i].p_idx), depth - i - 1,
+ EXT4_EX_NOCACHE);
+ if (IS_ERR(bh)) {
/* should we reset i_size? */
- err = -EIO;
+ err = PTR_ERR(bh);
break;
}
+ /* Yield here to deal with large extent trees.
+ * Should be a no-op if we did IO above. */
+ cond_resched();
if (WARN_ON(i + 1 > depth)) {
err = -EIO;
break;
}
- if (ext4_ext_check_header(inode, ext_block_hdr(bh),
- depth - i - 1)) {
- err = -EIO;
- break;
- }
path[i + 1].p_bh = bh;
/* save actual number of indexes since this
@@ -2056,7 +2970,7 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
/* index is empty, remove it;
* handle must be already prepared by the
* truncatei_leaf() */
- err = ext4_ext_rm_idx(handle, inode, path + i);
+ err = ext4_ext_rm_idx(handle, inode, path, i);
}
/* root level has p_bh == NULL, brelse() eats this */
brelse(path[i].p_bh);
@@ -2066,6 +2980,21 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
}
}
+ trace_ext4_ext_remove_space_done(inode, start, end, depth,
+ partial_cluster, path->p_hdr->eh_entries);
+
+ /* If we still have something in the partial cluster and we have removed
+ * even the first extent, then we should free the blocks in the partial
+ * cluster as well. */
+ if (partial_cluster > 0 && path->p_hdr->eh_entries == 0) {
+ int flags = get_default_free_blocks_flags(inode);
+
+ ext4_free_blocks(handle, inode, NULL,
+ EXT4_C2B(EXT4_SB(sb), partial_cluster),
+ EXT4_SB(sb)->s_cluster_ratio, flags);
+ partial_cluster = 0;
+ }
+
/* TODO: flexible tree reduction should be here */
if (path->p_hdr->eh_entries == 0) {
/*
@@ -2076,14 +3005,17 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
if (err == 0) {
ext_inode_hdr(inode)->eh_depth = 0;
ext_inode_hdr(inode)->eh_max =
- cpu_to_le16(ext4_ext_space_root(inode));
+ cpu_to_le16(ext4_ext_space_root(inode, 0));
err = ext4_ext_dirty(handle, inode, path);
}
}
out:
- ext4_ext_tree_changed(inode);
ext4_ext_drop_refs(path);
kfree(path);
+ if (err == -EAGAIN) {
+ path = NULL;
+ goto again;
+ }
ext4_journal_stop(handle);
return err;
@@ -2098,18 +3030,20 @@ void ext4_ext_init(struct super_block *sb)
* possible initialization would be here
*/
- if (test_opt(sb, EXTENTS)) {
- printk("EXT4-fs: file extents enabled");
+ if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) {
+#if defined(AGGRESSIVE_TEST) || defined(CHECK_BINSEARCH) || defined(EXTENTS_STATS)
+ printk(KERN_INFO "EXT4-fs: file extents enabled"
#ifdef AGGRESSIVE_TEST
- printk(", aggressive tests");
+ ", aggressive tests"
#endif
#ifdef CHECK_BINSEARCH
- printk(", check binsearch");
+ ", check binsearch"
#endif
#ifdef EXTENTS_STATS
- printk(", stats");
+ ", stats"
+#endif
+ "\n");
#endif
- printk("\n");
#ifdef EXTENTS_STATS
spin_lock_init(&EXT4_SB(sb)->s_ext_stats_lock);
EXT4_SB(sb)->s_ext_min = 1 << 30;
@@ -2123,7 +3057,7 @@ void ext4_ext_init(struct super_block *sb)
*/
void ext4_ext_release(struct super_block *sb)
{
- if (!test_opt(sb, EXTENTS))
+ if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS))
return;
#ifdef EXTENTS_STATS
@@ -2138,155 +3072,1175 @@ void ext4_ext_release(struct super_block *sb)
#endif
}
+static int ext4_zeroout_es(struct inode *inode, struct ext4_extent *ex)
+{
+ ext4_lblk_t ee_block;
+ ext4_fsblk_t ee_pblock;
+ unsigned int ee_len;
+
+ ee_block = le32_to_cpu(ex->ee_block);
+ ee_len = ext4_ext_get_actual_len(ex);
+ ee_pblock = ext4_ext_pblock(ex);
+
+ if (ee_len == 0)
+ return 0;
+
+ return ext4_es_insert_extent(inode, ee_block, ee_len, ee_pblock,
+ EXTENT_STATUS_WRITTEN);
+}
+
+/* FIXME!! we need to try to merge to left or right after zero-out */
+static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
+{
+ ext4_fsblk_t ee_pblock;
+ unsigned int ee_len;
+ int ret;
+
+ ee_len = ext4_ext_get_actual_len(ex);
+ ee_pblock = ext4_ext_pblock(ex);
+
+ ret = sb_issue_zeroout(inode->i_sb, ee_pblock, ee_len, GFP_NOFS);
+ if (ret > 0)
+ ret = 0;
+
+ return ret;
+}
+
/*
- * This function is called by ext4_ext_get_blocks() if someone tries to write
- * to an uninitialized extent. It may result in splitting the uninitialized
- * extent into multiple extents (upto three - one initialized and two
- * uninitialized).
- * There are three possibilities:
- * a> There is no split required: Entire extent should be initialized
- * b> Splits in two extents: Write is happening at either end of the extent
- * c> Splits in three extents: Somone is writing in middle of the extent
+ * ext4_split_extent_at() splits an extent at given block.
+ *
+ * @handle: the journal handle
+ * @inode: the file inode
+ * @path: the path to the extent
+ * @split: the logical block where the extent is splitted.
+ * @split_flags: indicates if the extent could be zeroout if split fails, and
+ * the states(init or unwritten) of new extents.
+ * @flags: flags used to insert new extent to extent tree.
+ *
+ *
+ * Splits extent [a, b] into two extents [a, @split) and [@split, b], states
+ * of which are deterimined by split_flag.
+ *
+ * There are two cases:
+ * a> the extent are splitted into two extent.
+ * b> split is not needed, and just mark the extent.
+ *
+ * return 0 on success.
*/
-static int ext4_ext_convert_to_initialized(handle_t *handle,
- struct inode *inode,
- struct ext4_ext_path *path,
- ext4_lblk_t iblock,
- unsigned long max_blocks)
+static int ext4_split_extent_at(handle_t *handle,
+ struct inode *inode,
+ struct ext4_ext_path *path,
+ ext4_lblk_t split,
+ int split_flag,
+ int flags)
{
- struct ext4_extent *ex, newex;
- struct ext4_extent *ex1 = NULL;
- struct ext4_extent *ex2 = NULL;
- struct ext4_extent *ex3 = NULL;
- struct ext4_extent_header *eh;
- ext4_lblk_t ee_block;
- unsigned int allocated, ee_len, depth;
ext4_fsblk_t newblock;
+ ext4_lblk_t ee_block;
+ struct ext4_extent *ex, newex, orig_ex, zero_ex;
+ struct ext4_extent *ex2 = NULL;
+ unsigned int ee_len, depth;
int err = 0;
- int ret = 0;
+
+ BUG_ON((split_flag & (EXT4_EXT_DATA_VALID1 | EXT4_EXT_DATA_VALID2)) ==
+ (EXT4_EXT_DATA_VALID1 | EXT4_EXT_DATA_VALID2));
+
+ ext_debug("ext4_split_extents_at: inode %lu, logical"
+ "block %llu\n", inode->i_ino, (unsigned long long)split);
+
+ ext4_ext_show_leaf(inode, path);
depth = ext_depth(inode);
- eh = path[depth].p_hdr;
ex = path[depth].p_ext;
ee_block = le32_to_cpu(ex->ee_block);
ee_len = ext4_ext_get_actual_len(ex);
- allocated = ee_len - (iblock - ee_block);
- newblock = iblock - ee_block + ext_pblock(ex);
- ex2 = ex;
+ newblock = split - ee_block + ext4_ext_pblock(ex);
+
+ BUG_ON(split < ee_block || split >= (ee_block + ee_len));
+ BUG_ON(!ext4_ext_is_unwritten(ex) &&
+ split_flag & (EXT4_EXT_MAY_ZEROOUT |
+ EXT4_EXT_MARK_UNWRIT1 |
+ EXT4_EXT_MARK_UNWRIT2));
err = ext4_ext_get_access(handle, inode, path + depth);
if (err)
goto out;
- /* ex1: ee_block to iblock - 1 : uninitialized */
- if (iblock > ee_block) {
- ex1 = ex;
- ex1->ee_len = cpu_to_le16(iblock - ee_block);
- ext4_ext_mark_uninitialized(ex1);
- ex2 = &newex;
+ if (split == ee_block) {
+ /*
+ * case b: block @split is the block that the extent begins with
+ * then we just change the state of the extent, and splitting
+ * is not needed.
+ */
+ if (split_flag & EXT4_EXT_MARK_UNWRIT2)
+ ext4_ext_mark_unwritten(ex);
+ else
+ ext4_ext_mark_initialized(ex);
+
+ if (!(flags & EXT4_GET_BLOCKS_PRE_IO))
+ ext4_ext_try_to_merge(handle, inode, path, ex);
+
+ err = ext4_ext_dirty(handle, inode, path + path->p_depth);
+ goto out;
}
+
+ /* case a */
+ memcpy(&orig_ex, ex, sizeof(orig_ex));
+ ex->ee_len = cpu_to_le16(split - ee_block);
+ if (split_flag & EXT4_EXT_MARK_UNWRIT1)
+ ext4_ext_mark_unwritten(ex);
+
/*
- * for sanity, update the length of the ex2 extent before
- * we insert ex3, if ex1 is NULL. This is to avoid temporary
- * overlap of blocks.
+ * path may lead to new leaf, not to original leaf any more
+ * after ext4_ext_insert_extent() returns,
*/
- if (!ex1 && allocated > max_blocks)
- ex2->ee_len = cpu_to_le16(max_blocks);
- /* ex3: to ee_block + ee_len : uninitialised */
- if (allocated > max_blocks) {
- unsigned int newdepth;
- ex3 = &newex;
- ex3->ee_block = cpu_to_le32(iblock + max_blocks);
- ext4_ext_store_pblock(ex3, newblock + max_blocks);
- ex3->ee_len = cpu_to_le16(allocated - max_blocks);
- ext4_ext_mark_uninitialized(ex3);
- err = ext4_ext_insert_extent(handle, inode, path, ex3);
+ err = ext4_ext_dirty(handle, inode, path + depth);
+ if (err)
+ goto fix_extent_len;
+
+ ex2 = &newex;
+ ex2->ee_block = cpu_to_le32(split);
+ ex2->ee_len = cpu_to_le16(ee_len - (split - ee_block));
+ ext4_ext_store_pblock(ex2, newblock);
+ if (split_flag & EXT4_EXT_MARK_UNWRIT2)
+ ext4_ext_mark_unwritten(ex2);
+
+ err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
+ if (err == -ENOSPC && (EXT4_EXT_MAY_ZEROOUT & split_flag)) {
+ if (split_flag & (EXT4_EXT_DATA_VALID1|EXT4_EXT_DATA_VALID2)) {
+ if (split_flag & EXT4_EXT_DATA_VALID1) {
+ err = ext4_ext_zeroout(inode, ex2);
+ zero_ex.ee_block = ex2->ee_block;
+ zero_ex.ee_len = cpu_to_le16(
+ ext4_ext_get_actual_len(ex2));
+ ext4_ext_store_pblock(&zero_ex,
+ ext4_ext_pblock(ex2));
+ } else {
+ err = ext4_ext_zeroout(inode, ex);
+ zero_ex.ee_block = ex->ee_block;
+ zero_ex.ee_len = cpu_to_le16(
+ ext4_ext_get_actual_len(ex));
+ ext4_ext_store_pblock(&zero_ex,
+ ext4_ext_pblock(ex));
+ }
+ } else {
+ err = ext4_ext_zeroout(inode, &orig_ex);
+ zero_ex.ee_block = orig_ex.ee_block;
+ zero_ex.ee_len = cpu_to_le16(
+ ext4_ext_get_actual_len(&orig_ex));
+ ext4_ext_store_pblock(&zero_ex,
+ ext4_ext_pblock(&orig_ex));
+ }
+
+ if (err)
+ goto fix_extent_len;
+ /* update the extent length and mark as initialized */
+ ex->ee_len = cpu_to_le16(ee_len);
+ ext4_ext_try_to_merge(handle, inode, path, ex);
+ err = ext4_ext_dirty(handle, inode, path + path->p_depth);
+ if (err)
+ goto fix_extent_len;
+
+ /* update extent status tree */
+ err = ext4_zeroout_es(inode, &zero_ex);
+
+ goto out;
+ } else if (err)
+ goto fix_extent_len;
+
+out:
+ ext4_ext_show_leaf(inode, path);
+ return err;
+
+fix_extent_len:
+ ex->ee_len = orig_ex.ee_len;
+ ext4_ext_dirty(handle, inode, path + depth);
+ return err;
+}
+
+/*
+ * ext4_split_extents() splits an extent and mark extent which is covered
+ * by @map as split_flags indicates
+ *
+ * It may result in splitting the extent into multiple extents (up to three)
+ * There are three possibilities:
+ * a> There is no split required
+ * b> Splits in two extents: Split is happening at either end of the extent
+ * c> Splits in three extents: Somone is splitting in middle of the extent
+ *
+ */
+static int ext4_split_extent(handle_t *handle,
+ struct inode *inode,
+ struct ext4_ext_path *path,
+ struct ext4_map_blocks *map,
+ int split_flag,
+ int flags)
+{
+ ext4_lblk_t ee_block;
+ struct ext4_extent *ex;
+ unsigned int ee_len, depth;
+ int err = 0;
+ int unwritten;
+ int split_flag1, flags1;
+ int allocated = map->m_len;
+
+ depth = ext_depth(inode);
+ ex = path[depth].p_ext;
+ ee_block = le32_to_cpu(ex->ee_block);
+ ee_len = ext4_ext_get_actual_len(ex);
+ unwritten = ext4_ext_is_unwritten(ex);
+
+ if (map->m_lblk + map->m_len < ee_block + ee_len) {
+ split_flag1 = split_flag & EXT4_EXT_MAY_ZEROOUT;
+ flags1 = flags | EXT4_GET_BLOCKS_PRE_IO;
+ if (unwritten)
+ split_flag1 |= EXT4_EXT_MARK_UNWRIT1 |
+ EXT4_EXT_MARK_UNWRIT2;
+ if (split_flag & EXT4_EXT_DATA_VALID2)
+ split_flag1 |= EXT4_EXT_DATA_VALID1;
+ err = ext4_split_extent_at(handle, inode, path,
+ map->m_lblk + map->m_len, split_flag1, flags1);
+ if (err)
+ goto out;
+ } else {
+ allocated = ee_len - (map->m_lblk - ee_block);
+ }
+ /*
+ * Update path is required because previous ext4_split_extent_at() may
+ * result in split of original leaf or extent zeroout.
+ */
+ ext4_ext_drop_refs(path);
+ path = ext4_ext_find_extent(inode, map->m_lblk, path, 0);
+ if (IS_ERR(path))
+ return PTR_ERR(path);
+ depth = ext_depth(inode);
+ ex = path[depth].p_ext;
+ if (!ex) {
+ EXT4_ERROR_INODE(inode, "unexpected hole at %lu",
+ (unsigned long) map->m_lblk);
+ return -EIO;
+ }
+ unwritten = ext4_ext_is_unwritten(ex);
+ split_flag1 = 0;
+
+ if (map->m_lblk >= ee_block) {
+ split_flag1 = split_flag & EXT4_EXT_DATA_VALID2;
+ if (unwritten) {
+ split_flag1 |= EXT4_EXT_MARK_UNWRIT1;
+ split_flag1 |= split_flag & (EXT4_EXT_MAY_ZEROOUT |
+ EXT4_EXT_MARK_UNWRIT2);
+ }
+ err = ext4_split_extent_at(handle, inode, path,
+ map->m_lblk, split_flag1, flags);
if (err)
goto out;
+ }
+
+ ext4_ext_show_leaf(inode, path);
+out:
+ return err ? err : allocated;
+}
+
+/*
+ * This function is called by ext4_ext_map_blocks() if someone tries to write
+ * to an unwritten extent. It may result in splitting the unwritten
+ * extent into multiple extents (up to three - one initialized and two
+ * unwritten).
+ * There are three possibilities:
+ * a> There is no split required: Entire extent should be initialized
+ * b> Splits in two extents: Write is happening at either end of the extent
+ * c> Splits in three extents: Somone is writing in middle of the extent
+ *
+ * Pre-conditions:
+ * - The extent pointed to by 'path' is unwritten.
+ * - The extent pointed to by 'path' contains a superset
+ * of the logical span [map->m_lblk, map->m_lblk + map->m_len).
+ *
+ * Post-conditions on success:
+ * - the returned value is the number of blocks beyond map->l_lblk
+ * that are allocated and initialized.
+ * It is guaranteed to be >= map->m_len.
+ */
+static int ext4_ext_convert_to_initialized(handle_t *handle,
+ struct inode *inode,
+ struct ext4_map_blocks *map,
+ struct ext4_ext_path *path,
+ int flags)
+{
+ struct ext4_sb_info *sbi;
+ struct ext4_extent_header *eh;
+ struct ext4_map_blocks split_map;
+ struct ext4_extent zero_ex;
+ struct ext4_extent *ex, *abut_ex;
+ ext4_lblk_t ee_block, eof_block;
+ unsigned int ee_len, depth, map_len = map->m_len;
+ int allocated = 0, max_zeroout = 0;
+ int err = 0;
+ int split_flag = 0;
+
+ ext_debug("ext4_ext_convert_to_initialized: inode %lu, logical"
+ "block %llu, max_blocks %u\n", inode->i_ino,
+ (unsigned long long)map->m_lblk, map_len);
+
+ sbi = EXT4_SB(inode->i_sb);
+ eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >>
+ inode->i_sb->s_blocksize_bits;
+ if (eof_block < map->m_lblk + map_len)
+ eof_block = map->m_lblk + map_len;
+
+ depth = ext_depth(inode);
+ eh = path[depth].p_hdr;
+ ex = path[depth].p_ext;
+ ee_block = le32_to_cpu(ex->ee_block);
+ ee_len = ext4_ext_get_actual_len(ex);
+ zero_ex.ee_len = 0;
+
+ trace_ext4_ext_convert_to_initialized_enter(inode, map, ex);
+
+ /* Pre-conditions */
+ BUG_ON(!ext4_ext_is_unwritten(ex));
+ BUG_ON(!in_range(map->m_lblk, ee_block, ee_len));
+
+ /*
+ * Attempt to transfer newly initialized blocks from the currently
+ * unwritten extent to its neighbor. This is much cheaper
+ * than an insertion followed by a merge as those involve costly
+ * memmove() calls. Transferring to the left is the common case in
+ * steady state for workloads doing fallocate(FALLOC_FL_KEEP_SIZE)
+ * followed by append writes.
+ *
+ * Limitations of the current logic:
+ * - L1: we do not deal with writes covering the whole extent.
+ * This would require removing the extent if the transfer
+ * is possible.
+ * - L2: we only attempt to merge with an extent stored in the
+ * same extent tree node.
+ */
+ if ((map->m_lblk == ee_block) &&
+ /* See if we can merge left */
+ (map_len < ee_len) && /*L1*/
+ (ex > EXT_FIRST_EXTENT(eh))) { /*L2*/
+ ext4_lblk_t prev_lblk;
+ ext4_fsblk_t prev_pblk, ee_pblk;
+ unsigned int prev_len;
+
+ abut_ex = ex - 1;
+ prev_lblk = le32_to_cpu(abut_ex->ee_block);
+ prev_len = ext4_ext_get_actual_len(abut_ex);
+ prev_pblk = ext4_ext_pblock(abut_ex);
+ ee_pblk = ext4_ext_pblock(ex);
+
/*
- * The depth, and hence eh & ex might change
- * as part of the insert above.
+ * A transfer of blocks from 'ex' to 'abut_ex' is allowed
+ * upon those conditions:
+ * - C1: abut_ex is initialized,
+ * - C2: abut_ex is logically abutting ex,
+ * - C3: abut_ex is physically abutting ex,
+ * - C4: abut_ex can receive the additional blocks without
+ * overflowing the (initialized) length limit.
*/
- newdepth = ext_depth(inode);
- if (newdepth != depth) {
- depth = newdepth;
- ext4_ext_drop_refs(path);
- path = ext4_ext_find_extent(inode, iblock, path);
- if (IS_ERR(path)) {
- err = PTR_ERR(path);
+ if ((!ext4_ext_is_unwritten(abut_ex)) && /*C1*/
+ ((prev_lblk + prev_len) == ee_block) && /*C2*/
+ ((prev_pblk + prev_len) == ee_pblk) && /*C3*/
+ (prev_len < (EXT_INIT_MAX_LEN - map_len))) { /*C4*/
+ err = ext4_ext_get_access(handle, inode, path + depth);
+ if (err)
goto out;
- }
- eh = path[depth].p_hdr;
- ex = path[depth].p_ext;
- if (ex2 != &newex)
- ex2 = ex;
+ trace_ext4_ext_convert_to_initialized_fastpath(inode,
+ map, ex, abut_ex);
+
+ /* Shift the start of ex by 'map_len' blocks */
+ ex->ee_block = cpu_to_le32(ee_block + map_len);
+ ext4_ext_store_pblock(ex, ee_pblk + map_len);
+ ex->ee_len = cpu_to_le16(ee_len - map_len);
+ ext4_ext_mark_unwritten(ex); /* Restore the flag */
+
+ /* Extend abut_ex by 'map_len' blocks */
+ abut_ex->ee_len = cpu_to_le16(prev_len + map_len);
+
+ /* Result: number of initialized blocks past m_lblk */
+ allocated = map_len;
+ }
+ } else if (((map->m_lblk + map_len) == (ee_block + ee_len)) &&
+ (map_len < ee_len) && /*L1*/
+ ex < EXT_LAST_EXTENT(eh)) { /*L2*/
+ /* See if we can merge right */
+ ext4_lblk_t next_lblk;
+ ext4_fsblk_t next_pblk, ee_pblk;
+ unsigned int next_len;
+
+ abut_ex = ex + 1;
+ next_lblk = le32_to_cpu(abut_ex->ee_block);
+ next_len = ext4_ext_get_actual_len(abut_ex);
+ next_pblk = ext4_ext_pblock(abut_ex);
+ ee_pblk = ext4_ext_pblock(ex);
+
+ /*
+ * A transfer of blocks from 'ex' to 'abut_ex' is allowed
+ * upon those conditions:
+ * - C1: abut_ex is initialized,
+ * - C2: abut_ex is logically abutting ex,
+ * - C3: abut_ex is physically abutting ex,
+ * - C4: abut_ex can receive the additional blocks without
+ * overflowing the (initialized) length limit.
+ */
+ if ((!ext4_ext_is_unwritten(abut_ex)) && /*C1*/
+ ((map->m_lblk + map_len) == next_lblk) && /*C2*/
+ ((ee_pblk + ee_len) == next_pblk) && /*C3*/
+ (next_len < (EXT_INIT_MAX_LEN - map_len))) { /*C4*/
err = ext4_ext_get_access(handle, inode, path + depth);
if (err)
goto out;
+
+ trace_ext4_ext_convert_to_initialized_fastpath(inode,
+ map, ex, abut_ex);
+
+ /* Shift the start of abut_ex by 'map_len' blocks */
+ abut_ex->ee_block = cpu_to_le32(next_lblk - map_len);
+ ext4_ext_store_pblock(abut_ex, next_pblk - map_len);
+ ex->ee_len = cpu_to_le16(ee_len - map_len);
+ ext4_ext_mark_unwritten(ex); /* Restore the flag */
+
+ /* Extend abut_ex by 'map_len' blocks */
+ abut_ex->ee_len = cpu_to_le16(next_len + map_len);
+
+ /* Result: number of initialized blocks past m_lblk */
+ allocated = map_len;
}
- allocated = max_blocks;
}
+ if (allocated) {
+ /* Mark the block containing both extents as dirty */
+ ext4_ext_dirty(handle, inode, path + depth);
+
+ /* Update path to point to the right extent */
+ path[depth].p_ext = abut_ex;
+ goto out;
+ } else
+ allocated = ee_len - (map->m_lblk - ee_block);
+
+ WARN_ON(map->m_lblk < ee_block);
/*
- * If there was a change of depth as part of the
- * insertion of ex3 above, we need to update the length
- * of the ex1 extent again here
+ * It is safe to convert extent to initialized via explicit
+ * zeroout only if extent is fully inside i_size or new_size.
*/
- if (ex1 && ex1 != ex) {
- ex1 = ex;
- ex1->ee_len = cpu_to_le16(iblock - ee_block);
- ext4_ext_mark_uninitialized(ex1);
- ex2 = &newex;
- }
- /* ex2: iblock to iblock + maxblocks-1 : initialised */
- ex2->ee_block = cpu_to_le32(iblock);
- ext4_ext_store_pblock(ex2, newblock);
- ex2->ee_len = cpu_to_le16(allocated);
- if (ex2 != ex)
- goto insert;
+ split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0;
+
+ if (EXT4_EXT_MAY_ZEROOUT & split_flag)
+ max_zeroout = sbi->s_extent_max_zeroout_kb >>
+ (inode->i_sb->s_blocksize_bits - 10);
+
+ /* If extent is less than s_max_zeroout_kb, zeroout directly */
+ if (max_zeroout && (ee_len <= max_zeroout)) {
+ err = ext4_ext_zeroout(inode, ex);
+ if (err)
+ goto out;
+ zero_ex.ee_block = ex->ee_block;
+ zero_ex.ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex));
+ ext4_ext_store_pblock(&zero_ex, ext4_ext_pblock(ex));
+
+ err = ext4_ext_get_access(handle, inode, path + depth);
+ if (err)
+ goto out;
+ ext4_ext_mark_initialized(ex);
+ ext4_ext_try_to_merge(handle, inode, path, ex);
+ err = ext4_ext_dirty(handle, inode, path + path->p_depth);
+ goto out;
+ }
+
/*
- * New (initialized) extent starts from the first block
- * in the current extent. i.e., ex2 == ex
- * We have to see if it can be merged with the extent
- * on the left.
+ * four cases:
+ * 1. split the extent into three extents.
+ * 2. split the extent into two extents, zeroout the first half.
+ * 3. split the extent into two extents, zeroout the second half.
+ * 4. split the extent into two extents with out zeroout.
*/
- if (ex2 > EXT_FIRST_EXTENT(eh)) {
- /*
- * To merge left, pass "ex2 - 1" to try_to_merge(),
- * since it merges towards right _only_.
- */
- ret = ext4_ext_try_to_merge(inode, path, ex2 - 1);
- if (ret) {
- err = ext4_ext_correct_indexes(handle, inode, path);
+ split_map.m_lblk = map->m_lblk;
+ split_map.m_len = map->m_len;
+
+ if (max_zeroout && (allocated > map->m_len)) {
+ if (allocated <= max_zeroout) {
+ /* case 3 */
+ zero_ex.ee_block =
+ cpu_to_le32(map->m_lblk);
+ zero_ex.ee_len = cpu_to_le16(allocated);
+ ext4_ext_store_pblock(&zero_ex,
+ ext4_ext_pblock(ex) + map->m_lblk - ee_block);
+ err = ext4_ext_zeroout(inode, &zero_ex);
if (err)
goto out;
- depth = ext_depth(inode);
- ex2--;
+ split_map.m_lblk = map->m_lblk;
+ split_map.m_len = allocated;
+ } else if (map->m_lblk - ee_block + map->m_len < max_zeroout) {
+ /* case 2 */
+ if (map->m_lblk != ee_block) {
+ zero_ex.ee_block = ex->ee_block;
+ zero_ex.ee_len = cpu_to_le16(map->m_lblk -
+ ee_block);
+ ext4_ext_store_pblock(&zero_ex,
+ ext4_ext_pblock(ex));
+ err = ext4_ext_zeroout(inode, &zero_ex);
+ if (err)
+ goto out;
+ }
+
+ split_map.m_lblk = ee_block;
+ split_map.m_len = map->m_lblk - ee_block + map->m_len;
+ allocated = map->m_len;
}
}
+
+ allocated = ext4_split_extent(handle, inode, path,
+ &split_map, split_flag, flags);
+ if (allocated < 0)
+ err = allocated;
+
+out:
+ /* If we have gotten a failure, don't zero out status tree */
+ if (!err)
+ err = ext4_zeroout_es(inode, &zero_ex);
+ return err ? err : allocated;
+}
+
+/*
+ * This function is called by ext4_ext_map_blocks() from
+ * ext4_get_blocks_dio_write() when DIO to write
+ * to an unwritten extent.
+ *
+ * Writing to an unwritten extent may result in splitting the unwritten
+ * extent into multiple initialized/unwritten extents (up to three)
+ * There are three possibilities:
+ * a> There is no split required: Entire extent should be unwritten
+ * b> Splits in two extents: Write is happening at either end of the extent
+ * c> Splits in three extents: Somone is writing in middle of the extent
+ *
+ * This works the same way in the case of initialized -> unwritten conversion.
+ *
+ * One of more index blocks maybe needed if the extent tree grow after
+ * the unwritten extent split. To prevent ENOSPC occur at the IO
+ * complete, we need to split the unwritten extent before DIO submit
+ * the IO. The unwritten extent called at this time will be split
+ * into three unwritten extent(at most). After IO complete, the part
+ * being filled will be convert to initialized by the end_io callback function
+ * via ext4_convert_unwritten_extents().
+ *
+ * Returns the size of unwritten extent to be written on success.
+ */
+static int ext4_split_convert_extents(handle_t *handle,
+ struct inode *inode,
+ struct ext4_map_blocks *map,
+ struct ext4_ext_path *path,
+ int flags)
+{
+ ext4_lblk_t eof_block;
+ ext4_lblk_t ee_block;
+ struct ext4_extent *ex;
+ unsigned int ee_len;
+ int split_flag = 0, depth;
+
+ ext_debug("%s: inode %lu, logical block %llu, max_blocks %u\n",
+ __func__, inode->i_ino,
+ (unsigned long long)map->m_lblk, map->m_len);
+
+ eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >>
+ inode->i_sb->s_blocksize_bits;
+ if (eof_block < map->m_lblk + map->m_len)
+ eof_block = map->m_lblk + map->m_len;
/*
- * Try to Merge towards right. This might be required
- * only when the whole extent is being written to.
- * i.e. ex2 == ex and ex3 == NULL.
+ * It is safe to convert extent to initialized via explicit
+ * zeroout only if extent is fully insde i_size or new_size.
*/
- if (!ex3) {
- ret = ext4_ext_try_to_merge(inode, path, ex2);
- if (ret) {
- err = ext4_ext_correct_indexes(handle, inode, path);
- if (err)
- goto out;
+ depth = ext_depth(inode);
+ ex = path[depth].p_ext;
+ ee_block = le32_to_cpu(ex->ee_block);
+ ee_len = ext4_ext_get_actual_len(ex);
+
+ /* Convert to unwritten */
+ if (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN) {
+ split_flag |= EXT4_EXT_DATA_VALID1;
+ /* Convert to initialized */
+ } else if (flags & EXT4_GET_BLOCKS_CONVERT) {
+ split_flag |= ee_block + ee_len <= eof_block ?
+ EXT4_EXT_MAY_ZEROOUT : 0;
+ split_flag |= (EXT4_EXT_MARK_UNWRIT2 | EXT4_EXT_DATA_VALID2);
+ }
+ flags |= EXT4_GET_BLOCKS_PRE_IO;
+ return ext4_split_extent(handle, inode, path, map, split_flag, flags);
+}
+
+static int ext4_convert_initialized_extents(handle_t *handle,
+ struct inode *inode,
+ struct ext4_map_blocks *map,
+ struct ext4_ext_path *path)
+{
+ struct ext4_extent *ex;
+ ext4_lblk_t ee_block;
+ unsigned int ee_len;
+ int depth;
+ int err = 0;
+
+ depth = ext_depth(inode);
+ ex = path[depth].p_ext;
+ ee_block = le32_to_cpu(ex->ee_block);
+ ee_len = ext4_ext_get_actual_len(ex);
+
+ ext_debug("%s: inode %lu, logical"
+ "block %llu, max_blocks %u\n", __func__, inode->i_ino,
+ (unsigned long long)ee_block, ee_len);
+
+ if (ee_block != map->m_lblk || ee_len > map->m_len) {
+ err = ext4_split_convert_extents(handle, inode, map, path,
+ EXT4_GET_BLOCKS_CONVERT_UNWRITTEN);
+ if (err < 0)
+ goto out;
+ ext4_ext_drop_refs(path);
+ path = ext4_ext_find_extent(inode, map->m_lblk, path, 0);
+ if (IS_ERR(path)) {
+ err = PTR_ERR(path);
+ goto out;
+ }
+ depth = ext_depth(inode);
+ ex = path[depth].p_ext;
+ if (!ex) {
+ EXT4_ERROR_INODE(inode, "unexpected hole at %lu",
+ (unsigned long) map->m_lblk);
+ err = -EIO;
+ goto out;
+ }
+ }
+
+ err = ext4_ext_get_access(handle, inode, path + depth);
+ if (err)
+ goto out;
+ /* first mark the extent as unwritten */
+ ext4_ext_mark_unwritten(ex);
+
+ /* note: ext4_ext_correct_indexes() isn't needed here because
+ * borders are not changed
+ */
+ ext4_ext_try_to_merge(handle, inode, path, ex);
+
+ /* Mark modified extent as dirty */
+ err = ext4_ext_dirty(handle, inode, path + path->p_depth);
+out:
+ ext4_ext_show_leaf(inode, path);
+ return err;
+}
+
+
+static int ext4_convert_unwritten_extents_endio(handle_t *handle,
+ struct inode *inode,
+ struct ext4_map_blocks *map,
+ struct ext4_ext_path *path)
+{
+ struct ext4_extent *ex;
+ ext4_lblk_t ee_block;
+ unsigned int ee_len;
+ int depth;
+ int err = 0;
+
+ depth = ext_depth(inode);
+ ex = path[depth].p_ext;
+ ee_block = le32_to_cpu(ex->ee_block);
+ ee_len = ext4_ext_get_actual_len(ex);
+
+ ext_debug("ext4_convert_unwritten_extents_endio: inode %lu, logical"
+ "block %llu, max_blocks %u\n", inode->i_ino,
+ (unsigned long long)ee_block, ee_len);
+
+ /* If extent is larger than requested it is a clear sign that we still
+ * have some extent state machine issues left. So extent_split is still
+ * required.
+ * TODO: Once all related issues will be fixed this situation should be
+ * illegal.
+ */
+ if (ee_block != map->m_lblk || ee_len > map->m_len) {
+#ifdef EXT4_DEBUG
+ ext4_warning("Inode (%ld) finished: extent logical block %llu,"
+ " len %u; IO logical block %llu, len %u\n",
+ inode->i_ino, (unsigned long long)ee_block, ee_len,
+ (unsigned long long)map->m_lblk, map->m_len);
+#endif
+ err = ext4_split_convert_extents(handle, inode, map, path,
+ EXT4_GET_BLOCKS_CONVERT);
+ if (err < 0)
+ goto out;
+ ext4_ext_drop_refs(path);
+ path = ext4_ext_find_extent(inode, map->m_lblk, path, 0);
+ if (IS_ERR(path)) {
+ err = PTR_ERR(path);
+ goto out;
}
+ depth = ext_depth(inode);
+ ex = path[depth].p_ext;
}
+
+ err = ext4_ext_get_access(handle, inode, path + depth);
+ if (err)
+ goto out;
+ /* first mark the extent as initialized */
+ ext4_ext_mark_initialized(ex);
+
+ /* note: ext4_ext_correct_indexes() isn't needed here because
+ * borders are not changed
+ */
+ ext4_ext_try_to_merge(handle, inode, path, ex);
+
/* Mark modified extent as dirty */
- err = ext4_ext_dirty(handle, inode, path + depth);
- goto out;
-insert:
- err = ext4_ext_insert_extent(handle, inode, path, &newex);
+ err = ext4_ext_dirty(handle, inode, path + path->p_depth);
+out:
+ ext4_ext_show_leaf(inode, path);
+ return err;
+}
+
+static void unmap_underlying_metadata_blocks(struct block_device *bdev,
+ sector_t block, int count)
+{
+ int i;
+ for (i = 0; i < count; i++)
+ unmap_underlying_metadata(bdev, block + i);
+}
+
+/*
+ * Handle EOFBLOCKS_FL flag, clearing it if necessary
+ */
+static int check_eofblocks_fl(handle_t *handle, struct inode *inode,
+ ext4_lblk_t lblk,
+ struct ext4_ext_path *path,
+ unsigned int len)
+{
+ int i, depth;
+ struct ext4_extent_header *eh;
+ struct ext4_extent *last_ex;
+
+ if (!ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS))
+ return 0;
+
+ depth = ext_depth(inode);
+ eh = path[depth].p_hdr;
+
+ /*
+ * We're going to remove EOFBLOCKS_FL entirely in future so we
+ * do not care for this case anymore. Simply remove the flag
+ * if there are no extents.
+ */
+ if (unlikely(!eh->eh_entries))
+ goto out;
+ last_ex = EXT_LAST_EXTENT(eh);
+ /*
+ * We should clear the EOFBLOCKS_FL flag if we are writing the
+ * last block in the last extent in the file. We test this by
+ * first checking to see if the caller to
+ * ext4_ext_get_blocks() was interested in the last block (or
+ * a block beyond the last block) in the current extent. If
+ * this turns out to be false, we can bail out from this
+ * function immediately.
+ */
+ if (lblk + len < le32_to_cpu(last_ex->ee_block) +
+ ext4_ext_get_actual_len(last_ex))
+ return 0;
+ /*
+ * If the caller does appear to be planning to write at or
+ * beyond the end of the current extent, we then test to see
+ * if the current extent is the last extent in the file, by
+ * checking to make sure it was reached via the rightmost node
+ * at each level of the tree.
+ */
+ for (i = depth-1; i >= 0; i--)
+ if (path[i].p_idx != EXT_LAST_INDEX(path[i].p_hdr))
+ return 0;
out:
+ ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
+ return ext4_mark_inode_dirty(handle, inode);
+}
+
+/**
+ * ext4_find_delalloc_range: find delayed allocated block in the given range.
+ *
+ * Return 1 if there is a delalloc block in the range, otherwise 0.
+ */
+int ext4_find_delalloc_range(struct inode *inode,
+ ext4_lblk_t lblk_start,
+ ext4_lblk_t lblk_end)
+{
+ struct extent_status es;
+
+ ext4_es_find_delayed_extent_range(inode, lblk_start, lblk_end, &es);
+ if (es.es_len == 0)
+ return 0; /* there is no delay extent in this tree */
+ else if (es.es_lblk <= lblk_start &&
+ lblk_start < es.es_lblk + es.es_len)
+ return 1;
+ else if (lblk_start <= es.es_lblk && es.es_lblk <= lblk_end)
+ return 1;
+ else
+ return 0;
+}
+
+int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ ext4_lblk_t lblk_start, lblk_end;
+ lblk_start = EXT4_LBLK_CMASK(sbi, lblk);
+ lblk_end = lblk_start + sbi->s_cluster_ratio - 1;
+
+ return ext4_find_delalloc_range(inode, lblk_start, lblk_end);
+}
+
+/**
+ * Determines how many complete clusters (out of those specified by the 'map')
+ * are under delalloc and were reserved quota for.
+ * This function is called when we are writing out the blocks that were
+ * originally written with their allocation delayed, but then the space was
+ * allocated using fallocate() before the delayed allocation could be resolved.
+ * The cases to look for are:
+ * ('=' indicated delayed allocated blocks
+ * '-' indicates non-delayed allocated blocks)
+ * (a) partial clusters towards beginning and/or end outside of allocated range
+ * are not delalloc'ed.
+ * Ex:
+ * |----c---=|====c====|====c====|===-c----|
+ * |++++++ allocated ++++++|
+ * ==> 4 complete clusters in above example
+ *
+ * (b) partial cluster (outside of allocated range) towards either end is
+ * marked for delayed allocation. In this case, we will exclude that
+ * cluster.
+ * Ex:
+ * |----====c========|========c========|
+ * |++++++ allocated ++++++|
+ * ==> 1 complete clusters in above example
+ *
+ * Ex:
+ * |================c================|
+ * |++++++ allocated ++++++|
+ * ==> 0 complete clusters in above example
+ *
+ * The ext4_da_update_reserve_space will be called only if we
+ * determine here that there were some "entire" clusters that span
+ * this 'allocated' range.
+ * In the non-bigalloc case, this function will just end up returning num_blks
+ * without ever calling ext4_find_delalloc_range.
+ */
+static unsigned int
+get_reserved_cluster_alloc(struct inode *inode, ext4_lblk_t lblk_start,
+ unsigned int num_blks)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ ext4_lblk_t alloc_cluster_start, alloc_cluster_end;
+ ext4_lblk_t lblk_from, lblk_to, c_offset;
+ unsigned int allocated_clusters = 0;
+
+ alloc_cluster_start = EXT4_B2C(sbi, lblk_start);
+ alloc_cluster_end = EXT4_B2C(sbi, lblk_start + num_blks - 1);
+
+ /* max possible clusters for this allocation */
+ allocated_clusters = alloc_cluster_end - alloc_cluster_start + 1;
+
+ trace_ext4_get_reserved_cluster_alloc(inode, lblk_start, num_blks);
+
+ /* Check towards left side */
+ c_offset = EXT4_LBLK_COFF(sbi, lblk_start);
+ if (c_offset) {
+ lblk_from = EXT4_LBLK_CMASK(sbi, lblk_start);
+ lblk_to = lblk_from + c_offset - 1;
+
+ if (ext4_find_delalloc_range(inode, lblk_from, lblk_to))
+ allocated_clusters--;
+ }
+
+ /* Now check towards right. */
+ c_offset = EXT4_LBLK_COFF(sbi, lblk_start + num_blks);
+ if (allocated_clusters && c_offset) {
+ lblk_from = lblk_start + num_blks;
+ lblk_to = lblk_from + (sbi->s_cluster_ratio - c_offset) - 1;
+
+ if (ext4_find_delalloc_range(inode, lblk_from, lblk_to))
+ allocated_clusters--;
+ }
+
+ return allocated_clusters;
+}
+
+static int
+ext4_ext_convert_initialized_extent(handle_t *handle, struct inode *inode,
+ struct ext4_map_blocks *map,
+ struct ext4_ext_path *path, int flags,
+ unsigned int allocated, ext4_fsblk_t newblock)
+{
+ int ret = 0;
+ int err = 0;
+
+ /*
+ * Make sure that the extent is no bigger than we support with
+ * unwritten extent
+ */
+ if (map->m_len > EXT_UNWRITTEN_MAX_LEN)
+ map->m_len = EXT_UNWRITTEN_MAX_LEN / 2;
+
+ ret = ext4_convert_initialized_extents(handle, inode, map,
+ path);
+ if (ret >= 0) {
+ ext4_update_inode_fsync_trans(handle, inode, 1);
+ err = check_eofblocks_fl(handle, inode, map->m_lblk,
+ path, map->m_len);
+ } else
+ err = ret;
+ map->m_flags |= EXT4_MAP_UNWRITTEN;
+ if (allocated > map->m_len)
+ allocated = map->m_len;
+ map->m_len = allocated;
+
+ return err ? err : allocated;
+}
+
+static int
+ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode,
+ struct ext4_map_blocks *map,
+ struct ext4_ext_path *path, int flags,
+ unsigned int allocated, ext4_fsblk_t newblock)
+{
+ int ret = 0;
+ int err = 0;
+ ext4_io_end_t *io = ext4_inode_aio(inode);
+
+ ext_debug("ext4_ext_handle_unwritten_extents: inode %lu, logical "
+ "block %llu, max_blocks %u, flags %x, allocated %u\n",
+ inode->i_ino, (unsigned long long)map->m_lblk, map->m_len,
+ flags, allocated);
+ ext4_ext_show_leaf(inode, path);
+
+ /*
+ * When writing into unwritten space, we should not fail to
+ * allocate metadata blocks for the new extent block if needed.
+ */
+ flags |= EXT4_GET_BLOCKS_METADATA_NOFAIL;
+
+ trace_ext4_ext_handle_unwritten_extents(inode, map, flags,
+ allocated, newblock);
+
+ /* get_block() before submit the IO, split the extent */
+ if (flags & EXT4_GET_BLOCKS_PRE_IO) {
+ ret = ext4_split_convert_extents(handle, inode, map,
+ path, flags | EXT4_GET_BLOCKS_CONVERT);
+ if (ret <= 0)
+ goto out;
+ /*
+ * Flag the inode(non aio case) or end_io struct (aio case)
+ * that this IO needs to conversion to written when IO is
+ * completed
+ */
+ if (io)
+ ext4_set_io_unwritten_flag(inode, io);
+ else
+ ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
+ map->m_flags |= EXT4_MAP_UNWRITTEN;
+ goto out;
+ }
+ /* IO end_io complete, convert the filled extent to written */
+ if (flags & EXT4_GET_BLOCKS_CONVERT) {
+ ret = ext4_convert_unwritten_extents_endio(handle, inode, map,
+ path);
+ if (ret >= 0) {
+ ext4_update_inode_fsync_trans(handle, inode, 1);
+ err = check_eofblocks_fl(handle, inode, map->m_lblk,
+ path, map->m_len);
+ } else
+ err = ret;
+ map->m_flags |= EXT4_MAP_MAPPED;
+ map->m_pblk = newblock;
+ if (allocated > map->m_len)
+ allocated = map->m_len;
+ map->m_len = allocated;
+ goto out2;
+ }
+ /* buffered IO case */
+ /*
+ * repeat fallocate creation request
+ * we already have an unwritten extent
+ */
+ if (flags & EXT4_GET_BLOCKS_UNWRIT_EXT) {
+ map->m_flags |= EXT4_MAP_UNWRITTEN;
+ goto map_out;
+ }
+
+ /* buffered READ or buffered write_begin() lookup */
+ if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
+ /*
+ * We have blocks reserved already. We
+ * return allocated blocks so that delalloc
+ * won't do block reservation for us. But
+ * the buffer head will be unmapped so that
+ * a read from the block returns 0s.
+ */
+ map->m_flags |= EXT4_MAP_UNWRITTEN;
+ goto out1;
+ }
+
+ /* buffered write, writepage time, convert*/
+ ret = ext4_ext_convert_to_initialized(handle, inode, map, path, flags);
+ if (ret >= 0)
+ ext4_update_inode_fsync_trans(handle, inode, 1);
+out:
+ if (ret <= 0) {
+ err = ret;
+ goto out2;
+ } else
+ allocated = ret;
+ map->m_flags |= EXT4_MAP_NEW;
+ /*
+ * if we allocated more blocks than requested
+ * we need to make sure we unmap the extra block
+ * allocated. The actual needed block will get
+ * unmapped later when we find the buffer_head marked
+ * new.
+ */
+ if (allocated > map->m_len) {
+ unmap_underlying_metadata_blocks(inode->i_sb->s_bdev,
+ newblock + map->m_len,
+ allocated - map->m_len);
+ allocated = map->m_len;
+ }
+ map->m_len = allocated;
+
+ /*
+ * If we have done fallocate with the offset that is already
+ * delayed allocated, we would have block reservation
+ * and quota reservation done in the delayed write path.
+ * But fallocate would have already updated quota and block
+ * count for this offset. So cancel these reservation
+ */
+ if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) {
+ unsigned int reserved_clusters;
+ reserved_clusters = get_reserved_cluster_alloc(inode,
+ map->m_lblk, map->m_len);
+ if (reserved_clusters)
+ ext4_da_update_reserve_space(inode,
+ reserved_clusters,
+ 0);
+ }
+
+map_out:
+ map->m_flags |= EXT4_MAP_MAPPED;
+ if ((flags & EXT4_GET_BLOCKS_KEEP_SIZE) == 0) {
+ err = check_eofblocks_fl(handle, inode, map->m_lblk, path,
+ map->m_len);
+ if (err < 0)
+ goto out2;
+ }
+out1:
+ if (allocated > map->m_len)
+ allocated = map->m_len;
+ ext4_ext_show_leaf(inode, path);
+ map->m_pblk = newblock;
+ map->m_len = allocated;
+out2:
return err ? err : allocated;
}
/*
+ * get_implied_cluster_alloc - check to see if the requested
+ * allocation (in the map structure) overlaps with a cluster already
+ * allocated in an extent.
+ * @sb The filesystem superblock structure
+ * @map The requested lblk->pblk mapping
+ * @ex The extent structure which might contain an implied
+ * cluster allocation
+ *
+ * This function is called by ext4_ext_map_blocks() after we failed to
+ * find blocks that were already in the inode's extent tree. Hence,
+ * we know that the beginning of the requested region cannot overlap
+ * the extent from the inode's extent tree. There are three cases we
+ * want to catch. The first is this case:
+ *
+ * |--- cluster # N--|
+ * |--- extent ---| |---- requested region ---|
+ * |==========|
+ *
+ * The second case that we need to test for is this one:
+ *
+ * |--------- cluster # N ----------------|
+ * |--- requested region --| |------- extent ----|
+ * |=======================|
+ *
+ * The third case is when the requested region lies between two extents
+ * within the same cluster:
+ * |------------- cluster # N-------------|
+ * |----- ex -----| |---- ex_right ----|
+ * |------ requested region ------|
+ * |================|
+ *
+ * In each of the above cases, we need to set the map->m_pblk and
+ * map->m_len so it corresponds to the return the extent labelled as
+ * "|====|" from cluster #N, since it is already in use for data in
+ * cluster EXT4_B2C(sbi, map->m_lblk). We will then return 1 to
+ * signal to ext4_ext_map_blocks() that map->m_pblk should be treated
+ * as a new "allocated" block region. Otherwise, we will return 0 and
+ * ext4_ext_map_blocks() will then allocate one or more new clusters
+ * by calling ext4_mb_new_blocks().
+ */
+static int get_implied_cluster_alloc(struct super_block *sb,
+ struct ext4_map_blocks *map,
+ struct ext4_extent *ex,
+ struct ext4_ext_path *path)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ ext4_lblk_t c_offset = EXT4_LBLK_COFF(sbi, map->m_lblk);
+ ext4_lblk_t ex_cluster_start, ex_cluster_end;
+ ext4_lblk_t rr_cluster_start;
+ ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block);
+ ext4_fsblk_t ee_start = ext4_ext_pblock(ex);
+ unsigned short ee_len = ext4_ext_get_actual_len(ex);
+
+ /* The extent passed in that we are trying to match */
+ ex_cluster_start = EXT4_B2C(sbi, ee_block);
+ ex_cluster_end = EXT4_B2C(sbi, ee_block + ee_len - 1);
+
+ /* The requested region passed into ext4_map_blocks() */
+ rr_cluster_start = EXT4_B2C(sbi, map->m_lblk);
+
+ if ((rr_cluster_start == ex_cluster_end) ||
+ (rr_cluster_start == ex_cluster_start)) {
+ if (rr_cluster_start == ex_cluster_end)
+ ee_start += ee_len - 1;
+ map->m_pblk = EXT4_PBLK_CMASK(sbi, ee_start) + c_offset;
+ map->m_len = min(map->m_len,
+ (unsigned) sbi->s_cluster_ratio - c_offset);
+ /*
+ * Check for and handle this case:
+ *
+ * |--------- cluster # N-------------|
+ * |------- extent ----|
+ * |--- requested region ---|
+ * |===========|
+ */
+
+ if (map->m_lblk < ee_block)
+ map->m_len = min(map->m_len, ee_block - map->m_lblk);
+
+ /*
+ * Check for the case where there is already another allocated
+ * block to the right of 'ex' but before the end of the cluster.
+ *
+ * |------------- cluster # N-------------|
+ * |----- ex -----| |---- ex_right ----|
+ * |------ requested region ------|
+ * |================|
+ */
+ if (map->m_lblk > ee_block) {
+ ext4_lblk_t next = ext4_ext_next_allocated_block(path);
+ map->m_len = min(map->m_len, next - map->m_lblk);
+ }
+
+ trace_ext4_get_implied_cluster_alloc_exit(sb, map, 1);
+ return 1;
+ }
+
+ trace_ext4_get_implied_cluster_alloc_exit(sb, map, 0);
+ return 0;
+}
+
+
+/*
* Block allocation/map/preallocation routine for extents based files
*
*
@@ -2304,51 +4258,27 @@ out:
*
* return < 0, error case.
*/
-int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
- ext4_lblk_t iblock,
- unsigned long max_blocks, struct buffer_head *bh_result,
- int create, int extend_disksize)
+int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
+ struct ext4_map_blocks *map, int flags)
{
struct ext4_ext_path *path = NULL;
- struct ext4_extent_header *eh;
- struct ext4_extent newex, *ex;
- ext4_fsblk_t goal, newblock;
- int err = 0, depth, ret;
- unsigned long allocated = 0;
+ struct ext4_extent newex, *ex, *ex2;
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ ext4_fsblk_t newblock = 0;
+ int free_on_err = 0, err = 0, depth, ret;
+ unsigned int allocated = 0, offset = 0;
+ unsigned int allocated_clusters = 0;
struct ext4_allocation_request ar;
+ ext4_io_end_t *io = ext4_inode_aio(inode);
+ ext4_lblk_t cluster_offset;
+ int set_unwritten = 0;
- __clear_bit(BH_New, &bh_result->b_state);
- ext_debug("blocks %u/%lu requested for inode %u\n",
- iblock, max_blocks, inode->i_ino);
-
- /* check in cache */
- goal = ext4_ext_in_cache(inode, iblock, &newex);
- if (goal) {
- if (goal == EXT4_EXT_CACHE_GAP) {
- if (!create) {
- /*
- * block isn't allocated yet and
- * user doesn't want to allocate it
- */
- goto out2;
- }
- /* we should allocate requested block */
- } else if (goal == EXT4_EXT_CACHE_EXTENT) {
- /* block is already allocated */
- newblock = iblock
- - le32_to_cpu(newex.ee_block)
- + ext_pblock(&newex);
- /* number of remaining blocks in the extent */
- allocated = ext4_ext_get_actual_len(&newex) -
- (iblock - le32_to_cpu(newex.ee_block));
- goto out;
- } else {
- BUG();
- }
- }
+ ext_debug("blocks %u/%u requested for inode %lu\n",
+ map->m_lblk, map->m_len, inode->i_ino);
+ trace_ext4_ext_map_blocks_enter(inode, map->m_lblk, map->m_len, flags);
/* find extent for this block */
- path = ext4_ext_find_extent(inode, iblock, NULL);
+ path = ext4_ext_find_extent(inode, map->m_lblk, NULL, 0);
if (IS_ERR(path)) {
err = PTR_ERR(path);
path = NULL;
@@ -2362,201 +4292,354 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
* this situation is possible, though, _during_ tree modification;
* this is why assert can't be put in ext4_ext_find_extent()
*/
- BUG_ON(path[depth].p_ext == NULL && depth != 0);
- eh = path[depth].p_hdr;
+ if (unlikely(path[depth].p_ext == NULL && depth != 0)) {
+ EXT4_ERROR_INODE(inode, "bad extent address "
+ "lblock: %lu, depth: %d pblock %lld",
+ (unsigned long) map->m_lblk, depth,
+ path[depth].p_block);
+ err = -EIO;
+ goto out2;
+ }
ex = path[depth].p_ext;
if (ex) {
ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block);
- ext4_fsblk_t ee_start = ext_pblock(ex);
+ ext4_fsblk_t ee_start = ext4_ext_pblock(ex);
unsigned short ee_len;
+
/*
- * Uninitialized extents are treated as holes, except that
+ * unwritten extents are treated as holes, except that
* we split out initialized portions during a write.
*/
ee_len = ext4_ext_get_actual_len(ex);
+
+ trace_ext4_ext_show_extent(inode, ee_block, ee_start, ee_len);
+
/* if found extent covers block, simply return it */
- if (iblock >= ee_block && iblock < ee_block + ee_len) {
- newblock = iblock - ee_block + ee_start;
+ if (in_range(map->m_lblk, ee_block, ee_len)) {
+ newblock = map->m_lblk - ee_block + ee_start;
/* number of remaining blocks in the extent */
- allocated = ee_len - (iblock - ee_block);
- ext_debug("%u fit into %lu:%d -> %llu\n", iblock,
- ee_block, ee_len, newblock);
-
- /* Do not put uninitialized extent in the cache */
- if (!ext4_ext_is_uninitialized(ex)) {
- ext4_ext_put_in_cache(inode, ee_block,
- ee_len, ee_start,
- EXT4_EXT_CACHE_EXTENT);
- goto out;
- }
- if (create == EXT4_CREATE_UNINITIALIZED_EXT)
- goto out;
- if (!create)
+ allocated = ee_len - (map->m_lblk - ee_block);
+ ext_debug("%u fit into %u:%d -> %llu\n", map->m_lblk,
+ ee_block, ee_len, newblock);
+
+ /*
+ * If the extent is initialized check whether the
+ * caller wants to convert it to unwritten.
+ */
+ if ((!ext4_ext_is_unwritten(ex)) &&
+ (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN)) {
+ allocated = ext4_ext_convert_initialized_extent(
+ handle, inode, map, path, flags,
+ allocated, newblock);
goto out2;
+ } else if (!ext4_ext_is_unwritten(ex))
+ goto out;
- ret = ext4_ext_convert_to_initialized(handle, inode,
- path, iblock,
- max_blocks);
- if (ret <= 0) {
+ ret = ext4_ext_handle_unwritten_extents(
+ handle, inode, map, path, flags,
+ allocated, newblock);
+ if (ret < 0)
err = ret;
- goto out2;
- } else
+ else
allocated = ret;
- goto outnew;
+ goto out2;
}
}
+ if ((sbi->s_cluster_ratio > 1) &&
+ ext4_find_delalloc_cluster(inode, map->m_lblk))
+ map->m_flags |= EXT4_MAP_FROM_CLUSTER;
+
/*
* requested block isn't allocated yet;
* we couldn't try to create block if create flag is zero
*/
- if (!create) {
+ if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
/*
* put just found gap into cache to speed up
* subsequent requests
*/
- ext4_ext_put_gap_in_cache(inode, path, iblock);
+ if ((flags & EXT4_GET_BLOCKS_NO_PUT_HOLE) == 0)
+ ext4_ext_put_gap_in_cache(inode, path, map->m_lblk);
goto out2;
}
+
/*
- * Okay, we need to do block allocation. Lazily initialize the block
- * allocation info here if necessary.
+ * Okay, we need to do block allocation.
*/
- if (S_ISREG(inode->i_mode) && (!EXT4_I(inode)->i_block_alloc_info))
- ext4_init_block_alloc_info(inode);
+ map->m_flags &= ~EXT4_MAP_FROM_CLUSTER;
+ newex.ee_block = cpu_to_le32(map->m_lblk);
+ cluster_offset = EXT4_LBLK_COFF(sbi, map->m_lblk);
+
+ /*
+ * If we are doing bigalloc, check to see if the extent returned
+ * by ext4_ext_find_extent() implies a cluster we can use.
+ */
+ if (cluster_offset && ex &&
+ get_implied_cluster_alloc(inode->i_sb, map, ex, path)) {
+ ar.len = allocated = map->m_len;
+ newblock = map->m_pblk;
+ map->m_flags |= EXT4_MAP_FROM_CLUSTER;
+ goto got_allocated_blocks;
+ }
/* find neighbour allocated blocks */
- ar.lleft = iblock;
+ ar.lleft = map->m_lblk;
err = ext4_ext_search_left(inode, path, &ar.lleft, &ar.pleft);
if (err)
goto out2;
- ar.lright = iblock;
- err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright);
+ ar.lright = map->m_lblk;
+ ex2 = NULL;
+ err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright, &ex2);
if (err)
goto out2;
+ /* Check if the extent after searching to the right implies a
+ * cluster we can use. */
+ if ((sbi->s_cluster_ratio > 1) && ex2 &&
+ get_implied_cluster_alloc(inode->i_sb, map, ex2, path)) {
+ ar.len = allocated = map->m_len;
+ newblock = map->m_pblk;
+ map->m_flags |= EXT4_MAP_FROM_CLUSTER;
+ goto got_allocated_blocks;
+ }
+
/*
* See if request is beyond maximum number of blocks we can have in
* a single extent. For an initialized extent this limit is
- * EXT_INIT_MAX_LEN and for an uninitialized extent this limit is
- * EXT_UNINIT_MAX_LEN.
+ * EXT_INIT_MAX_LEN and for an unwritten extent this limit is
+ * EXT_UNWRITTEN_MAX_LEN.
*/
- if (max_blocks > EXT_INIT_MAX_LEN &&
- create != EXT4_CREATE_UNINITIALIZED_EXT)
- max_blocks = EXT_INIT_MAX_LEN;
- else if (max_blocks > EXT_UNINIT_MAX_LEN &&
- create == EXT4_CREATE_UNINITIALIZED_EXT)
- max_blocks = EXT_UNINIT_MAX_LEN;
-
- /* Check if we can really insert (iblock)::(iblock+max_blocks) extent */
- newex.ee_block = cpu_to_le32(iblock);
- newex.ee_len = cpu_to_le16(max_blocks);
- err = ext4_ext_check_overlap(inode, &newex, path);
+ if (map->m_len > EXT_INIT_MAX_LEN &&
+ !(flags & EXT4_GET_BLOCKS_UNWRIT_EXT))
+ map->m_len = EXT_INIT_MAX_LEN;
+ else if (map->m_len > EXT_UNWRITTEN_MAX_LEN &&
+ (flags & EXT4_GET_BLOCKS_UNWRIT_EXT))
+ map->m_len = EXT_UNWRITTEN_MAX_LEN;
+
+ /* Check if we can really insert (m_lblk)::(m_lblk + m_len) extent */
+ newex.ee_len = cpu_to_le16(map->m_len);
+ err = ext4_ext_check_overlap(sbi, inode, &newex, path);
if (err)
allocated = ext4_ext_get_actual_len(&newex);
else
- allocated = max_blocks;
+ allocated = map->m_len;
/* allocate new block */
ar.inode = inode;
- ar.goal = ext4_ext_find_goal(inode, path, iblock);
- ar.logical = iblock;
- ar.len = allocated;
+ ar.goal = ext4_ext_find_goal(inode, path, map->m_lblk);
+ ar.logical = map->m_lblk;
+ /*
+ * We calculate the offset from the beginning of the cluster
+ * for the logical block number, since when we allocate a
+ * physical cluster, the physical block should start at the
+ * same offset from the beginning of the cluster. This is
+ * needed so that future calls to get_implied_cluster_alloc()
+ * work correctly.
+ */
+ offset = EXT4_LBLK_COFF(sbi, map->m_lblk);
+ ar.len = EXT4_NUM_B2C(sbi, offset+allocated);
+ ar.goal -= offset;
+ ar.logical -= offset;
if (S_ISREG(inode->i_mode))
ar.flags = EXT4_MB_HINT_DATA;
else
/* disable in-core preallocation for non-regular files */
ar.flags = 0;
+ if (flags & EXT4_GET_BLOCKS_NO_NORMALIZE)
+ ar.flags |= EXT4_MB_HINT_NOPREALLOC;
newblock = ext4_mb_new_blocks(handle, &ar, &err);
if (!newblock)
goto out2;
- ext_debug("allocate new block: goal %llu, found %llu/%lu\n",
- goal, newblock, allocated);
-
+ ext_debug("allocate new block: goal %llu, found %llu/%u\n",
+ ar.goal, newblock, allocated);
+ free_on_err = 1;
+ allocated_clusters = ar.len;
+ ar.len = EXT4_C2B(sbi, ar.len) - offset;
+ if (ar.len > allocated)
+ ar.len = allocated;
+
+got_allocated_blocks:
/* try to insert new extent into found leaf and return */
- ext4_ext_store_pblock(&newex, newblock);
+ ext4_ext_store_pblock(&newex, newblock + offset);
newex.ee_len = cpu_to_le16(ar.len);
- if (create == EXT4_CREATE_UNINITIALIZED_EXT) /* Mark uninitialized */
- ext4_ext_mark_uninitialized(&newex);
- err = ext4_ext_insert_extent(handle, inode, path, &newex);
- if (err) {
+ /* Mark unwritten */
+ if (flags & EXT4_GET_BLOCKS_UNWRIT_EXT){
+ ext4_ext_mark_unwritten(&newex);
+ map->m_flags |= EXT4_MAP_UNWRITTEN;
+ /*
+ * io_end structure was created for every IO write to an
+ * unwritten extent. To avoid unnecessary conversion,
+ * here we flag the IO that really needs the conversion.
+ * For non asycn direct IO case, flag the inode state
+ * that we need to perform conversion when IO is done.
+ */
+ if (flags & EXT4_GET_BLOCKS_PRE_IO)
+ set_unwritten = 1;
+ }
+
+ err = 0;
+ if ((flags & EXT4_GET_BLOCKS_KEEP_SIZE) == 0)
+ err = check_eofblocks_fl(handle, inode, map->m_lblk,
+ path, ar.len);
+ if (!err)
+ err = ext4_ext_insert_extent(handle, inode, path,
+ &newex, flags);
+
+ if (!err && set_unwritten) {
+ if (io)
+ ext4_set_io_unwritten_flag(inode, io);
+ else
+ ext4_set_inode_state(inode,
+ EXT4_STATE_DIO_UNWRITTEN);
+ }
+
+ if (err && free_on_err) {
+ int fb_flags = flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE ?
+ EXT4_FREE_BLOCKS_NO_QUOT_UPDATE : 0;
/* free data blocks we just allocated */
/* not a good idea to call discard here directly,
* but otherwise we'd need to call it every free() */
- ext4_mb_discard_inode_preallocations(inode);
- ext4_free_blocks(handle, inode, ext_pblock(&newex),
- ext4_ext_get_actual_len(&newex), 0);
+ ext4_discard_preallocations(inode);
+ ext4_free_blocks(handle, inode, NULL, newblock,
+ EXT4_C2B(sbi, allocated_clusters), fb_flags);
goto out2;
}
- if (extend_disksize && inode->i_size > EXT4_I(inode)->i_disksize)
- EXT4_I(inode)->i_disksize = inode->i_size;
-
/* previous routine could use block we allocated */
- newblock = ext_pblock(&newex);
+ newblock = ext4_ext_pblock(&newex);
allocated = ext4_ext_get_actual_len(&newex);
-outnew:
- __set_bit(BH_New, &bh_result->b_state);
+ if (allocated > map->m_len)
+ allocated = map->m_len;
+ map->m_flags |= EXT4_MAP_NEW;
+
+ /*
+ * Update reserved blocks/metadata blocks after successful
+ * block allocation which had been deferred till now.
+ */
+ if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) {
+ unsigned int reserved_clusters;
+ /*
+ * Check how many clusters we had reserved this allocated range
+ */
+ reserved_clusters = get_reserved_cluster_alloc(inode,
+ map->m_lblk, allocated);
+ if (map->m_flags & EXT4_MAP_FROM_CLUSTER) {
+ if (reserved_clusters) {
+ /*
+ * We have clusters reserved for this range.
+ * But since we are not doing actual allocation
+ * and are simply using blocks from previously
+ * allocated cluster, we should release the
+ * reservation and not claim quota.
+ */
+ ext4_da_update_reserve_space(inode,
+ reserved_clusters, 0);
+ }
+ } else {
+ BUG_ON(allocated_clusters < reserved_clusters);
+ if (reserved_clusters < allocated_clusters) {
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ int reservation = allocated_clusters -
+ reserved_clusters;
+ /*
+ * It seems we claimed few clusters outside of
+ * the range of this allocation. We should give
+ * it back to the reservation pool. This can
+ * happen in the following case:
+ *
+ * * Suppose s_cluster_ratio is 4 (i.e., each
+ * cluster has 4 blocks. Thus, the clusters
+ * are [0-3],[4-7],[8-11]...
+ * * First comes delayed allocation write for
+ * logical blocks 10 & 11. Since there were no
+ * previous delayed allocated blocks in the
+ * range [8-11], we would reserve 1 cluster
+ * for this write.
+ * * Next comes write for logical blocks 3 to 8.
+ * In this case, we will reserve 2 clusters
+ * (for [0-3] and [4-7]; and not for [8-11] as
+ * that range has a delayed allocated blocks.
+ * Thus total reserved clusters now becomes 3.
+ * * Now, during the delayed allocation writeout
+ * time, we will first write blocks [3-8] and
+ * allocate 3 clusters for writing these
+ * blocks. Also, we would claim all these
+ * three clusters above.
+ * * Now when we come here to writeout the
+ * blocks [10-11], we would expect to claim
+ * the reservation of 1 cluster we had made
+ * (and we would claim it since there are no
+ * more delayed allocated blocks in the range
+ * [8-11]. But our reserved cluster count had
+ * already gone to 0.
+ *
+ * Thus, at the step 4 above when we determine
+ * that there are still some unwritten delayed
+ * allocated blocks outside of our current
+ * block range, we should increment the
+ * reserved clusters count so that when the
+ * remaining blocks finally gets written, we
+ * could claim them.
+ */
+ dquot_reserve_block(inode,
+ EXT4_C2B(sbi, reservation));
+ spin_lock(&ei->i_block_reservation_lock);
+ ei->i_reserved_data_blocks += reservation;
+ spin_unlock(&ei->i_block_reservation_lock);
+ }
+ /*
+ * We will claim quota for all newly allocated blocks.
+ * We're updating the reserved space *after* the
+ * correction above so we do not accidentally free
+ * all the metadata reservation because we might
+ * actually need it later on.
+ */
+ ext4_da_update_reserve_space(inode, allocated_clusters,
+ 1);
+ }
+ }
- /* Cache only when it is _not_ an uninitialized extent */
- if (create != EXT4_CREATE_UNINITIALIZED_EXT)
- ext4_ext_put_in_cache(inode, iblock, allocated, newblock,
- EXT4_EXT_CACHE_EXTENT);
+ /*
+ * Cache the extent and update transaction to commit on fdatasync only
+ * when it is _not_ an unwritten extent.
+ */
+ if ((flags & EXT4_GET_BLOCKS_UNWRIT_EXT) == 0)
+ ext4_update_inode_fsync_trans(handle, inode, 1);
+ else
+ ext4_update_inode_fsync_trans(handle, inode, 0);
out:
- if (allocated > max_blocks)
- allocated = max_blocks;
+ if (allocated > map->m_len)
+ allocated = map->m_len;
ext4_ext_show_leaf(inode, path);
- __set_bit(BH_Mapped, &bh_result->b_state);
- bh_result->b_bdev = inode->i_sb->s_bdev;
- bh_result->b_blocknr = newblock;
+ map->m_flags |= EXT4_MAP_MAPPED;
+ map->m_pblk = newblock;
+ map->m_len = allocated;
out2:
if (path) {
ext4_ext_drop_refs(path);
kfree(path);
}
+
+ trace_ext4_ext_map_blocks_exit(inode, flags, map,
+ err ? err : allocated);
+ ext4_es_lru_add(inode);
return err ? err : allocated;
}
-void ext4_ext_truncate(struct inode * inode, struct page *page)
+void ext4_ext_truncate(handle_t *handle, struct inode *inode)
{
- struct address_space *mapping = inode->i_mapping;
struct super_block *sb = inode->i_sb;
ext4_lblk_t last_block;
- handle_t *handle;
int err = 0;
/*
- * probably first extent we're gonna free will be last in block
- */
- err = ext4_writepage_trans_blocks(inode) + 3;
- handle = ext4_journal_start(inode, err);
- if (IS_ERR(handle)) {
- if (page) {
- clear_highpage(page);
- flush_dcache_page(page);
- unlock_page(page);
- page_cache_release(page);
- }
- return;
- }
-
- if (page)
- ext4_block_truncate_page(handle, page, mapping, inode->i_size);
-
- down_write(&EXT4_I(inode)->i_data_sem);
- ext4_ext_invalidate_cache(inode);
-
- ext4_mb_discard_inode_preallocations(inode);
-
- /*
* TODO: optimization is possible here.
* Probably we need not scan at all,
* because page truncation is enough.
*/
- if (ext4_orphan_add(handle, inode))
- goto out_stop;
/* we have to know where to truncate from in crash case */
EXT4_I(inode)->i_disksize = inode->i_size;
@@ -2564,170 +4647,866 @@ void ext4_ext_truncate(struct inode * inode, struct page *page)
last_block = (inode->i_size + sb->s_blocksize - 1)
>> EXT4_BLOCK_SIZE_BITS(sb);
- err = ext4_ext_remove_space(inode, last_block);
+retry:
+ err = ext4_es_remove_extent(inode, last_block,
+ EXT_MAX_BLOCKS - last_block);
+ if (err == -ENOMEM) {
+ cond_resched();
+ congestion_wait(BLK_RW_ASYNC, HZ/50);
+ goto retry;
+ }
+ if (err) {
+ ext4_std_error(inode->i_sb, err);
+ return;
+ }
+ err = ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1);
+ ext4_std_error(inode->i_sb, err);
+}
- /* In a multi-transaction truncate, we only make the final
- * transaction synchronous.
+static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset,
+ ext4_lblk_t len, int flags, int mode)
+{
+ struct inode *inode = file_inode(file);
+ handle_t *handle;
+ int ret = 0;
+ int ret2 = 0;
+ int retries = 0;
+ struct ext4_map_blocks map;
+ unsigned int credits;
+
+ map.m_lblk = offset;
+ /*
+ * Don't normalize the request if it can fit in one extent so
+ * that it doesn't get unnecessarily split into multiple
+ * extents.
*/
- if (IS_SYNC(inode))
- handle->h_sync = 1;
+ if (len <= EXT_UNWRITTEN_MAX_LEN)
+ flags |= EXT4_GET_BLOCKS_NO_NORMALIZE;
-out_stop:
/*
- * If this was a simple ftruncate() and the file will remain alive,
- * then we need to clear up the orphan record which we created above.
- * However, if this was a real unlink then we were called by
- * ext4_delete_inode(), and we allow that function to clean up the
- * orphan info for us.
+ * credits to insert 1 extent into extent tree
*/
- if (inode->i_nlink)
- ext4_orphan_del(handle, inode);
+ credits = ext4_chunk_trans_blocks(inode, len);
- up_write(&EXT4_I(inode)->i_data_sem);
- ext4_journal_stop(handle);
+retry:
+ while (ret >= 0 && ret < len) {
+ map.m_lblk = map.m_lblk + ret;
+ map.m_len = len = len - ret;
+ handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS,
+ credits);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ break;
+ }
+ ret = ext4_map_blocks(handle, inode, &map, flags);
+ if (ret <= 0) {
+ ext4_debug("inode #%lu: block %u: len %u: "
+ "ext4_ext_map_blocks returned %d",
+ inode->i_ino, map.m_lblk,
+ map.m_len, ret);
+ ext4_mark_inode_dirty(handle, inode);
+ ret2 = ext4_journal_stop(handle);
+ break;
+ }
+ ret2 = ext4_journal_stop(handle);
+ if (ret2)
+ break;
+ }
+ if (ret == -ENOSPC &&
+ ext4_should_retry_alloc(inode->i_sb, &retries)) {
+ ret = 0;
+ goto retry;
+ }
+
+ return ret > 0 ? ret2 : ret;
}
-/*
- * ext4_ext_writepage_trans_blocks:
- * calculate max number of blocks we could modify
- * in order to allocate new block for an inode
- */
-int ext4_ext_writepage_trans_blocks(struct inode *inode, int num)
+static long ext4_zero_range(struct file *file, loff_t offset,
+ loff_t len, int mode)
{
- int needed;
+ struct inode *inode = file_inode(file);
+ handle_t *handle = NULL;
+ unsigned int max_blocks;
+ loff_t new_size = 0;
+ int ret = 0;
+ int flags;
+ int partial;
+ loff_t start, end;
+ ext4_lblk_t lblk;
+ struct address_space *mapping = inode->i_mapping;
+ unsigned int blkbits = inode->i_blkbits;
- needed = ext4_ext_calc_credits_for_insert(inode, NULL);
+ trace_ext4_zero_range(inode, offset, len, mode);
- /* caller wants to allocate num blocks, but note it includes sb */
- needed = needed * num - (num - 1);
+ if (!S_ISREG(inode->i_mode))
+ return -EINVAL;
-#ifdef CONFIG_QUOTA
- needed += 2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb);
-#endif
+ /* Call ext4_force_commit to flush all data in case of data=journal. */
+ if (ext4_should_journal_data(inode)) {
+ ret = ext4_force_commit(inode->i_sb);
+ if (ret)
+ return ret;
+ }
+
+ /*
+ * Write out all dirty pages to avoid race conditions
+ * Then release them.
+ */
+ if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
+ ret = filemap_write_and_wait_range(mapping, offset,
+ offset + len - 1);
+ if (ret)
+ return ret;
+ }
+
+ /*
+ * Round up offset. This is not fallocate, we neet to zero out
+ * blocks, so convert interior block aligned part of the range to
+ * unwritten and possibly manually zero out unaligned parts of the
+ * range.
+ */
+ start = round_up(offset, 1 << blkbits);
+ end = round_down((offset + len), 1 << blkbits);
+
+ if (start < offset || end > offset + len)
+ return -EINVAL;
+ partial = (offset + len) & ((1 << blkbits) - 1);
+
+ lblk = start >> blkbits;
+ max_blocks = (end >> blkbits);
+ if (max_blocks < lblk)
+ max_blocks = 0;
+ else
+ max_blocks -= lblk;
+
+ flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT |
+ EXT4_GET_BLOCKS_CONVERT_UNWRITTEN;
+ if (mode & FALLOC_FL_KEEP_SIZE)
+ flags |= EXT4_GET_BLOCKS_KEEP_SIZE;
+
+ mutex_lock(&inode->i_mutex);
+
+ /*
+ * Indirect files do not support unwritten extnets
+ */
+ if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
+ ret = -EOPNOTSUPP;
+ goto out_mutex;
+ }
+
+ if (!(mode & FALLOC_FL_KEEP_SIZE) &&
+ offset + len > i_size_read(inode)) {
+ new_size = offset + len;
+ ret = inode_newsize_ok(inode, new_size);
+ if (ret)
+ goto out_mutex;
+ /*
+ * If we have a partial block after EOF we have to allocate
+ * the entire block.
+ */
+ if (partial)
+ max_blocks += 1;
+ }
+
+ if (max_blocks > 0) {
+
+ /* Now release the pages and zero block aligned part of pages*/
+ truncate_pagecache_range(inode, start, end - 1);
+
+ /* Wait all existing dio workers, newcomers will block on i_mutex */
+ ext4_inode_block_unlocked_dio(inode);
+ inode_dio_wait(inode);
+
+ /*
+ * Remove entire range from the extent status tree.
+ */
+ ret = ext4_es_remove_extent(inode, lblk, max_blocks);
+ if (ret)
+ goto out_dio;
+
+ ret = ext4_alloc_file_blocks(file, lblk, max_blocks, flags,
+ mode);
+ if (ret)
+ goto out_dio;
+ }
+
+ handle = ext4_journal_start(inode, EXT4_HT_MISC, 4);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ ext4_std_error(inode->i_sb, ret);
+ goto out_dio;
+ }
+
+ inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
+
+ if (new_size) {
+ if (new_size > i_size_read(inode))
+ i_size_write(inode, new_size);
+ if (new_size > EXT4_I(inode)->i_disksize)
+ ext4_update_i_disksize(inode, new_size);
+ } else {
+ /*
+ * Mark that we allocate beyond EOF so the subsequent truncate
+ * can proceed even if the new size is the same as i_size.
+ */
+ if ((offset + len) > i_size_read(inode))
+ ext4_set_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
+ }
- return needed;
+ ext4_mark_inode_dirty(handle, inode);
+
+ /* Zero out partial block at the edges of the range */
+ ret = ext4_zero_partial_blocks(handle, inode, offset, len);
+
+ if (file->f_flags & O_SYNC)
+ ext4_handle_sync(handle);
+
+ ext4_journal_stop(handle);
+out_dio:
+ ext4_inode_resume_unlocked_dio(inode);
+out_mutex:
+ mutex_unlock(&inode->i_mutex);
+ return ret;
}
/*
- * preallocate space for a file. This implements ext4's fallocate inode
+ * preallocate space for a file. This implements ext4's fallocate file
* operation, which gets called from sys_fallocate system call.
* For block-mapped files, posix_fallocate should fall back to the method
* of writing zeroes to the required new blocks (the same behavior which is
* expected for file systems which do not support fallocate() system call).
*/
-long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len)
+long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
{
+ struct inode *inode = file_inode(file);
handle_t *handle;
- ext4_lblk_t block;
- unsigned long max_blocks;
- ext4_fsblk_t nblocks = 0;
+ loff_t new_size = 0;
+ unsigned int max_blocks;
int ret = 0;
- int ret2 = 0;
- int retries = 0;
- struct buffer_head map_bh;
- unsigned int credits, blkbits = inode->i_blkbits;
+ int flags;
+ ext4_lblk_t lblk;
+ struct timespec tv;
+ unsigned int blkbits = inode->i_blkbits;
+
+ /* Return error if mode is not supported */
+ if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
+ FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE))
+ return -EOPNOTSUPP;
+
+ if (mode & FALLOC_FL_PUNCH_HOLE)
+ return ext4_punch_hole(inode, offset, len);
+
+ ret = ext4_convert_inline_data(inode);
+ if (ret)
+ return ret;
/*
* currently supporting (pre)allocate mode for extent-based
* files _only_
*/
- if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
+ if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
return -EOPNOTSUPP;
- /* preallocation to directories is currently not supported */
- if (S_ISDIR(inode->i_mode))
- return -ENODEV;
+ if (mode & FALLOC_FL_COLLAPSE_RANGE)
+ return ext4_collapse_range(inode, offset, len);
- block = offset >> blkbits;
- max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits)
- - block;
+ if (mode & FALLOC_FL_ZERO_RANGE)
+ return ext4_zero_range(file, offset, len, mode);
+ trace_ext4_fallocate_enter(inode, offset, len, mode);
+ lblk = offset >> blkbits;
/*
- * credits to insert 1 extent into extent tree + buffers to be able to
- * modify 1 super block, 1 block bitmap and 1 group descriptor.
+ * We can't just convert len to max_blocks because
+ * If blocksize = 4096 offset = 3072 and len = 2048
*/
- credits = EXT4_DATA_TRANS_BLOCKS(inode->i_sb) + 3;
+ max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits)
+ - lblk;
+
+ flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT;
+ if (mode & FALLOC_FL_KEEP_SIZE)
+ flags |= EXT4_GET_BLOCKS_KEEP_SIZE;
+
mutex_lock(&inode->i_mutex);
-retry:
- while (ret >= 0 && ret < max_blocks) {
- block = block + ret;
- max_blocks = max_blocks - ret;
- handle = ext4_journal_start(inode, credits);
- if (IS_ERR(handle)) {
- ret = PTR_ERR(handle);
- break;
+
+ if (!(mode & FALLOC_FL_KEEP_SIZE) &&
+ offset + len > i_size_read(inode)) {
+ new_size = offset + len;
+ ret = inode_newsize_ok(inode, new_size);
+ if (ret)
+ goto out;
+ }
+
+ ret = ext4_alloc_file_blocks(file, lblk, max_blocks, flags, mode);
+ if (ret)
+ goto out;
+
+ handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
+ if (IS_ERR(handle))
+ goto out;
+
+ tv = inode->i_ctime = ext4_current_time(inode);
+
+ if (new_size) {
+ if (new_size > i_size_read(inode)) {
+ i_size_write(inode, new_size);
+ inode->i_mtime = tv;
}
+ if (new_size > EXT4_I(inode)->i_disksize)
+ ext4_update_i_disksize(inode, new_size);
+ } else {
+ /*
+ * Mark that we allocate beyond EOF so the subsequent truncate
+ * can proceed even if the new size is the same as i_size.
+ */
+ if ((offset + len) > i_size_read(inode))
+ ext4_set_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
+ }
+ ext4_mark_inode_dirty(handle, inode);
+ if (file->f_flags & O_SYNC)
+ ext4_handle_sync(handle);
- ret = ext4_get_blocks_wrap(handle, inode, block,
- max_blocks, &map_bh,
- EXT4_CREATE_UNINITIALIZED_EXT, 0);
- if (ret <= 0) {
-#ifdef EXT4FS_DEBUG
- WARN_ON(ret <= 0);
- printk(KERN_ERR "%s: ext4_ext_get_blocks "
- "returned error inode#%lu, block=%u, "
- "max_blocks=%lu", __func__,
- inode->i_ino, block, max_blocks);
-#endif
- ext4_mark_inode_dirty(handle, inode);
+ ext4_journal_stop(handle);
+out:
+ mutex_unlock(&inode->i_mutex);
+ trace_ext4_fallocate_exit(inode, offset, max_blocks, ret);
+ return ret;
+}
+
+/*
+ * This function convert a range of blocks to written extents
+ * The caller of this function will pass the start offset and the size.
+ * all unwritten extents within this range will be converted to
+ * written extents.
+ *
+ * This function is called from the direct IO end io call back
+ * function, to convert the fallocated extents after IO is completed.
+ * Returns 0 on success.
+ */
+int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode,
+ loff_t offset, ssize_t len)
+{
+ unsigned int max_blocks;
+ int ret = 0;
+ int ret2 = 0;
+ struct ext4_map_blocks map;
+ unsigned int credits, blkbits = inode->i_blkbits;
+
+ map.m_lblk = offset >> blkbits;
+ /*
+ * We can't just convert len to max_blocks because
+ * If blocksize = 4096 offset = 3072 and len = 2048
+ */
+ max_blocks = ((EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits) -
+ map.m_lblk);
+ /*
+ * This is somewhat ugly but the idea is clear: When transaction is
+ * reserved, everything goes into it. Otherwise we rather start several
+ * smaller transactions for conversion of each extent separately.
+ */
+ if (handle) {
+ handle = ext4_journal_start_reserved(handle,
+ EXT4_HT_EXT_CONVERT);
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
+ credits = 0;
+ } else {
+ /*
+ * credits to insert 1 extent into extent tree
+ */
+ credits = ext4_chunk_trans_blocks(inode, max_blocks);
+ }
+ while (ret >= 0 && ret < max_blocks) {
+ map.m_lblk += ret;
+ map.m_len = (max_blocks -= ret);
+ if (credits) {
+ handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS,
+ credits);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ break;
+ }
+ }
+ ret = ext4_map_blocks(handle, inode, &map,
+ EXT4_GET_BLOCKS_IO_CONVERT_EXT);
+ if (ret <= 0)
+ ext4_warning(inode->i_sb,
+ "inode #%lu: block %u: len %u: "
+ "ext4_ext_map_blocks returned %d",
+ inode->i_ino, map.m_lblk,
+ map.m_len, ret);
+ ext4_mark_inode_dirty(handle, inode);
+ if (credits)
ret2 = ext4_journal_stop(handle);
+ if (ret <= 0 || ret2)
break;
+ }
+ if (!credits)
+ ret2 = ext4_journal_stop(handle);
+ return ret > 0 ? ret2 : ret;
+}
+
+/*
+ * If newes is not existing extent (newes->ec_pblk equals zero) find
+ * delayed extent at start of newes and update newes accordingly and
+ * return start of the next delayed extent.
+ *
+ * If newes is existing extent (newes->ec_pblk is not equal zero)
+ * return start of next delayed extent or EXT_MAX_BLOCKS if no delayed
+ * extent found. Leave newes unmodified.
+ */
+static int ext4_find_delayed_extent(struct inode *inode,
+ struct extent_status *newes)
+{
+ struct extent_status es;
+ ext4_lblk_t block, next_del;
+
+ if (newes->es_pblk == 0) {
+ ext4_es_find_delayed_extent_range(inode, newes->es_lblk,
+ newes->es_lblk + newes->es_len - 1, &es);
+
+ /*
+ * No extent in extent-tree contains block @newes->es_pblk,
+ * then the block may stay in 1)a hole or 2)delayed-extent.
+ */
+ if (es.es_len == 0)
+ /* A hole found. */
+ return 0;
+
+ if (es.es_lblk > newes->es_lblk) {
+ /* A hole found. */
+ newes->es_len = min(es.es_lblk - newes->es_lblk,
+ newes->es_len);
+ return 0;
}
- if (ret > 0) {
- /* check wrap through sign-bit/zero here */
- if ((block + ret) < 0 || (block + ret) < block) {
- ret = -EIO;
- ext4_mark_inode_dirty(handle, inode);
- ret2 = ext4_journal_stop(handle);
- break;
+
+ newes->es_len = es.es_lblk + es.es_len - newes->es_lblk;
+ }
+
+ block = newes->es_lblk + newes->es_len;
+ ext4_es_find_delayed_extent_range(inode, block, EXT_MAX_BLOCKS, &es);
+ if (es.es_len == 0)
+ next_del = EXT_MAX_BLOCKS;
+ else
+ next_del = es.es_lblk;
+
+ return next_del;
+}
+/* fiemap flags we can handle specified here */
+#define EXT4_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR)
+
+static int ext4_xattr_fiemap(struct inode *inode,
+ struct fiemap_extent_info *fieinfo)
+{
+ __u64 physical = 0;
+ __u64 length;
+ __u32 flags = FIEMAP_EXTENT_LAST;
+ int blockbits = inode->i_sb->s_blocksize_bits;
+ int error = 0;
+
+ /* in-inode? */
+ if (ext4_test_inode_state(inode, EXT4_STATE_XATTR)) {
+ struct ext4_iloc iloc;
+ int offset; /* offset of xattr in inode */
+
+ error = ext4_get_inode_loc(inode, &iloc);
+ if (error)
+ return error;
+ physical = (__u64)iloc.bh->b_blocknr << blockbits;
+ offset = EXT4_GOOD_OLD_INODE_SIZE +
+ EXT4_I(inode)->i_extra_isize;
+ physical += offset;
+ length = EXT4_SB(inode->i_sb)->s_inode_size - offset;
+ flags |= FIEMAP_EXTENT_DATA_INLINE;
+ brelse(iloc.bh);
+ } else { /* external block */
+ physical = (__u64)EXT4_I(inode)->i_file_acl << blockbits;
+ length = inode->i_sb->s_blocksize;
+ }
+
+ if (physical)
+ error = fiemap_fill_next_extent(fieinfo, 0, physical,
+ length, flags);
+ return (error < 0 ? error : 0);
+}
+
+int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
+ __u64 start, __u64 len)
+{
+ ext4_lblk_t start_blk;
+ int error = 0;
+
+ if (ext4_has_inline_data(inode)) {
+ int has_inline = 1;
+
+ error = ext4_inline_data_fiemap(inode, fieinfo, &has_inline);
+
+ if (has_inline)
+ return error;
+ }
+
+ if (fieinfo->fi_flags & FIEMAP_FLAG_CACHE) {
+ error = ext4_ext_precache(inode);
+ if (error)
+ return error;
+ }
+
+ /* fallback to generic here if not in extents fmt */
+ if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
+ return generic_block_fiemap(inode, fieinfo, start, len,
+ ext4_get_block);
+
+ if (fiemap_check_flags(fieinfo, EXT4_FIEMAP_FLAGS))
+ return -EBADR;
+
+ if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) {
+ error = ext4_xattr_fiemap(inode, fieinfo);
+ } else {
+ ext4_lblk_t len_blks;
+ __u64 last_blk;
+
+ start_blk = start >> inode->i_sb->s_blocksize_bits;
+ last_blk = (start + len - 1) >> inode->i_sb->s_blocksize_bits;
+ if (last_blk >= EXT_MAX_BLOCKS)
+ last_blk = EXT_MAX_BLOCKS-1;
+ len_blks = ((ext4_lblk_t) last_blk) - start_blk + 1;
+
+ /*
+ * Walk the extent tree gathering extent information
+ * and pushing extents back to the user.
+ */
+ error = ext4_fill_fiemap_extents(inode, start_blk,
+ len_blks, fieinfo);
+ }
+ ext4_es_lru_add(inode);
+ return error;
+}
+
+/*
+ * ext4_access_path:
+ * Function to access the path buffer for marking it dirty.
+ * It also checks if there are sufficient credits left in the journal handle
+ * to update path.
+ */
+static int
+ext4_access_path(handle_t *handle, struct inode *inode,
+ struct ext4_ext_path *path)
+{
+ int credits, err;
+
+ if (!ext4_handle_valid(handle))
+ return 0;
+
+ /*
+ * Check if need to extend journal credits
+ * 3 for leaf, sb, and inode plus 2 (bmap and group
+ * descriptor) for each block group; assume two block
+ * groups
+ */
+ if (handle->h_buffer_credits < 7) {
+ credits = ext4_writepage_trans_blocks(inode);
+ err = ext4_ext_truncate_extend_restart(handle, inode, credits);
+ /* EAGAIN is success */
+ if (err && err != -EAGAIN)
+ return err;
+ }
+
+ err = ext4_ext_get_access(handle, inode, path);
+ return err;
+}
+
+/*
+ * ext4_ext_shift_path_extents:
+ * Shift the extents of a path structure lying between path[depth].p_ext
+ * and EXT_LAST_EXTENT(path[depth].p_hdr) downwards, by subtracting shift
+ * from starting block for each extent.
+ */
+static int
+ext4_ext_shift_path_extents(struct ext4_ext_path *path, ext4_lblk_t shift,
+ struct inode *inode, handle_t *handle,
+ ext4_lblk_t *start)
+{
+ int depth, err = 0;
+ struct ext4_extent *ex_start, *ex_last;
+ bool update = 0;
+ depth = path->p_depth;
+
+ while (depth >= 0) {
+ if (depth == path->p_depth) {
+ ex_start = path[depth].p_ext;
+ if (!ex_start)
+ return -EIO;
+
+ ex_last = EXT_LAST_EXTENT(path[depth].p_hdr);
+ if (!ex_last)
+ return -EIO;
+
+ err = ext4_access_path(handle, inode, path + depth);
+ if (err)
+ goto out;
+
+ if (ex_start == EXT_FIRST_EXTENT(path[depth].p_hdr))
+ update = 1;
+
+ *start = le32_to_cpu(ex_last->ee_block) +
+ ext4_ext_get_actual_len(ex_last);
+
+ while (ex_start <= ex_last) {
+ le32_add_cpu(&ex_start->ee_block, -shift);
+ /* Try to merge to the left. */
+ if ((ex_start >
+ EXT_FIRST_EXTENT(path[depth].p_hdr)) &&
+ ext4_ext_try_to_merge_right(inode,
+ path, ex_start - 1))
+ ex_last--;
+ else
+ ex_start++;
}
- if (buffer_new(&map_bh) && ((block + ret) >
- (EXT4_BLOCK_ALIGN(i_size_read(inode), blkbits)
- >> blkbits)))
- nblocks = nblocks + ret;
+ err = ext4_ext_dirty(handle, inode, path + depth);
+ if (err)
+ goto out;
+
+ if (--depth < 0 || !update)
+ break;
}
- /* Update ctime if new blocks get allocated */
- if (nblocks) {
- struct timespec now;
+ /* Update index too */
+ err = ext4_access_path(handle, inode, path + depth);
+ if (err)
+ goto out;
- now = current_fs_time(inode->i_sb);
- if (!timespec_equal(&inode->i_ctime, &now))
- inode->i_ctime = now;
- }
+ le32_add_cpu(&path[depth].p_idx->ei_block, -shift);
+ err = ext4_ext_dirty(handle, inode, path + depth);
+ if (err)
+ goto out;
- ext4_mark_inode_dirty(handle, inode);
- ret2 = ext4_journal_stop(handle);
- if (ret2)
+ /* we are done if current index is not a starting index */
+ if (path[depth].p_idx != EXT_FIRST_INDEX(path[depth].p_hdr))
break;
+
+ depth--;
}
- if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
- goto retry;
+out:
+ return err;
+}
+
+/*
+ * ext4_ext_shift_extents:
+ * All the extents which lies in the range from start to the last allocated
+ * block for the file are shifted downwards by shift blocks.
+ * On success, 0 is returned, error otherwise.
+ */
+static int
+ext4_ext_shift_extents(struct inode *inode, handle_t *handle,
+ ext4_lblk_t start, ext4_lblk_t shift)
+{
+ struct ext4_ext_path *path;
+ int ret = 0, depth;
+ struct ext4_extent *extent;
+ ext4_lblk_t stop_block, current_block;
+ ext4_lblk_t ex_start, ex_end;
+
+ /* Let path point to the last extent */
+ path = ext4_ext_find_extent(inode, EXT_MAX_BLOCKS - 1, NULL, 0);
+ if (IS_ERR(path))
+ return PTR_ERR(path);
+
+ depth = path->p_depth;
+ extent = path[depth].p_ext;
+ if (!extent) {
+ ext4_ext_drop_refs(path);
+ kfree(path);
+ return ret;
+ }
+
+ stop_block = le32_to_cpu(extent->ee_block) +
+ ext4_ext_get_actual_len(extent);
+ ext4_ext_drop_refs(path);
+ kfree(path);
+
+ /* Nothing to shift, if hole is at the end of file */
+ if (start >= stop_block)
+ return ret;
/*
- * Time to update the file size.
- * Update only when preallocation was requested beyond the file size.
+ * Don't start shifting extents until we make sure the hole is big
+ * enough to accomodate the shift.
*/
- if (!(mode & FALLOC_FL_KEEP_SIZE) &&
- (offset + len) > i_size_read(inode)) {
- if (ret > 0) {
- /*
- * if no error, we assume preallocation succeeded
- * completely
- */
- i_size_write(inode, offset + len);
- EXT4_I(inode)->i_disksize = i_size_read(inode);
- } else if (ret < 0 && nblocks) {
- /* Handle partial allocation scenario */
- loff_t newsize;
+ path = ext4_ext_find_extent(inode, start - 1, NULL, 0);
+ if (IS_ERR(path))
+ return PTR_ERR(path);
+ depth = path->p_depth;
+ extent = path[depth].p_ext;
+ if (extent) {
+ ex_start = le32_to_cpu(extent->ee_block);
+ ex_end = le32_to_cpu(extent->ee_block) +
+ ext4_ext_get_actual_len(extent);
+ } else {
+ ex_start = 0;
+ ex_end = 0;
+ }
+ ext4_ext_drop_refs(path);
+ kfree(path);
+
+ if ((start == ex_start && shift > ex_start) ||
+ (shift > start - ex_end))
+ return -EINVAL;
+
+ /* Its safe to start updating extents */
+ while (start < stop_block) {
+ path = ext4_ext_find_extent(inode, start, NULL, 0);
+ if (IS_ERR(path))
+ return PTR_ERR(path);
+ depth = path->p_depth;
+ extent = path[depth].p_ext;
+ if (!extent) {
+ EXT4_ERROR_INODE(inode, "unexpected hole at %lu",
+ (unsigned long) start);
+ return -EIO;
+ }
- newsize = (nblocks << blkbits) + i_size_read(inode);
- i_size_write(inode, EXT4_BLOCK_ALIGN(newsize, blkbits));
- EXT4_I(inode)->i_disksize = i_size_read(inode);
+ current_block = le32_to_cpu(extent->ee_block);
+ if (start > current_block) {
+ /* Hole, move to the next extent */
+ ret = mext_next_extent(inode, path, &extent);
+ if (ret != 0) {
+ ext4_ext_drop_refs(path);
+ kfree(path);
+ if (ret == 1)
+ ret = 0;
+ break;
+ }
}
+ ret = ext4_ext_shift_path_extents(path, shift, inode,
+ handle, &start);
+ ext4_ext_drop_refs(path);
+ kfree(path);
+ if (ret)
+ break;
+ }
+
+ return ret;
+}
+
+/*
+ * ext4_collapse_range:
+ * This implements the fallocate's collapse range functionality for ext4
+ * Returns: 0 and non-zero on error.
+ */
+int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
+{
+ struct super_block *sb = inode->i_sb;
+ ext4_lblk_t punch_start, punch_stop;
+ handle_t *handle;
+ unsigned int credits;
+ loff_t new_size, ioffset;
+ int ret;
+
+ /* Collapse range works only on fs block size aligned offsets. */
+ if (offset & (EXT4_BLOCK_SIZE(sb) - 1) ||
+ len & (EXT4_BLOCK_SIZE(sb) - 1))
+ return -EINVAL;
+
+ if (!S_ISREG(inode->i_mode))
+ return -EINVAL;
+
+ if (EXT4_SB(inode->i_sb)->s_cluster_ratio > 1)
+ return -EOPNOTSUPP;
+
+ trace_ext4_collapse_range(inode, offset, len);
+
+ punch_start = offset >> EXT4_BLOCK_SIZE_BITS(sb);
+ punch_stop = (offset + len) >> EXT4_BLOCK_SIZE_BITS(sb);
+
+ /* Call ext4_force_commit to flush all data in case of data=journal. */
+ if (ext4_should_journal_data(inode)) {
+ ret = ext4_force_commit(inode->i_sb);
+ if (ret)
+ return ret;
+ }
+
+ /*
+ * Need to round down offset to be aligned with page size boundary
+ * for page size > block size.
+ */
+ ioffset = round_down(offset, PAGE_SIZE);
+
+ /* Write out all dirty pages */
+ ret = filemap_write_and_wait_range(inode->i_mapping, ioffset,
+ LLONG_MAX);
+ if (ret)
+ return ret;
+
+ /* Take mutex lock */
+ mutex_lock(&inode->i_mutex);
+
+ /*
+ * There is no need to overlap collapse range with EOF, in which case
+ * it is effectively a truncate operation
+ */
+ if (offset + len >= i_size_read(inode)) {
+ ret = -EINVAL;
+ goto out_mutex;
+ }
+
+ /* Currently just for extent based files */
+ if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
+ ret = -EOPNOTSUPP;
+ goto out_mutex;
+ }
+
+ truncate_pagecache(inode, ioffset);
+
+ /* Wait for existing dio to complete */
+ ext4_inode_block_unlocked_dio(inode);
+ inode_dio_wait(inode);
+
+ credits = ext4_writepage_trans_blocks(inode);
+ handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ goto out_dio;
+ }
+
+ down_write(&EXT4_I(inode)->i_data_sem);
+ ext4_discard_preallocations(inode);
+
+ ret = ext4_es_remove_extent(inode, punch_start,
+ EXT_MAX_BLOCKS - punch_start);
+ if (ret) {
+ up_write(&EXT4_I(inode)->i_data_sem);
+ goto out_stop;
}
+ ret = ext4_ext_remove_space(inode, punch_start, punch_stop - 1);
+ if (ret) {
+ up_write(&EXT4_I(inode)->i_data_sem);
+ goto out_stop;
+ }
+ ext4_discard_preallocations(inode);
+
+ ret = ext4_ext_shift_extents(inode, handle, punch_stop,
+ punch_stop - punch_start);
+ if (ret) {
+ up_write(&EXT4_I(inode)->i_data_sem);
+ goto out_stop;
+ }
+
+ new_size = i_size_read(inode) - len;
+ i_size_write(inode, new_size);
+ EXT4_I(inode)->i_disksize = new_size;
+
+ up_write(&EXT4_I(inode)->i_data_sem);
+ if (IS_SYNC(inode))
+ ext4_handle_sync(handle);
+ inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
+ ext4_mark_inode_dirty(handle, inode);
+
+out_stop:
+ ext4_journal_stop(handle);
+out_dio:
+ ext4_inode_resume_unlocked_dio(inode);
+out_mutex:
mutex_unlock(&inode->i_mutex);
- return ret > 0 ? ret2 : ret;
+ return ret;
}
diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
new file mode 100644
index 00000000000..0b7e28e7eaa
--- /dev/null
+++ b/fs/ext4/extents_status.c
@@ -0,0 +1,1127 @@
+/*
+ * fs/ext4/extents_status.c
+ *
+ * Written by Yongqiang Yang <xiaoqiangnk@gmail.com>
+ * Modified by
+ * Allison Henderson <achender@linux.vnet.ibm.com>
+ * Hugh Dickins <hughd@google.com>
+ * Zheng Liu <wenqing.lz@taobao.com>
+ *
+ * Ext4 extents status tree core functions.
+ */
+#include <linux/rbtree.h>
+#include <linux/list_sort.h>
+#include "ext4.h"
+#include "extents_status.h"
+
+#include <trace/events/ext4.h>
+
+/*
+ * According to previous discussion in Ext4 Developer Workshop, we
+ * will introduce a new structure called io tree to track all extent
+ * status in order to solve some problems that we have met
+ * (e.g. Reservation space warning), and provide extent-level locking.
+ * Delay extent tree is the first step to achieve this goal. It is
+ * original built by Yongqiang Yang. At that time it is called delay
+ * extent tree, whose goal is only track delayed extents in memory to
+ * simplify the implementation of fiemap and bigalloc, and introduce
+ * lseek SEEK_DATA/SEEK_HOLE support. That is why it is still called
+ * delay extent tree at the first commit. But for better understand
+ * what it does, it has been rename to extent status tree.
+ *
+ * Step1:
+ * Currently the first step has been done. All delayed extents are
+ * tracked in the tree. It maintains the delayed extent when a delayed
+ * allocation is issued, and the delayed extent is written out or
+ * invalidated. Therefore the implementation of fiemap and bigalloc
+ * are simplified, and SEEK_DATA/SEEK_HOLE are introduced.
+ *
+ * The following comment describes the implemenmtation of extent
+ * status tree and future works.
+ *
+ * Step2:
+ * In this step all extent status are tracked by extent status tree.
+ * Thus, we can first try to lookup a block mapping in this tree before
+ * finding it in extent tree. Hence, single extent cache can be removed
+ * because extent status tree can do a better job. Extents in status
+ * tree are loaded on-demand. Therefore, the extent status tree may not
+ * contain all of the extents in a file. Meanwhile we define a shrinker
+ * to reclaim memory from extent status tree because fragmented extent
+ * tree will make status tree cost too much memory. written/unwritten/-
+ * hole extents in the tree will be reclaimed by this shrinker when we
+ * are under high memory pressure. Delayed extents will not be
+ * reclimed because fiemap, bigalloc, and seek_data/hole need it.
+ */
+
+/*
+ * Extent status tree implementation for ext4.
+ *
+ *
+ * ==========================================================================
+ * Extent status tree tracks all extent status.
+ *
+ * 1. Why we need to implement extent status tree?
+ *
+ * Without extent status tree, ext4 identifies a delayed extent by looking
+ * up page cache, this has several deficiencies - complicated, buggy,
+ * and inefficient code.
+ *
+ * FIEMAP, SEEK_HOLE/DATA, bigalloc, and writeout all need to know if a
+ * block or a range of blocks are belonged to a delayed extent.
+ *
+ * Let us have a look at how they do without extent status tree.
+ * -- FIEMAP
+ * FIEMAP looks up page cache to identify delayed allocations from holes.
+ *
+ * -- SEEK_HOLE/DATA
+ * SEEK_HOLE/DATA has the same problem as FIEMAP.
+ *
+ * -- bigalloc
+ * bigalloc looks up page cache to figure out if a block is
+ * already under delayed allocation or not to determine whether
+ * quota reserving is needed for the cluster.
+ *
+ * -- writeout
+ * Writeout looks up whole page cache to see if a buffer is
+ * mapped, If there are not very many delayed buffers, then it is
+ * time comsuming.
+ *
+ * With extent status tree implementation, FIEMAP, SEEK_HOLE/DATA,
+ * bigalloc and writeout can figure out if a block or a range of
+ * blocks is under delayed allocation(belonged to a delayed extent) or
+ * not by searching the extent tree.
+ *
+ *
+ * ==========================================================================
+ * 2. Ext4 extent status tree impelmentation
+ *
+ * -- extent
+ * A extent is a range of blocks which are contiguous logically and
+ * physically. Unlike extent in extent tree, this extent in ext4 is
+ * a in-memory struct, there is no corresponding on-disk data. There
+ * is no limit on length of extent, so an extent can contain as many
+ * blocks as they are contiguous logically and physically.
+ *
+ * -- extent status tree
+ * Every inode has an extent status tree and all allocation blocks
+ * are added to the tree with different status. The extent in the
+ * tree are ordered by logical block no.
+ *
+ * -- operations on a extent status tree
+ * There are three important operations on a delayed extent tree: find
+ * next extent, adding a extent(a range of blocks) and removing a extent.
+ *
+ * -- race on a extent status tree
+ * Extent status tree is protected by inode->i_es_lock.
+ *
+ * -- memory consumption
+ * Fragmented extent tree will make extent status tree cost too much
+ * memory. Hence, we will reclaim written/unwritten/hole extents from
+ * the tree under a heavy memory pressure.
+ *
+ *
+ * ==========================================================================
+ * 3. Performance analysis
+ *
+ * -- overhead
+ * 1. There is a cache extent for write access, so if writes are
+ * not very random, adding space operaions are in O(1) time.
+ *
+ * -- gain
+ * 2. Code is much simpler, more readable, more maintainable and
+ * more efficient.
+ *
+ *
+ * ==========================================================================
+ * 4. TODO list
+ *
+ * -- Refactor delayed space reservation
+ *
+ * -- Extent-level locking
+ */
+
+static struct kmem_cache *ext4_es_cachep;
+
+static int __es_insert_extent(struct inode *inode, struct extent_status *newes);
+static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
+ ext4_lblk_t end);
+static int __es_try_to_reclaim_extents(struct ext4_inode_info *ei,
+ int nr_to_scan);
+static int __ext4_es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
+ struct ext4_inode_info *locked_ei);
+
+int __init ext4_init_es(void)
+{
+ ext4_es_cachep = kmem_cache_create("ext4_extent_status",
+ sizeof(struct extent_status),
+ 0, (SLAB_RECLAIM_ACCOUNT), NULL);
+ if (ext4_es_cachep == NULL)
+ return -ENOMEM;
+ return 0;
+}
+
+void ext4_exit_es(void)
+{
+ if (ext4_es_cachep)
+ kmem_cache_destroy(ext4_es_cachep);
+}
+
+void ext4_es_init_tree(struct ext4_es_tree *tree)
+{
+ tree->root = RB_ROOT;
+ tree->cache_es = NULL;
+}
+
+#ifdef ES_DEBUG__
+static void ext4_es_print_tree(struct inode *inode)
+{
+ struct ext4_es_tree *tree;
+ struct rb_node *node;
+
+ printk(KERN_DEBUG "status extents for inode %lu:", inode->i_ino);
+ tree = &EXT4_I(inode)->i_es_tree;
+ node = rb_first(&tree->root);
+ while (node) {
+ struct extent_status *es;
+ es = rb_entry(node, struct extent_status, rb_node);
+ printk(KERN_DEBUG " [%u/%u) %llu %x",
+ es->es_lblk, es->es_len,
+ ext4_es_pblock(es), ext4_es_status(es));
+ node = rb_next(node);
+ }
+ printk(KERN_DEBUG "\n");
+}
+#else
+#define ext4_es_print_tree(inode)
+#endif
+
+static inline ext4_lblk_t ext4_es_end(struct extent_status *es)
+{
+ BUG_ON(es->es_lblk + es->es_len < es->es_lblk);
+ return es->es_lblk + es->es_len - 1;
+}
+
+/*
+ * search through the tree for an delayed extent with a given offset. If
+ * it can't be found, try to find next extent.
+ */
+static struct extent_status *__es_tree_search(struct rb_root *root,
+ ext4_lblk_t lblk)
+{
+ struct rb_node *node = root->rb_node;
+ struct extent_status *es = NULL;
+
+ while (node) {
+ es = rb_entry(node, struct extent_status, rb_node);
+ if (lblk < es->es_lblk)
+ node = node->rb_left;
+ else if (lblk > ext4_es_end(es))
+ node = node->rb_right;
+ else
+ return es;
+ }
+
+ if (es && lblk < es->es_lblk)
+ return es;
+
+ if (es && lblk > ext4_es_end(es)) {
+ node = rb_next(&es->rb_node);
+ return node ? rb_entry(node, struct extent_status, rb_node) :
+ NULL;
+ }
+
+ return NULL;
+}
+
+/*
+ * ext4_es_find_delayed_extent_range: find the 1st delayed extent covering
+ * @es->lblk if it exists, otherwise, the next extent after @es->lblk.
+ *
+ * @inode: the inode which owns delayed extents
+ * @lblk: the offset where we start to search
+ * @end: the offset where we stop to search
+ * @es: delayed extent that we found
+ */
+void ext4_es_find_delayed_extent_range(struct inode *inode,
+ ext4_lblk_t lblk, ext4_lblk_t end,
+ struct extent_status *es)
+{
+ struct ext4_es_tree *tree = NULL;
+ struct extent_status *es1 = NULL;
+ struct rb_node *node;
+
+ BUG_ON(es == NULL);
+ BUG_ON(end < lblk);
+ trace_ext4_es_find_delayed_extent_range_enter(inode, lblk);
+
+ read_lock(&EXT4_I(inode)->i_es_lock);
+ tree = &EXT4_I(inode)->i_es_tree;
+
+ /* find extent in cache firstly */
+ es->es_lblk = es->es_len = es->es_pblk = 0;
+ if (tree->cache_es) {
+ es1 = tree->cache_es;
+ if (in_range(lblk, es1->es_lblk, es1->es_len)) {
+ es_debug("%u cached by [%u/%u) %llu %x\n",
+ lblk, es1->es_lblk, es1->es_len,
+ ext4_es_pblock(es1), ext4_es_status(es1));
+ goto out;
+ }
+ }
+
+ es1 = __es_tree_search(&tree->root, lblk);
+
+out:
+ if (es1 && !ext4_es_is_delayed(es1)) {
+ while ((node = rb_next(&es1->rb_node)) != NULL) {
+ es1 = rb_entry(node, struct extent_status, rb_node);
+ if (es1->es_lblk > end) {
+ es1 = NULL;
+ break;
+ }
+ if (ext4_es_is_delayed(es1))
+ break;
+ }
+ }
+
+ if (es1 && ext4_es_is_delayed(es1)) {
+ tree->cache_es = es1;
+ es->es_lblk = es1->es_lblk;
+ es->es_len = es1->es_len;
+ es->es_pblk = es1->es_pblk;
+ }
+
+ read_unlock(&EXT4_I(inode)->i_es_lock);
+
+ trace_ext4_es_find_delayed_extent_range_exit(inode, es);
+}
+
+static struct extent_status *
+ext4_es_alloc_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len,
+ ext4_fsblk_t pblk)
+{
+ struct extent_status *es;
+ es = kmem_cache_alloc(ext4_es_cachep, GFP_ATOMIC);
+ if (es == NULL)
+ return NULL;
+ es->es_lblk = lblk;
+ es->es_len = len;
+ es->es_pblk = pblk;
+
+ /*
+ * We don't count delayed extent because we never try to reclaim them
+ */
+ if (!ext4_es_is_delayed(es)) {
+ EXT4_I(inode)->i_es_lru_nr++;
+ percpu_counter_inc(&EXT4_SB(inode->i_sb)->s_extent_cache_cnt);
+ }
+
+ return es;
+}
+
+static void ext4_es_free_extent(struct inode *inode, struct extent_status *es)
+{
+ /* Decrease the lru counter when this es is not delayed */
+ if (!ext4_es_is_delayed(es)) {
+ BUG_ON(EXT4_I(inode)->i_es_lru_nr == 0);
+ EXT4_I(inode)->i_es_lru_nr--;
+ percpu_counter_dec(&EXT4_SB(inode->i_sb)->s_extent_cache_cnt);
+ }
+
+ kmem_cache_free(ext4_es_cachep, es);
+}
+
+/*
+ * Check whether or not two extents can be merged
+ * Condition:
+ * - logical block number is contiguous
+ * - physical block number is contiguous
+ * - status is equal
+ */
+static int ext4_es_can_be_merged(struct extent_status *es1,
+ struct extent_status *es2)
+{
+ if (ext4_es_status(es1) != ext4_es_status(es2))
+ return 0;
+
+ if (((__u64) es1->es_len) + es2->es_len > EXT_MAX_BLOCKS) {
+ pr_warn("ES assertion failed when merging extents. "
+ "The sum of lengths of es1 (%d) and es2 (%d) "
+ "is bigger than allowed file size (%d)\n",
+ es1->es_len, es2->es_len, EXT_MAX_BLOCKS);
+ WARN_ON(1);
+ return 0;
+ }
+
+ if (((__u64) es1->es_lblk) + es1->es_len != es2->es_lblk)
+ return 0;
+
+ if ((ext4_es_is_written(es1) || ext4_es_is_unwritten(es1)) &&
+ (ext4_es_pblock(es1) + es1->es_len == ext4_es_pblock(es2)))
+ return 1;
+
+ if (ext4_es_is_hole(es1))
+ return 1;
+
+ /* we need to check delayed extent is without unwritten status */
+ if (ext4_es_is_delayed(es1) && !ext4_es_is_unwritten(es1))
+ return 1;
+
+ return 0;
+}
+
+static struct extent_status *
+ext4_es_try_to_merge_left(struct inode *inode, struct extent_status *es)
+{
+ struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree;
+ struct extent_status *es1;
+ struct rb_node *node;
+
+ node = rb_prev(&es->rb_node);
+ if (!node)
+ return es;
+
+ es1 = rb_entry(node, struct extent_status, rb_node);
+ if (ext4_es_can_be_merged(es1, es)) {
+ es1->es_len += es->es_len;
+ rb_erase(&es->rb_node, &tree->root);
+ ext4_es_free_extent(inode, es);
+ es = es1;
+ }
+
+ return es;
+}
+
+static struct extent_status *
+ext4_es_try_to_merge_right(struct inode *inode, struct extent_status *es)
+{
+ struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree;
+ struct extent_status *es1;
+ struct rb_node *node;
+
+ node = rb_next(&es->rb_node);
+ if (!node)
+ return es;
+
+ es1 = rb_entry(node, struct extent_status, rb_node);
+ if (ext4_es_can_be_merged(es, es1)) {
+ es->es_len += es1->es_len;
+ rb_erase(node, &tree->root);
+ ext4_es_free_extent(inode, es1);
+ }
+
+ return es;
+}
+
+#ifdef ES_AGGRESSIVE_TEST
+#include "ext4_extents.h" /* Needed when ES_AGGRESSIVE_TEST is defined */
+
+static void ext4_es_insert_extent_ext_check(struct inode *inode,
+ struct extent_status *es)
+{
+ struct ext4_ext_path *path = NULL;
+ struct ext4_extent *ex;
+ ext4_lblk_t ee_block;
+ ext4_fsblk_t ee_start;
+ unsigned short ee_len;
+ int depth, ee_status, es_status;
+
+ path = ext4_ext_find_extent(inode, es->es_lblk, NULL, EXT4_EX_NOCACHE);
+ if (IS_ERR(path))
+ return;
+
+ depth = ext_depth(inode);
+ ex = path[depth].p_ext;
+
+ if (ex) {
+
+ ee_block = le32_to_cpu(ex->ee_block);
+ ee_start = ext4_ext_pblock(ex);
+ ee_len = ext4_ext_get_actual_len(ex);
+
+ ee_status = ext4_ext_is_unwritten(ex) ? 1 : 0;
+ es_status = ext4_es_is_unwritten(es) ? 1 : 0;
+
+ /*
+ * Make sure ex and es are not overlap when we try to insert
+ * a delayed/hole extent.
+ */
+ if (!ext4_es_is_written(es) && !ext4_es_is_unwritten(es)) {
+ if (in_range(es->es_lblk, ee_block, ee_len)) {
+ pr_warn("ES insert assertion failed for "
+ "inode: %lu we can find an extent "
+ "at block [%d/%d/%llu/%c], but we "
+ "want to add a delayed/hole extent "
+ "[%d/%d/%llu/%x]\n",
+ inode->i_ino, ee_block, ee_len,
+ ee_start, ee_status ? 'u' : 'w',
+ es->es_lblk, es->es_len,
+ ext4_es_pblock(es), ext4_es_status(es));
+ }
+ goto out;
+ }
+
+ /*
+ * We don't check ee_block == es->es_lblk, etc. because es
+ * might be a part of whole extent, vice versa.
+ */
+ if (es->es_lblk < ee_block ||
+ ext4_es_pblock(es) != ee_start + es->es_lblk - ee_block) {
+ pr_warn("ES insert assertion failed for inode: %lu "
+ "ex_status [%d/%d/%llu/%c] != "
+ "es_status [%d/%d/%llu/%c]\n", inode->i_ino,
+ ee_block, ee_len, ee_start,
+ ee_status ? 'u' : 'w', es->es_lblk, es->es_len,
+ ext4_es_pblock(es), es_status ? 'u' : 'w');
+ goto out;
+ }
+
+ if (ee_status ^ es_status) {
+ pr_warn("ES insert assertion failed for inode: %lu "
+ "ex_status [%d/%d/%llu/%c] != "
+ "es_status [%d/%d/%llu/%c]\n", inode->i_ino,
+ ee_block, ee_len, ee_start,
+ ee_status ? 'u' : 'w', es->es_lblk, es->es_len,
+ ext4_es_pblock(es), es_status ? 'u' : 'w');
+ }
+ } else {
+ /*
+ * We can't find an extent on disk. So we need to make sure
+ * that we don't want to add an written/unwritten extent.
+ */
+ if (!ext4_es_is_delayed(es) && !ext4_es_is_hole(es)) {
+ pr_warn("ES insert assertion failed for inode: %lu "
+ "can't find an extent at block %d but we want "
+ "to add a written/unwritten extent "
+ "[%d/%d/%llu/%x]\n", inode->i_ino,
+ es->es_lblk, es->es_lblk, es->es_len,
+ ext4_es_pblock(es), ext4_es_status(es));
+ }
+ }
+out:
+ if (path) {
+ ext4_ext_drop_refs(path);
+ kfree(path);
+ }
+}
+
+static void ext4_es_insert_extent_ind_check(struct inode *inode,
+ struct extent_status *es)
+{
+ struct ext4_map_blocks map;
+ int retval;
+
+ /*
+ * Here we call ext4_ind_map_blocks to lookup a block mapping because
+ * 'Indirect' structure is defined in indirect.c. So we couldn't
+ * access direct/indirect tree from outside. It is too dirty to define
+ * this function in indirect.c file.
+ */
+
+ map.m_lblk = es->es_lblk;
+ map.m_len = es->es_len;
+
+ retval = ext4_ind_map_blocks(NULL, inode, &map, 0);
+ if (retval > 0) {
+ if (ext4_es_is_delayed(es) || ext4_es_is_hole(es)) {
+ /*
+ * We want to add a delayed/hole extent but this
+ * block has been allocated.
+ */
+ pr_warn("ES insert assertion failed for inode: %lu "
+ "We can find blocks but we want to add a "
+ "delayed/hole extent [%d/%d/%llu/%x]\n",
+ inode->i_ino, es->es_lblk, es->es_len,
+ ext4_es_pblock(es), ext4_es_status(es));
+ return;
+ } else if (ext4_es_is_written(es)) {
+ if (retval != es->es_len) {
+ pr_warn("ES insert assertion failed for "
+ "inode: %lu retval %d != es_len %d\n",
+ inode->i_ino, retval, es->es_len);
+ return;
+ }
+ if (map.m_pblk != ext4_es_pblock(es)) {
+ pr_warn("ES insert assertion failed for "
+ "inode: %lu m_pblk %llu != "
+ "es_pblk %llu\n",
+ inode->i_ino, map.m_pblk,
+ ext4_es_pblock(es));
+ return;
+ }
+ } else {
+ /*
+ * We don't need to check unwritten extent because
+ * indirect-based file doesn't have it.
+ */
+ BUG_ON(1);
+ }
+ } else if (retval == 0) {
+ if (ext4_es_is_written(es)) {
+ pr_warn("ES insert assertion failed for inode: %lu "
+ "We can't find the block but we want to add "
+ "a written extent [%d/%d/%llu/%x]\n",
+ inode->i_ino, es->es_lblk, es->es_len,
+ ext4_es_pblock(es), ext4_es_status(es));
+ return;
+ }
+ }
+}
+
+static inline void ext4_es_insert_extent_check(struct inode *inode,
+ struct extent_status *es)
+{
+ /*
+ * We don't need to worry about the race condition because
+ * caller takes i_data_sem locking.
+ */
+ BUG_ON(!rwsem_is_locked(&EXT4_I(inode)->i_data_sem));
+ if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
+ ext4_es_insert_extent_ext_check(inode, es);
+ else
+ ext4_es_insert_extent_ind_check(inode, es);
+}
+#else
+static inline void ext4_es_insert_extent_check(struct inode *inode,
+ struct extent_status *es)
+{
+}
+#endif
+
+static int __es_insert_extent(struct inode *inode, struct extent_status *newes)
+{
+ struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree;
+ struct rb_node **p = &tree->root.rb_node;
+ struct rb_node *parent = NULL;
+ struct extent_status *es;
+
+ while (*p) {
+ parent = *p;
+ es = rb_entry(parent, struct extent_status, rb_node);
+
+ if (newes->es_lblk < es->es_lblk) {
+ if (ext4_es_can_be_merged(newes, es)) {
+ /*
+ * Here we can modify es_lblk directly
+ * because it isn't overlapped.
+ */
+ es->es_lblk = newes->es_lblk;
+ es->es_len += newes->es_len;
+ if (ext4_es_is_written(es) ||
+ ext4_es_is_unwritten(es))
+ ext4_es_store_pblock(es,
+ newes->es_pblk);
+ es = ext4_es_try_to_merge_left(inode, es);
+ goto out;
+ }
+ p = &(*p)->rb_left;
+ } else if (newes->es_lblk > ext4_es_end(es)) {
+ if (ext4_es_can_be_merged(es, newes)) {
+ es->es_len += newes->es_len;
+ es = ext4_es_try_to_merge_right(inode, es);
+ goto out;
+ }
+ p = &(*p)->rb_right;
+ } else {
+ BUG_ON(1);
+ return -EINVAL;
+ }
+ }
+
+ es = ext4_es_alloc_extent(inode, newes->es_lblk, newes->es_len,
+ newes->es_pblk);
+ if (!es)
+ return -ENOMEM;
+ rb_link_node(&es->rb_node, parent, p);
+ rb_insert_color(&es->rb_node, &tree->root);
+
+out:
+ tree->cache_es = es;
+ return 0;
+}
+
+/*
+ * ext4_es_insert_extent() adds information to an inode's extent
+ * status tree.
+ *
+ * Return 0 on success, error code on failure.
+ */
+int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
+ ext4_lblk_t len, ext4_fsblk_t pblk,
+ unsigned int status)
+{
+ struct extent_status newes;
+ ext4_lblk_t end = lblk + len - 1;
+ int err = 0;
+
+ es_debug("add [%u/%u) %llu %x to extent status tree of inode %lu\n",
+ lblk, len, pblk, status, inode->i_ino);
+
+ if (!len)
+ return 0;
+
+ BUG_ON(end < lblk);
+
+ newes.es_lblk = lblk;
+ newes.es_len = len;
+ ext4_es_store_pblock_status(&newes, pblk, status);
+ trace_ext4_es_insert_extent(inode, &newes);
+
+ ext4_es_insert_extent_check(inode, &newes);
+
+ write_lock(&EXT4_I(inode)->i_es_lock);
+ err = __es_remove_extent(inode, lblk, end);
+ if (err != 0)
+ goto error;
+retry:
+ err = __es_insert_extent(inode, &newes);
+ if (err == -ENOMEM && __ext4_es_shrink(EXT4_SB(inode->i_sb), 1,
+ EXT4_I(inode)))
+ goto retry;
+ if (err == -ENOMEM && !ext4_es_is_delayed(&newes))
+ err = 0;
+
+error:
+ write_unlock(&EXT4_I(inode)->i_es_lock);
+
+ ext4_es_print_tree(inode);
+
+ return err;
+}
+
+/*
+ * ext4_es_cache_extent() inserts information into the extent status
+ * tree if and only if there isn't information about the range in
+ * question already.
+ */
+void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk,
+ ext4_lblk_t len, ext4_fsblk_t pblk,
+ unsigned int status)
+{
+ struct extent_status *es;
+ struct extent_status newes;
+ ext4_lblk_t end = lblk + len - 1;
+
+ newes.es_lblk = lblk;
+ newes.es_len = len;
+ ext4_es_store_pblock_status(&newes, pblk, status);
+ trace_ext4_es_cache_extent(inode, &newes);
+
+ if (!len)
+ return;
+
+ BUG_ON(end < lblk);
+
+ write_lock(&EXT4_I(inode)->i_es_lock);
+
+ es = __es_tree_search(&EXT4_I(inode)->i_es_tree.root, lblk);
+ if (!es || es->es_lblk > end)
+ __es_insert_extent(inode, &newes);
+ write_unlock(&EXT4_I(inode)->i_es_lock);
+}
+
+/*
+ * ext4_es_lookup_extent() looks up an extent in extent status tree.
+ *
+ * ext4_es_lookup_extent is called by ext4_map_blocks/ext4_da_map_blocks.
+ *
+ * Return: 1 on found, 0 on not
+ */
+int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk,
+ struct extent_status *es)
+{
+ struct ext4_es_tree *tree;
+ struct extent_status *es1 = NULL;
+ struct rb_node *node;
+ int found = 0;
+
+ trace_ext4_es_lookup_extent_enter(inode, lblk);
+ es_debug("lookup extent in block %u\n", lblk);
+
+ tree = &EXT4_I(inode)->i_es_tree;
+ read_lock(&EXT4_I(inode)->i_es_lock);
+
+ /* find extent in cache firstly */
+ es->es_lblk = es->es_len = es->es_pblk = 0;
+ if (tree->cache_es) {
+ es1 = tree->cache_es;
+ if (in_range(lblk, es1->es_lblk, es1->es_len)) {
+ es_debug("%u cached by [%u/%u)\n",
+ lblk, es1->es_lblk, es1->es_len);
+ found = 1;
+ goto out;
+ }
+ }
+
+ node = tree->root.rb_node;
+ while (node) {
+ es1 = rb_entry(node, struct extent_status, rb_node);
+ if (lblk < es1->es_lblk)
+ node = node->rb_left;
+ else if (lblk > ext4_es_end(es1))
+ node = node->rb_right;
+ else {
+ found = 1;
+ break;
+ }
+ }
+
+out:
+ if (found) {
+ BUG_ON(!es1);
+ es->es_lblk = es1->es_lblk;
+ es->es_len = es1->es_len;
+ es->es_pblk = es1->es_pblk;
+ }
+
+ read_unlock(&EXT4_I(inode)->i_es_lock);
+
+ trace_ext4_es_lookup_extent_exit(inode, es, found);
+ return found;
+}
+
+static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
+ ext4_lblk_t end)
+{
+ struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree;
+ struct rb_node *node;
+ struct extent_status *es;
+ struct extent_status orig_es;
+ ext4_lblk_t len1, len2;
+ ext4_fsblk_t block;
+ int err;
+
+retry:
+ err = 0;
+ es = __es_tree_search(&tree->root, lblk);
+ if (!es)
+ goto out;
+ if (es->es_lblk > end)
+ goto out;
+
+ /* Simply invalidate cache_es. */
+ tree->cache_es = NULL;
+
+ orig_es.es_lblk = es->es_lblk;
+ orig_es.es_len = es->es_len;
+ orig_es.es_pblk = es->es_pblk;
+
+ len1 = lblk > es->es_lblk ? lblk - es->es_lblk : 0;
+ len2 = ext4_es_end(es) > end ? ext4_es_end(es) - end : 0;
+ if (len1 > 0)
+ es->es_len = len1;
+ if (len2 > 0) {
+ if (len1 > 0) {
+ struct extent_status newes;
+
+ newes.es_lblk = end + 1;
+ newes.es_len = len2;
+ block = 0x7FDEADBEEFULL;
+ if (ext4_es_is_written(&orig_es) ||
+ ext4_es_is_unwritten(&orig_es))
+ block = ext4_es_pblock(&orig_es) +
+ orig_es.es_len - len2;
+ ext4_es_store_pblock_status(&newes, block,
+ ext4_es_status(&orig_es));
+ err = __es_insert_extent(inode, &newes);
+ if (err) {
+ es->es_lblk = orig_es.es_lblk;
+ es->es_len = orig_es.es_len;
+ if ((err == -ENOMEM) &&
+ __ext4_es_shrink(EXT4_SB(inode->i_sb), 1,
+ EXT4_I(inode)))
+ goto retry;
+ goto out;
+ }
+ } else {
+ es->es_lblk = end + 1;
+ es->es_len = len2;
+ if (ext4_es_is_written(es) ||
+ ext4_es_is_unwritten(es)) {
+ block = orig_es.es_pblk + orig_es.es_len - len2;
+ ext4_es_store_pblock(es, block);
+ }
+ }
+ goto out;
+ }
+
+ if (len1 > 0) {
+ node = rb_next(&es->rb_node);
+ if (node)
+ es = rb_entry(node, struct extent_status, rb_node);
+ else
+ es = NULL;
+ }
+
+ while (es && ext4_es_end(es) <= end) {
+ node = rb_next(&es->rb_node);
+ rb_erase(&es->rb_node, &tree->root);
+ ext4_es_free_extent(inode, es);
+ if (!node) {
+ es = NULL;
+ break;
+ }
+ es = rb_entry(node, struct extent_status, rb_node);
+ }
+
+ if (es && es->es_lblk < end + 1) {
+ ext4_lblk_t orig_len = es->es_len;
+
+ len1 = ext4_es_end(es) - end;
+ es->es_lblk = end + 1;
+ es->es_len = len1;
+ if (ext4_es_is_written(es) || ext4_es_is_unwritten(es)) {
+ block = es->es_pblk + orig_len - len1;
+ ext4_es_store_pblock(es, block);
+ }
+ }
+
+out:
+ return err;
+}
+
+/*
+ * ext4_es_remove_extent() removes a space from a extent status tree.
+ *
+ * Return 0 on success, error code on failure.
+ */
+int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
+ ext4_lblk_t len)
+{
+ ext4_lblk_t end;
+ int err = 0;
+
+ trace_ext4_es_remove_extent(inode, lblk, len);
+ es_debug("remove [%u/%u) from extent status tree of inode %lu\n",
+ lblk, len, inode->i_ino);
+
+ if (!len)
+ return err;
+
+ end = lblk + len - 1;
+ BUG_ON(end < lblk);
+
+ write_lock(&EXT4_I(inode)->i_es_lock);
+ err = __es_remove_extent(inode, lblk, end);
+ write_unlock(&EXT4_I(inode)->i_es_lock);
+ ext4_es_print_tree(inode);
+ return err;
+}
+
+static int ext4_inode_touch_time_cmp(void *priv, struct list_head *a,
+ struct list_head *b)
+{
+ struct ext4_inode_info *eia, *eib;
+ eia = list_entry(a, struct ext4_inode_info, i_es_lru);
+ eib = list_entry(b, struct ext4_inode_info, i_es_lru);
+
+ if (ext4_test_inode_state(&eia->vfs_inode, EXT4_STATE_EXT_PRECACHED) &&
+ !ext4_test_inode_state(&eib->vfs_inode, EXT4_STATE_EXT_PRECACHED))
+ return 1;
+ if (!ext4_test_inode_state(&eia->vfs_inode, EXT4_STATE_EXT_PRECACHED) &&
+ ext4_test_inode_state(&eib->vfs_inode, EXT4_STATE_EXT_PRECACHED))
+ return -1;
+ if (eia->i_touch_when == eib->i_touch_when)
+ return 0;
+ if (time_after(eia->i_touch_when, eib->i_touch_when))
+ return 1;
+ else
+ return -1;
+}
+
+static int __ext4_es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
+ struct ext4_inode_info *locked_ei)
+{
+ struct ext4_inode_info *ei;
+ struct list_head *cur, *tmp;
+ LIST_HEAD(skipped);
+ int nr_shrunk = 0;
+ int retried = 0, skip_precached = 1, nr_skipped = 0;
+
+ spin_lock(&sbi->s_es_lru_lock);
+
+retry:
+ list_for_each_safe(cur, tmp, &sbi->s_es_lru) {
+ int shrunk;
+
+ /*
+ * If we have already reclaimed all extents from extent
+ * status tree, just stop the loop immediately.
+ */
+ if (percpu_counter_read_positive(&sbi->s_extent_cache_cnt) == 0)
+ break;
+
+ ei = list_entry(cur, struct ext4_inode_info, i_es_lru);
+
+ /*
+ * Skip the inode that is newer than the last_sorted
+ * time. Normally we try hard to avoid shrinking
+ * precached inodes, but we will as a last resort.
+ */
+ if ((sbi->s_es_last_sorted < ei->i_touch_when) ||
+ (skip_precached && ext4_test_inode_state(&ei->vfs_inode,
+ EXT4_STATE_EXT_PRECACHED))) {
+ nr_skipped++;
+ list_move_tail(cur, &skipped);
+ continue;
+ }
+
+ if (ei->i_es_lru_nr == 0 || ei == locked_ei ||
+ !write_trylock(&ei->i_es_lock))
+ continue;
+
+ shrunk = __es_try_to_reclaim_extents(ei, nr_to_scan);
+ if (ei->i_es_lru_nr == 0)
+ list_del_init(&ei->i_es_lru);
+ write_unlock(&ei->i_es_lock);
+
+ nr_shrunk += shrunk;
+ nr_to_scan -= shrunk;
+ if (nr_to_scan == 0)
+ break;
+ }
+
+ /* Move the newer inodes into the tail of the LRU list. */
+ list_splice_tail(&skipped, &sbi->s_es_lru);
+ INIT_LIST_HEAD(&skipped);
+
+ /*
+ * If we skipped any inodes, and we weren't able to make any
+ * forward progress, sort the list and try again.
+ */
+ if ((nr_shrunk == 0) && nr_skipped && !retried) {
+ retried++;
+ list_sort(NULL, &sbi->s_es_lru, ext4_inode_touch_time_cmp);
+ sbi->s_es_last_sorted = jiffies;
+ ei = list_first_entry(&sbi->s_es_lru, struct ext4_inode_info,
+ i_es_lru);
+ /*
+ * If there are no non-precached inodes left on the
+ * list, start releasing precached extents.
+ */
+ if (ext4_test_inode_state(&ei->vfs_inode,
+ EXT4_STATE_EXT_PRECACHED))
+ skip_precached = 0;
+ goto retry;
+ }
+
+ spin_unlock(&sbi->s_es_lru_lock);
+
+ if (locked_ei && nr_shrunk == 0)
+ nr_shrunk = __es_try_to_reclaim_extents(locked_ei, nr_to_scan);
+
+ return nr_shrunk;
+}
+
+static unsigned long ext4_es_count(struct shrinker *shrink,
+ struct shrink_control *sc)
+{
+ unsigned long nr;
+ struct ext4_sb_info *sbi;
+
+ sbi = container_of(shrink, struct ext4_sb_info, s_es_shrinker);
+ nr = percpu_counter_read_positive(&sbi->s_extent_cache_cnt);
+ trace_ext4_es_shrink_enter(sbi->s_sb, sc->nr_to_scan, nr);
+ return nr;
+}
+
+static unsigned long ext4_es_scan(struct shrinker *shrink,
+ struct shrink_control *sc)
+{
+ struct ext4_sb_info *sbi = container_of(shrink,
+ struct ext4_sb_info, s_es_shrinker);
+ int nr_to_scan = sc->nr_to_scan;
+ int ret, nr_shrunk;
+
+ ret = percpu_counter_read_positive(&sbi->s_extent_cache_cnt);
+ trace_ext4_es_shrink_enter(sbi->s_sb, nr_to_scan, ret);
+
+ if (!nr_to_scan)
+ return ret;
+
+ nr_shrunk = __ext4_es_shrink(sbi, nr_to_scan, NULL);
+
+ trace_ext4_es_shrink_exit(sbi->s_sb, nr_shrunk, ret);
+ return nr_shrunk;
+}
+
+void ext4_es_register_shrinker(struct ext4_sb_info *sbi)
+{
+ INIT_LIST_HEAD(&sbi->s_es_lru);
+ spin_lock_init(&sbi->s_es_lru_lock);
+ sbi->s_es_last_sorted = 0;
+ sbi->s_es_shrinker.scan_objects = ext4_es_scan;
+ sbi->s_es_shrinker.count_objects = ext4_es_count;
+ sbi->s_es_shrinker.seeks = DEFAULT_SEEKS;
+ register_shrinker(&sbi->s_es_shrinker);
+}
+
+void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi)
+{
+ unregister_shrinker(&sbi->s_es_shrinker);
+}
+
+void ext4_es_lru_add(struct inode *inode)
+{
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+
+ ei->i_touch_when = jiffies;
+
+ if (!list_empty(&ei->i_es_lru))
+ return;
+
+ spin_lock(&sbi->s_es_lru_lock);
+ if (list_empty(&ei->i_es_lru))
+ list_add_tail(&ei->i_es_lru, &sbi->s_es_lru);
+ spin_unlock(&sbi->s_es_lru_lock);
+}
+
+void ext4_es_lru_del(struct inode *inode)
+{
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+
+ spin_lock(&sbi->s_es_lru_lock);
+ if (!list_empty(&ei->i_es_lru))
+ list_del_init(&ei->i_es_lru);
+ spin_unlock(&sbi->s_es_lru_lock);
+}
+
+static int __es_try_to_reclaim_extents(struct ext4_inode_info *ei,
+ int nr_to_scan)
+{
+ struct inode *inode = &ei->vfs_inode;
+ struct ext4_es_tree *tree = &ei->i_es_tree;
+ struct rb_node *node;
+ struct extent_status *es;
+ unsigned long nr_shrunk = 0;
+ static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
+ DEFAULT_RATELIMIT_BURST);
+
+ if (ei->i_es_lru_nr == 0)
+ return 0;
+
+ if (ext4_test_inode_state(inode, EXT4_STATE_EXT_PRECACHED) &&
+ __ratelimit(&_rs))
+ ext4_warning(inode->i_sb, "forced shrink of precached extents");
+
+ node = rb_first(&tree->root);
+ while (node != NULL) {
+ es = rb_entry(node, struct extent_status, rb_node);
+ node = rb_next(&es->rb_node);
+ /*
+ * We can't reclaim delayed extent from status tree because
+ * fiemap, bigallic, and seek_data/hole need to use it.
+ */
+ if (!ext4_es_is_delayed(es)) {
+ rb_erase(&es->rb_node, &tree->root);
+ ext4_es_free_extent(inode, es);
+ nr_shrunk++;
+ if (--nr_to_scan == 0)
+ break;
+ }
+ }
+ tree->cache_es = NULL;
+ return nr_shrunk;
+}
diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h
new file mode 100644
index 00000000000..f1b62a41992
--- /dev/null
+++ b/fs/ext4/extents_status.h
@@ -0,0 +1,146 @@
+/*
+ * fs/ext4/extents_status.h
+ *
+ * Written by Yongqiang Yang <xiaoqiangnk@gmail.com>
+ * Modified by
+ * Allison Henderson <achender@linux.vnet.ibm.com>
+ * Zheng Liu <wenqing.lz@taobao.com>
+ *
+ */
+
+#ifndef _EXT4_EXTENTS_STATUS_H
+#define _EXT4_EXTENTS_STATUS_H
+
+/*
+ * Turn on ES_DEBUG__ to get lots of info about extent status operations.
+ */
+#ifdef ES_DEBUG__
+#define es_debug(fmt, ...) printk(fmt, ##__VA_ARGS__)
+#else
+#define es_debug(fmt, ...) no_printk(fmt, ##__VA_ARGS__)
+#endif
+
+/*
+ * With ES_AGGRESSIVE_TEST defined, the result of es caching will be
+ * checked with old map_block's result.
+ */
+#define ES_AGGRESSIVE_TEST__
+
+/*
+ * These flags live in the high bits of extent_status.es_pblk
+ */
+#define ES_SHIFT 60
+
+#define EXTENT_STATUS_WRITTEN (1 << 3)
+#define EXTENT_STATUS_UNWRITTEN (1 << 2)
+#define EXTENT_STATUS_DELAYED (1 << 1)
+#define EXTENT_STATUS_HOLE (1 << 0)
+
+#define EXTENT_STATUS_FLAGS (EXTENT_STATUS_WRITTEN | \
+ EXTENT_STATUS_UNWRITTEN | \
+ EXTENT_STATUS_DELAYED | \
+ EXTENT_STATUS_HOLE)
+
+#define ES_WRITTEN (1ULL << 63)
+#define ES_UNWRITTEN (1ULL << 62)
+#define ES_DELAYED (1ULL << 61)
+#define ES_HOLE (1ULL << 60)
+
+#define ES_MASK (ES_WRITTEN | ES_UNWRITTEN | \
+ ES_DELAYED | ES_HOLE)
+
+struct ext4_sb_info;
+struct ext4_extent;
+
+struct extent_status {
+ struct rb_node rb_node;
+ ext4_lblk_t es_lblk; /* first logical block extent covers */
+ ext4_lblk_t es_len; /* length of extent in block */
+ ext4_fsblk_t es_pblk; /* first physical block */
+};
+
+struct ext4_es_tree {
+ struct rb_root root;
+ struct extent_status *cache_es; /* recently accessed extent */
+};
+
+extern int __init ext4_init_es(void);
+extern void ext4_exit_es(void);
+extern void ext4_es_init_tree(struct ext4_es_tree *tree);
+
+extern int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
+ ext4_lblk_t len, ext4_fsblk_t pblk,
+ unsigned int status);
+extern void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk,
+ ext4_lblk_t len, ext4_fsblk_t pblk,
+ unsigned int status);
+extern int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
+ ext4_lblk_t len);
+extern void ext4_es_find_delayed_extent_range(struct inode *inode,
+ ext4_lblk_t lblk, ext4_lblk_t end,
+ struct extent_status *es);
+extern int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk,
+ struct extent_status *es);
+
+static inline int ext4_es_is_written(struct extent_status *es)
+{
+ return (es->es_pblk & ES_WRITTEN) != 0;
+}
+
+static inline int ext4_es_is_unwritten(struct extent_status *es)
+{
+ return (es->es_pblk & ES_UNWRITTEN) != 0;
+}
+
+static inline int ext4_es_is_delayed(struct extent_status *es)
+{
+ return (es->es_pblk & ES_DELAYED) != 0;
+}
+
+static inline int ext4_es_is_hole(struct extent_status *es)
+{
+ return (es->es_pblk & ES_HOLE) != 0;
+}
+
+static inline unsigned int ext4_es_status(struct extent_status *es)
+{
+ return es->es_pblk >> ES_SHIFT;
+}
+
+static inline ext4_fsblk_t ext4_es_pblock(struct extent_status *es)
+{
+ return es->es_pblk & ~ES_MASK;
+}
+
+static inline void ext4_es_store_pblock(struct extent_status *es,
+ ext4_fsblk_t pb)
+{
+ ext4_fsblk_t block;
+
+ block = (pb & ~ES_MASK) | (es->es_pblk & ES_MASK);
+ es->es_pblk = block;
+}
+
+static inline void ext4_es_store_status(struct extent_status *es,
+ unsigned int status)
+{
+ es->es_pblk = (((ext4_fsblk_t)
+ (status & EXTENT_STATUS_FLAGS) << ES_SHIFT) |
+ (es->es_pblk & ~ES_MASK));
+}
+
+static inline void ext4_es_store_pblock_status(struct extent_status *es,
+ ext4_fsblk_t pb,
+ unsigned int status)
+{
+ es->es_pblk = (((ext4_fsblk_t)
+ (status & EXTENT_STATUS_FLAGS) << ES_SHIFT) |
+ (pb & ~ES_MASK));
+}
+
+extern void ext4_es_register_shrinker(struct ext4_sb_info *sbi);
+extern void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi);
+extern void ext4_es_lru_add(struct inode *inode);
+extern void ext4_es_lru_del(struct inode *inode);
+
+#endif /* _EXT4_EXTENTS_STATUS_H */
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index ac35ec58db5..8695f70af1e 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -21,8 +21,13 @@
#include <linux/time.h>
#include <linux/fs.h>
#include <linux/jbd2.h>
-#include <linux/ext4_fs.h>
-#include <linux/ext4_jbd2.h>
+#include <linux/mount.h>
+#include <linux/path.h>
+#include <linux/aio.h>
+#include <linux/quotaops.h>
+#include <linux/pagevec.h>
+#include "ext4.h"
+#include "ext4_jbd2.h"
#include "xattr.h"
#include "acl.h"
@@ -31,14 +36,19 @@
* from ext4_file_open: open gets called at every open, but release
* gets called only when /all/ the files are closed.
*/
-static int ext4_release_file (struct inode * inode, struct file * filp)
+static int ext4_release_file(struct inode *inode, struct file *filp)
{
+ if (ext4_test_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE)) {
+ ext4_alloc_da_blocks(inode);
+ ext4_clear_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE);
+ }
/* if we are the last writer on the inode, drop the block reservation */
if ((filp->f_mode & FMODE_WRITE) &&
- (atomic_read(&inode->i_writecount) == 1))
+ (atomic_read(&inode->i_writecount) == 1) &&
+ !EXT4_I(inode)->i_reserved_data_blocks)
{
down_write(&EXT4_I(inode)->i_data_sem);
- ext4_discard_reservation(inode);
+ ext4_discard_preallocations(inode);
up_write(&EXT4_I(inode)->i_data_sem);
}
if (is_dx(inode) && filp->private_data)
@@ -47,110 +57,562 @@ static int ext4_release_file (struct inode * inode, struct file * filp)
return 0;
}
+static void ext4_unwritten_wait(struct inode *inode)
+{
+ wait_queue_head_t *wq = ext4_ioend_wq(inode);
+
+ wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_unwritten) == 0));
+}
+
+/*
+ * This tests whether the IO in question is block-aligned or not.
+ * Ext4 utilizes unwritten extents when hole-filling during direct IO, and they
+ * are converted to written only after the IO is complete. Until they are
+ * mapped, these blocks appear as holes, so dio_zero_block() will assume that
+ * it needs to zero out portions of the start and/or end block. If 2 AIO
+ * threads are at work on the same unwritten block, they must be synchronized
+ * or one thread will zero the other's data, causing corruption.
+ */
+static int
+ext4_unaligned_aio(struct inode *inode, struct iov_iter *from, loff_t pos)
+{
+ struct super_block *sb = inode->i_sb;
+ int blockmask = sb->s_blocksize - 1;
+
+ if (pos >= i_size_read(inode))
+ return 0;
+
+ if ((pos | iov_iter_alignment(from)) & blockmask)
+ return 1;
+
+ return 0;
+}
+
static ssize_t
-ext4_file_write(struct kiocb *iocb, const struct iovec *iov,
- unsigned long nr_segs, loff_t pos)
+ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
struct file *file = iocb->ki_filp;
- struct inode *inode = file->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(iocb->ki_filp);
+ struct mutex *aio_mutex = NULL;
+ struct blk_plug plug;
+ int o_direct = file->f_flags & O_DIRECT;
+ int overwrite = 0;
+ size_t length = iov_iter_count(from);
ssize_t ret;
- int err;
+ loff_t pos = iocb->ki_pos;
+
+ /*
+ * Unaligned direct AIO must be serialized; see comment above
+ * In the case of O_APPEND, assume that we must always serialize
+ */
+ if (o_direct &&
+ ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) &&
+ !is_sync_kiocb(iocb) &&
+ (file->f_flags & O_APPEND ||
+ ext4_unaligned_aio(inode, from, pos))) {
+ aio_mutex = ext4_aio_mutex(inode);
+ mutex_lock(aio_mutex);
+ ext4_unwritten_wait(inode);
+ }
+
+ mutex_lock(&inode->i_mutex);
+ if (file->f_flags & O_APPEND)
+ iocb->ki_pos = pos = i_size_read(inode);
/*
* If we have encountered a bitmap-format file, the size limit
* is smaller than s_maxbytes, which is for extent-mapped files.
*/
-
- if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) {
+ if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
- size_t length = iov_length(iov, nr_segs);
- if (pos > sbi->s_bitmap_maxbytes)
- return -EFBIG;
+ if ((pos > sbi->s_bitmap_maxbytes) ||
+ (pos == sbi->s_bitmap_maxbytes && length > 0)) {
+ mutex_unlock(&inode->i_mutex);
+ ret = -EFBIG;
+ goto errout;
+ }
+
+ if (pos + length > sbi->s_bitmap_maxbytes)
+ iov_iter_truncate(from, sbi->s_bitmap_maxbytes - pos);
+ }
+
+ if (o_direct) {
+ blk_start_plug(&plug);
+
+ iocb->private = &overwrite;
+
+ /* check whether we do a DIO overwrite or not */
+ if (ext4_should_dioread_nolock(inode) && !aio_mutex &&
+ !file->f_mapping->nrpages && pos + length <= i_size_read(inode)) {
+ struct ext4_map_blocks map;
+ unsigned int blkbits = inode->i_blkbits;
+ int err, len;
+
+ map.m_lblk = pos >> blkbits;
+ map.m_len = (EXT4_BLOCK_ALIGN(pos + length, blkbits) >> blkbits)
+ - map.m_lblk;
+ len = map.m_len;
- if (pos + length > sbi->s_bitmap_maxbytes) {
- nr_segs = iov_shorten((struct iovec *)iov, nr_segs,
- sbi->s_bitmap_maxbytes - pos);
+ err = ext4_map_blocks(NULL, inode, &map, 0);
+ /*
+ * 'err==len' means that all of blocks has
+ * been preallocated no matter they are
+ * initialized or not. For excluding
+ * unwritten extents, we need to check
+ * m_flags. There are two conditions that
+ * indicate for initialized extents. 1) If we
+ * hit extent cache, EXT4_MAP_MAPPED flag is
+ * returned; 2) If we do a real lookup,
+ * non-flags are returned. So we should check
+ * these two conditions.
+ */
+ if (err == len && (map.m_flags & EXT4_MAP_MAPPED))
+ overwrite = 1;
}
}
- ret = generic_file_aio_write(iocb, iov, nr_segs, pos);
- /*
- * Skip flushing if there was an error, or if nothing was written.
- */
- if (ret <= 0)
- return ret;
+ ret = __generic_file_write_iter(iocb, from);
+ mutex_unlock(&inode->i_mutex);
+
+ if (ret > 0) {
+ ssize_t err;
+
+ err = generic_write_sync(file, iocb->ki_pos - ret, ret);
+ if (err < 0)
+ ret = err;
+ }
+ if (o_direct)
+ blk_finish_plug(&plug);
+
+errout:
+ if (aio_mutex)
+ mutex_unlock(aio_mutex);
+ return ret;
+}
+static const struct vm_operations_struct ext4_file_vm_ops = {
+ .fault = filemap_fault,
+ .map_pages = filemap_map_pages,
+ .page_mkwrite = ext4_page_mkwrite,
+ .remap_pages = generic_file_remap_pages,
+};
+
+static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ struct address_space *mapping = file->f_mapping;
+
+ if (!mapping->a_ops->readpage)
+ return -ENOEXEC;
+ file_accessed(file);
+ vma->vm_ops = &ext4_file_vm_ops;
+ return 0;
+}
+
+static int ext4_file_open(struct inode * inode, struct file * filp)
+{
+ struct super_block *sb = inode->i_sb;
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ struct vfsmount *mnt = filp->f_path.mnt;
+ struct path path;
+ char buf[64], *cp;
+
+ if (unlikely(!(sbi->s_mount_flags & EXT4_MF_MNTDIR_SAMPLED) &&
+ !(sb->s_flags & MS_RDONLY))) {
+ sbi->s_mount_flags |= EXT4_MF_MNTDIR_SAMPLED;
+ /*
+ * Sample where the filesystem has been mounted and
+ * store it in the superblock for sysadmin convenience
+ * when trying to sort through large numbers of block
+ * devices or filesystem images.
+ */
+ memset(buf, 0, sizeof(buf));
+ path.mnt = mnt;
+ path.dentry = mnt->mnt_root;
+ cp = d_path(&path, buf, sizeof(buf));
+ if (!IS_ERR(cp)) {
+ handle_t *handle;
+ int err;
+
+ handle = ext4_journal_start_sb(sb, EXT4_HT_MISC, 1);
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
+ BUFFER_TRACE(sbi->s_sbh, "get_write_access");
+ err = ext4_journal_get_write_access(handle, sbi->s_sbh);
+ if (err) {
+ ext4_journal_stop(handle);
+ return err;
+ }
+ strlcpy(sbi->s_es->s_last_mounted, cp,
+ sizeof(sbi->s_es->s_last_mounted));
+ ext4_handle_dirty_super(handle, sb);
+ ext4_journal_stop(handle);
+ }
+ }
/*
- * If the inode is IS_SYNC, or is O_SYNC and we are doing data
- * journalling then we need to make sure that we force the transaction
- * to disk to keep all metadata uptodate synchronously.
+ * Set up the jbd2_inode if we are opening the inode for
+ * writing and the journal is present
*/
- if (file->f_flags & O_SYNC) {
+ if (filp->f_mode & FMODE_WRITE) {
+ int ret = ext4_inode_attach_jinode(inode);
+ if (ret < 0)
+ return ret;
+ }
+ return dquot_file_open(inode, filp);
+}
+
+/*
+ * Here we use ext4_map_blocks() to get a block mapping for a extent-based
+ * file rather than ext4_ext_walk_space() because we can introduce
+ * SEEK_DATA/SEEK_HOLE for block-mapped and extent-mapped file at the same
+ * function. When extent status tree has been fully implemented, it will
+ * track all extent status for a file and we can directly use it to
+ * retrieve the offset for SEEK_DATA/SEEK_HOLE.
+ */
+
+/*
+ * When we retrieve the offset for SEEK_DATA/SEEK_HOLE, we would need to
+ * lookup page cache to check whether or not there has some data between
+ * [startoff, endoff] because, if this range contains an unwritten extent,
+ * we determine this extent as a data or a hole according to whether the
+ * page cache has data or not.
+ */
+static int ext4_find_unwritten_pgoff(struct inode *inode,
+ int whence,
+ struct ext4_map_blocks *map,
+ loff_t *offset)
+{
+ struct pagevec pvec;
+ unsigned int blkbits;
+ pgoff_t index;
+ pgoff_t end;
+ loff_t endoff;
+ loff_t startoff;
+ loff_t lastoff;
+ int found = 0;
+
+ blkbits = inode->i_sb->s_blocksize_bits;
+ startoff = *offset;
+ lastoff = startoff;
+ endoff = (loff_t)(map->m_lblk + map->m_len) << blkbits;
+
+ index = startoff >> PAGE_CACHE_SHIFT;
+ end = endoff >> PAGE_CACHE_SHIFT;
+
+ pagevec_init(&pvec, 0);
+ do {
+ int i, num;
+ unsigned long nr_pages;
+
+ num = min_t(pgoff_t, end - index, PAGEVEC_SIZE);
+ nr_pages = pagevec_lookup(&pvec, inode->i_mapping, index,
+ (pgoff_t)num);
+ if (nr_pages == 0) {
+ if (whence == SEEK_DATA)
+ break;
+
+ BUG_ON(whence != SEEK_HOLE);
+ /*
+ * If this is the first time to go into the loop and
+ * offset is not beyond the end offset, it will be a
+ * hole at this offset
+ */
+ if (lastoff == startoff || lastoff < endoff)
+ found = 1;
+ break;
+ }
+
/*
- * If we are non-data-journaled, then the dirty data has
- * already been flushed to backing store by generic_osync_inode,
- * and the inode has been flushed too if there have been any
- * modifications other than mere timestamp updates.
- *
- * Open question --- do we care about flushing timestamps too
- * if the inode is IS_SYNC?
+ * If this is the first time to go into the loop and
+ * offset is smaller than the first page offset, it will be a
+ * hole at this offset.
*/
- if (!ext4_should_journal_data(inode))
- return ret;
+ if (lastoff == startoff && whence == SEEK_HOLE &&
+ lastoff < page_offset(pvec.pages[0])) {
+ found = 1;
+ break;
+ }
+
+ for (i = 0; i < nr_pages; i++) {
+ struct page *page = pvec.pages[i];
+ struct buffer_head *bh, *head;
+
+ /*
+ * If the current offset is not beyond the end of given
+ * range, it will be a hole.
+ */
+ if (lastoff < endoff && whence == SEEK_HOLE &&
+ page->index > end) {
+ found = 1;
+ *offset = lastoff;
+ goto out;
+ }
- goto force_commit;
+ lock_page(page);
+
+ if (unlikely(page->mapping != inode->i_mapping)) {
+ unlock_page(page);
+ continue;
+ }
+
+ if (!page_has_buffers(page)) {
+ unlock_page(page);
+ continue;
+ }
+
+ if (page_has_buffers(page)) {
+ lastoff = page_offset(page);
+ bh = head = page_buffers(page);
+ do {
+ if (buffer_uptodate(bh) ||
+ buffer_unwritten(bh)) {
+ if (whence == SEEK_DATA)
+ found = 1;
+ } else {
+ if (whence == SEEK_HOLE)
+ found = 1;
+ }
+ if (found) {
+ *offset = max_t(loff_t,
+ startoff, lastoff);
+ unlock_page(page);
+ goto out;
+ }
+ lastoff += bh->b_size;
+ bh = bh->b_this_page;
+ } while (bh != head);
+ }
+
+ lastoff = page_offset(page) + PAGE_SIZE;
+ unlock_page(page);
+ }
+
+ /*
+ * The no. of pages is less than our desired, that would be a
+ * hole in there.
+ */
+ if (nr_pages < num && whence == SEEK_HOLE) {
+ found = 1;
+ *offset = lastoff;
+ break;
+ }
+
+ index = pvec.pages[i - 1]->index + 1;
+ pagevec_release(&pvec);
+ } while (index <= end);
+
+out:
+ pagevec_release(&pvec);
+ return found;
+}
+
+/*
+ * ext4_seek_data() retrieves the offset for SEEK_DATA.
+ */
+static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize)
+{
+ struct inode *inode = file->f_mapping->host;
+ struct ext4_map_blocks map;
+ struct extent_status es;
+ ext4_lblk_t start, last, end;
+ loff_t dataoff, isize;
+ int blkbits;
+ int ret = 0;
+
+ mutex_lock(&inode->i_mutex);
+
+ isize = i_size_read(inode);
+ if (offset >= isize) {
+ mutex_unlock(&inode->i_mutex);
+ return -ENXIO;
}
- /*
- * So we know that there has been no forced data flush. If the inode
- * is marked IS_SYNC, we need to force one ourselves.
- */
- if (!IS_SYNC(inode))
- return ret;
+ blkbits = inode->i_sb->s_blocksize_bits;
+ start = offset >> blkbits;
+ last = start;
+ end = isize >> blkbits;
+ dataoff = offset;
- /*
- * Open question #2 --- should we force data to disk here too? If we
- * don't, the only impact is that data=writeback filesystems won't
- * flush data to disk automatically on IS_SYNC, only metadata (but
- * historically, that is what ext2 has done.)
- */
+ do {
+ map.m_lblk = last;
+ map.m_len = end - last + 1;
+ ret = ext4_map_blocks(NULL, inode, &map, 0);
+ if (ret > 0 && !(map.m_flags & EXT4_MAP_UNWRITTEN)) {
+ if (last != start)
+ dataoff = (loff_t)last << blkbits;
+ break;
+ }
-force_commit:
- err = ext4_force_commit(inode->i_sb);
- if (err)
- return err;
- return ret;
+ /*
+ * If there is a delay extent at this offset,
+ * it will be as a data.
+ */
+ ext4_es_find_delayed_extent_range(inode, last, last, &es);
+ if (es.es_len != 0 && in_range(last, es.es_lblk, es.es_len)) {
+ if (last != start)
+ dataoff = (loff_t)last << blkbits;
+ break;
+ }
+
+ /*
+ * If there is a unwritten extent at this offset,
+ * it will be as a data or a hole according to page
+ * cache that has data or not.
+ */
+ if (map.m_flags & EXT4_MAP_UNWRITTEN) {
+ int unwritten;
+ unwritten = ext4_find_unwritten_pgoff(inode, SEEK_DATA,
+ &map, &dataoff);
+ if (unwritten)
+ break;
+ }
+
+ last++;
+ dataoff = (loff_t)last << blkbits;
+ } while (last <= end);
+
+ mutex_unlock(&inode->i_mutex);
+
+ if (dataoff > isize)
+ return -ENXIO;
+
+ return vfs_setpos(file, dataoff, maxsize);
+}
+
+/*
+ * ext4_seek_hole() retrieves the offset for SEEK_HOLE.
+ */
+static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize)
+{
+ struct inode *inode = file->f_mapping->host;
+ struct ext4_map_blocks map;
+ struct extent_status es;
+ ext4_lblk_t start, last, end;
+ loff_t holeoff, isize;
+ int blkbits;
+ int ret = 0;
+
+ mutex_lock(&inode->i_mutex);
+
+ isize = i_size_read(inode);
+ if (offset >= isize) {
+ mutex_unlock(&inode->i_mutex);
+ return -ENXIO;
+ }
+
+ blkbits = inode->i_sb->s_blocksize_bits;
+ start = offset >> blkbits;
+ last = start;
+ end = isize >> blkbits;
+ holeoff = offset;
+
+ do {
+ map.m_lblk = last;
+ map.m_len = end - last + 1;
+ ret = ext4_map_blocks(NULL, inode, &map, 0);
+ if (ret > 0 && !(map.m_flags & EXT4_MAP_UNWRITTEN)) {
+ last += ret;
+ holeoff = (loff_t)last << blkbits;
+ continue;
+ }
+
+ /*
+ * If there is a delay extent at this offset,
+ * we will skip this extent.
+ */
+ ext4_es_find_delayed_extent_range(inode, last, last, &es);
+ if (es.es_len != 0 && in_range(last, es.es_lblk, es.es_len)) {
+ last = es.es_lblk + es.es_len;
+ holeoff = (loff_t)last << blkbits;
+ continue;
+ }
+
+ /*
+ * If there is a unwritten extent at this offset,
+ * it will be as a data or a hole according to page
+ * cache that has data or not.
+ */
+ if (map.m_flags & EXT4_MAP_UNWRITTEN) {
+ int unwritten;
+ unwritten = ext4_find_unwritten_pgoff(inode, SEEK_HOLE,
+ &map, &holeoff);
+ if (!unwritten) {
+ last += ret;
+ holeoff = (loff_t)last << blkbits;
+ continue;
+ }
+ }
+
+ /* find a hole */
+ break;
+ } while (last <= end);
+
+ mutex_unlock(&inode->i_mutex);
+
+ if (holeoff > isize)
+ holeoff = isize;
+
+ return vfs_setpos(file, holeoff, maxsize);
+}
+
+/*
+ * ext4_llseek() handles both block-mapped and extent-mapped maxbytes values
+ * by calling generic_file_llseek_size() with the appropriate maxbytes
+ * value for each.
+ */
+loff_t ext4_llseek(struct file *file, loff_t offset, int whence)
+{
+ struct inode *inode = file->f_mapping->host;
+ loff_t maxbytes;
+
+ if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
+ maxbytes = EXT4_SB(inode->i_sb)->s_bitmap_maxbytes;
+ else
+ maxbytes = inode->i_sb->s_maxbytes;
+
+ switch (whence) {
+ case SEEK_SET:
+ case SEEK_CUR:
+ case SEEK_END:
+ return generic_file_llseek_size(file, offset, whence,
+ maxbytes, i_size_read(inode));
+ case SEEK_DATA:
+ return ext4_seek_data(file, offset, maxbytes);
+ case SEEK_HOLE:
+ return ext4_seek_hole(file, offset, maxbytes);
+ }
+
+ return -EINVAL;
}
const struct file_operations ext4_file_operations = {
- .llseek = generic_file_llseek,
- .read = do_sync_read,
- .write = do_sync_write,
- .aio_read = generic_file_aio_read,
- .aio_write = ext4_file_write,
- .ioctl = ext4_ioctl,
+ .llseek = ext4_llseek,
+ .read = new_sync_read,
+ .write = new_sync_write,
+ .read_iter = generic_file_read_iter,
+ .write_iter = ext4_file_write_iter,
+ .unlocked_ioctl = ext4_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = ext4_compat_ioctl,
#endif
- .mmap = generic_file_mmap,
- .open = generic_file_open,
+ .mmap = ext4_file_mmap,
+ .open = ext4_file_open,
.release = ext4_release_file,
.fsync = ext4_sync_file,
.splice_read = generic_file_splice_read,
- .splice_write = generic_file_splice_write,
+ .splice_write = iter_file_splice_write,
+ .fallocate = ext4_fallocate,
};
const struct inode_operations ext4_file_inode_operations = {
- .truncate = ext4_truncate,
.setattr = ext4_setattr,
-#ifdef CONFIG_EXT4DEV_FS_XATTR
+ .getattr = ext4_getattr,
.setxattr = generic_setxattr,
.getxattr = generic_getxattr,
.listxattr = ext4_listxattr,
.removexattr = generic_removexattr,
-#endif
- .permission = ext4_permission,
- .fallocate = ext4_fallocate,
+ .get_acl = ext4_get_acl,
+ .set_acl = ext4_set_acl,
+ .fiemap = ext4_fiemap,
};
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 8d50879d1c2..a8bc47f75fa 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -27,8 +27,51 @@
#include <linux/sched.h>
#include <linux/writeback.h>
#include <linux/jbd2.h>
-#include <linux/ext4_fs.h>
-#include <linux/ext4_jbd2.h>
+#include <linux/blkdev.h>
+
+#include "ext4.h"
+#include "ext4_jbd2.h"
+
+#include <trace/events/ext4.h>
+
+/*
+ * If we're not journaling and this is a just-created file, we have to
+ * sync our parent directory (if it was freshly created) since
+ * otherwise it will only be written by writeback, leaving a huge
+ * window during which a crash may lose the file. This may apply for
+ * the parent directory's parent as well, and so on recursively, if
+ * they are also freshly created.
+ */
+static int ext4_sync_parent(struct inode *inode)
+{
+ struct dentry *dentry = NULL;
+ struct inode *next;
+ int ret = 0;
+
+ if (!ext4_test_inode_state(inode, EXT4_STATE_NEWENTRY))
+ return 0;
+ inode = igrab(inode);
+ while (ext4_test_inode_state(inode, EXT4_STATE_NEWENTRY)) {
+ ext4_clear_inode_state(inode, EXT4_STATE_NEWENTRY);
+ dentry = d_find_any_alias(inode);
+ if (!dentry)
+ break;
+ next = igrab(dentry->d_parent->d_inode);
+ dput(dentry);
+ if (!next)
+ break;
+ iput(inode);
+ inode = next;
+ ret = sync_mapping_buffers(inode->i_mapping);
+ if (ret)
+ break;
+ ret = sync_inode_metadata(inode, 1);
+ if (ret)
+ break;
+ }
+ iput(inode);
+ return ret;
+}
/*
* akpm: A new design for ext4_sync_file().
@@ -42,22 +85,42 @@
* inode to disk.
*/
-int ext4_sync_file(struct file * file, struct dentry *dentry, int datasync)
+int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
{
- struct inode *inode = dentry->d_inode;
- int ret = 0;
+ struct inode *inode = file->f_mapping->host;
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
+ int ret = 0, err;
+ tid_t commit_tid;
+ bool needs_barrier = false;
J_ASSERT(ext4_journal_current_handle() == NULL);
+ trace_ext4_sync_file_enter(file, datasync);
+
+ if (inode->i_sb->s_flags & MS_RDONLY) {
+ /* Make sure that we read updated s_mount_flags value */
+ smp_rmb();
+ if (EXT4_SB(inode->i_sb)->s_mount_flags & EXT4_MF_FS_ABORTED)
+ ret = -EROFS;
+ goto out;
+ }
+
+ if (!journal) {
+ ret = generic_file_fsync(file, start, end, datasync);
+ if (!ret && !hlist_empty(&inode->i_dentry))
+ ret = ext4_sync_parent(inode);
+ goto out;
+ }
+
+ ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
+ if (ret)
+ return ret;
/*
- * data=writeback:
+ * data=writeback,ordered:
* The caller's filemap_fdatawrite()/wait will sync the data.
- * sync_inode() will sync the metadata
- *
- * data=ordered:
- * The caller's filemap_fdatawrite() will write the data and
- * sync_inode() will write the inode if it is dirty. Then the caller's
- * filemap_fdatawait() will wait on the pages.
+ * Metadata is in the journal, we wait for proper transaction to
+ * commit here.
*
* data=journal:
* filemap_fdatawrite won't do anything (the buffers are clean).
@@ -72,17 +135,17 @@ int ext4_sync_file(struct file * file, struct dentry *dentry, int datasync)
goto out;
}
- /*
- * The VFS has written the file data. If the inode is unaltered
- * then we need not start a commit.
- */
- if (inode->i_state & (I_DIRTY_SYNC|I_DIRTY_DATASYNC)) {
- struct writeback_control wbc = {
- .sync_mode = WB_SYNC_ALL,
- .nr_to_write = 0, /* sys_fsync did this */
- };
- ret = sync_inode(inode, &wbc);
+ commit_tid = datasync ? ei->i_datasync_tid : ei->i_sync_tid;
+ if (journal->j_flags & JBD2_BARRIER &&
+ !jbd2_trans_will_send_data_barrier(journal, commit_tid))
+ needs_barrier = true;
+ ret = jbd2_complete_transaction(journal, commit_tid);
+ if (needs_barrier) {
+ err = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
+ if (!ret)
+ ret = err;
}
out:
+ trace_ext4_sync_file_exit(inode, ret);
return ret;
}
diff --git a/fs/ext4/group.h b/fs/ext4/group.h
deleted file mode 100644
index 7eb0604e7ee..00000000000
--- a/fs/ext4/group.h
+++ /dev/null
@@ -1,29 +0,0 @@
-/*
- * linux/fs/ext4/group.h
- *
- * Copyright (C) 2007 Cluster File Systems, Inc
- *
- * Author: Andreas Dilger <adilger@clusterfs.com>
- */
-
-#ifndef _LINUX_EXT4_GROUP_H
-#define _LINUX_EXT4_GROUP_H
-
-extern __le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 group,
- struct ext4_group_desc *gdp);
-extern int ext4_group_desc_csum_verify(struct ext4_sb_info *sbi, __u32 group,
- struct ext4_group_desc *gdp);
-struct buffer_head *read_block_bitmap(struct super_block *sb,
- ext4_group_t block_group);
-extern unsigned ext4_init_block_bitmap(struct super_block *sb,
- struct buffer_head *bh,
- ext4_group_t group,
- struct ext4_group_desc *desc);
-#define ext4_free_blocks_after_init(sb, group, desc) \
- ext4_init_block_bitmap(sb, NULL, group, desc)
-extern unsigned ext4_init_inode_bitmap(struct super_block *sb,
- struct buffer_head *bh,
- ext4_group_t group,
- struct ext4_group_desc *desc);
-extern void mark_bitmap_end(int start_bit, int end_bit, char *bitmap);
-#endif /* _LINUX_EXT4_GROUP_H */
diff --git a/fs/ext4/hash.c b/fs/ext4/hash.c
index 1555024e3b3..3d586f02883 100644
--- a/fs/ext4/hash.c
+++ b/fs/ext4/hash.c
@@ -11,8 +11,8 @@
#include <linux/fs.h>
#include <linux/jbd2.h>
-#include <linux/ext4_fs.h>
#include <linux/cryptohash.h>
+#include "ext4.h"
#define DELTA 0x9E3779B9
@@ -27,7 +27,7 @@ static void TEA_transform(__u32 buf[4], __u32 const in[])
sum += DELTA;
b0 += ((b1 << 4)+a) ^ (b1+sum) ^ ((b1 >> 5)+b);
b1 += ((b0 << 4)+c) ^ (b0+sum) ^ ((b0 >> 5)+d);
- } while(--n);
+ } while (--n);
buf[0] += b0;
buf[1] += b1;
@@ -35,23 +35,43 @@ static void TEA_transform(__u32 buf[4], __u32 const in[])
/* The old legacy hash */
-static __u32 dx_hack_hash (const char *name, int len)
+static __u32 dx_hack_hash_unsigned(const char *name, int len)
{
- __u32 hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9;
+ __u32 hash, hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9;
+ const unsigned char *ucp = (const unsigned char *) name;
+
+ while (len--) {
+ hash = hash1 + (hash0 ^ (((int) *ucp++) * 7152373));
+
+ if (hash & 0x80000000)
+ hash -= 0x7fffffff;
+ hash1 = hash0;
+ hash0 = hash;
+ }
+ return hash0 << 1;
+}
+
+static __u32 dx_hack_hash_signed(const char *name, int len)
+{
+ __u32 hash, hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9;
+ const signed char *scp = (const signed char *) name;
+
while (len--) {
- __u32 hash = hash1 + (hash0 ^ (*name++ * 7152373));
+ hash = hash1 + (hash0 ^ (((int) *scp++) * 7152373));
- if (hash & 0x80000000) hash -= 0x7fffffff;
+ if (hash & 0x80000000)
+ hash -= 0x7fffffff;
hash1 = hash0;
hash0 = hash;
}
- return (hash0 << 1);
+ return hash0 << 1;
}
-static void str2hashbuf(const char *msg, int len, __u32 *buf, int num)
+static void str2hashbuf_signed(const char *msg, int len, __u32 *buf, int num)
{
__u32 pad, val;
int i;
+ const signed char *scp = (const signed char *) msg;
pad = (__u32)len | ((__u32)len << 8);
pad |= pad << 16;
@@ -59,10 +79,38 @@ static void str2hashbuf(const char *msg, int len, __u32 *buf, int num)
val = pad;
if (len > num*4)
len = num * 4;
- for (i=0; i < len; i++) {
+ for (i = 0; i < len; i++) {
if ((i % 4) == 0)
val = pad;
- val = msg[i] + (val << 8);
+ val = ((int) scp[i]) + (val << 8);
+ if ((i % 4) == 3) {
+ *buf++ = val;
+ val = pad;
+ num--;
+ }
+ }
+ if (--num >= 0)
+ *buf++ = val;
+ while (--num >= 0)
+ *buf++ = pad;
+}
+
+static void str2hashbuf_unsigned(const char *msg, int len, __u32 *buf, int num)
+{
+ __u32 pad, val;
+ int i;
+ const unsigned char *ucp = (const unsigned char *) msg;
+
+ pad = (__u32)len | ((__u32)len << 8);
+ pad |= pad << 16;
+
+ val = pad;
+ if (len > num*4)
+ len = num * 4;
+ for (i = 0; i < len; i++) {
+ if ((i % 4) == 0)
+ val = pad;
+ val = ((int) ucp[i]) + (val << 8);
if ((i % 4) == 3) {
*buf++ = val;
val = pad;
@@ -95,6 +143,8 @@ int ext4fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo)
const char *p;
int i;
__u32 in[8], buf[4];
+ void (*str2hashbuf)(const char *, int, __u32 *, int) =
+ str2hashbuf_signed;
/* Initialize the default seed for the hash checksum functions */
buf[0] = 0x67452301;
@@ -104,22 +154,27 @@ int ext4fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo)
/* Check to see if the seed is all zero's */
if (hinfo->seed) {
- for (i=0; i < 4; i++) {
- if (hinfo->seed[i])
+ for (i = 0; i < 4; i++) {
+ if (hinfo->seed[i]) {
+ memcpy(buf, hinfo->seed, sizeof(buf));
break;
+ }
}
- if (i < 4)
- memcpy(buf, hinfo->seed, sizeof(buf));
}
switch (hinfo->hash_version) {
+ case DX_HASH_LEGACY_UNSIGNED:
+ hash = dx_hack_hash_unsigned(name, len);
+ break;
case DX_HASH_LEGACY:
- hash = dx_hack_hash(name, len);
+ hash = dx_hack_hash_signed(name, len);
break;
+ case DX_HASH_HALF_MD4_UNSIGNED:
+ str2hashbuf = str2hashbuf_unsigned;
case DX_HASH_HALF_MD4:
p = name;
while (len > 0) {
- str2hashbuf(p, len, in, 8);
+ (*str2hashbuf)(p, len, in, 8);
half_md4_transform(buf, in);
len -= 32;
p += 32;
@@ -127,10 +182,12 @@ int ext4fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo)
minor_hash = buf[2];
hash = buf[1];
break;
+ case DX_HASH_TEA_UNSIGNED:
+ str2hashbuf = str2hashbuf_unsigned;
case DX_HASH_TEA:
p = name;
while (len > 0) {
- str2hashbuf(p, len, in, 4);
+ (*str2hashbuf)(p, len, in, 4);
TEA_transform(buf, in);
len -= 16;
p += 16;
@@ -143,8 +200,8 @@ int ext4fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo)
return -1;
}
hash = hash & ~1;
- if (hash == (EXT4_HTREE_EOF << 1))
- hash = (EXT4_HTREE_EOF-1) << 1;
+ if (hash == (EXT4_HTREE_EOF_32BIT << 1))
+ hash = (EXT4_HTREE_EOF_32BIT - 1) << 1;
hinfo->hash = hash;
hinfo->minor_hash = minor_hash;
return 0;
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 8036b9b5376..5b87fc36aab 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -15,8 +15,6 @@
#include <linux/time.h>
#include <linux/fs.h>
#include <linux/jbd2.h>
-#include <linux/ext4_fs.h>
-#include <linux/ext4_jbd2.h>
#include <linux/stat.h>
#include <linux/string.h>
#include <linux/quotaops.h>
@@ -26,9 +24,12 @@
#include <linux/blkdev.h>
#include <asm/byteorder.h>
+#include "ext4.h"
+#include "ext4_jbd2.h"
#include "xattr.h"
#include "acl.h"
-#include "group.h"
+
+#include <trace/events/ext4.h>
/*
* ialloc.c contains the inodes allocation and deallocation routines
@@ -49,7 +50,7 @@
* need to use it within a single byte (to ensure we get endianness right).
* We can use memset for the rest of the bitmap as there are no other users.
*/
-void mark_bitmap_end(int start_bit, int end_bit, char *bitmap)
+void ext4_mark_bitmap_end(int start_bit, int end_bit, char *bitmap)
{
int i;
@@ -64,33 +65,54 @@ void mark_bitmap_end(int start_bit, int end_bit, char *bitmap)
}
/* Initializes an uninitialized inode bitmap */
-unsigned ext4_init_inode_bitmap(struct super_block *sb, struct buffer_head *bh,
- ext4_group_t block_group,
- struct ext4_group_desc *gdp)
+static unsigned ext4_init_inode_bitmap(struct super_block *sb,
+ struct buffer_head *bh,
+ ext4_group_t block_group,
+ struct ext4_group_desc *gdp)
{
+ struct ext4_group_info *grp;
struct ext4_sb_info *sbi = EXT4_SB(sb);
-
J_ASSERT_BH(bh, buffer_locked(bh));
/* If checksum is bad mark all blocks and inodes use to prevent
* allocation, essentially implementing a per-group read-only flag. */
- if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) {
- ext4_error(sb, __FUNCTION__, "Checksum bad for group %lu\n",
- block_group);
- gdp->bg_free_blocks_count = 0;
- gdp->bg_free_inodes_count = 0;
- gdp->bg_itable_unused = 0;
- memset(bh->b_data, 0xff, sb->s_blocksize);
+ if (!ext4_group_desc_csum_verify(sb, block_group, gdp)) {
+ ext4_error(sb, "Checksum bad for group %u", block_group);
+ grp = ext4_get_group_info(sb, block_group);
+ if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp))
+ percpu_counter_sub(&sbi->s_freeclusters_counter,
+ grp->bb_free);
+ set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state);
+ if (!EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) {
+ int count;
+ count = ext4_free_inodes_count(sb, gdp);
+ percpu_counter_sub(&sbi->s_freeinodes_counter,
+ count);
+ }
+ set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state);
return 0;
}
memset(bh->b_data, 0, (EXT4_INODES_PER_GROUP(sb) + 7) / 8);
- mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), EXT4_BLOCKS_PER_GROUP(sb),
+ ext4_mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8,
bh->b_data);
+ ext4_inode_bitmap_csum_set(sb, block_group, gdp, bh,
+ EXT4_INODES_PER_GROUP(sb) / 8);
+ ext4_group_desc_csum_set(sb, block_group, gdp);
return EXT4_INODES_PER_GROUP(sb);
}
+void ext4_end_bitmap_read(struct buffer_head *bh, int uptodate)
+{
+ if (uptodate) {
+ set_buffer_uptodate(bh);
+ set_bitmap_uptodate(bh);
+ }
+ unlock_buffer(bh);
+ put_bh(bh);
+}
+
/*
* Read the inode allocation bitmap for a given block_group, reading
* into the specified slot in the superblock's bitmap cache.
@@ -98,34 +120,93 @@ unsigned ext4_init_inode_bitmap(struct super_block *sb, struct buffer_head *bh,
* Return buffer_head of bitmap on success or NULL.
*/
static struct buffer_head *
-read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
+ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
{
struct ext4_group_desc *desc;
struct buffer_head *bh = NULL;
+ ext4_fsblk_t bitmap_blk;
+ struct ext4_group_info *grp;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
desc = ext4_get_group_desc(sb, block_group, NULL);
if (!desc)
- goto error_out;
+ return NULL;
+
+ bitmap_blk = ext4_inode_bitmap(sb, desc);
+ bh = sb_getblk(sb, bitmap_blk);
+ if (unlikely(!bh)) {
+ ext4_error(sb, "Cannot read inode bitmap - "
+ "block_group = %u, inode_bitmap = %llu",
+ block_group, bitmap_blk);
+ return NULL;
+ }
+ if (bitmap_uptodate(bh))
+ goto verify;
+
+ lock_buffer(bh);
+ if (bitmap_uptodate(bh)) {
+ unlock_buffer(bh);
+ goto verify;
+ }
+
+ ext4_lock_group(sb, block_group);
if (desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
- bh = sb_getblk(sb, ext4_inode_bitmap(sb, desc));
- if (!buffer_uptodate(bh)) {
- lock_buffer(bh);
- if (!buffer_uptodate(bh)) {
- ext4_init_inode_bitmap(sb, bh, block_group,
- desc);
- set_buffer_uptodate(bh);
- }
- unlock_buffer(bh);
+ ext4_init_inode_bitmap(sb, bh, block_group, desc);
+ set_bitmap_uptodate(bh);
+ set_buffer_uptodate(bh);
+ set_buffer_verified(bh);
+ ext4_unlock_group(sb, block_group);
+ unlock_buffer(bh);
+ return bh;
+ }
+ ext4_unlock_group(sb, block_group);
+
+ if (buffer_uptodate(bh)) {
+ /*
+ * if not uninit if bh is uptodate,
+ * bitmap is also uptodate
+ */
+ set_bitmap_uptodate(bh);
+ unlock_buffer(bh);
+ goto verify;
+ }
+ /*
+ * submit the buffer_head for reading
+ */
+ trace_ext4_load_inode_bitmap(sb, block_group);
+ bh->b_end_io = ext4_end_bitmap_read;
+ get_bh(bh);
+ submit_bh(READ | REQ_META | REQ_PRIO, bh);
+ wait_on_buffer(bh);
+ if (!buffer_uptodate(bh)) {
+ put_bh(bh);
+ ext4_error(sb, "Cannot read inode bitmap - "
+ "block_group = %u, inode_bitmap = %llu",
+ block_group, bitmap_blk);
+ return NULL;
+ }
+
+verify:
+ ext4_lock_group(sb, block_group);
+ if (!buffer_verified(bh) &&
+ !ext4_inode_bitmap_csum_verify(sb, block_group, desc, bh,
+ EXT4_INODES_PER_GROUP(sb) / 8)) {
+ ext4_unlock_group(sb, block_group);
+ put_bh(bh);
+ ext4_error(sb, "Corrupt inode bitmap - block_group = %u, "
+ "inode_bitmap = %llu", block_group, bitmap_blk);
+ grp = ext4_get_group_info(sb, block_group);
+ if (!EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) {
+ int count;
+ count = ext4_free_inodes_count(sb, desc);
+ percpu_counter_sub(&sbi->s_freeinodes_counter,
+ count);
}
- } else {
- bh = sb_bread(sb, ext4_inode_bitmap(sb, desc));
- }
- if (!bh)
- ext4_error(sb, "read_inode_bitmap",
- "Cannot read inode bitmap - "
- "block_group = %lu, inode_bitmap = %llu",
- block_group, ext4_inode_bitmap(sb, desc));
-error_out:
+ set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state);
+ return NULL;
+ }
+ ext4_unlock_group(sb, block_group);
+ set_buffer_verified(bh);
return bh;
}
@@ -145,63 +226,68 @@ error_out:
* though), and then we'd have two inodes sharing the
* same inode number and space on the harddisk.
*/
-void ext4_free_inode (handle_t *handle, struct inode * inode)
+void ext4_free_inode(handle_t *handle, struct inode *inode)
{
- struct super_block * sb = inode->i_sb;
+ struct super_block *sb = inode->i_sb;
int is_directory;
unsigned long ino;
struct buffer_head *bitmap_bh = NULL;
struct buffer_head *bh2;
ext4_group_t block_group;
unsigned long bit;
- struct ext4_group_desc * gdp;
- struct ext4_super_block * es;
+ struct ext4_group_desc *gdp;
+ struct ext4_super_block *es;
struct ext4_sb_info *sbi;
- int fatal = 0, err;
+ int fatal = 0, err, count, cleared;
+ struct ext4_group_info *grp;
- if (atomic_read(&inode->i_count) > 1) {
- printk ("ext4_free_inode: inode has count=%d\n",
- atomic_read(&inode->i_count));
+ if (!sb) {
+ printk(KERN_ERR "EXT4-fs: %s:%d: inode on "
+ "nonexistent device\n", __func__, __LINE__);
return;
}
- if (inode->i_nlink) {
- printk ("ext4_free_inode: inode has nlink=%d\n",
- inode->i_nlink);
+ if (atomic_read(&inode->i_count) > 1) {
+ ext4_msg(sb, KERN_ERR, "%s:%d: inode #%lu: count=%d",
+ __func__, __LINE__, inode->i_ino,
+ atomic_read(&inode->i_count));
return;
}
- if (!sb) {
- printk("ext4_free_inode: inode on nonexistent device\n");
+ if (inode->i_nlink) {
+ ext4_msg(sb, KERN_ERR, "%s:%d: inode #%lu: nlink=%d\n",
+ __func__, __LINE__, inode->i_ino, inode->i_nlink);
return;
}
sbi = EXT4_SB(sb);
ino = inode->i_ino;
- ext4_debug ("freeing inode %lu\n", ino);
+ ext4_debug("freeing inode %lu\n", ino);
+ trace_ext4_free_inode(inode);
/*
* Note: we must free any quota before locking the superblock,
* as writing the quota to disk may need the lock as well.
*/
- DQUOT_INIT(inode);
+ dquot_initialize(inode);
ext4_xattr_delete_inode(handle, inode);
- DQUOT_FREE_INODE(inode);
- DQUOT_DROP(inode);
+ dquot_free_inode(inode);
+ dquot_drop(inode);
is_directory = S_ISDIR(inode->i_mode);
/* Do this BEFORE marking the inode not in use or returning an error */
- clear_inode (inode);
+ ext4_clear_inode(inode);
es = EXT4_SB(sb)->s_es;
if (ino < EXT4_FIRST_INO(sb) || ino > le32_to_cpu(es->s_inodes_count)) {
- ext4_error (sb, "ext4_free_inode",
- "reserved or nonexistent inode %lu", ino);
+ ext4_error(sb, "reserved or nonexistent inode %lu", ino);
goto error_return;
}
block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb);
bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb);
- bitmap_bh = read_inode_bitmap(sb, block_group);
- if (!bitmap_bh)
+ bitmap_bh = ext4_read_inode_bitmap(sb, block_group);
+ /* Don't bother if the inode bitmap is corrupt. */
+ grp = ext4_get_group_info(sb, block_group);
+ if (unlikely(EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) || !bitmap_bh)
goto error_return;
BUFFER_TRACE(bitmap_bh, "get_write_access");
@@ -209,84 +295,97 @@ void ext4_free_inode (handle_t *handle, struct inode * inode)
if (fatal)
goto error_return;
- /* Ok, now we can actually update the inode bitmaps.. */
- if (!ext4_clear_bit_atomic(sb_bgl_lock(sbi, block_group),
- bit, bitmap_bh->b_data))
- ext4_error (sb, "ext4_free_inode",
- "bit already cleared for inode %lu", ino);
- else {
- gdp = ext4_get_group_desc (sb, block_group, &bh2);
-
+ fatal = -ESRCH;
+ gdp = ext4_get_group_desc(sb, block_group, &bh2);
+ if (gdp) {
BUFFER_TRACE(bh2, "get_write_access");
fatal = ext4_journal_get_write_access(handle, bh2);
- if (fatal) goto error_return;
-
- if (gdp) {
- spin_lock(sb_bgl_lock(sbi, block_group));
- gdp->bg_free_inodes_count = cpu_to_le16(
- le16_to_cpu(gdp->bg_free_inodes_count) + 1);
- if (is_directory)
- gdp->bg_used_dirs_count = cpu_to_le16(
- le16_to_cpu(gdp->bg_used_dirs_count) - 1);
- gdp->bg_checksum = ext4_group_desc_csum(sbi,
- block_group, gdp);
- spin_unlock(sb_bgl_lock(sbi, block_group));
- percpu_counter_inc(&sbi->s_freeinodes_counter);
- if (is_directory)
- percpu_counter_dec(&sbi->s_dirs_counter);
+ }
+ ext4_lock_group(sb, block_group);
+ cleared = ext4_test_and_clear_bit(bit, bitmap_bh->b_data);
+ if (fatal || !cleared) {
+ ext4_unlock_group(sb, block_group);
+ goto out;
+ }
+ count = ext4_free_inodes_count(sb, gdp) + 1;
+ ext4_free_inodes_set(sb, gdp, count);
+ if (is_directory) {
+ count = ext4_used_dirs_count(sb, gdp) - 1;
+ ext4_used_dirs_set(sb, gdp, count);
+ percpu_counter_dec(&sbi->s_dirs_counter);
+ }
+ ext4_inode_bitmap_csum_set(sb, block_group, gdp, bitmap_bh,
+ EXT4_INODES_PER_GROUP(sb) / 8);
+ ext4_group_desc_csum_set(sb, block_group, gdp);
+ ext4_unlock_group(sb, block_group);
+
+ percpu_counter_inc(&sbi->s_freeinodes_counter);
+ if (sbi->s_log_groups_per_flex) {
+ ext4_group_t f = ext4_flex_group(sbi, block_group);
+
+ atomic_inc(&sbi->s_flex_groups[f].free_inodes);
+ if (is_directory)
+ atomic_dec(&sbi->s_flex_groups[f].used_dirs);
+ }
+ BUFFER_TRACE(bh2, "call ext4_handle_dirty_metadata");
+ fatal = ext4_handle_dirty_metadata(handle, NULL, bh2);
+out:
+ if (cleared) {
+ BUFFER_TRACE(bitmap_bh, "call ext4_handle_dirty_metadata");
+ err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
+ if (!fatal)
+ fatal = err;
+ } else {
+ ext4_error(sb, "bit already cleared for inode %lu", ino);
+ if (gdp && !EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) {
+ int count;
+ count = ext4_free_inodes_count(sb, gdp);
+ percpu_counter_sub(&sbi->s_freeinodes_counter,
+ count);
}
- BUFFER_TRACE(bh2, "call ext4_journal_dirty_metadata");
- err = ext4_journal_dirty_metadata(handle, bh2);
- if (!fatal) fatal = err;
- }
- BUFFER_TRACE(bitmap_bh, "call ext4_journal_dirty_metadata");
- err = ext4_journal_dirty_metadata(handle, bitmap_bh);
- if (!fatal)
- fatal = err;
- sb->s_dirt = 1;
+ set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state);
+ }
+
error_return:
brelse(bitmap_bh);
ext4_std_error(sb, fatal);
}
+struct orlov_stats {
+ __u64 free_clusters;
+ __u32 free_inodes;
+ __u32 used_dirs;
+};
+
/*
- * There are two policies for allocating an inode. If the new inode is
- * a directory, then a forward search is made for a block group with both
- * free space and a low directory-to-inode ratio; if that fails, then of
- * the groups with above-average free space, that group with the fewest
- * directories already is chosen.
- *
- * For other inodes, search forward from the parent directory\'s block
- * group to find a free inode.
+ * Helper function for Orlov's allocator; returns critical information
+ * for a particular block group or flex_bg. If flex_size is 1, then g
+ * is a block group number; otherwise it is flex_bg number.
*/
-static int find_group_dir(struct super_block *sb, struct inode *parent,
- ext4_group_t *best_group)
+static void get_orlov_stats(struct super_block *sb, ext4_group_t g,
+ int flex_size, struct orlov_stats *stats)
{
- ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
- unsigned int freei, avefreei;
- struct ext4_group_desc *desc, *best_desc = NULL;
- ext4_group_t group;
- int ret = -1;
+ struct ext4_group_desc *desc;
+ struct flex_groups *flex_group = EXT4_SB(sb)->s_flex_groups;
- freei = percpu_counter_read_positive(&EXT4_SB(sb)->s_freeinodes_counter);
- avefreei = freei / ngroups;
+ if (flex_size > 1) {
+ stats->free_inodes = atomic_read(&flex_group[g].free_inodes);
+ stats->free_clusters = atomic64_read(&flex_group[g].free_clusters);
+ stats->used_dirs = atomic_read(&flex_group[g].used_dirs);
+ return;
+ }
- for (group = 0; group < ngroups; group++) {
- desc = ext4_get_group_desc (sb, group, NULL);
- if (!desc || !desc->bg_free_inodes_count)
- continue;
- if (le16_to_cpu(desc->bg_free_inodes_count) < avefreei)
- continue;
- if (!best_desc ||
- (le16_to_cpu(desc->bg_free_blocks_count) >
- le16_to_cpu(best_desc->bg_free_blocks_count))) {
- *best_group = group;
- best_desc = desc;
- ret = 0;
- }
+ desc = ext4_get_group_desc(sb, g, NULL);
+ if (desc) {
+ stats->free_inodes = ext4_free_inodes_count(sb, desc);
+ stats->free_clusters = ext4_free_group_clusters(sb, desc);
+ stats->used_dirs = ext4_used_dirs_count(sb, desc);
+ } else {
+ stats->free_inodes = 0;
+ stats->free_clusters = 0;
+ stats->used_dirs = 0;
}
- return ret;
}
/*
@@ -304,108 +403,146 @@ static int find_group_dir(struct super_block *sb, struct inode *parent,
* it has too many directories already (max_dirs) or
* it has too few free inodes left (min_inodes) or
* it has too few free blocks left (min_blocks) or
- * it's already running too large debt (max_debt).
- * Parent's group is prefered, if it doesn't satisfy these
+ * Parent's group is preferred, if it doesn't satisfy these
* conditions we search cyclically through the rest. If none
* of the groups look good we just look for a group with more
* free inodes than average (starting at parent's group).
- *
- * Debt is incremented each time we allocate a directory and decremented
- * when we allocate an inode, within 0--255.
*/
-#define INODE_COST 64
-#define BLOCK_COST 256
-
static int find_group_orlov(struct super_block *sb, struct inode *parent,
- ext4_group_t *group)
+ ext4_group_t *group, umode_t mode,
+ const struct qstr *qstr)
{
ext4_group_t parent_group = EXT4_I(parent)->i_block_group;
struct ext4_sb_info *sbi = EXT4_SB(sb);
- struct ext4_super_block *es = sbi->s_es;
- ext4_group_t ngroups = sbi->s_groups_count;
+ ext4_group_t real_ngroups = ext4_get_groups_count(sb);
int inodes_per_group = EXT4_INODES_PER_GROUP(sb);
- unsigned int freei, avefreei;
- ext4_fsblk_t freeb, avefreeb;
- ext4_fsblk_t blocks_per_dir;
+ unsigned int freei, avefreei, grp_free;
+ ext4_fsblk_t freeb, avefreec;
unsigned int ndirs;
- int max_debt, max_dirs, min_inodes;
- ext4_grpblk_t min_blocks;
- ext4_group_t i;
+ int max_dirs, min_inodes;
+ ext4_grpblk_t min_clusters;
+ ext4_group_t i, grp, g, ngroups;
struct ext4_group_desc *desc;
+ struct orlov_stats stats;
+ int flex_size = ext4_flex_bg_size(sbi);
+ struct dx_hash_info hinfo;
+
+ ngroups = real_ngroups;
+ if (flex_size > 1) {
+ ngroups = (real_ngroups + flex_size - 1) >>
+ sbi->s_log_groups_per_flex;
+ parent_group >>= sbi->s_log_groups_per_flex;
+ }
freei = percpu_counter_read_positive(&sbi->s_freeinodes_counter);
avefreei = freei / ngroups;
- freeb = percpu_counter_read_positive(&sbi->s_freeblocks_counter);
- avefreeb = freeb;
- do_div(avefreeb, ngroups);
+ freeb = EXT4_C2B(sbi,
+ percpu_counter_read_positive(&sbi->s_freeclusters_counter));
+ avefreec = freeb;
+ do_div(avefreec, ngroups);
ndirs = percpu_counter_read_positive(&sbi->s_dirs_counter);
- if ((parent == sb->s_root->d_inode) ||
- (EXT4_I(parent)->i_flags & EXT4_TOPDIR_FL)) {
+ if (S_ISDIR(mode) &&
+ ((parent == sb->s_root->d_inode) ||
+ (ext4_test_inode_flag(parent, EXT4_INODE_TOPDIR)))) {
int best_ndir = inodes_per_group;
- ext4_group_t grp;
int ret = -1;
- get_random_bytes(&grp, sizeof(grp));
+ if (qstr) {
+ hinfo.hash_version = DX_HASH_HALF_MD4;
+ hinfo.seed = sbi->s_hash_seed;
+ ext4fs_dirhash(qstr->name, qstr->len, &hinfo);
+ grp = hinfo.hash;
+ } else
+ grp = prandom_u32();
parent_group = (unsigned)grp % ngroups;
for (i = 0; i < ngroups; i++) {
- grp = (parent_group + i) % ngroups;
- desc = ext4_get_group_desc(sb, grp, NULL);
- if (!desc || !desc->bg_free_inodes_count)
+ g = (parent_group + i) % ngroups;
+ get_orlov_stats(sb, g, flex_size, &stats);
+ if (!stats.free_inodes)
continue;
- if (le16_to_cpu(desc->bg_used_dirs_count) >= best_ndir)
+ if (stats.used_dirs >= best_ndir)
continue;
- if (le16_to_cpu(desc->bg_free_inodes_count) < avefreei)
+ if (stats.free_inodes < avefreei)
continue;
- if (le16_to_cpu(desc->bg_free_blocks_count) < avefreeb)
+ if (stats.free_clusters < avefreec)
continue;
- *group = grp;
+ grp = g;
ret = 0;
- best_ndir = le16_to_cpu(desc->bg_used_dirs_count);
+ best_ndir = stats.used_dirs;
+ }
+ if (ret)
+ goto fallback;
+ found_flex_bg:
+ if (flex_size == 1) {
+ *group = grp;
+ return 0;
+ }
+
+ /*
+ * We pack inodes at the beginning of the flexgroup's
+ * inode tables. Block allocation decisions will do
+ * something similar, although regular files will
+ * start at 2nd block group of the flexgroup. See
+ * ext4_ext_find_goal() and ext4_find_near().
+ */
+ grp *= flex_size;
+ for (i = 0; i < flex_size; i++) {
+ if (grp+i >= real_ngroups)
+ break;
+ desc = ext4_get_group_desc(sb, grp+i, NULL);
+ if (desc && ext4_free_inodes_count(sb, desc)) {
+ *group = grp+i;
+ return 0;
+ }
}
- if (ret == 0)
- return ret;
goto fallback;
}
- blocks_per_dir = ext4_blocks_count(es) - freeb;
- do_div(blocks_per_dir, ndirs);
-
max_dirs = ndirs / ngroups + inodes_per_group / 16;
- min_inodes = avefreei - inodes_per_group / 4;
- min_blocks = avefreeb - EXT4_BLOCKS_PER_GROUP(sb) / 4;
-
- max_debt = EXT4_BLOCKS_PER_GROUP(sb);
- max_debt /= max_t(int, blocks_per_dir, BLOCK_COST);
- if (max_debt * INODE_COST > inodes_per_group)
- max_debt = inodes_per_group / INODE_COST;
- if (max_debt > 255)
- max_debt = 255;
- if (max_debt == 0)
- max_debt = 1;
+ min_inodes = avefreei - inodes_per_group*flex_size / 4;
+ if (min_inodes < 1)
+ min_inodes = 1;
+ min_clusters = avefreec - EXT4_CLUSTERS_PER_GROUP(sb)*flex_size / 4;
+
+ /*
+ * Start looking in the flex group where we last allocated an
+ * inode for this parent directory
+ */
+ if (EXT4_I(parent)->i_last_alloc_group != ~0) {
+ parent_group = EXT4_I(parent)->i_last_alloc_group;
+ if (flex_size > 1)
+ parent_group >>= sbi->s_log_groups_per_flex;
+ }
for (i = 0; i < ngroups; i++) {
- *group = (parent_group + i) % ngroups;
- desc = ext4_get_group_desc(sb, *group, NULL);
- if (!desc || !desc->bg_free_inodes_count)
+ grp = (parent_group + i) % ngroups;
+ get_orlov_stats(sb, grp, flex_size, &stats);
+ if (stats.used_dirs >= max_dirs)
continue;
- if (le16_to_cpu(desc->bg_used_dirs_count) >= max_dirs)
+ if (stats.free_inodes < min_inodes)
continue;
- if (le16_to_cpu(desc->bg_free_inodes_count) < min_inodes)
+ if (stats.free_clusters < min_clusters)
continue;
- if (le16_to_cpu(desc->bg_free_blocks_count) < min_blocks)
- continue;
- return 0;
+ goto found_flex_bg;
}
fallback:
+ ngroups = real_ngroups;
+ avefreei = freei / ngroups;
+fallback_retry:
+ parent_group = EXT4_I(parent)->i_block_group;
for (i = 0; i < ngroups; i++) {
- *group = (parent_group + i) % ngroups;
- desc = ext4_get_group_desc(sb, *group, NULL);
- if (desc && desc->bg_free_inodes_count &&
- le16_to_cpu(desc->bg_free_inodes_count) >= avefreei)
- return 0;
+ grp = (parent_group + i) % ngroups;
+ desc = ext4_get_group_desc(sb, grp, NULL);
+ if (desc) {
+ grp_free = ext4_free_inodes_count(sb, desc);
+ if (grp_free && grp_free >= avefreei) {
+ *group = grp;
+ return 0;
+ }
+ }
}
if (avefreei) {
@@ -414,27 +551,65 @@ fallback:
* filesystems the above test can fail to find any blockgroups
*/
avefreei = 0;
- goto fallback;
+ goto fallback_retry;
}
return -1;
}
static int find_group_other(struct super_block *sb, struct inode *parent,
- ext4_group_t *group)
+ ext4_group_t *group, umode_t mode)
{
ext4_group_t parent_group = EXT4_I(parent)->i_block_group;
- ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
+ ext4_group_t i, last, ngroups = ext4_get_groups_count(sb);
struct ext4_group_desc *desc;
- ext4_group_t i;
+ int flex_size = ext4_flex_bg_size(EXT4_SB(sb));
+
+ /*
+ * Try to place the inode is the same flex group as its
+ * parent. If we can't find space, use the Orlov algorithm to
+ * find another flex group, and store that information in the
+ * parent directory's inode information so that use that flex
+ * group for future allocations.
+ */
+ if (flex_size > 1) {
+ int retry = 0;
+
+ try_again:
+ parent_group &= ~(flex_size-1);
+ last = parent_group + flex_size;
+ if (last > ngroups)
+ last = ngroups;
+ for (i = parent_group; i < last; i++) {
+ desc = ext4_get_group_desc(sb, i, NULL);
+ if (desc && ext4_free_inodes_count(sb, desc)) {
+ *group = i;
+ return 0;
+ }
+ }
+ if (!retry && EXT4_I(parent)->i_last_alloc_group != ~0) {
+ retry = 1;
+ parent_group = EXT4_I(parent)->i_last_alloc_group;
+ goto try_again;
+ }
+ /*
+ * If this didn't work, use the Orlov search algorithm
+ * to find a new flex group; we pass in the mode to
+ * avoid the topdir algorithms.
+ */
+ *group = parent_group + flex_size;
+ if (*group > ngroups)
+ *group = 0;
+ return find_group_orlov(sb, parent, group, mode, NULL);
+ }
/*
* Try to place the inode in its parent directory
*/
*group = parent_group;
desc = ext4_get_group_desc(sb, *group, NULL);
- if (desc && le16_to_cpu(desc->bg_free_inodes_count) &&
- le16_to_cpu(desc->bg_free_blocks_count))
+ if (desc && ext4_free_inodes_count(sb, desc) &&
+ ext4_free_group_clusters(sb, desc))
return 0;
/*
@@ -457,8 +632,8 @@ static int find_group_other(struct super_block *sb, struct inode *parent,
if (*group >= ngroups)
*group -= ngroups;
desc = ext4_get_group_desc(sb, *group, NULL);
- if (desc && le16_to_cpu(desc->bg_free_inodes_count) &&
- le16_to_cpu(desc->bg_free_blocks_count))
+ if (desc && ext4_free_inodes_count(sb, desc) &&
+ ext4_free_group_clusters(sb, desc))
return 0;
}
@@ -471,7 +646,7 @@ static int find_group_other(struct super_block *sb, struct inode *parent,
if (++*group >= ngroups)
*group = 0;
desc = ext4_get_group_desc(sb, *group, NULL);
- if (desc && le16_to_cpu(desc->bg_free_inodes_count))
+ if (desc && ext4_free_inodes_count(sb, desc))
return 0;
}
@@ -479,6 +654,51 @@ static int find_group_other(struct super_block *sb, struct inode *parent,
}
/*
+ * In no journal mode, if an inode has recently been deleted, we want
+ * to avoid reusing it until we're reasonably sure the inode table
+ * block has been written back to disk. (Yes, these values are
+ * somewhat arbitrary...)
+ */
+#define RECENTCY_MIN 5
+#define RECENTCY_DIRTY 30
+
+static int recently_deleted(struct super_block *sb, ext4_group_t group, int ino)
+{
+ struct ext4_group_desc *gdp;
+ struct ext4_inode *raw_inode;
+ struct buffer_head *bh;
+ unsigned long dtime, now;
+ int inodes_per_block = EXT4_SB(sb)->s_inodes_per_block;
+ int offset, ret = 0, recentcy = RECENTCY_MIN;
+
+ gdp = ext4_get_group_desc(sb, group, NULL);
+ if (unlikely(!gdp))
+ return 0;
+
+ bh = sb_getblk(sb, ext4_inode_table(sb, gdp) +
+ (ino / inodes_per_block));
+ if (unlikely(!bh) || !buffer_uptodate(bh))
+ /*
+ * If the block is not in the buffer cache, then it
+ * must have been written out.
+ */
+ goto out;
+
+ offset = (ino % inodes_per_block) * EXT4_INODE_SIZE(sb);
+ raw_inode = (struct ext4_inode *) (bh->b_data + offset);
+ dtime = le32_to_cpu(raw_inode->i_dtime);
+ now = get_seconds();
+ if (buffer_dirty(bh))
+ recentcy += RECENTCY_DIRTY;
+
+ if (dtime && (dtime < now) && (now < dtime + recentcy))
+ ret = 1;
+out:
+ brelse(bh);
+ return ret;
+}
+
+/*
* There are two policies for allocating an inode. If the new inode is
* a directory, then a forward search is made for a block group with both
* free space and a low directory-to-inode ratio; if that fails, then of
@@ -488,209 +708,272 @@ static int find_group_other(struct super_block *sb, struct inode *parent,
* For other inodes, search forward from the parent directory's block
* group to find a free inode.
*/
-struct inode *ext4_new_inode(handle_t *handle, struct inode * dir, int mode)
+struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir,
+ umode_t mode, const struct qstr *qstr,
+ __u32 goal, uid_t *owner, int handle_type,
+ unsigned int line_no, int nblocks)
{
struct super_block *sb;
- struct buffer_head *bitmap_bh = NULL;
- struct buffer_head *bh2;
- ext4_group_t group = 0;
+ struct buffer_head *inode_bitmap_bh = NULL;
+ struct buffer_head *group_desc_bh;
+ ext4_group_t ngroups, group = 0;
unsigned long ino = 0;
- struct inode * inode;
- struct ext4_group_desc * gdp = NULL;
- struct ext4_super_block * es;
+ struct inode *inode;
+ struct ext4_group_desc *gdp = NULL;
struct ext4_inode_info *ei;
struct ext4_sb_info *sbi;
int ret2, err = 0;
struct inode *ret;
ext4_group_t i;
- int free = 0;
+ ext4_group_t flex_group;
+ struct ext4_group_info *grp;
/* Cannot create files in a deleted directory */
if (!dir || !dir->i_nlink)
return ERR_PTR(-EPERM);
sb = dir->i_sb;
+ ngroups = ext4_get_groups_count(sb);
+ trace_ext4_request_inode(dir, mode);
inode = new_inode(sb);
if (!inode)
return ERR_PTR(-ENOMEM);
ei = EXT4_I(inode);
-
sbi = EXT4_SB(sb);
- es = sbi->s_es;
- if (S_ISDIR(mode)) {
- if (test_opt (sb, OLDALLOC))
- ret2 = find_group_dir(sb, dir, &group);
- else
- ret2 = find_group_orlov(sb, dir, &group);
+
+ /*
+ * Initalize owners and quota early so that we don't have to account
+ * for quota initialization worst case in standard inode creating
+ * transaction
+ */
+ if (owner) {
+ inode->i_mode = mode;
+ i_uid_write(inode, owner[0]);
+ i_gid_write(inode, owner[1]);
+ } else if (test_opt(sb, GRPID)) {
+ inode->i_mode = mode;
+ inode->i_uid = current_fsuid();
+ inode->i_gid = dir->i_gid;
} else
- ret2 = find_group_other(sb, dir, &group);
+ inode_init_owner(inode, dir, mode);
+ dquot_initialize(inode);
+
+ if (!goal)
+ goal = sbi->s_inode_goal;
+
+ if (goal && goal <= le32_to_cpu(sbi->s_es->s_inodes_count)) {
+ group = (goal - 1) / EXT4_INODES_PER_GROUP(sb);
+ ino = (goal - 1) % EXT4_INODES_PER_GROUP(sb);
+ ret2 = 0;
+ goto got_group;
+ }
+ if (S_ISDIR(mode))
+ ret2 = find_group_orlov(sb, dir, &group, mode, qstr);
+ else
+ ret2 = find_group_other(sb, dir, &group, mode);
+
+got_group:
+ EXT4_I(dir)->i_last_alloc_group = group;
err = -ENOSPC;
if (ret2 == -1)
goto out;
- for (i = 0; i < sbi->s_groups_count; i++) {
+ /*
+ * Normally we will only go through one pass of this loop,
+ * unless we get unlucky and it turns out the group we selected
+ * had its last inode grabbed by someone else.
+ */
+ for (i = 0; i < ngroups; i++, ino = 0) {
err = -EIO;
- gdp = ext4_get_group_desc(sb, group, &bh2);
+ gdp = ext4_get_group_desc(sb, group, &group_desc_bh);
if (!gdp)
- goto fail;
+ goto out;
- brelse(bitmap_bh);
- bitmap_bh = read_inode_bitmap(sb, group);
- if (!bitmap_bh)
- goto fail;
+ /*
+ * Check free inodes count before loading bitmap.
+ */
+ if (ext4_free_inodes_count(sb, gdp) == 0) {
+ if (++group == ngroups)
+ group = 0;
+ continue;
+ }
+
+ grp = ext4_get_group_info(sb, group);
+ /* Skip groups with already-known suspicious inode tables */
+ if (EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) {
+ if (++group == ngroups)
+ group = 0;
+ continue;
+ }
- ino = 0;
+ brelse(inode_bitmap_bh);
+ inode_bitmap_bh = ext4_read_inode_bitmap(sb, group);
+ /* Skip groups with suspicious inode tables */
+ if (EXT4_MB_GRP_IBITMAP_CORRUPT(grp) || !inode_bitmap_bh) {
+ if (++group == ngroups)
+ group = 0;
+ continue;
+ }
repeat_in_this_group:
ino = ext4_find_next_zero_bit((unsigned long *)
- bitmap_bh->b_data, EXT4_INODES_PER_GROUP(sb), ino);
- if (ino < EXT4_INODES_PER_GROUP(sb)) {
-
- BUFFER_TRACE(bitmap_bh, "get_write_access");
- err = ext4_journal_get_write_access(handle, bitmap_bh);
- if (err)
- goto fail;
-
- if (!ext4_set_bit_atomic(sb_bgl_lock(sbi, group),
- ino, bitmap_bh->b_data)) {
- /* we won it */
- BUFFER_TRACE(bitmap_bh,
- "call ext4_journal_dirty_metadata");
- err = ext4_journal_dirty_metadata(handle,
- bitmap_bh);
- if (err)
- goto fail;
- goto got;
+ inode_bitmap_bh->b_data,
+ EXT4_INODES_PER_GROUP(sb), ino);
+ if (ino >= EXT4_INODES_PER_GROUP(sb))
+ goto next_group;
+ if (group == 0 && (ino+1) < EXT4_FIRST_INO(sb)) {
+ ext4_error(sb, "reserved inode found cleared - "
+ "inode=%lu", ino + 1);
+ continue;
+ }
+ if ((EXT4_SB(sb)->s_journal == NULL) &&
+ recently_deleted(sb, group, ino)) {
+ ino++;
+ goto next_inode;
+ }
+ if (!handle) {
+ BUG_ON(nblocks <= 0);
+ handle = __ext4_journal_start_sb(dir->i_sb, line_no,
+ handle_type, nblocks,
+ 0);
+ if (IS_ERR(handle)) {
+ err = PTR_ERR(handle);
+ ext4_std_error(sb, err);
+ goto out;
}
- /* we lost it */
- jbd2_journal_release_buffer(handle, bitmap_bh);
-
- if (++ino < EXT4_INODES_PER_GROUP(sb))
- goto repeat_in_this_group;
}
-
- /*
- * This case is possible in concurrent environment. It is very
- * rare. We cannot repeat the find_group_xxx() call because
- * that will simply return the same blockgroup, because the
- * group descriptor metadata has not yet been updated.
- * So we just go onto the next blockgroup.
- */
- if (++group == sbi->s_groups_count)
+ BUFFER_TRACE(inode_bitmap_bh, "get_write_access");
+ err = ext4_journal_get_write_access(handle, inode_bitmap_bh);
+ if (err) {
+ ext4_std_error(sb, err);
+ goto out;
+ }
+ ext4_lock_group(sb, group);
+ ret2 = ext4_test_and_set_bit(ino, inode_bitmap_bh->b_data);
+ ext4_unlock_group(sb, group);
+ ino++; /* the inode bitmap is zero-based */
+ if (!ret2)
+ goto got; /* we grabbed the inode! */
+next_inode:
+ if (ino < EXT4_INODES_PER_GROUP(sb))
+ goto repeat_in_this_group;
+next_group:
+ if (++group == ngroups)
group = 0;
}
err = -ENOSPC;
goto out;
got:
- ino++;
- if ((group == 0 && ino < EXT4_FIRST_INO(sb)) ||
- ino > EXT4_INODES_PER_GROUP(sb)) {
- ext4_error(sb, __FUNCTION__,
- "reserved inode or inode > inodes count - "
- "block_group = %lu, inode=%lu", group,
- ino + group * EXT4_INODES_PER_GROUP(sb));
- err = -EIO;
- goto fail;
+ BUFFER_TRACE(inode_bitmap_bh, "call ext4_handle_dirty_metadata");
+ err = ext4_handle_dirty_metadata(handle, NULL, inode_bitmap_bh);
+ if (err) {
+ ext4_std_error(sb, err);
+ goto out;
}
- BUFFER_TRACE(bh2, "get_write_access");
- err = ext4_journal_get_write_access(handle, bh2);
- if (err) goto fail;
+ BUFFER_TRACE(group_desc_bh, "get_write_access");
+ err = ext4_journal_get_write_access(handle, group_desc_bh);
+ if (err) {
+ ext4_std_error(sb, err);
+ goto out;
+ }
/* We may have to initialize the block bitmap if it isn't already */
- if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM) &&
+ if (ext4_has_group_desc_csum(sb) &&
gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
- struct buffer_head *block_bh = read_block_bitmap(sb, group);
+ struct buffer_head *block_bitmap_bh;
- BUFFER_TRACE(block_bh, "get block bitmap access");
- err = ext4_journal_get_write_access(handle, block_bh);
+ block_bitmap_bh = ext4_read_block_bitmap(sb, group);
+ BUFFER_TRACE(block_bitmap_bh, "get block bitmap access");
+ err = ext4_journal_get_write_access(handle, block_bitmap_bh);
if (err) {
- brelse(block_bh);
- goto fail;
+ brelse(block_bitmap_bh);
+ ext4_std_error(sb, err);
+ goto out;
}
- free = 0;
- spin_lock(sb_bgl_lock(sbi, group));
+ BUFFER_TRACE(block_bitmap_bh, "dirty block bitmap");
+ err = ext4_handle_dirty_metadata(handle, NULL, block_bitmap_bh);
+
/* recheck and clear flag under lock if we still need to */
+ ext4_lock_group(sb, group);
if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
- free = ext4_free_blocks_after_init(sb, group, gdp);
- gdp->bg_free_blocks_count = cpu_to_le16(free);
+ ext4_free_group_clusters_set(sb, gdp,
+ ext4_free_clusters_after_init(sb, group, gdp));
+ ext4_block_bitmap_csum_set(sb, group, gdp,
+ block_bitmap_bh);
+ ext4_group_desc_csum_set(sb, group, gdp);
}
- spin_unlock(sb_bgl_lock(sbi, group));
+ ext4_unlock_group(sb, group);
+ brelse(block_bitmap_bh);
- /* Don't need to dirty bitmap block if we didn't change it */
- if (free) {
- BUFFER_TRACE(block_bh, "dirty block bitmap");
- err = ext4_journal_dirty_metadata(handle, block_bh);
+ if (err) {
+ ext4_std_error(sb, err);
+ goto out;
}
-
- brelse(block_bh);
- if (err)
- goto fail;
}
- spin_lock(sb_bgl_lock(sbi, group));
- /* If we didn't allocate from within the initialized part of the inode
- * table then we need to initialize up to this inode. */
- if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) {
+ /* Update the relevant bg descriptor fields */
+ if (ext4_has_group_desc_csum(sb)) {
+ int free;
+ struct ext4_group_info *grp = ext4_get_group_info(sb, group);
+
+ down_read(&grp->alloc_sem); /* protect vs itable lazyinit */
+ ext4_lock_group(sb, group); /* while we modify the bg desc */
+ free = EXT4_INODES_PER_GROUP(sb) -
+ ext4_itable_unused_count(sb, gdp);
if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
gdp->bg_flags &= cpu_to_le16(~EXT4_BG_INODE_UNINIT);
-
- /* When marking the block group with
- * ~EXT4_BG_INODE_UNINIT we don't want to depend
- * on the value of bg_itable_unsed even though
- * mke2fs could have initialized the same for us.
- * Instead we calculated the value below
- */
-
free = 0;
- } else {
- free = EXT4_INODES_PER_GROUP(sb) -
- le16_to_cpu(gdp->bg_itable_unused);
}
-
/*
* Check the relative inode number against the last used
* relative inode number in this group. if it is greater
- * we need to update the bg_itable_unused count
- *
+ * we need to update the bg_itable_unused count
*/
if (ino > free)
- gdp->bg_itable_unused =
- cpu_to_le16(EXT4_INODES_PER_GROUP(sb) - ino);
+ ext4_itable_unused_set(sb, gdp,
+ (EXT4_INODES_PER_GROUP(sb) - ino));
+ up_read(&grp->alloc_sem);
+ } else {
+ ext4_lock_group(sb, group);
}
- gdp->bg_free_inodes_count =
- cpu_to_le16(le16_to_cpu(gdp->bg_free_inodes_count) - 1);
+ ext4_free_inodes_set(sb, gdp, ext4_free_inodes_count(sb, gdp) - 1);
if (S_ISDIR(mode)) {
- gdp->bg_used_dirs_count =
- cpu_to_le16(le16_to_cpu(gdp->bg_used_dirs_count) + 1);
+ ext4_used_dirs_set(sb, gdp, ext4_used_dirs_count(sb, gdp) + 1);
+ if (sbi->s_log_groups_per_flex) {
+ ext4_group_t f = ext4_flex_group(sbi, group);
+
+ atomic_inc(&sbi->s_flex_groups[f].used_dirs);
+ }
+ }
+ if (ext4_has_group_desc_csum(sb)) {
+ ext4_inode_bitmap_csum_set(sb, group, gdp, inode_bitmap_bh,
+ EXT4_INODES_PER_GROUP(sb) / 8);
+ ext4_group_desc_csum_set(sb, group, gdp);
+ }
+ ext4_unlock_group(sb, group);
+
+ BUFFER_TRACE(group_desc_bh, "call ext4_handle_dirty_metadata");
+ err = ext4_handle_dirty_metadata(handle, NULL, group_desc_bh);
+ if (err) {
+ ext4_std_error(sb, err);
+ goto out;
}
- gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp);
- spin_unlock(sb_bgl_lock(sbi, group));
- BUFFER_TRACE(bh2, "call ext4_journal_dirty_metadata");
- err = ext4_journal_dirty_metadata(handle, bh2);
- if (err) goto fail;
percpu_counter_dec(&sbi->s_freeinodes_counter);
if (S_ISDIR(mode))
percpu_counter_inc(&sbi->s_dirs_counter);
- sb->s_dirt = 1;
- inode->i_uid = current->fsuid;
- if (test_opt (sb, GRPID))
- inode->i_gid = dir->i_gid;
- else if (dir->i_mode & S_ISGID) {
- inode->i_gid = dir->i_gid;
- if (S_ISDIR(mode))
- mode |= S_ISGID;
- } else
- inode->i_gid = current->fsgid;
- inode->i_mode = mode;
+ if (sbi->s_log_groups_per_flex) {
+ flex_group = ext4_flex_group(sbi, group);
+ atomic_dec(&sbi->s_flex_groups[flex_group].free_inodes);
+ }
inode->i_ino = ino + group * EXT4_INODES_PER_GROUP(sb);
/* This is the optimal IO size (for stat), not the fs block size */
@@ -702,85 +985,99 @@ got:
ei->i_dir_start_lookup = 0;
ei->i_disksize = 0;
- /*
- * Don't inherit extent flag from directory. We set extent flag on
- * newly created directory and file only if -o extent mount option is
- * specified
- */
- ei->i_flags = EXT4_I(dir)->i_flags & ~(EXT4_INDEX_FL|EXT4_EXTENTS_FL);
- if (S_ISLNK(mode))
- ei->i_flags &= ~(EXT4_IMMUTABLE_FL|EXT4_APPEND_FL);
- /* dirsync only applies to directories */
- if (!S_ISDIR(mode))
- ei->i_flags &= ~EXT4_DIRSYNC_FL;
+ /* Don't inherit extent flag from directory, amongst others. */
+ ei->i_flags =
+ ext4_mask_flags(mode, EXT4_I(dir)->i_flags & EXT4_FL_INHERITED);
ei->i_file_acl = 0;
ei->i_dtime = 0;
- ei->i_block_alloc_info = NULL;
ei->i_block_group = group;
+ ei->i_last_alloc_group = ~0;
ext4_set_inode_flags(inode);
if (IS_DIRSYNC(inode))
- handle->h_sync = 1;
- insert_inode_hash(inode);
+ ext4_handle_sync(handle);
+ if (insert_inode_locked(inode) < 0) {
+ /*
+ * Likely a bitmap corruption causing inode to be allocated
+ * twice.
+ */
+ err = -EIO;
+ ext4_error(sb, "failed to insert inode %lu: doubly allocated?",
+ inode->i_ino);
+ goto out;
+ }
spin_lock(&sbi->s_next_gen_lock);
inode->i_generation = sbi->s_next_generation++;
spin_unlock(&sbi->s_next_gen_lock);
- ei->i_state = EXT4_STATE_NEW;
+ /* Precompute checksum seed for inode metadata */
+ if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
+ EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) {
+ __u32 csum;
+ __le32 inum = cpu_to_le32(inode->i_ino);
+ __le32 gen = cpu_to_le32(inode->i_generation);
+ csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&inum,
+ sizeof(inum));
+ ei->i_csum_seed = ext4_chksum(sbi, csum, (__u8 *)&gen,
+ sizeof(gen));
+ }
+
+ ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */
+ ext4_set_inode_state(inode, EXT4_STATE_NEW);
ei->i_extra_isize = EXT4_SB(sb)->s_want_extra_isize;
+ ei->i_inline_off = 0;
+ if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_INLINE_DATA))
+ ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
+
ret = inode;
- if(DQUOT_ALLOC_INODE(inode)) {
- err = -EDQUOT;
+ err = dquot_alloc_inode(inode);
+ if (err)
goto fail_drop;
- }
err = ext4_init_acl(handle, inode, dir);
if (err)
goto fail_free_drop;
- err = ext4_init_security(handle,inode, dir);
+ err = ext4_init_security(handle, inode, dir, qstr);
if (err)
goto fail_free_drop;
+ if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) {
+ /* set extent flag only for directory, file and normal symlink*/
+ if (S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode)) {
+ ext4_set_inode_flag(inode, EXT4_INODE_EXTENTS);
+ ext4_ext_tree_init(handle, inode);
+ }
+ }
+
+ if (ext4_handle_valid(handle)) {
+ ei->i_sync_tid = handle->h_transaction->t_tid;
+ ei->i_datasync_tid = handle->h_transaction->t_tid;
+ }
+
err = ext4_mark_inode_dirty(handle, inode);
if (err) {
ext4_std_error(sb, err);
goto fail_free_drop;
}
- if (test_opt(sb, EXTENTS)) {
- /* set extent flag only for directory and file */
- if (S_ISDIR(mode) || S_ISREG(mode)) {
- EXT4_I(inode)->i_flags |= EXT4_EXTENTS_FL;
- ext4_ext_tree_init(handle, inode);
- err = ext4_update_incompat_feature(handle, sb,
- EXT4_FEATURE_INCOMPAT_EXTENTS);
- if (err)
- goto fail;
- }
- }
ext4_debug("allocating inode %lu\n", inode->i_ino);
- goto really_out;
-fail:
- ext4_std_error(sb, err);
-out:
- iput(inode);
- ret = ERR_PTR(err);
-really_out:
- brelse(bitmap_bh);
+ trace_ext4_allocate_inode(inode, dir, mode);
+ brelse(inode_bitmap_bh);
return ret;
fail_free_drop:
- DQUOT_FREE_INODE(inode);
-
+ dquot_free_inode(inode);
fail_drop:
- DQUOT_DROP(inode);
+ clear_nlink(inode);
+ unlock_new_inode(inode);
+out:
+ dquot_drop(inode);
inode->i_flags |= S_NOQUOTA;
- inode->i_nlink = 0;
iput(inode);
- brelse(bitmap_bh);
+ brelse(inode_bitmap_bh);
return ERR_PTR(err);
}
@@ -796,17 +1093,15 @@ struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino)
/* Error cases - e2fsck has already cleaned up for us */
if (ino > max_ino) {
- ext4_warning(sb, __FUNCTION__,
- "bad orphan ino %lu! e2fsck was run?", ino);
+ ext4_warning(sb, "bad orphan ino %lu! e2fsck was run?", ino);
goto error;
}
block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb);
bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb);
- bitmap_bh = read_inode_bitmap(sb, block_group);
+ bitmap_bh = ext4_read_inode_bitmap(sb, block_group);
if (!bitmap_bh) {
- ext4_warning(sb, __FUNCTION__,
- "inode bitmap error for orphan %lu", ino);
+ ext4_warning(sb, "inode bitmap error for orphan %lu", ino);
goto error;
}
@@ -821,6 +1116,14 @@ struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino)
if (IS_ERR(inode))
goto iget_failed;
+ /*
+ * If the orphans has i_nlinks > 0 then it should be able to be
+ * truncated, otherwise it won't be removed from the orphan list
+ * during processing and an infinite loop will result.
+ */
+ if (inode->i_nlink && !ext4_can_truncate(inode))
+ goto bad_orphan;
+
if (NEXT_ORPHAN(inode) > max_ino)
goto bad_orphan;
brelse(bitmap_bh);
@@ -830,18 +1133,18 @@ iget_failed:
err = PTR_ERR(inode);
inode = NULL;
bad_orphan:
- ext4_warning(sb, __FUNCTION__,
- "bad orphan inode %lu! e2fsck was run?", ino);
- printk(KERN_NOTICE "ext4_test_bit(bit=%d, block=%llu) = %d\n",
+ ext4_warning(sb, "bad orphan inode %lu! e2fsck was run?", ino);
+ printk(KERN_WARNING "ext4_test_bit(bit=%d, block=%llu) = %d\n",
bit, (unsigned long long)bitmap_bh->b_blocknr,
ext4_test_bit(bit, bitmap_bh->b_data));
- printk(KERN_NOTICE "inode=%p\n", inode);
+ printk(KERN_WARNING "inode=%p\n", inode);
if (inode) {
- printk(KERN_NOTICE "is_bad_inode(inode)=%d\n",
+ printk(KERN_WARNING "is_bad_inode(inode)=%d\n",
is_bad_inode(inode));
- printk(KERN_NOTICE "NEXT_ORPHAN(inode)=%u\n",
+ printk(KERN_WARNING "NEXT_ORPHAN(inode)=%u\n",
NEXT_ORPHAN(inode));
- printk(KERN_NOTICE "max_ino=%lu\n", max_ino);
+ printk(KERN_WARNING "max_ino=%lu\n", max_ino);
+ printk(KERN_WARNING "i_nlink=%u\n", inode->i_nlink);
/* Avoid freeing blocks if we got a bad deleted inode */
if (inode->i_nlink == 0)
inode->i_blocks = 0;
@@ -852,11 +1155,11 @@ error:
return ERR_PTR(err);
}
-unsigned long ext4_count_free_inodes (struct super_block * sb)
+unsigned long ext4_count_free_inodes(struct super_block *sb)
{
unsigned long desc_count;
struct ext4_group_desc *gdp;
- ext4_group_t i;
+ ext4_group_t i, ngroups = ext4_get_groups_count(sb);
#ifdef EXT4FS_DEBUG
struct ext4_super_block *es;
unsigned long bitmap_count, x;
@@ -866,32 +1169,34 @@ unsigned long ext4_count_free_inodes (struct super_block * sb)
desc_count = 0;
bitmap_count = 0;
gdp = NULL;
- for (i = 0; i < EXT4_SB(sb)->s_groups_count; i++) {
- gdp = ext4_get_group_desc (sb, i, NULL);
+ for (i = 0; i < ngroups; i++) {
+ gdp = ext4_get_group_desc(sb, i, NULL);
if (!gdp)
continue;
- desc_count += le16_to_cpu(gdp->bg_free_inodes_count);
+ desc_count += ext4_free_inodes_count(sb, gdp);
brelse(bitmap_bh);
- bitmap_bh = read_inode_bitmap(sb, i);
+ bitmap_bh = ext4_read_inode_bitmap(sb, i);
if (!bitmap_bh)
continue;
- x = ext4_count_free(bitmap_bh, EXT4_INODES_PER_GROUP(sb) / 8);
+ x = ext4_count_free(bitmap_bh->b_data,
+ EXT4_INODES_PER_GROUP(sb) / 8);
printk(KERN_DEBUG "group %lu: stored = %d, counted = %lu\n",
- i, le16_to_cpu(gdp->bg_free_inodes_count), x);
+ (unsigned long) i, ext4_free_inodes_count(sb, gdp), x);
bitmap_count += x;
}
brelse(bitmap_bh);
- printk("ext4_count_free_inodes: stored = %u, computed = %lu, %lu\n",
- le32_to_cpu(es->s_free_inodes_count), desc_count, bitmap_count);
+ printk(KERN_DEBUG "ext4_count_free_inodes: "
+ "stored = %u, computed = %lu, %lu\n",
+ le32_to_cpu(es->s_free_inodes_count), desc_count, bitmap_count);
return desc_count;
#else
desc_count = 0;
- for (i = 0; i < EXT4_SB(sb)->s_groups_count; i++) {
- gdp = ext4_get_group_desc (sb, i, NULL);
+ for (i = 0; i < ngroups; i++) {
+ gdp = ext4_get_group_desc(sb, i, NULL);
if (!gdp)
continue;
- desc_count += le16_to_cpu(gdp->bg_free_inodes_count);
+ desc_count += ext4_free_inodes_count(sb, gdp);
cond_resched();
}
return desc_count;
@@ -899,17 +1204,122 @@ unsigned long ext4_count_free_inodes (struct super_block * sb)
}
/* Called at mount-time, super-block is locked */
-unsigned long ext4_count_dirs (struct super_block * sb)
+unsigned long ext4_count_dirs(struct super_block * sb)
{
unsigned long count = 0;
- ext4_group_t i;
+ ext4_group_t i, ngroups = ext4_get_groups_count(sb);
- for (i = 0; i < EXT4_SB(sb)->s_groups_count; i++) {
- struct ext4_group_desc *gdp = ext4_get_group_desc (sb, i, NULL);
+ for (i = 0; i < ngroups; i++) {
+ struct ext4_group_desc *gdp = ext4_get_group_desc(sb, i, NULL);
if (!gdp)
continue;
- count += le16_to_cpu(gdp->bg_used_dirs_count);
+ count += ext4_used_dirs_count(sb, gdp);
}
return count;
}
+/*
+ * Zeroes not yet zeroed inode table - just write zeroes through the whole
+ * inode table. Must be called without any spinlock held. The only place
+ * where it is called from on active part of filesystem is ext4lazyinit
+ * thread, so we do not need any special locks, however we have to prevent
+ * inode allocation from the current group, so we take alloc_sem lock, to
+ * block ext4_new_inode() until we are finished.
+ */
+int ext4_init_inode_table(struct super_block *sb, ext4_group_t group,
+ int barrier)
+{
+ struct ext4_group_info *grp = ext4_get_group_info(sb, group);
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_group_desc *gdp = NULL;
+ struct buffer_head *group_desc_bh;
+ handle_t *handle;
+ ext4_fsblk_t blk;
+ int num, ret = 0, used_blks = 0;
+
+ /* This should not happen, but just to be sure check this */
+ if (sb->s_flags & MS_RDONLY) {
+ ret = 1;
+ goto out;
+ }
+
+ gdp = ext4_get_group_desc(sb, group, &group_desc_bh);
+ if (!gdp)
+ goto out;
+
+ /*
+ * We do not need to lock this, because we are the only one
+ * handling this flag.
+ */
+ if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED))
+ goto out;
+
+ handle = ext4_journal_start_sb(sb, EXT4_HT_MISC, 1);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ goto out;
+ }
+
+ down_write(&grp->alloc_sem);
+ /*
+ * If inode bitmap was already initialized there may be some
+ * used inodes so we need to skip blocks with used inodes in
+ * inode table.
+ */
+ if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)))
+ used_blks = DIV_ROUND_UP((EXT4_INODES_PER_GROUP(sb) -
+ ext4_itable_unused_count(sb, gdp)),
+ sbi->s_inodes_per_block);
+
+ if ((used_blks < 0) || (used_blks > sbi->s_itb_per_group)) {
+ ext4_error(sb, "Something is wrong with group %u: "
+ "used itable blocks: %d; "
+ "itable unused count: %u",
+ group, used_blks,
+ ext4_itable_unused_count(sb, gdp));
+ ret = 1;
+ goto err_out;
+ }
+
+ blk = ext4_inode_table(sb, gdp) + used_blks;
+ num = sbi->s_itb_per_group - used_blks;
+
+ BUFFER_TRACE(group_desc_bh, "get_write_access");
+ ret = ext4_journal_get_write_access(handle,
+ group_desc_bh);
+ if (ret)
+ goto err_out;
+
+ /*
+ * Skip zeroout if the inode table is full. But we set the ZEROED
+ * flag anyway, because obviously, when it is full it does not need
+ * further zeroing.
+ */
+ if (unlikely(num == 0))
+ goto skip_zeroout;
+
+ ext4_debug("going to zero out inode table in group %d\n",
+ group);
+ ret = sb_issue_zeroout(sb, blk, num, GFP_NOFS);
+ if (ret < 0)
+ goto err_out;
+ if (barrier)
+ blkdev_issue_flush(sb->s_bdev, GFP_NOFS, NULL);
+
+skip_zeroout:
+ ext4_lock_group(sb, group);
+ gdp->bg_flags |= cpu_to_le16(EXT4_BG_INODE_ZEROED);
+ ext4_group_desc_csum_set(sb, group, gdp);
+ ext4_unlock_group(sb, group);
+
+ BUFFER_TRACE(group_desc_bh,
+ "call ext4_handle_dirty_metadata");
+ ret = ext4_handle_dirty_metadata(handle, NULL,
+ group_desc_bh);
+
+err_out:
+ up_write(&grp->alloc_sem);
+ ext4_journal_stop(handle);
+out:
+ return ret;
+}
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
new file mode 100644
index 00000000000..fd69da19482
--- /dev/null
+++ b/fs/ext4/indirect.c
@@ -0,0 +1,1391 @@
+/*
+ * linux/fs/ext4/indirect.c
+ *
+ * from
+ *
+ * linux/fs/ext4/inode.c
+ *
+ * Copyright (C) 1992, 1993, 1994, 1995
+ * Remy Card (card@masi.ibp.fr)
+ * Laboratoire MASI - Institut Blaise Pascal
+ * Universite Pierre et Marie Curie (Paris VI)
+ *
+ * from
+ *
+ * linux/fs/minix/inode.c
+ *
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ *
+ * Goal-directed block allocation by Stephen Tweedie
+ * (sct@redhat.com), 1993, 1998
+ */
+
+#include <linux/aio.h>
+#include "ext4_jbd2.h"
+#include "truncate.h"
+
+#include <trace/events/ext4.h>
+
+typedef struct {
+ __le32 *p;
+ __le32 key;
+ struct buffer_head *bh;
+} Indirect;
+
+static inline void add_chain(Indirect *p, struct buffer_head *bh, __le32 *v)
+{
+ p->key = *(p->p = v);
+ p->bh = bh;
+}
+
+/**
+ * ext4_block_to_path - parse the block number into array of offsets
+ * @inode: inode in question (we are only interested in its superblock)
+ * @i_block: block number to be parsed
+ * @offsets: array to store the offsets in
+ * @boundary: set this non-zero if the referred-to block is likely to be
+ * followed (on disk) by an indirect block.
+ *
+ * To store the locations of file's data ext4 uses a data structure common
+ * for UNIX filesystems - tree of pointers anchored in the inode, with
+ * data blocks at leaves and indirect blocks in intermediate nodes.
+ * This function translates the block number into path in that tree -
+ * return value is the path length and @offsets[n] is the offset of
+ * pointer to (n+1)th node in the nth one. If @block is out of range
+ * (negative or too large) warning is printed and zero returned.
+ *
+ * Note: function doesn't find node addresses, so no IO is needed. All
+ * we need to know is the capacity of indirect blocks (taken from the
+ * inode->i_sb).
+ */
+
+/*
+ * Portability note: the last comparison (check that we fit into triple
+ * indirect block) is spelled differently, because otherwise on an
+ * architecture with 32-bit longs and 8Kb pages we might get into trouble
+ * if our filesystem had 8Kb blocks. We might use long long, but that would
+ * kill us on x86. Oh, well, at least the sign propagation does not matter -
+ * i_block would have to be negative in the very beginning, so we would not
+ * get there at all.
+ */
+
+static int ext4_block_to_path(struct inode *inode,
+ ext4_lblk_t i_block,
+ ext4_lblk_t offsets[4], int *boundary)
+{
+ int ptrs = EXT4_ADDR_PER_BLOCK(inode->i_sb);
+ int ptrs_bits = EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb);
+ const long direct_blocks = EXT4_NDIR_BLOCKS,
+ indirect_blocks = ptrs,
+ double_blocks = (1 << (ptrs_bits * 2));
+ int n = 0;
+ int final = 0;
+
+ if (i_block < direct_blocks) {
+ offsets[n++] = i_block;
+ final = direct_blocks;
+ } else if ((i_block -= direct_blocks) < indirect_blocks) {
+ offsets[n++] = EXT4_IND_BLOCK;
+ offsets[n++] = i_block;
+ final = ptrs;
+ } else if ((i_block -= indirect_blocks) < double_blocks) {
+ offsets[n++] = EXT4_DIND_BLOCK;
+ offsets[n++] = i_block >> ptrs_bits;
+ offsets[n++] = i_block & (ptrs - 1);
+ final = ptrs;
+ } else if (((i_block -= double_blocks) >> (ptrs_bits * 2)) < ptrs) {
+ offsets[n++] = EXT4_TIND_BLOCK;
+ offsets[n++] = i_block >> (ptrs_bits * 2);
+ offsets[n++] = (i_block >> ptrs_bits) & (ptrs - 1);
+ offsets[n++] = i_block & (ptrs - 1);
+ final = ptrs;
+ } else {
+ ext4_warning(inode->i_sb, "block %lu > max in inode %lu",
+ i_block + direct_blocks +
+ indirect_blocks + double_blocks, inode->i_ino);
+ }
+ if (boundary)
+ *boundary = final - 1 - (i_block & (ptrs - 1));
+ return n;
+}
+
+/**
+ * ext4_get_branch - read the chain of indirect blocks leading to data
+ * @inode: inode in question
+ * @depth: depth of the chain (1 - direct pointer, etc.)
+ * @offsets: offsets of pointers in inode/indirect blocks
+ * @chain: place to store the result
+ * @err: here we store the error value
+ *
+ * Function fills the array of triples <key, p, bh> and returns %NULL
+ * if everything went OK or the pointer to the last filled triple
+ * (incomplete one) otherwise. Upon the return chain[i].key contains
+ * the number of (i+1)-th block in the chain (as it is stored in memory,
+ * i.e. little-endian 32-bit), chain[i].p contains the address of that
+ * number (it points into struct inode for i==0 and into the bh->b_data
+ * for i>0) and chain[i].bh points to the buffer_head of i-th indirect
+ * block for i>0 and NULL for i==0. In other words, it holds the block
+ * numbers of the chain, addresses they were taken from (and where we can
+ * verify that chain did not change) and buffer_heads hosting these
+ * numbers.
+ *
+ * Function stops when it stumbles upon zero pointer (absent block)
+ * (pointer to last triple returned, *@err == 0)
+ * or when it gets an IO error reading an indirect block
+ * (ditto, *@err == -EIO)
+ * or when it reads all @depth-1 indirect blocks successfully and finds
+ * the whole chain, all way to the data (returns %NULL, *err == 0).
+ *
+ * Need to be called with
+ * down_read(&EXT4_I(inode)->i_data_sem)
+ */
+static Indirect *ext4_get_branch(struct inode *inode, int depth,
+ ext4_lblk_t *offsets,
+ Indirect chain[4], int *err)
+{
+ struct super_block *sb = inode->i_sb;
+ Indirect *p = chain;
+ struct buffer_head *bh;
+ int ret = -EIO;
+
+ *err = 0;
+ /* i_data is not going away, no lock needed */
+ add_chain(chain, NULL, EXT4_I(inode)->i_data + *offsets);
+ if (!p->key)
+ goto no_block;
+ while (--depth) {
+ bh = sb_getblk(sb, le32_to_cpu(p->key));
+ if (unlikely(!bh)) {
+ ret = -ENOMEM;
+ goto failure;
+ }
+
+ if (!bh_uptodate_or_lock(bh)) {
+ if (bh_submit_read(bh) < 0) {
+ put_bh(bh);
+ goto failure;
+ }
+ /* validate block references */
+ if (ext4_check_indirect_blockref(inode, bh)) {
+ put_bh(bh);
+ goto failure;
+ }
+ }
+
+ add_chain(++p, bh, (__le32 *)bh->b_data + *++offsets);
+ /* Reader: end */
+ if (!p->key)
+ goto no_block;
+ }
+ return NULL;
+
+failure:
+ *err = ret;
+no_block:
+ return p;
+}
+
+/**
+ * ext4_find_near - find a place for allocation with sufficient locality
+ * @inode: owner
+ * @ind: descriptor of indirect block.
+ *
+ * This function returns the preferred place for block allocation.
+ * It is used when heuristic for sequential allocation fails.
+ * Rules are:
+ * + if there is a block to the left of our position - allocate near it.
+ * + if pointer will live in indirect block - allocate near that block.
+ * + if pointer will live in inode - allocate in the same
+ * cylinder group.
+ *
+ * In the latter case we colour the starting block by the callers PID to
+ * prevent it from clashing with concurrent allocations for a different inode
+ * in the same block group. The PID is used here so that functionally related
+ * files will be close-by on-disk.
+ *
+ * Caller must make sure that @ind is valid and will stay that way.
+ */
+static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind)
+{
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ __le32 *start = ind->bh ? (__le32 *) ind->bh->b_data : ei->i_data;
+ __le32 *p;
+
+ /* Try to find previous block */
+ for (p = ind->p - 1; p >= start; p--) {
+ if (*p)
+ return le32_to_cpu(*p);
+ }
+
+ /* No such thing, so let's try location of indirect block */
+ if (ind->bh)
+ return ind->bh->b_blocknr;
+
+ /*
+ * It is going to be referred to from the inode itself? OK, just put it
+ * into the same cylinder group then.
+ */
+ return ext4_inode_to_goal_block(inode);
+}
+
+/**
+ * ext4_find_goal - find a preferred place for allocation.
+ * @inode: owner
+ * @block: block we want
+ * @partial: pointer to the last triple within a chain
+ *
+ * Normally this function find the preferred place for block allocation,
+ * returns it.
+ * Because this is only used for non-extent files, we limit the block nr
+ * to 32 bits.
+ */
+static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block,
+ Indirect *partial)
+{
+ ext4_fsblk_t goal;
+
+ /*
+ * XXX need to get goal block from mballoc's data structures
+ */
+
+ goal = ext4_find_near(inode, partial);
+ goal = goal & EXT4_MAX_BLOCK_FILE_PHYS;
+ return goal;
+}
+
+/**
+ * ext4_blks_to_allocate - Look up the block map and count the number
+ * of direct blocks need to be allocated for the given branch.
+ *
+ * @branch: chain of indirect blocks
+ * @k: number of blocks need for indirect blocks
+ * @blks: number of data blocks to be mapped.
+ * @blocks_to_boundary: the offset in the indirect block
+ *
+ * return the total number of blocks to be allocate, including the
+ * direct and indirect blocks.
+ */
+static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned int blks,
+ int blocks_to_boundary)
+{
+ unsigned int count = 0;
+
+ /*
+ * Simple case, [t,d]Indirect block(s) has not allocated yet
+ * then it's clear blocks on that path have not allocated
+ */
+ if (k > 0) {
+ /* right now we don't handle cross boundary allocation */
+ if (blks < blocks_to_boundary + 1)
+ count += blks;
+ else
+ count += blocks_to_boundary + 1;
+ return count;
+ }
+
+ count++;
+ while (count < blks && count <= blocks_to_boundary &&
+ le32_to_cpu(*(branch[0].p + count)) == 0) {
+ count++;
+ }
+ return count;
+}
+
+/**
+ * ext4_alloc_branch - allocate and set up a chain of blocks.
+ * @handle: handle for this transaction
+ * @inode: owner
+ * @indirect_blks: number of allocated indirect blocks
+ * @blks: number of allocated direct blocks
+ * @goal: preferred place for allocation
+ * @offsets: offsets (in the blocks) to store the pointers to next.
+ * @branch: place to store the chain in.
+ *
+ * This function allocates blocks, zeroes out all but the last one,
+ * links them into chain and (if we are synchronous) writes them to disk.
+ * In other words, it prepares a branch that can be spliced onto the
+ * inode. It stores the information about that chain in the branch[], in
+ * the same format as ext4_get_branch() would do. We are calling it after
+ * we had read the existing part of chain and partial points to the last
+ * triple of that (one with zero ->key). Upon the exit we have the same
+ * picture as after the successful ext4_get_block(), except that in one
+ * place chain is disconnected - *branch->p is still zero (we did not
+ * set the last link), but branch->key contains the number that should
+ * be placed into *branch->p to fill that gap.
+ *
+ * If allocation fails we free all blocks we've allocated (and forget
+ * their buffer_heads) and return the error value the from failed
+ * ext4_alloc_block() (normally -ENOSPC). Otherwise we set the chain
+ * as described above and return 0.
+ */
+static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
+ ext4_lblk_t iblock, int indirect_blks,
+ int *blks, ext4_fsblk_t goal,
+ ext4_lblk_t *offsets, Indirect *branch)
+{
+ struct ext4_allocation_request ar;
+ struct buffer_head * bh;
+ ext4_fsblk_t b, new_blocks[4];
+ __le32 *p;
+ int i, j, err, len = 1;
+
+ /*
+ * Set up for the direct block allocation
+ */
+ memset(&ar, 0, sizeof(ar));
+ ar.inode = inode;
+ ar.len = *blks;
+ ar.logical = iblock;
+ if (S_ISREG(inode->i_mode))
+ ar.flags = EXT4_MB_HINT_DATA;
+
+ for (i = 0; i <= indirect_blks; i++) {
+ if (i == indirect_blks) {
+ ar.goal = goal;
+ new_blocks[i] = ext4_mb_new_blocks(handle, &ar, &err);
+ } else
+ goal = new_blocks[i] = ext4_new_meta_blocks(handle, inode,
+ goal, 0, NULL, &err);
+ if (err) {
+ i--;
+ goto failed;
+ }
+ branch[i].key = cpu_to_le32(new_blocks[i]);
+ if (i == 0)
+ continue;
+
+ bh = branch[i].bh = sb_getblk(inode->i_sb, new_blocks[i-1]);
+ if (unlikely(!bh)) {
+ err = -ENOMEM;
+ goto failed;
+ }
+ lock_buffer(bh);
+ BUFFER_TRACE(bh, "call get_create_access");
+ err = ext4_journal_get_create_access(handle, bh);
+ if (err) {
+ unlock_buffer(bh);
+ goto failed;
+ }
+
+ memset(bh->b_data, 0, bh->b_size);
+ p = branch[i].p = (__le32 *) bh->b_data + offsets[i];
+ b = new_blocks[i];
+
+ if (i == indirect_blks)
+ len = ar.len;
+ for (j = 0; j < len; j++)
+ *p++ = cpu_to_le32(b++);
+
+ BUFFER_TRACE(bh, "marking uptodate");
+ set_buffer_uptodate(bh);
+ unlock_buffer(bh);
+
+ BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
+ err = ext4_handle_dirty_metadata(handle, inode, bh);
+ if (err)
+ goto failed;
+ }
+ *blks = ar.len;
+ return 0;
+failed:
+ for (; i >= 0; i--) {
+ /*
+ * We want to ext4_forget() only freshly allocated indirect
+ * blocks. Buffer for new_blocks[i-1] is at branch[i].bh and
+ * buffer at branch[0].bh is indirect block / inode already
+ * existing before ext4_alloc_branch() was called.
+ */
+ if (i > 0 && i != indirect_blks && branch[i].bh)
+ ext4_forget(handle, 1, inode, branch[i].bh,
+ branch[i].bh->b_blocknr);
+ ext4_free_blocks(handle, inode, NULL, new_blocks[i],
+ (i == indirect_blks) ? ar.len : 1, 0);
+ }
+ return err;
+}
+
+/**
+ * ext4_splice_branch - splice the allocated branch onto inode.
+ * @handle: handle for this transaction
+ * @inode: owner
+ * @block: (logical) number of block we are adding
+ * @chain: chain of indirect blocks (with a missing link - see
+ * ext4_alloc_branch)
+ * @where: location of missing link
+ * @num: number of indirect blocks we are adding
+ * @blks: number of direct blocks we are adding
+ *
+ * This function fills the missing link and does all housekeeping needed in
+ * inode (->i_blocks, etc.). In case of success we end up with the full
+ * chain to new block and return 0.
+ */
+static int ext4_splice_branch(handle_t *handle, struct inode *inode,
+ ext4_lblk_t block, Indirect *where, int num,
+ int blks)
+{
+ int i;
+ int err = 0;
+ ext4_fsblk_t current_block;
+
+ /*
+ * If we're splicing into a [td]indirect block (as opposed to the
+ * inode) then we need to get write access to the [td]indirect block
+ * before the splice.
+ */
+ if (where->bh) {
+ BUFFER_TRACE(where->bh, "get_write_access");
+ err = ext4_journal_get_write_access(handle, where->bh);
+ if (err)
+ goto err_out;
+ }
+ /* That's it */
+
+ *where->p = where->key;
+
+ /*
+ * Update the host buffer_head or inode to point to more just allocated
+ * direct blocks blocks
+ */
+ if (num == 0 && blks > 1) {
+ current_block = le32_to_cpu(where->key) + 1;
+ for (i = 1; i < blks; i++)
+ *(where->p + i) = cpu_to_le32(current_block++);
+ }
+
+ /* We are done with atomic stuff, now do the rest of housekeeping */
+ /* had we spliced it onto indirect block? */
+ if (where->bh) {
+ /*
+ * If we spliced it onto an indirect block, we haven't
+ * altered the inode. Note however that if it is being spliced
+ * onto an indirect block at the very end of the file (the
+ * file is growing) then we *will* alter the inode to reflect
+ * the new i_size. But that is not done here - it is done in
+ * generic_commit_write->__mark_inode_dirty->ext4_dirty_inode.
+ */
+ jbd_debug(5, "splicing indirect only\n");
+ BUFFER_TRACE(where->bh, "call ext4_handle_dirty_metadata");
+ err = ext4_handle_dirty_metadata(handle, inode, where->bh);
+ if (err)
+ goto err_out;
+ } else {
+ /*
+ * OK, we spliced it into the inode itself on a direct block.
+ */
+ ext4_mark_inode_dirty(handle, inode);
+ jbd_debug(5, "splicing direct\n");
+ }
+ return err;
+
+err_out:
+ for (i = 1; i <= num; i++) {
+ /*
+ * branch[i].bh is newly allocated, so there is no
+ * need to revoke the block, which is why we don't
+ * need to set EXT4_FREE_BLOCKS_METADATA.
+ */
+ ext4_free_blocks(handle, inode, where[i].bh, 0, 1,
+ EXT4_FREE_BLOCKS_FORGET);
+ }
+ ext4_free_blocks(handle, inode, NULL, le32_to_cpu(where[num].key),
+ blks, 0);
+
+ return err;
+}
+
+/*
+ * The ext4_ind_map_blocks() function handles non-extents inodes
+ * (i.e., using the traditional indirect/double-indirect i_blocks
+ * scheme) for ext4_map_blocks().
+ *
+ * Allocation strategy is simple: if we have to allocate something, we will
+ * have to go the whole way to leaf. So let's do it before attaching anything
+ * to tree, set linkage between the newborn blocks, write them if sync is
+ * required, recheck the path, free and repeat if check fails, otherwise
+ * set the last missing link (that will protect us from any truncate-generated
+ * removals - all blocks on the path are immune now) and possibly force the
+ * write on the parent block.
+ * That has a nice additional property: no special recovery from the failed
+ * allocations is needed - we simply release blocks and do not touch anything
+ * reachable from inode.
+ *
+ * `handle' can be NULL if create == 0.
+ *
+ * return > 0, # of blocks mapped or allocated.
+ * return = 0, if plain lookup failed.
+ * return < 0, error case.
+ *
+ * The ext4_ind_get_blocks() function should be called with
+ * down_write(&EXT4_I(inode)->i_data_sem) if allocating filesystem
+ * blocks (i.e., flags has EXT4_GET_BLOCKS_CREATE set) or
+ * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system
+ * blocks.
+ */
+int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
+ struct ext4_map_blocks *map,
+ int flags)
+{
+ int err = -EIO;
+ ext4_lblk_t offsets[4];
+ Indirect chain[4];
+ Indirect *partial;
+ ext4_fsblk_t goal;
+ int indirect_blks;
+ int blocks_to_boundary = 0;
+ int depth;
+ int count = 0;
+ ext4_fsblk_t first_block = 0;
+
+ trace_ext4_ind_map_blocks_enter(inode, map->m_lblk, map->m_len, flags);
+ J_ASSERT(!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)));
+ J_ASSERT(handle != NULL || (flags & EXT4_GET_BLOCKS_CREATE) == 0);
+ depth = ext4_block_to_path(inode, map->m_lblk, offsets,
+ &blocks_to_boundary);
+
+ if (depth == 0)
+ goto out;
+
+ partial = ext4_get_branch(inode, depth, offsets, chain, &err);
+
+ /* Simplest case - block found, no allocation needed */
+ if (!partial) {
+ first_block = le32_to_cpu(chain[depth - 1].key);
+ count++;
+ /*map more blocks*/
+ while (count < map->m_len && count <= blocks_to_boundary) {
+ ext4_fsblk_t blk;
+
+ blk = le32_to_cpu(*(chain[depth-1].p + count));
+
+ if (blk == first_block + count)
+ count++;
+ else
+ break;
+ }
+ goto got_it;
+ }
+
+ /* Next simple case - plain lookup or failed read of indirect block */
+ if ((flags & EXT4_GET_BLOCKS_CREATE) == 0 || err == -EIO)
+ goto cleanup;
+
+ /*
+ * Okay, we need to do block allocation.
+ */
+ if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
+ EXT4_FEATURE_RO_COMPAT_BIGALLOC)) {
+ EXT4_ERROR_INODE(inode, "Can't allocate blocks for "
+ "non-extent mapped inodes with bigalloc");
+ return -ENOSPC;
+ }
+
+ goal = ext4_find_goal(inode, map->m_lblk, partial);
+
+ /* the number of blocks need to allocate for [d,t]indirect blocks */
+ indirect_blks = (chain + depth) - partial - 1;
+
+ /*
+ * Next look up the indirect map to count the totoal number of
+ * direct blocks to allocate for this branch.
+ */
+ count = ext4_blks_to_allocate(partial, indirect_blks,
+ map->m_len, blocks_to_boundary);
+ /*
+ * Block out ext4_truncate while we alter the tree
+ */
+ err = ext4_alloc_branch(handle, inode, map->m_lblk, indirect_blks,
+ &count, goal,
+ offsets + (partial - chain), partial);
+
+ /*
+ * The ext4_splice_branch call will free and forget any buffers
+ * on the new chain if there is a failure, but that risks using
+ * up transaction credits, especially for bitmaps where the
+ * credits cannot be returned. Can we handle this somehow? We
+ * may need to return -EAGAIN upwards in the worst case. --sct
+ */
+ if (!err)
+ err = ext4_splice_branch(handle, inode, map->m_lblk,
+ partial, indirect_blks, count);
+ if (err)
+ goto cleanup;
+
+ map->m_flags |= EXT4_MAP_NEW;
+
+ ext4_update_inode_fsync_trans(handle, inode, 1);
+got_it:
+ map->m_flags |= EXT4_MAP_MAPPED;
+ map->m_pblk = le32_to_cpu(chain[depth-1].key);
+ map->m_len = count;
+ if (count > blocks_to_boundary)
+ map->m_flags |= EXT4_MAP_BOUNDARY;
+ err = count;
+ /* Clean up and exit */
+ partial = chain + depth - 1; /* the whole chain */
+cleanup:
+ while (partial > chain) {
+ BUFFER_TRACE(partial->bh, "call brelse");
+ brelse(partial->bh);
+ partial--;
+ }
+out:
+ trace_ext4_ind_map_blocks_exit(inode, flags, map, err);
+ return err;
+}
+
+/*
+ * O_DIRECT for ext3 (or indirect map) based files
+ *
+ * If the O_DIRECT write will extend the file then add this inode to the
+ * orphan list. So recovery will truncate it back to the original size
+ * if the machine crashes during the write.
+ *
+ * If the O_DIRECT write is intantiating holes inside i_size and the machine
+ * crashes then stale disk data _may_ be exposed inside the file. But current
+ * VFS code falls back into buffered path in that case so we are safe.
+ */
+ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,
+ struct iov_iter *iter, loff_t offset)
+{
+ struct file *file = iocb->ki_filp;
+ struct inode *inode = file->f_mapping->host;
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ handle_t *handle;
+ ssize_t ret;
+ int orphan = 0;
+ size_t count = iov_iter_count(iter);
+ int retries = 0;
+
+ if (rw == WRITE) {
+ loff_t final_size = offset + count;
+
+ if (final_size > inode->i_size) {
+ /* Credits for sb + inode write */
+ handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ goto out;
+ }
+ ret = ext4_orphan_add(handle, inode);
+ if (ret) {
+ ext4_journal_stop(handle);
+ goto out;
+ }
+ orphan = 1;
+ ei->i_disksize = inode->i_size;
+ ext4_journal_stop(handle);
+ }
+ }
+
+retry:
+ if (rw == READ && ext4_should_dioread_nolock(inode)) {
+ /*
+ * Nolock dioread optimization may be dynamically disabled
+ * via ext4_inode_block_unlocked_dio(). Check inode's state
+ * while holding extra i_dio_count ref.
+ */
+ atomic_inc(&inode->i_dio_count);
+ smp_mb();
+ if (unlikely(ext4_test_inode_state(inode,
+ EXT4_STATE_DIOREAD_LOCK))) {
+ inode_dio_done(inode);
+ goto locked;
+ }
+ ret = __blockdev_direct_IO(rw, iocb, inode,
+ inode->i_sb->s_bdev, iter, offset,
+ ext4_get_block, NULL, NULL, 0);
+ inode_dio_done(inode);
+ } else {
+locked:
+ ret = blockdev_direct_IO(rw, iocb, inode, iter,
+ offset, ext4_get_block);
+
+ if (unlikely((rw & WRITE) && ret < 0)) {
+ loff_t isize = i_size_read(inode);
+ loff_t end = offset + count;
+
+ if (end > isize)
+ ext4_truncate_failed_write(inode);
+ }
+ }
+ if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
+ goto retry;
+
+ if (orphan) {
+ int err;
+
+ /* Credits for sb + inode write */
+ handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
+ if (IS_ERR(handle)) {
+ /* This is really bad luck. We've written the data
+ * but cannot extend i_size. Bail out and pretend
+ * the write failed... */
+ ret = PTR_ERR(handle);
+ if (inode->i_nlink)
+ ext4_orphan_del(NULL, inode);
+
+ goto out;
+ }
+ if (inode->i_nlink)
+ ext4_orphan_del(handle, inode);
+ if (ret > 0) {
+ loff_t end = offset + ret;
+ if (end > inode->i_size) {
+ ei->i_disksize = end;
+ i_size_write(inode, end);
+ /*
+ * We're going to return a positive `ret'
+ * here due to non-zero-length I/O, so there's
+ * no way of reporting error returns from
+ * ext4_mark_inode_dirty() to userspace. So
+ * ignore it.
+ */
+ ext4_mark_inode_dirty(handle, inode);
+ }
+ }
+ err = ext4_journal_stop(handle);
+ if (ret == 0)
+ ret = err;
+ }
+out:
+ return ret;
+}
+
+/*
+ * Calculate the number of metadata blocks need to reserve
+ * to allocate a new block at @lblocks for non extent file based file
+ */
+int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock)
+{
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ sector_t dind_mask = ~((sector_t)EXT4_ADDR_PER_BLOCK(inode->i_sb) - 1);
+ int blk_bits;
+
+ if (lblock < EXT4_NDIR_BLOCKS)
+ return 0;
+
+ lblock -= EXT4_NDIR_BLOCKS;
+
+ if (ei->i_da_metadata_calc_len &&
+ (lblock & dind_mask) == ei->i_da_metadata_calc_last_lblock) {
+ ei->i_da_metadata_calc_len++;
+ return 0;
+ }
+ ei->i_da_metadata_calc_last_lblock = lblock & dind_mask;
+ ei->i_da_metadata_calc_len = 1;
+ blk_bits = order_base_2(lblock);
+ return (blk_bits / EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb)) + 1;
+}
+
+/*
+ * Calculate number of indirect blocks touched by mapping @nrblocks logically
+ * contiguous blocks
+ */
+int ext4_ind_trans_blocks(struct inode *inode, int nrblocks)
+{
+ /*
+ * With N contiguous data blocks, we need at most
+ * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) + 1 indirect blocks,
+ * 2 dindirect blocks, and 1 tindirect block
+ */
+ return DIV_ROUND_UP(nrblocks, EXT4_ADDR_PER_BLOCK(inode->i_sb)) + 4;
+}
+
+/*
+ * Truncate transactions can be complex and absolutely huge. So we need to
+ * be able to restart the transaction at a conventient checkpoint to make
+ * sure we don't overflow the journal.
+ *
+ * Try to extend this transaction for the purposes of truncation. If
+ * extend fails, we need to propagate the failure up and restart the
+ * transaction in the top-level truncate loop. --sct
+ *
+ * Returns 0 if we managed to create more room. If we can't create more
+ * room, and the transaction must be restarted we return 1.
+ */
+static int try_to_extend_transaction(handle_t *handle, struct inode *inode)
+{
+ if (!ext4_handle_valid(handle))
+ return 0;
+ if (ext4_handle_has_enough_credits(handle, EXT4_RESERVE_TRANS_BLOCKS+1))
+ return 0;
+ if (!ext4_journal_extend(handle, ext4_blocks_for_truncate(inode)))
+ return 0;
+ return 1;
+}
+
+/*
+ * Probably it should be a library function... search for first non-zero word
+ * or memcmp with zero_page, whatever is better for particular architecture.
+ * Linus?
+ */
+static inline int all_zeroes(__le32 *p, __le32 *q)
+{
+ while (p < q)
+ if (*p++)
+ return 0;
+ return 1;
+}
+
+/**
+ * ext4_find_shared - find the indirect blocks for partial truncation.
+ * @inode: inode in question
+ * @depth: depth of the affected branch
+ * @offsets: offsets of pointers in that branch (see ext4_block_to_path)
+ * @chain: place to store the pointers to partial indirect blocks
+ * @top: place to the (detached) top of branch
+ *
+ * This is a helper function used by ext4_truncate().
+ *
+ * When we do truncate() we may have to clean the ends of several
+ * indirect blocks but leave the blocks themselves alive. Block is
+ * partially truncated if some data below the new i_size is referred
+ * from it (and it is on the path to the first completely truncated
+ * data block, indeed). We have to free the top of that path along
+ * with everything to the right of the path. Since no allocation
+ * past the truncation point is possible until ext4_truncate()
+ * finishes, we may safely do the latter, but top of branch may
+ * require special attention - pageout below the truncation point
+ * might try to populate it.
+ *
+ * We atomically detach the top of branch from the tree, store the
+ * block number of its root in *@top, pointers to buffer_heads of
+ * partially truncated blocks - in @chain[].bh and pointers to
+ * their last elements that should not be removed - in
+ * @chain[].p. Return value is the pointer to last filled element
+ * of @chain.
+ *
+ * The work left to caller to do the actual freeing of subtrees:
+ * a) free the subtree starting from *@top
+ * b) free the subtrees whose roots are stored in
+ * (@chain[i].p+1 .. end of @chain[i].bh->b_data)
+ * c) free the subtrees growing from the inode past the @chain[0].
+ * (no partially truncated stuff there). */
+
+static Indirect *ext4_find_shared(struct inode *inode, int depth,
+ ext4_lblk_t offsets[4], Indirect chain[4],
+ __le32 *top)
+{
+ Indirect *partial, *p;
+ int k, err;
+
+ *top = 0;
+ /* Make k index the deepest non-null offset + 1 */
+ for (k = depth; k > 1 && !offsets[k-1]; k--)
+ ;
+ partial = ext4_get_branch(inode, k, offsets, chain, &err);
+ /* Writer: pointers */
+ if (!partial)
+ partial = chain + k-1;
+ /*
+ * If the branch acquired continuation since we've looked at it -
+ * fine, it should all survive and (new) top doesn't belong to us.
+ */
+ if (!partial->key && *partial->p)
+ /* Writer: end */
+ goto no_top;
+ for (p = partial; (p > chain) && all_zeroes((__le32 *) p->bh->b_data, p->p); p--)
+ ;
+ /*
+ * OK, we've found the last block that must survive. The rest of our
+ * branch should be detached before unlocking. However, if that rest
+ * of branch is all ours and does not grow immediately from the inode
+ * it's easier to cheat and just decrement partial->p.
+ */
+ if (p == chain + k - 1 && p > chain) {
+ p->p--;
+ } else {
+ *top = *p->p;
+ /* Nope, don't do this in ext4. Must leave the tree intact */
+#if 0
+ *p->p = 0;
+#endif
+ }
+ /* Writer: end */
+
+ while (partial > p) {
+ brelse(partial->bh);
+ partial--;
+ }
+no_top:
+ return partial;
+}
+
+/*
+ * Zero a number of block pointers in either an inode or an indirect block.
+ * If we restart the transaction we must again get write access to the
+ * indirect block for further modification.
+ *
+ * We release `count' blocks on disk, but (last - first) may be greater
+ * than `count' because there can be holes in there.
+ *
+ * Return 0 on success, 1 on invalid block range
+ * and < 0 on fatal error.
+ */
+static int ext4_clear_blocks(handle_t *handle, struct inode *inode,
+ struct buffer_head *bh,
+ ext4_fsblk_t block_to_free,
+ unsigned long count, __le32 *first,
+ __le32 *last)
+{
+ __le32 *p;
+ int flags = EXT4_FREE_BLOCKS_VALIDATED;
+ int err;
+
+ if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
+ flags |= EXT4_FREE_BLOCKS_FORGET | EXT4_FREE_BLOCKS_METADATA;
+ else if (ext4_should_journal_data(inode))
+ flags |= EXT4_FREE_BLOCKS_FORGET;
+
+ if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), block_to_free,
+ count)) {
+ EXT4_ERROR_INODE(inode, "attempt to clear invalid "
+ "blocks %llu len %lu",
+ (unsigned long long) block_to_free, count);
+ return 1;
+ }
+
+ if (try_to_extend_transaction(handle, inode)) {
+ if (bh) {
+ BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
+ err = ext4_handle_dirty_metadata(handle, inode, bh);
+ if (unlikely(err))
+ goto out_err;
+ }
+ err = ext4_mark_inode_dirty(handle, inode);
+ if (unlikely(err))
+ goto out_err;
+ err = ext4_truncate_restart_trans(handle, inode,
+ ext4_blocks_for_truncate(inode));
+ if (unlikely(err))
+ goto out_err;
+ if (bh) {
+ BUFFER_TRACE(bh, "retaking write access");
+ err = ext4_journal_get_write_access(handle, bh);
+ if (unlikely(err))
+ goto out_err;
+ }
+ }
+
+ for (p = first; p < last; p++)
+ *p = 0;
+
+ ext4_free_blocks(handle, inode, NULL, block_to_free, count, flags);
+ return 0;
+out_err:
+ ext4_std_error(inode->i_sb, err);
+ return err;
+}
+
+/**
+ * ext4_free_data - free a list of data blocks
+ * @handle: handle for this transaction
+ * @inode: inode we are dealing with
+ * @this_bh: indirect buffer_head which contains *@first and *@last
+ * @first: array of block numbers
+ * @last: points immediately past the end of array
+ *
+ * We are freeing all blocks referred from that array (numbers are stored as
+ * little-endian 32-bit) and updating @inode->i_blocks appropriately.
+ *
+ * We accumulate contiguous runs of blocks to free. Conveniently, if these
+ * blocks are contiguous then releasing them at one time will only affect one
+ * or two bitmap blocks (+ group descriptor(s) and superblock) and we won't
+ * actually use a lot of journal space.
+ *
+ * @this_bh will be %NULL if @first and @last point into the inode's direct
+ * block pointers.
+ */
+static void ext4_free_data(handle_t *handle, struct inode *inode,
+ struct buffer_head *this_bh,
+ __le32 *first, __le32 *last)
+{
+ ext4_fsblk_t block_to_free = 0; /* Starting block # of a run */
+ unsigned long count = 0; /* Number of blocks in the run */
+ __le32 *block_to_free_p = NULL; /* Pointer into inode/ind
+ corresponding to
+ block_to_free */
+ ext4_fsblk_t nr; /* Current block # */
+ __le32 *p; /* Pointer into inode/ind
+ for current block */
+ int err = 0;
+
+ if (this_bh) { /* For indirect block */
+ BUFFER_TRACE(this_bh, "get_write_access");
+ err = ext4_journal_get_write_access(handle, this_bh);
+ /* Important: if we can't update the indirect pointers
+ * to the blocks, we can't free them. */
+ if (err)
+ return;
+ }
+
+ for (p = first; p < last; p++) {
+ nr = le32_to_cpu(*p);
+ if (nr) {
+ /* accumulate blocks to free if they're contiguous */
+ if (count == 0) {
+ block_to_free = nr;
+ block_to_free_p = p;
+ count = 1;
+ } else if (nr == block_to_free + count) {
+ count++;
+ } else {
+ err = ext4_clear_blocks(handle, inode, this_bh,
+ block_to_free, count,
+ block_to_free_p, p);
+ if (err)
+ break;
+ block_to_free = nr;
+ block_to_free_p = p;
+ count = 1;
+ }
+ }
+ }
+
+ if (!err && count > 0)
+ err = ext4_clear_blocks(handle, inode, this_bh, block_to_free,
+ count, block_to_free_p, p);
+ if (err < 0)
+ /* fatal error */
+ return;
+
+ if (this_bh) {
+ BUFFER_TRACE(this_bh, "call ext4_handle_dirty_metadata");
+
+ /*
+ * The buffer head should have an attached journal head at this
+ * point. However, if the data is corrupted and an indirect
+ * block pointed to itself, it would have been detached when
+ * the block was cleared. Check for this instead of OOPSing.
+ */
+ if ((EXT4_JOURNAL(inode) == NULL) || bh2jh(this_bh))
+ ext4_handle_dirty_metadata(handle, inode, this_bh);
+ else
+ EXT4_ERROR_INODE(inode,
+ "circular indirect block detected at "
+ "block %llu",
+ (unsigned long long) this_bh->b_blocknr);
+ }
+}
+
+/**
+ * ext4_free_branches - free an array of branches
+ * @handle: JBD handle for this transaction
+ * @inode: inode we are dealing with
+ * @parent_bh: the buffer_head which contains *@first and *@last
+ * @first: array of block numbers
+ * @last: pointer immediately past the end of array
+ * @depth: depth of the branches to free
+ *
+ * We are freeing all blocks referred from these branches (numbers are
+ * stored as little-endian 32-bit) and updating @inode->i_blocks
+ * appropriately.
+ */
+static void ext4_free_branches(handle_t *handle, struct inode *inode,
+ struct buffer_head *parent_bh,
+ __le32 *first, __le32 *last, int depth)
+{
+ ext4_fsblk_t nr;
+ __le32 *p;
+
+ if (ext4_handle_is_aborted(handle))
+ return;
+
+ if (depth--) {
+ struct buffer_head *bh;
+ int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb);
+ p = last;
+ while (--p >= first) {
+ nr = le32_to_cpu(*p);
+ if (!nr)
+ continue; /* A hole */
+
+ if (!ext4_data_block_valid(EXT4_SB(inode->i_sb),
+ nr, 1)) {
+ EXT4_ERROR_INODE(inode,
+ "invalid indirect mapped "
+ "block %lu (level %d)",
+ (unsigned long) nr, depth);
+ break;
+ }
+
+ /* Go read the buffer for the next level down */
+ bh = sb_bread(inode->i_sb, nr);
+
+ /*
+ * A read failure? Report error and clear slot
+ * (should be rare).
+ */
+ if (!bh) {
+ EXT4_ERROR_INODE_BLOCK(inode, nr,
+ "Read failure");
+ continue;
+ }
+
+ /* This zaps the entire block. Bottom up. */
+ BUFFER_TRACE(bh, "free child branches");
+ ext4_free_branches(handle, inode, bh,
+ (__le32 *) bh->b_data,
+ (__le32 *) bh->b_data + addr_per_block,
+ depth);
+ brelse(bh);
+
+ /*
+ * Everything below this this pointer has been
+ * released. Now let this top-of-subtree go.
+ *
+ * We want the freeing of this indirect block to be
+ * atomic in the journal with the updating of the
+ * bitmap block which owns it. So make some room in
+ * the journal.
+ *
+ * We zero the parent pointer *after* freeing its
+ * pointee in the bitmaps, so if extend_transaction()
+ * for some reason fails to put the bitmap changes and
+ * the release into the same transaction, recovery
+ * will merely complain about releasing a free block,
+ * rather than leaking blocks.
+ */
+ if (ext4_handle_is_aborted(handle))
+ return;
+ if (try_to_extend_transaction(handle, inode)) {
+ ext4_mark_inode_dirty(handle, inode);
+ ext4_truncate_restart_trans(handle, inode,
+ ext4_blocks_for_truncate(inode));
+ }
+
+ /*
+ * The forget flag here is critical because if
+ * we are journaling (and not doing data
+ * journaling), we have to make sure a revoke
+ * record is written to prevent the journal
+ * replay from overwriting the (former)
+ * indirect block if it gets reallocated as a
+ * data block. This must happen in the same
+ * transaction where the data blocks are
+ * actually freed.
+ */
+ ext4_free_blocks(handle, inode, NULL, nr, 1,
+ EXT4_FREE_BLOCKS_METADATA|
+ EXT4_FREE_BLOCKS_FORGET);
+
+ if (parent_bh) {
+ /*
+ * The block which we have just freed is
+ * pointed to by an indirect block: journal it
+ */
+ BUFFER_TRACE(parent_bh, "get_write_access");
+ if (!ext4_journal_get_write_access(handle,
+ parent_bh)){
+ *p = 0;
+ BUFFER_TRACE(parent_bh,
+ "call ext4_handle_dirty_metadata");
+ ext4_handle_dirty_metadata(handle,
+ inode,
+ parent_bh);
+ }
+ }
+ }
+ } else {
+ /* We have reached the bottom of the tree. */
+ BUFFER_TRACE(parent_bh, "free data blocks");
+ ext4_free_data(handle, inode, parent_bh, first, last);
+ }
+}
+
+void ext4_ind_truncate(handle_t *handle, struct inode *inode)
+{
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ __le32 *i_data = ei->i_data;
+ int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb);
+ ext4_lblk_t offsets[4];
+ Indirect chain[4];
+ Indirect *partial;
+ __le32 nr = 0;
+ int n = 0;
+ ext4_lblk_t last_block, max_block;
+ unsigned blocksize = inode->i_sb->s_blocksize;
+
+ last_block = (inode->i_size + blocksize-1)
+ >> EXT4_BLOCK_SIZE_BITS(inode->i_sb);
+ max_block = (EXT4_SB(inode->i_sb)->s_bitmap_maxbytes + blocksize-1)
+ >> EXT4_BLOCK_SIZE_BITS(inode->i_sb);
+
+ if (last_block != max_block) {
+ n = ext4_block_to_path(inode, last_block, offsets, NULL);
+ if (n == 0)
+ return;
+ }
+
+ ext4_es_remove_extent(inode, last_block, EXT_MAX_BLOCKS - last_block);
+
+ /*
+ * The orphan list entry will now protect us from any crash which
+ * occurs before the truncate completes, so it is now safe to propagate
+ * the new, shorter inode size (held for now in i_size) into the
+ * on-disk inode. We do this via i_disksize, which is the value which
+ * ext4 *really* writes onto the disk inode.
+ */
+ ei->i_disksize = inode->i_size;
+
+ if (last_block == max_block) {
+ /*
+ * It is unnecessary to free any data blocks if last_block is
+ * equal to the indirect block limit.
+ */
+ return;
+ } else if (n == 1) { /* direct blocks */
+ ext4_free_data(handle, inode, NULL, i_data+offsets[0],
+ i_data + EXT4_NDIR_BLOCKS);
+ goto do_indirects;
+ }
+
+ partial = ext4_find_shared(inode, n, offsets, chain, &nr);
+ /* Kill the top of shared branch (not detached) */
+ if (nr) {
+ if (partial == chain) {
+ /* Shared branch grows from the inode */
+ ext4_free_branches(handle, inode, NULL,
+ &nr, &nr+1, (chain+n-1) - partial);
+ *partial->p = 0;
+ /*
+ * We mark the inode dirty prior to restart,
+ * and prior to stop. No need for it here.
+ */
+ } else {
+ /* Shared branch grows from an indirect block */
+ BUFFER_TRACE(partial->bh, "get_write_access");
+ ext4_free_branches(handle, inode, partial->bh,
+ partial->p,
+ partial->p+1, (chain+n-1) - partial);
+ }
+ }
+ /* Clear the ends of indirect blocks on the shared branch */
+ while (partial > chain) {
+ ext4_free_branches(handle, inode, partial->bh, partial->p + 1,
+ (__le32*)partial->bh->b_data+addr_per_block,
+ (chain+n-1) - partial);
+ BUFFER_TRACE(partial->bh, "call brelse");
+ brelse(partial->bh);
+ partial--;
+ }
+do_indirects:
+ /* Kill the remaining (whole) subtrees */
+ switch (offsets[0]) {
+ default:
+ nr = i_data[EXT4_IND_BLOCK];
+ if (nr) {
+ ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 1);
+ i_data[EXT4_IND_BLOCK] = 0;
+ }
+ case EXT4_IND_BLOCK:
+ nr = i_data[EXT4_DIND_BLOCK];
+ if (nr) {
+ ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 2);
+ i_data[EXT4_DIND_BLOCK] = 0;
+ }
+ case EXT4_DIND_BLOCK:
+ nr = i_data[EXT4_TIND_BLOCK];
+ if (nr) {
+ ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 3);
+ i_data[EXT4_TIND_BLOCK] = 0;
+ }
+ case EXT4_TIND_BLOCK:
+ ;
+ }
+}
+
+static int free_hole_blocks(handle_t *handle, struct inode *inode,
+ struct buffer_head *parent_bh, __le32 *i_data,
+ int level, ext4_lblk_t first,
+ ext4_lblk_t count, int max)
+{
+ struct buffer_head *bh = NULL;
+ int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb);
+ int ret = 0;
+ int i, inc;
+ ext4_lblk_t offset;
+ __le32 blk;
+
+ inc = 1 << ((EXT4_BLOCK_SIZE_BITS(inode->i_sb) - 2) * level);
+ for (i = 0, offset = 0; i < max; i++, i_data++, offset += inc) {
+ if (offset >= count + first)
+ break;
+ if (*i_data == 0 || (offset + inc) <= first)
+ continue;
+ blk = *i_data;
+ if (level > 0) {
+ ext4_lblk_t first2;
+ ext4_lblk_t count2;
+
+ bh = sb_bread(inode->i_sb, le32_to_cpu(blk));
+ if (!bh) {
+ EXT4_ERROR_INODE_BLOCK(inode, le32_to_cpu(blk),
+ "Read failure");
+ return -EIO;
+ }
+ if (first > offset) {
+ first2 = first - offset;
+ count2 = count;
+ } else {
+ first2 = 0;
+ count2 = count - (offset - first);
+ }
+ ret = free_hole_blocks(handle, inode, bh,
+ (__le32 *)bh->b_data, level - 1,
+ first2, count2,
+ inode->i_sb->s_blocksize >> 2);
+ if (ret) {
+ brelse(bh);
+ goto err;
+ }
+ }
+ if (level == 0 ||
+ (bh && all_zeroes((__le32 *)bh->b_data,
+ (__le32 *)bh->b_data + addr_per_block))) {
+ ext4_free_data(handle, inode, parent_bh,
+ i_data, i_data + 1);
+ }
+ brelse(bh);
+ bh = NULL;
+ }
+
+err:
+ return ret;
+}
+
+int ext4_free_hole_blocks(handle_t *handle, struct inode *inode,
+ ext4_lblk_t first, ext4_lblk_t stop)
+{
+ int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb);
+ int level, ret = 0;
+ int num = EXT4_NDIR_BLOCKS;
+ ext4_lblk_t count, max = EXT4_NDIR_BLOCKS;
+ __le32 *i_data = EXT4_I(inode)->i_data;
+
+ count = stop - first;
+ for (level = 0; level < 4; level++, max *= addr_per_block) {
+ if (first < max) {
+ ret = free_hole_blocks(handle, inode, NULL, i_data,
+ level, first, count, num);
+ if (ret)
+ goto err;
+ if (count > max - first)
+ count -= max - first;
+ else
+ break;
+ first = 0;
+ } else {
+ first -= max;
+ }
+ i_data += num;
+ if (level == 0) {
+ num = 1;
+ max = 1;
+ }
+ }
+
+err:
+ return ret;
+}
+
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
new file mode 100644
index 00000000000..645205d8ada
--- /dev/null
+++ b/fs/ext4/inline.c
@@ -0,0 +1,2000 @@
+/*
+ * Copyright (c) 2012 Taobao.
+ * Written by Tao Ma <boyu.mt@taobao.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2.1 of the GNU Lesser General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+#include "ext4_jbd2.h"
+#include "ext4.h"
+#include "xattr.h"
+#include "truncate.h"
+#include <linux/fiemap.h>
+
+#define EXT4_XATTR_SYSTEM_DATA "data"
+#define EXT4_MIN_INLINE_DATA_SIZE ((sizeof(__le32) * EXT4_N_BLOCKS))
+#define EXT4_INLINE_DOTDOT_OFFSET 2
+#define EXT4_INLINE_DOTDOT_SIZE 4
+
+static int ext4_get_inline_size(struct inode *inode)
+{
+ if (EXT4_I(inode)->i_inline_off)
+ return EXT4_I(inode)->i_inline_size;
+
+ return 0;
+}
+
+static int get_max_inline_xattr_value_size(struct inode *inode,
+ struct ext4_iloc *iloc)
+{
+ struct ext4_xattr_ibody_header *header;
+ struct ext4_xattr_entry *entry;
+ struct ext4_inode *raw_inode;
+ int free, min_offs;
+
+ min_offs = EXT4_SB(inode->i_sb)->s_inode_size -
+ EXT4_GOOD_OLD_INODE_SIZE -
+ EXT4_I(inode)->i_extra_isize -
+ sizeof(struct ext4_xattr_ibody_header);
+
+ /*
+ * We need to subtract another sizeof(__u32) since an in-inode xattr
+ * needs an empty 4 bytes to indicate the gap between the xattr entry
+ * and the name/value pair.
+ */
+ if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR))
+ return EXT4_XATTR_SIZE(min_offs -
+ EXT4_XATTR_LEN(strlen(EXT4_XATTR_SYSTEM_DATA)) -
+ EXT4_XATTR_ROUND - sizeof(__u32));
+
+ raw_inode = ext4_raw_inode(iloc);
+ header = IHDR(inode, raw_inode);
+ entry = IFIRST(header);
+
+ /* Compute min_offs. */
+ for (; !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry)) {
+ if (!entry->e_value_block && entry->e_value_size) {
+ size_t offs = le16_to_cpu(entry->e_value_offs);
+ if (offs < min_offs)
+ min_offs = offs;
+ }
+ }
+ free = min_offs -
+ ((void *)entry - (void *)IFIRST(header)) - sizeof(__u32);
+
+ if (EXT4_I(inode)->i_inline_off) {
+ entry = (struct ext4_xattr_entry *)
+ ((void *)raw_inode + EXT4_I(inode)->i_inline_off);
+
+ free += EXT4_XATTR_SIZE(le32_to_cpu(entry->e_value_size));
+ goto out;
+ }
+
+ free -= EXT4_XATTR_LEN(strlen(EXT4_XATTR_SYSTEM_DATA));
+
+ if (free > EXT4_XATTR_ROUND)
+ free = EXT4_XATTR_SIZE(free - EXT4_XATTR_ROUND);
+ else
+ free = 0;
+
+out:
+ return free;
+}
+
+/*
+ * Get the maximum size we now can store in an inode.
+ * If we can't find the space for a xattr entry, don't use the space
+ * of the extents since we have no space to indicate the inline data.
+ */
+int ext4_get_max_inline_size(struct inode *inode)
+{
+ int error, max_inline_size;
+ struct ext4_iloc iloc;
+
+ if (EXT4_I(inode)->i_extra_isize == 0)
+ return 0;
+
+ error = ext4_get_inode_loc(inode, &iloc);
+ if (error) {
+ ext4_error_inode(inode, __func__, __LINE__, 0,
+ "can't get inode location %lu",
+ inode->i_ino);
+ return 0;
+ }
+
+ down_read(&EXT4_I(inode)->xattr_sem);
+ max_inline_size = get_max_inline_xattr_value_size(inode, &iloc);
+ up_read(&EXT4_I(inode)->xattr_sem);
+
+ brelse(iloc.bh);
+
+ if (!max_inline_size)
+ return 0;
+
+ return max_inline_size + EXT4_MIN_INLINE_DATA_SIZE;
+}
+
+int ext4_has_inline_data(struct inode *inode)
+{
+ return ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA) &&
+ EXT4_I(inode)->i_inline_off;
+}
+
+/*
+ * this function does not take xattr_sem, which is OK because it is
+ * currently only used in a code path coming form ext4_iget, before
+ * the new inode has been unlocked
+ */
+int ext4_find_inline_data_nolock(struct inode *inode)
+{
+ struct ext4_xattr_ibody_find is = {
+ .s = { .not_found = -ENODATA, },
+ };
+ struct ext4_xattr_info i = {
+ .name_index = EXT4_XATTR_INDEX_SYSTEM,
+ .name = EXT4_XATTR_SYSTEM_DATA,
+ };
+ int error;
+
+ if (EXT4_I(inode)->i_extra_isize == 0)
+ return 0;
+
+ error = ext4_get_inode_loc(inode, &is.iloc);
+ if (error)
+ return error;
+
+ error = ext4_xattr_ibody_find(inode, &i, &is);
+ if (error)
+ goto out;
+
+ if (!is.s.not_found) {
+ EXT4_I(inode)->i_inline_off = (u16)((void *)is.s.here -
+ (void *)ext4_raw_inode(&is.iloc));
+ EXT4_I(inode)->i_inline_size = EXT4_MIN_INLINE_DATA_SIZE +
+ le32_to_cpu(is.s.here->e_value_size);
+ ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
+ }
+out:
+ brelse(is.iloc.bh);
+ return error;
+}
+
+static int ext4_read_inline_data(struct inode *inode, void *buffer,
+ unsigned int len,
+ struct ext4_iloc *iloc)
+{
+ struct ext4_xattr_entry *entry;
+ struct ext4_xattr_ibody_header *header;
+ int cp_len = 0;
+ struct ext4_inode *raw_inode;
+
+ if (!len)
+ return 0;
+
+ BUG_ON(len > EXT4_I(inode)->i_inline_size);
+
+ cp_len = len < EXT4_MIN_INLINE_DATA_SIZE ?
+ len : EXT4_MIN_INLINE_DATA_SIZE;
+
+ raw_inode = ext4_raw_inode(iloc);
+ memcpy(buffer, (void *)(raw_inode->i_block), cp_len);
+
+ len -= cp_len;
+ buffer += cp_len;
+
+ if (!len)
+ goto out;
+
+ header = IHDR(inode, raw_inode);
+ entry = (struct ext4_xattr_entry *)((void *)raw_inode +
+ EXT4_I(inode)->i_inline_off);
+ len = min_t(unsigned int, len,
+ (unsigned int)le32_to_cpu(entry->e_value_size));
+
+ memcpy(buffer,
+ (void *)IFIRST(header) + le16_to_cpu(entry->e_value_offs), len);
+ cp_len += len;
+
+out:
+ return cp_len;
+}
+
+/*
+ * write the buffer to the inline inode.
+ * If 'create' is set, we don't need to do the extra copy in the xattr
+ * value since it is already handled by ext4_xattr_ibody_inline_set.
+ * That saves us one memcpy.
+ */
+static void ext4_write_inline_data(struct inode *inode, struct ext4_iloc *iloc,
+ void *buffer, loff_t pos, unsigned int len)
+{
+ struct ext4_xattr_entry *entry;
+ struct ext4_xattr_ibody_header *header;
+ struct ext4_inode *raw_inode;
+ int cp_len = 0;
+
+ BUG_ON(!EXT4_I(inode)->i_inline_off);
+ BUG_ON(pos + len > EXT4_I(inode)->i_inline_size);
+
+ raw_inode = ext4_raw_inode(iloc);
+ buffer += pos;
+
+ if (pos < EXT4_MIN_INLINE_DATA_SIZE) {
+ cp_len = pos + len > EXT4_MIN_INLINE_DATA_SIZE ?
+ EXT4_MIN_INLINE_DATA_SIZE - pos : len;
+ memcpy((void *)raw_inode->i_block + pos, buffer, cp_len);
+
+ len -= cp_len;
+ buffer += cp_len;
+ pos += cp_len;
+ }
+
+ if (!len)
+ return;
+
+ pos -= EXT4_MIN_INLINE_DATA_SIZE;
+ header = IHDR(inode, raw_inode);
+ entry = (struct ext4_xattr_entry *)((void *)raw_inode +
+ EXT4_I(inode)->i_inline_off);
+
+ memcpy((void *)IFIRST(header) + le16_to_cpu(entry->e_value_offs) + pos,
+ buffer, len);
+}
+
+static int ext4_create_inline_data(handle_t *handle,
+ struct inode *inode, unsigned len)
+{
+ int error;
+ void *value = NULL;
+ struct ext4_xattr_ibody_find is = {
+ .s = { .not_found = -ENODATA, },
+ };
+ struct ext4_xattr_info i = {
+ .name_index = EXT4_XATTR_INDEX_SYSTEM,
+ .name = EXT4_XATTR_SYSTEM_DATA,
+ };
+
+ error = ext4_get_inode_loc(inode, &is.iloc);
+ if (error)
+ return error;
+
+ BUFFER_TRACE(is.iloc.bh, "get_write_access");
+ error = ext4_journal_get_write_access(handle, is.iloc.bh);
+ if (error)
+ goto out;
+
+ if (len > EXT4_MIN_INLINE_DATA_SIZE) {
+ value = EXT4_ZERO_XATTR_VALUE;
+ len -= EXT4_MIN_INLINE_DATA_SIZE;
+ } else {
+ value = "";
+ len = 0;
+ }
+
+ /* Insert the the xttr entry. */
+ i.value = value;
+ i.value_len = len;
+
+ error = ext4_xattr_ibody_find(inode, &i, &is);
+ if (error)
+ goto out;
+
+ BUG_ON(!is.s.not_found);
+
+ error = ext4_xattr_ibody_inline_set(handle, inode, &i, &is);
+ if (error) {
+ if (error == -ENOSPC)
+ ext4_clear_inode_state(inode,
+ EXT4_STATE_MAY_INLINE_DATA);
+ goto out;
+ }
+
+ memset((void *)ext4_raw_inode(&is.iloc)->i_block,
+ 0, EXT4_MIN_INLINE_DATA_SIZE);
+
+ EXT4_I(inode)->i_inline_off = (u16)((void *)is.s.here -
+ (void *)ext4_raw_inode(&is.iloc));
+ EXT4_I(inode)->i_inline_size = len + EXT4_MIN_INLINE_DATA_SIZE;
+ ext4_clear_inode_flag(inode, EXT4_INODE_EXTENTS);
+ ext4_set_inode_flag(inode, EXT4_INODE_INLINE_DATA);
+ get_bh(is.iloc.bh);
+ error = ext4_mark_iloc_dirty(handle, inode, &is.iloc);
+
+out:
+ brelse(is.iloc.bh);
+ return error;
+}
+
+static int ext4_update_inline_data(handle_t *handle, struct inode *inode,
+ unsigned int len)
+{
+ int error;
+ void *value = NULL;
+ struct ext4_xattr_ibody_find is = {
+ .s = { .not_found = -ENODATA, },
+ };
+ struct ext4_xattr_info i = {
+ .name_index = EXT4_XATTR_INDEX_SYSTEM,
+ .name = EXT4_XATTR_SYSTEM_DATA,
+ };
+
+ /* If the old space is ok, write the data directly. */
+ if (len <= EXT4_I(inode)->i_inline_size)
+ return 0;
+
+ error = ext4_get_inode_loc(inode, &is.iloc);
+ if (error)
+ return error;
+
+ error = ext4_xattr_ibody_find(inode, &i, &is);
+ if (error)
+ goto out;
+
+ BUG_ON(is.s.not_found);
+
+ len -= EXT4_MIN_INLINE_DATA_SIZE;
+ value = kzalloc(len, GFP_NOFS);
+ if (!value)
+ goto out;
+
+ error = ext4_xattr_ibody_get(inode, i.name_index, i.name,
+ value, len);
+ if (error == -ENODATA)
+ goto out;
+
+ BUFFER_TRACE(is.iloc.bh, "get_write_access");
+ error = ext4_journal_get_write_access(handle, is.iloc.bh);
+ if (error)
+ goto out;
+
+ /* Update the xttr entry. */
+ i.value = value;
+ i.value_len = len;
+
+ error = ext4_xattr_ibody_inline_set(handle, inode, &i, &is);
+ if (error)
+ goto out;
+
+ EXT4_I(inode)->i_inline_off = (u16)((void *)is.s.here -
+ (void *)ext4_raw_inode(&is.iloc));
+ EXT4_I(inode)->i_inline_size = EXT4_MIN_INLINE_DATA_SIZE +
+ le32_to_cpu(is.s.here->e_value_size);
+ ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
+ get_bh(is.iloc.bh);
+ error = ext4_mark_iloc_dirty(handle, inode, &is.iloc);
+
+out:
+ kfree(value);
+ brelse(is.iloc.bh);
+ return error;
+}
+
+static int ext4_prepare_inline_data(handle_t *handle, struct inode *inode,
+ unsigned int len)
+{
+ int ret, size;
+ struct ext4_inode_info *ei = EXT4_I(inode);
+
+ if (!ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA))
+ return -ENOSPC;
+
+ size = ext4_get_max_inline_size(inode);
+ if (size < len)
+ return -ENOSPC;
+
+ down_write(&EXT4_I(inode)->xattr_sem);
+
+ if (ei->i_inline_off)
+ ret = ext4_update_inline_data(handle, inode, len);
+ else
+ ret = ext4_create_inline_data(handle, inode, len);
+
+ up_write(&EXT4_I(inode)->xattr_sem);
+
+ return ret;
+}
+
+static int ext4_destroy_inline_data_nolock(handle_t *handle,
+ struct inode *inode)
+{
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ struct ext4_xattr_ibody_find is = {
+ .s = { .not_found = 0, },
+ };
+ struct ext4_xattr_info i = {
+ .name_index = EXT4_XATTR_INDEX_SYSTEM,
+ .name = EXT4_XATTR_SYSTEM_DATA,
+ .value = NULL,
+ .value_len = 0,
+ };
+ int error;
+
+ if (!ei->i_inline_off)
+ return 0;
+
+ error = ext4_get_inode_loc(inode, &is.iloc);
+ if (error)
+ return error;
+
+ error = ext4_xattr_ibody_find(inode, &i, &is);
+ if (error)
+ goto out;
+
+ BUFFER_TRACE(is.iloc.bh, "get_write_access");
+ error = ext4_journal_get_write_access(handle, is.iloc.bh);
+ if (error)
+ goto out;
+
+ error = ext4_xattr_ibody_inline_set(handle, inode, &i, &is);
+ if (error)
+ goto out;
+
+ memset((void *)ext4_raw_inode(&is.iloc)->i_block,
+ 0, EXT4_MIN_INLINE_DATA_SIZE);
+
+ if (EXT4_HAS_INCOMPAT_FEATURE(inode->i_sb,
+ EXT4_FEATURE_INCOMPAT_EXTENTS)) {
+ if (S_ISDIR(inode->i_mode) ||
+ S_ISREG(inode->i_mode) || S_ISLNK(inode->i_mode)) {
+ ext4_set_inode_flag(inode, EXT4_INODE_EXTENTS);
+ ext4_ext_tree_init(handle, inode);
+ }
+ }
+ ext4_clear_inode_flag(inode, EXT4_INODE_INLINE_DATA);
+
+ get_bh(is.iloc.bh);
+ error = ext4_mark_iloc_dirty(handle, inode, &is.iloc);
+
+ EXT4_I(inode)->i_inline_off = 0;
+ EXT4_I(inode)->i_inline_size = 0;
+ ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
+out:
+ brelse(is.iloc.bh);
+ if (error == -ENODATA)
+ error = 0;
+ return error;
+}
+
+static int ext4_read_inline_page(struct inode *inode, struct page *page)
+{
+ void *kaddr;
+ int ret = 0;
+ size_t len;
+ struct ext4_iloc iloc;
+
+ BUG_ON(!PageLocked(page));
+ BUG_ON(!ext4_has_inline_data(inode));
+ BUG_ON(page->index);
+
+ if (!EXT4_I(inode)->i_inline_off) {
+ ext4_warning(inode->i_sb, "inode %lu doesn't have inline data.",
+ inode->i_ino);
+ goto out;
+ }
+
+ ret = ext4_get_inode_loc(inode, &iloc);
+ if (ret)
+ goto out;
+
+ len = min_t(size_t, ext4_get_inline_size(inode), i_size_read(inode));
+ kaddr = kmap_atomic(page);
+ ret = ext4_read_inline_data(inode, kaddr, len, &iloc);
+ flush_dcache_page(page);
+ kunmap_atomic(kaddr);
+ zero_user_segment(page, len, PAGE_CACHE_SIZE);
+ SetPageUptodate(page);
+ brelse(iloc.bh);
+
+out:
+ return ret;
+}
+
+int ext4_readpage_inline(struct inode *inode, struct page *page)
+{
+ int ret = 0;
+
+ down_read(&EXT4_I(inode)->xattr_sem);
+ if (!ext4_has_inline_data(inode)) {
+ up_read(&EXT4_I(inode)->xattr_sem);
+ return -EAGAIN;
+ }
+
+ /*
+ * Current inline data can only exist in the 1st page,
+ * So for all the other pages, just set them uptodate.
+ */
+ if (!page->index)
+ ret = ext4_read_inline_page(inode, page);
+ else if (!PageUptodate(page)) {
+ zero_user_segment(page, 0, PAGE_CACHE_SIZE);
+ SetPageUptodate(page);
+ }
+
+ up_read(&EXT4_I(inode)->xattr_sem);
+
+ unlock_page(page);
+ return ret >= 0 ? 0 : ret;
+}
+
+static int ext4_convert_inline_data_to_extent(struct address_space *mapping,
+ struct inode *inode,
+ unsigned flags)
+{
+ int ret, needed_blocks;
+ handle_t *handle = NULL;
+ int retries = 0, sem_held = 0;
+ struct page *page = NULL;
+ unsigned from, to;
+ struct ext4_iloc iloc;
+
+ if (!ext4_has_inline_data(inode)) {
+ /*
+ * clear the flag so that no new write
+ * will trap here again.
+ */
+ ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
+ return 0;
+ }
+
+ needed_blocks = ext4_writepage_trans_blocks(inode);
+
+ ret = ext4_get_inode_loc(inode, &iloc);
+ if (ret)
+ return ret;
+
+retry:
+ handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, needed_blocks);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ handle = NULL;
+ goto out;
+ }
+
+ /* We cannot recurse into the filesystem as the transaction is already
+ * started */
+ flags |= AOP_FLAG_NOFS;
+
+ page = grab_cache_page_write_begin(mapping, 0, flags);
+ if (!page) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ down_write(&EXT4_I(inode)->xattr_sem);
+ sem_held = 1;
+ /* If some one has already done this for us, just exit. */
+ if (!ext4_has_inline_data(inode)) {
+ ret = 0;
+ goto out;
+ }
+
+ from = 0;
+ to = ext4_get_inline_size(inode);
+ if (!PageUptodate(page)) {
+ ret = ext4_read_inline_page(inode, page);
+ if (ret < 0)
+ goto out;
+ }
+
+ ret = ext4_destroy_inline_data_nolock(handle, inode);
+ if (ret)
+ goto out;
+
+ if (ext4_should_dioread_nolock(inode))
+ ret = __block_write_begin(page, from, to, ext4_get_block_write);
+ else
+ ret = __block_write_begin(page, from, to, ext4_get_block);
+
+ if (!ret && ext4_should_journal_data(inode)) {
+ ret = ext4_walk_page_buffers(handle, page_buffers(page),
+ from, to, NULL,
+ do_journal_get_write_access);
+ }
+
+ if (ret) {
+ unlock_page(page);
+ page_cache_release(page);
+ ext4_orphan_add(handle, inode);
+ up_write(&EXT4_I(inode)->xattr_sem);
+ sem_held = 0;
+ ext4_journal_stop(handle);
+ handle = NULL;
+ ext4_truncate_failed_write(inode);
+ /*
+ * If truncate failed early the inode might
+ * still be on the orphan list; we need to
+ * make sure the inode is removed from the
+ * orphan list in that case.
+ */
+ if (inode->i_nlink)
+ ext4_orphan_del(NULL, inode);
+ }
+
+ if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
+ goto retry;
+
+ block_commit_write(page, from, to);
+out:
+ if (page) {
+ unlock_page(page);
+ page_cache_release(page);
+ }
+ if (sem_held)
+ up_write(&EXT4_I(inode)->xattr_sem);
+ if (handle)
+ ext4_journal_stop(handle);
+ brelse(iloc.bh);
+ return ret;
+}
+
+/*
+ * Try to write data in the inode.
+ * If the inode has inline data, check whether the new write can be
+ * in the inode also. If not, create the page the handle, move the data
+ * to the page make it update and let the later codes create extent for it.
+ */
+int ext4_try_to_write_inline_data(struct address_space *mapping,
+ struct inode *inode,
+ loff_t pos, unsigned len,
+ unsigned flags,
+ struct page **pagep)
+{
+ int ret;
+ handle_t *handle;
+ struct page *page;
+ struct ext4_iloc iloc;
+
+ if (pos + len > ext4_get_max_inline_size(inode))
+ goto convert;
+
+ ret = ext4_get_inode_loc(inode, &iloc);
+ if (ret)
+ return ret;
+
+ /*
+ * The possible write could happen in the inode,
+ * so try to reserve the space in inode first.
+ */
+ handle = ext4_journal_start(inode, EXT4_HT_INODE, 1);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ handle = NULL;
+ goto out;
+ }
+
+ ret = ext4_prepare_inline_data(handle, inode, pos + len);
+ if (ret && ret != -ENOSPC)
+ goto out;
+
+ /* We don't have space in inline inode, so convert it to extent. */
+ if (ret == -ENOSPC) {
+ ext4_journal_stop(handle);
+ brelse(iloc.bh);
+ goto convert;
+ }
+
+ flags |= AOP_FLAG_NOFS;
+
+ page = grab_cache_page_write_begin(mapping, 0, flags);
+ if (!page) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ *pagep = page;
+ down_read(&EXT4_I(inode)->xattr_sem);
+ if (!ext4_has_inline_data(inode)) {
+ ret = 0;
+ unlock_page(page);
+ page_cache_release(page);
+ goto out_up_read;
+ }
+
+ if (!PageUptodate(page)) {
+ ret = ext4_read_inline_page(inode, page);
+ if (ret < 0)
+ goto out_up_read;
+ }
+
+ ret = 1;
+ handle = NULL;
+out_up_read:
+ up_read(&EXT4_I(inode)->xattr_sem);
+out:
+ if (handle)
+ ext4_journal_stop(handle);
+ brelse(iloc.bh);
+ return ret;
+convert:
+ return ext4_convert_inline_data_to_extent(mapping,
+ inode, flags);
+}
+
+int ext4_write_inline_data_end(struct inode *inode, loff_t pos, unsigned len,
+ unsigned copied, struct page *page)
+{
+ int ret;
+ void *kaddr;
+ struct ext4_iloc iloc;
+
+ if (unlikely(copied < len)) {
+ if (!PageUptodate(page)) {
+ copied = 0;
+ goto out;
+ }
+ }
+
+ ret = ext4_get_inode_loc(inode, &iloc);
+ if (ret) {
+ ext4_std_error(inode->i_sb, ret);
+ copied = 0;
+ goto out;
+ }
+
+ down_write(&EXT4_I(inode)->xattr_sem);
+ BUG_ON(!ext4_has_inline_data(inode));
+
+ kaddr = kmap_atomic(page);
+ ext4_write_inline_data(inode, &iloc, kaddr, pos, len);
+ kunmap_atomic(kaddr);
+ SetPageUptodate(page);
+ /* clear page dirty so that writepages wouldn't work for us. */
+ ClearPageDirty(page);
+
+ up_write(&EXT4_I(inode)->xattr_sem);
+ brelse(iloc.bh);
+out:
+ return copied;
+}
+
+struct buffer_head *
+ext4_journalled_write_inline_data(struct inode *inode,
+ unsigned len,
+ struct page *page)
+{
+ int ret;
+ void *kaddr;
+ struct ext4_iloc iloc;
+
+ ret = ext4_get_inode_loc(inode, &iloc);
+ if (ret) {
+ ext4_std_error(inode->i_sb, ret);
+ return NULL;
+ }
+
+ down_write(&EXT4_I(inode)->xattr_sem);
+ kaddr = kmap_atomic(page);
+ ext4_write_inline_data(inode, &iloc, kaddr, 0, len);
+ kunmap_atomic(kaddr);
+ up_write(&EXT4_I(inode)->xattr_sem);
+
+ return iloc.bh;
+}
+
+/*
+ * Try to make the page cache and handle ready for the inline data case.
+ * We can call this function in 2 cases:
+ * 1. The inode is created and the first write exceeds inline size. We can
+ * clear the inode state safely.
+ * 2. The inode has inline data, then we need to read the data, make it
+ * update and dirty so that ext4_da_writepages can handle it. We don't
+ * need to start the journal since the file's metatdata isn't changed now.
+ */
+static int ext4_da_convert_inline_data_to_extent(struct address_space *mapping,
+ struct inode *inode,
+ unsigned flags,
+ void **fsdata)
+{
+ int ret = 0, inline_size;
+ struct page *page;
+
+ page = grab_cache_page_write_begin(mapping, 0, flags);
+ if (!page)
+ return -ENOMEM;
+
+ down_read(&EXT4_I(inode)->xattr_sem);
+ if (!ext4_has_inline_data(inode)) {
+ ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
+ goto out;
+ }
+
+ inline_size = ext4_get_inline_size(inode);
+
+ if (!PageUptodate(page)) {
+ ret = ext4_read_inline_page(inode, page);
+ if (ret < 0)
+ goto out;
+ }
+
+ ret = __block_write_begin(page, 0, inline_size,
+ ext4_da_get_block_prep);
+ if (ret) {
+ ext4_truncate_failed_write(inode);
+ goto out;
+ }
+
+ SetPageDirty(page);
+ SetPageUptodate(page);
+ ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
+ *fsdata = (void *)CONVERT_INLINE_DATA;
+
+out:
+ up_read(&EXT4_I(inode)->xattr_sem);
+ if (page) {
+ unlock_page(page);
+ page_cache_release(page);
+ }
+ return ret;
+}
+
+/*
+ * Prepare the write for the inline data.
+ * If the the data can be written into the inode, we just read
+ * the page and make it uptodate, and start the journal.
+ * Otherwise read the page, makes it dirty so that it can be
+ * handle in writepages(the i_disksize update is left to the
+ * normal ext4_da_write_end).
+ */
+int ext4_da_write_inline_data_begin(struct address_space *mapping,
+ struct inode *inode,
+ loff_t pos, unsigned len,
+ unsigned flags,
+ struct page **pagep,
+ void **fsdata)
+{
+ int ret, inline_size;
+ handle_t *handle;
+ struct page *page;
+ struct ext4_iloc iloc;
+ int retries;
+
+ ret = ext4_get_inode_loc(inode, &iloc);
+ if (ret)
+ return ret;
+
+retry_journal:
+ handle = ext4_journal_start(inode, EXT4_HT_INODE, 1);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ goto out;
+ }
+
+ inline_size = ext4_get_max_inline_size(inode);
+
+ ret = -ENOSPC;
+ if (inline_size >= pos + len) {
+ ret = ext4_prepare_inline_data(handle, inode, pos + len);
+ if (ret && ret != -ENOSPC)
+ goto out_journal;
+ }
+
+ if (ret == -ENOSPC) {
+ ret = ext4_da_convert_inline_data_to_extent(mapping,
+ inode,
+ flags,
+ fsdata);
+ ext4_journal_stop(handle);
+ if (ret == -ENOSPC &&
+ ext4_should_retry_alloc(inode->i_sb, &retries))
+ goto retry_journal;
+ goto out;
+ }
+
+ /*
+ * We cannot recurse into the filesystem as the transaction
+ * is already started.
+ */
+ flags |= AOP_FLAG_NOFS;
+
+ page = grab_cache_page_write_begin(mapping, 0, flags);
+ if (!page) {
+ ret = -ENOMEM;
+ goto out_journal;
+ }
+
+ down_read(&EXT4_I(inode)->xattr_sem);
+ if (!ext4_has_inline_data(inode)) {
+ ret = 0;
+ goto out_release_page;
+ }
+
+ if (!PageUptodate(page)) {
+ ret = ext4_read_inline_page(inode, page);
+ if (ret < 0)
+ goto out_release_page;
+ }
+
+ up_read(&EXT4_I(inode)->xattr_sem);
+ *pagep = page;
+ brelse(iloc.bh);
+ return 1;
+out_release_page:
+ up_read(&EXT4_I(inode)->xattr_sem);
+ unlock_page(page);
+ page_cache_release(page);
+out_journal:
+ ext4_journal_stop(handle);
+out:
+ brelse(iloc.bh);
+ return ret;
+}
+
+int ext4_da_write_inline_data_end(struct inode *inode, loff_t pos,
+ unsigned len, unsigned copied,
+ struct page *page)
+{
+ int i_size_changed = 0;
+
+ copied = ext4_write_inline_data_end(inode, pos, len, copied, page);
+
+ /*
+ * No need to use i_size_read() here, the i_size
+ * cannot change under us because we hold i_mutex.
+ *
+ * But it's important to update i_size while still holding page lock:
+ * page writeout could otherwise come in and zero beyond i_size.
+ */
+ if (pos+copied > inode->i_size) {
+ i_size_write(inode, pos+copied);
+ i_size_changed = 1;
+ }
+ unlock_page(page);
+ page_cache_release(page);
+
+ /*
+ * Don't mark the inode dirty under page lock. First, it unnecessarily
+ * makes the holding time of page lock longer. Second, it forces lock
+ * ordering of page lock and transaction start for journaling
+ * filesystems.
+ */
+ if (i_size_changed)
+ mark_inode_dirty(inode);
+
+ return copied;
+}
+
+#ifdef INLINE_DIR_DEBUG
+void ext4_show_inline_dir(struct inode *dir, struct buffer_head *bh,
+ void *inline_start, int inline_size)
+{
+ int offset;
+ unsigned short de_len;
+ struct ext4_dir_entry_2 *de = inline_start;
+ void *dlimit = inline_start + inline_size;
+
+ trace_printk("inode %lu\n", dir->i_ino);
+ offset = 0;
+ while ((void *)de < dlimit) {
+ de_len = ext4_rec_len_from_disk(de->rec_len, inline_size);
+ trace_printk("de: off %u rlen %u name %*.s nlen %u ino %u\n",
+ offset, de_len, de->name_len, de->name,
+ de->name_len, le32_to_cpu(de->inode));
+ if (ext4_check_dir_entry(dir, NULL, de, bh,
+ inline_start, inline_size, offset))
+ BUG();
+
+ offset += de_len;
+ de = (struct ext4_dir_entry_2 *) ((char *) de + de_len);
+ }
+}
+#else
+#define ext4_show_inline_dir(dir, bh, inline_start, inline_size)
+#endif
+
+/*
+ * Add a new entry into a inline dir.
+ * It will return -ENOSPC if no space is available, and -EIO
+ * and -EEXIST if directory entry already exists.
+ */
+static int ext4_add_dirent_to_inline(handle_t *handle,
+ struct dentry *dentry,
+ struct inode *inode,
+ struct ext4_iloc *iloc,
+ void *inline_start, int inline_size)
+{
+ struct inode *dir = dentry->d_parent->d_inode;
+ const char *name = dentry->d_name.name;
+ int namelen = dentry->d_name.len;
+ int err;
+ struct ext4_dir_entry_2 *de;
+
+ err = ext4_find_dest_de(dir, inode, iloc->bh,
+ inline_start, inline_size,
+ name, namelen, &de);
+ if (err)
+ return err;
+
+ BUFFER_TRACE(iloc->bh, "get_write_access");
+ err = ext4_journal_get_write_access(handle, iloc->bh);
+ if (err)
+ return err;
+ ext4_insert_dentry(inode, de, inline_size, name, namelen);
+
+ ext4_show_inline_dir(dir, iloc->bh, inline_start, inline_size);
+
+ /*
+ * XXX shouldn't update any times until successful
+ * completion of syscall, but too many callers depend
+ * on this.
+ *
+ * XXX similarly, too many callers depend on
+ * ext4_new_inode() setting the times, but error
+ * recovery deletes the inode, so the worst that can
+ * happen is that the times are slightly out of date
+ * and/or different from the directory change time.
+ */
+ dir->i_mtime = dir->i_ctime = ext4_current_time(dir);
+ ext4_update_dx_flag(dir);
+ dir->i_version++;
+ ext4_mark_inode_dirty(handle, dir);
+ return 1;
+}
+
+static void *ext4_get_inline_xattr_pos(struct inode *inode,
+ struct ext4_iloc *iloc)
+{
+ struct ext4_xattr_entry *entry;
+ struct ext4_xattr_ibody_header *header;
+
+ BUG_ON(!EXT4_I(inode)->i_inline_off);
+
+ header = IHDR(inode, ext4_raw_inode(iloc));
+ entry = (struct ext4_xattr_entry *)((void *)ext4_raw_inode(iloc) +
+ EXT4_I(inode)->i_inline_off);
+
+ return (void *)IFIRST(header) + le16_to_cpu(entry->e_value_offs);
+}
+
+/* Set the final de to cover the whole block. */
+static void ext4_update_final_de(void *de_buf, int old_size, int new_size)
+{
+ struct ext4_dir_entry_2 *de, *prev_de;
+ void *limit;
+ int de_len;
+
+ de = (struct ext4_dir_entry_2 *)de_buf;
+ if (old_size) {
+ limit = de_buf + old_size;
+ do {
+ prev_de = de;
+ de_len = ext4_rec_len_from_disk(de->rec_len, old_size);
+ de_buf += de_len;
+ de = (struct ext4_dir_entry_2 *)de_buf;
+ } while (de_buf < limit);
+
+ prev_de->rec_len = ext4_rec_len_to_disk(de_len + new_size -
+ old_size, new_size);
+ } else {
+ /* this is just created, so create an empty entry. */
+ de->inode = 0;
+ de->rec_len = ext4_rec_len_to_disk(new_size, new_size);
+ }
+}
+
+static int ext4_update_inline_dir(handle_t *handle, struct inode *dir,
+ struct ext4_iloc *iloc)
+{
+ int ret;
+ int old_size = EXT4_I(dir)->i_inline_size - EXT4_MIN_INLINE_DATA_SIZE;
+ int new_size = get_max_inline_xattr_value_size(dir, iloc);
+
+ if (new_size - old_size <= EXT4_DIR_REC_LEN(1))
+ return -ENOSPC;
+
+ ret = ext4_update_inline_data(handle, dir,
+ new_size + EXT4_MIN_INLINE_DATA_SIZE);
+ if (ret)
+ return ret;
+
+ ext4_update_final_de(ext4_get_inline_xattr_pos(dir, iloc), old_size,
+ EXT4_I(dir)->i_inline_size -
+ EXT4_MIN_INLINE_DATA_SIZE);
+ dir->i_size = EXT4_I(dir)->i_disksize = EXT4_I(dir)->i_inline_size;
+ return 0;
+}
+
+static void ext4_restore_inline_data(handle_t *handle, struct inode *inode,
+ struct ext4_iloc *iloc,
+ void *buf, int inline_size)
+{
+ ext4_create_inline_data(handle, inode, inline_size);
+ ext4_write_inline_data(inode, iloc, buf, 0, inline_size);
+ ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
+}
+
+static int ext4_finish_convert_inline_dir(handle_t *handle,
+ struct inode *inode,
+ struct buffer_head *dir_block,
+ void *buf,
+ int inline_size)
+{
+ int err, csum_size = 0, header_size = 0;
+ struct ext4_dir_entry_2 *de;
+ struct ext4_dir_entry_tail *t;
+ void *target = dir_block->b_data;
+
+ /*
+ * First create "." and ".." and then copy the dir information
+ * back to the block.
+ */
+ de = (struct ext4_dir_entry_2 *)target;
+ de = ext4_init_dot_dotdot(inode, de,
+ inode->i_sb->s_blocksize, csum_size,
+ le32_to_cpu(((struct ext4_dir_entry_2 *)buf)->inode), 1);
+ header_size = (void *)de - target;
+
+ memcpy((void *)de, buf + EXT4_INLINE_DOTDOT_SIZE,
+ inline_size - EXT4_INLINE_DOTDOT_SIZE);
+
+ if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
+ EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+ csum_size = sizeof(struct ext4_dir_entry_tail);
+
+ inode->i_size = inode->i_sb->s_blocksize;
+ i_size_write(inode, inode->i_sb->s_blocksize);
+ EXT4_I(inode)->i_disksize = inode->i_sb->s_blocksize;
+ ext4_update_final_de(dir_block->b_data,
+ inline_size - EXT4_INLINE_DOTDOT_SIZE + header_size,
+ inode->i_sb->s_blocksize - csum_size);
+
+ if (csum_size) {
+ t = EXT4_DIRENT_TAIL(dir_block->b_data,
+ inode->i_sb->s_blocksize);
+ initialize_dirent_tail(t, inode->i_sb->s_blocksize);
+ }
+ set_buffer_uptodate(dir_block);
+ err = ext4_handle_dirty_dirent_node(handle, inode, dir_block);
+ if (err)
+ goto out;
+ set_buffer_verified(dir_block);
+out:
+ return err;
+}
+
+static int ext4_convert_inline_data_nolock(handle_t *handle,
+ struct inode *inode,
+ struct ext4_iloc *iloc)
+{
+ int error;
+ void *buf = NULL;
+ struct buffer_head *data_bh = NULL;
+ struct ext4_map_blocks map;
+ int inline_size;
+
+ inline_size = ext4_get_inline_size(inode);
+ buf = kmalloc(inline_size, GFP_NOFS);
+ if (!buf) {
+ error = -ENOMEM;
+ goto out;
+ }
+
+ error = ext4_read_inline_data(inode, buf, inline_size, iloc);
+ if (error < 0)
+ goto out;
+
+ error = ext4_destroy_inline_data_nolock(handle, inode);
+ if (error)
+ goto out;
+
+ map.m_lblk = 0;
+ map.m_len = 1;
+ map.m_flags = 0;
+ error = ext4_map_blocks(handle, inode, &map, EXT4_GET_BLOCKS_CREATE);
+ if (error < 0)
+ goto out_restore;
+ if (!(map.m_flags & EXT4_MAP_MAPPED)) {
+ error = -EIO;
+ goto out_restore;
+ }
+
+ data_bh = sb_getblk(inode->i_sb, map.m_pblk);
+ if (!data_bh) {
+ error = -ENOMEM;
+ goto out_restore;
+ }
+
+ lock_buffer(data_bh);
+ error = ext4_journal_get_create_access(handle, data_bh);
+ if (error) {
+ unlock_buffer(data_bh);
+ error = -EIO;
+ goto out_restore;
+ }
+ memset(data_bh->b_data, 0, inode->i_sb->s_blocksize);
+
+ if (!S_ISDIR(inode->i_mode)) {
+ memcpy(data_bh->b_data, buf, inline_size);
+ set_buffer_uptodate(data_bh);
+ error = ext4_handle_dirty_metadata(handle,
+ inode, data_bh);
+ } else {
+ error = ext4_finish_convert_inline_dir(handle, inode, data_bh,
+ buf, inline_size);
+ }
+
+ unlock_buffer(data_bh);
+out_restore:
+ if (error)
+ ext4_restore_inline_data(handle, inode, iloc, buf, inline_size);
+
+out:
+ brelse(data_bh);
+ kfree(buf);
+ return error;
+}
+
+/*
+ * Try to add the new entry to the inline data.
+ * If succeeds, return 0. If not, extended the inline dir and copied data to
+ * the new created block.
+ */
+int ext4_try_add_inline_entry(handle_t *handle, struct dentry *dentry,
+ struct inode *inode)
+{
+ int ret, inline_size;
+ void *inline_start;
+ struct ext4_iloc iloc;
+ struct inode *dir = dentry->d_parent->d_inode;
+
+ ret = ext4_get_inode_loc(dir, &iloc);
+ if (ret)
+ return ret;
+
+ down_write(&EXT4_I(dir)->xattr_sem);
+ if (!ext4_has_inline_data(dir))
+ goto out;
+
+ inline_start = (void *)ext4_raw_inode(&iloc)->i_block +
+ EXT4_INLINE_DOTDOT_SIZE;
+ inline_size = EXT4_MIN_INLINE_DATA_SIZE - EXT4_INLINE_DOTDOT_SIZE;
+
+ ret = ext4_add_dirent_to_inline(handle, dentry, inode, &iloc,
+ inline_start, inline_size);
+ if (ret != -ENOSPC)
+ goto out;
+
+ /* check whether it can be inserted to inline xattr space. */
+ inline_size = EXT4_I(dir)->i_inline_size -
+ EXT4_MIN_INLINE_DATA_SIZE;
+ if (!inline_size) {
+ /* Try to use the xattr space.*/
+ ret = ext4_update_inline_dir(handle, dir, &iloc);
+ if (ret && ret != -ENOSPC)
+ goto out;
+
+ inline_size = EXT4_I(dir)->i_inline_size -
+ EXT4_MIN_INLINE_DATA_SIZE;
+ }
+
+ if (inline_size) {
+ inline_start = ext4_get_inline_xattr_pos(dir, &iloc);
+
+ ret = ext4_add_dirent_to_inline(handle, dentry, inode, &iloc,
+ inline_start, inline_size);
+
+ if (ret != -ENOSPC)
+ goto out;
+ }
+
+ /*
+ * The inline space is filled up, so create a new block for it.
+ * As the extent tree will be created, we have to save the inline
+ * dir first.
+ */
+ ret = ext4_convert_inline_data_nolock(handle, dir, &iloc);
+
+out:
+ ext4_mark_inode_dirty(handle, dir);
+ up_write(&EXT4_I(dir)->xattr_sem);
+ brelse(iloc.bh);
+ return ret;
+}
+
+/*
+ * This function fills a red-black tree with information from an
+ * inlined dir. It returns the number directory entries loaded
+ * into the tree. If there is an error it is returned in err.
+ */
+int htree_inlinedir_to_tree(struct file *dir_file,
+ struct inode *dir, ext4_lblk_t block,
+ struct dx_hash_info *hinfo,
+ __u32 start_hash, __u32 start_minor_hash,
+ int *has_inline_data)
+{
+ int err = 0, count = 0;
+ unsigned int parent_ino;
+ int pos;
+ struct ext4_dir_entry_2 *de;
+ struct inode *inode = file_inode(dir_file);
+ int ret, inline_size = 0;
+ struct ext4_iloc iloc;
+ void *dir_buf = NULL;
+ struct ext4_dir_entry_2 fake;
+
+ ret = ext4_get_inode_loc(inode, &iloc);
+ if (ret)
+ return ret;
+
+ down_read(&EXT4_I(inode)->xattr_sem);
+ if (!ext4_has_inline_data(inode)) {
+ up_read(&EXT4_I(inode)->xattr_sem);
+ *has_inline_data = 0;
+ goto out;
+ }
+
+ inline_size = ext4_get_inline_size(inode);
+ dir_buf = kmalloc(inline_size, GFP_NOFS);
+ if (!dir_buf) {
+ ret = -ENOMEM;
+ up_read(&EXT4_I(inode)->xattr_sem);
+ goto out;
+ }
+
+ ret = ext4_read_inline_data(inode, dir_buf, inline_size, &iloc);
+ up_read(&EXT4_I(inode)->xattr_sem);
+ if (ret < 0)
+ goto out;
+
+ pos = 0;
+ parent_ino = le32_to_cpu(((struct ext4_dir_entry_2 *)dir_buf)->inode);
+ while (pos < inline_size) {
+ /*
+ * As inlined dir doesn't store any information about '.' and
+ * only the inode number of '..' is stored, we have to handle
+ * them differently.
+ */
+ if (pos == 0) {
+ fake.inode = cpu_to_le32(inode->i_ino);
+ fake.name_len = 1;
+ strcpy(fake.name, ".");
+ fake.rec_len = ext4_rec_len_to_disk(
+ EXT4_DIR_REC_LEN(fake.name_len),
+ inline_size);
+ ext4_set_de_type(inode->i_sb, &fake, S_IFDIR);
+ de = &fake;
+ pos = EXT4_INLINE_DOTDOT_OFFSET;
+ } else if (pos == EXT4_INLINE_DOTDOT_OFFSET) {
+ fake.inode = cpu_to_le32(parent_ino);
+ fake.name_len = 2;
+ strcpy(fake.name, "..");
+ fake.rec_len = ext4_rec_len_to_disk(
+ EXT4_DIR_REC_LEN(fake.name_len),
+ inline_size);
+ ext4_set_de_type(inode->i_sb, &fake, S_IFDIR);
+ de = &fake;
+ pos = EXT4_INLINE_DOTDOT_SIZE;
+ } else {
+ de = (struct ext4_dir_entry_2 *)(dir_buf + pos);
+ pos += ext4_rec_len_from_disk(de->rec_len, inline_size);
+ if (ext4_check_dir_entry(inode, dir_file, de,
+ iloc.bh, dir_buf,
+ inline_size, pos)) {
+ ret = count;
+ goto out;
+ }
+ }
+
+ ext4fs_dirhash(de->name, de->name_len, hinfo);
+ if ((hinfo->hash < start_hash) ||
+ ((hinfo->hash == start_hash) &&
+ (hinfo->minor_hash < start_minor_hash)))
+ continue;
+ if (de->inode == 0)
+ continue;
+ err = ext4_htree_store_dirent(dir_file,
+ hinfo->hash, hinfo->minor_hash, de);
+ if (err) {
+ count = err;
+ goto out;
+ }
+ count++;
+ }
+ ret = count;
+out:
+ kfree(dir_buf);
+ brelse(iloc.bh);
+ return ret;
+}
+
+/*
+ * So this function is called when the volume is mkfsed with
+ * dir_index disabled. In order to keep f_pos persistent
+ * after we convert from an inlined dir to a blocked based,
+ * we just pretend that we are a normal dir and return the
+ * offset as if '.' and '..' really take place.
+ *
+ */
+int ext4_read_inline_dir(struct file *file,
+ struct dir_context *ctx,
+ int *has_inline_data)
+{
+ unsigned int offset, parent_ino;
+ int i;
+ struct ext4_dir_entry_2 *de;
+ struct super_block *sb;
+ struct inode *inode = file_inode(file);
+ int ret, inline_size = 0;
+ struct ext4_iloc iloc;
+ void *dir_buf = NULL;
+ int dotdot_offset, dotdot_size, extra_offset, extra_size;
+
+ ret = ext4_get_inode_loc(inode, &iloc);
+ if (ret)
+ return ret;
+
+ down_read(&EXT4_I(inode)->xattr_sem);
+ if (!ext4_has_inline_data(inode)) {
+ up_read(&EXT4_I(inode)->xattr_sem);
+ *has_inline_data = 0;
+ goto out;
+ }
+
+ inline_size = ext4_get_inline_size(inode);
+ dir_buf = kmalloc(inline_size, GFP_NOFS);
+ if (!dir_buf) {
+ ret = -ENOMEM;
+ up_read(&EXT4_I(inode)->xattr_sem);
+ goto out;
+ }
+
+ ret = ext4_read_inline_data(inode, dir_buf, inline_size, &iloc);
+ up_read(&EXT4_I(inode)->xattr_sem);
+ if (ret < 0)
+ goto out;
+
+ ret = 0;
+ sb = inode->i_sb;
+ parent_ino = le32_to_cpu(((struct ext4_dir_entry_2 *)dir_buf)->inode);
+ offset = ctx->pos;
+
+ /*
+ * dotdot_offset and dotdot_size is the real offset and
+ * size for ".." and "." if the dir is block based while
+ * the real size for them are only EXT4_INLINE_DOTDOT_SIZE.
+ * So we will use extra_offset and extra_size to indicate them
+ * during the inline dir iteration.
+ */
+ dotdot_offset = EXT4_DIR_REC_LEN(1);
+ dotdot_size = dotdot_offset + EXT4_DIR_REC_LEN(2);
+ extra_offset = dotdot_size - EXT4_INLINE_DOTDOT_SIZE;
+ extra_size = extra_offset + inline_size;
+
+ /*
+ * If the version has changed since the last call to
+ * readdir(2), then we might be pointing to an invalid
+ * dirent right now. Scan from the start of the inline
+ * dir to make sure.
+ */
+ if (file->f_version != inode->i_version) {
+ for (i = 0; i < extra_size && i < offset;) {
+ /*
+ * "." is with offset 0 and
+ * ".." is dotdot_offset.
+ */
+ if (!i) {
+ i = dotdot_offset;
+ continue;
+ } else if (i == dotdot_offset) {
+ i = dotdot_size;
+ continue;
+ }
+ /* for other entry, the real offset in
+ * the buf has to be tuned accordingly.
+ */
+ de = (struct ext4_dir_entry_2 *)
+ (dir_buf + i - extra_offset);
+ /* It's too expensive to do a full
+ * dirent test each time round this
+ * loop, but we do have to test at
+ * least that it is non-zero. A
+ * failure will be detected in the
+ * dirent test below. */
+ if (ext4_rec_len_from_disk(de->rec_len, extra_size)
+ < EXT4_DIR_REC_LEN(1))
+ break;
+ i += ext4_rec_len_from_disk(de->rec_len,
+ extra_size);
+ }
+ offset = i;
+ ctx->pos = offset;
+ file->f_version = inode->i_version;
+ }
+
+ while (ctx->pos < extra_size) {
+ if (ctx->pos == 0) {
+ if (!dir_emit(ctx, ".", 1, inode->i_ino, DT_DIR))
+ goto out;
+ ctx->pos = dotdot_offset;
+ continue;
+ }
+
+ if (ctx->pos == dotdot_offset) {
+ if (!dir_emit(ctx, "..", 2, parent_ino, DT_DIR))
+ goto out;
+ ctx->pos = dotdot_size;
+ continue;
+ }
+
+ de = (struct ext4_dir_entry_2 *)
+ (dir_buf + ctx->pos - extra_offset);
+ if (ext4_check_dir_entry(inode, file, de, iloc.bh, dir_buf,
+ extra_size, ctx->pos))
+ goto out;
+ if (le32_to_cpu(de->inode)) {
+ if (!dir_emit(ctx, de->name, de->name_len,
+ le32_to_cpu(de->inode),
+ get_dtype(sb, de->file_type)))
+ goto out;
+ }
+ ctx->pos += ext4_rec_len_from_disk(de->rec_len, extra_size);
+ }
+out:
+ kfree(dir_buf);
+ brelse(iloc.bh);
+ return ret;
+}
+
+struct buffer_head *ext4_get_first_inline_block(struct inode *inode,
+ struct ext4_dir_entry_2 **parent_de,
+ int *retval)
+{
+ struct ext4_iloc iloc;
+
+ *retval = ext4_get_inode_loc(inode, &iloc);
+ if (*retval)
+ return NULL;
+
+ *parent_de = (struct ext4_dir_entry_2 *)ext4_raw_inode(&iloc)->i_block;
+
+ return iloc.bh;
+}
+
+/*
+ * Try to create the inline data for the new dir.
+ * If it succeeds, return 0, otherwise return the error.
+ * In case of ENOSPC, the caller should create the normal disk layout dir.
+ */
+int ext4_try_create_inline_dir(handle_t *handle, struct inode *parent,
+ struct inode *inode)
+{
+ int ret, inline_size = EXT4_MIN_INLINE_DATA_SIZE;
+ struct ext4_iloc iloc;
+ struct ext4_dir_entry_2 *de;
+
+ ret = ext4_get_inode_loc(inode, &iloc);
+ if (ret)
+ return ret;
+
+ ret = ext4_prepare_inline_data(handle, inode, inline_size);
+ if (ret)
+ goto out;
+
+ /*
+ * For inline dir, we only save the inode information for the ".."
+ * and create a fake dentry to cover the left space.
+ */
+ de = (struct ext4_dir_entry_2 *)ext4_raw_inode(&iloc)->i_block;
+ de->inode = cpu_to_le32(parent->i_ino);
+ de = (struct ext4_dir_entry_2 *)((void *)de + EXT4_INLINE_DOTDOT_SIZE);
+ de->inode = 0;
+ de->rec_len = ext4_rec_len_to_disk(
+ inline_size - EXT4_INLINE_DOTDOT_SIZE,
+ inline_size);
+ set_nlink(inode, 2);
+ inode->i_size = EXT4_I(inode)->i_disksize = inline_size;
+out:
+ brelse(iloc.bh);
+ return ret;
+}
+
+struct buffer_head *ext4_find_inline_entry(struct inode *dir,
+ const struct qstr *d_name,
+ struct ext4_dir_entry_2 **res_dir,
+ int *has_inline_data)
+{
+ int ret;
+ struct ext4_iloc iloc;
+ void *inline_start;
+ int inline_size;
+
+ if (ext4_get_inode_loc(dir, &iloc))
+ return NULL;
+
+ down_read(&EXT4_I(dir)->xattr_sem);
+ if (!ext4_has_inline_data(dir)) {
+ *has_inline_data = 0;
+ goto out;
+ }
+
+ inline_start = (void *)ext4_raw_inode(&iloc)->i_block +
+ EXT4_INLINE_DOTDOT_SIZE;
+ inline_size = EXT4_MIN_INLINE_DATA_SIZE - EXT4_INLINE_DOTDOT_SIZE;
+ ret = search_dir(iloc.bh, inline_start, inline_size,
+ dir, d_name, 0, res_dir);
+ if (ret == 1)
+ goto out_find;
+ if (ret < 0)
+ goto out;
+
+ if (ext4_get_inline_size(dir) == EXT4_MIN_INLINE_DATA_SIZE)
+ goto out;
+
+ inline_start = ext4_get_inline_xattr_pos(dir, &iloc);
+ inline_size = ext4_get_inline_size(dir) - EXT4_MIN_INLINE_DATA_SIZE;
+
+ ret = search_dir(iloc.bh, inline_start, inline_size,
+ dir, d_name, 0, res_dir);
+ if (ret == 1)
+ goto out_find;
+
+out:
+ brelse(iloc.bh);
+ iloc.bh = NULL;
+out_find:
+ up_read(&EXT4_I(dir)->xattr_sem);
+ return iloc.bh;
+}
+
+int ext4_delete_inline_entry(handle_t *handle,
+ struct inode *dir,
+ struct ext4_dir_entry_2 *de_del,
+ struct buffer_head *bh,
+ int *has_inline_data)
+{
+ int err, inline_size;
+ struct ext4_iloc iloc;
+ void *inline_start;
+
+ err = ext4_get_inode_loc(dir, &iloc);
+ if (err)
+ return err;
+
+ down_write(&EXT4_I(dir)->xattr_sem);
+ if (!ext4_has_inline_data(dir)) {
+ *has_inline_data = 0;
+ goto out;
+ }
+
+ if ((void *)de_del - ((void *)ext4_raw_inode(&iloc)->i_block) <
+ EXT4_MIN_INLINE_DATA_SIZE) {
+ inline_start = (void *)ext4_raw_inode(&iloc)->i_block +
+ EXT4_INLINE_DOTDOT_SIZE;
+ inline_size = EXT4_MIN_INLINE_DATA_SIZE -
+ EXT4_INLINE_DOTDOT_SIZE;
+ } else {
+ inline_start = ext4_get_inline_xattr_pos(dir, &iloc);
+ inline_size = ext4_get_inline_size(dir) -
+ EXT4_MIN_INLINE_DATA_SIZE;
+ }
+
+ BUFFER_TRACE(bh, "get_write_access");
+ err = ext4_journal_get_write_access(handle, bh);
+ if (err)
+ goto out;
+
+ err = ext4_generic_delete_entry(handle, dir, de_del, bh,
+ inline_start, inline_size, 0);
+ if (err)
+ goto out;
+
+ BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
+ err = ext4_mark_inode_dirty(handle, dir);
+ if (unlikely(err))
+ goto out;
+
+ ext4_show_inline_dir(dir, iloc.bh, inline_start, inline_size);
+out:
+ up_write(&EXT4_I(dir)->xattr_sem);
+ brelse(iloc.bh);
+ if (err != -ENOENT)
+ ext4_std_error(dir->i_sb, err);
+ return err;
+}
+
+/*
+ * Get the inline dentry at offset.
+ */
+static inline struct ext4_dir_entry_2 *
+ext4_get_inline_entry(struct inode *inode,
+ struct ext4_iloc *iloc,
+ unsigned int offset,
+ void **inline_start,
+ int *inline_size)
+{
+ void *inline_pos;
+
+ BUG_ON(offset > ext4_get_inline_size(inode));
+
+ if (offset < EXT4_MIN_INLINE_DATA_SIZE) {
+ inline_pos = (void *)ext4_raw_inode(iloc)->i_block;
+ *inline_size = EXT4_MIN_INLINE_DATA_SIZE;
+ } else {
+ inline_pos = ext4_get_inline_xattr_pos(inode, iloc);
+ offset -= EXT4_MIN_INLINE_DATA_SIZE;
+ *inline_size = ext4_get_inline_size(inode) -
+ EXT4_MIN_INLINE_DATA_SIZE;
+ }
+
+ if (inline_start)
+ *inline_start = inline_pos;
+ return (struct ext4_dir_entry_2 *)(inline_pos + offset);
+}
+
+int empty_inline_dir(struct inode *dir, int *has_inline_data)
+{
+ int err, inline_size;
+ struct ext4_iloc iloc;
+ void *inline_pos;
+ unsigned int offset;
+ struct ext4_dir_entry_2 *de;
+ int ret = 1;
+
+ err = ext4_get_inode_loc(dir, &iloc);
+ if (err) {
+ EXT4_ERROR_INODE(dir, "error %d getting inode %lu block",
+ err, dir->i_ino);
+ return 1;
+ }
+
+ down_read(&EXT4_I(dir)->xattr_sem);
+ if (!ext4_has_inline_data(dir)) {
+ *has_inline_data = 0;
+ goto out;
+ }
+
+ de = (struct ext4_dir_entry_2 *)ext4_raw_inode(&iloc)->i_block;
+ if (!le32_to_cpu(de->inode)) {
+ ext4_warning(dir->i_sb,
+ "bad inline directory (dir #%lu) - no `..'",
+ dir->i_ino);
+ ret = 1;
+ goto out;
+ }
+
+ offset = EXT4_INLINE_DOTDOT_SIZE;
+ while (offset < dir->i_size) {
+ de = ext4_get_inline_entry(dir, &iloc, offset,
+ &inline_pos, &inline_size);
+ if (ext4_check_dir_entry(dir, NULL, de,
+ iloc.bh, inline_pos,
+ inline_size, offset)) {
+ ext4_warning(dir->i_sb,
+ "bad inline directory (dir #%lu) - "
+ "inode %u, rec_len %u, name_len %d"
+ "inline size %d\n",
+ dir->i_ino, le32_to_cpu(de->inode),
+ le16_to_cpu(de->rec_len), de->name_len,
+ inline_size);
+ ret = 1;
+ goto out;
+ }
+ if (le32_to_cpu(de->inode)) {
+ ret = 0;
+ goto out;
+ }
+ offset += ext4_rec_len_from_disk(de->rec_len, inline_size);
+ }
+
+out:
+ up_read(&EXT4_I(dir)->xattr_sem);
+ brelse(iloc.bh);
+ return ret;
+}
+
+int ext4_destroy_inline_data(handle_t *handle, struct inode *inode)
+{
+ int ret;
+
+ down_write(&EXT4_I(inode)->xattr_sem);
+ ret = ext4_destroy_inline_data_nolock(handle, inode);
+ up_write(&EXT4_I(inode)->xattr_sem);
+
+ return ret;
+}
+
+int ext4_inline_data_fiemap(struct inode *inode,
+ struct fiemap_extent_info *fieinfo,
+ int *has_inline)
+{
+ __u64 physical = 0;
+ __u64 length;
+ __u32 flags = FIEMAP_EXTENT_DATA_INLINE | FIEMAP_EXTENT_LAST;
+ int error = 0;
+ struct ext4_iloc iloc;
+
+ down_read(&EXT4_I(inode)->xattr_sem);
+ if (!ext4_has_inline_data(inode)) {
+ *has_inline = 0;
+ goto out;
+ }
+
+ error = ext4_get_inode_loc(inode, &iloc);
+ if (error)
+ goto out;
+
+ physical = (__u64)iloc.bh->b_blocknr << inode->i_sb->s_blocksize_bits;
+ physical += (char *)ext4_raw_inode(&iloc) - iloc.bh->b_data;
+ physical += offsetof(struct ext4_inode, i_block);
+ length = i_size_read(inode);
+
+ if (physical)
+ error = fiemap_fill_next_extent(fieinfo, 0, physical,
+ length, flags);
+ brelse(iloc.bh);
+out:
+ up_read(&EXT4_I(inode)->xattr_sem);
+ return (error < 0 ? error : 0);
+}
+
+/*
+ * Called during xattr set, and if we can sparse space 'needed',
+ * just create the extent tree evict the data to the outer block.
+ *
+ * We use jbd2 instead of page cache to move data to the 1st block
+ * so that the whole transaction can be committed as a whole and
+ * the data isn't lost because of the delayed page cache write.
+ */
+int ext4_try_to_evict_inline_data(handle_t *handle,
+ struct inode *inode,
+ int needed)
+{
+ int error;
+ struct ext4_xattr_entry *entry;
+ struct ext4_inode *raw_inode;
+ struct ext4_iloc iloc;
+
+ error = ext4_get_inode_loc(inode, &iloc);
+ if (error)
+ return error;
+
+ raw_inode = ext4_raw_inode(&iloc);
+ entry = (struct ext4_xattr_entry *)((void *)raw_inode +
+ EXT4_I(inode)->i_inline_off);
+ if (EXT4_XATTR_LEN(entry->e_name_len) +
+ EXT4_XATTR_SIZE(le32_to_cpu(entry->e_value_size)) < needed) {
+ error = -ENOSPC;
+ goto out;
+ }
+
+ error = ext4_convert_inline_data_nolock(handle, inode, &iloc);
+out:
+ brelse(iloc.bh);
+ return error;
+}
+
+void ext4_inline_data_truncate(struct inode *inode, int *has_inline)
+{
+ handle_t *handle;
+ int inline_size, value_len, needed_blocks;
+ size_t i_size;
+ void *value = NULL;
+ struct ext4_xattr_ibody_find is = {
+ .s = { .not_found = -ENODATA, },
+ };
+ struct ext4_xattr_info i = {
+ .name_index = EXT4_XATTR_INDEX_SYSTEM,
+ .name = EXT4_XATTR_SYSTEM_DATA,
+ };
+
+
+ needed_blocks = ext4_writepage_trans_blocks(inode);
+ handle = ext4_journal_start(inode, EXT4_HT_INODE, needed_blocks);
+ if (IS_ERR(handle))
+ return;
+
+ down_write(&EXT4_I(inode)->xattr_sem);
+ if (!ext4_has_inline_data(inode)) {
+ *has_inline = 0;
+ ext4_journal_stop(handle);
+ return;
+ }
+
+ if (ext4_orphan_add(handle, inode))
+ goto out;
+
+ if (ext4_get_inode_loc(inode, &is.iloc))
+ goto out;
+
+ down_write(&EXT4_I(inode)->i_data_sem);
+ i_size = inode->i_size;
+ inline_size = ext4_get_inline_size(inode);
+ EXT4_I(inode)->i_disksize = i_size;
+
+ if (i_size < inline_size) {
+ /* Clear the content in the xattr space. */
+ if (inline_size > EXT4_MIN_INLINE_DATA_SIZE) {
+ if (ext4_xattr_ibody_find(inode, &i, &is))
+ goto out_error;
+
+ BUG_ON(is.s.not_found);
+
+ value_len = le32_to_cpu(is.s.here->e_value_size);
+ value = kmalloc(value_len, GFP_NOFS);
+ if (!value)
+ goto out_error;
+
+ if (ext4_xattr_ibody_get(inode, i.name_index, i.name,
+ value, value_len))
+ goto out_error;
+
+ i.value = value;
+ i.value_len = i_size > EXT4_MIN_INLINE_DATA_SIZE ?
+ i_size - EXT4_MIN_INLINE_DATA_SIZE : 0;
+ if (ext4_xattr_ibody_inline_set(handle, inode, &i, &is))
+ goto out_error;
+ }
+
+ /* Clear the content within i_blocks. */
+ if (i_size < EXT4_MIN_INLINE_DATA_SIZE) {
+ void *p = (void *) ext4_raw_inode(&is.iloc)->i_block;
+ memset(p + i_size, 0,
+ EXT4_MIN_INLINE_DATA_SIZE - i_size);
+ }
+
+ EXT4_I(inode)->i_inline_size = i_size <
+ EXT4_MIN_INLINE_DATA_SIZE ?
+ EXT4_MIN_INLINE_DATA_SIZE : i_size;
+ }
+
+out_error:
+ up_write(&EXT4_I(inode)->i_data_sem);
+out:
+ brelse(is.iloc.bh);
+ up_write(&EXT4_I(inode)->xattr_sem);
+ kfree(value);
+ if (inode->i_nlink)
+ ext4_orphan_del(handle, inode);
+
+ inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
+ ext4_mark_inode_dirty(handle, inode);
+ if (IS_SYNC(inode))
+ ext4_handle_sync(handle);
+
+ ext4_journal_stop(handle);
+ return;
+}
+
+int ext4_convert_inline_data(struct inode *inode)
+{
+ int error, needed_blocks;
+ handle_t *handle;
+ struct ext4_iloc iloc;
+
+ if (!ext4_has_inline_data(inode)) {
+ ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
+ return 0;
+ }
+
+ needed_blocks = ext4_writepage_trans_blocks(inode);
+
+ iloc.bh = NULL;
+ error = ext4_get_inode_loc(inode, &iloc);
+ if (error)
+ return error;
+
+ handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, needed_blocks);
+ if (IS_ERR(handle)) {
+ error = PTR_ERR(handle);
+ goto out_free;
+ }
+
+ down_write(&EXT4_I(inode)->xattr_sem);
+ if (!ext4_has_inline_data(inode)) {
+ up_write(&EXT4_I(inode)->xattr_sem);
+ goto out;
+ }
+
+ error = ext4_convert_inline_data_nolock(handle, inode, &iloc);
+ up_write(&EXT4_I(inode)->xattr_sem);
+out:
+ ext4_journal_stop(handle);
+out_free:
+ brelse(iloc.bh);
+ return error;
+}
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 945cbf6cb1f..8a064734e6e 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -12,20 +12,14 @@
*
* Copyright (C) 1991, 1992 Linus Torvalds
*
- * Goal-directed block allocation by Stephen Tweedie
- * (sct@redhat.com), 1993, 1998
- * Big-endian to little-endian byte-swapping/bitmaps by
- * David S. Miller (davem@caip.rutgers.edu), 1995
* 64-bit file support on 64-bit platforms by Jakub Jelinek
* (jj@sunsite.ms.mff.cuni.cz)
*
* Assorted race fixes, rewrite of ext4_get_block() by Al Viro, 2000
*/
-#include <linux/module.h>
#include <linux/fs.h>
#include <linux/time.h>
-#include <linux/ext4_jbd2.h>
#include <linux/jbd2.h>
#include <linux/highuid.h>
#include <linux/pagemap.h>
@@ -33,134 +27,131 @@
#include <linux/string.h>
#include <linux/buffer_head.h>
#include <linux/writeback.h>
+#include <linux/pagevec.h>
#include <linux/mpage.h>
+#include <linux/namei.h>
#include <linux/uio.h>
#include <linux/bio.h>
+#include <linux/workqueue.h>
+#include <linux/kernel.h>
+#include <linux/printk.h>
+#include <linux/slab.h>
+#include <linux/ratelimit.h>
+#include <linux/aio.h>
+#include <linux/bitops.h>
+
+#include "ext4_jbd2.h"
#include "xattr.h"
#include "acl.h"
+#include "truncate.h"
-/*
- * Test whether an inode is a fast symlink.
- */
-static int ext4_inode_is_fast_symlink(struct inode *inode)
-{
- int ea_blocks = EXT4_I(inode)->i_file_acl ?
- (inode->i_sb->s_blocksize >> 9) : 0;
+#include <trace/events/ext4.h>
- return (S_ISLNK(inode->i_mode) && inode->i_blocks - ea_blocks == 0);
-}
+#define MPAGE_DA_EXTENT_TAIL 0x01
-/*
- * The ext4 forget function must perform a revoke if we are freeing data
- * which has been journaled. Metadata (eg. indirect blocks) must be
- * revoked in all cases.
- *
- * "bh" may be NULL: a metadata block may have been freed from memory
- * but there may still be a record of it in the journal, and that record
- * still needs to be revoked.
- */
-int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode,
- struct buffer_head *bh, ext4_fsblk_t blocknr)
+static __u32 ext4_inode_csum(struct inode *inode, struct ext4_inode *raw,
+ struct ext4_inode_info *ei)
{
- int err;
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ __u16 csum_lo;
+ __u16 csum_hi = 0;
+ __u32 csum;
+
+ csum_lo = le16_to_cpu(raw->i_checksum_lo);
+ raw->i_checksum_lo = 0;
+ if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE &&
+ EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi)) {
+ csum_hi = le16_to_cpu(raw->i_checksum_hi);
+ raw->i_checksum_hi = 0;
+ }
- might_sleep();
+ csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)raw,
+ EXT4_INODE_SIZE(inode->i_sb));
- BUFFER_TRACE(bh, "enter");
+ raw->i_checksum_lo = cpu_to_le16(csum_lo);
+ if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE &&
+ EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi))
+ raw->i_checksum_hi = cpu_to_le16(csum_hi);
- jbd_debug(4, "forgetting bh %p: is_metadata = %d, mode %o, "
- "data mode %lx\n",
- bh, is_metadata, inode->i_mode,
- test_opt(inode->i_sb, DATA_FLAGS));
+ return csum;
+}
- /* Never use the revoke function if we are doing full data
- * journaling: there is no need to, and a V1 superblock won't
- * support it. Otherwise, only skip the revoke on un-journaled
- * data blocks. */
+static int ext4_inode_csum_verify(struct inode *inode, struct ext4_inode *raw,
+ struct ext4_inode_info *ei)
+{
+ __u32 provided, calculated;
- if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA ||
- (!is_metadata && !ext4_should_journal_data(inode))) {
- if (bh) {
- BUFFER_TRACE(bh, "call jbd2_journal_forget");
- return ext4_journal_forget(handle, bh);
- }
- return 0;
- }
+ if (EXT4_SB(inode->i_sb)->s_es->s_creator_os !=
+ cpu_to_le32(EXT4_OS_LINUX) ||
+ !EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
+ EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+ return 1;
+
+ provided = le16_to_cpu(raw->i_checksum_lo);
+ calculated = ext4_inode_csum(inode, raw, ei);
+ if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE &&
+ EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi))
+ provided |= ((__u32)le16_to_cpu(raw->i_checksum_hi)) << 16;
+ else
+ calculated &= 0xFFFF;
- /*
- * data!=journal && (is_metadata || should_journal_data(inode))
- */
- BUFFER_TRACE(bh, "call ext4_journal_revoke");
- err = ext4_journal_revoke(handle, blocknr, bh);
- if (err)
- ext4_abort(inode->i_sb, __FUNCTION__,
- "error %d when attempting revoke", err);
- BUFFER_TRACE(bh, "exit");
- return err;
+ return provided == calculated;
}
-/*
- * Work out how many blocks we need to proceed with the next chunk of a
- * truncate transaction.
- */
-static unsigned long blocks_for_truncate(struct inode *inode)
+static void ext4_inode_csum_set(struct inode *inode, struct ext4_inode *raw,
+ struct ext4_inode_info *ei)
{
- ext4_lblk_t needed;
-
- needed = inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9);
+ __u32 csum;
- /* Give ourselves just enough room to cope with inodes in which
- * i_blocks is corrupt: we've seen disk corruptions in the past
- * which resulted in random data in an inode which looked enough
- * like a regular file for ext4 to try to delete it. Things
- * will go a bit crazy if that happens, but at least we should
- * try not to panic the whole kernel. */
- if (needed < 2)
- needed = 2;
-
- /* But we need to bound the transaction so we don't overflow the
- * journal. */
- if (needed > EXT4_MAX_TRANS_DATA)
- needed = EXT4_MAX_TRANS_DATA;
+ if (EXT4_SB(inode->i_sb)->s_es->s_creator_os !=
+ cpu_to_le32(EXT4_OS_LINUX) ||
+ !EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
+ EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+ return;
- return EXT4_DATA_TRANS_BLOCKS(inode->i_sb) + needed;
+ csum = ext4_inode_csum(inode, raw, ei);
+ raw->i_checksum_lo = cpu_to_le16(csum & 0xFFFF);
+ if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE &&
+ EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi))
+ raw->i_checksum_hi = cpu_to_le16(csum >> 16);
}
-/*
- * Truncate transactions can be complex and absolutely huge. So we need to
- * be able to restart the transaction at a conventient checkpoint to make
- * sure we don't overflow the journal.
- *
- * start_transaction gets us a new handle for a truncate transaction,
- * and extend_transaction tries to extend the existing one a bit. If
- * extend fails, we need to propagate the failure up and restart the
- * transaction in the top-level truncate loop. --sct
- */
-static handle_t *start_transaction(struct inode *inode)
+static inline int ext4_begin_ordered_truncate(struct inode *inode,
+ loff_t new_size)
{
- handle_t *result;
-
- result = ext4_journal_start(inode, blocks_for_truncate(inode));
- if (!IS_ERR(result))
- return result;
-
- ext4_std_error(inode->i_sb, PTR_ERR(result));
- return result;
+ trace_ext4_begin_ordered_truncate(inode, new_size);
+ /*
+ * If jinode is zero, then we never opened the file for
+ * writing, so there's no need to call
+ * jbd2_journal_begin_ordered_truncate() since there's no
+ * outstanding writes we need to flush.
+ */
+ if (!EXT4_I(inode)->jinode)
+ return 0;
+ return jbd2_journal_begin_ordered_truncate(EXT4_JOURNAL(inode),
+ EXT4_I(inode)->jinode,
+ new_size);
}
+static void ext4_invalidatepage(struct page *page, unsigned int offset,
+ unsigned int length);
+static int __ext4_journalled_writepage(struct page *page, unsigned int len);
+static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh);
+static int ext4_meta_trans_blocks(struct inode *inode, int lblocks,
+ int pextents);
+
/*
- * Try to extend this transaction for the purposes of truncation.
- *
- * Returns 0 if we managed to create more room. If we can't create more
- * room, and the transaction must be restarted we return 1.
+ * Test whether an inode is a fast symlink.
*/
-static int try_to_extend_transaction(handle_t *handle, struct inode *inode)
+static int ext4_inode_is_fast_symlink(struct inode *inode)
{
- if (handle->h_buffer_credits > EXT4_RESERVE_TRANS_BLOCKS)
- return 0;
- if (!ext4_journal_extend(handle, blocks_for_truncate(inode)))
+ int ea_blocks = EXT4_I(inode)->i_file_acl ?
+ EXT4_CLUSTER_SIZE(inode->i_sb) >> 9 : 0;
+
+ if (ext4_has_inline_data(inode))
return 0;
- return 1;
+
+ return (S_ISLNK(inode->i_mode) && inode->i_blocks - ea_blocks == 0);
}
/*
@@ -168,40 +159,134 @@ static int try_to_extend_transaction(handle_t *handle, struct inode *inode)
* so before we call here everything must be consistently dirtied against
* this transaction.
*/
-static int ext4_journal_test_restart(handle_t *handle, struct inode *inode)
+int ext4_truncate_restart_trans(handle_t *handle, struct inode *inode,
+ int nblocks)
{
+ int ret;
+
+ /*
+ * Drop i_data_sem to avoid deadlock with ext4_map_blocks. At this
+ * moment, get_block can be called only for blocks inside i_size since
+ * page cache has been already dropped and writes are blocked by
+ * i_mutex. So we can safely drop the i_data_sem here.
+ */
+ BUG_ON(EXT4_JOURNAL(inode) == NULL);
jbd_debug(2, "restarting handle %p\n", handle);
- return ext4_journal_restart(handle, blocks_for_truncate(inode));
+ up_write(&EXT4_I(inode)->i_data_sem);
+ ret = ext4_journal_restart(handle, nblocks);
+ down_write(&EXT4_I(inode)->i_data_sem);
+ ext4_discard_preallocations(inode);
+
+ return ret;
}
/*
* Called at the last iput() if i_nlink is zero.
*/
-void ext4_delete_inode (struct inode * inode)
+void ext4_evict_inode(struct inode *inode)
{
handle_t *handle;
+ int err;
- truncate_inode_pages(&inode->i_data, 0);
+ trace_ext4_evict_inode(inode);
+
+ if (inode->i_nlink) {
+ /*
+ * When journalling data dirty buffers are tracked only in the
+ * journal. So although mm thinks everything is clean and
+ * ready for reaping the inode might still have some pages to
+ * write in the running transaction or waiting to be
+ * checkpointed. Thus calling jbd2_journal_invalidatepage()
+ * (via truncate_inode_pages()) to discard these buffers can
+ * cause data loss. Also even if we did not discard these
+ * buffers, we would have no way to find them after the inode
+ * is reaped and thus user could see stale data if he tries to
+ * read them before the transaction is checkpointed. So be
+ * careful and force everything to disk here... We use
+ * ei->i_datasync_tid to store the newest transaction
+ * containing inode's data.
+ *
+ * Note that directories do not have this problem because they
+ * don't use page cache.
+ */
+ if (ext4_should_journal_data(inode) &&
+ (S_ISLNK(inode->i_mode) || S_ISREG(inode->i_mode)) &&
+ inode->i_ino != EXT4_JOURNAL_INO) {
+ journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
+ tid_t commit_tid = EXT4_I(inode)->i_datasync_tid;
+
+ jbd2_complete_transaction(journal, commit_tid);
+ filemap_write_and_wait(&inode->i_data);
+ }
+ truncate_inode_pages_final(&inode->i_data);
+
+ WARN_ON(atomic_read(&EXT4_I(inode)->i_ioend_count));
+ goto no_delete;
+ }
+
+ if (!is_bad_inode(inode))
+ dquot_initialize(inode);
+
+ if (ext4_should_order_data(inode))
+ ext4_begin_ordered_truncate(inode, 0);
+ truncate_inode_pages_final(&inode->i_data);
+ WARN_ON(atomic_read(&EXT4_I(inode)->i_ioend_count));
if (is_bad_inode(inode))
goto no_delete;
- handle = start_transaction(inode);
+ /*
+ * Protect us against freezing - iput() caller didn't have to have any
+ * protection against it
+ */
+ sb_start_intwrite(inode->i_sb);
+ handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE,
+ ext4_blocks_for_truncate(inode)+3);
if (IS_ERR(handle)) {
+ ext4_std_error(inode->i_sb, PTR_ERR(handle));
/*
* If we're going to skip the normal cleanup, we still need to
* make sure that the in-core orphan linked list is properly
* cleaned up.
*/
ext4_orphan_del(NULL, inode);
+ sb_end_intwrite(inode->i_sb);
goto no_delete;
}
if (IS_SYNC(inode))
- handle->h_sync = 1;
+ ext4_handle_sync(handle);
inode->i_size = 0;
+ err = ext4_mark_inode_dirty(handle, inode);
+ if (err) {
+ ext4_warning(inode->i_sb,
+ "couldn't mark inode dirty (err %d)", err);
+ goto stop_handle;
+ }
if (inode->i_blocks)
ext4_truncate(inode);
+
+ /*
+ * ext4_ext_truncate() doesn't reserve any slop when it
+ * restarts journal transactions; therefore there may not be
+ * enough credits left in the handle to remove the inode from
+ * the orphan list and set the dtime field.
+ */
+ if (!ext4_handle_has_enough_credits(handle, 3)) {
+ err = ext4_journal_extend(handle, 3);
+ if (err > 0)
+ err = ext4_journal_restart(handle, 3);
+ if (err != 0) {
+ ext4_warning(inode->i_sb,
+ "couldn't extend journal (err %d)", err);
+ stop_handle:
+ ext4_journal_stop(handle);
+ ext4_orphan_del(NULL, inode);
+ sb_end_intwrite(inode->i_sb);
+ goto no_delete;
+ }
+ }
+
/*
* Kill off the orphan record which ext4_truncate created.
* AKPM: I think this can be inside the above `if'.
@@ -222,887 +307,543 @@ void ext4_delete_inode (struct inode * inode)
*/
if (ext4_mark_inode_dirty(handle, inode))
/* If that failed, just do the required in-core inode clear. */
- clear_inode(inode);
+ ext4_clear_inode(inode);
else
ext4_free_inode(handle, inode);
ext4_journal_stop(handle);
+ sb_end_intwrite(inode->i_sb);
return;
no_delete:
- clear_inode(inode); /* We must guarantee clearing of inode... */
+ ext4_clear_inode(inode); /* We must guarantee clearing of inode... */
}
-typedef struct {
- __le32 *p;
- __le32 key;
- struct buffer_head *bh;
-} Indirect;
-
-static inline void add_chain(Indirect *p, struct buffer_head *bh, __le32 *v)
+#ifdef CONFIG_QUOTA
+qsize_t *ext4_get_reserved_space(struct inode *inode)
{
- p->key = *(p->p = v);
- p->bh = bh;
+ return &EXT4_I(inode)->i_reserved_quota;
}
-
-/**
- * ext4_block_to_path - parse the block number into array of offsets
- * @inode: inode in question (we are only interested in its superblock)
- * @i_block: block number to be parsed
- * @offsets: array to store the offsets in
- * @boundary: set this non-zero if the referred-to block is likely to be
- * followed (on disk) by an indirect block.
- *
- * To store the locations of file's data ext4 uses a data structure common
- * for UNIX filesystems - tree of pointers anchored in the inode, with
- * data blocks at leaves and indirect blocks in intermediate nodes.
- * This function translates the block number into path in that tree -
- * return value is the path length and @offsets[n] is the offset of
- * pointer to (n+1)th node in the nth one. If @block is out of range
- * (negative or too large) warning is printed and zero returned.
- *
- * Note: function doesn't find node addresses, so no IO is needed. All
- * we need to know is the capacity of indirect blocks (taken from the
- * inode->i_sb).
- */
+#endif
/*
- * Portability note: the last comparison (check that we fit into triple
- * indirect block) is spelled differently, because otherwise on an
- * architecture with 32-bit longs and 8Kb pages we might get into trouble
- * if our filesystem had 8Kb blocks. We might use long long, but that would
- * kill us on x86. Oh, well, at least the sign propagation does not matter -
- * i_block would have to be negative in the very beginning, so we would not
- * get there at all.
+ * Calculate the number of metadata blocks need to reserve
+ * to allocate a block located at @lblock
*/
-
-static int ext4_block_to_path(struct inode *inode,
- ext4_lblk_t i_block,
- ext4_lblk_t offsets[4], int *boundary)
-{
- int ptrs = EXT4_ADDR_PER_BLOCK(inode->i_sb);
- int ptrs_bits = EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb);
- const long direct_blocks = EXT4_NDIR_BLOCKS,
- indirect_blocks = ptrs,
- double_blocks = (1 << (ptrs_bits * 2));
- int n = 0;
- int final = 0;
-
- if (i_block < 0) {
- ext4_warning (inode->i_sb, "ext4_block_to_path", "block < 0");
- } else if (i_block < direct_blocks) {
- offsets[n++] = i_block;
- final = direct_blocks;
- } else if ( (i_block -= direct_blocks) < indirect_blocks) {
- offsets[n++] = EXT4_IND_BLOCK;
- offsets[n++] = i_block;
- final = ptrs;
- } else if ((i_block -= indirect_blocks) < double_blocks) {
- offsets[n++] = EXT4_DIND_BLOCK;
- offsets[n++] = i_block >> ptrs_bits;
- offsets[n++] = i_block & (ptrs - 1);
- final = ptrs;
- } else if (((i_block -= double_blocks) >> (ptrs_bits * 2)) < ptrs) {
- offsets[n++] = EXT4_TIND_BLOCK;
- offsets[n++] = i_block >> (ptrs_bits * 2);
- offsets[n++] = (i_block >> ptrs_bits) & (ptrs - 1);
- offsets[n++] = i_block & (ptrs - 1);
- final = ptrs;
- } else {
- ext4_warning(inode->i_sb, "ext4_block_to_path",
- "block %lu > max",
- i_block + direct_blocks +
- indirect_blocks + double_blocks);
- }
- if (boundary)
- *boundary = final - 1 - (i_block & (ptrs - 1));
- return n;
-}
-
-/**
- * ext4_get_branch - read the chain of indirect blocks leading to data
- * @inode: inode in question
- * @depth: depth of the chain (1 - direct pointer, etc.)
- * @offsets: offsets of pointers in inode/indirect blocks
- * @chain: place to store the result
- * @err: here we store the error value
- *
- * Function fills the array of triples <key, p, bh> and returns %NULL
- * if everything went OK or the pointer to the last filled triple
- * (incomplete one) otherwise. Upon the return chain[i].key contains
- * the number of (i+1)-th block in the chain (as it is stored in memory,
- * i.e. little-endian 32-bit), chain[i].p contains the address of that
- * number (it points into struct inode for i==0 and into the bh->b_data
- * for i>0) and chain[i].bh points to the buffer_head of i-th indirect
- * block for i>0 and NULL for i==0. In other words, it holds the block
- * numbers of the chain, addresses they were taken from (and where we can
- * verify that chain did not change) and buffer_heads hosting these
- * numbers.
- *
- * Function stops when it stumbles upon zero pointer (absent block)
- * (pointer to last triple returned, *@err == 0)
- * or when it gets an IO error reading an indirect block
- * (ditto, *@err == -EIO)
- * or when it reads all @depth-1 indirect blocks successfully and finds
- * the whole chain, all way to the data (returns %NULL, *err == 0).
- *
- * Need to be called with
- * down_read(&EXT4_I(inode)->i_data_sem)
- */
-static Indirect *ext4_get_branch(struct inode *inode, int depth,
- ext4_lblk_t *offsets,
- Indirect chain[4], int *err)
+static int ext4_calc_metadata_amount(struct inode *inode, ext4_lblk_t lblock)
{
- struct super_block *sb = inode->i_sb;
- Indirect *p = chain;
- struct buffer_head *bh;
+ if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
+ return ext4_ext_calc_metadata_amount(inode, lblock);
- *err = 0;
- /* i_data is not going away, no lock needed */
- add_chain (chain, NULL, EXT4_I(inode)->i_data + *offsets);
- if (!p->key)
- goto no_block;
- while (--depth) {
- bh = sb_bread(sb, le32_to_cpu(p->key));
- if (!bh)
- goto failure;
- add_chain(++p, bh, (__le32*)bh->b_data + *++offsets);
- /* Reader: end */
- if (!p->key)
- goto no_block;
- }
- return NULL;
-
-failure:
- *err = -EIO;
-no_block:
- return p;
+ return ext4_ind_calc_metadata_amount(inode, lblock);
}
-/**
- * ext4_find_near - find a place for allocation with sufficient locality
- * @inode: owner
- * @ind: descriptor of indirect block.
- *
- * This function returns the prefered place for block allocation.
- * It is used when heuristic for sequential allocation fails.
- * Rules are:
- * + if there is a block to the left of our position - allocate near it.
- * + if pointer will live in indirect block - allocate near that block.
- * + if pointer will live in inode - allocate in the same
- * cylinder group.
- *
- * In the latter case we colour the starting block by the callers PID to
- * prevent it from clashing with concurrent allocations for a different inode
- * in the same block group. The PID is used here so that functionally related
- * files will be close-by on-disk.
- *
- * Caller must make sure that @ind is valid and will stay that way.
+/*
+ * Called with i_data_sem down, which is important since we can call
+ * ext4_discard_preallocations() from here.
*/
-static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind)
+void ext4_da_update_reserve_space(struct inode *inode,
+ int used, int quota_claim)
{
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
struct ext4_inode_info *ei = EXT4_I(inode);
- __le32 *start = ind->bh ? (__le32*) ind->bh->b_data : ei->i_data;
- __le32 *p;
- ext4_fsblk_t bg_start;
- ext4_fsblk_t last_block;
- ext4_grpblk_t colour;
- /* Try to find previous block */
- for (p = ind->p - 1; p >= start; p--) {
- if (*p)
- return le32_to_cpu(*p);
+ spin_lock(&ei->i_block_reservation_lock);
+ trace_ext4_da_update_reserve_space(inode, used, quota_claim);
+ if (unlikely(used > ei->i_reserved_data_blocks)) {
+ ext4_warning(inode->i_sb, "%s: ino %lu, used %d "
+ "with only %d reserved data blocks",
+ __func__, inode->i_ino, used,
+ ei->i_reserved_data_blocks);
+ WARN_ON(1);
+ used = ei->i_reserved_data_blocks;
}
- /* No such thing, so let's try location of indirect block */
- if (ind->bh)
- return ind->bh->b_blocknr;
-
- /*
- * It is going to be referred to from the inode itself? OK, just put it
- * into the same cylinder group then.
- */
- bg_start = ext4_group_first_block_no(inode->i_sb, ei->i_block_group);
- last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1;
-
- if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block)
- colour = (current->pid % 16) *
- (EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16);
- else
- colour = (current->pid % 16) * ((last_block - bg_start) / 16);
- return bg_start + colour;
-}
-
-/**
- * ext4_find_goal - find a prefered place for allocation.
- * @inode: owner
- * @block: block we want
- * @partial: pointer to the last triple within a chain
- *
- * Normally this function find the prefered place for block allocation,
- * returns it.
- */
-static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block,
- Indirect *partial)
-{
- struct ext4_block_alloc_info *block_i;
-
- block_i = EXT4_I(inode)->i_block_alloc_info;
-
- /*
- * try the heuristic for sequential allocation,
- * failing that at least try to get decent locality.
- */
- if (block_i && (block == block_i->last_alloc_logical_block + 1)
- && (block_i->last_alloc_physical_block != 0)) {
- return block_i->last_alloc_physical_block + 1;
+ if (unlikely(ei->i_allocated_meta_blocks > ei->i_reserved_meta_blocks)) {
+ ext4_warning(inode->i_sb, "ino %lu, allocated %d "
+ "with only %d reserved metadata blocks "
+ "(releasing %d blocks with reserved %d data blocks)",
+ inode->i_ino, ei->i_allocated_meta_blocks,
+ ei->i_reserved_meta_blocks, used,
+ ei->i_reserved_data_blocks);
+ WARN_ON(1);
+ ei->i_allocated_meta_blocks = ei->i_reserved_meta_blocks;
}
- return ext4_find_near(inode, partial);
-}
-
-/**
- * ext4_blks_to_allocate: Look up the block map and count the number
- * of direct blocks need to be allocated for the given branch.
- *
- * @branch: chain of indirect blocks
- * @k: number of blocks need for indirect blocks
- * @blks: number of data blocks to be mapped.
- * @blocks_to_boundary: the offset in the indirect block
- *
- * return the total number of blocks to be allocate, including the
- * direct and indirect blocks.
- */
-static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned long blks,
- int blocks_to_boundary)
-{
- unsigned long count = 0;
+ /* Update per-inode reservations */
+ ei->i_reserved_data_blocks -= used;
+ ei->i_reserved_meta_blocks -= ei->i_allocated_meta_blocks;
+ percpu_counter_sub(&sbi->s_dirtyclusters_counter,
+ used + ei->i_allocated_meta_blocks);
+ ei->i_allocated_meta_blocks = 0;
- /*
- * Simple case, [t,d]Indirect block(s) has not allocated yet
- * then it's clear blocks on that path have not allocated
- */
- if (k > 0) {
- /* right now we don't handle cross boundary allocation */
- if (blks < blocks_to_boundary + 1)
- count += blks;
- else
- count += blocks_to_boundary + 1;
- return count;
+ if (ei->i_reserved_data_blocks == 0) {
+ /*
+ * We can release all of the reserved metadata blocks
+ * only when we have written all of the delayed
+ * allocation blocks.
+ */
+ percpu_counter_sub(&sbi->s_dirtyclusters_counter,
+ ei->i_reserved_meta_blocks);
+ ei->i_reserved_meta_blocks = 0;
+ ei->i_da_metadata_calc_len = 0;
}
+ spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
- count++;
- while (count < blks && count <= blocks_to_boundary &&
- le32_to_cpu(*(branch[0].p + count)) == 0) {
- count++;
+ /* Update quota subsystem for data blocks */
+ if (quota_claim)
+ dquot_claim_block(inode, EXT4_C2B(sbi, used));
+ else {
+ /*
+ * We did fallocate with an offset that is already delayed
+ * allocated. So on delayed allocated writeback we should
+ * not re-claim the quota for fallocated blocks.
+ */
+ dquot_release_reservation_block(inode, EXT4_C2B(sbi, used));
}
- return count;
-}
-
-/**
- * ext4_alloc_blocks: multiple allocate blocks needed for a branch
- * @indirect_blks: the number of blocks need to allocate for indirect
- * blocks
- *
- * @new_blocks: on return it will store the new block numbers for
- * the indirect blocks(if needed) and the first direct block,
- * @blks: on return it will store the total number of allocated
- * direct blocks
- */
-static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
- ext4_fsblk_t goal, int indirect_blks, int blks,
- ext4_fsblk_t new_blocks[4], int *err)
-{
- int target, i;
- unsigned long count = 0;
- int index = 0;
- ext4_fsblk_t current_block = 0;
- int ret = 0;
/*
- * Here we try to allocate the requested multiple blocks at once,
- * on a best-effort basis.
- * To build a branch, we should allocate blocks for
- * the indirect blocks(if not allocated yet), and at least
- * the first direct block of this branch. That's the
- * minimum number of blocks need to allocate(required)
+ * If we have done all the pending block allocations and if
+ * there aren't any writers on the inode, we can discard the
+ * inode's preallocations.
*/
- target = blks + indirect_blks;
-
- while (1) {
- count = target;
- /* allocating blocks for indirect blocks and direct blocks */
- current_block = ext4_new_blocks(handle,inode,goal,&count,err);
- if (*err)
- goto failed_out;
-
- target -= count;
- /* allocate blocks for indirect blocks */
- while (index < indirect_blks && count) {
- new_blocks[index++] = current_block++;
- count--;
- }
-
- if (count > 0)
- break;
- }
-
- /* save the new block number for the first direct block */
- new_blocks[index] = current_block;
-
- /* total number of blocks allocated for direct blocks */
- ret = count;
- *err = 0;
- return ret;
-failed_out:
- for (i = 0; i <index; i++)
- ext4_free_blocks(handle, inode, new_blocks[i], 1, 0);
- return ret;
+ if ((ei->i_reserved_data_blocks == 0) &&
+ (atomic_read(&inode->i_writecount) == 0))
+ ext4_discard_preallocations(inode);
}
-/**
- * ext4_alloc_branch - allocate and set up a chain of blocks.
- * @inode: owner
- * @indirect_blks: number of allocated indirect blocks
- * @blks: number of allocated direct blocks
- * @offsets: offsets (in the blocks) to store the pointers to next.
- * @branch: place to store the chain in.
- *
- * This function allocates blocks, zeroes out all but the last one,
- * links them into chain and (if we are synchronous) writes them to disk.
- * In other words, it prepares a branch that can be spliced onto the
- * inode. It stores the information about that chain in the branch[], in
- * the same format as ext4_get_branch() would do. We are calling it after
- * we had read the existing part of chain and partial points to the last
- * triple of that (one with zero ->key). Upon the exit we have the same
- * picture as after the successful ext4_get_block(), except that in one
- * place chain is disconnected - *branch->p is still zero (we did not
- * set the last link), but branch->key contains the number that should
- * be placed into *branch->p to fill that gap.
- *
- * If allocation fails we free all blocks we've allocated (and forget
- * their buffer_heads) and return the error value the from failed
- * ext4_alloc_block() (normally -ENOSPC). Otherwise we set the chain
- * as described above and return 0.
- */
-static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
- int indirect_blks, int *blks, ext4_fsblk_t goal,
- ext4_lblk_t *offsets, Indirect *branch)
+static int __check_block_validity(struct inode *inode, const char *func,
+ unsigned int line,
+ struct ext4_map_blocks *map)
{
- int blocksize = inode->i_sb->s_blocksize;
- int i, n = 0;
- int err = 0;
- struct buffer_head *bh;
- int num;
- ext4_fsblk_t new_blocks[4];
- ext4_fsblk_t current_block;
-
- num = ext4_alloc_blocks(handle, inode, goal, indirect_blks,
- *blks, new_blocks, &err);
- if (err)
- return err;
-
- branch[0].key = cpu_to_le32(new_blocks[0]);
- /*
- * metadata blocks and data blocks are allocated.
- */
- for (n = 1; n <= indirect_blks; n++) {
- /*
- * Get buffer_head for parent block, zero it out
- * and set the pointer to new one, then send
- * parent to disk.
- */
- bh = sb_getblk(inode->i_sb, new_blocks[n-1]);
- branch[n].bh = bh;
- lock_buffer(bh);
- BUFFER_TRACE(bh, "call get_create_access");
- err = ext4_journal_get_create_access(handle, bh);
- if (err) {
- unlock_buffer(bh);
- brelse(bh);
- goto failed;
- }
-
- memset(bh->b_data, 0, blocksize);
- branch[n].p = (__le32 *) bh->b_data + offsets[n];
- branch[n].key = cpu_to_le32(new_blocks[n]);
- *branch[n].p = branch[n].key;
- if ( n == indirect_blks) {
- current_block = new_blocks[n];
- /*
- * End of chain, update the last new metablock of
- * the chain to point to the new allocated
- * data blocks numbers
- */
- for (i=1; i < num; i++)
- *(branch[n].p + i) = cpu_to_le32(++current_block);
- }
- BUFFER_TRACE(bh, "marking uptodate");
- set_buffer_uptodate(bh);
- unlock_buffer(bh);
-
- BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata");
- err = ext4_journal_dirty_metadata(handle, bh);
- if (err)
- goto failed;
- }
- *blks = num;
- return err;
-failed:
- /* Allocation failed, free what we already allocated */
- for (i = 1; i <= n ; i++) {
- BUFFER_TRACE(branch[i].bh, "call jbd2_journal_forget");
- ext4_journal_forget(handle, branch[i].bh);
+ if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), map->m_pblk,
+ map->m_len)) {
+ ext4_error_inode(inode, func, line, map->m_pblk,
+ "lblock %lu mapped to illegal pblock "
+ "(length %d)", (unsigned long) map->m_lblk,
+ map->m_len);
+ return -EIO;
}
- for (i = 0; i <indirect_blks; i++)
- ext4_free_blocks(handle, inode, new_blocks[i], 1, 0);
-
- ext4_free_blocks(handle, inode, new_blocks[i], num, 0);
-
- return err;
+ return 0;
}
-/**
- * ext4_splice_branch - splice the allocated branch onto inode.
- * @inode: owner
- * @block: (logical) number of block we are adding
- * @chain: chain of indirect blocks (with a missing link - see
- * ext4_alloc_branch)
- * @where: location of missing link
- * @num: number of indirect blocks we are adding
- * @blks: number of direct blocks we are adding
- *
- * This function fills the missing link and does all housekeeping needed in
- * inode (->i_blocks, etc.). In case of success we end up with the full
- * chain to new block and return 0.
- */
-static int ext4_splice_branch(handle_t *handle, struct inode *inode,
- ext4_lblk_t block, Indirect *where, int num, int blks)
-{
- int i;
- int err = 0;
- struct ext4_block_alloc_info *block_i;
- ext4_fsblk_t current_block;
+#define check_block_validity(inode, map) \
+ __check_block_validity((inode), __func__, __LINE__, (map))
- block_i = EXT4_I(inode)->i_block_alloc_info;
- /*
- * If we're splicing into a [td]indirect block (as opposed to the
- * inode) then we need to get write access to the [td]indirect block
- * before the splice.
- */
- if (where->bh) {
- BUFFER_TRACE(where->bh, "get_write_access");
- err = ext4_journal_get_write_access(handle, where->bh);
- if (err)
- goto err_out;
- }
- /* That's it */
-
- *where->p = where->key;
-
- /*
- * Update the host buffer_head or inode to point to more just allocated
- * direct blocks blocks
- */
- if (num == 0 && blks > 1) {
- current_block = le32_to_cpu(where->key) + 1;
- for (i = 1; i < blks; i++)
- *(where->p + i ) = cpu_to_le32(current_block++);
- }
+#ifdef ES_AGGRESSIVE_TEST
+static void ext4_map_blocks_es_recheck(handle_t *handle,
+ struct inode *inode,
+ struct ext4_map_blocks *es_map,
+ struct ext4_map_blocks *map,
+ int flags)
+{
+ int retval;
+ map->m_flags = 0;
/*
- * update the most recently allocated logical & physical block
- * in i_block_alloc_info, to assist find the proper goal block for next
- * allocation
+ * There is a race window that the result is not the same.
+ * e.g. xfstests #223 when dioread_nolock enables. The reason
+ * is that we lookup a block mapping in extent status tree with
+ * out taking i_data_sem. So at the time the unwritten extent
+ * could be converted.
*/
- if (block_i) {
- block_i->last_alloc_logical_block = block + blks - 1;
- block_i->last_alloc_physical_block =
- le32_to_cpu(where[num].key) + blks - 1;
- }
-
- /* We are done with atomic stuff, now do the rest of housekeeping */
-
- inode->i_ctime = ext4_current_time(inode);
- ext4_mark_inode_dirty(handle, inode);
-
- /* had we spliced it onto indirect block? */
- if (where->bh) {
- /*
- * If we spliced it onto an indirect block, we haven't
- * altered the inode. Note however that if it is being spliced
- * onto an indirect block at the very end of the file (the
- * file is growing) then we *will* alter the inode to reflect
- * the new i_size. But that is not done here - it is done in
- * generic_commit_write->__mark_inode_dirty->ext4_dirty_inode.
- */
- jbd_debug(5, "splicing indirect only\n");
- BUFFER_TRACE(where->bh, "call ext4_journal_dirty_metadata");
- err = ext4_journal_dirty_metadata(handle, where->bh);
- if (err)
- goto err_out;
+ if (!(flags & EXT4_GET_BLOCKS_NO_LOCK))
+ down_read(&EXT4_I(inode)->i_data_sem);
+ if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
+ retval = ext4_ext_map_blocks(handle, inode, map, flags &
+ EXT4_GET_BLOCKS_KEEP_SIZE);
} else {
- /*
- * OK, we spliced it into the inode itself on a direct block.
- * Inode was dirtied above.
- */
- jbd_debug(5, "splicing direct\n");
+ retval = ext4_ind_map_blocks(handle, inode, map, flags &
+ EXT4_GET_BLOCKS_KEEP_SIZE);
}
- return err;
-
-err_out:
- for (i = 1; i <= num; i++) {
- BUFFER_TRACE(where[i].bh, "call jbd2_journal_forget");
- ext4_journal_forget(handle, where[i].bh);
- ext4_free_blocks(handle, inode,
- le32_to_cpu(where[i-1].key), 1, 0);
- }
- ext4_free_blocks(handle, inode, le32_to_cpu(where[num].key), blks, 0);
-
- return err;
-}
-
-/*
- * Allocation strategy is simple: if we have to allocate something, we will
- * have to go the whole way to leaf. So let's do it before attaching anything
- * to tree, set linkage between the newborn blocks, write them if sync is
- * required, recheck the path, free and repeat if check fails, otherwise
- * set the last missing link (that will protect us from any truncate-generated
- * removals - all blocks on the path are immune now) and possibly force the
- * write on the parent block.
- * That has a nice additional property: no special recovery from the failed
- * allocations is needed - we simply release blocks and do not touch anything
- * reachable from inode.
- *
- * `handle' can be NULL if create == 0.
- *
- * return > 0, # of blocks mapped or allocated.
- * return = 0, if plain lookup failed.
- * return < 0, error case.
- *
- *
- * Need to be called with
- * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system block
- * (ie, create is zero). Otherwise down_write(&EXT4_I(inode)->i_data_sem)
- */
-int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
- ext4_lblk_t iblock, unsigned long maxblocks,
- struct buffer_head *bh_result,
- int create, int extend_disksize)
-{
- int err = -EIO;
- ext4_lblk_t offsets[4];
- Indirect chain[4];
- Indirect *partial;
- ext4_fsblk_t goal;
- int indirect_blks;
- int blocks_to_boundary = 0;
- int depth;
- struct ext4_inode_info *ei = EXT4_I(inode);
- int count = 0;
- ext4_fsblk_t first_block = 0;
-
-
- J_ASSERT(!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL));
- J_ASSERT(handle != NULL || create == 0);
- depth = ext4_block_to_path(inode, iblock, offsets,
- &blocks_to_boundary);
-
- if (depth == 0)
- goto out;
-
- partial = ext4_get_branch(inode, depth, offsets, chain, &err);
-
- /* Simplest case - block found, no allocation needed */
- if (!partial) {
- first_block = le32_to_cpu(chain[depth - 1].key);
- clear_buffer_new(bh_result);
- count++;
- /*map more blocks*/
- while (count < maxblocks && count <= blocks_to_boundary) {
- ext4_fsblk_t blk;
-
- blk = le32_to_cpu(*(chain[depth-1].p + count));
-
- if (blk == first_block + count)
- count++;
- else
- break;
- }
- goto got_it;
- }
-
- /* Next simple case - plain lookup or failed read of indirect block */
- if (!create || err == -EIO)
- goto cleanup;
-
+ if (!(flags & EXT4_GET_BLOCKS_NO_LOCK))
+ up_read((&EXT4_I(inode)->i_data_sem));
/*
- * Okay, we need to do block allocation. Lazily initialize the block
- * allocation info here if necessary
- */
- if (S_ISREG(inode->i_mode) && (!ei->i_block_alloc_info))
- ext4_init_block_alloc_info(inode);
-
- goal = ext4_find_goal(inode, iblock, partial);
-
- /* the number of blocks need to allocate for [d,t]indirect blocks */
- indirect_blks = (chain + depth) - partial - 1;
-
- /*
- * Next look up the indirect map to count the totoal number of
- * direct blocks to allocate for this branch.
+ * Clear EXT4_MAP_FROM_CLUSTER and EXT4_MAP_BOUNDARY flag
+ * because it shouldn't be marked in es_map->m_flags.
*/
- count = ext4_blks_to_allocate(partial, indirect_blks,
- maxblocks, blocks_to_boundary);
- /*
- * Block out ext4_truncate while we alter the tree
- */
- err = ext4_alloc_branch(handle, inode, indirect_blks, &count, goal,
- offsets + (partial - chain), partial);
+ map->m_flags &= ~(EXT4_MAP_FROM_CLUSTER | EXT4_MAP_BOUNDARY);
/*
- * The ext4_splice_branch call will free and forget any buffers
- * on the new chain if there is a failure, but that risks using
- * up transaction credits, especially for bitmaps where the
- * credits cannot be returned. Can we handle this somehow? We
- * may need to return -EAGAIN upwards in the worst case. --sct
+ * We don't check m_len because extent will be collpased in status
+ * tree. So the m_len might not equal.
*/
- if (!err)
- err = ext4_splice_branch(handle, inode, iblock,
- partial, indirect_blks, count);
- /*
- * i_disksize growing is protected by i_data_sem. Don't forget to
- * protect it if you're about to implement concurrent
- * ext4_get_block() -bzzz
- */
- if (!err && extend_disksize && inode->i_size > ei->i_disksize)
- ei->i_disksize = inode->i_size;
- if (err)
- goto cleanup;
-
- set_buffer_new(bh_result);
-got_it:
- map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key));
- if (count > blocks_to_boundary)
- set_buffer_boundary(bh_result);
- err = count;
- /* Clean up and exit */
- partial = chain + depth - 1; /* the whole chain */
-cleanup:
- while (partial > chain) {
- BUFFER_TRACE(partial->bh, "call brelse");
- brelse(partial->bh);
- partial--;
- }
- BUFFER_TRACE(bh_result, "returned");
-out:
- return err;
+ if (es_map->m_lblk != map->m_lblk ||
+ es_map->m_flags != map->m_flags ||
+ es_map->m_pblk != map->m_pblk) {
+ printk("ES cache assertion failed for inode: %lu "
+ "es_cached ex [%d/%d/%llu/%x] != "
+ "found ex [%d/%d/%llu/%x] retval %d flags %x\n",
+ inode->i_ino, es_map->m_lblk, es_map->m_len,
+ es_map->m_pblk, es_map->m_flags, map->m_lblk,
+ map->m_len, map->m_pblk, map->m_flags,
+ retval, flags);
+ }
}
+#endif /* ES_AGGRESSIVE_TEST */
-/* Maximum number of blocks we map for direct IO at once. */
-#define DIO_MAX_BLOCKS 4096
/*
- * Number of credits we need for writing DIO_MAX_BLOCKS:
- * We need sb + group descriptor + bitmap + inode -> 4
- * For B blocks with A block pointers per block we need:
- * 1 (triple ind.) + (B/A/A + 2) (doubly ind.) + (B/A + 2) (indirect).
- * If we plug in 4096 for B and 256 for A (for 1KB block size), we get 25.
- */
-#define DIO_CREDITS 25
-
-
-/*
- *
+ * The ext4_map_blocks() function tries to look up the requested blocks,
+ * and returns if the blocks are already mapped.
*
- * ext4_ext4 get_block() wrapper function
- * It will do a look up first, and returns if the blocks already mapped.
* Otherwise it takes the write lock of the i_data_sem and allocate blocks
* and store the allocated blocks in the result buffer head and mark it
* mapped.
*
- * If file type is extents based, it will call ext4_ext_get_blocks(),
- * Otherwise, call with ext4_get_blocks_handle() to handle indirect mapping
+ * If file type is extents based, it will call ext4_ext_map_blocks(),
+ * Otherwise, call with ext4_ind_map_blocks() to handle indirect mapping
* based files
*
- * On success, it returns the number of blocks being mapped or allocate.
- * if create==0 and the blocks are pre-allocated and uninitialized block,
+ * On success, it returns the number of blocks being mapped or allocated.
+ * if create==0 and the blocks are pre-allocated and unwritten block,
* the result buffer head is unmapped. If the create ==1, it will make sure
* the buffer head is mapped.
*
* It returns 0 if plain look up failed (blocks have not been allocated), in
- * that casem, buffer head is unmapped
+ * that case, buffer head is unmapped
*
* It returns the error in case of allocation failure.
*/
-int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block,
- unsigned long max_blocks, struct buffer_head *bh,
- int create, int extend_disksize)
+int ext4_map_blocks(handle_t *handle, struct inode *inode,
+ struct ext4_map_blocks *map, int flags)
{
+ struct extent_status es;
int retval;
+ int ret = 0;
+#ifdef ES_AGGRESSIVE_TEST
+ struct ext4_map_blocks orig_map;
+
+ memcpy(&orig_map, map, sizeof(*map));
+#endif
- clear_buffer_mapped(bh);
+ map->m_flags = 0;
+ ext_debug("ext4_map_blocks(): inode %lu, flag %d, max_blocks %u,"
+ "logical block %lu\n", inode->i_ino, flags, map->m_len,
+ (unsigned long) map->m_lblk);
/*
- * Try to see if we can get the block without requesting
- * for new file system block.
+ * ext4_map_blocks returns an int, and m_len is an unsigned int
*/
- down_read((&EXT4_I(inode)->i_data_sem));
- if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
- retval = ext4_ext_get_blocks(handle, inode, block, max_blocks,
- bh, 0, 0);
+ if (unlikely(map->m_len > INT_MAX))
+ map->m_len = INT_MAX;
+
+ /* We can handle the block number less than EXT_MAX_BLOCKS */
+ if (unlikely(map->m_lblk >= EXT_MAX_BLOCKS))
+ return -EIO;
+
+ /* Lookup extent status tree firstly */
+ if (ext4_es_lookup_extent(inode, map->m_lblk, &es)) {
+ ext4_es_lru_add(inode);
+ if (ext4_es_is_written(&es) || ext4_es_is_unwritten(&es)) {
+ map->m_pblk = ext4_es_pblock(&es) +
+ map->m_lblk - es.es_lblk;
+ map->m_flags |= ext4_es_is_written(&es) ?
+ EXT4_MAP_MAPPED : EXT4_MAP_UNWRITTEN;
+ retval = es.es_len - (map->m_lblk - es.es_lblk);
+ if (retval > map->m_len)
+ retval = map->m_len;
+ map->m_len = retval;
+ } else if (ext4_es_is_delayed(&es) || ext4_es_is_hole(&es)) {
+ retval = 0;
+ } else {
+ BUG_ON(1);
+ }
+#ifdef ES_AGGRESSIVE_TEST
+ ext4_map_blocks_es_recheck(handle, inode, map,
+ &orig_map, flags);
+#endif
+ goto found;
+ }
+
+ /*
+ * Try to see if we can get the block without requesting a new
+ * file system block.
+ */
+ if (!(flags & EXT4_GET_BLOCKS_NO_LOCK))
+ down_read(&EXT4_I(inode)->i_data_sem);
+ if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
+ retval = ext4_ext_map_blocks(handle, inode, map, flags &
+ EXT4_GET_BLOCKS_KEEP_SIZE);
} else {
- retval = ext4_get_blocks_handle(handle,
- inode, block, max_blocks, bh, 0, 0);
+ retval = ext4_ind_map_blocks(handle, inode, map, flags &
+ EXT4_GET_BLOCKS_KEEP_SIZE);
+ }
+ if (retval > 0) {
+ unsigned int status;
+
+ if (unlikely(retval != map->m_len)) {
+ ext4_warning(inode->i_sb,
+ "ES len assertion failed for inode "
+ "%lu: retval %d != map->m_len %d",
+ inode->i_ino, retval, map->m_len);
+ WARN_ON(1);
+ }
+
+ status = map->m_flags & EXT4_MAP_UNWRITTEN ?
+ EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
+ if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) &&
+ ext4_find_delalloc_range(inode, map->m_lblk,
+ map->m_lblk + map->m_len - 1))
+ status |= EXTENT_STATUS_DELAYED;
+ ret = ext4_es_insert_extent(inode, map->m_lblk,
+ map->m_len, map->m_pblk, status);
+ if (ret < 0)
+ retval = ret;
+ }
+ if (!(flags & EXT4_GET_BLOCKS_NO_LOCK))
+ up_read((&EXT4_I(inode)->i_data_sem));
+
+found:
+ if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
+ ret = check_block_validity(inode, map);
+ if (ret != 0)
+ return ret;
}
- up_read((&EXT4_I(inode)->i_data_sem));
/* If it is only a block(s) look up */
- if (!create)
+ if ((flags & EXT4_GET_BLOCKS_CREATE) == 0)
return retval;
/*
* Returns if the blocks have already allocated
*
* Note that if blocks have been preallocated
- * ext4_ext_get_block() returns th create = 0
+ * ext4_ext_get_block() returns the create = 0
* with buffer head unmapped.
*/
- if (retval > 0 && buffer_mapped(bh))
- return retval;
+ if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED)
+ /*
+ * If we need to convert extent to unwritten
+ * we continue and do the actual work in
+ * ext4_ext_map_blocks()
+ */
+ if (!(flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN))
+ return retval;
/*
- * New blocks allocate and/or writing to uninitialized extent
+ * Here we clear m_flags because after allocating an new extent,
+ * it will be set again.
+ */
+ map->m_flags &= ~EXT4_MAP_FLAGS;
+
+ /*
+ * New blocks allocate and/or writing to unwritten extent
* will possibly result in updating i_data, so we take
* the write lock of i_data_sem, and call get_blocks()
* with create == 1 flag.
*/
- down_write((&EXT4_I(inode)->i_data_sem));
+ down_write(&EXT4_I(inode)->i_data_sem);
+
+ /*
+ * if the caller is from delayed allocation writeout path
+ * we have already reserved fs blocks for allocation
+ * let the underlying get_block() function know to
+ * avoid double accounting
+ */
+ if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
+ ext4_set_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED);
/*
* We need to check for EXT4 here because migrate
* could have changed the inode type in between
*/
- if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
- retval = ext4_ext_get_blocks(handle, inode, block, max_blocks,
- bh, create, extend_disksize);
+ if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
+ retval = ext4_ext_map_blocks(handle, inode, map, flags);
} else {
- retval = ext4_get_blocks_handle(handle, inode, block,
- max_blocks, bh, create, extend_disksize);
+ retval = ext4_ind_map_blocks(handle, inode, map, flags);
+
+ if (retval > 0 && map->m_flags & EXT4_MAP_NEW) {
+ /*
+ * We allocated new blocks which will result in
+ * i_data's format changing. Force the migrate
+ * to fail by clearing migrate flags
+ */
+ ext4_clear_inode_state(inode, EXT4_STATE_EXT_MIGRATE);
+ }
+
+ /*
+ * Update reserved blocks/metadata blocks after successful
+ * block allocation which had been deferred till now. We don't
+ * support fallocate for non extent files. So we can update
+ * reserve space here.
+ */
+ if ((retval > 0) &&
+ (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE))
+ ext4_da_update_reserve_space(inode, retval, 1);
}
+ if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
+ ext4_clear_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED);
+
+ if (retval > 0) {
+ unsigned int status;
+
+ if (unlikely(retval != map->m_len)) {
+ ext4_warning(inode->i_sb,
+ "ES len assertion failed for inode "
+ "%lu: retval %d != map->m_len %d",
+ inode->i_ino, retval, map->m_len);
+ WARN_ON(1);
+ }
+
+ /*
+ * If the extent has been zeroed out, we don't need to update
+ * extent status tree.
+ */
+ if ((flags & EXT4_GET_BLOCKS_PRE_IO) &&
+ ext4_es_lookup_extent(inode, map->m_lblk, &es)) {
+ if (ext4_es_is_written(&es))
+ goto has_zeroout;
+ }
+ status = map->m_flags & EXT4_MAP_UNWRITTEN ?
+ EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
+ if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) &&
+ ext4_find_delalloc_range(inode, map->m_lblk,
+ map->m_lblk + map->m_len - 1))
+ status |= EXTENT_STATUS_DELAYED;
+ ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
+ map->m_pblk, status);
+ if (ret < 0)
+ retval = ret;
+ }
+
+has_zeroout:
up_write((&EXT4_I(inode)->i_data_sem));
+ if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
+ ret = check_block_validity(inode, map);
+ if (ret != 0)
+ return ret;
+ }
return retval;
}
-static int ext4_get_block(struct inode *inode, sector_t iblock,
- struct buffer_head *bh_result, int create)
+/* Maximum number of blocks we map for direct IO at once. */
+#define DIO_MAX_BLOCKS 4096
+
+static int _ext4_get_block(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh, int flags)
{
handle_t *handle = ext4_journal_current_handle();
+ struct ext4_map_blocks map;
int ret = 0, started = 0;
- unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
+ int dio_credits;
+
+ if (ext4_has_inline_data(inode))
+ return -ERANGE;
+
+ map.m_lblk = iblock;
+ map.m_len = bh->b_size >> inode->i_blkbits;
- if (create && !handle) {
+ if (flags && !(flags & EXT4_GET_BLOCKS_NO_LOCK) && !handle) {
/* Direct IO write... */
- if (max_blocks > DIO_MAX_BLOCKS)
- max_blocks = DIO_MAX_BLOCKS;
- handle = ext4_journal_start(inode, DIO_CREDITS +
- 2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb));
+ if (map.m_len > DIO_MAX_BLOCKS)
+ map.m_len = DIO_MAX_BLOCKS;
+ dio_credits = ext4_chunk_trans_blocks(inode, map.m_len);
+ handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS,
+ dio_credits);
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
- goto out;
+ return ret;
}
started = 1;
}
- ret = ext4_get_blocks_wrap(handle, inode, iblock,
- max_blocks, bh_result, create, 0);
+ ret = ext4_map_blocks(handle, inode, &map, flags);
if (ret > 0) {
- bh_result->b_size = (ret << inode->i_blkbits);
+ ext4_io_end_t *io_end = ext4_inode_aio(inode);
+
+ map_bh(bh, inode->i_sb, map.m_pblk);
+ bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags;
+ if (io_end && io_end->flag & EXT4_IO_END_UNWRITTEN)
+ set_buffer_defer_completion(bh);
+ bh->b_size = inode->i_sb->s_blocksize * map.m_len;
ret = 0;
}
if (started)
ext4_journal_stop(handle);
-out:
return ret;
}
+int ext4_get_block(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh, int create)
+{
+ return _ext4_get_block(inode, iblock, bh,
+ create ? EXT4_GET_BLOCKS_CREATE : 0);
+}
+
/*
* `handle' can be NULL if create is zero
*/
struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
ext4_lblk_t block, int create, int *errp)
{
- struct buffer_head dummy;
+ struct ext4_map_blocks map;
+ struct buffer_head *bh;
int fatal = 0, err;
J_ASSERT(handle != NULL || create == 0);
- dummy.b_state = 0;
- dummy.b_blocknr = -1000;
- buffer_trace_init(&dummy.b_history);
- err = ext4_get_blocks_wrap(handle, inode, block, 1,
- &dummy, create, 1);
- /*
- * ext4_get_blocks_handle() returns number of blocks
- * mapped. 0 in case of a HOLE.
- */
- if (err > 0) {
- if (err > 1)
- WARN_ON(1);
- err = 0;
+ map.m_lblk = block;
+ map.m_len = 1;
+ err = ext4_map_blocks(handle, inode, &map,
+ create ? EXT4_GET_BLOCKS_CREATE : 0);
+
+ /* ensure we send some value back into *errp */
+ *errp = 0;
+
+ if (create && err == 0)
+ err = -ENOSPC; /* should never happen */
+ if (err < 0)
+ *errp = err;
+ if (err <= 0)
+ return NULL;
+
+ bh = sb_getblk(inode->i_sb, map.m_pblk);
+ if (unlikely(!bh)) {
+ *errp = -ENOMEM;
+ return NULL;
}
- *errp = err;
- if (!err && buffer_mapped(&dummy)) {
- struct buffer_head *bh;
- bh = sb_getblk(inode->i_sb, dummy.b_blocknr);
- if (!bh) {
- *errp = -EIO;
- goto err;
- }
- if (buffer_new(&dummy)) {
- J_ASSERT(create != 0);
- J_ASSERT(handle != NULL);
+ if (map.m_flags & EXT4_MAP_NEW) {
+ J_ASSERT(create != 0);
+ J_ASSERT(handle != NULL);
- /*
- * Now that we do not always journal data, we should
- * keep in mind whether this should always journal the
- * new buffer as metadata. For now, regular file
- * writes use ext4_get_block instead, so it's not a
- * problem.
- */
- lock_buffer(bh);
- BUFFER_TRACE(bh, "call get_create_access");
- fatal = ext4_journal_get_create_access(handle, bh);
- if (!fatal && !buffer_uptodate(bh)) {
- memset(bh->b_data,0,inode->i_sb->s_blocksize);
- set_buffer_uptodate(bh);
- }
- unlock_buffer(bh);
- BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata");
- err = ext4_journal_dirty_metadata(handle, bh);
- if (!fatal)
- fatal = err;
- } else {
- BUFFER_TRACE(bh, "not a new buffer");
- }
- if (fatal) {
- *errp = fatal;
- brelse(bh);
- bh = NULL;
+ /*
+ * Now that we do not always journal data, we should
+ * keep in mind whether this should always journal the
+ * new buffer as metadata. For now, regular file
+ * writes use ext4_get_block instead, so it's not a
+ * problem.
+ */
+ lock_buffer(bh);
+ BUFFER_TRACE(bh, "call get_create_access");
+ fatal = ext4_journal_get_create_access(handle, bh);
+ if (!fatal && !buffer_uptodate(bh)) {
+ memset(bh->b_data, 0, inode->i_sb->s_blocksize);
+ set_buffer_uptodate(bh);
}
- return bh;
+ unlock_buffer(bh);
+ BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
+ err = ext4_handle_dirty_metadata(handle, inode, bh);
+ if (!fatal)
+ fatal = err;
+ } else {
+ BUFFER_TRACE(bh, "not a new buffer");
}
-err:
- return NULL;
+ if (fatal) {
+ *errp = fatal;
+ brelse(bh);
+ bh = NULL;
+ }
+ return bh;
}
struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode,
ext4_lblk_t block, int create, int *err)
{
- struct buffer_head * bh;
+ struct buffer_head *bh;
bh = ext4_getblk(handle, inode, block, create, err);
if (!bh)
return bh;
if (buffer_uptodate(bh))
return bh;
- ll_rw_block(READ_META, 1, &bh);
+ ll_rw_block(READ | REQ_META | REQ_PRIO, 1, &bh);
wait_on_buffer(bh);
if (buffer_uptodate(bh))
return bh;
@@ -1111,13 +852,13 @@ struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode,
return NULL;
}
-static int walk_page_buffers( handle_t *handle,
- struct buffer_head *head,
- unsigned from,
- unsigned to,
- int *partial,
- int (*fn)( handle_t *handle,
- struct buffer_head *bh))
+int ext4_walk_page_buffers(handle_t *handle,
+ struct buffer_head *head,
+ unsigned from,
+ unsigned to,
+ int *partial,
+ int (*fn)(handle_t *handle,
+ struct buffer_head *bh))
{
struct buffer_head *bh;
unsigned block_start, block_end;
@@ -1125,10 +866,9 @@ static int walk_page_buffers( handle_t *handle,
int err, ret = 0;
struct buffer_head *next;
- for ( bh = head, block_start = 0;
- ret == 0 && (bh != head || !block_start);
- block_start = block_end, bh = next)
- {
+ for (bh = head, block_start = 0;
+ ret == 0 && (bh != head || !block_start);
+ block_start = block_end, bh = next) {
next = bh->b_this_page;
block_end = block_start + blocksize;
if (block_end <= from || block_start >= to) {
@@ -1150,11 +890,10 @@ static int walk_page_buffers( handle_t *handle,
* and the commit_write(). So doing the jbd2_journal_start at the start of
* prepare_write() is the right place.
*
- * Also, this function can nest inside ext4_writepage() ->
- * block_write_full_page(). In that case, we *know* that ext4_writepage()
- * has generated enough buffer credits to do the whole page. So we won't
- * block on the journal in that case, which is good, because the caller may
- * be PF_MEMALLOC.
+ * Also, this function can nest inside ext4_writepage(). In that case, we
+ * *know* that ext4_writepage() has generated enough buffer credits to do the
+ * whole page. So we won't block on the journal in that case, which is good,
+ * because the caller may be PF_MEMALLOC.
*
* By accident, ext4 can be reentered when a transaction is open via
* quota file writes. If we were to commit the transaction while thus
@@ -1168,103 +907,153 @@ static int walk_page_buffers( handle_t *handle,
* is elevated. We'll still have enough credits for the tiny quotafile
* write.
*/
-static int do_journal_get_write_access(handle_t *handle,
- struct buffer_head *bh)
+int do_journal_get_write_access(handle_t *handle,
+ struct buffer_head *bh)
{
+ int dirty = buffer_dirty(bh);
+ int ret;
+
if (!buffer_mapped(bh) || buffer_freed(bh))
return 0;
- return ext4_journal_get_write_access(handle, bh);
+ /*
+ * __block_write_begin() could have dirtied some buffers. Clean
+ * the dirty bit as jbd2_journal_get_write_access() could complain
+ * otherwise about fs integrity issues. Setting of the dirty bit
+ * by __block_write_begin() isn't a real problem here as we clear
+ * the bit before releasing a page lock and thus writeback cannot
+ * ever write the buffer.
+ */
+ if (dirty)
+ clear_buffer_dirty(bh);
+ BUFFER_TRACE(bh, "get write access");
+ ret = ext4_journal_get_write_access(handle, bh);
+ if (!ret && dirty)
+ ret = ext4_handle_dirty_metadata(handle, NULL, bh);
+ return ret;
}
+static int ext4_get_block_write_nolock(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh_result, int create);
static int ext4_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned flags,
- struct page **pagep, void **fsdata)
+ loff_t pos, unsigned len, unsigned flags,
+ struct page **pagep, void **fsdata)
{
- struct inode *inode = mapping->host;
- int ret, needed_blocks = ext4_writepage_trans_blocks(inode);
+ struct inode *inode = mapping->host;
+ int ret, needed_blocks;
handle_t *handle;
int retries = 0;
- struct page *page;
- pgoff_t index;
- unsigned from, to;
+ struct page *page;
+ pgoff_t index;
+ unsigned from, to;
- index = pos >> PAGE_CACHE_SHIFT;
- from = pos & (PAGE_CACHE_SIZE - 1);
- to = from + len;
+ trace_ext4_write_begin(inode, pos, len, flags);
+ /*
+ * Reserve one block more for addition to orphan list in case
+ * we allocate blocks but write fails for some reason
+ */
+ needed_blocks = ext4_writepage_trans_blocks(inode) + 1;
+ index = pos >> PAGE_CACHE_SHIFT;
+ from = pos & (PAGE_CACHE_SIZE - 1);
+ to = from + len;
-retry:
- page = __grab_cache_page(mapping, index);
- if (!page)
- return -ENOMEM;
- *pagep = page;
+ if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) {
+ ret = ext4_try_to_write_inline_data(mapping, inode, pos, len,
+ flags, pagep);
+ if (ret < 0)
+ return ret;
+ if (ret == 1)
+ return 0;
+ }
- handle = ext4_journal_start(inode, needed_blocks);
- if (IS_ERR(handle)) {
- unlock_page(page);
- page_cache_release(page);
- ret = PTR_ERR(handle);
- goto out;
+ /*
+ * grab_cache_page_write_begin() can take a long time if the
+ * system is thrashing due to memory pressure, or if the page
+ * is being written back. So grab it first before we start
+ * the transaction handle. This also allows us to allocate
+ * the page (if needed) without using GFP_NOFS.
+ */
+retry_grab:
+ page = grab_cache_page_write_begin(mapping, index, flags);
+ if (!page)
+ return -ENOMEM;
+ unlock_page(page);
+
+retry_journal:
+ handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, needed_blocks);
+ if (IS_ERR(handle)) {
+ page_cache_release(page);
+ return PTR_ERR(handle);
}
- ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
- ext4_get_block);
+ lock_page(page);
+ if (page->mapping != mapping) {
+ /* The page got truncated from under us */
+ unlock_page(page);
+ page_cache_release(page);
+ ext4_journal_stop(handle);
+ goto retry_grab;
+ }
+ /* In case writeback began while the page was unlocked */
+ wait_for_stable_page(page);
+
+ if (ext4_should_dioread_nolock(inode))
+ ret = __block_write_begin(page, pos, len, ext4_get_block_write);
+ else
+ ret = __block_write_begin(page, pos, len, ext4_get_block);
if (!ret && ext4_should_journal_data(inode)) {
- ret = walk_page_buffers(handle, page_buffers(page),
- from, to, NULL, do_journal_get_write_access);
+ ret = ext4_walk_page_buffers(handle, page_buffers(page),
+ from, to, NULL,
+ do_journal_get_write_access);
}
if (ret) {
+ unlock_page(page);
+ /*
+ * __block_write_begin may have instantiated a few blocks
+ * outside i_size. Trim these off again. Don't need
+ * i_size_read because we hold i_mutex.
+ *
+ * Add inode to orphan list in case we crash before
+ * truncate finishes
+ */
+ if (pos + len > inode->i_size && ext4_can_truncate(inode))
+ ext4_orphan_add(handle, inode);
+
ext4_journal_stop(handle);
- unlock_page(page);
- page_cache_release(page);
- }
+ if (pos + len > inode->i_size) {
+ ext4_truncate_failed_write(inode);
+ /*
+ * If truncate failed early the inode might
+ * still be on the orphan list; we need to
+ * make sure the inode is removed from the
+ * orphan list in that case.
+ */
+ if (inode->i_nlink)
+ ext4_orphan_del(NULL, inode);
+ }
- if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
- goto retry;
-out:
+ if (ret == -ENOSPC &&
+ ext4_should_retry_alloc(inode->i_sb, &retries))
+ goto retry_journal;
+ page_cache_release(page);
+ return ret;
+ }
+ *pagep = page;
return ret;
}
-int ext4_journal_dirty_data(handle_t *handle, struct buffer_head *bh)
-{
- int err = jbd2_journal_dirty_data(handle, bh);
- if (err)
- ext4_journal_abort_handle(__FUNCTION__, __FUNCTION__,
- bh, handle, err);
- return err;
-}
-
/* For write_end() in data=journal mode */
static int write_end_fn(handle_t *handle, struct buffer_head *bh)
{
+ int ret;
if (!buffer_mapped(bh) || buffer_freed(bh))
return 0;
set_buffer_uptodate(bh);
- return ext4_journal_dirty_metadata(handle, bh);
-}
-
-/*
- * Generic write_end handler for ordered and writeback ext4 journal modes.
- * We can't use generic_write_end, because that unlocks the page and we need to
- * unlock the page after ext4_journal_stop, but ext4_journal_stop must run
- * after block_write_end.
- */
-static int ext4_generic_write_end(struct file *file,
- struct address_space *mapping,
- loff_t pos, unsigned len, unsigned copied,
- struct page *page, void *fsdata)
-{
- struct inode *inode = file->f_mapping->host;
-
- copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
-
- if (pos+copied > inode->i_size) {
- i_size_write(inode, pos+copied);
- mark_inode_dirty(inode);
- }
-
- return copied;
+ ret = ext4_handle_dirty_metadata(handle, NULL, bh);
+ clear_buffer_meta(bh);
+ clear_buffer_prio(bh);
+ return ret;
}
/*
@@ -1274,168 +1063,660 @@ static int ext4_generic_write_end(struct file *file,
* ext4 never places buffers on inode->i_mapping->private_list. metadata
* buffers are managed internally.
*/
-static int ext4_ordered_write_end(struct file *file,
- struct address_space *mapping,
- loff_t pos, unsigned len, unsigned copied,
- struct page *page, void *fsdata)
+static int ext4_write_end(struct file *file,
+ struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned copied,
+ struct page *page, void *fsdata)
{
handle_t *handle = ext4_journal_current_handle();
- struct inode *inode = file->f_mapping->host;
- unsigned from, to;
+ struct inode *inode = mapping->host;
int ret = 0, ret2;
+ int i_size_changed = 0;
- from = pos & (PAGE_CACHE_SIZE - 1);
- to = from + len;
+ trace_ext4_write_end(inode, pos, len, copied);
+ if (ext4_test_inode_state(inode, EXT4_STATE_ORDERED_MODE)) {
+ ret = ext4_jbd2_file_inode(handle, inode);
+ if (ret) {
+ unlock_page(page);
+ page_cache_release(page);
+ goto errout;
+ }
+ }
- ret = walk_page_buffers(handle, page_buffers(page),
- from, to, NULL, ext4_journal_dirty_data);
+ if (ext4_has_inline_data(inode)) {
+ ret = ext4_write_inline_data_end(inode, pos, len,
+ copied, page);
+ if (ret < 0)
+ goto errout;
+ copied = ret;
+ } else
+ copied = block_write_end(file, mapping, pos,
+ len, copied, page, fsdata);
- if (ret == 0) {
- /*
- * generic_write_end() will run mark_inode_dirty() if i_size
- * changes. So let's piggyback the i_disksize mark_inode_dirty
- * into that.
- */
- loff_t new_i_size;
+ /*
+ * No need to use i_size_read() here, the i_size
+ * cannot change under us because we hole i_mutex.
+ *
+ * But it's important to update i_size while still holding page lock:
+ * page writeout could otherwise come in and zero beyond i_size.
+ */
+ if (pos + copied > inode->i_size) {
+ i_size_write(inode, pos + copied);
+ i_size_changed = 1;
+ }
- new_i_size = pos + copied;
- if (new_i_size > EXT4_I(inode)->i_disksize)
- EXT4_I(inode)->i_disksize = new_i_size;
- copied = ext4_generic_write_end(file, mapping, pos, len, copied,
- page, fsdata);
- if (copied < 0)
- ret = copied;
+ if (pos + copied > EXT4_I(inode)->i_disksize) {
+ /* We need to mark inode dirty even if
+ * new_i_size is less that inode->i_size
+ * but greater than i_disksize. (hint delalloc)
+ */
+ ext4_update_i_disksize(inode, (pos + copied));
+ i_size_changed = 1;
}
- ret2 = ext4_journal_stop(handle);
- if (!ret)
- ret = ret2;
unlock_page(page);
page_cache_release(page);
- return ret ? ret : copied;
-}
-
-static int ext4_writeback_write_end(struct file *file,
- struct address_space *mapping,
- loff_t pos, unsigned len, unsigned copied,
- struct page *page, void *fsdata)
-{
- handle_t *handle = ext4_journal_current_handle();
- struct inode *inode = file->f_mapping->host;
- int ret = 0, ret2;
- loff_t new_i_size;
-
- new_i_size = pos + copied;
- if (new_i_size > EXT4_I(inode)->i_disksize)
- EXT4_I(inode)->i_disksize = new_i_size;
-
- copied = ext4_generic_write_end(file, mapping, pos, len, copied,
- page, fsdata);
- if (copied < 0)
- ret = copied;
+ /*
+ * Don't mark the inode dirty under page lock. First, it unnecessarily
+ * makes the holding time of page lock longer. Second, it forces lock
+ * ordering of page lock and transaction start for journaling
+ * filesystems.
+ */
+ if (i_size_changed)
+ ext4_mark_inode_dirty(handle, inode);
+ if (pos + len > inode->i_size && ext4_can_truncate(inode))
+ /* if we have allocated more blocks and copied
+ * less. We will have blocks allocated outside
+ * inode->i_size. So truncate them
+ */
+ ext4_orphan_add(handle, inode);
+errout:
ret2 = ext4_journal_stop(handle);
if (!ret)
ret = ret2;
- unlock_page(page);
- page_cache_release(page);
+
+ if (pos + len > inode->i_size) {
+ ext4_truncate_failed_write(inode);
+ /*
+ * If truncate failed early the inode might still be
+ * on the orphan list; we need to make sure the inode
+ * is removed from the orphan list in that case.
+ */
+ if (inode->i_nlink)
+ ext4_orphan_del(NULL, inode);
+ }
return ret ? ret : copied;
}
static int ext4_journalled_write_end(struct file *file,
- struct address_space *mapping,
- loff_t pos, unsigned len, unsigned copied,
- struct page *page, void *fsdata)
+ struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned copied,
+ struct page *page, void *fsdata)
{
handle_t *handle = ext4_journal_current_handle();
struct inode *inode = mapping->host;
int ret = 0, ret2;
int partial = 0;
unsigned from, to;
+ loff_t new_i_size;
+ trace_ext4_journalled_write_end(inode, pos, len, copied);
from = pos & (PAGE_CACHE_SIZE - 1);
to = from + len;
- if (copied < len) {
- if (!PageUptodate(page))
- copied = 0;
- page_zero_new_buffers(page, from+copied, to);
- }
+ BUG_ON(!ext4_handle_valid(handle));
- ret = walk_page_buffers(handle, page_buffers(page), from,
- to, &partial, write_end_fn);
- if (!partial)
- SetPageUptodate(page);
- if (pos+copied > inode->i_size)
+ if (ext4_has_inline_data(inode))
+ copied = ext4_write_inline_data_end(inode, pos, len,
+ copied, page);
+ else {
+ if (copied < len) {
+ if (!PageUptodate(page))
+ copied = 0;
+ page_zero_new_buffers(page, from+copied, to);
+ }
+
+ ret = ext4_walk_page_buffers(handle, page_buffers(page), from,
+ to, &partial, write_end_fn);
+ if (!partial)
+ SetPageUptodate(page);
+ }
+ new_i_size = pos + copied;
+ if (new_i_size > inode->i_size)
i_size_write(inode, pos+copied);
- EXT4_I(inode)->i_state |= EXT4_STATE_JDATA;
- if (inode->i_size > EXT4_I(inode)->i_disksize) {
- EXT4_I(inode)->i_disksize = inode->i_size;
+ ext4_set_inode_state(inode, EXT4_STATE_JDATA);
+ EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid;
+ if (new_i_size > EXT4_I(inode)->i_disksize) {
+ ext4_update_i_disksize(inode, new_i_size);
ret2 = ext4_mark_inode_dirty(handle, inode);
if (!ret)
ret = ret2;
}
+ unlock_page(page);
+ page_cache_release(page);
+ if (pos + len > inode->i_size && ext4_can_truncate(inode))
+ /* if we have allocated more blocks and copied
+ * less. We will have blocks allocated outside
+ * inode->i_size. So truncate them
+ */
+ ext4_orphan_add(handle, inode);
+
ret2 = ext4_journal_stop(handle);
if (!ret)
ret = ret2;
- unlock_page(page);
- page_cache_release(page);
+ if (pos + len > inode->i_size) {
+ ext4_truncate_failed_write(inode);
+ /*
+ * If truncate failed early the inode might still be
+ * on the orphan list; we need to make sure the inode
+ * is removed from the orphan list in that case.
+ */
+ if (inode->i_nlink)
+ ext4_orphan_del(NULL, inode);
+ }
return ret ? ret : copied;
}
/*
- * bmap() is special. It gets used by applications such as lilo and by
- * the swapper to find the on-disk block of a specific piece of data.
- *
- * Naturally, this is dangerous if the block concerned is still in the
- * journal. If somebody makes a swapfile on an ext4 data-journaling
- * filesystem and enables swap, then they may get a nasty shock when the
- * data getting swapped to that swapfile suddenly gets overwritten by
- * the original zero's written out previously to the journal and
- * awaiting writeback in the kernel's buffer cache.
- *
- * So, if we see any bmap calls here on a modified, data-journaled file,
- * take extra steps to flush any blocks which might be in the cache.
+ * Reserve a metadata for a single block located at lblock
*/
-static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
+static int ext4_da_reserve_metadata(struct inode *inode, ext4_lblk_t lblock)
{
- struct inode *inode = mapping->host;
- journal_t *journal;
- int err;
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ unsigned int md_needed;
+ ext4_lblk_t save_last_lblock;
+ int save_len;
- if (EXT4_I(inode)->i_state & EXT4_STATE_JDATA) {
+ /*
+ * recalculate the amount of metadata blocks to reserve
+ * in order to allocate nrblocks
+ * worse case is one extent per block
+ */
+ spin_lock(&ei->i_block_reservation_lock);
+ /*
+ * ext4_calc_metadata_amount() has side effects, which we have
+ * to be prepared undo if we fail to claim space.
+ */
+ save_len = ei->i_da_metadata_calc_len;
+ save_last_lblock = ei->i_da_metadata_calc_last_lblock;
+ md_needed = EXT4_NUM_B2C(sbi,
+ ext4_calc_metadata_amount(inode, lblock));
+ trace_ext4_da_reserve_space(inode, md_needed);
+
+ /*
+ * We do still charge estimated metadata to the sb though;
+ * we cannot afford to run out of free blocks.
+ */
+ if (ext4_claim_free_clusters(sbi, md_needed, 0)) {
+ ei->i_da_metadata_calc_len = save_len;
+ ei->i_da_metadata_calc_last_lblock = save_last_lblock;
+ spin_unlock(&ei->i_block_reservation_lock);
+ return -ENOSPC;
+ }
+ ei->i_reserved_meta_blocks += md_needed;
+ spin_unlock(&ei->i_block_reservation_lock);
+
+ return 0; /* success */
+}
+
+/*
+ * Reserve a single cluster located at lblock
+ */
+static int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ unsigned int md_needed;
+ int ret;
+ ext4_lblk_t save_last_lblock;
+ int save_len;
+
+ /*
+ * We will charge metadata quota at writeout time; this saves
+ * us from metadata over-estimation, though we may go over by
+ * a small amount in the end. Here we just reserve for data.
+ */
+ ret = dquot_reserve_block(inode, EXT4_C2B(sbi, 1));
+ if (ret)
+ return ret;
+
+ /*
+ * recalculate the amount of metadata blocks to reserve
+ * in order to allocate nrblocks
+ * worse case is one extent per block
+ */
+ spin_lock(&ei->i_block_reservation_lock);
+ /*
+ * ext4_calc_metadata_amount() has side effects, which we have
+ * to be prepared undo if we fail to claim space.
+ */
+ save_len = ei->i_da_metadata_calc_len;
+ save_last_lblock = ei->i_da_metadata_calc_last_lblock;
+ md_needed = EXT4_NUM_B2C(sbi,
+ ext4_calc_metadata_amount(inode, lblock));
+ trace_ext4_da_reserve_space(inode, md_needed);
+
+ /*
+ * We do still charge estimated metadata to the sb though;
+ * we cannot afford to run out of free blocks.
+ */
+ if (ext4_claim_free_clusters(sbi, md_needed + 1, 0)) {
+ ei->i_da_metadata_calc_len = save_len;
+ ei->i_da_metadata_calc_last_lblock = save_last_lblock;
+ spin_unlock(&ei->i_block_reservation_lock);
+ dquot_release_reservation_block(inode, EXT4_C2B(sbi, 1));
+ return -ENOSPC;
+ }
+ ei->i_reserved_data_blocks++;
+ ei->i_reserved_meta_blocks += md_needed;
+ spin_unlock(&ei->i_block_reservation_lock);
+
+ return 0; /* success */
+}
+
+static void ext4_da_release_space(struct inode *inode, int to_free)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ struct ext4_inode_info *ei = EXT4_I(inode);
+
+ if (!to_free)
+ return; /* Nothing to release, exit */
+
+ spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
+
+ trace_ext4_da_release_space(inode, to_free);
+ if (unlikely(to_free > ei->i_reserved_data_blocks)) {
/*
- * This is a REALLY heavyweight approach, but the use of
- * bmap on dirty files is expected to be extremely rare:
- * only if we run lilo or swapon on a freshly made file
- * do we expect this to happen.
- *
- * (bmap requires CAP_SYS_RAWIO so this does not
- * represent an unprivileged user DOS attack --- we'd be
- * in trouble if mortal users could trigger this path at
- * will.)
- *
- * NB. EXT4_STATE_JDATA is not set on files other than
- * regular files. If somebody wants to bmap a directory
- * or symlink and gets confused because the buffer
- * hasn't yet been flushed to disk, they deserve
- * everything they get.
+ * if there aren't enough reserved blocks, then the
+ * counter is messed up somewhere. Since this
+ * function is called from invalidate page, it's
+ * harmless to return without any action.
*/
+ ext4_warning(inode->i_sb, "ext4_da_release_space: "
+ "ino %lu, to_free %d with only %d reserved "
+ "data blocks", inode->i_ino, to_free,
+ ei->i_reserved_data_blocks);
+ WARN_ON(1);
+ to_free = ei->i_reserved_data_blocks;
+ }
+ ei->i_reserved_data_blocks -= to_free;
- EXT4_I(inode)->i_state &= ~EXT4_STATE_JDATA;
- journal = EXT4_JOURNAL(inode);
- jbd2_journal_lock_updates(journal);
- err = jbd2_journal_flush(journal);
- jbd2_journal_unlock_updates(journal);
+ if (ei->i_reserved_data_blocks == 0) {
+ /*
+ * We can release all of the reserved metadata blocks
+ * only when we have written all of the delayed
+ * allocation blocks.
+ * Note that in case of bigalloc, i_reserved_meta_blocks,
+ * i_reserved_data_blocks, etc. refer to number of clusters.
+ */
+ percpu_counter_sub(&sbi->s_dirtyclusters_counter,
+ ei->i_reserved_meta_blocks);
+ ei->i_reserved_meta_blocks = 0;
+ ei->i_da_metadata_calc_len = 0;
+ }
- if (err)
+ /* update fs dirty data blocks counter */
+ percpu_counter_sub(&sbi->s_dirtyclusters_counter, to_free);
+
+ spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
+
+ dquot_release_reservation_block(inode, EXT4_C2B(sbi, to_free));
+}
+
+static void ext4_da_page_release_reservation(struct page *page,
+ unsigned int offset,
+ unsigned int length)
+{
+ int to_release = 0;
+ struct buffer_head *head, *bh;
+ unsigned int curr_off = 0;
+ struct inode *inode = page->mapping->host;
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ unsigned int stop = offset + length;
+ int num_clusters;
+ ext4_fsblk_t lblk;
+
+ BUG_ON(stop > PAGE_CACHE_SIZE || stop < length);
+
+ head = page_buffers(page);
+ bh = head;
+ do {
+ unsigned int next_off = curr_off + bh->b_size;
+
+ if (next_off > stop)
+ break;
+
+ if ((offset <= curr_off) && (buffer_delay(bh))) {
+ to_release++;
+ clear_buffer_delay(bh);
+ }
+ curr_off = next_off;
+ } while ((bh = bh->b_this_page) != head);
+
+ if (to_release) {
+ lblk = page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
+ ext4_es_remove_extent(inode, lblk, to_release);
+ }
+
+ /* If we have released all the blocks belonging to a cluster, then we
+ * need to release the reserved space for that cluster. */
+ num_clusters = EXT4_NUM_B2C(sbi, to_release);
+ while (num_clusters > 0) {
+ lblk = (page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits)) +
+ ((num_clusters - 1) << sbi->s_cluster_bits);
+ if (sbi->s_cluster_ratio == 1 ||
+ !ext4_find_delalloc_cluster(inode, lblk))
+ ext4_da_release_space(inode, 1);
+
+ num_clusters--;
+ }
+}
+
+/*
+ * Delayed allocation stuff
+ */
+
+struct mpage_da_data {
+ struct inode *inode;
+ struct writeback_control *wbc;
+
+ pgoff_t first_page; /* The first page to write */
+ pgoff_t next_page; /* Current page to examine */
+ pgoff_t last_page; /* Last page to examine */
+ /*
+ * Extent to map - this can be after first_page because that can be
+ * fully mapped. We somewhat abuse m_flags to store whether the extent
+ * is delalloc or unwritten.
+ */
+ struct ext4_map_blocks map;
+ struct ext4_io_submit io_submit; /* IO submission data */
+};
+
+static void mpage_release_unused_pages(struct mpage_da_data *mpd,
+ bool invalidate)
+{
+ int nr_pages, i;
+ pgoff_t index, end;
+ struct pagevec pvec;
+ struct inode *inode = mpd->inode;
+ struct address_space *mapping = inode->i_mapping;
+
+ /* This is necessary when next_page == 0. */
+ if (mpd->first_page >= mpd->next_page)
+ return;
+
+ index = mpd->first_page;
+ end = mpd->next_page - 1;
+ if (invalidate) {
+ ext4_lblk_t start, last;
+ start = index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
+ last = end << (PAGE_CACHE_SHIFT - inode->i_blkbits);
+ ext4_es_remove_extent(inode, start, last - start + 1);
+ }
+
+ pagevec_init(&pvec, 0);
+ while (index <= end) {
+ nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
+ if (nr_pages == 0)
+ break;
+ for (i = 0; i < nr_pages; i++) {
+ struct page *page = pvec.pages[i];
+ if (page->index > end)
+ break;
+ BUG_ON(!PageLocked(page));
+ BUG_ON(PageWriteback(page));
+ if (invalidate) {
+ block_invalidatepage(page, 0, PAGE_CACHE_SIZE);
+ ClearPageUptodate(page);
+ }
+ unlock_page(page);
+ }
+ index = pvec.pages[nr_pages - 1]->index + 1;
+ pagevec_release(&pvec);
+ }
+}
+
+static void ext4_print_free_blocks(struct inode *inode)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ struct super_block *sb = inode->i_sb;
+ struct ext4_inode_info *ei = EXT4_I(inode);
+
+ ext4_msg(sb, KERN_CRIT, "Total free blocks count %lld",
+ EXT4_C2B(EXT4_SB(inode->i_sb),
+ ext4_count_free_clusters(sb)));
+ ext4_msg(sb, KERN_CRIT, "Free/Dirty block details");
+ ext4_msg(sb, KERN_CRIT, "free_blocks=%lld",
+ (long long) EXT4_C2B(EXT4_SB(sb),
+ percpu_counter_sum(&sbi->s_freeclusters_counter)));
+ ext4_msg(sb, KERN_CRIT, "dirty_blocks=%lld",
+ (long long) EXT4_C2B(EXT4_SB(sb),
+ percpu_counter_sum(&sbi->s_dirtyclusters_counter)));
+ ext4_msg(sb, KERN_CRIT, "Block reservation details");
+ ext4_msg(sb, KERN_CRIT, "i_reserved_data_blocks=%u",
+ ei->i_reserved_data_blocks);
+ ext4_msg(sb, KERN_CRIT, "i_reserved_meta_blocks=%u",
+ ei->i_reserved_meta_blocks);
+ ext4_msg(sb, KERN_CRIT, "i_allocated_meta_blocks=%u",
+ ei->i_allocated_meta_blocks);
+ return;
+}
+
+static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh)
+{
+ return (buffer_delay(bh) || buffer_unwritten(bh)) && buffer_dirty(bh);
+}
+
+/*
+ * This function is grabs code from the very beginning of
+ * ext4_map_blocks, but assumes that the caller is from delayed write
+ * time. This function looks up the requested blocks and sets the
+ * buffer delay bit under the protection of i_data_sem.
+ */
+static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
+ struct ext4_map_blocks *map,
+ struct buffer_head *bh)
+{
+ struct extent_status es;
+ int retval;
+ sector_t invalid_block = ~((sector_t) 0xffff);
+#ifdef ES_AGGRESSIVE_TEST
+ struct ext4_map_blocks orig_map;
+
+ memcpy(&orig_map, map, sizeof(*map));
+#endif
+
+ if (invalid_block < ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es))
+ invalid_block = ~0;
+
+ map->m_flags = 0;
+ ext_debug("ext4_da_map_blocks(): inode %lu, max_blocks %u,"
+ "logical block %lu\n", inode->i_ino, map->m_len,
+ (unsigned long) map->m_lblk);
+
+ /* Lookup extent status tree firstly */
+ if (ext4_es_lookup_extent(inode, iblock, &es)) {
+ ext4_es_lru_add(inode);
+ if (ext4_es_is_hole(&es)) {
+ retval = 0;
+ down_read(&EXT4_I(inode)->i_data_sem);
+ goto add_delayed;
+ }
+
+ /*
+ * Delayed extent could be allocated by fallocate.
+ * So we need to check it.
+ */
+ if (ext4_es_is_delayed(&es) && !ext4_es_is_unwritten(&es)) {
+ map_bh(bh, inode->i_sb, invalid_block);
+ set_buffer_new(bh);
+ set_buffer_delay(bh);
return 0;
+ }
+
+ map->m_pblk = ext4_es_pblock(&es) + iblock - es.es_lblk;
+ retval = es.es_len - (iblock - es.es_lblk);
+ if (retval > map->m_len)
+ retval = map->m_len;
+ map->m_len = retval;
+ if (ext4_es_is_written(&es))
+ map->m_flags |= EXT4_MAP_MAPPED;
+ else if (ext4_es_is_unwritten(&es))
+ map->m_flags |= EXT4_MAP_UNWRITTEN;
+ else
+ BUG_ON(1);
+
+#ifdef ES_AGGRESSIVE_TEST
+ ext4_map_blocks_es_recheck(NULL, inode, map, &orig_map, 0);
+#endif
+ return retval;
}
- return generic_block_bmap(mapping,block,ext4_get_block);
+ /*
+ * Try to see if we can get the block without requesting a new
+ * file system block.
+ */
+ down_read(&EXT4_I(inode)->i_data_sem);
+ if (ext4_has_inline_data(inode)) {
+ /*
+ * We will soon create blocks for this page, and let
+ * us pretend as if the blocks aren't allocated yet.
+ * In case of clusters, we have to handle the work
+ * of mapping from cluster so that the reserved space
+ * is calculated properly.
+ */
+ if ((EXT4_SB(inode->i_sb)->s_cluster_ratio > 1) &&
+ ext4_find_delalloc_cluster(inode, map->m_lblk))
+ map->m_flags |= EXT4_MAP_FROM_CLUSTER;
+ retval = 0;
+ } else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
+ retval = ext4_ext_map_blocks(NULL, inode, map,
+ EXT4_GET_BLOCKS_NO_PUT_HOLE);
+ else
+ retval = ext4_ind_map_blocks(NULL, inode, map,
+ EXT4_GET_BLOCKS_NO_PUT_HOLE);
+
+add_delayed:
+ if (retval == 0) {
+ int ret;
+ /*
+ * XXX: __block_prepare_write() unmaps passed block,
+ * is it OK?
+ */
+ /*
+ * If the block was allocated from previously allocated cluster,
+ * then we don't need to reserve it again. However we still need
+ * to reserve metadata for every block we're going to write.
+ */
+ if (!(map->m_flags & EXT4_MAP_FROM_CLUSTER)) {
+ ret = ext4_da_reserve_space(inode, iblock);
+ if (ret) {
+ /* not enough space to reserve */
+ retval = ret;
+ goto out_unlock;
+ }
+ } else {
+ ret = ext4_da_reserve_metadata(inode, iblock);
+ if (ret) {
+ /* not enough space to reserve */
+ retval = ret;
+ goto out_unlock;
+ }
+ }
+
+ ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
+ ~0, EXTENT_STATUS_DELAYED);
+ if (ret) {
+ retval = ret;
+ goto out_unlock;
+ }
+
+ /* Clear EXT4_MAP_FROM_CLUSTER flag since its purpose is served
+ * and it should not appear on the bh->b_state.
+ */
+ map->m_flags &= ~EXT4_MAP_FROM_CLUSTER;
+
+ map_bh(bh, inode->i_sb, invalid_block);
+ set_buffer_new(bh);
+ set_buffer_delay(bh);
+ } else if (retval > 0) {
+ int ret;
+ unsigned int status;
+
+ if (unlikely(retval != map->m_len)) {
+ ext4_warning(inode->i_sb,
+ "ES len assertion failed for inode "
+ "%lu: retval %d != map->m_len %d",
+ inode->i_ino, retval, map->m_len);
+ WARN_ON(1);
+ }
+
+ status = map->m_flags & EXT4_MAP_UNWRITTEN ?
+ EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
+ ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
+ map->m_pblk, status);
+ if (ret != 0)
+ retval = ret;
+ }
+
+out_unlock:
+ up_read((&EXT4_I(inode)->i_data_sem));
+
+ return retval;
+}
+
+/*
+ * This is a special get_blocks_t callback which is used by
+ * ext4_da_write_begin(). It will either return mapped block or
+ * reserve space for a single block.
+ *
+ * For delayed buffer_head we have BH_Mapped, BH_New, BH_Delay set.
+ * We also have b_blocknr = -1 and b_bdev initialized properly
+ *
+ * For unwritten buffer_head we have BH_Mapped, BH_New, BH_Unwritten set.
+ * We also have b_blocknr = physicalblock mapping unwritten extent and b_bdev
+ * initialized properly.
+ */
+int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh, int create)
+{
+ struct ext4_map_blocks map;
+ int ret = 0;
+
+ BUG_ON(create == 0);
+ BUG_ON(bh->b_size != inode->i_sb->s_blocksize);
+
+ map.m_lblk = iblock;
+ map.m_len = 1;
+
+ /*
+ * first, we need to know whether the block is allocated already
+ * preallocated blocks are unmapped but should treated
+ * the same as allocated blocks.
+ */
+ ret = ext4_da_map_blocks(inode, iblock, &map, bh);
+ if (ret <= 0)
+ return ret;
+
+ map_bh(bh, inode->i_sb, map.m_pblk);
+ bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags;
+
+ if (buffer_unwritten(bh)) {
+ /* A delayed write to unwritten bh should be marked
+ * new and mapped. Mapped ensures that we don't do
+ * get_block multiple times when we write to the same
+ * offset and new ensures that we do proper zero out
+ * for partial write.
+ */
+ set_buffer_new(bh);
+ set_buffer_mapped(bh);
+ }
+ return 0;
}
static int bget_one(handle_t *handle, struct buffer_head *bh)
@@ -1450,338 +1731,1527 @@ static int bput_one(handle_t *handle, struct buffer_head *bh)
return 0;
}
-static int jbd2_journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh)
+static int __ext4_journalled_writepage(struct page *page,
+ unsigned int len)
{
- if (buffer_mapped(bh))
- return ext4_journal_dirty_data(handle, bh);
- return 0;
+ struct address_space *mapping = page->mapping;
+ struct inode *inode = mapping->host;
+ struct buffer_head *page_bufs = NULL;
+ handle_t *handle = NULL;
+ int ret = 0, err = 0;
+ int inline_data = ext4_has_inline_data(inode);
+ struct buffer_head *inode_bh = NULL;
+
+ ClearPageChecked(page);
+
+ if (inline_data) {
+ BUG_ON(page->index != 0);
+ BUG_ON(len > ext4_get_max_inline_size(inode));
+ inode_bh = ext4_journalled_write_inline_data(inode, len, page);
+ if (inode_bh == NULL)
+ goto out;
+ } else {
+ page_bufs = page_buffers(page);
+ if (!page_bufs) {
+ BUG();
+ goto out;
+ }
+ ext4_walk_page_buffers(handle, page_bufs, 0, len,
+ NULL, bget_one);
+ }
+ /* As soon as we unlock the page, it can go away, but we have
+ * references to buffers so we are safe */
+ unlock_page(page);
+
+ handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE,
+ ext4_writepage_trans_blocks(inode));
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ goto out;
+ }
+
+ BUG_ON(!ext4_handle_valid(handle));
+
+ if (inline_data) {
+ BUFFER_TRACE(inode_bh, "get write access");
+ ret = ext4_journal_get_write_access(handle, inode_bh);
+
+ err = ext4_handle_dirty_metadata(handle, inode, inode_bh);
+
+ } else {
+ ret = ext4_walk_page_buffers(handle, page_bufs, 0, len, NULL,
+ do_journal_get_write_access);
+
+ err = ext4_walk_page_buffers(handle, page_bufs, 0, len, NULL,
+ write_end_fn);
+ }
+ if (ret == 0)
+ ret = err;
+ EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid;
+ err = ext4_journal_stop(handle);
+ if (!ret)
+ ret = err;
+
+ if (!ext4_has_inline_data(inode))
+ ext4_walk_page_buffers(NULL, page_bufs, 0, len,
+ NULL, bput_one);
+ ext4_set_inode_state(inode, EXT4_STATE_JDATA);
+out:
+ brelse(inode_bh);
+ return ret;
}
/*
- * Note that we always start a transaction even if we're not journalling
- * data. This is to preserve ordering: any hole instantiation within
- * __block_write_full_page -> ext4_get_block() should be journalled
- * along with the data so we don't crash and then get metadata which
- * refers to old data.
+ * Note that we don't need to start a transaction unless we're journaling data
+ * because we should have holes filled from ext4_page_mkwrite(). We even don't
+ * need to file the inode to the transaction's list in ordered mode because if
+ * we are writing back data added by write(), the inode is already there and if
+ * we are writing back data modified via mmap(), no one guarantees in which
+ * transaction the data will hit the disk. In case we are journaling data, we
+ * cannot start transaction directly because transaction start ranks above page
+ * lock so we have to do some magic.
+ *
+ * This function can get called via...
+ * - ext4_writepages after taking page lock (have journal handle)
+ * - journal_submit_inode_data_buffers (no journal handle)
+ * - shrink_page_list via the kswapd/direct reclaim (no journal handle)
+ * - grab_page_cache when doing write_begin (have journal handle)
*
- * In all journalling modes block_write_full_page() will start the I/O.
+ * We don't do any block allocation in this function. If we have page with
+ * multiple blocks we need to write those buffer_heads that are mapped. This
+ * is important for mmaped based write. So if we do with blocksize 1K
+ * truncate(f, 1024);
+ * a = mmap(f, 0, 4096);
+ * a[0] = 'a';
+ * truncate(f, 4096);
+ * we have in the page first buffer_head mapped via page_mkwrite call back
+ * but other buffer_heads would be unmapped but dirty (dirty done via the
+ * do_wp_page). So writepage should write the first block. If we modify
+ * the mmap area beyond 1024 we will again get a page_fault and the
+ * page_mkwrite callback will do the block allocation and mark the
+ * buffer_heads mapped.
*
- * Problem:
+ * We redirty the page if we have any buffer_heads that is either delay or
+ * unwritten in the page.
+ *
+ * We can get recursively called as show below.
*
* ext4_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() ->
* ext4_writepage()
*
- * Similar for:
+ * But since we don't do any block allocation we should not deadlock.
+ * Page also have the dirty flag cleared so we don't get recurive page_lock.
+ */
+static int ext4_writepage(struct page *page,
+ struct writeback_control *wbc)
+{
+ int ret = 0;
+ loff_t size;
+ unsigned int len;
+ struct buffer_head *page_bufs = NULL;
+ struct inode *inode = page->mapping->host;
+ struct ext4_io_submit io_submit;
+ bool keep_towrite = false;
+
+ trace_ext4_writepage(page);
+ size = i_size_read(inode);
+ if (page->index == size >> PAGE_CACHE_SHIFT)
+ len = size & ~PAGE_CACHE_MASK;
+ else
+ len = PAGE_CACHE_SIZE;
+
+ page_bufs = page_buffers(page);
+ /*
+ * We cannot do block allocation or other extent handling in this
+ * function. If there are buffers needing that, we have to redirty
+ * the page. But we may reach here when we do a journal commit via
+ * journal_submit_inode_data_buffers() and in that case we must write
+ * allocated buffers to achieve data=ordered mode guarantees.
+ */
+ if (ext4_walk_page_buffers(NULL, page_bufs, 0, len, NULL,
+ ext4_bh_delay_or_unwritten)) {
+ redirty_page_for_writepage(wbc, page);
+ if (current->flags & PF_MEMALLOC) {
+ /*
+ * For memory cleaning there's no point in writing only
+ * some buffers. So just bail out. Warn if we came here
+ * from direct reclaim.
+ */
+ WARN_ON_ONCE((current->flags & (PF_MEMALLOC|PF_KSWAPD))
+ == PF_MEMALLOC);
+ unlock_page(page);
+ return 0;
+ }
+ keep_towrite = true;
+ }
+
+ if (PageChecked(page) && ext4_should_journal_data(inode))
+ /*
+ * It's mmapped pagecache. Add buffers and journal it. There
+ * doesn't seem much point in redirtying the page here.
+ */
+ return __ext4_journalled_writepage(page, len);
+
+ ext4_io_submit_init(&io_submit, wbc);
+ io_submit.io_end = ext4_init_io_end(inode, GFP_NOFS);
+ if (!io_submit.io_end) {
+ redirty_page_for_writepage(wbc, page);
+ unlock_page(page);
+ return -ENOMEM;
+ }
+ ret = ext4_bio_write_page(&io_submit, page, len, wbc, keep_towrite);
+ ext4_io_submit(&io_submit);
+ /* Drop io_end reference we got from init */
+ ext4_put_io_end_defer(io_submit.io_end);
+ return ret;
+}
+
+static int mpage_submit_page(struct mpage_da_data *mpd, struct page *page)
+{
+ int len;
+ loff_t size = i_size_read(mpd->inode);
+ int err;
+
+ BUG_ON(page->index != mpd->first_page);
+ if (page->index == size >> PAGE_CACHE_SHIFT)
+ len = size & ~PAGE_CACHE_MASK;
+ else
+ len = PAGE_CACHE_SIZE;
+ clear_page_dirty_for_io(page);
+ err = ext4_bio_write_page(&mpd->io_submit, page, len, mpd->wbc, false);
+ if (!err)
+ mpd->wbc->nr_to_write--;
+ mpd->first_page++;
+
+ return err;
+}
+
+#define BH_FLAGS ((1 << BH_Unwritten) | (1 << BH_Delay))
+
+/*
+ * mballoc gives us at most this number of blocks...
+ * XXX: That seems to be only a limitation of ext4_mb_normalize_request().
+ * The rest of mballoc seems to handle chunks up to full group size.
+ */
+#define MAX_WRITEPAGES_EXTENT_LEN 2048
+
+/*
+ * mpage_add_bh_to_extent - try to add bh to extent of blocks to map
+ *
+ * @mpd - extent of blocks
+ * @lblk - logical number of the block in the file
+ * @bh - buffer head we want to add to the extent
*
- * ext4_file_write() -> generic_file_write() -> __alloc_pages() -> ...
+ * The function is used to collect contig. blocks in the same state. If the
+ * buffer doesn't require mapping for writeback and we haven't started the
+ * extent of buffers to map yet, the function returns 'true' immediately - the
+ * caller can write the buffer right away. Otherwise the function returns true
+ * if the block has been added to the extent, false if the block couldn't be
+ * added.
+ */
+static bool mpage_add_bh_to_extent(struct mpage_da_data *mpd, ext4_lblk_t lblk,
+ struct buffer_head *bh)
+{
+ struct ext4_map_blocks *map = &mpd->map;
+
+ /* Buffer that doesn't need mapping for writeback? */
+ if (!buffer_dirty(bh) || !buffer_mapped(bh) ||
+ (!buffer_delay(bh) && !buffer_unwritten(bh))) {
+ /* So far no extent to map => we write the buffer right away */
+ if (map->m_len == 0)
+ return true;
+ return false;
+ }
+
+ /* First block in the extent? */
+ if (map->m_len == 0) {
+ map->m_lblk = lblk;
+ map->m_len = 1;
+ map->m_flags = bh->b_state & BH_FLAGS;
+ return true;
+ }
+
+ /* Don't go larger than mballoc is willing to allocate */
+ if (map->m_len >= MAX_WRITEPAGES_EXTENT_LEN)
+ return false;
+
+ /* Can we merge the block to our big extent? */
+ if (lblk == map->m_lblk + map->m_len &&
+ (bh->b_state & BH_FLAGS) == map->m_flags) {
+ map->m_len++;
+ return true;
+ }
+ return false;
+}
+
+/*
+ * mpage_process_page_bufs - submit page buffers for IO or add them to extent
*
- * Same applies to ext4_get_block(). We will deadlock on various things like
- * lock_journal and i_data_sem
+ * @mpd - extent of blocks for mapping
+ * @head - the first buffer in the page
+ * @bh - buffer we should start processing from
+ * @lblk - logical number of the block in the file corresponding to @bh
*
- * Setting PF_MEMALLOC here doesn't work - too many internal memory
- * allocations fail.
+ * Walk through page buffers from @bh upto @head (exclusive) and either submit
+ * the page for IO if all buffers in this page were mapped and there's no
+ * accumulated extent of buffers to map or add buffers in the page to the
+ * extent of buffers to map. The function returns 1 if the caller can continue
+ * by processing the next page, 0 if it should stop adding buffers to the
+ * extent to map because we cannot extend it anymore. It can also return value
+ * < 0 in case of error during IO submission.
+ */
+static int mpage_process_page_bufs(struct mpage_da_data *mpd,
+ struct buffer_head *head,
+ struct buffer_head *bh,
+ ext4_lblk_t lblk)
+{
+ struct inode *inode = mpd->inode;
+ int err;
+ ext4_lblk_t blocks = (i_size_read(inode) + (1 << inode->i_blkbits) - 1)
+ >> inode->i_blkbits;
+
+ do {
+ BUG_ON(buffer_locked(bh));
+
+ if (lblk >= blocks || !mpage_add_bh_to_extent(mpd, lblk, bh)) {
+ /* Found extent to map? */
+ if (mpd->map.m_len)
+ return 0;
+ /* Everything mapped so far and we hit EOF */
+ break;
+ }
+ } while (lblk++, (bh = bh->b_this_page) != head);
+ /* So far everything mapped? Submit the page for IO. */
+ if (mpd->map.m_len == 0) {
+ err = mpage_submit_page(mpd, head->b_page);
+ if (err < 0)
+ return err;
+ }
+ return lblk < blocks;
+}
+
+/*
+ * mpage_map_buffers - update buffers corresponding to changed extent and
+ * submit fully mapped pages for IO
*
- * 16May01: If we're reentered then journal_current_handle() will be
- * non-zero. We simply *return*.
+ * @mpd - description of extent to map, on return next extent to map
*
- * 1 July 2001: @@@ FIXME:
- * In journalled data mode, a data buffer may be metadata against the
- * current transaction. But the same file is part of a shared mapping
- * and someone does a writepage() on it.
+ * Scan buffers corresponding to changed extent (we expect corresponding pages
+ * to be already locked) and update buffer state according to new extent state.
+ * We map delalloc buffers to their physical location, clear unwritten bits,
+ * and mark buffers as uninit when we perform writes to unwritten extents
+ * and do extent conversion after IO is finished. If the last page is not fully
+ * mapped, we update @map to the next extent in the last page that needs
+ * mapping. Otherwise we submit the page for IO.
+ */
+static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd)
+{
+ struct pagevec pvec;
+ int nr_pages, i;
+ struct inode *inode = mpd->inode;
+ struct buffer_head *head, *bh;
+ int bpp_bits = PAGE_CACHE_SHIFT - inode->i_blkbits;
+ pgoff_t start, end;
+ ext4_lblk_t lblk;
+ sector_t pblock;
+ int err;
+
+ start = mpd->map.m_lblk >> bpp_bits;
+ end = (mpd->map.m_lblk + mpd->map.m_len - 1) >> bpp_bits;
+ lblk = start << bpp_bits;
+ pblock = mpd->map.m_pblk;
+
+ pagevec_init(&pvec, 0);
+ while (start <= end) {
+ nr_pages = pagevec_lookup(&pvec, inode->i_mapping, start,
+ PAGEVEC_SIZE);
+ if (nr_pages == 0)
+ break;
+ for (i = 0; i < nr_pages; i++) {
+ struct page *page = pvec.pages[i];
+
+ if (page->index > end)
+ break;
+ /* Up to 'end' pages must be contiguous */
+ BUG_ON(page->index != start);
+ bh = head = page_buffers(page);
+ do {
+ if (lblk < mpd->map.m_lblk)
+ continue;
+ if (lblk >= mpd->map.m_lblk + mpd->map.m_len) {
+ /*
+ * Buffer after end of mapped extent.
+ * Find next buffer in the page to map.
+ */
+ mpd->map.m_len = 0;
+ mpd->map.m_flags = 0;
+ /*
+ * FIXME: If dioread_nolock supports
+ * blocksize < pagesize, we need to make
+ * sure we add size mapped so far to
+ * io_end->size as the following call
+ * can submit the page for IO.
+ */
+ err = mpage_process_page_bufs(mpd, head,
+ bh, lblk);
+ pagevec_release(&pvec);
+ if (err > 0)
+ err = 0;
+ return err;
+ }
+ if (buffer_delay(bh)) {
+ clear_buffer_delay(bh);
+ bh->b_blocknr = pblock++;
+ }
+ clear_buffer_unwritten(bh);
+ } while (lblk++, (bh = bh->b_this_page) != head);
+
+ /*
+ * FIXME: This is going to break if dioread_nolock
+ * supports blocksize < pagesize as we will try to
+ * convert potentially unmapped parts of inode.
+ */
+ mpd->io_submit.io_end->size += PAGE_CACHE_SIZE;
+ /* Page fully mapped - let IO run! */
+ err = mpage_submit_page(mpd, page);
+ if (err < 0) {
+ pagevec_release(&pvec);
+ return err;
+ }
+ start++;
+ }
+ pagevec_release(&pvec);
+ }
+ /* Extent fully mapped and matches with page boundary. We are done. */
+ mpd->map.m_len = 0;
+ mpd->map.m_flags = 0;
+ return 0;
+}
+
+static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd)
+{
+ struct inode *inode = mpd->inode;
+ struct ext4_map_blocks *map = &mpd->map;
+ int get_blocks_flags;
+ int err, dioread_nolock;
+
+ trace_ext4_da_write_pages_extent(inode, map);
+ /*
+ * Call ext4_map_blocks() to allocate any delayed allocation blocks, or
+ * to convert an unwritten extent to be initialized (in the case
+ * where we have written into one or more preallocated blocks). It is
+ * possible that we're going to need more metadata blocks than
+ * previously reserved. However we must not fail because we're in
+ * writeback and there is nothing we can do about it so it might result
+ * in data loss. So use reserved blocks to allocate metadata if
+ * possible.
+ *
+ * We pass in the magic EXT4_GET_BLOCKS_DELALLOC_RESERVE if the blocks
+ * in question are delalloc blocks. This affects functions in many
+ * different parts of the allocation call path. This flag exists
+ * primarily because we don't want to change *many* call functions, so
+ * ext4_map_blocks() will set the EXT4_STATE_DELALLOC_RESERVED flag
+ * once the inode's allocation semaphore is taken.
+ */
+ get_blocks_flags = EXT4_GET_BLOCKS_CREATE |
+ EXT4_GET_BLOCKS_METADATA_NOFAIL;
+ dioread_nolock = ext4_should_dioread_nolock(inode);
+ if (dioread_nolock)
+ get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT;
+ if (map->m_flags & (1 << BH_Delay))
+ get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE;
+
+ err = ext4_map_blocks(handle, inode, map, get_blocks_flags);
+ if (err < 0)
+ return err;
+ if (dioread_nolock && (map->m_flags & EXT4_MAP_UNWRITTEN)) {
+ if (!mpd->io_submit.io_end->handle &&
+ ext4_handle_valid(handle)) {
+ mpd->io_submit.io_end->handle = handle->h_rsv_handle;
+ handle->h_rsv_handle = NULL;
+ }
+ ext4_set_io_unwritten_flag(inode, mpd->io_submit.io_end);
+ }
+
+ BUG_ON(map->m_len == 0);
+ if (map->m_flags & EXT4_MAP_NEW) {
+ struct block_device *bdev = inode->i_sb->s_bdev;
+ int i;
+
+ for (i = 0; i < map->m_len; i++)
+ unmap_underlying_metadata(bdev, map->m_pblk + i);
+ }
+ return 0;
+}
+
+/*
+ * mpage_map_and_submit_extent - map extent starting at mpd->lblk of length
+ * mpd->len and submit pages underlying it for IO
*
- * We will move the buffer onto the async_data list, but *after* it has
- * been dirtied. So there's a small window where we have dirty data on
- * BJ_Metadata.
+ * @handle - handle for journal operations
+ * @mpd - extent to map
+ * @give_up_on_write - we set this to true iff there is a fatal error and there
+ * is no hope of writing the data. The caller should discard
+ * dirty pages to avoid infinite loops.
*
- * Note that this only applies to the last partial page in the file. The
- * bit which block_write_full_page() uses prepare/commit for. (That's
- * broken code anyway: it's wrong for msync()).
+ * The function maps extent starting at mpd->lblk of length mpd->len. If it is
+ * delayed, blocks are allocated, if it is unwritten, we may need to convert
+ * them to initialized or split the described range from larger unwritten
+ * extent. Note that we need not map all the described range since allocation
+ * can return less blocks or the range is covered by more unwritten extents. We
+ * cannot map more because we are limited by reserved transaction credits. On
+ * the other hand we always make sure that the last touched page is fully
+ * mapped so that it can be written out (and thus forward progress is
+ * guaranteed). After mapping we submit all mapped pages for IO.
+ */
+static int mpage_map_and_submit_extent(handle_t *handle,
+ struct mpage_da_data *mpd,
+ bool *give_up_on_write)
+{
+ struct inode *inode = mpd->inode;
+ struct ext4_map_blocks *map = &mpd->map;
+ int err;
+ loff_t disksize;
+
+ mpd->io_submit.io_end->offset =
+ ((loff_t)map->m_lblk) << inode->i_blkbits;
+ do {
+ err = mpage_map_one_extent(handle, mpd);
+ if (err < 0) {
+ struct super_block *sb = inode->i_sb;
+
+ if (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)
+ goto invalidate_dirty_pages;
+ /*
+ * Let the uper layers retry transient errors.
+ * In the case of ENOSPC, if ext4_count_free_blocks()
+ * is non-zero, a commit should free up blocks.
+ */
+ if ((err == -ENOMEM) ||
+ (err == -ENOSPC && ext4_count_free_clusters(sb)))
+ return err;
+ ext4_msg(sb, KERN_CRIT,
+ "Delayed block allocation failed for "
+ "inode %lu at logical offset %llu with"
+ " max blocks %u with error %d",
+ inode->i_ino,
+ (unsigned long long)map->m_lblk,
+ (unsigned)map->m_len, -err);
+ ext4_msg(sb, KERN_CRIT,
+ "This should not happen!! Data will "
+ "be lost\n");
+ if (err == -ENOSPC)
+ ext4_print_free_blocks(inode);
+ invalidate_dirty_pages:
+ *give_up_on_write = true;
+ return err;
+ }
+ /*
+ * Update buffer state, submit mapped pages, and get us new
+ * extent to map
+ */
+ err = mpage_map_and_submit_buffers(mpd);
+ if (err < 0)
+ return err;
+ } while (map->m_len);
+
+ /*
+ * Update on-disk size after IO is submitted. Races with
+ * truncate are avoided by checking i_size under i_data_sem.
+ */
+ disksize = ((loff_t)mpd->first_page) << PAGE_CACHE_SHIFT;
+ if (disksize > EXT4_I(inode)->i_disksize) {
+ int err2;
+ loff_t i_size;
+
+ down_write(&EXT4_I(inode)->i_data_sem);
+ i_size = i_size_read(inode);
+ if (disksize > i_size)
+ disksize = i_size;
+ if (disksize > EXT4_I(inode)->i_disksize)
+ EXT4_I(inode)->i_disksize = disksize;
+ err2 = ext4_mark_inode_dirty(handle, inode);
+ up_write(&EXT4_I(inode)->i_data_sem);
+ if (err2)
+ ext4_error(inode->i_sb,
+ "Failed to mark inode %lu dirty",
+ inode->i_ino);
+ if (!err)
+ err = err2;
+ }
+ return err;
+}
+
+/*
+ * Calculate the total number of credits to reserve for one writepages
+ * iteration. This is called from ext4_writepages(). We map an extent of
+ * up to MAX_WRITEPAGES_EXTENT_LEN blocks and then we go on and finish mapping
+ * the last partial page. So in total we can map MAX_WRITEPAGES_EXTENT_LEN +
+ * bpp - 1 blocks in bpp different extents.
+ */
+static int ext4_da_writepages_trans_blocks(struct inode *inode)
+{
+ int bpp = ext4_journal_blocks_per_page(inode);
+
+ return ext4_meta_trans_blocks(inode,
+ MAX_WRITEPAGES_EXTENT_LEN + bpp - 1, bpp);
+}
+
+/*
+ * mpage_prepare_extent_to_map - find & lock contiguous range of dirty pages
+ * and underlying extent to map
*
- * It's a rare case: affects the final partial page, for journalled data
- * where the file is subject to bith write() and writepage() in the same
- * transction. To fix it we'll need a custom block_write_full_page().
- * We'll probably need that anyway for journalling writepage() output.
+ * @mpd - where to look for pages
*
- * We don't honour synchronous mounts for writepage(). That would be
- * disastrous. Any write() or metadata operation will sync the fs for
- * us.
+ * Walk dirty pages in the mapping. If they are fully mapped, submit them for
+ * IO immediately. When we find a page which isn't mapped we start accumulating
+ * extent of buffers underlying these pages that needs mapping (formed by
+ * either delayed or unwritten buffers). We also lock the pages containing
+ * these buffers. The extent found is returned in @mpd structure (starting at
+ * mpd->lblk with length mpd->len blocks).
*
- * AKPM2: if all the page's buffers are mapped to disk and !data=journal,
- * we don't need to open a transaction here.
+ * Note that this function can attach bios to one io_end structure which are
+ * neither logically nor physically contiguous. Although it may seem as an
+ * unnecessary complication, it is actually inevitable in blocksize < pagesize
+ * case as we need to track IO to all buffers underlying a page in one io_end.
*/
-static int ext4_ordered_writepage(struct page *page,
- struct writeback_control *wbc)
+static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd)
{
- struct inode *inode = page->mapping->host;
- struct buffer_head *page_bufs;
+ struct address_space *mapping = mpd->inode->i_mapping;
+ struct pagevec pvec;
+ unsigned int nr_pages;
+ long left = mpd->wbc->nr_to_write;
+ pgoff_t index = mpd->first_page;
+ pgoff_t end = mpd->last_page;
+ int tag;
+ int i, err = 0;
+ int blkbits = mpd->inode->i_blkbits;
+ ext4_lblk_t lblk;
+ struct buffer_head *head;
+
+ if (mpd->wbc->sync_mode == WB_SYNC_ALL || mpd->wbc->tagged_writepages)
+ tag = PAGECACHE_TAG_TOWRITE;
+ else
+ tag = PAGECACHE_TAG_DIRTY;
+
+ pagevec_init(&pvec, 0);
+ mpd->map.m_len = 0;
+ mpd->next_page = index;
+ while (index <= end) {
+ nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
+ min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
+ if (nr_pages == 0)
+ goto out;
+
+ for (i = 0; i < nr_pages; i++) {
+ struct page *page = pvec.pages[i];
+
+ /*
+ * At this point, the page may be truncated or
+ * invalidated (changing page->mapping to NULL), or
+ * even swizzled back from swapper_space to tmpfs file
+ * mapping. However, page->index will not change
+ * because we have a reference on the page.
+ */
+ if (page->index > end)
+ goto out;
+
+ /*
+ * Accumulated enough dirty pages? This doesn't apply
+ * to WB_SYNC_ALL mode. For integrity sync we have to
+ * keep going because someone may be concurrently
+ * dirtying pages, and we might have synced a lot of
+ * newly appeared dirty pages, but have not synced all
+ * of the old dirty pages.
+ */
+ if (mpd->wbc->sync_mode == WB_SYNC_NONE && left <= 0)
+ goto out;
+
+ /* If we can't merge this page, we are done. */
+ if (mpd->map.m_len > 0 && mpd->next_page != page->index)
+ goto out;
+
+ lock_page(page);
+ /*
+ * If the page is no longer dirty, or its mapping no
+ * longer corresponds to inode we are writing (which
+ * means it has been truncated or invalidated), or the
+ * page is already under writeback and we are not doing
+ * a data integrity writeback, skip the page
+ */
+ if (!PageDirty(page) ||
+ (PageWriteback(page) &&
+ (mpd->wbc->sync_mode == WB_SYNC_NONE)) ||
+ unlikely(page->mapping != mapping)) {
+ unlock_page(page);
+ continue;
+ }
+
+ wait_on_page_writeback(page);
+ BUG_ON(PageWriteback(page));
+
+ if (mpd->map.m_len == 0)
+ mpd->first_page = page->index;
+ mpd->next_page = page->index + 1;
+ /* Add all dirty buffers to mpd */
+ lblk = ((ext4_lblk_t)page->index) <<
+ (PAGE_CACHE_SHIFT - blkbits);
+ head = page_buffers(page);
+ err = mpage_process_page_bufs(mpd, head, head, lblk);
+ if (err <= 0)
+ goto out;
+ err = 0;
+ left--;
+ }
+ pagevec_release(&pvec);
+ cond_resched();
+ }
+ return 0;
+out:
+ pagevec_release(&pvec);
+ return err;
+}
+
+static int __writepage(struct page *page, struct writeback_control *wbc,
+ void *data)
+{
+ struct address_space *mapping = data;
+ int ret = ext4_writepage(page, wbc);
+ mapping_set_error(mapping, ret);
+ return ret;
+}
+
+static int ext4_writepages(struct address_space *mapping,
+ struct writeback_control *wbc)
+{
+ pgoff_t writeback_index = 0;
+ long nr_to_write = wbc->nr_to_write;
+ int range_whole = 0;
+ int cycled = 1;
handle_t *handle = NULL;
- int ret = 0;
- int err;
+ struct mpage_da_data mpd;
+ struct inode *inode = mapping->host;
+ int needed_blocks, rsv_blocks = 0, ret = 0;
+ struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
+ bool done;
+ struct blk_plug plug;
+ bool give_up_on_write = false;
- J_ASSERT(PageLocked(page));
+ trace_ext4_writepages(inode, wbc);
/*
- * We give up here if we're reentered, because it might be for a
- * different filesystem.
+ * No pages to write? This is mainly a kludge to avoid starting
+ * a transaction for special inodes like journal inode on last iput()
+ * because that could violate lock ordering on umount
*/
- if (ext4_journal_current_handle())
- goto out_fail;
+ if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
+ goto out_writepages;
- handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode));
+ if (ext4_should_journal_data(inode)) {
+ struct blk_plug plug;
- if (IS_ERR(handle)) {
- ret = PTR_ERR(handle);
- goto out_fail;
+ blk_start_plug(&plug);
+ ret = write_cache_pages(mapping, wbc, __writepage, mapping);
+ blk_finish_plug(&plug);
+ goto out_writepages;
}
- if (!page_has_buffers(page)) {
- create_empty_buffers(page, inode->i_sb->s_blocksize,
- (1 << BH_Dirty)|(1 << BH_Uptodate));
+ /*
+ * If the filesystem has aborted, it is read-only, so return
+ * right away instead of dumping stack traces later on that
+ * will obscure the real source of the problem. We test
+ * EXT4_MF_FS_ABORTED instead of sb->s_flag's MS_RDONLY because
+ * the latter could be true if the filesystem is mounted
+ * read-only, and in that case, ext4_writepages should
+ * *never* be called, so if that ever happens, we would want
+ * the stack trace.
+ */
+ if (unlikely(sbi->s_mount_flags & EXT4_MF_FS_ABORTED)) {
+ ret = -EROFS;
+ goto out_writepages;
}
- page_bufs = page_buffers(page);
- walk_page_buffers(handle, page_bufs, 0,
- PAGE_CACHE_SIZE, NULL, bget_one);
- ret = block_write_full_page(page, ext4_get_block, wbc);
+ if (ext4_should_dioread_nolock(inode)) {
+ /*
+ * We may need to convert up to one extent per block in
+ * the page and we may dirty the inode.
+ */
+ rsv_blocks = 1 + (PAGE_CACHE_SIZE >> inode->i_blkbits);
+ }
/*
- * The page can become unlocked at any point now, and
- * truncate can then come in and change things. So we
- * can't touch *page from now on. But *page_bufs is
- * safe due to elevated refcount.
+ * If we have inline data and arrive here, it means that
+ * we will soon create the block for the 1st page, so
+ * we'd better clear the inline data here.
*/
+ if (ext4_has_inline_data(inode)) {
+ /* Just inode will be modified... */
+ handle = ext4_journal_start(inode, EXT4_HT_INODE, 1);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ goto out_writepages;
+ }
+ BUG_ON(ext4_test_inode_state(inode,
+ EXT4_STATE_MAY_INLINE_DATA));
+ ext4_destroy_inline_data(handle, inode);
+ ext4_journal_stop(handle);
+ }
+
+ if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
+ range_whole = 1;
+
+ if (wbc->range_cyclic) {
+ writeback_index = mapping->writeback_index;
+ if (writeback_index)
+ cycled = 0;
+ mpd.first_page = writeback_index;
+ mpd.last_page = -1;
+ } else {
+ mpd.first_page = wbc->range_start >> PAGE_CACHE_SHIFT;
+ mpd.last_page = wbc->range_end >> PAGE_CACHE_SHIFT;
+ }
+
+ mpd.inode = inode;
+ mpd.wbc = wbc;
+ ext4_io_submit_init(&mpd.io_submit, wbc);
+retry:
+ if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
+ tag_pages_for_writeback(mapping, mpd.first_page, mpd.last_page);
+ done = false;
+ blk_start_plug(&plug);
+ while (!done && mpd.first_page <= mpd.last_page) {
+ /* For each extent of pages we use new io_end */
+ mpd.io_submit.io_end = ext4_init_io_end(inode, GFP_KERNEL);
+ if (!mpd.io_submit.io_end) {
+ ret = -ENOMEM;
+ break;
+ }
+
+ /*
+ * We have two constraints: We find one extent to map and we
+ * must always write out whole page (makes a difference when
+ * blocksize < pagesize) so that we don't block on IO when we
+ * try to write out the rest of the page. Journalled mode is
+ * not supported by delalloc.
+ */
+ BUG_ON(ext4_should_journal_data(inode));
+ needed_blocks = ext4_da_writepages_trans_blocks(inode);
+
+ /* start a new transaction */
+ handle = ext4_journal_start_with_reserve(inode,
+ EXT4_HT_WRITE_PAGE, needed_blocks, rsv_blocks);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: "
+ "%ld pages, ino %lu; err %d", __func__,
+ wbc->nr_to_write, inode->i_ino, ret);
+ /* Release allocated io_end */
+ ext4_put_io_end(mpd.io_submit.io_end);
+ break;
+ }
+
+ trace_ext4_da_write_pages(inode, mpd.first_page, mpd.wbc);
+ ret = mpage_prepare_extent_to_map(&mpd);
+ if (!ret) {
+ if (mpd.map.m_len)
+ ret = mpage_map_and_submit_extent(handle, &mpd,
+ &give_up_on_write);
+ else {
+ /*
+ * We scanned the whole range (or exhausted
+ * nr_to_write), submitted what was mapped and
+ * didn't find anything needing mapping. We are
+ * done.
+ */
+ done = true;
+ }
+ }
+ ext4_journal_stop(handle);
+ /* Submit prepared bio */
+ ext4_io_submit(&mpd.io_submit);
+ /* Unlock pages we didn't use */
+ mpage_release_unused_pages(&mpd, give_up_on_write);
+ /* Drop our io_end reference we got from init */
+ ext4_put_io_end(mpd.io_submit.io_end);
+
+ if (ret == -ENOSPC && sbi->s_journal) {
+ /*
+ * Commit the transaction which would
+ * free blocks released in the transaction
+ * and try again
+ */
+ jbd2_journal_force_commit_nested(sbi->s_journal);
+ ret = 0;
+ continue;
+ }
+ /* Fatal error - ENOMEM, EIO... */
+ if (ret)
+ break;
+ }
+ blk_finish_plug(&plug);
+ if (!ret && !cycled && wbc->nr_to_write > 0) {
+ cycled = 1;
+ mpd.last_page = writeback_index - 1;
+ mpd.first_page = 0;
+ goto retry;
+ }
+
+ /* Update index */
+ if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
+ /*
+ * Set the writeback_index so that range_cyclic
+ * mode will write it back later
+ */
+ mapping->writeback_index = mpd.first_page;
+
+out_writepages:
+ trace_ext4_writepages_result(inode, wbc, ret,
+ nr_to_write - wbc->nr_to_write);
+ return ret;
+}
+
+static int ext4_nonda_switch(struct super_block *sb)
+{
+ s64 free_clusters, dirty_clusters;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
/*
- * And attach them to the current transaction. But only if
- * block_write_full_page() succeeded. Otherwise they are unmapped,
- * and generally junk.
+ * switch to non delalloc mode if we are running low
+ * on free block. The free block accounting via percpu
+ * counters can get slightly wrong with percpu_counter_batch getting
+ * accumulated on each CPU without updating global counters
+ * Delalloc need an accurate free block accounting. So switch
+ * to non delalloc when we are near to error range.
*/
- if (ret == 0) {
- err = walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE,
- NULL, jbd2_journal_dirty_data_fn);
- if (!ret)
- ret = err;
+ free_clusters =
+ percpu_counter_read_positive(&sbi->s_freeclusters_counter);
+ dirty_clusters =
+ percpu_counter_read_positive(&sbi->s_dirtyclusters_counter);
+ /*
+ * Start pushing delalloc when 1/2 of free blocks are dirty.
+ */
+ if (dirty_clusters && (free_clusters < 2 * dirty_clusters))
+ try_to_writeback_inodes_sb(sb, WB_REASON_FS_FREE_SPACE);
+
+ if (2 * free_clusters < 3 * dirty_clusters ||
+ free_clusters < (dirty_clusters + EXT4_FREECLUSTERS_WATERMARK)) {
+ /*
+ * free block count is less than 150% of dirty blocks
+ * or free blocks is less than watermark
+ */
+ return 1;
}
- walk_page_buffers(handle, page_bufs, 0,
- PAGE_CACHE_SIZE, NULL, bput_one);
- err = ext4_journal_stop(handle);
- if (!ret)
- ret = err;
- return ret;
+ return 0;
+}
-out_fail:
- redirty_page_for_writepage(wbc, page);
+static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned flags,
+ struct page **pagep, void **fsdata)
+{
+ int ret, retries = 0;
+ struct page *page;
+ pgoff_t index;
+ struct inode *inode = mapping->host;
+ handle_t *handle;
+
+ index = pos >> PAGE_CACHE_SHIFT;
+
+ if (ext4_nonda_switch(inode->i_sb)) {
+ *fsdata = (void *)FALL_BACK_TO_NONDELALLOC;
+ return ext4_write_begin(file, mapping, pos,
+ len, flags, pagep, fsdata);
+ }
+ *fsdata = (void *)0;
+ trace_ext4_da_write_begin(inode, pos, len, flags);
+
+ if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) {
+ ret = ext4_da_write_inline_data_begin(mapping, inode,
+ pos, len, flags,
+ pagep, fsdata);
+ if (ret < 0)
+ return ret;
+ if (ret == 1)
+ return 0;
+ }
+
+ /*
+ * grab_cache_page_write_begin() can take a long time if the
+ * system is thrashing due to memory pressure, or if the page
+ * is being written back. So grab it first before we start
+ * the transaction handle. This also allows us to allocate
+ * the page (if needed) without using GFP_NOFS.
+ */
+retry_grab:
+ page = grab_cache_page_write_begin(mapping, index, flags);
+ if (!page)
+ return -ENOMEM;
unlock_page(page);
+
+ /*
+ * With delayed allocation, we don't log the i_disksize update
+ * if there is delayed block allocation. But we still need
+ * to journalling the i_disksize update if writes to the end
+ * of file which has an already mapped buffer.
+ */
+retry_journal:
+ handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, 1);
+ if (IS_ERR(handle)) {
+ page_cache_release(page);
+ return PTR_ERR(handle);
+ }
+
+ lock_page(page);
+ if (page->mapping != mapping) {
+ /* The page got truncated from under us */
+ unlock_page(page);
+ page_cache_release(page);
+ ext4_journal_stop(handle);
+ goto retry_grab;
+ }
+ /* In case writeback began while the page was unlocked */
+ wait_for_stable_page(page);
+
+ ret = __block_write_begin(page, pos, len, ext4_da_get_block_prep);
+ if (ret < 0) {
+ unlock_page(page);
+ ext4_journal_stop(handle);
+ /*
+ * block_write_begin may have instantiated a few blocks
+ * outside i_size. Trim these off again. Don't need
+ * i_size_read because we hold i_mutex.
+ */
+ if (pos + len > inode->i_size)
+ ext4_truncate_failed_write(inode);
+
+ if (ret == -ENOSPC &&
+ ext4_should_retry_alloc(inode->i_sb, &retries))
+ goto retry_journal;
+
+ page_cache_release(page);
+ return ret;
+ }
+
+ *pagep = page;
return ret;
}
-static int ext4_writeback_writepage(struct page *page,
- struct writeback_control *wbc)
+/*
+ * Check if we should update i_disksize
+ * when write to the end of file but not require block allocation
+ */
+static int ext4_da_should_update_i_disksize(struct page *page,
+ unsigned long offset)
{
+ struct buffer_head *bh;
struct inode *inode = page->mapping->host;
- handle_t *handle = NULL;
- int ret = 0;
- int err;
+ unsigned int idx;
+ int i;
- if (ext4_journal_current_handle())
- goto out_fail;
+ bh = page_buffers(page);
+ idx = offset >> inode->i_blkbits;
- handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode));
- if (IS_ERR(handle)) {
- ret = PTR_ERR(handle);
- goto out_fail;
+ for (i = 0; i < idx; i++)
+ bh = bh->b_this_page;
+
+ if (!buffer_mapped(bh) || (buffer_delay(bh)) || buffer_unwritten(bh))
+ return 0;
+ return 1;
+}
+
+static int ext4_da_write_end(struct file *file,
+ struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned copied,
+ struct page *page, void *fsdata)
+{
+ struct inode *inode = mapping->host;
+ int ret = 0, ret2;
+ handle_t *handle = ext4_journal_current_handle();
+ loff_t new_i_size;
+ unsigned long start, end;
+ int write_mode = (int)(unsigned long)fsdata;
+
+ if (write_mode == FALL_BACK_TO_NONDELALLOC)
+ return ext4_write_end(file, mapping, pos,
+ len, copied, page, fsdata);
+
+ trace_ext4_da_write_end(inode, pos, len, copied);
+ start = pos & (PAGE_CACHE_SIZE - 1);
+ end = start + copied - 1;
+
+ /*
+ * generic_write_end() will run mark_inode_dirty() if i_size
+ * changes. So let's piggyback the i_disksize mark_inode_dirty
+ * into that.
+ */
+ new_i_size = pos + copied;
+ if (copied && new_i_size > EXT4_I(inode)->i_disksize) {
+ if (ext4_has_inline_data(inode) ||
+ ext4_da_should_update_i_disksize(page, end)) {
+ down_write(&EXT4_I(inode)->i_data_sem);
+ if (new_i_size > EXT4_I(inode)->i_disksize)
+ EXT4_I(inode)->i_disksize = new_i_size;
+ up_write(&EXT4_I(inode)->i_data_sem);
+ /* We need to mark inode dirty even if
+ * new_i_size is less that inode->i_size
+ * bu greater than i_disksize.(hint delalloc)
+ */
+ ext4_mark_inode_dirty(handle, inode);
+ }
}
- if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode))
- ret = nobh_writepage(page, ext4_get_block, wbc);
+ if (write_mode != CONVERT_INLINE_DATA &&
+ ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA) &&
+ ext4_has_inline_data(inode))
+ ret2 = ext4_da_write_inline_data_end(inode, pos, len, copied,
+ page);
else
- ret = block_write_full_page(page, ext4_get_block, wbc);
+ ret2 = generic_write_end(file, mapping, pos, len, copied,
+ page, fsdata);
- err = ext4_journal_stop(handle);
+ copied = ret2;
+ if (ret2 < 0)
+ ret = ret2;
+ ret2 = ext4_journal_stop(handle);
if (!ret)
- ret = err;
- return ret;
+ ret = ret2;
-out_fail:
- redirty_page_for_writepage(wbc, page);
- unlock_page(page);
- return ret;
+ return ret ? ret : copied;
}
-static int ext4_journalled_writepage(struct page *page,
- struct writeback_control *wbc)
+static void ext4_da_invalidatepage(struct page *page, unsigned int offset,
+ unsigned int length)
{
- struct inode *inode = page->mapping->host;
- handle_t *handle = NULL;
- int ret = 0;
- int err;
+ /*
+ * Drop reserved blocks
+ */
+ BUG_ON(!PageLocked(page));
+ if (!page_has_buffers(page))
+ goto out;
- if (ext4_journal_current_handle())
- goto no_write;
+ ext4_da_page_release_reservation(page, offset, length);
- handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode));
- if (IS_ERR(handle)) {
- ret = PTR_ERR(handle);
- goto no_write;
- }
+out:
+ ext4_invalidatepage(page, offset, length);
+
+ return;
+}
+
+/*
+ * Force all delayed allocation blocks to be allocated for a given inode.
+ */
+int ext4_alloc_da_blocks(struct inode *inode)
+{
+ trace_ext4_alloc_da_blocks(inode);
+
+ if (!EXT4_I(inode)->i_reserved_data_blocks &&
+ !EXT4_I(inode)->i_reserved_meta_blocks)
+ return 0;
+
+ /*
+ * We do something simple for now. The filemap_flush() will
+ * also start triggering a write of the data blocks, which is
+ * not strictly speaking necessary (and for users of
+ * laptop_mode, not even desirable). However, to do otherwise
+ * would require replicating code paths in:
+ *
+ * ext4_writepages() ->
+ * write_cache_pages() ---> (via passed in callback function)
+ * __mpage_da_writepage() -->
+ * mpage_add_bh_to_extent()
+ * mpage_da_map_blocks()
+ *
+ * The problem is that write_cache_pages(), located in
+ * mm/page-writeback.c, marks pages clean in preparation for
+ * doing I/O, which is not desirable if we're not planning on
+ * doing I/O at all.
+ *
+ * We could call write_cache_pages(), and then redirty all of
+ * the pages by calling redirty_page_for_writepage() but that
+ * would be ugly in the extreme. So instead we would need to
+ * replicate parts of the code in the above functions,
+ * simplifying them because we wouldn't actually intend to
+ * write out the pages, but rather only collect contiguous
+ * logical block extents, call the multi-block allocator, and
+ * then update the buffer heads with the block allocations.
+ *
+ * For now, though, we'll cheat by calling filemap_flush(),
+ * which will map the blocks, and start the I/O, but not
+ * actually wait for the I/O to complete.
+ */
+ return filemap_flush(inode->i_mapping);
+}
- if (!page_has_buffers(page) || PageChecked(page)) {
+/*
+ * bmap() is special. It gets used by applications such as lilo and by
+ * the swapper to find the on-disk block of a specific piece of data.
+ *
+ * Naturally, this is dangerous if the block concerned is still in the
+ * journal. If somebody makes a swapfile on an ext4 data-journaling
+ * filesystem and enables swap, then they may get a nasty shock when the
+ * data getting swapped to that swapfile suddenly gets overwritten by
+ * the original zero's written out previously to the journal and
+ * awaiting writeback in the kernel's buffer cache.
+ *
+ * So, if we see any bmap calls here on a modified, data-journaled file,
+ * take extra steps to flush any blocks which might be in the cache.
+ */
+static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
+{
+ struct inode *inode = mapping->host;
+ journal_t *journal;
+ int err;
+
+ /*
+ * We can get here for an inline file via the FIBMAP ioctl
+ */
+ if (ext4_has_inline_data(inode))
+ return 0;
+
+ if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) &&
+ test_opt(inode->i_sb, DELALLOC)) {
/*
- * It's mmapped pagecache. Add buffers and journal it. There
- * doesn't seem much point in redirtying the page here.
+ * With delalloc we want to sync the file
+ * so that we can make sure we allocate
+ * blocks for file
*/
- ClearPageChecked(page);
- ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE,
- ext4_get_block);
- if (ret != 0) {
- ext4_journal_stop(handle);
- goto out_unlock;
- }
- ret = walk_page_buffers(handle, page_buffers(page), 0,
- PAGE_CACHE_SIZE, NULL, do_journal_get_write_access);
+ filemap_write_and_wait(mapping);
+ }
- err = walk_page_buffers(handle, page_buffers(page), 0,
- PAGE_CACHE_SIZE, NULL, write_end_fn);
- if (ret == 0)
- ret = err;
- EXT4_I(inode)->i_state |= EXT4_STATE_JDATA;
- unlock_page(page);
- } else {
+ if (EXT4_JOURNAL(inode) &&
+ ext4_test_inode_state(inode, EXT4_STATE_JDATA)) {
/*
- * It may be a page full of checkpoint-mode buffers. We don't
- * really know unless we go poke around in the buffer_heads.
- * But block_write_full_page will do the right thing.
+ * This is a REALLY heavyweight approach, but the use of
+ * bmap on dirty files is expected to be extremely rare:
+ * only if we run lilo or swapon on a freshly made file
+ * do we expect this to happen.
+ *
+ * (bmap requires CAP_SYS_RAWIO so this does not
+ * represent an unprivileged user DOS attack --- we'd be
+ * in trouble if mortal users could trigger this path at
+ * will.)
+ *
+ * NB. EXT4_STATE_JDATA is not set on files other than
+ * regular files. If somebody wants to bmap a directory
+ * or symlink and gets confused because the buffer
+ * hasn't yet been flushed to disk, they deserve
+ * everything they get.
*/
- ret = block_write_full_page(page, ext4_get_block, wbc);
+
+ ext4_clear_inode_state(inode, EXT4_STATE_JDATA);
+ journal = EXT4_JOURNAL(inode);
+ jbd2_journal_lock_updates(journal);
+ err = jbd2_journal_flush(journal);
+ jbd2_journal_unlock_updates(journal);
+
+ if (err)
+ return 0;
}
- err = ext4_journal_stop(handle);
- if (!ret)
- ret = err;
-out:
- return ret;
-no_write:
- redirty_page_for_writepage(wbc, page);
-out_unlock:
- unlock_page(page);
- goto out;
+ return generic_block_bmap(mapping, block, ext4_get_block);
}
static int ext4_readpage(struct file *file, struct page *page)
{
- return mpage_readpage(page, ext4_get_block);
+ int ret = -EAGAIN;
+ struct inode *inode = page->mapping->host;
+
+ trace_ext4_readpage(page);
+
+ if (ext4_has_inline_data(inode))
+ ret = ext4_readpage_inline(inode, page);
+
+ if (ret == -EAGAIN)
+ return mpage_readpage(page, ext4_get_block);
+
+ return ret;
}
static int
ext4_readpages(struct file *file, struct address_space *mapping,
struct list_head *pages, unsigned nr_pages)
{
+ struct inode *inode = mapping->host;
+
+ /* If the file has inline data, no need to do readpages. */
+ if (ext4_has_inline_data(inode))
+ return 0;
+
return mpage_readpages(mapping, pages, nr_pages, ext4_get_block);
}
-static void ext4_invalidatepage(struct page *page, unsigned long offset)
+static void ext4_invalidatepage(struct page *page, unsigned int offset,
+ unsigned int length)
+{
+ trace_ext4_invalidatepage(page, offset, length);
+
+ /* No journalling happens on data buffers when this function is used */
+ WARN_ON(page_has_buffers(page) && buffer_jbd(page_buffers(page)));
+
+ block_invalidatepage(page, offset, length);
+}
+
+static int __ext4_journalled_invalidatepage(struct page *page,
+ unsigned int offset,
+ unsigned int length)
{
journal_t *journal = EXT4_JOURNAL(page->mapping->host);
+ trace_ext4_journalled_invalidatepage(page, offset, length);
+
/*
* If it's a full truncate we just forget about the pending dirtying
*/
- if (offset == 0)
+ if (offset == 0 && length == PAGE_CACHE_SIZE)
ClearPageChecked(page);
- jbd2_journal_invalidatepage(journal, page, offset);
+ return jbd2_journal_invalidatepage(journal, page, offset, length);
+}
+
+/* Wrapper for aops... */
+static void ext4_journalled_invalidatepage(struct page *page,
+ unsigned int offset,
+ unsigned int length)
+{
+ WARN_ON(__ext4_journalled_invalidatepage(page, offset, length) < 0);
}
static int ext4_releasepage(struct page *page, gfp_t wait)
{
journal_t *journal = EXT4_JOURNAL(page->mapping->host);
- WARN_ON(PageChecked(page));
- if (!page_has_buffers(page))
+ trace_ext4_releasepage(page);
+
+ /* Page has dirty journalled data -> cannot release */
+ if (PageChecked(page))
return 0;
- return jbd2_journal_try_to_free_buffers(journal, page, wait);
+ if (journal)
+ return jbd2_journal_try_to_free_buffers(journal, page, wait);
+ else
+ return try_to_free_buffers(page);
+}
+
+/*
+ * ext4_get_block used when preparing for a DIO write or buffer write.
+ * We allocate an uinitialized extent if blocks haven't been allocated.
+ * The extent will be converted to initialized after the IO is complete.
+ */
+int ext4_get_block_write(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh_result, int create)
+{
+ ext4_debug("ext4_get_block_write: inode %lu, create flag %d\n",
+ inode->i_ino, create);
+ return _ext4_get_block(inode, iblock, bh_result,
+ EXT4_GET_BLOCKS_IO_CREATE_EXT);
+}
+
+static int ext4_get_block_write_nolock(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh_result, int create)
+{
+ ext4_debug("ext4_get_block_write_nolock: inode %lu, create flag %d\n",
+ inode->i_ino, create);
+ return _ext4_get_block(inode, iblock, bh_result,
+ EXT4_GET_BLOCKS_NO_LOCK);
+}
+
+static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
+ ssize_t size, void *private)
+{
+ ext4_io_end_t *io_end = iocb->private;
+
+ /* if not async direct IO just return */
+ if (!io_end)
+ return;
+
+ ext_debug("ext4_end_io_dio(): io_end 0x%p "
+ "for inode %lu, iocb 0x%p, offset %llu, size %zd\n",
+ iocb->private, io_end->inode->i_ino, iocb, offset,
+ size);
+
+ iocb->private = NULL;
+ io_end->offset = offset;
+ io_end->size = size;
+ ext4_put_io_end(io_end);
}
/*
+ * For ext4 extent files, ext4 will do direct-io write to holes,
+ * preallocated extents, and those write extend the file, no need to
+ * fall back to buffered IO.
+ *
+ * For holes, we fallocate those blocks, mark them as unwritten
+ * If those blocks were preallocated, we mark sure they are split, but
+ * still keep the range to write as unwritten.
+ *
+ * The unwritten extents will be converted to written when DIO is completed.
+ * For async direct IO, since the IO may still pending when return, we
+ * set up an end_io call back function, which will do the conversion
+ * when async direct IO completed.
+ *
* If the O_DIRECT write will extend the file then add this inode to the
* orphan list. So recovery will truncate it back to the original size
* if the machine crashes during the write.
*
- * If the O_DIRECT write is intantiating holes inside i_size and the machine
- * crashes then stale disk data _may_ be exposed inside the file. But current
- * VFS code falls back into buffered path in that case so we are safe.
*/
-static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
- const struct iovec *iov, loff_t offset,
- unsigned long nr_segs)
+static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
+ struct iov_iter *iter, loff_t offset)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file->f_mapping->host;
- struct ext4_inode_info *ei = EXT4_I(inode);
- handle_t *handle;
ssize_t ret;
- int orphan = 0;
- size_t count = iov_length(iov, nr_segs);
+ size_t count = iov_iter_count(iter);
+ int overwrite = 0;
+ get_block_t *get_block_func = NULL;
+ int dio_flags = 0;
+ loff_t final_size = offset + count;
+ ext4_io_end_t *io_end = NULL;
- if (rw == WRITE) {
- loff_t final_size = offset + count;
+ /* Use the old path for reads and writes beyond i_size. */
+ if (rw != WRITE || final_size > inode->i_size)
+ return ext4_ind_direct_IO(rw, iocb, iter, offset);
- if (final_size > inode->i_size) {
- /* Credits for sb + inode write */
- handle = ext4_journal_start(inode, 2);
- if (IS_ERR(handle)) {
- ret = PTR_ERR(handle);
- goto out;
- }
- ret = ext4_orphan_add(handle, inode);
- if (ret) {
- ext4_journal_stop(handle);
- goto out;
- }
- orphan = 1;
- ei->i_disksize = inode->i_size;
- ext4_journal_stop(handle);
- }
- }
+ BUG_ON(iocb->private == NULL);
- ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
- offset, nr_segs,
- ext4_get_block, NULL);
+ /*
+ * Make all waiters for direct IO properly wait also for extent
+ * conversion. This also disallows race between truncate() and
+ * overwrite DIO as i_dio_count needs to be incremented under i_mutex.
+ */
+ if (rw == WRITE)
+ atomic_inc(&inode->i_dio_count);
- if (orphan) {
- int err;
+ /* If we do a overwrite dio, i_mutex locking can be released */
+ overwrite = *((int *)iocb->private);
- /* Credits for sb + inode write */
- handle = ext4_journal_start(inode, 2);
- if (IS_ERR(handle)) {
- /* This is really bad luck. We've written the data
- * but cannot extend i_size. Bail out and pretend
- * the write failed... */
- ret = PTR_ERR(handle);
- goto out;
+ if (overwrite) {
+ down_read(&EXT4_I(inode)->i_data_sem);
+ mutex_unlock(&inode->i_mutex);
+ }
+
+ /*
+ * We could direct write to holes and fallocate.
+ *
+ * Allocated blocks to fill the hole are marked as
+ * unwritten to prevent parallel buffered read to expose
+ * the stale data before DIO complete the data IO.
+ *
+ * As to previously fallocated extents, ext4 get_block will
+ * just simply mark the buffer mapped but still keep the
+ * extents unwritten.
+ *
+ * For non AIO case, we will convert those unwritten extents
+ * to written after return back from blockdev_direct_IO.
+ *
+ * For async DIO, the conversion needs to be deferred when the
+ * IO is completed. The ext4 end_io callback function will be
+ * called to take care of the conversion work. Here for async
+ * case, we allocate an io_end structure to hook to the iocb.
+ */
+ iocb->private = NULL;
+ ext4_inode_aio_set(inode, NULL);
+ if (!is_sync_kiocb(iocb)) {
+ io_end = ext4_init_io_end(inode, GFP_NOFS);
+ if (!io_end) {
+ ret = -ENOMEM;
+ goto retake_lock;
}
- if (inode->i_nlink)
- ext4_orphan_del(handle, inode);
- if (ret > 0) {
- loff_t end = offset + ret;
- if (end > inode->i_size) {
- ei->i_disksize = end;
- i_size_write(inode, end);
- /*
- * We're going to return a positive `ret'
- * here due to non-zero-length I/O, so there's
- * no way of reporting error returns from
- * ext4_mark_inode_dirty() to userspace. So
- * ignore it.
- */
- ext4_mark_inode_dirty(handle, inode);
- }
+ /*
+ * Grab reference for DIO. Will be dropped in ext4_end_io_dio()
+ */
+ iocb->private = ext4_get_io_end(io_end);
+ /*
+ * we save the io structure for current async direct
+ * IO, so that later ext4_map_blocks() could flag the
+ * io structure whether there is a unwritten extents
+ * needs to be converted when IO is completed.
+ */
+ ext4_inode_aio_set(inode, io_end);
+ }
+
+ if (overwrite) {
+ get_block_func = ext4_get_block_write_nolock;
+ } else {
+ get_block_func = ext4_get_block_write;
+ dio_flags = DIO_LOCKING;
+ }
+ ret = __blockdev_direct_IO(rw, iocb, inode,
+ inode->i_sb->s_bdev, iter,
+ offset,
+ get_block_func,
+ ext4_end_io_dio,
+ NULL,
+ dio_flags);
+
+ /*
+ * Put our reference to io_end. This can free the io_end structure e.g.
+ * in sync IO case or in case of error. It can even perform extent
+ * conversion if all bios we submitted finished before we got here.
+ * Note that in that case iocb->private can be already set to NULL
+ * here.
+ */
+ if (io_end) {
+ ext4_inode_aio_set(inode, NULL);
+ ext4_put_io_end(io_end);
+ /*
+ * When no IO was submitted ext4_end_io_dio() was not
+ * called so we have to put iocb's reference.
+ */
+ if (ret <= 0 && ret != -EIOCBQUEUED && iocb->private) {
+ WARN_ON(iocb->private != io_end);
+ WARN_ON(io_end->flag & EXT4_IO_END_UNWRITTEN);
+ ext4_put_io_end(io_end);
+ iocb->private = NULL;
}
- err = ext4_journal_stop(handle);
- if (ret == 0)
+ }
+ if (ret > 0 && !overwrite && ext4_test_inode_state(inode,
+ EXT4_STATE_DIO_UNWRITTEN)) {
+ int err;
+ /*
+ * for non AIO case, since the IO is already
+ * completed, we could do the conversion right here
+ */
+ err = ext4_convert_unwritten_extents(NULL, inode,
+ offset, ret);
+ if (err < 0)
ret = err;
+ ext4_clear_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
}
-out:
+
+retake_lock:
+ if (rw == WRITE)
+ inode_dio_done(inode);
+ /* take i_mutex locking again if we do a ovewrite dio */
+ if (overwrite) {
+ up_read(&EXT4_I(inode)->i_data_sem);
+ mutex_lock(&inode->i_mutex);
+ }
+
+ return ret;
+}
+
+static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
+ struct iov_iter *iter, loff_t offset)
+{
+ struct file *file = iocb->ki_filp;
+ struct inode *inode = file->f_mapping->host;
+ size_t count = iov_iter_count(iter);
+ ssize_t ret;
+
+ /*
+ * If we are doing data journalling we don't support O_DIRECT
+ */
+ if (ext4_should_journal_data(inode))
+ return 0;
+
+ /* Let buffer I/O handle the inline data case. */
+ if (ext4_has_inline_data(inode))
+ return 0;
+
+ trace_ext4_direct_IO_enter(inode, offset, count, rw);
+ if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
+ ret = ext4_ext_direct_IO(rw, iocb, iter, offset);
+ else
+ ret = ext4_ind_direct_IO(rw, iocb, iter, offset);
+ trace_ext4_direct_IO_exit(inode, offset, count, rw, ret);
return ret;
}
@@ -1804,88 +3274,110 @@ static int ext4_journalled_set_page_dirty(struct page *page)
return __set_page_dirty_nobuffers(page);
}
-static const struct address_space_operations ext4_ordered_aops = {
- .readpage = ext4_readpage,
- .readpages = ext4_readpages,
- .writepage = ext4_ordered_writepage,
- .sync_page = block_sync_page,
- .write_begin = ext4_write_begin,
- .write_end = ext4_ordered_write_end,
- .bmap = ext4_bmap,
- .invalidatepage = ext4_invalidatepage,
- .releasepage = ext4_releasepage,
- .direct_IO = ext4_direct_IO,
- .migratepage = buffer_migrate_page,
+static const struct address_space_operations ext4_aops = {
+ .readpage = ext4_readpage,
+ .readpages = ext4_readpages,
+ .writepage = ext4_writepage,
+ .writepages = ext4_writepages,
+ .write_begin = ext4_write_begin,
+ .write_end = ext4_write_end,
+ .bmap = ext4_bmap,
+ .invalidatepage = ext4_invalidatepage,
+ .releasepage = ext4_releasepage,
+ .direct_IO = ext4_direct_IO,
+ .migratepage = buffer_migrate_page,
+ .is_partially_uptodate = block_is_partially_uptodate,
+ .error_remove_page = generic_error_remove_page,
};
-static const struct address_space_operations ext4_writeback_aops = {
- .readpage = ext4_readpage,
- .readpages = ext4_readpages,
- .writepage = ext4_writeback_writepage,
- .sync_page = block_sync_page,
- .write_begin = ext4_write_begin,
- .write_end = ext4_writeback_write_end,
- .bmap = ext4_bmap,
- .invalidatepage = ext4_invalidatepage,
- .releasepage = ext4_releasepage,
- .direct_IO = ext4_direct_IO,
- .migratepage = buffer_migrate_page,
+static const struct address_space_operations ext4_journalled_aops = {
+ .readpage = ext4_readpage,
+ .readpages = ext4_readpages,
+ .writepage = ext4_writepage,
+ .writepages = ext4_writepages,
+ .write_begin = ext4_write_begin,
+ .write_end = ext4_journalled_write_end,
+ .set_page_dirty = ext4_journalled_set_page_dirty,
+ .bmap = ext4_bmap,
+ .invalidatepage = ext4_journalled_invalidatepage,
+ .releasepage = ext4_releasepage,
+ .direct_IO = ext4_direct_IO,
+ .is_partially_uptodate = block_is_partially_uptodate,
+ .error_remove_page = generic_error_remove_page,
};
-static const struct address_space_operations ext4_journalled_aops = {
- .readpage = ext4_readpage,
- .readpages = ext4_readpages,
- .writepage = ext4_journalled_writepage,
- .sync_page = block_sync_page,
- .write_begin = ext4_write_begin,
- .write_end = ext4_journalled_write_end,
- .set_page_dirty = ext4_journalled_set_page_dirty,
- .bmap = ext4_bmap,
- .invalidatepage = ext4_invalidatepage,
- .releasepage = ext4_releasepage,
+static const struct address_space_operations ext4_da_aops = {
+ .readpage = ext4_readpage,
+ .readpages = ext4_readpages,
+ .writepage = ext4_writepage,
+ .writepages = ext4_writepages,
+ .write_begin = ext4_da_write_begin,
+ .write_end = ext4_da_write_end,
+ .bmap = ext4_bmap,
+ .invalidatepage = ext4_da_invalidatepage,
+ .releasepage = ext4_releasepage,
+ .direct_IO = ext4_direct_IO,
+ .migratepage = buffer_migrate_page,
+ .is_partially_uptodate = block_is_partially_uptodate,
+ .error_remove_page = generic_error_remove_page,
};
void ext4_set_aops(struct inode *inode)
{
- if (ext4_should_order_data(inode))
- inode->i_mapping->a_ops = &ext4_ordered_aops;
- else if (ext4_should_writeback_data(inode))
- inode->i_mapping->a_ops = &ext4_writeback_aops;
- else
+ switch (ext4_inode_journal_mode(inode)) {
+ case EXT4_INODE_ORDERED_DATA_MODE:
+ ext4_set_inode_state(inode, EXT4_STATE_ORDERED_MODE);
+ break;
+ case EXT4_INODE_WRITEBACK_DATA_MODE:
+ ext4_clear_inode_state(inode, EXT4_STATE_ORDERED_MODE);
+ break;
+ case EXT4_INODE_JOURNAL_DATA_MODE:
inode->i_mapping->a_ops = &ext4_journalled_aops;
+ return;
+ default:
+ BUG();
+ }
+ if (test_opt(inode->i_sb, DELALLOC))
+ inode->i_mapping->a_ops = &ext4_da_aops;
+ else
+ inode->i_mapping->a_ops = &ext4_aops;
}
/*
- * ext4_block_truncate_page() zeroes out a mapping from file offset `from'
- * up to the end of the block which corresponds to `from'.
- * This required during truncate. We need to physically zero the tail end
- * of that block so it doesn't yield old data if the file is later grown.
+ * ext4_block_zero_page_range() zeros out a mapping of length 'length'
+ * starting from file offset 'from'. The range to be zero'd must
+ * be contained with in one block. If the specified range exceeds
+ * the end of the block it will be shortened to end of the block
+ * that cooresponds to 'from'
*/
-int ext4_block_truncate_page(handle_t *handle, struct page *page,
- struct address_space *mapping, loff_t from)
+static int ext4_block_zero_page_range(handle_t *handle,
+ struct address_space *mapping, loff_t from, loff_t length)
{
ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT;
unsigned offset = from & (PAGE_CACHE_SIZE-1);
- unsigned blocksize, length, pos;
+ unsigned blocksize, max, pos;
ext4_lblk_t iblock;
struct inode *inode = mapping->host;
struct buffer_head *bh;
+ struct page *page;
int err = 0;
+ page = find_or_create_page(mapping, from >> PAGE_CACHE_SHIFT,
+ mapping_gfp_mask(mapping) & ~__GFP_FS);
+ if (!page)
+ return -ENOMEM;
+
blocksize = inode->i_sb->s_blocksize;
- length = blocksize - (offset & (blocksize - 1));
- iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
+ max = blocksize - (offset & (blocksize - 1));
/*
- * For "nobh" option, we can only work if we don't need to
- * read-in the page - otherwise we create buffers to do the IO.
+ * correct length if it does not fall between
+ * 'from' and the end of the block
*/
- if (!page_has_buffers(page) && test_opt(inode->i_sb, NOBH) &&
- ext4_should_writeback_data(inode) && PageUptodate(page)) {
- zero_user(page, offset, length);
- set_page_dirty(page);
- goto unlock;
- }
+ if (length > max || length < 0)
+ length = max;
+
+ iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
if (!page_has_buffers(page))
create_empty_buffers(page, blocksize, 0);
@@ -1898,13 +3390,10 @@ int ext4_block_truncate_page(handle_t *handle, struct page *page,
iblock++;
pos += blocksize;
}
-
- err = 0;
if (buffer_freed(bh)) {
BUFFER_TRACE(bh, "freed: skip");
goto unlock;
}
-
if (!buffer_mapped(bh)) {
BUFFER_TRACE(bh, "unmapped");
ext4_get_block(inode, iblock, bh, 0);
@@ -1927,25 +3416,22 @@ int ext4_block_truncate_page(handle_t *handle, struct page *page,
if (!buffer_uptodate(bh))
goto unlock;
}
-
if (ext4_should_journal_data(inode)) {
BUFFER_TRACE(bh, "get write access");
err = ext4_journal_get_write_access(handle, bh);
if (err)
goto unlock;
}
-
zero_user(page, offset, length);
-
BUFFER_TRACE(bh, "zeroed end of block");
- err = 0;
if (ext4_should_journal_data(inode)) {
- err = ext4_journal_dirty_metadata(handle, bh);
+ err = ext4_handle_dirty_metadata(handle, inode, bh);
} else {
- if (ext4_should_order_data(inode))
- err = ext4_journal_dirty_data(handle, bh);
+ err = 0;
mark_buffer_dirty(bh);
+ if (ext4_test_inode_state(inode, EXT4_STATE_ORDERED_MODE))
+ err = ext4_jbd2_file_inode(handle, inode);
}
unlock:
@@ -1955,342 +3441,235 @@ unlock:
}
/*
- * Probably it should be a library function... search for first non-zero word
- * or memcmp with zero_page, whatever is better for particular architecture.
- * Linus?
+ * ext4_block_truncate_page() zeroes out a mapping from file offset `from'
+ * up to the end of the block which corresponds to `from'.
+ * This required during truncate. We need to physically zero the tail end
+ * of that block so it doesn't yield old data if the file is later grown.
*/
-static inline int all_zeroes(__le32 *p, __le32 *q)
+static int ext4_block_truncate_page(handle_t *handle,
+ struct address_space *mapping, loff_t from)
{
- while (p < q)
- if (*p++)
- return 0;
- return 1;
+ unsigned offset = from & (PAGE_CACHE_SIZE-1);
+ unsigned length;
+ unsigned blocksize;
+ struct inode *inode = mapping->host;
+
+ blocksize = inode->i_sb->s_blocksize;
+ length = blocksize - (offset & (blocksize - 1));
+
+ return ext4_block_zero_page_range(handle, mapping, from, length);
}
-/**
- * ext4_find_shared - find the indirect blocks for partial truncation.
- * @inode: inode in question
- * @depth: depth of the affected branch
- * @offsets: offsets of pointers in that branch (see ext4_block_to_path)
- * @chain: place to store the pointers to partial indirect blocks
- * @top: place to the (detached) top of branch
- *
- * This is a helper function used by ext4_truncate().
- *
- * When we do truncate() we may have to clean the ends of several
- * indirect blocks but leave the blocks themselves alive. Block is
- * partially truncated if some data below the new i_size is refered
- * from it (and it is on the path to the first completely truncated
- * data block, indeed). We have to free the top of that path along
- * with everything to the right of the path. Since no allocation
- * past the truncation point is possible until ext4_truncate()
- * finishes, we may safely do the latter, but top of branch may
- * require special attention - pageout below the truncation point
- * might try to populate it.
- *
- * We atomically detach the top of branch from the tree, store the
- * block number of its root in *@top, pointers to buffer_heads of
- * partially truncated blocks - in @chain[].bh and pointers to
- * their last elements that should not be removed - in
- * @chain[].p. Return value is the pointer to last filled element
- * of @chain.
- *
- * The work left to caller to do the actual freeing of subtrees:
- * a) free the subtree starting from *@top
- * b) free the subtrees whose roots are stored in
- * (@chain[i].p+1 .. end of @chain[i].bh->b_data)
- * c) free the subtrees growing from the inode past the @chain[0].
- * (no partially truncated stuff there). */
-
-static Indirect *ext4_find_shared(struct inode *inode, int depth,
- ext4_lblk_t offsets[4], Indirect chain[4], __le32 *top)
-{
- Indirect *partial, *p;
- int k, err;
-
- *top = 0;
- /* Make k index the deepest non-null offest + 1 */
- for (k = depth; k > 1 && !offsets[k-1]; k--)
- ;
- partial = ext4_get_branch(inode, k, offsets, chain, &err);
- /* Writer: pointers */
- if (!partial)
- partial = chain + k-1;
- /*
- * If the branch acquired continuation since we've looked at it -
- * fine, it should all survive and (new) top doesn't belong to us.
- */
- if (!partial->key && *partial->p)
- /* Writer: end */
- goto no_top;
- for (p=partial; p>chain && all_zeroes((__le32*)p->bh->b_data,p->p); p--)
- ;
- /*
- * OK, we've found the last block that must survive. The rest of our
- * branch should be detached before unlocking. However, if that rest
- * of branch is all ours and does not grow immediately from the inode
- * it's easier to cheat and just decrement partial->p.
- */
- if (p == chain + k - 1 && p > chain) {
- p->p--;
- } else {
- *top = *p->p;
- /* Nope, don't do this in ext4. Must leave the tree intact */
-#if 0
- *p->p = 0;
-#endif
- }
- /* Writer: end */
+int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode,
+ loff_t lstart, loff_t length)
+{
+ struct super_block *sb = inode->i_sb;
+ struct address_space *mapping = inode->i_mapping;
+ unsigned partial_start, partial_end;
+ ext4_fsblk_t start, end;
+ loff_t byte_end = (lstart + length - 1);
+ int err = 0;
+
+ partial_start = lstart & (sb->s_blocksize - 1);
+ partial_end = byte_end & (sb->s_blocksize - 1);
+
+ start = lstart >> sb->s_blocksize_bits;
+ end = byte_end >> sb->s_blocksize_bits;
- while(partial > p) {
- brelse(partial->bh);
- partial--;
+ /* Handle partial zero within the single block */
+ if (start == end &&
+ (partial_start || (partial_end != sb->s_blocksize - 1))) {
+ err = ext4_block_zero_page_range(handle, mapping,
+ lstart, length);
+ return err;
+ }
+ /* Handle partial zero out on the start of the range */
+ if (partial_start) {
+ err = ext4_block_zero_page_range(handle, mapping,
+ lstart, sb->s_blocksize);
+ if (err)
+ return err;
}
-no_top:
- return partial;
+ /* Handle partial zero out on the end of the range */
+ if (partial_end != sb->s_blocksize - 1)
+ err = ext4_block_zero_page_range(handle, mapping,
+ byte_end - partial_end,
+ partial_end + 1);
+ return err;
+}
+
+int ext4_can_truncate(struct inode *inode)
+{
+ if (S_ISREG(inode->i_mode))
+ return 1;
+ if (S_ISDIR(inode->i_mode))
+ return 1;
+ if (S_ISLNK(inode->i_mode))
+ return !ext4_inode_is_fast_symlink(inode);
+ return 0;
}
/*
- * Zero a number of block pointers in either an inode or an indirect block.
- * If we restart the transaction we must again get write access to the
- * indirect block for further modification.
+ * ext4_punch_hole: punches a hole in a file by releaseing the blocks
+ * associated with the given offset and length
+ *
+ * @inode: File inode
+ * @offset: The offset where the hole will begin
+ * @len: The length of the hole
*
- * We release `count' blocks on disk, but (last - first) may be greater
- * than `count' because there can be holes in there.
+ * Returns: 0 on success or negative on failure
*/
-static void ext4_clear_blocks(handle_t *handle, struct inode *inode,
- struct buffer_head *bh, ext4_fsblk_t block_to_free,
- unsigned long count, __le32 *first, __le32 *last)
-{
- __le32 *p;
- if (try_to_extend_transaction(handle, inode)) {
- if (bh) {
- BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata");
- ext4_journal_dirty_metadata(handle, bh);
- }
- ext4_mark_inode_dirty(handle, inode);
- ext4_journal_test_restart(handle, inode);
- if (bh) {
- BUFFER_TRACE(bh, "retaking write access");
- ext4_journal_get_write_access(handle, bh);
- }
- }
+
+int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
+{
+ struct super_block *sb = inode->i_sb;
+ ext4_lblk_t first_block, stop_block;
+ struct address_space *mapping = inode->i_mapping;
+ loff_t first_block_offset, last_block_offset;
+ handle_t *handle;
+ unsigned int credits;
+ int ret = 0;
+
+ if (!S_ISREG(inode->i_mode))
+ return -EOPNOTSUPP;
+
+ trace_ext4_punch_hole(inode, offset, length, 0);
/*
- * Any buffers which are on the journal will be in memory. We find
- * them on the hash table so jbd2_journal_revoke() will run jbd2_journal_forget()
- * on them. We've already detached each block from the file, so
- * bforget() in jbd2_journal_forget() should be safe.
- *
- * AKPM: turn on bforget in jbd2_journal_forget()!!!
+ * Write out all dirty pages to avoid race conditions
+ * Then release them.
*/
- for (p = first; p < last; p++) {
- u32 nr = le32_to_cpu(*p);
- if (nr) {
- struct buffer_head *tbh;
-
- *p = 0;
- tbh = sb_find_get_block(inode->i_sb, nr);
- ext4_forget(handle, 0, inode, tbh, nr);
- }
+ if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
+ ret = filemap_write_and_wait_range(mapping, offset,
+ offset + length - 1);
+ if (ret)
+ return ret;
}
- ext4_free_blocks(handle, inode, block_to_free, count, 0);
-}
+ mutex_lock(&inode->i_mutex);
-/**
- * ext4_free_data - free a list of data blocks
- * @handle: handle for this transaction
- * @inode: inode we are dealing with
- * @this_bh: indirect buffer_head which contains *@first and *@last
- * @first: array of block numbers
- * @last: points immediately past the end of array
- *
- * We are freeing all blocks refered from that array (numbers are stored as
- * little-endian 32-bit) and updating @inode->i_blocks appropriately.
- *
- * We accumulate contiguous runs of blocks to free. Conveniently, if these
- * blocks are contiguous then releasing them at one time will only affect one
- * or two bitmap blocks (+ group descriptor(s) and superblock) and we won't
- * actually use a lot of journal space.
- *
- * @this_bh will be %NULL if @first and @last point into the inode's direct
- * block pointers.
- */
-static void ext4_free_data(handle_t *handle, struct inode *inode,
- struct buffer_head *this_bh,
- __le32 *first, __le32 *last)
-{
- ext4_fsblk_t block_to_free = 0; /* Starting block # of a run */
- unsigned long count = 0; /* Number of blocks in the run */
- __le32 *block_to_free_p = NULL; /* Pointer into inode/ind
- corresponding to
- block_to_free */
- ext4_fsblk_t nr; /* Current block # */
- __le32 *p; /* Pointer into inode/ind
- for current block */
- int err;
+ /* No need to punch hole beyond i_size */
+ if (offset >= inode->i_size)
+ goto out_mutex;
- if (this_bh) { /* For indirect block */
- BUFFER_TRACE(this_bh, "get_write_access");
- err = ext4_journal_get_write_access(handle, this_bh);
- /* Important: if we can't update the indirect pointers
- * to the blocks, we can't free them. */
- if (err)
- return;
+ /*
+ * If the hole extends beyond i_size, set the hole
+ * to end after the page that contains i_size
+ */
+ if (offset + length > inode->i_size) {
+ length = inode->i_size +
+ PAGE_CACHE_SIZE - (inode->i_size & (PAGE_CACHE_SIZE - 1)) -
+ offset;
}
- for (p = first; p < last; p++) {
- nr = le32_to_cpu(*p);
- if (nr) {
- /* accumulate blocks to free if they're contiguous */
- if (count == 0) {
- block_to_free = nr;
- block_to_free_p = p;
- count = 1;
- } else if (nr == block_to_free + count) {
- count++;
- } else {
- ext4_clear_blocks(handle, inode, this_bh,
- block_to_free,
- count, block_to_free_p, p);
- block_to_free = nr;
- block_to_free_p = p;
- count = 1;
- }
- }
+ if (offset & (sb->s_blocksize - 1) ||
+ (offset + length) & (sb->s_blocksize - 1)) {
+ /*
+ * Attach jinode to inode for jbd2 if we do any zeroing of
+ * partial block
+ */
+ ret = ext4_inode_attach_jinode(inode);
+ if (ret < 0)
+ goto out_mutex;
+
}
- if (count > 0)
- ext4_clear_blocks(handle, inode, this_bh, block_to_free,
- count, block_to_free_p, p);
+ first_block_offset = round_up(offset, sb->s_blocksize);
+ last_block_offset = round_down((offset + length), sb->s_blocksize) - 1;
+
+ /* Now release the pages and zero block aligned part of pages*/
+ if (last_block_offset > first_block_offset)
+ truncate_pagecache_range(inode, first_block_offset,
+ last_block_offset);
- if (this_bh) {
- BUFFER_TRACE(this_bh, "call ext4_journal_dirty_metadata");
- ext4_journal_dirty_metadata(handle, this_bh);
+ /* Wait all existing dio workers, newcomers will block on i_mutex */
+ ext4_inode_block_unlocked_dio(inode);
+ inode_dio_wait(inode);
+
+ if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
+ credits = ext4_writepage_trans_blocks(inode);
+ else
+ credits = ext4_blocks_for_truncate(inode);
+ handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ ext4_std_error(sb, ret);
+ goto out_dio;
}
-}
-/**
- * ext4_free_branches - free an array of branches
- * @handle: JBD handle for this transaction
- * @inode: inode we are dealing with
- * @parent_bh: the buffer_head which contains *@first and *@last
- * @first: array of block numbers
- * @last: pointer immediately past the end of array
- * @depth: depth of the branches to free
- *
- * We are freeing all blocks refered from these branches (numbers are
- * stored as little-endian 32-bit) and updating @inode->i_blocks
- * appropriately.
- */
-static void ext4_free_branches(handle_t *handle, struct inode *inode,
- struct buffer_head *parent_bh,
- __le32 *first, __le32 *last, int depth)
-{
- ext4_fsblk_t nr;
- __le32 *p;
+ ret = ext4_zero_partial_blocks(handle, inode, offset,
+ length);
+ if (ret)
+ goto out_stop;
- if (is_handle_aborted(handle))
- return;
+ first_block = (offset + sb->s_blocksize - 1) >>
+ EXT4_BLOCK_SIZE_BITS(sb);
+ stop_block = (offset + length) >> EXT4_BLOCK_SIZE_BITS(sb);
- if (depth--) {
- struct buffer_head *bh;
- int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb);
- p = last;
- while (--p >= first) {
- nr = le32_to_cpu(*p);
- if (!nr)
- continue; /* A hole */
+ /* If there are no blocks to remove, return now */
+ if (first_block >= stop_block)
+ goto out_stop;
- /* Go read the buffer for the next level down */
- bh = sb_bread(inode->i_sb, nr);
+ down_write(&EXT4_I(inode)->i_data_sem);
+ ext4_discard_preallocations(inode);
- /*
- * A read failure? Report error and clear slot
- * (should be rare).
- */
- if (!bh) {
- ext4_error(inode->i_sb, "ext4_free_branches",
- "Read failure, inode=%lu, block=%llu",
- inode->i_ino, nr);
- continue;
- }
+ ret = ext4_es_remove_extent(inode, first_block,
+ stop_block - first_block);
+ if (ret) {
+ up_write(&EXT4_I(inode)->i_data_sem);
+ goto out_stop;
+ }
- /* This zaps the entire block. Bottom up. */
- BUFFER_TRACE(bh, "free child branches");
- ext4_free_branches(handle, inode, bh,
- (__le32*)bh->b_data,
- (__le32*)bh->b_data + addr_per_block,
- depth);
+ if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
+ ret = ext4_ext_remove_space(inode, first_block,
+ stop_block - 1);
+ else
+ ret = ext4_free_hole_blocks(handle, inode, first_block,
+ stop_block);
- /*
- * We've probably journalled the indirect block several
- * times during the truncate. But it's no longer
- * needed and we now drop it from the transaction via
- * jbd2_journal_revoke().
- *
- * That's easy if it's exclusively part of this
- * transaction. But if it's part of the committing
- * transaction then jbd2_journal_forget() will simply
- * brelse() it. That means that if the underlying
- * block is reallocated in ext4_get_block(),
- * unmap_underlying_metadata() will find this block
- * and will try to get rid of it. damn, damn.
- *
- * If this block has already been committed to the
- * journal, a revoke record will be written. And
- * revoke records must be emitted *before* clearing
- * this block's bit in the bitmaps.
- */
- ext4_forget(handle, 1, inode, bh, bh->b_blocknr);
+ up_write(&EXT4_I(inode)->i_data_sem);
+ if (IS_SYNC(inode))
+ ext4_handle_sync(handle);
- /*
- * Everything below this this pointer has been
- * released. Now let this top-of-subtree go.
- *
- * We want the freeing of this indirect block to be
- * atomic in the journal with the updating of the
- * bitmap block which owns it. So make some room in
- * the journal.
- *
- * We zero the parent pointer *after* freeing its
- * pointee in the bitmaps, so if extend_transaction()
- * for some reason fails to put the bitmap changes and
- * the release into the same transaction, recovery
- * will merely complain about releasing a free block,
- * rather than leaking blocks.
- */
- if (is_handle_aborted(handle))
- return;
- if (try_to_extend_transaction(handle, inode)) {
- ext4_mark_inode_dirty(handle, inode);
- ext4_journal_test_restart(handle, inode);
- }
+ /* Now release the pages again to reduce race window */
+ if (last_block_offset > first_block_offset)
+ truncate_pagecache_range(inode, first_block_offset,
+ last_block_offset);
- ext4_free_blocks(handle, inode, nr, 1, 1);
+ inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
+ ext4_mark_inode_dirty(handle, inode);
+out_stop:
+ ext4_journal_stop(handle);
+out_dio:
+ ext4_inode_resume_unlocked_dio(inode);
+out_mutex:
+ mutex_unlock(&inode->i_mutex);
+ return ret;
+}
- if (parent_bh) {
- /*
- * The block which we have just freed is
- * pointed to by an indirect block: journal it
- */
- BUFFER_TRACE(parent_bh, "get_write_access");
- if (!ext4_journal_get_write_access(handle,
- parent_bh)){
- *p = 0;
- BUFFER_TRACE(parent_bh,
- "call ext4_journal_dirty_metadata");
- ext4_journal_dirty_metadata(handle,
- parent_bh);
- }
- }
+int ext4_inode_attach_jinode(struct inode *inode)
+{
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ struct jbd2_inode *jinode;
+
+ if (ei->jinode || !EXT4_SB(inode->i_sb)->s_journal)
+ return 0;
+
+ jinode = jbd2_alloc_inode(GFP_KERNEL);
+ spin_lock(&inode->i_lock);
+ if (!ei->jinode) {
+ if (!jinode) {
+ spin_unlock(&inode->i_lock);
+ return -ENOMEM;
}
- } else {
- /* We have reached the bottom of the tree. */
- BUFFER_TRACE(parent_bh, "free data blocks");
- ext4_free_data(handle, inode, parent_bh, first, last);
+ ei->jinode = jinode;
+ jbd2_journal_init_jbd_inode(ei->jinode, inode);
+ jinode = NULL;
}
+ spin_unlock(&inode->i_lock);
+ if (unlikely(jinode != NULL))
+ jbd2_free_inode(jinode);
+ return 0;
}
/*
@@ -2300,7 +3679,7 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
* transaction, and VFS/VM ensures that ext4_truncate() cannot run
* simultaneously on behalf of the same inode.
*
- * As we work through the truncate and commmit bits of it to the journal there
+ * As we work through the truncate and commit bits of it to the journal there
* is one core, guiding principle: the file's tree must always be consistent on
* disk. We must be able to restart the truncate after a crash.
*
@@ -2323,73 +3702,61 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
*/
void ext4_truncate(struct inode *inode)
{
- handle_t *handle;
struct ext4_inode_info *ei = EXT4_I(inode);
- __le32 *i_data = ei->i_data;
- int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb);
+ unsigned int credits;
+ handle_t *handle;
struct address_space *mapping = inode->i_mapping;
- ext4_lblk_t offsets[4];
- Indirect chain[4];
- Indirect *partial;
- __le32 nr = 0;
- int n;
- ext4_lblk_t last_block;
- unsigned blocksize = inode->i_sb->s_blocksize;
- struct page *page;
-
- if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
- S_ISLNK(inode->i_mode)))
- return;
- if (ext4_inode_is_fast_symlink(inode))
- return;
- if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
- return;
/*
- * We have to lock the EOF page here, because lock_page() nests
- * outside jbd2_journal_start().
+ * There is a possibility that we're either freeing the inode
+ * or it's a completely new inode. In those cases we might not
+ * have i_mutex locked because it's not necessary.
*/
- if ((inode->i_size & (blocksize - 1)) == 0) {
- /* Block boundary? Nothing to do */
- page = NULL;
- } else {
- page = grab_cache_page(mapping,
- inode->i_size >> PAGE_CACHE_SHIFT);
- if (!page)
- return;
- }
+ if (!(inode->i_state & (I_NEW|I_FREEING)))
+ WARN_ON(!mutex_is_locked(&inode->i_mutex));
+ trace_ext4_truncate_enter(inode);
- if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
- ext4_ext_truncate(inode, page);
+ if (!ext4_can_truncate(inode))
return;
+
+ ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
+
+ if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC))
+ ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE);
+
+ if (ext4_has_inline_data(inode)) {
+ int has_inline = 1;
+
+ ext4_inline_data_truncate(inode, &has_inline);
+ if (has_inline)
+ return;
}
- handle = start_transaction(inode);
- if (IS_ERR(handle)) {
- if (page) {
- clear_highpage(page);
- flush_dcache_page(page);
- unlock_page(page);
- page_cache_release(page);
- }
- return; /* AKPM: return what? */
+ /* If we zero-out tail of the page, we have to create jinode for jbd2 */
+ if (inode->i_size & (inode->i_sb->s_blocksize - 1)) {
+ if (ext4_inode_attach_jinode(inode) < 0)
+ return;
}
- last_block = (inode->i_size + blocksize-1)
- >> EXT4_BLOCK_SIZE_BITS(inode->i_sb);
+ if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
+ credits = ext4_writepage_trans_blocks(inode);
+ else
+ credits = ext4_blocks_for_truncate(inode);
- if (page)
- ext4_block_truncate_page(handle, page, mapping, inode->i_size);
+ handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
+ if (IS_ERR(handle)) {
+ ext4_std_error(inode->i_sb, PTR_ERR(handle));
+ return;
+ }
- n = ext4_block_to_path(inode, last_block, offsets, NULL);
- if (n == 0)
- goto out_stop; /* error */
+ if (inode->i_size & (inode->i_sb->s_blocksize - 1))
+ ext4_block_truncate_page(handle, mapping, inode->i_size);
/*
- * OK. This truncate is going to happen. We add the inode to the
- * orphan list, so that if this truncate spans multiple transactions,
- * and we crash, we will resume the truncate when the filesystem
- * recovers. It also marks the inode dirty, to catch the new size.
+ * We add the inode to the orphan list, so that if this
+ * truncate spans multiple transactions, and we crash, we will
+ * resume the truncate when the filesystem recovers. It also
+ * marks the inode dirty, to catch the new size.
*
* Implication: the file must always be in a sane, consistent
* truncatable state while each transaction commits.
@@ -2397,96 +3764,23 @@ void ext4_truncate(struct inode *inode)
if (ext4_orphan_add(handle, inode))
goto out_stop;
- /*
- * The orphan list entry will now protect us from any crash which
- * occurs before the truncate completes, so it is now safe to propagate
- * the new, shorter inode size (held for now in i_size) into the
- * on-disk inode. We do this via i_disksize, which is the value which
- * ext4 *really* writes onto the disk inode.
- */
- ei->i_disksize = inode->i_size;
-
- /*
- * From here we block out all ext4_get_block() callers who want to
- * modify the block allocation tree.
- */
- down_write(&ei->i_data_sem);
+ down_write(&EXT4_I(inode)->i_data_sem);
- if (n == 1) { /* direct blocks */
- ext4_free_data(handle, inode, NULL, i_data+offsets[0],
- i_data + EXT4_NDIR_BLOCKS);
- goto do_indirects;
- }
+ ext4_discard_preallocations(inode);
- partial = ext4_find_shared(inode, n, offsets, chain, &nr);
- /* Kill the top of shared branch (not detached) */
- if (nr) {
- if (partial == chain) {
- /* Shared branch grows from the inode */
- ext4_free_branches(handle, inode, NULL,
- &nr, &nr+1, (chain+n-1) - partial);
- *partial->p = 0;
- /*
- * We mark the inode dirty prior to restart,
- * and prior to stop. No need for it here.
- */
- } else {
- /* Shared branch grows from an indirect block */
- BUFFER_TRACE(partial->bh, "get_write_access");
- ext4_free_branches(handle, inode, partial->bh,
- partial->p,
- partial->p+1, (chain+n-1) - partial);
- }
- }
- /* Clear the ends of indirect blocks on the shared branch */
- while (partial > chain) {
- ext4_free_branches(handle, inode, partial->bh, partial->p + 1,
- (__le32*)partial->bh->b_data+addr_per_block,
- (chain+n-1) - partial);
- BUFFER_TRACE(partial->bh, "call brelse");
- brelse (partial->bh);
- partial--;
- }
-do_indirects:
- /* Kill the remaining (whole) subtrees */
- switch (offsets[0]) {
- default:
- nr = i_data[EXT4_IND_BLOCK];
- if (nr) {
- ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 1);
- i_data[EXT4_IND_BLOCK] = 0;
- }
- case EXT4_IND_BLOCK:
- nr = i_data[EXT4_DIND_BLOCK];
- if (nr) {
- ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 2);
- i_data[EXT4_DIND_BLOCK] = 0;
- }
- case EXT4_DIND_BLOCK:
- nr = i_data[EXT4_TIND_BLOCK];
- if (nr) {
- ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 3);
- i_data[EXT4_TIND_BLOCK] = 0;
- }
- case EXT4_TIND_BLOCK:
- ;
- }
-
- ext4_discard_reservation(inode);
+ if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
+ ext4_ext_truncate(handle, inode);
+ else
+ ext4_ind_truncate(handle, inode);
up_write(&ei->i_data_sem);
- inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
- ext4_mark_inode_dirty(handle, inode);
- /*
- * In a multi-transaction truncate, we only make the final transaction
- * synchronous
- */
if (IS_SYNC(inode))
- handle->h_sync = 1;
+ ext4_handle_sync(handle);
+
out_stop:
/*
- * If this was a simple ftruncate(), and the file will remain alive
+ * If this was a simple ftruncate() and the file will remain alive,
* then we need to clear up the orphan record which we created above.
* However, if this was a real unlink then we were called by
* ext4_delete_inode(), and we allow that function to clean up the
@@ -2495,56 +3789,11 @@ out_stop:
if (inode->i_nlink)
ext4_orphan_del(handle, inode);
+ inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
+ ext4_mark_inode_dirty(handle, inode);
ext4_journal_stop(handle);
-}
-static ext4_fsblk_t ext4_get_inode_block(struct super_block *sb,
- unsigned long ino, struct ext4_iloc *iloc)
-{
- unsigned long desc, group_desc;
- ext4_group_t block_group;
- unsigned long offset;
- ext4_fsblk_t block;
- struct buffer_head *bh;
- struct ext4_group_desc * gdp;
-
- if (!ext4_valid_inum(sb, ino)) {
- /*
- * This error is already checked for in namei.c unless we are
- * looking at an NFS filehandle, in which case no error
- * report is needed
- */
- return 0;
- }
-
- block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb);
- if (block_group >= EXT4_SB(sb)->s_groups_count) {
- ext4_error(sb,"ext4_get_inode_block","group >= groups count");
- return 0;
- }
- smp_rmb();
- group_desc = block_group >> EXT4_DESC_PER_BLOCK_BITS(sb);
- desc = block_group & (EXT4_DESC_PER_BLOCK(sb) - 1);
- bh = EXT4_SB(sb)->s_group_desc[group_desc];
- if (!bh) {
- ext4_error (sb, "ext4_get_inode_block",
- "Descriptor not loaded");
- return 0;
- }
-
- gdp = (struct ext4_group_desc *)((__u8 *)bh->b_data +
- desc * EXT4_DESC_SIZE(sb));
- /*
- * Figure out the offset within the block group inode table
- */
- offset = ((ino - 1) % EXT4_INODES_PER_GROUP(sb)) *
- EXT4_INODE_SIZE(sb);
- block = ext4_inode_table(sb, gdp) +
- (offset >> EXT4_BLOCK_SIZE_BITS(sb));
-
- iloc->block_group = block_group;
- iloc->offset = offset & (EXT4_BLOCK_SIZE(sb) - 1);
- return block;
+ trace_ext4_truncate_exit(inode);
}
/*
@@ -2556,23 +3805,45 @@ static ext4_fsblk_t ext4_get_inode_block(struct super_block *sb,
static int __ext4_get_inode_loc(struct inode *inode,
struct ext4_iloc *iloc, int in_mem)
{
- ext4_fsblk_t block;
- struct buffer_head *bh;
-
- block = ext4_get_inode_block(inode->i_sb, inode->i_ino, iloc);
- if (!block)
+ struct ext4_group_desc *gdp;
+ struct buffer_head *bh;
+ struct super_block *sb = inode->i_sb;
+ ext4_fsblk_t block;
+ int inodes_per_block, inode_offset;
+
+ iloc->bh = NULL;
+ if (!ext4_valid_inum(sb, inode->i_ino))
return -EIO;
- bh = sb_getblk(inode->i_sb, block);
- if (!bh) {
- ext4_error (inode->i_sb, "ext4_get_inode_loc",
- "unable to read inode block - "
- "inode=%lu, block=%llu",
- inode->i_ino, block);
+ iloc->block_group = (inode->i_ino - 1) / EXT4_INODES_PER_GROUP(sb);
+ gdp = ext4_get_group_desc(sb, iloc->block_group, NULL);
+ if (!gdp)
return -EIO;
- }
+
+ /*
+ * Figure out the offset within the block group inode table
+ */
+ inodes_per_block = EXT4_SB(sb)->s_inodes_per_block;
+ inode_offset = ((inode->i_ino - 1) %
+ EXT4_INODES_PER_GROUP(sb));
+ block = ext4_inode_table(sb, gdp) + (inode_offset / inodes_per_block);
+ iloc->offset = (inode_offset % inodes_per_block) * EXT4_INODE_SIZE(sb);
+
+ bh = sb_getblk(sb, block);
+ if (unlikely(!bh))
+ return -ENOMEM;
if (!buffer_uptodate(bh)) {
lock_buffer(bh);
+
+ /*
+ * If the buffer has the write error flag, we have failed
+ * to write out another inode in the same block. In this
+ * case, we don't have to read the block because we may
+ * read the old inode data successfully.
+ */
+ if (buffer_write_io_error(bh) && !buffer_uptodate(bh))
+ set_buffer_uptodate(bh);
+
if (buffer_uptodate(bh)) {
/* someone brought it uptodate while we waited */
unlock_buffer(bh);
@@ -2586,29 +3857,13 @@ static int __ext4_get_inode_loc(struct inode *inode,
*/
if (in_mem) {
struct buffer_head *bitmap_bh;
- struct ext4_group_desc *desc;
- int inodes_per_buffer;
- int inode_offset, i;
- ext4_group_t block_group;
- int start;
-
- block_group = (inode->i_ino - 1) /
- EXT4_INODES_PER_GROUP(inode->i_sb);
- inodes_per_buffer = bh->b_size /
- EXT4_INODE_SIZE(inode->i_sb);
- inode_offset = ((inode->i_ino - 1) %
- EXT4_INODES_PER_GROUP(inode->i_sb));
- start = inode_offset & ~(inodes_per_buffer - 1);
+ int i, start;
- /* Is the inode bitmap in cache? */
- desc = ext4_get_group_desc(inode->i_sb,
- block_group, NULL);
- if (!desc)
- goto make_io;
+ start = inode_offset & ~(inodes_per_block - 1);
- bitmap_bh = sb_getblk(inode->i_sb,
- ext4_inode_bitmap(inode->i_sb, desc));
- if (!bitmap_bh)
+ /* Is the inode bitmap in cache? */
+ bitmap_bh = sb_getblk(sb, ext4_inode_bitmap(sb, gdp));
+ if (unlikely(!bitmap_bh))
goto make_io;
/*
@@ -2620,14 +3875,14 @@ static int __ext4_get_inode_loc(struct inode *inode,
brelse(bitmap_bh);
goto make_io;
}
- for (i = start; i < start + inodes_per_buffer; i++) {
+ for (i = start; i < start + inodes_per_block; i++) {
if (i == inode_offset)
continue;
if (ext4_test_bit(i, bitmap_bh->b_data))
break;
}
brelse(bitmap_bh);
- if (i == start + inodes_per_buffer) {
+ if (i == start + inodes_per_block) {
/* all other inodes are free, so skip I/O */
memset(bh->b_data, 0, bh->b_size);
set_buffer_uptodate(bh);
@@ -2638,19 +3893,43 @@ static int __ext4_get_inode_loc(struct inode *inode,
make_io:
/*
+ * If we need to do any I/O, try to pre-readahead extra
+ * blocks from the inode table.
+ */
+ if (EXT4_SB(sb)->s_inode_readahead_blks) {
+ ext4_fsblk_t b, end, table;
+ unsigned num;
+ __u32 ra_blks = EXT4_SB(sb)->s_inode_readahead_blks;
+
+ table = ext4_inode_table(sb, gdp);
+ /* s_inode_readahead_blks is always a power of 2 */
+ b = block & ~((ext4_fsblk_t) ra_blks - 1);
+ if (table > b)
+ b = table;
+ end = b + ra_blks;
+ num = EXT4_INODES_PER_GROUP(sb);
+ if (ext4_has_group_desc_csum(sb))
+ num -= ext4_itable_unused_count(sb, gdp);
+ table += num / inodes_per_block;
+ if (end > table)
+ end = table;
+ while (b <= end)
+ sb_breadahead(sb, b++);
+ }
+
+ /*
* There are other valid inodes in the buffer, this inode
* has in-inode xattrs, or we don't have this inode in memory.
* Read the block from disk.
*/
+ trace_ext4_load_inode(inode);
get_bh(bh);
bh->b_end_io = end_buffer_read_sync;
- submit_bh(READ_META, bh);
+ submit_bh(READ | REQ_META | REQ_PRIO, bh);
wait_on_buffer(bh);
if (!buffer_uptodate(bh)) {
- ext4_error(inode->i_sb, "ext4_get_inode_loc",
- "unable to read inode block - "
- "inode=%lu, block=%llu",
- inode->i_ino, block);
+ EXT4_ERROR_INODE_BLOCK(inode, block,
+ "unable to read itable block");
brelse(bh);
return -EIO;
}
@@ -2664,46 +3943,55 @@ int ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc)
{
/* We have all inode data except xattrs in memory here. */
return __ext4_get_inode_loc(inode, iloc,
- !(EXT4_I(inode)->i_state & EXT4_STATE_XATTR));
+ !ext4_test_inode_state(inode, EXT4_STATE_XATTR));
}
void ext4_set_inode_flags(struct inode *inode)
{
unsigned int flags = EXT4_I(inode)->i_flags;
+ unsigned int new_fl = 0;
- inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC);
if (flags & EXT4_SYNC_FL)
- inode->i_flags |= S_SYNC;
+ new_fl |= S_SYNC;
if (flags & EXT4_APPEND_FL)
- inode->i_flags |= S_APPEND;
+ new_fl |= S_APPEND;
if (flags & EXT4_IMMUTABLE_FL)
- inode->i_flags |= S_IMMUTABLE;
+ new_fl |= S_IMMUTABLE;
if (flags & EXT4_NOATIME_FL)
- inode->i_flags |= S_NOATIME;
+ new_fl |= S_NOATIME;
if (flags & EXT4_DIRSYNC_FL)
- inode->i_flags |= S_DIRSYNC;
+ new_fl |= S_DIRSYNC;
+ inode_set_flags(inode, new_fl,
+ S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC);
}
/* Propagate flags from i_flags to EXT4_I(inode)->i_flags */
void ext4_get_inode_flags(struct ext4_inode_info *ei)
{
- unsigned int flags = ei->vfs_inode.i_flags;
-
- ei->i_flags &= ~(EXT4_SYNC_FL|EXT4_APPEND_FL|
- EXT4_IMMUTABLE_FL|EXT4_NOATIME_FL|EXT4_DIRSYNC_FL);
- if (flags & S_SYNC)
- ei->i_flags |= EXT4_SYNC_FL;
- if (flags & S_APPEND)
- ei->i_flags |= EXT4_APPEND_FL;
- if (flags & S_IMMUTABLE)
- ei->i_flags |= EXT4_IMMUTABLE_FL;
- if (flags & S_NOATIME)
- ei->i_flags |= EXT4_NOATIME_FL;
- if (flags & S_DIRSYNC)
- ei->i_flags |= EXT4_DIRSYNC_FL;
+ unsigned int vfs_fl;
+ unsigned long old_fl, new_fl;
+
+ do {
+ vfs_fl = ei->vfs_inode.i_flags;
+ old_fl = ei->i_flags;
+ new_fl = old_fl & ~(EXT4_SYNC_FL|EXT4_APPEND_FL|
+ EXT4_IMMUTABLE_FL|EXT4_NOATIME_FL|
+ EXT4_DIRSYNC_FL);
+ if (vfs_fl & S_SYNC)
+ new_fl |= EXT4_SYNC_FL;
+ if (vfs_fl & S_APPEND)
+ new_fl |= EXT4_APPEND_FL;
+ if (vfs_fl & S_IMMUTABLE)
+ new_fl |= EXT4_IMMUTABLE_FL;
+ if (vfs_fl & S_NOATIME)
+ new_fl |= EXT4_NOATIME_FL;
+ if (vfs_fl & S_DIRSYNC)
+ new_fl |= EXT4_DIRSYNC_FL;
+ } while (cmpxchg(&ei->i_flags, old_fl, new_fl) != old_fl);
}
+
static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode,
- struct ext4_inode_info *ei)
+ struct ext4_inode_info *ei)
{
blkcnt_t i_blocks ;
struct inode *inode = &(ei->vfs_inode);
@@ -2714,7 +4002,7 @@ static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode,
/* we are using combined 48 bit field */
i_blocks = ((u64)le16_to_cpu(raw_inode->i_blocks_high)) << 32 |
le32_to_cpu(raw_inode->i_blocks_lo);
- if (ei->i_flags & EXT4_HUGE_FILE_FL) {
+ if (ext4_test_inode_flag(inode, EXT4_INODE_HUGE_FILE)) {
/* i_blocks represent file system block size */
return i_blocks << (inode->i_blkbits - 9);
} else {
@@ -2725,15 +4013,30 @@ static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode,
}
}
+static inline void ext4_iget_extra_inode(struct inode *inode,
+ struct ext4_inode *raw_inode,
+ struct ext4_inode_info *ei)
+{
+ __le32 *magic = (void *)raw_inode +
+ EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize;
+ if (*magic == cpu_to_le32(EXT4_XATTR_MAGIC)) {
+ ext4_set_inode_state(inode, EXT4_STATE_XATTR);
+ ext4_find_inline_data_nolock(inode);
+ } else
+ EXT4_I(inode)->i_inline_off = 0;
+}
+
struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
{
struct ext4_iloc iloc;
struct ext4_inode *raw_inode;
struct ext4_inode_info *ei;
- struct buffer_head *bh;
struct inode *inode;
+ journal_t *journal = EXT4_SB(sb)->s_journal;
long ret;
int block;
+ uid_t i_uid;
+ gid_t i_gid;
inode = iget_locked(sb, ino);
if (!inode)
@@ -2742,27 +4045,58 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
return inode;
ei = EXT4_I(inode);
-#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL
- ei->i_acl = EXT4_ACL_NOT_CACHED;
- ei->i_default_acl = EXT4_ACL_NOT_CACHED;
-#endif
- ei->i_block_alloc_info = NULL;
+ iloc.bh = NULL;
ret = __ext4_get_inode_loc(inode, &iloc, 0);
if (ret < 0)
goto bad_inode;
- bh = iloc.bh;
raw_inode = ext4_raw_inode(&iloc);
+
+ if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
+ ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize);
+ if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize >
+ EXT4_INODE_SIZE(inode->i_sb)) {
+ EXT4_ERROR_INODE(inode, "bad extra_isize (%u != %u)",
+ EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize,
+ EXT4_INODE_SIZE(inode->i_sb));
+ ret = -EIO;
+ goto bad_inode;
+ }
+ } else
+ ei->i_extra_isize = 0;
+
+ /* Precompute checksum seed for inode metadata */
+ if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
+ EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) {
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ __u32 csum;
+ __le32 inum = cpu_to_le32(inode->i_ino);
+ __le32 gen = raw_inode->i_generation;
+ csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&inum,
+ sizeof(inum));
+ ei->i_csum_seed = ext4_chksum(sbi, csum, (__u8 *)&gen,
+ sizeof(gen));
+ }
+
+ if (!ext4_inode_csum_verify(inode, raw_inode, ei)) {
+ EXT4_ERROR_INODE(inode, "checksum invalid");
+ ret = -EIO;
+ goto bad_inode;
+ }
+
inode->i_mode = le16_to_cpu(raw_inode->i_mode);
- inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low);
- inode->i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low);
- if(!(test_opt (inode->i_sb, NO_UID32))) {
- inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16;
- inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16;
+ i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low);
+ i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low);
+ if (!(test_opt(inode->i_sb, NO_UID32))) {
+ i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16;
+ i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16;
}
- inode->i_nlink = le16_to_cpu(raw_inode->i_links_count);
+ i_uid_write(inode, i_uid);
+ i_gid_write(inode, i_gid);
+ set_nlink(inode, le16_to_cpu(raw_inode->i_links_count));
- ei->i_state = 0;
+ ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */
+ ei->i_inline_off = 0;
ei->i_dir_start_lookup = 0;
ei->i_dtime = le32_to_cpu(raw_inode->i_dtime);
/* We now have enough fields to check if the inode was active or not.
@@ -2771,30 +4105,34 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
* NeilBrown 1999oct15
*/
if (inode->i_nlink == 0) {
- if (inode->i_mode == 0 ||
- !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) {
+ if ((inode->i_mode == 0 ||
+ !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) &&
+ ino != EXT4_BOOT_LOADER_INO) {
/* this inode is deleted */
- brelse (bh);
ret = -ESTALE;
goto bad_inode;
}
/* The only unlinked inodes we let through here have
* valid i_mode and are being read by the orphan
* recovery code: that's fine, we're about to complete
- * the process of deleting those. */
+ * the process of deleting those.
+ * OR it is the EXT4_BOOT_LOADER_INO which is
+ * not initialized on a new filesystem. */
}
ei->i_flags = le32_to_cpu(raw_inode->i_flags);
inode->i_blocks = ext4_inode_blocks(raw_inode, ei);
ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl_lo);
- if (EXT4_SB(inode->i_sb)->s_es->s_creator_os !=
- cpu_to_le32(EXT4_OS_HURD)) {
+ if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_64BIT))
ei->i_file_acl |=
((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32;
- }
inode->i_size = ext4_isize(raw_inode);
ei->i_disksize = inode->i_size;
+#ifdef CONFIG_QUOTA
+ ei->i_reserved_quota = 0;
+#endif
inode->i_generation = le32_to_cpu(raw_inode->i_generation);
ei->i_block_group = iloc.block_group;
+ ei->i_last_alloc_group = ~0;
/*
* NOTE! The in-memory inode i_data array is in little-endian order
* even on big-endian machines: we do NOT byteswap the block numbers!
@@ -2803,40 +4141,79 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
ei->i_data[block] = raw_inode->i_block[block];
INIT_LIST_HEAD(&ei->i_orphan);
+ /*
+ * Set transaction id's of transactions that have to be committed
+ * to finish f[data]sync. We set them to currently running transaction
+ * as we cannot be sure that the inode or some of its metadata isn't
+ * part of the transaction - the inode could have been reclaimed and
+ * now it is reread from disk.
+ */
+ if (journal) {
+ transaction_t *transaction;
+ tid_t tid;
+
+ read_lock(&journal->j_state_lock);
+ if (journal->j_running_transaction)
+ transaction = journal->j_running_transaction;
+ else
+ transaction = journal->j_committing_transaction;
+ if (transaction)
+ tid = transaction->t_tid;
+ else
+ tid = journal->j_commit_sequence;
+ read_unlock(&journal->j_state_lock);
+ ei->i_sync_tid = tid;
+ ei->i_datasync_tid = tid;
+ }
+
if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
- ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize);
- if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize >
- EXT4_INODE_SIZE(inode->i_sb)) {
- brelse (bh);
- ret = -EIO;
- goto bad_inode;
- }
if (ei->i_extra_isize == 0) {
/* The extra space is currently unused. Use it. */
ei->i_extra_isize = sizeof(struct ext4_inode) -
EXT4_GOOD_OLD_INODE_SIZE;
} else {
- __le32 *magic = (void *)raw_inode +
- EXT4_GOOD_OLD_INODE_SIZE +
- ei->i_extra_isize;
- if (*magic == cpu_to_le32(EXT4_XATTR_MAGIC))
- ei->i_state |= EXT4_STATE_XATTR;
+ ext4_iget_extra_inode(inode, raw_inode, ei);
}
- } else
- ei->i_extra_isize = 0;
+ }
EXT4_INODE_GET_XTIME(i_ctime, inode, raw_inode);
EXT4_INODE_GET_XTIME(i_mtime, inode, raw_inode);
EXT4_INODE_GET_XTIME(i_atime, inode, raw_inode);
EXT4_EINODE_GET_XTIME(i_crtime, ei, raw_inode);
- inode->i_version = le32_to_cpu(raw_inode->i_disk_version);
- if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
- if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi))
- inode->i_version |=
- (__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32;
+ if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) {
+ inode->i_version = le32_to_cpu(raw_inode->i_disk_version);
+ if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
+ if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi))
+ inode->i_version |=
+ (__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32;
+ }
}
+ ret = 0;
+ if (ei->i_file_acl &&
+ !ext4_data_block_valid(EXT4_SB(sb), ei->i_file_acl, 1)) {
+ EXT4_ERROR_INODE(inode, "bad extended attribute block %llu",
+ ei->i_file_acl);
+ ret = -EIO;
+ goto bad_inode;
+ } else if (!ext4_has_inline_data(inode)) {
+ if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
+ if ((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
+ (S_ISLNK(inode->i_mode) &&
+ !ext4_inode_is_fast_symlink(inode))))
+ /* Validate extent which is part of inode */
+ ret = ext4_ext_check_inode(inode);
+ } else if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
+ (S_ISLNK(inode->i_mode) &&
+ !ext4_inode_is_fast_symlink(inode))) {
+ /* Validate block references which are part of inode */
+ ret = ext4_ind_check_inode(inode);
+ }
+ }
+ if (ret)
+ goto bad_inode;
+
if (S_ISREG(inode->i_mode)) {
inode->i_op = &ext4_file_inode_operations;
inode->i_fop = &ext4_file_operations;
@@ -2845,13 +4222,16 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
inode->i_op = &ext4_dir_inode_operations;
inode->i_fop = &ext4_dir_operations;
} else if (S_ISLNK(inode->i_mode)) {
- if (ext4_inode_is_fast_symlink(inode))
+ if (ext4_inode_is_fast_symlink(inode)) {
inode->i_op = &ext4_fast_symlink_inode_operations;
- else {
+ nd_terminate_link(ei->i_data, inode->i_size,
+ sizeof(ei->i_data) - 1);
+ } else {
inode->i_op = &ext4_symlink_inode_operations;
ext4_set_aops(inode);
}
- } else {
+ } else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) ||
+ S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) {
inode->i_op = &ext4_special_inode_operations;
if (raw_inode->i_block[0])
init_special_inode(inode, inode->i_mode,
@@ -2859,13 +4239,20 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
else
init_special_inode(inode, inode->i_mode,
new_decode_dev(le32_to_cpu(raw_inode->i_block[1])));
+ } else if (ino == EXT4_BOOT_LOADER_INO) {
+ make_bad_inode(inode);
+ } else {
+ ret = -EIO;
+ EXT4_ERROR_INODE(inode, "bogus i_mode (%o)", inode->i_mode);
+ goto bad_inode;
}
- brelse (iloc.bh);
+ brelse(iloc.bh);
ext4_set_inode_flags(inode);
unlock_new_inode(inode);
return inode;
bad_inode:
+ brelse(iloc.bh);
iget_failed(inode);
return ERR_PTR(ret);
}
@@ -2877,46 +4264,36 @@ static int ext4_inode_blocks_set(handle_t *handle,
struct inode *inode = &(ei->vfs_inode);
u64 i_blocks = inode->i_blocks;
struct super_block *sb = inode->i_sb;
- int err = 0;
if (i_blocks <= ~0U) {
/*
- * i_blocks can be represnted in a 32 bit variable
+ * i_blocks can be represented in a 32 bit variable
* as multiple of 512 bytes
*/
raw_inode->i_blocks_lo = cpu_to_le32(i_blocks);
raw_inode->i_blocks_high = 0;
- ei->i_flags &= ~EXT4_HUGE_FILE_FL;
- } else if (i_blocks <= 0xffffffffffffULL) {
+ ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE);
+ return 0;
+ }
+ if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_HUGE_FILE))
+ return -EFBIG;
+
+ if (i_blocks <= 0xffffffffffffULL) {
/*
* i_blocks can be represented in a 48 bit variable
* as multiple of 512 bytes
*/
- err = ext4_update_rocompat_feature(handle, sb,
- EXT4_FEATURE_RO_COMPAT_HUGE_FILE);
- if (err)
- goto err_out;
- /* i_block is stored in the split 48 bit fields */
raw_inode->i_blocks_lo = cpu_to_le32(i_blocks);
raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32);
- ei->i_flags &= ~EXT4_HUGE_FILE_FL;
+ ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE);
} else {
- /*
- * i_blocks should be represented in a 48 bit variable
- * as multiple of file system block size
- */
- err = ext4_update_rocompat_feature(handle, sb,
- EXT4_FEATURE_RO_COMPAT_HUGE_FILE);
- if (err)
- goto err_out;
- ei->i_flags |= EXT4_HUGE_FILE_FL;
+ ext4_set_inode_flag(inode, EXT4_INODE_HUGE_FILE);
/* i_block is stored in file system block size */
i_blocks = i_blocks >> (inode->i_blkbits - 9);
raw_inode->i_blocks_lo = cpu_to_le32(i_blocks);
raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32);
}
-err_out:
- return err;
+ return 0;
}
/*
@@ -2933,36 +4310,42 @@ static int ext4_do_update_inode(handle_t *handle,
struct ext4_inode *raw_inode = ext4_raw_inode(iloc);
struct ext4_inode_info *ei = EXT4_I(inode);
struct buffer_head *bh = iloc->bh;
+ struct super_block *sb = inode->i_sb;
int err = 0, rc, block;
+ int need_datasync = 0, set_large_file = 0;
+ uid_t i_uid;
+ gid_t i_gid;
+
+ spin_lock(&ei->i_raw_lock);
- /* For fields not not tracking in the in-memory inode,
+ /* For fields not tracked in the in-memory inode,
* initialise them to zero for new inodes. */
- if (ei->i_state & EXT4_STATE_NEW)
+ if (ext4_test_inode_state(inode, EXT4_STATE_NEW))
memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size);
ext4_get_inode_flags(ei);
raw_inode->i_mode = cpu_to_le16(inode->i_mode);
- if(!(test_opt(inode->i_sb, NO_UID32))) {
- raw_inode->i_uid_low = cpu_to_le16(low_16_bits(inode->i_uid));
- raw_inode->i_gid_low = cpu_to_le16(low_16_bits(inode->i_gid));
+ i_uid = i_uid_read(inode);
+ i_gid = i_gid_read(inode);
+ if (!(test_opt(inode->i_sb, NO_UID32))) {
+ raw_inode->i_uid_low = cpu_to_le16(low_16_bits(i_uid));
+ raw_inode->i_gid_low = cpu_to_le16(low_16_bits(i_gid));
/*
* Fix up interoperability with old kernels. Otherwise, old inodes get
* re-used with the upper 16 bits of the uid/gid intact
*/
- if(!ei->i_dtime) {
+ if (!ei->i_dtime) {
raw_inode->i_uid_high =
- cpu_to_le16(high_16_bits(inode->i_uid));
+ cpu_to_le16(high_16_bits(i_uid));
raw_inode->i_gid_high =
- cpu_to_le16(high_16_bits(inode->i_gid));
+ cpu_to_le16(high_16_bits(i_gid));
} else {
raw_inode->i_uid_high = 0;
raw_inode->i_gid_high = 0;
}
} else {
- raw_inode->i_uid_low =
- cpu_to_le16(fs_high2lowuid(inode->i_uid));
- raw_inode->i_gid_low =
- cpu_to_le16(fs_high2lowgid(inode->i_gid));
+ raw_inode->i_uid_low = cpu_to_le16(fs_high2lowuid(i_uid));
+ raw_inode->i_gid_low = cpu_to_le16(fs_high2lowgid(i_gid));
raw_inode->i_uid_high = 0;
raw_inode->i_gid_high = 0;
}
@@ -2973,37 +4356,26 @@ static int ext4_do_update_inode(handle_t *handle,
EXT4_INODE_SET_XTIME(i_atime, inode, raw_inode);
EXT4_EINODE_SET_XTIME(i_crtime, ei, raw_inode);
- if (ext4_inode_blocks_set(handle, raw_inode, ei))
+ if (ext4_inode_blocks_set(handle, raw_inode, ei)) {
+ spin_unlock(&ei->i_raw_lock);
goto out_brelse;
+ }
raw_inode->i_dtime = cpu_to_le32(ei->i_dtime);
- raw_inode->i_flags = cpu_to_le32(ei->i_flags);
- if (EXT4_SB(inode->i_sb)->s_es->s_creator_os !=
- cpu_to_le32(EXT4_OS_HURD))
+ raw_inode->i_flags = cpu_to_le32(ei->i_flags & 0xFFFFFFFF);
+ if (likely(!test_opt2(inode->i_sb, HURD_COMPAT)))
raw_inode->i_file_acl_high =
cpu_to_le16(ei->i_file_acl >> 32);
raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl);
- ext4_isize_set(raw_inode, ei->i_disksize);
+ if (ei->i_disksize != ext4_isize(raw_inode)) {
+ ext4_isize_set(raw_inode, ei->i_disksize);
+ need_datasync = 1;
+ }
if (ei->i_disksize > 0x7fffffffULL) {
- struct super_block *sb = inode->i_sb;
if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
EXT4_FEATURE_RO_COMPAT_LARGE_FILE) ||
EXT4_SB(sb)->s_es->s_rev_level ==
- cpu_to_le32(EXT4_GOOD_OLD_REV)) {
- /* If this is the first large file
- * created, add a flag to the superblock.
- */
- err = ext4_journal_get_write_access(handle,
- EXT4_SB(sb)->s_sbh);
- if (err)
- goto out_brelse;
- ext4_update_dynamic_rev(sb);
- EXT4_SET_RO_COMPAT_FEATURE(sb,
- EXT4_FEATURE_RO_COMPAT_LARGE_FILE);
- sb->s_dirt = 1;
- handle->h_sync = 1;
- err = ext4_journal_dirty_metadata(handle,
- EXT4_SB(sb)->s_sbh);
- }
+ cpu_to_le32(EXT4_GOOD_OLD_REV))
+ set_large_file = 1;
}
raw_inode->i_generation = cpu_to_le32(inode->i_generation);
if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {
@@ -3017,26 +4389,45 @@ static int ext4_do_update_inode(handle_t *handle,
cpu_to_le32(new_encode_dev(inode->i_rdev));
raw_inode->i_block[2] = 0;
}
- } else for (block = 0; block < EXT4_N_BLOCKS; block++)
- raw_inode->i_block[block] = ei->i_data[block];
+ } else if (!ext4_has_inline_data(inode)) {
+ for (block = 0; block < EXT4_N_BLOCKS; block++)
+ raw_inode->i_block[block] = ei->i_data[block];
+ }
- raw_inode->i_disk_version = cpu_to_le32(inode->i_version);
- if (ei->i_extra_isize) {
- if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi))
- raw_inode->i_version_hi =
- cpu_to_le32(inode->i_version >> 32);
- raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize);
+ if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) {
+ raw_inode->i_disk_version = cpu_to_le32(inode->i_version);
+ if (ei->i_extra_isize) {
+ if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi))
+ raw_inode->i_version_hi =
+ cpu_to_le32(inode->i_version >> 32);
+ raw_inode->i_extra_isize =
+ cpu_to_le16(ei->i_extra_isize);
+ }
}
+ ext4_inode_csum_set(inode, raw_inode, ei);
+
+ spin_unlock(&ei->i_raw_lock);
- BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata");
- rc = ext4_journal_dirty_metadata(handle, bh);
+ BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
+ rc = ext4_handle_dirty_metadata(handle, NULL, bh);
if (!err)
err = rc;
- ei->i_state &= ~EXT4_STATE_NEW;
-
+ ext4_clear_inode_state(inode, EXT4_STATE_NEW);
+ if (set_large_file) {
+ BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get write access");
+ err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh);
+ if (err)
+ goto out_brelse;
+ ext4_update_dynamic_rev(sb);
+ EXT4_SET_RO_COMPAT_FEATURE(sb,
+ EXT4_FEATURE_RO_COMPAT_LARGE_FILE);
+ ext4_handle_sync(handle);
+ err = ext4_handle_dirty_super(handle, sb);
+ }
+ ext4_update_inode_fsync_trans(handle, inode, need_datasync);
out_brelse:
- brelse (bh);
+ brelse(bh);
ext4_std_error(inode->i_sb, err);
return err;
}
@@ -3046,21 +4437,20 @@ out_brelse:
*
* We are called from a few places:
*
- * - Within generic_file_write() for O_SYNC files.
+ * - Within generic_file_aio_write() -> generic_write_sync() for O_SYNC files.
* Here, there will be no transaction running. We wait for any running
- * trasnaction to commit.
+ * transaction to commit.
*
- * - Within sys_sync(), kupdate and such.
- * We wait on commit, if tol to.
+ * - Within flush work (sys_sync(), kupdate and such).
+ * We wait on commit, if told to.
*
- * - Within prune_icache() (PF_MEMALLOC == true)
- * Here we simply return. We can't afford to block kswapd on the
- * journal commit.
+ * - Within iput_final() -> write_inode_now()
+ * We wait on commit, if told to.
*
* In all cases it is actually safe for us to return without doing anything,
* because the inode has been copied into a raw inode buffer in
- * ext4_mark_inode_dirty(). This is a correctness thing for O_SYNC and for
- * knfsd.
+ * ext4_mark_inode_dirty(). This is a correctness thing for WB_SYNC_ALL
+ * writeback.
*
* Note that we are absolutely dependent upon all inode dirtiers doing the
* right thing: they *must* call mark_inode_dirty() after dirtying info in
@@ -3072,25 +4462,95 @@ out_brelse:
* stuff();
* inode->i_size = expr;
*
- * is in error because a kswapd-driven write_inode() could occur while
- * `stuff()' is running, and the new i_size will be lost. Plus the inode
- * will no longer be on the superblock's dirty inode list.
+ * is in error because write_inode() could occur while `stuff()' is running,
+ * and the new i_size will be lost. Plus the inode will no longer be on the
+ * superblock's dirty inode list.
*/
-int ext4_write_inode(struct inode *inode, int wait)
+int ext4_write_inode(struct inode *inode, struct writeback_control *wbc)
{
- if (current->flags & PF_MEMALLOC)
+ int err;
+
+ if (WARN_ON_ONCE(current->flags & PF_MEMALLOC))
return 0;
- if (ext4_journal_current_handle()) {
- jbd_debug(1, "called recursively, non-PF_MEMALLOC!\n");
- dump_stack();
- return -EIO;
+ if (EXT4_SB(inode->i_sb)->s_journal) {
+ if (ext4_journal_current_handle()) {
+ jbd_debug(1, "called recursively, non-PF_MEMALLOC!\n");
+ dump_stack();
+ return -EIO;
+ }
+
+ /*
+ * No need to force transaction in WB_SYNC_NONE mode. Also
+ * ext4_sync_fs() will force the commit after everything is
+ * written.
+ */
+ if (wbc->sync_mode != WB_SYNC_ALL || wbc->for_sync)
+ return 0;
+
+ err = ext4_force_commit(inode->i_sb);
+ } else {
+ struct ext4_iloc iloc;
+
+ err = __ext4_get_inode_loc(inode, &iloc, 0);
+ if (err)
+ return err;
+ /*
+ * sync(2) will flush the whole buffer cache. No need to do
+ * it here separately for each inode.
+ */
+ if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync)
+ sync_dirty_buffer(iloc.bh);
+ if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) {
+ EXT4_ERROR_INODE_BLOCK(inode, iloc.bh->b_blocknr,
+ "IO error syncing inode");
+ err = -EIO;
+ }
+ brelse(iloc.bh);
}
+ return err;
+}
- if (!wait)
- return 0;
+/*
+ * In data=journal mode ext4_journalled_invalidatepage() may fail to invalidate
+ * buffers that are attached to a page stradding i_size and are undergoing
+ * commit. In that case we have to wait for commit to finish and try again.
+ */
+static void ext4_wait_for_tail_page_commit(struct inode *inode)
+{
+ struct page *page;
+ unsigned offset;
+ journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
+ tid_t commit_tid = 0;
+ int ret;
- return ext4_force_commit(inode->i_sb);
+ offset = inode->i_size & (PAGE_CACHE_SIZE - 1);
+ /*
+ * All buffers in the last page remain valid? Then there's nothing to
+ * do. We do the check mainly to optimize the common PAGE_CACHE_SIZE ==
+ * blocksize case
+ */
+ if (offset > PAGE_CACHE_SIZE - (1 << inode->i_blkbits))
+ return;
+ while (1) {
+ page = find_lock_page(inode->i_mapping,
+ inode->i_size >> PAGE_CACHE_SHIFT);
+ if (!page)
+ return;
+ ret = __ext4_journalled_invalidatepage(page, offset,
+ PAGE_CACHE_SIZE - offset);
+ unlock_page(page);
+ page_cache_release(page);
+ if (ret != -EBUSY)
+ return;
+ commit_tid = 0;
+ read_lock(&journal->j_state_lock);
+ if (journal->j_committing_transaction)
+ commit_tid = journal->j_committing_transaction->t_tid;
+ read_unlock(&journal->j_state_lock);
+ if (commit_tid)
+ jbd2_log_wait_commit(journal, commit_tid);
+ }
}
/*
@@ -3108,31 +4568,42 @@ int ext4_write_inode(struct inode *inode, int wait)
* be freed, so we have a strong guarantee that no future commit will
* leave these blocks visible to the user.)
*
- * Called with inode->sem down.
+ * Another thing we have to assure is that if we are in ordered mode
+ * and inode is still attached to the committing transaction, we must
+ * we start writeout of all the dirty pages which are being truncated.
+ * This way we are sure that all the data written in the previous
+ * transaction are already on disk (truncate waits for pages under
+ * writeback).
+ *
+ * Called with inode->i_mutex down.
*/
int ext4_setattr(struct dentry *dentry, struct iattr *attr)
{
struct inode *inode = dentry->d_inode;
int error, rc = 0;
+ int orphan = 0;
const unsigned int ia_valid = attr->ia_valid;
error = inode_change_ok(inode, attr);
if (error)
return error;
- if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
- (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) {
+ if (is_quota_modification(inode, attr))
+ dquot_initialize(inode);
+ if ((ia_valid & ATTR_UID && !uid_eq(attr->ia_uid, inode->i_uid)) ||
+ (ia_valid & ATTR_GID && !gid_eq(attr->ia_gid, inode->i_gid))) {
handle_t *handle;
/* (user+group)*(old+new) structure, inode write (sb,
* inode block, ? - but truncate inode update has it) */
- handle = ext4_journal_start(inode, 2*(EXT4_QUOTA_INIT_BLOCKS(inode->i_sb)+
- EXT4_QUOTA_DEL_BLOCKS(inode->i_sb))+3);
+ handle = ext4_journal_start(inode, EXT4_HT_QUOTA,
+ (EXT4_MAXQUOTAS_INIT_BLOCKS(inode->i_sb) +
+ EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb)) + 3);
if (IS_ERR(handle)) {
error = PTR_ERR(handle);
goto err_out;
}
- error = DQUOT_TRANSFER(inode, attr) ? -EDQUOT : 0;
+ error = dquot_transfer(inode, attr);
if (error) {
ext4_journal_stop(handle);
return error;
@@ -3147,45 +4618,97 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
ext4_journal_stop(handle);
}
- if (attr->ia_valid & ATTR_SIZE) {
- if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) {
+ if (attr->ia_valid & ATTR_SIZE && attr->ia_size != inode->i_size) {
+ handle_t *handle;
+
+ if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
- if (attr->ia_size > sbi->s_bitmap_maxbytes) {
- error = -EFBIG;
- goto err_out;
- }
+ if (attr->ia_size > sbi->s_bitmap_maxbytes)
+ return -EFBIG;
}
- }
- if (S_ISREG(inode->i_mode) &&
- attr->ia_valid & ATTR_SIZE && attr->ia_size < inode->i_size) {
- handle_t *handle;
+ if (IS_I_VERSION(inode) && attr->ia_size != inode->i_size)
+ inode_inc_iversion(inode);
- handle = ext4_journal_start(inode, 3);
- if (IS_ERR(handle)) {
- error = PTR_ERR(handle);
- goto err_out;
- }
+ if (S_ISREG(inode->i_mode) &&
+ (attr->ia_size < inode->i_size)) {
+ if (ext4_should_order_data(inode)) {
+ error = ext4_begin_ordered_truncate(inode,
+ attr->ia_size);
+ if (error)
+ goto err_out;
+ }
+ handle = ext4_journal_start(inode, EXT4_HT_INODE, 3);
+ if (IS_ERR(handle)) {
+ error = PTR_ERR(handle);
+ goto err_out;
+ }
+ if (ext4_handle_valid(handle)) {
+ error = ext4_orphan_add(handle, inode);
+ orphan = 1;
+ }
+ down_write(&EXT4_I(inode)->i_data_sem);
+ EXT4_I(inode)->i_disksize = attr->ia_size;
+ rc = ext4_mark_inode_dirty(handle, inode);
+ if (!error)
+ error = rc;
+ /*
+ * We have to update i_size under i_data_sem together
+ * with i_disksize to avoid races with writeback code
+ * running ext4_wb_update_i_disksize().
+ */
+ if (!error)
+ i_size_write(inode, attr->ia_size);
+ up_write(&EXT4_I(inode)->i_data_sem);
+ ext4_journal_stop(handle);
+ if (error) {
+ ext4_orphan_del(NULL, inode);
+ goto err_out;
+ }
+ } else
+ i_size_write(inode, attr->ia_size);
- error = ext4_orphan_add(handle, inode);
- EXT4_I(inode)->i_disksize = attr->ia_size;
- rc = ext4_mark_inode_dirty(handle, inode);
- if (!error)
- error = rc;
- ext4_journal_stop(handle);
+ /*
+ * Blocks are going to be removed from the inode. Wait
+ * for dio in flight. Temporarily disable
+ * dioread_nolock to prevent livelock.
+ */
+ if (orphan) {
+ if (!ext4_should_journal_data(inode)) {
+ ext4_inode_block_unlocked_dio(inode);
+ inode_dio_wait(inode);
+ ext4_inode_resume_unlocked_dio(inode);
+ } else
+ ext4_wait_for_tail_page_commit(inode);
+ }
+ /*
+ * Truncate pagecache after we've waited for commit
+ * in data=journal mode to make pages freeable.
+ */
+ truncate_pagecache(inode, inode->i_size);
}
+ /*
+ * We want to call ext4_truncate() even if attr->ia_size ==
+ * inode->i_size for cases like truncation of fallocated space
+ */
+ if (attr->ia_valid & ATTR_SIZE)
+ ext4_truncate(inode);
- rc = inode_setattr(inode, attr);
+ if (!rc) {
+ setattr_copy(inode, attr);
+ mark_inode_dirty(inode);
+ }
- /* If inode_setattr's call to ext4_truncate failed to get a
- * transaction handle at all, we need to clean up the in-core
- * orphan list manually. */
- if (inode->i_nlink)
+ /*
+ * If the call to ext4_truncate failed to get a transaction handle at
+ * all, we need to clean up the in-core orphan list manually.
+ */
+ if (orphan && inode->i_nlink)
ext4_orphan_del(NULL, inode);
if (!rc && (ia_valid & ATTR_MODE))
- rc = ext4_acl_chmod(inode);
+ rc = posix_acl_chmod(inode, inode->i_mode);
err_out:
ext4_std_error(inode->i_sb, error);
@@ -3194,67 +4717,142 @@ err_out:
return error;
}
+int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,
+ struct kstat *stat)
+{
+ struct inode *inode;
+ unsigned long long delalloc_blocks;
+
+ inode = dentry->d_inode;
+ generic_fillattr(inode, stat);
+
+ /*
+ * If there is inline data in the inode, the inode will normally not
+ * have data blocks allocated (it may have an external xattr block).
+ * Report at least one sector for such files, so tools like tar, rsync,
+ * others doen't incorrectly think the file is completely sparse.
+ */
+ if (unlikely(ext4_has_inline_data(inode)))
+ stat->blocks += (stat->size + 511) >> 9;
+
+ /*
+ * We can't update i_blocks if the block allocation is delayed
+ * otherwise in the case of system crash before the real block
+ * allocation is done, we will have i_blocks inconsistent with
+ * on-disk file blocks.
+ * We always keep i_blocks updated together with real
+ * allocation. But to not confuse with user, stat
+ * will return the blocks that include the delayed allocation
+ * blocks for this file.
+ */
+ delalloc_blocks = EXT4_C2B(EXT4_SB(inode->i_sb),
+ EXT4_I(inode)->i_reserved_data_blocks);
+ stat->blocks += delalloc_blocks << (inode->i_sb->s_blocksize_bits - 9);
+ return 0;
+}
+
+static int ext4_index_trans_blocks(struct inode *inode, int lblocks,
+ int pextents)
+{
+ if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
+ return ext4_ind_trans_blocks(inode, lblocks);
+ return ext4_ext_index_trans_blocks(inode, pextents);
+}
/*
- * How many blocks doth make a writepage()?
- *
- * With N blocks per page, it may be:
- * N data blocks
- * 2 indirect block
- * 2 dindirect
- * 1 tindirect
- * N+5 bitmap blocks (from the above)
- * N+5 group descriptor summary blocks
- * 1 inode block
- * 1 superblock.
- * 2 * EXT4_SINGLEDATA_TRANS_BLOCKS for the quote files
- *
- * 3 * (N + 5) + 2 + 2 * EXT4_SINGLEDATA_TRANS_BLOCKS
- *
- * With ordered or writeback data it's the same, less the N data blocks.
- *
- * If the inode's direct blocks can hold an integral number of pages then a
- * page cannot straddle two indirect blocks, and we can only touch one indirect
- * and dindirect block, and the "5" above becomes "3".
- *
- * This still overestimates under most circumstances. If we were to pass the
- * start and end offsets in here as well we could do block_to_path() on each
- * block and work out the exact number of indirects which are touched. Pah.
+ * Account for index blocks, block groups bitmaps and block group
+ * descriptor blocks if modify datablocks and index blocks
+ * worse case, the indexs blocks spread over different block groups
+ *
+ * If datablocks are discontiguous, they are possible to spread over
+ * different block groups too. If they are contiguous, with flexbg,
+ * they could still across block group boundary.
+ *
+ * Also account for superblock, inode, quota and xattr blocks
*/
+static int ext4_meta_trans_blocks(struct inode *inode, int lblocks,
+ int pextents)
+{
+ ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb);
+ int gdpblocks;
+ int idxblocks;
+ int ret = 0;
+
+ /*
+ * How many index blocks need to touch to map @lblocks logical blocks
+ * to @pextents physical extents?
+ */
+ idxblocks = ext4_index_trans_blocks(inode, lblocks, pextents);
+
+ ret = idxblocks;
+ /*
+ * Now let's see how many group bitmaps and group descriptors need
+ * to account
+ */
+ groups = idxblocks + pextents;
+ gdpblocks = groups;
+ if (groups > ngroups)
+ groups = ngroups;
+ if (groups > EXT4_SB(inode->i_sb)->s_gdb_count)
+ gdpblocks = EXT4_SB(inode->i_sb)->s_gdb_count;
+
+ /* bitmaps and block group descriptor blocks */
+ ret += groups + gdpblocks;
+
+ /* Blocks for super block, inode, quota and xattr blocks */
+ ret += EXT4_META_TRANS_BLOCKS(inode->i_sb);
+
+ return ret;
+}
+
+/*
+ * Calculate the total number of credits to reserve to fit
+ * the modification of a single pages into a single transaction,
+ * which may include multiple chunks of block allocations.
+ *
+ * This could be called via ext4_write_begin()
+ *
+ * We need to consider the worse case, when
+ * one new block per extent.
+ */
int ext4_writepage_trans_blocks(struct inode *inode)
{
int bpp = ext4_journal_blocks_per_page(inode);
- int indirects = (EXT4_NDIR_BLOCKS % bpp) ? 5 : 3;
int ret;
- if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)
- return ext4_ext_writepage_trans_blocks(inode, bpp);
+ ret = ext4_meta_trans_blocks(inode, bpp, bpp);
+ /* Account for data blocks for journalled mode */
if (ext4_should_journal_data(inode))
- ret = 3 * (bpp + indirects) + 2;
- else
- ret = 2 * (bpp + indirects) + 2;
-
-#ifdef CONFIG_QUOTA
- /* We know that structure was already allocated during DQUOT_INIT so
- * we will be updating only the data blocks + inodes */
- ret += 2*EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb);
-#endif
-
+ ret += bpp;
return ret;
}
/*
+ * Calculate the journal credits for a chunk of data modification.
+ *
+ * This is called from DIO, fallocate or whoever calling
+ * ext4_map_blocks() to map/allocate a chunk of contiguous disk blocks.
+ *
+ * journal buffers for data blocks are not included here, as DIO
+ * and fallocate do no need to journal data buffers.
+ */
+int ext4_chunk_trans_blocks(struct inode *inode, int nrblocks)
+{
+ return ext4_meta_trans_blocks(inode, nrblocks, 1);
+}
+
+/*
* The caller must have previously called ext4_reserve_inode_write().
* Give this, we know that the caller already has write access to iloc->bh.
*/
int ext4_mark_iloc_dirty(handle_t *handle,
- struct inode *inode, struct ext4_iloc *iloc)
+ struct inode *inode, struct ext4_iloc *iloc)
{
int err = 0;
- if (test_opt(inode->i_sb, I_VERSION))
+ if (IS_I_VERSION(inode))
inode_inc_iversion(inode);
/* the do_update_inode consumes one bh->b_count */
@@ -3275,16 +4873,15 @@ int
ext4_reserve_inode_write(handle_t *handle, struct inode *inode,
struct ext4_iloc *iloc)
{
- int err = 0;
- if (handle) {
- err = ext4_get_inode_loc(inode, iloc);
- if (!err) {
- BUFFER_TRACE(iloc->bh, "get_write_access");
- err = ext4_journal_get_write_access(handle, iloc->bh);
- if (err) {
- brelse(iloc->bh);
- iloc->bh = NULL;
- }
+ int err;
+
+ err = ext4_get_inode_loc(inode, iloc);
+ if (!err) {
+ BUFFER_TRACE(iloc->bh, "get_write_access");
+ err = ext4_journal_get_write_access(handle, iloc->bh);
+ if (err) {
+ brelse(iloc->bh);
+ iloc->bh = NULL;
}
}
ext4_std_error(inode->i_sb, err);
@@ -3302,7 +4899,6 @@ static int ext4_expand_extra_isize(struct inode *inode,
{
struct ext4_inode *raw_inode;
struct ext4_xattr_ibody_header *header;
- struct ext4_xattr_entry *entry;
if (EXT4_I(inode)->i_extra_isize >= new_extra_isize)
return 0;
@@ -3310,11 +4906,10 @@ static int ext4_expand_extra_isize(struct inode *inode,
raw_inode = ext4_raw_inode(&iloc);
header = IHDR(inode, raw_inode);
- entry = IFIRST(header);
/* No extended attributes present */
- if (!(EXT4_I(inode)->i_state & EXT4_STATE_XATTR) ||
- header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC)) {
+ if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR) ||
+ header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC)) {
memset((void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE, 0,
new_extra_isize);
EXT4_I(inode)->i_extra_isize = new_extra_isize;
@@ -3338,14 +4933,6 @@ static int ext4_expand_extra_isize(struct inode *inode,
* inode out, but prune_icache isn't a user-visible syncing function.
* Whenever the user wants stuff synced (sys_sync, sys_msync, sys_fsync)
* we start and wait on commits.
- *
- * Is this efficient/effective? Well, we're being nice to the system
- * by cleaning up our inodes proactively so they can be reaped
- * without I/O. But we are potentially leaving up to five seconds'
- * worth of inodes floating about which prune_icache wants us to
- * write out. One way to fix that would be to get prune_icache()
- * to do a write_super() to free up some memory. It has the desired
- * effect.
*/
int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
{
@@ -3355,9 +4942,11 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
int err, ret;
might_sleep();
+ trace_ext4_mark_inode_dirty(inode, _RET_IP_);
err = ext4_reserve_inode_write(handle, inode, &iloc);
- if (EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize &&
- !(EXT4_I(inode)->i_state & EXT4_STATE_NO_EXPAND)) {
+ if (ext4_handle_valid(handle) &&
+ EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize &&
+ !ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND)) {
/*
* We need extra buffer credits since we may write into EA block
* with this same handle. If journal_extend fails, then it will
@@ -3371,10 +4960,11 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
sbi->s_want_extra_isize,
iloc, handle);
if (ret) {
- EXT4_I(inode)->i_state |= EXT4_STATE_NO_EXPAND;
+ ext4_set_inode_state(inode,
+ EXT4_STATE_NO_EXPAND);
if (mnt_count !=
le16_to_cpu(sbi->s_es->s_mnt_count)) {
- ext4_warning(inode->i_sb, __FUNCTION__,
+ ext4_warning(inode->i_sb,
"Unable to expand inode %lu. Delete"
" some EAs or run e2fsck.",
inode->i_ino);
@@ -3396,31 +4986,23 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
* i_size has been changed by generic_commit_write() and we thus need
* to include the updated inode in the current transaction.
*
- * Also, DQUOT_ALLOC_SPACE() will always dirty the inode when blocks
+ * Also, dquot_alloc_block() will always dirty the inode when blocks
* are allocated to the file.
*
* If the inode is marked synchronous, we don't honour that here - doing
* so would cause a commit on atime updates, which we don't bother doing.
* We handle synchronous inodes at the highest possible level.
*/
-void ext4_dirty_inode(struct inode *inode)
+void ext4_dirty_inode(struct inode *inode, int flags)
{
- handle_t *current_handle = ext4_journal_current_handle();
handle_t *handle;
- handle = ext4_journal_start(inode, 2);
+ handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
if (IS_ERR(handle))
goto out;
- if (current_handle &&
- current_handle->h_transaction != handle->h_transaction) {
- /* This task has a transaction open against a different fs */
- printk(KERN_EMERG "%s: transactions do not match!\n",
- __FUNCTION__);
- } else {
- jbd_debug(5, "marking dirty. outer handle=%p\n",
- current_handle);
- ext4_mark_inode_dirty(handle, inode);
- }
+
+ ext4_mark_inode_dirty(handle, inode);
+
ext4_journal_stop(handle);
out:
return;
@@ -3445,8 +5027,9 @@ static int ext4_pin_inode(handle_t *handle, struct inode *inode)
BUFFER_TRACE(iloc.bh, "get_write_access");
err = jbd2_journal_get_write_access(handle, iloc.bh);
if (!err)
- err = ext4_journal_dirty_metadata(handle,
- iloc.bh);
+ err = ext4_handle_dirty_metadata(handle,
+ NULL,
+ iloc.bh);
brelse(iloc.bh);
}
}
@@ -3472,11 +5055,27 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
*/
journal = EXT4_JOURNAL(inode);
+ if (!journal)
+ return 0;
if (is_journal_aborted(journal))
return -EROFS;
+ /* We have to allocate physical blocks for delalloc blocks
+ * before flushing journal. otherwise delalloc blocks can not
+ * be allocated any more. even more truncate on delalloc blocks
+ * could trigger BUG by flushing delalloc blocks in journal.
+ * There is no delalloc block in non-journal data mode.
+ */
+ if (val && test_opt(inode->i_sb, DELALLOC)) {
+ err = ext4_alloc_da_blocks(inode);
+ if (err < 0)
+ return err;
+ }
+
+ /* Wait for all existing dio workers */
+ ext4_inode_block_unlocked_dio(inode);
+ inode_dio_wait(inode);
jbd2_journal_lock_updates(journal);
- jbd2_journal_flush(journal);
/*
* OK, there are no updates running now, and all cached data is
@@ -3487,23 +5086,119 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
*/
if (val)
- EXT4_I(inode)->i_flags |= EXT4_JOURNAL_DATA_FL;
- else
- EXT4_I(inode)->i_flags &= ~EXT4_JOURNAL_DATA_FL;
+ ext4_set_inode_flag(inode, EXT4_INODE_JOURNAL_DATA);
+ else {
+ jbd2_journal_flush(journal);
+ ext4_clear_inode_flag(inode, EXT4_INODE_JOURNAL_DATA);
+ }
ext4_set_aops(inode);
jbd2_journal_unlock_updates(journal);
+ ext4_inode_resume_unlocked_dio(inode);
/* Finally we can mark the inode as dirty. */
- handle = ext4_journal_start(inode, 1);
+ handle = ext4_journal_start(inode, EXT4_HT_INODE, 1);
if (IS_ERR(handle))
return PTR_ERR(handle);
err = ext4_mark_inode_dirty(handle, inode);
- handle->h_sync = 1;
+ ext4_handle_sync(handle);
ext4_journal_stop(handle);
ext4_std_error(inode->i_sb, err);
return err;
}
+
+static int ext4_bh_unmapped(handle_t *handle, struct buffer_head *bh)
+{
+ return !buffer_mapped(bh);
+}
+
+int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+ struct page *page = vmf->page;
+ loff_t size;
+ unsigned long len;
+ int ret;
+ struct file *file = vma->vm_file;
+ struct inode *inode = file_inode(file);
+ struct address_space *mapping = inode->i_mapping;
+ handle_t *handle;
+ get_block_t *get_block;
+ int retries = 0;
+
+ sb_start_pagefault(inode->i_sb);
+ file_update_time(vma->vm_file);
+ /* Delalloc case is easy... */
+ if (test_opt(inode->i_sb, DELALLOC) &&
+ !ext4_should_journal_data(inode) &&
+ !ext4_nonda_switch(inode->i_sb)) {
+ do {
+ ret = __block_page_mkwrite(vma, vmf,
+ ext4_da_get_block_prep);
+ } while (ret == -ENOSPC &&
+ ext4_should_retry_alloc(inode->i_sb, &retries));
+ goto out_ret;
+ }
+
+ lock_page(page);
+ size = i_size_read(inode);
+ /* Page got truncated from under us? */
+ if (page->mapping != mapping || page_offset(page) > size) {
+ unlock_page(page);
+ ret = VM_FAULT_NOPAGE;
+ goto out;
+ }
+
+ if (page->index == size >> PAGE_CACHE_SHIFT)
+ len = size & ~PAGE_CACHE_MASK;
+ else
+ len = PAGE_CACHE_SIZE;
+ /*
+ * Return if we have all the buffers mapped. This avoids the need to do
+ * journal_start/journal_stop which can block and take a long time
+ */
+ if (page_has_buffers(page)) {
+ if (!ext4_walk_page_buffers(NULL, page_buffers(page),
+ 0, len, NULL,
+ ext4_bh_unmapped)) {
+ /* Wait so that we don't change page under IO */
+ wait_for_stable_page(page);
+ ret = VM_FAULT_LOCKED;
+ goto out;
+ }
+ }
+ unlock_page(page);
+ /* OK, we need to fill the hole... */
+ if (ext4_should_dioread_nolock(inode))
+ get_block = ext4_get_block_write;
+ else
+ get_block = ext4_get_block;
+retry_alloc:
+ handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE,
+ ext4_writepage_trans_blocks(inode));
+ if (IS_ERR(handle)) {
+ ret = VM_FAULT_SIGBUS;
+ goto out;
+ }
+ ret = __block_page_mkwrite(vma, vmf, get_block);
+ if (!ret && ext4_should_journal_data(inode)) {
+ if (ext4_walk_page_buffers(handle, page_buffers(page), 0,
+ PAGE_CACHE_SIZE, NULL, do_journal_get_write_access)) {
+ unlock_page(page);
+ ret = VM_FAULT_SIGBUS;
+ ext4_journal_stop(handle);
+ goto out;
+ }
+ ext4_set_inode_state(inode, EXT4_STATE_JDATA);
+ }
+ ext4_journal_stop(handle);
+ if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
+ goto retry_alloc;
+out_ret:
+ ret = block_page_mkwrite_return(ret);
+out:
+ sb_end_pagefault(inode->i_sb);
+ return ret;
+}
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 2ed7c37f897..0f2252ec274 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -10,21 +10,202 @@
#include <linux/fs.h>
#include <linux/jbd2.h>
#include <linux/capability.h>
-#include <linux/ext4_fs.h>
-#include <linux/ext4_jbd2.h>
#include <linux/time.h>
#include <linux/compat.h>
-#include <linux/smp_lock.h>
+#include <linux/mount.h>
+#include <linux/file.h>
#include <asm/uaccess.h>
+#include "ext4_jbd2.h"
+#include "ext4.h"
-int ext4_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
- unsigned long arg)
+#define MAX_32_NUM ((((unsigned long long) 1) << 32) - 1)
+
+/**
+ * Swap memory between @a and @b for @len bytes.
+ *
+ * @a: pointer to first memory area
+ * @b: pointer to second memory area
+ * @len: number of bytes to swap
+ *
+ */
+static void memswap(void *a, void *b, size_t len)
+{
+ unsigned char *ap, *bp;
+ unsigned char tmp;
+
+ ap = (unsigned char *)a;
+ bp = (unsigned char *)b;
+ while (len-- > 0) {
+ tmp = *ap;
+ *ap = *bp;
+ *bp = tmp;
+ ap++;
+ bp++;
+ }
+}
+
+/**
+ * Swap i_data and associated attributes between @inode1 and @inode2.
+ * This function is used for the primary swap between inode1 and inode2
+ * and also to revert this primary swap in case of errors.
+ *
+ * Therefore you have to make sure, that calling this method twice
+ * will revert all changes.
+ *
+ * @inode1: pointer to first inode
+ * @inode2: pointer to second inode
+ */
+static void swap_inode_data(struct inode *inode1, struct inode *inode2)
+{
+ loff_t isize;
+ struct ext4_inode_info *ei1;
+ struct ext4_inode_info *ei2;
+
+ ei1 = EXT4_I(inode1);
+ ei2 = EXT4_I(inode2);
+
+ memswap(&inode1->i_flags, &inode2->i_flags, sizeof(inode1->i_flags));
+ memswap(&inode1->i_version, &inode2->i_version,
+ sizeof(inode1->i_version));
+ memswap(&inode1->i_blocks, &inode2->i_blocks,
+ sizeof(inode1->i_blocks));
+ memswap(&inode1->i_bytes, &inode2->i_bytes, sizeof(inode1->i_bytes));
+ memswap(&inode1->i_atime, &inode2->i_atime, sizeof(inode1->i_atime));
+ memswap(&inode1->i_mtime, &inode2->i_mtime, sizeof(inode1->i_mtime));
+
+ memswap(ei1->i_data, ei2->i_data, sizeof(ei1->i_data));
+ memswap(&ei1->i_flags, &ei2->i_flags, sizeof(ei1->i_flags));
+ memswap(&ei1->i_disksize, &ei2->i_disksize, sizeof(ei1->i_disksize));
+ ext4_es_remove_extent(inode1, 0, EXT_MAX_BLOCKS);
+ ext4_es_remove_extent(inode2, 0, EXT_MAX_BLOCKS);
+ ext4_es_lru_del(inode1);
+ ext4_es_lru_del(inode2);
+
+ isize = i_size_read(inode1);
+ i_size_write(inode1, i_size_read(inode2));
+ i_size_write(inode2, isize);
+}
+
+/**
+ * Swap the information from the given @inode and the inode
+ * EXT4_BOOT_LOADER_INO. It will basically swap i_data and all other
+ * important fields of the inodes.
+ *
+ * @sb: the super block of the filesystem
+ * @inode: the inode to swap with EXT4_BOOT_LOADER_INO
+ *
+ */
+static long swap_inode_boot_loader(struct super_block *sb,
+ struct inode *inode)
+{
+ handle_t *handle;
+ int err;
+ struct inode *inode_bl;
+ struct ext4_inode_info *ei_bl;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+ if (inode->i_nlink != 1 || !S_ISREG(inode->i_mode))
+ return -EINVAL;
+
+ if (!inode_owner_or_capable(inode) || !capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ inode_bl = ext4_iget(sb, EXT4_BOOT_LOADER_INO);
+ if (IS_ERR(inode_bl))
+ return PTR_ERR(inode_bl);
+ ei_bl = EXT4_I(inode_bl);
+
+ filemap_flush(inode->i_mapping);
+ filemap_flush(inode_bl->i_mapping);
+
+ /* Protect orig inodes against a truncate and make sure,
+ * that only 1 swap_inode_boot_loader is running. */
+ lock_two_nondirectories(inode, inode_bl);
+
+ truncate_inode_pages(&inode->i_data, 0);
+ truncate_inode_pages(&inode_bl->i_data, 0);
+
+ /* Wait for all existing dio workers */
+ ext4_inode_block_unlocked_dio(inode);
+ ext4_inode_block_unlocked_dio(inode_bl);
+ inode_dio_wait(inode);
+ inode_dio_wait(inode_bl);
+
+ handle = ext4_journal_start(inode_bl, EXT4_HT_MOVE_EXTENTS, 2);
+ if (IS_ERR(handle)) {
+ err = -EINVAL;
+ goto journal_err_out;
+ }
+
+ /* Protect extent tree against block allocations via delalloc */
+ ext4_double_down_write_data_sem(inode, inode_bl);
+
+ if (inode_bl->i_nlink == 0) {
+ /* this inode has never been used as a BOOT_LOADER */
+ set_nlink(inode_bl, 1);
+ i_uid_write(inode_bl, 0);
+ i_gid_write(inode_bl, 0);
+ inode_bl->i_flags = 0;
+ ei_bl->i_flags = 0;
+ inode_bl->i_version = 1;
+ i_size_write(inode_bl, 0);
+ inode_bl->i_mode = S_IFREG;
+ if (EXT4_HAS_INCOMPAT_FEATURE(sb,
+ EXT4_FEATURE_INCOMPAT_EXTENTS)) {
+ ext4_set_inode_flag(inode_bl, EXT4_INODE_EXTENTS);
+ ext4_ext_tree_init(handle, inode_bl);
+ } else
+ memset(ei_bl->i_data, 0, sizeof(ei_bl->i_data));
+ }
+
+ swap_inode_data(inode, inode_bl);
+
+ inode->i_ctime = inode_bl->i_ctime = ext4_current_time(inode);
+
+ spin_lock(&sbi->s_next_gen_lock);
+ inode->i_generation = sbi->s_next_generation++;
+ inode_bl->i_generation = sbi->s_next_generation++;
+ spin_unlock(&sbi->s_next_gen_lock);
+
+ ext4_discard_preallocations(inode);
+
+ err = ext4_mark_inode_dirty(handle, inode);
+ if (err < 0) {
+ ext4_warning(inode->i_sb,
+ "couldn't mark inode #%lu dirty (err %d)",
+ inode->i_ino, err);
+ /* Revert all changes: */
+ swap_inode_data(inode, inode_bl);
+ } else {
+ err = ext4_mark_inode_dirty(handle, inode_bl);
+ if (err < 0) {
+ ext4_warning(inode_bl->i_sb,
+ "couldn't mark inode #%lu dirty (err %d)",
+ inode_bl->i_ino, err);
+ /* Revert all changes: */
+ swap_inode_data(inode, inode_bl);
+ ext4_mark_inode_dirty(handle, inode);
+ }
+ }
+ ext4_journal_stop(handle);
+ ext4_double_up_write_data_sem(inode, inode_bl);
+
+journal_err_out:
+ ext4_inode_resume_unlocked_dio(inode);
+ ext4_inode_resume_unlocked_dio(inode_bl);
+ unlock_two_nondirectories(inode, inode_bl);
+ iput(inode_bl);
+ return err;
+}
+
+long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
+ struct inode *inode = file_inode(filp);
+ struct super_block *sb = inode->i_sb;
struct ext4_inode_info *ei = EXT4_I(inode);
unsigned int flags;
- unsigned short rsv_window_size;
- ext4_debug ("cmd = %u, arg = %lu\n", cmd, arg);
+ ext4_debug("cmd = %u, arg = %lu\n", cmd, arg);
switch (cmd) {
case EXT4_IOC_GETFLAGS:
@@ -33,29 +214,29 @@ int ext4_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
return put_user(flags, (int __user *) arg);
case EXT4_IOC_SETFLAGS: {
handle_t *handle = NULL;
- int err;
+ int err, migrate = 0;
struct ext4_iloc iloc;
- unsigned int oldflags;
+ unsigned int oldflags, mask, i;
unsigned int jflag;
- if (IS_RDONLY(inode))
- return -EROFS;
-
- if (!is_owner_or_cap(inode))
+ if (!inode_owner_or_capable(inode))
return -EACCES;
if (get_user(flags, (int __user *) arg))
return -EFAULT;
- if (!S_ISDIR(inode->i_mode))
- flags &= ~EXT4_DIRSYNC_FL;
+ err = mnt_want_write_file(filp);
+ if (err)
+ return err;
+
+ flags = ext4_mask_flags(inode->i_mode, flags);
+ err = -EPERM;
mutex_lock(&inode->i_mutex);
/* Is it quota file? Do not allow user to mess with it */
- if (IS_NOQUOTA(inode)) {
- mutex_unlock(&inode->i_mutex);
- return -EPERM;
- }
+ if (IS_NOQUOTA(inode))
+ goto flags_out;
+
oldflags = ei->i_flags;
/* The JOURNAL_DATA flag is modifiable only by root */
@@ -68,10 +249,8 @@ int ext4_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
* This test looks nicer. Thanks to Pauline Middelink
*/
if ((flags ^ oldflags) & (EXT4_APPEND_FL | EXT4_IMMUTABLE_FL)) {
- if (!capable(CAP_LINUX_IMMUTABLE)) {
- mutex_unlock(&inode->i_mutex);
- return -EPERM;
- }
+ if (!capable(CAP_LINUX_IMMUTABLE))
+ goto flags_out;
}
/*
@@ -79,27 +258,40 @@ int ext4_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
* the relevant capability.
*/
if ((jflag ^ oldflags) & (EXT4_JOURNAL_DATA_FL)) {
- if (!capable(CAP_SYS_RESOURCE)) {
- mutex_unlock(&inode->i_mutex);
- return -EPERM;
- }
+ if (!capable(CAP_SYS_RESOURCE))
+ goto flags_out;
}
+ if ((flags ^ oldflags) & EXT4_EXTENTS_FL)
+ migrate = 1;
+
+ if (flags & EXT4_EOFBLOCKS_FL) {
+ /* we don't support adding EOFBLOCKS flag */
+ if (!(oldflags & EXT4_EOFBLOCKS_FL)) {
+ err = -EOPNOTSUPP;
+ goto flags_out;
+ }
+ } else if (oldflags & EXT4_EOFBLOCKS_FL)
+ ext4_truncate(inode);
-
- handle = ext4_journal_start(inode, 1);
+ handle = ext4_journal_start(inode, EXT4_HT_INODE, 1);
if (IS_ERR(handle)) {
- mutex_unlock(&inode->i_mutex);
- return PTR_ERR(handle);
+ err = PTR_ERR(handle);
+ goto flags_out;
}
if (IS_SYNC(inode))
- handle->h_sync = 1;
+ ext4_handle_sync(handle);
err = ext4_reserve_inode_write(handle, inode, &iloc);
if (err)
goto flags_err;
- flags = flags & EXT4_FL_USER_MODIFIABLE;
- flags |= oldflags & ~EXT4_FL_USER_MODIFIABLE;
- ei->i_flags = flags;
+ for (i = 0, mask = 1; i < 32; i++, mask <<= 1) {
+ if (!(mask & EXT4_FL_USER_MODIFIABLE))
+ continue;
+ if (mask & flags)
+ ext4_set_inode_flag(inode, i);
+ else
+ ext4_clear_inode_flag(inode, i);
+ }
ext4_set_inode_flags(inode);
inode->i_ctime = ext4_current_time(inode);
@@ -107,14 +299,23 @@ int ext4_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
err = ext4_mark_iloc_dirty(handle, inode, &iloc);
flags_err:
ext4_journal_stop(handle);
- if (err) {
- mutex_unlock(&inode->i_mutex);
- return err;
- }
+ if (err)
+ goto flags_out;
if ((jflag ^ oldflags) & (EXT4_JOURNAL_DATA_FL))
err = ext4_change_inode_journal_flag(inode, jflag);
+ if (err)
+ goto flags_out;
+ if (migrate) {
+ if (flags & EXT4_EXTENTS_FL)
+ err = ext4_ext_migrate(inode);
+ else
+ err = ext4_ind_migrate(inode);
+ }
+
+flags_out:
mutex_unlock(&inode->i_mutex);
+ mnt_drop_write_file(filp);
return err;
}
case EXT4_IOC_GETVERSION:
@@ -127,16 +328,30 @@ flags_err:
__u32 generation;
int err;
- if (!is_owner_or_cap(inode))
+ if (!inode_owner_or_capable(inode))
return -EPERM;
- if (IS_RDONLY(inode))
- return -EROFS;
- if (get_user(generation, (int __user *) arg))
- return -EFAULT;
- handle = ext4_journal_start(inode, 1);
- if (IS_ERR(handle))
- return PTR_ERR(handle);
+ if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
+ EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) {
+ ext4_warning(sb, "Setting inode version is not "
+ "supported with metadata_csum enabled.");
+ return -ENOTTY;
+ }
+
+ err = mnt_want_write_file(filp);
+ if (err)
+ return err;
+ if (get_user(generation, (int __user *) arg)) {
+ err = -EFAULT;
+ goto setversion_out;
+ }
+
+ mutex_lock(&inode->i_mutex);
+ handle = ext4_journal_start(inode, EXT4_HT_INODE, 1);
+ if (IS_ERR(handle)) {
+ err = PTR_ERR(handle);
+ goto unlock_out;
+ }
err = ext4_reserve_inode_write(handle, inode, &iloc);
if (err == 0) {
inode->i_ctime = ext4_current_time(inode);
@@ -144,118 +359,257 @@ flags_err:
err = ext4_mark_iloc_dirty(handle, inode, &iloc);
}
ext4_journal_stop(handle);
+
+unlock_out:
+ mutex_unlock(&inode->i_mutex);
+setversion_out:
+ mnt_drop_write_file(filp);
return err;
}
-#ifdef CONFIG_JBD2_DEBUG
- case EXT4_IOC_WAIT_FOR_READONLY:
- /*
- * This is racy - by the time we're woken up and running,
- * the superblock could be released. And the module could
- * have been unloaded. So sue me.
- *
- * Returns 1 if it slept, else zero.
- */
- {
- struct super_block *sb = inode->i_sb;
- DECLARE_WAITQUEUE(wait, current);
- int ret = 0;
-
- set_current_state(TASK_INTERRUPTIBLE);
- add_wait_queue(&EXT4_SB(sb)->ro_wait_queue, &wait);
- if (timer_pending(&EXT4_SB(sb)->turn_ro_timer)) {
- schedule();
- ret = 1;
- }
- remove_wait_queue(&EXT4_SB(sb)->ro_wait_queue, &wait);
- return ret;
+ case EXT4_IOC_GROUP_EXTEND: {
+ ext4_fsblk_t n_blocks_count;
+ int err, err2=0;
+
+ err = ext4_resize_begin(sb);
+ if (err)
+ return err;
+
+ if (get_user(n_blocks_count, (__u32 __user *)arg)) {
+ err = -EFAULT;
+ goto group_extend_out;
}
-#endif
- case EXT4_IOC_GETRSVSZ:
- if (test_opt(inode->i_sb, RESERVATION)
- && S_ISREG(inode->i_mode)
- && ei->i_block_alloc_info) {
- rsv_window_size = ei->i_block_alloc_info->rsv_window_node.rsv_goal_size;
- return put_user(rsv_window_size, (int __user *)arg);
+
+ if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
+ EXT4_FEATURE_RO_COMPAT_BIGALLOC)) {
+ ext4_msg(sb, KERN_ERR,
+ "Online resizing not supported with bigalloc");
+ err = -EOPNOTSUPP;
+ goto group_extend_out;
}
- return -ENOTTY;
- case EXT4_IOC_SETRSVSZ: {
- if (!test_opt(inode->i_sb, RESERVATION) ||!S_ISREG(inode->i_mode))
- return -ENOTTY;
+ err = mnt_want_write_file(filp);
+ if (err)
+ goto group_extend_out;
- if (IS_RDONLY(inode))
- return -EROFS;
+ err = ext4_group_extend(sb, EXT4_SB(sb)->s_es, n_blocks_count);
+ if (EXT4_SB(sb)->s_journal) {
+ jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
+ err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal);
+ jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
+ }
+ if (err == 0)
+ err = err2;
+ mnt_drop_write_file(filp);
+group_extend_out:
+ ext4_resize_end(sb);
+ return err;
+ }
- if (!is_owner_or_cap(inode))
- return -EACCES;
+ case EXT4_IOC_MOVE_EXT: {
+ struct move_extent me;
+ struct fd donor;
+ int err;
- if (get_user(rsv_window_size, (int __user *)arg))
+ if (!(filp->f_mode & FMODE_READ) ||
+ !(filp->f_mode & FMODE_WRITE))
+ return -EBADF;
+
+ if (copy_from_user(&me,
+ (struct move_extent __user *)arg, sizeof(me)))
return -EFAULT;
+ me.moved_len = 0;
- if (rsv_window_size > EXT4_MAX_RESERVE_BLOCKS)
- rsv_window_size = EXT4_MAX_RESERVE_BLOCKS;
+ donor = fdget(me.donor_fd);
+ if (!donor.file)
+ return -EBADF;
- /*
- * need to allocate reservation structure for this inode
- * before set the window size
- */
- down_write(&ei->i_data_sem);
- if (!ei->i_block_alloc_info)
- ext4_init_block_alloc_info(inode);
+ if (!(donor.file->f_mode & FMODE_WRITE)) {
+ err = -EBADF;
+ goto mext_out;
+ }
- if (ei->i_block_alloc_info){
- struct ext4_reserve_window_node *rsv = &ei->i_block_alloc_info->rsv_window_node;
- rsv->rsv_goal_size = rsv_window_size;
+ if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
+ EXT4_FEATURE_RO_COMPAT_BIGALLOC)) {
+ ext4_msg(sb, KERN_ERR,
+ "Online defrag not supported with bigalloc");
+ err = -EOPNOTSUPP;
+ goto mext_out;
}
- up_write(&ei->i_data_sem);
- return 0;
+
+ err = mnt_want_write_file(filp);
+ if (err)
+ goto mext_out;
+
+ err = ext4_move_extents(filp, donor.file, me.orig_start,
+ me.donor_start, me.len, &me.moved_len);
+ mnt_drop_write_file(filp);
+
+ if (copy_to_user((struct move_extent __user *)arg,
+ &me, sizeof(me)))
+ err = -EFAULT;
+mext_out:
+ fdput(donor);
+ return err;
}
- case EXT4_IOC_GROUP_EXTEND: {
- ext4_fsblk_t n_blocks_count;
- struct super_block *sb = inode->i_sb;
- int err;
- if (!capable(CAP_SYS_RESOURCE))
- return -EPERM;
+ case EXT4_IOC_GROUP_ADD: {
+ struct ext4_new_group_data input;
+ int err, err2=0;
- if (IS_RDONLY(inode))
- return -EROFS;
+ err = ext4_resize_begin(sb);
+ if (err)
+ return err;
- if (get_user(n_blocks_count, (__u32 __user *)arg))
- return -EFAULT;
+ if (copy_from_user(&input, (struct ext4_new_group_input __user *)arg,
+ sizeof(input))) {
+ err = -EFAULT;
+ goto group_add_out;
+ }
- err = ext4_group_extend(sb, EXT4_SB(sb)->s_es, n_blocks_count);
- jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
- jbd2_journal_flush(EXT4_SB(sb)->s_journal);
- jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
+ if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
+ EXT4_FEATURE_RO_COMPAT_BIGALLOC)) {
+ ext4_msg(sb, KERN_ERR,
+ "Online resizing not supported with bigalloc");
+ err = -EOPNOTSUPP;
+ goto group_add_out;
+ }
+
+ err = mnt_want_write_file(filp);
+ if (err)
+ goto group_add_out;
+ err = ext4_group_add(sb, &input);
+ if (EXT4_SB(sb)->s_journal) {
+ jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
+ err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal);
+ jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
+ }
+ if (err == 0)
+ err = err2;
+ mnt_drop_write_file(filp);
+ if (!err && ext4_has_group_desc_csum(sb) &&
+ test_opt(sb, INIT_INODE_TABLE))
+ err = ext4_register_li_request(sb, input.group);
+group_add_out:
+ ext4_resize_end(sb);
return err;
}
- case EXT4_IOC_GROUP_ADD: {
- struct ext4_new_group_data input;
- struct super_block *sb = inode->i_sb;
+
+ case EXT4_IOC_MIGRATE:
+ {
int err;
+ if (!inode_owner_or_capable(inode))
+ return -EACCES;
- if (!capable(CAP_SYS_RESOURCE))
- return -EPERM;
+ err = mnt_want_write_file(filp);
+ if (err)
+ return err;
+ /*
+ * inode_mutex prevent write and truncate on the file.
+ * Read still goes through. We take i_data_sem in
+ * ext4_ext_swap_inode_data before we switch the
+ * inode format to prevent read.
+ */
+ mutex_lock(&(inode->i_mutex));
+ err = ext4_ext_migrate(inode);
+ mutex_unlock(&(inode->i_mutex));
+ mnt_drop_write_file(filp);
+ return err;
+ }
+
+ case EXT4_IOC_ALLOC_DA_BLKS:
+ {
+ int err;
+ if (!inode_owner_or_capable(inode))
+ return -EACCES;
+
+ err = mnt_want_write_file(filp);
+ if (err)
+ return err;
+ err = ext4_alloc_da_blocks(inode);
+ mnt_drop_write_file(filp);
+ return err;
+ }
- if (IS_RDONLY(inode))
- return -EROFS;
+ case EXT4_IOC_SWAP_BOOT:
+ if (!(filp->f_mode & FMODE_WRITE))
+ return -EBADF;
+ return swap_inode_boot_loader(sb, inode);
- if (copy_from_user(&input, (struct ext4_new_group_input __user *)arg,
- sizeof(input)))
+ case EXT4_IOC_RESIZE_FS: {
+ ext4_fsblk_t n_blocks_count;
+ int err = 0, err2 = 0;
+ ext4_group_t o_group = EXT4_SB(sb)->s_groups_count;
+
+ if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
+ EXT4_FEATURE_RO_COMPAT_BIGALLOC)) {
+ ext4_msg(sb, KERN_ERR,
+ "Online resizing not (yet) supported with bigalloc");
+ return -EOPNOTSUPP;
+ }
+
+ if (copy_from_user(&n_blocks_count, (__u64 __user *)arg,
+ sizeof(__u64))) {
return -EFAULT;
+ }
- err = ext4_group_add(sb, &input);
- jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
- jbd2_journal_flush(EXT4_SB(sb)->s_journal);
- jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
+ err = ext4_resize_begin(sb);
+ if (err)
+ return err;
+ err = mnt_want_write_file(filp);
+ if (err)
+ goto resizefs_out;
+
+ err = ext4_resize_fs(sb, n_blocks_count);
+ if (EXT4_SB(sb)->s_journal) {
+ jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
+ err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal);
+ jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
+ }
+ if (err == 0)
+ err = err2;
+ mnt_drop_write_file(filp);
+ if (!err && (o_group > EXT4_SB(sb)->s_groups_count) &&
+ ext4_has_group_desc_csum(sb) &&
+ test_opt(sb, INIT_INODE_TABLE))
+ err = ext4_register_li_request(sb, o_group);
+
+resizefs_out:
+ ext4_resize_end(sb);
return err;
}
- case EXT4_IOC_MIGRATE:
- return ext4_ext_migrate(inode, filp, cmd, arg);
+ case FITRIM:
+ {
+ struct request_queue *q = bdev_get_queue(sb->s_bdev);
+ struct fstrim_range range;
+ int ret = 0;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (!blk_queue_discard(q))
+ return -EOPNOTSUPP;
+
+ if (copy_from_user(&range, (struct fstrim_range __user *)arg,
+ sizeof(range)))
+ return -EFAULT;
+
+ range.minlen = max((unsigned int)range.minlen,
+ q->limits.discard_granularity);
+ ret = ext4_trim_fs(sb, &range);
+ if (ret < 0)
+ return ret;
+
+ if (copy_to_user((struct fstrim_range __user *)arg, &range,
+ sizeof(range)))
+ return -EFAULT;
+
+ return 0;
+ }
+ case EXT4_IOC_PRECACHE_EXTENTS:
+ return ext4_ext_precache(inode);
default:
return -ENOTTY;
@@ -265,9 +619,6 @@ flags_err:
#ifdef CONFIG_COMPAT
long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
- struct inode *inode = file->f_path.dentry->d_inode;
- int ret;
-
/* These are just misnamed, they actually get/put from/to user an int */
switch (cmd) {
case EXT4_IOC32_GETFLAGS:
@@ -291,25 +642,43 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
case EXT4_IOC32_SETVERSION_OLD:
cmd = EXT4_IOC_SETVERSION_OLD;
break;
-#ifdef CONFIG_JBD2_DEBUG
- case EXT4_IOC32_WAIT_FOR_READONLY:
- cmd = EXT4_IOC_WAIT_FOR_READONLY;
- break;
-#endif
case EXT4_IOC32_GETRSVSZ:
cmd = EXT4_IOC_GETRSVSZ;
break;
case EXT4_IOC32_SETRSVSZ:
cmd = EXT4_IOC_SETRSVSZ;
break;
- case EXT4_IOC_GROUP_ADD:
+ case EXT4_IOC32_GROUP_ADD: {
+ struct compat_ext4_new_group_input __user *uinput;
+ struct ext4_new_group_input input;
+ mm_segment_t old_fs;
+ int err;
+
+ uinput = compat_ptr(arg);
+ err = get_user(input.group, &uinput->group);
+ err |= get_user(input.block_bitmap, &uinput->block_bitmap);
+ err |= get_user(input.inode_bitmap, &uinput->inode_bitmap);
+ err |= get_user(input.inode_table, &uinput->inode_table);
+ err |= get_user(input.blocks_count, &uinput->blocks_count);
+ err |= get_user(input.reserved_blocks,
+ &uinput->reserved_blocks);
+ if (err)
+ return -EFAULT;
+ old_fs = get_fs();
+ set_fs(KERNEL_DS);
+ err = ext4_ioctl(file, EXT4_IOC_GROUP_ADD,
+ (unsigned long) &input);
+ set_fs(old_fs);
+ return err;
+ }
+ case EXT4_IOC_MOVE_EXT:
+ case FITRIM:
+ case EXT4_IOC_RESIZE_FS:
+ case EXT4_IOC_PRECACHE_EXTENTS:
break;
default:
return -ENOIOCTLCMD;
}
- lock_kernel();
- ret = ext4_ioctl(inode, file, cmd, (unsigned long) compat_ptr(arg));
- unlock_kernel();
- return ret;
+ return ext4_ioctl(file, cmd, (unsigned long) compat_ptr(arg));
}
#endif
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index ef97f19c2f9..2dcb936be90 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -21,20 +21,19 @@
* mballoc.c contains the multiblocks allocation routines
*/
-#include <linux/time.h>
-#include <linux/fs.h>
-#include <linux/namei.h>
-#include <linux/ext4_jbd2.h>
-#include <linux/ext4_fs.h>
-#include <linux/quotaops.h>
-#include <linux/buffer_head.h>
+#include "ext4_jbd2.h"
+#include "mballoc.h"
+#include <linux/log2.h>
#include <linux/module.h>
-#include <linux/swap.h>
-#include <linux/proc_fs.h>
-#include <linux/pagemap.h>
-#include <linux/seq_file.h>
-#include <linux/version.h>
-#include "group.h"
+#include <linux/slab.h>
+#include <trace/events/ext4.h>
+
+#ifdef CONFIG_EXT4_DEBUG
+ushort ext4_mballoc_debug __read_mostly;
+
+module_param_named(mballoc_debug, ext4_mballoc_debug, ushort, 0644);
+MODULE_PARM_DESC(mballoc_debug, "Debugging level for ext4's mballoc");
+#endif
/*
* MUSTDO:
@@ -60,32 +59,33 @@
* The allocation request involve request for multiple number of blocks
* near to the goal(block) value specified.
*
- * During initialization phase of the allocator we decide to use the group
- * preallocation or inode preallocation depending on the size file. The
- * size of the file could be the resulting file size we would have after
- * allocation or the current file size which ever is larger. If the size is
- * less that sbi->s_mb_stream_request we select the group
- * preallocation. The default value of s_mb_stream_request is 16
- * blocks. This can also be tuned via
- * /proc/fs/ext4/<partition>/stream_req. The value is represented in terms
- * of number of blocks.
+ * During initialization phase of the allocator we decide to use the
+ * group preallocation or inode preallocation depending on the size of
+ * the file. The size of the file could be the resulting file size we
+ * would have after allocation, or the current file size, which ever
+ * is larger. If the size is less than sbi->s_mb_stream_request we
+ * select to use the group preallocation. The default value of
+ * s_mb_stream_request is 16 blocks. This can also be tuned via
+ * /sys/fs/ext4/<partition>/mb_stream_req. The value is represented in
+ * terms of number of blocks.
*
* The main motivation for having small file use group preallocation is to
- * ensure that we have small file closer in the disk.
+ * ensure that we have small files closer together on the disk.
*
- * First stage the allocator looks at the inode prealloc list
- * ext4_inode_info->i_prealloc_list contain list of prealloc spaces for
- * this particular inode. The inode prealloc space is represented as:
+ * First stage the allocator looks at the inode prealloc list,
+ * ext4_inode_info->i_prealloc_list, which contains list of prealloc
+ * spaces for this particular inode. The inode prealloc space is
+ * represented as:
*
* pa_lstart -> the logical start block for this prealloc space
* pa_pstart -> the physical start block for this prealloc space
- * pa_len -> lenght for this prealloc space
- * pa_free -> free space available in this prealloc space
+ * pa_len -> length for this prealloc space (in clusters)
+ * pa_free -> free space available in this prealloc space (in clusters)
*
* The inode preallocation space is used looking at the _logical_ start
* block. If only the logical file block falls within the range of prealloc
- * space we will consume the particular prealloc space. This make sure that
- * that the we have contiguous physical blocks representing the file blocks
+ * space we will consume the particular prealloc space. This makes sure that
+ * we have contiguous physical blocks representing the file blocks
*
* The important thing to be noted in case of inode prealloc space is that
* we don't modify the values associated to inode prealloc space except
@@ -93,7 +93,7 @@
*
* If we are not able to find blocks in the inode prealloc space and if we
* have the group allocation flag set then we look at the locality group
- * prealloc space. These are per CPU prealloc list repreasented as
+ * prealloc space. These are per CPU prealloc list represented as
*
* ext4_sb_info.s_locality_groups[smp_processor_id()]
*
@@ -101,7 +101,7 @@
* between CPUs. It is possible to get scheduled at this point.
*
* The locality group prealloc space is used looking at whether we have
- * enough free space (pa_free) withing the prealloc space.
+ * enough free space (pa_free) within the prealloc space.
*
* If we can't allocate blocks via inode prealloc or/and locality group
* prealloc then we look at the buddy cache. The buddy cache is represented
@@ -114,7 +114,7 @@
* inode as:
*
* { page }
- * [ group 0 buddy][ group 0 bitmap] [group 1][ group 1]...
+ * [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]...
*
*
* one block each for bitmap and buddy information. So for each group we
@@ -135,33 +135,35 @@
* list. In case of inode preallocation we follow a list of heuristics
* based on file size. This can be found in ext4_mb_normalize_request. If
* we are doing a group prealloc we try to normalize the request to
- * sbi->s_mb_group_prealloc. Default value of s_mb_group_prealloc is set to
+ * sbi->s_mb_group_prealloc. The default value of s_mb_group_prealloc is
+ * dependent on the cluster size; for non-bigalloc file systems, it is
* 512 blocks. This can be tuned via
- * /proc/fs/ext4/<partition/group_prealloc. The value is represented in
+ * /sys/fs/ext4/<partition>/mb_group_prealloc. The value is represented in
* terms of number of blocks. If we have mounted the file system with -O
* stripe=<value> option the group prealloc request is normalized to the
- * stripe value (sbi->s_stripe)
+ * the smallest multiple of the stripe value (sbi->s_stripe) which is
+ * greater than the default mb_group_prealloc.
*
- * The regular allocator(using the buddy cache) support few tunables.
+ * The regular allocator (using the buddy cache) supports a few tunables.
*
- * /proc/fs/ext4/<partition>/min_to_scan
- * /proc/fs/ext4/<partition>/max_to_scan
- * /proc/fs/ext4/<partition>/order2_req
+ * /sys/fs/ext4/<partition>/mb_min_to_scan
+ * /sys/fs/ext4/<partition>/mb_max_to_scan
+ * /sys/fs/ext4/<partition>/mb_order2_req
*
- * The regular allocator use buddy scan only if the request len is power of
+ * The regular allocator uses buddy scan only if the request len is power of
* 2 blocks and the order of allocation is >= sbi->s_mb_order2_reqs. The
* value of s_mb_order2_reqs can be tuned via
- * /proc/fs/ext4/<partition>/order2_req. If the request len is equal to
- * stripe size (sbi->s_stripe), we try to search for contigous block in
- * stripe size. This should result in better allocation on RAID setup. If
- * not we search in the specific group using bitmap for best extents. The
- * tunable min_to_scan and max_to_scan controll the behaviour here.
+ * /sys/fs/ext4/<partition>/mb_order2_req. If the request len is equal to
+ * stripe size (sbi->s_stripe), we try to search for contiguous block in
+ * stripe size. This should result in better allocation on RAID setups. If
+ * not, we search in the specific group using bitmap for best extents. The
+ * tunable min_to_scan and max_to_scan control the behaviour here.
* min_to_scan indicate how long the mballoc __must__ look for a best
- * extent and max_to_scanindicate how long the mballoc __can__ look for a
+ * extent and max_to_scan indicates how long the mballoc __can__ look for a
* best extent in the found extents. Searching for the blocks starts with
* the group specified as the goal value in allocation context via
* ac_g_ex. Each group is first checked based on the criteria whether it
- * can used for allocation. ext4_mb_good_group explains how the groups are
+ * can be used for allocation. ext4_mb_good_group explains how the groups are
* checked.
*
* Both the prealloc space are getting populated as above. So for the first
@@ -344,288 +346,28 @@
* object
*
*/
-
-/*
- * with AGGRESSIVE_CHECK allocator runs consistency checks over
- * structures. these checks slow things down a lot
- */
-#define AGGRESSIVE_CHECK__
-
-/*
- * with DOUBLE_CHECK defined mballoc creates persistent in-core
- * bitmaps, maintains and uses them to check for double allocations
- */
-#define DOUBLE_CHECK__
-
-/*
- */
-#define MB_DEBUG__
-#ifdef MB_DEBUG
-#define mb_debug(fmt, a...) printk(fmt, ##a)
-#else
-#define mb_debug(fmt, a...)
-#endif
-
-/*
- * with EXT4_MB_HISTORY mballoc stores last N allocations in memory
- * and you can monitor it in /proc/fs/ext4/<dev>/mb_history
- */
-#define EXT4_MB_HISTORY
-#define EXT4_MB_HISTORY_ALLOC 1 /* allocation */
-#define EXT4_MB_HISTORY_PREALLOC 2 /* preallocated blocks used */
-#define EXT4_MB_HISTORY_DISCARD 4 /* preallocation discarded */
-#define EXT4_MB_HISTORY_FREE 8 /* free */
-
-#define EXT4_MB_HISTORY_DEFAULT (EXT4_MB_HISTORY_ALLOC | \
- EXT4_MB_HISTORY_PREALLOC)
-
-/*
- * How long mballoc can look for a best extent (in found extents)
- */
-#define MB_DEFAULT_MAX_TO_SCAN 200
-
-/*
- * How long mballoc must look for a best extent
- */
-#define MB_DEFAULT_MIN_TO_SCAN 10
-
-/*
- * How many groups mballoc will scan looking for the best chunk
- */
-#define MB_DEFAULT_MAX_GROUPS_TO_SCAN 5
-
-/*
- * with 'ext4_mb_stats' allocator will collect stats that will be
- * shown at umount. The collecting costs though!
- */
-#define MB_DEFAULT_STATS 1
-
-/*
- * files smaller than MB_DEFAULT_STREAM_THRESHOLD are served
- * by the stream allocator, which purpose is to pack requests
- * as close each to other as possible to produce smooth I/O traffic
- * We use locality group prealloc space for stream request.
- * We can tune the same via /proc/fs/ext4/<parition>/stream_req
- */
-#define MB_DEFAULT_STREAM_THRESHOLD 16 /* 64K */
-
-/*
- * for which requests use 2^N search using buddies
- */
-#define MB_DEFAULT_ORDER2_REQS 2
-
-/*
- * default group prealloc size 512 blocks
- */
-#define MB_DEFAULT_GROUP_PREALLOC 512
-
static struct kmem_cache *ext4_pspace_cachep;
static struct kmem_cache *ext4_ac_cachep;
-
-#ifdef EXT4_BB_MAX_BLOCKS
-#undef EXT4_BB_MAX_BLOCKS
-#endif
-#define EXT4_BB_MAX_BLOCKS 30
-
-struct ext4_free_metadata {
- ext4_group_t group;
- unsigned short num;
- ext4_grpblk_t blocks[EXT4_BB_MAX_BLOCKS];
- struct list_head list;
-};
-
-struct ext4_group_info {
- unsigned long bb_state;
- unsigned long bb_tid;
- struct ext4_free_metadata *bb_md_cur;
- unsigned short bb_first_free;
- unsigned short bb_free;
- unsigned short bb_fragments;
- struct list_head bb_prealloc_list;
-#ifdef DOUBLE_CHECK
- void *bb_bitmap;
-#endif
- unsigned short bb_counters[];
-};
-
-#define EXT4_GROUP_INFO_NEED_INIT_BIT 0
-#define EXT4_GROUP_INFO_LOCKED_BIT 1
-
-#define EXT4_MB_GRP_NEED_INIT(grp) \
- (test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state)))
-
-
-struct ext4_prealloc_space {
- struct list_head pa_inode_list;
- struct list_head pa_group_list;
- union {
- struct list_head pa_tmp_list;
- struct rcu_head pa_rcu;
- } u;
- spinlock_t pa_lock;
- atomic_t pa_count;
- unsigned pa_deleted;
- ext4_fsblk_t pa_pstart; /* phys. block */
- ext4_lblk_t pa_lstart; /* log. block */
- unsigned short pa_len; /* len of preallocated chunk */
- unsigned short pa_free; /* how many blocks are free */
- unsigned short pa_linear; /* consumed in one direction
- * strictly, for grp prealloc */
- spinlock_t *pa_obj_lock;
- struct inode *pa_inode; /* hack, for history only */
-};
-
-
-struct ext4_free_extent {
- ext4_lblk_t fe_logical;
- ext4_grpblk_t fe_start;
- ext4_group_t fe_group;
- int fe_len;
+static struct kmem_cache *ext4_free_data_cachep;
+
+/* We create slab caches for groupinfo data structures based on the
+ * superblock block size. There will be one per mounted filesystem for
+ * each unique s_blocksize_bits */
+#define NR_GRPINFO_CACHES 8
+static struct kmem_cache *ext4_groupinfo_caches[NR_GRPINFO_CACHES];
+
+static const char *ext4_groupinfo_slab_names[NR_GRPINFO_CACHES] = {
+ "ext4_groupinfo_1k", "ext4_groupinfo_2k", "ext4_groupinfo_4k",
+ "ext4_groupinfo_8k", "ext4_groupinfo_16k", "ext4_groupinfo_32k",
+ "ext4_groupinfo_64k", "ext4_groupinfo_128k"
};
-/*
- * Locality group:
- * we try to group all related changes together
- * so that writeback can flush/allocate them together as well
- */
-struct ext4_locality_group {
- /* for allocator */
- struct mutex lg_mutex; /* to serialize allocates */
- struct list_head lg_prealloc_list;/* list of preallocations */
- spinlock_t lg_prealloc_lock;
-};
-
-struct ext4_allocation_context {
- struct inode *ac_inode;
- struct super_block *ac_sb;
-
- /* original request */
- struct ext4_free_extent ac_o_ex;
-
- /* goal request (after normalization) */
- struct ext4_free_extent ac_g_ex;
-
- /* the best found extent */
- struct ext4_free_extent ac_b_ex;
-
- /* copy of the bext found extent taken before preallocation efforts */
- struct ext4_free_extent ac_f_ex;
-
- /* number of iterations done. we have to track to limit searching */
- unsigned long ac_ex_scanned;
- __u16 ac_groups_scanned;
- __u16 ac_found;
- __u16 ac_tail;
- __u16 ac_buddy;
- __u16 ac_flags; /* allocation hints */
- __u8 ac_status;
- __u8 ac_criteria;
- __u8 ac_repeats;
- __u8 ac_2order; /* if request is to allocate 2^N blocks and
- * N > 0, the field stores N, otherwise 0 */
- __u8 ac_op; /* operation, for history only */
- struct page *ac_bitmap_page;
- struct page *ac_buddy_page;
- struct ext4_prealloc_space *ac_pa;
- struct ext4_locality_group *ac_lg;
-};
-
-#define AC_STATUS_CONTINUE 1
-#define AC_STATUS_FOUND 2
-#define AC_STATUS_BREAK 3
-
-struct ext4_mb_history {
- struct ext4_free_extent orig; /* orig allocation */
- struct ext4_free_extent goal; /* goal allocation */
- struct ext4_free_extent result; /* result allocation */
- unsigned pid;
- unsigned ino;
- __u16 found; /* how many extents have been found */
- __u16 groups; /* how many groups have been scanned */
- __u16 tail; /* what tail broke some buddy */
- __u16 buddy; /* buddy the tail ^^^ broke */
- __u16 flags;
- __u8 cr:3; /* which phase the result extent was found at */
- __u8 op:4;
- __u8 merged:1;
-};
-
-struct ext4_buddy {
- struct page *bd_buddy_page;
- void *bd_buddy;
- struct page *bd_bitmap_page;
- void *bd_bitmap;
- struct ext4_group_info *bd_info;
- struct super_block *bd_sb;
- __u16 bd_blkbits;
- ext4_group_t bd_group;
-};
-#define EXT4_MB_BITMAP(e4b) ((e4b)->bd_bitmap)
-#define EXT4_MB_BUDDY(e4b) ((e4b)->bd_buddy)
-
-#ifndef EXT4_MB_HISTORY
-static inline void ext4_mb_store_history(struct ext4_allocation_context *ac)
-{
- return;
-}
-#else
-static void ext4_mb_store_history(struct ext4_allocation_context *ac);
-#endif
-
-#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1)
-
-static struct proc_dir_entry *proc_root_ext4;
-struct buffer_head *read_block_bitmap(struct super_block *, ext4_group_t);
-ext4_fsblk_t ext4_new_blocks_old(handle_t *handle, struct inode *inode,
- ext4_fsblk_t goal, unsigned long *count, int *errp);
-
static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
ext4_group_t group);
-static void ext4_mb_poll_new_transaction(struct super_block *, handle_t *);
-static void ext4_mb_free_committed_blocks(struct super_block *);
-static void ext4_mb_return_to_preallocation(struct inode *inode,
- struct ext4_buddy *e4b, sector_t block,
- int count);
-static void ext4_mb_put_pa(struct ext4_allocation_context *,
- struct super_block *, struct ext4_prealloc_space *pa);
-static int ext4_mb_init_per_dev_proc(struct super_block *sb);
-static int ext4_mb_destroy_per_dev_proc(struct super_block *sb);
-
-
-static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group)
-{
- struct ext4_group_info *grinfo = ext4_get_group_info(sb, group);
-
- bit_spin_lock(EXT4_GROUP_INFO_LOCKED_BIT, &(grinfo->bb_state));
-}
-
-static inline void ext4_unlock_group(struct super_block *sb,
- ext4_group_t group)
-{
- struct ext4_group_info *grinfo = ext4_get_group_info(sb, group);
-
- bit_spin_unlock(EXT4_GROUP_INFO_LOCKED_BIT, &(grinfo->bb_state));
-}
-
-static inline int ext4_is_group_locked(struct super_block *sb,
- ext4_group_t group)
-{
- struct ext4_group_info *grinfo = ext4_get_group_info(sb, group);
-
- return bit_spin_is_locked(EXT4_GROUP_INFO_LOCKED_BIT,
- &(grinfo->bb_state));
-}
-
-static ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb,
- struct ext4_free_extent *fex)
-{
- ext4_fsblk_t block;
-
- block = (ext4_fsblk_t) fex->fe_group * EXT4_BLOCKS_PER_GROUP(sb)
- + fex->fe_start
- + le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
- return block;
-}
+static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
+ ext4_group_t group);
+static void ext4_free_data_callback(struct super_block *sb,
+ struct ext4_journal_cb_entry *jce, int rc);
static inline void *mb_correct_addr_and_bit(int *bit, void *addr)
{
@@ -657,49 +399,49 @@ static inline void mb_set_bit(int bit, void *addr)
ext4_set_bit(bit, addr);
}
-static inline void mb_set_bit_atomic(spinlock_t *lock, int bit, void *addr)
-{
- addr = mb_correct_addr_and_bit(&bit, addr);
- ext4_set_bit_atomic(lock, bit, addr);
-}
-
static inline void mb_clear_bit(int bit, void *addr)
{
addr = mb_correct_addr_and_bit(&bit, addr);
ext4_clear_bit(bit, addr);
}
-static inline void mb_clear_bit_atomic(spinlock_t *lock, int bit, void *addr)
+static inline int mb_test_and_clear_bit(int bit, void *addr)
{
addr = mb_correct_addr_and_bit(&bit, addr);
- ext4_clear_bit_atomic(lock, bit, addr);
+ return ext4_test_and_clear_bit(bit, addr);
}
static inline int mb_find_next_zero_bit(void *addr, int max, int start)
{
- int fix = 0;
+ int fix = 0, ret, tmpmax;
addr = mb_correct_addr_and_bit(&fix, addr);
- max += fix;
+ tmpmax = max + fix;
start += fix;
- return ext4_find_next_zero_bit(addr, max, start) - fix;
+ ret = ext4_find_next_zero_bit(addr, tmpmax, start) - fix;
+ if (ret > max)
+ return max;
+ return ret;
}
static inline int mb_find_next_bit(void *addr, int max, int start)
{
- int fix = 0;
+ int fix = 0, ret, tmpmax;
addr = mb_correct_addr_and_bit(&fix, addr);
- max += fix;
+ tmpmax = max + fix;
start += fix;
- return ext4_find_next_bit(addr, max, start) - fix;
+ ret = ext4_find_next_bit(addr, tmpmax, start) - fix;
+ if (ret > max)
+ return max;
+ return ret;
}
static void *mb_find_buddy(struct ext4_buddy *e4b, int order, int *max)
{
char *bb;
- BUG_ON(EXT4_MB_BITMAP(e4b) == EXT4_MB_BUDDY(e4b));
+ BUG_ON(e4b->bd_bitmap == e4b->bd_buddy);
BUG_ON(max == NULL);
if (order > e4b->bd_blkbits + 1) {
@@ -708,11 +450,12 @@ static void *mb_find_buddy(struct ext4_buddy *e4b, int order, int *max)
}
/* at order 0 we see each particular block */
- *max = 1 << (e4b->bd_blkbits + 3);
- if (order == 0)
- return EXT4_MB_BITMAP(e4b);
+ if (order == 0) {
+ *max = 1 << (e4b->bd_blkbits + 3);
+ return e4b->bd_bitmap;
+ }
- bb = EXT4_MB_BUDDY(e4b) + EXT4_SB(e4b->bd_sb)->s_mb_offsets[order];
+ bb = e4b->bd_buddy + EXT4_SB(e4b->bd_sb)->s_mb_offsets[order];
*max = EXT4_SB(e4b->bd_sb)->s_mb_maxs[order];
return bb;
@@ -727,19 +470,19 @@ static void mb_free_blocks_double(struct inode *inode, struct ext4_buddy *e4b,
if (unlikely(e4b->bd_info->bb_bitmap == NULL))
return;
- BUG_ON(!ext4_is_group_locked(sb, e4b->bd_group));
+ assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group));
for (i = 0; i < count; i++) {
if (!mb_test_bit(first + i, e4b->bd_info->bb_bitmap)) {
ext4_fsblk_t blocknr;
- blocknr = e4b->bd_group * EXT4_BLOCKS_PER_GROUP(sb);
- blocknr += first + i;
- blocknr +=
- le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
-
- ext4_error(sb, __FUNCTION__, "double-free of inode"
- " %lu's block %llu(bit %u in group %lu)\n",
- inode ? inode->i_ino : 0, blocknr,
- first + i, e4b->bd_group);
+
+ blocknr = ext4_group_first_block_no(sb, e4b->bd_group);
+ blocknr += EXT4_C2B(EXT4_SB(sb), first + i);
+ ext4_grp_locked_error(sb, e4b->bd_group,
+ inode ? inode->i_ino : 0,
+ blocknr,
+ "freeing block already freed "
+ "(bit %u)",
+ first + i);
}
mb_clear_bit(first + i, e4b->bd_info->bb_bitmap);
}
@@ -751,7 +494,7 @@ static void mb_mark_used_double(struct ext4_buddy *e4b, int first, int count)
if (unlikely(e4b->bd_info->bb_bitmap == NULL))
return;
- BUG_ON(!ext4_is_group_locked(e4b->bd_sb, e4b->bd_group));
+ assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group));
for (i = 0; i < count; i++) {
BUG_ON(mb_test_bit(first + i, e4b->bd_info->bb_bitmap));
mb_set_bit(first + i, e4b->bd_info->bb_bitmap);
@@ -767,9 +510,11 @@ static void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap)
b2 = (unsigned char *) bitmap;
for (i = 0; i < e4b->bd_sb->s_blocksize; i++) {
if (b1[i] != b2[i]) {
- printk("corruption in group %lu at byte %u(%u):"
- " %x in copy != %x on disk/prealloc\n",
- e4b->bd_group, i, i * 8, b1[i], b2[i]);
+ ext4_msg(e4b->bd_sb, KERN_ERR,
+ "corruption in group %u "
+ "at byte %u(%u): %x in copy != %x "
+ "on disk/prealloc",
+ e4b->bd_group, i, i * 8, b1[i], b2[i]);
BUG();
}
}
@@ -823,9 +568,6 @@ static int __mb_check_buddy(struct ext4_buddy *e4b, char *file,
void *buddy;
void *buddy2;
- if (!test_opt(sb, MBALLOC))
- return 0;
-
{
static int mb_check_counter;
if (mb_check_counter++ % 100 != 0)
@@ -855,14 +597,14 @@ static int __mb_check_buddy(struct ext4_buddy *e4b, char *file,
continue;
}
- /* both bits in buddy2 must be 0 */
+ /* both bits in buddy2 must be 1 */
MB_CHECK_ASSERT(mb_test_bit(i << 1, buddy2));
MB_CHECK_ASSERT(mb_test_bit((i << 1) + 1, buddy2));
for (j = 0; j < (1 << order); j++) {
k = (i * (1 << order)) + j;
MB_CHECK_ASSERT(
- !mb_test_bit(k, EXT4_MB_BITMAP(e4b)));
+ !mb_test_bit(k, e4b->bd_bitmap));
}
count++;
}
@@ -894,37 +636,41 @@ static int __mb_check_buddy(struct ext4_buddy *e4b, char *file,
MB_CHECK_ASSERT(e4b->bd_info->bb_fragments == fragments);
grp = ext4_get_group_info(sb, e4b->bd_group);
- buddy = mb_find_buddy(e4b, 0, &max);
list_for_each(cur, &grp->bb_prealloc_list) {
ext4_group_t groupnr;
struct ext4_prealloc_space *pa;
- pa = list_entry(cur, struct ext4_prealloc_space, group_list);
- ext4_get_group_no_and_offset(sb, pa->pstart, &groupnr, &k);
+ pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list);
+ ext4_get_group_no_and_offset(sb, pa->pa_pstart, &groupnr, &k);
MB_CHECK_ASSERT(groupnr == e4b->bd_group);
- for (i = 0; i < pa->len; i++)
+ for (i = 0; i < pa->pa_len; i++)
MB_CHECK_ASSERT(mb_test_bit(k + i, buddy));
}
return 0;
}
#undef MB_CHECK_ASSERT
#define mb_check_buddy(e4b) __mb_check_buddy(e4b, \
- __FILE__, __FUNCTION__, __LINE__)
+ __FILE__, __func__, __LINE__)
#else
#define mb_check_buddy(e4b)
#endif
-/* FIXME!! need more doc */
+/*
+ * Divide blocks started from @first with length @len into
+ * smaller chunks with power of 2 blocks.
+ * Clear the bits in bitmap which the blocks of the chunk(s) covered,
+ * then increase bb_counters[] for corresponded chunk size.
+ */
static void ext4_mb_mark_free_simple(struct super_block *sb,
- void *buddy, unsigned first, int len,
+ void *buddy, ext4_grpblk_t first, ext4_grpblk_t len,
struct ext4_group_info *grp)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
- unsigned short min;
- unsigned short max;
- unsigned short chunk;
+ ext4_grpblk_t min;
+ ext4_grpblk_t max;
+ ext4_grpblk_t chunk;
unsigned short border;
- BUG_ON(len > EXT4_BLOCKS_PER_GROUP(sb));
+ BUG_ON(len > EXT4_CLUSTERS_PER_GROUP(sb));
border = 2 << sb->s_blocksize_bits;
@@ -950,14 +696,37 @@ static void ext4_mb_mark_free_simple(struct super_block *sb,
}
}
-static void ext4_mb_generate_buddy(struct super_block *sb,
+/*
+ * Cache the order of the largest free extent we have available in this block
+ * group.
+ */
+static void
+mb_set_largest_free_order(struct super_block *sb, struct ext4_group_info *grp)
+{
+ int i;
+ int bits;
+
+ grp->bb_largest_free_order = -1; /* uninit */
+
+ bits = sb->s_blocksize_bits + 1;
+ for (i = bits; i >= 0; i--) {
+ if (grp->bb_counters[i] > 0) {
+ grp->bb_largest_free_order = i;
+ break;
+ }
+ }
+}
+
+static noinline_for_stack
+void ext4_mb_generate_buddy(struct super_block *sb,
void *buddy, void *bitmap, ext4_group_t group)
{
struct ext4_group_info *grp = ext4_get_group_info(sb, group);
- unsigned short max = EXT4_BLOCKS_PER_GROUP(sb);
- unsigned short i = 0;
- unsigned short first;
- unsigned short len;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ ext4_grpblk_t max = EXT4_CLUSTERS_PER_GROUP(sb);
+ ext4_grpblk_t i = 0;
+ ext4_grpblk_t first;
+ ext4_grpblk_t len;
unsigned free = 0;
unsigned fragments = 0;
unsigned long long period = get_cycles();
@@ -982,15 +751,21 @@ static void ext4_mb_generate_buddy(struct super_block *sb,
grp->bb_fragments = fragments;
if (free != grp->bb_free) {
- ext4_error(sb, __FUNCTION__,
- "EXT4-fs: group %lu: %u blocks in bitmap, %u in gd\n",
- group, free, grp->bb_free);
+ ext4_grp_locked_error(sb, group, 0, 0,
+ "block bitmap and bg descriptor "
+ "inconsistent: %u vs %u free clusters",
+ free, grp->bb_free);
/*
- * If we intent to continue, we consider group descritor
+ * If we intend to continue, we consider group descriptor
* corrupt and update bb_free using bitmap value
*/
grp->bb_free = free;
+ if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp))
+ percpu_counter_sub(&sbi->s_freeclusters_counter,
+ grp->bb_free);
+ set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state);
}
+ mb_set_largest_free_order(sb, grp);
clear_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state));
@@ -1001,6 +776,24 @@ static void ext4_mb_generate_buddy(struct super_block *sb,
spin_unlock(&EXT4_SB(sb)->s_bal_lock);
}
+static void mb_regenerate_buddy(struct ext4_buddy *e4b)
+{
+ int count;
+ int order = 1;
+ void *buddy;
+
+ while ((buddy = mb_find_buddy(e4b, order++, &count))) {
+ ext4_set_bits(buddy, 0, count);
+ }
+ e4b->bd_info->bb_fragments = 0;
+ memset(e4b->bd_info->bb_counters, 0,
+ sizeof(*e4b->bd_info->bb_counters) *
+ (e4b->bd_sb->s_blocksize_bits + 2));
+
+ ext4_mb_generate_buddy(e4b->bd_sb, e4b->bd_buddy,
+ e4b->bd_bitmap, e4b->bd_group);
+}
+
/* The buddy information is attached the buddy cache inode
* for convenience. The information regarding each group
* is loaded via ext4_mb_load_buddy. The information involve
@@ -1008,7 +801,7 @@ static void ext4_mb_generate_buddy(struct super_block *sb,
* stored in the inode as
*
* { page }
- * [ group 0 buddy][ group 0 bitmap] [group 1][ group 1]...
+ * [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]...
*
*
* one block each for bitmap and buddy information.
@@ -1016,28 +809,34 @@ static void ext4_mb_generate_buddy(struct super_block *sb,
* contain blocks_per_page (PAGE_CACHE_SIZE / blocksize) blocks.
* So it can have information regarding groups_per_page which
* is blocks_per_page/2
+ *
+ * Locking note: This routine takes the block group lock of all groups
+ * for this page; do not hold this lock when calling this routine!
*/
static int ext4_mb_init_cache(struct page *page, char *incore)
{
+ ext4_group_t ngroups;
int blocksize;
int blocks_per_page;
int groups_per_page;
int err = 0;
int i;
- ext4_group_t first_group;
+ ext4_group_t first_group, group;
int first_block;
struct super_block *sb;
struct buffer_head *bhs;
- struct buffer_head **bh;
+ struct buffer_head **bh = NULL;
struct inode *inode;
char *data;
char *bitmap;
+ struct ext4_group_info *grinfo;
- mb_debug("init page %lu\n", page->index);
+ mb_debug(1, "init page %lu\n", page->index);
inode = page->mapping->host;
sb = inode->i_sb;
+ ngroups = ext4_get_groups_count(sb);
blocksize = 1 << inode->i_blkbits;
blocks_per_page = PAGE_CACHE_SIZE / blocksize;
@@ -1047,67 +846,58 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
/* allocate buffer_heads to read bitmaps */
if (groups_per_page > 1) {
- err = -ENOMEM;
i = sizeof(struct buffer_head *) * groups_per_page;
bh = kzalloc(i, GFP_NOFS);
- if (bh == NULL)
+ if (bh == NULL) {
+ err = -ENOMEM;
goto out;
+ }
} else
bh = &bhs;
first_group = page->index * blocks_per_page / 2;
/* read all groups the page covers into the cache */
- for (i = 0; i < groups_per_page; i++) {
- struct ext4_group_desc *desc;
-
- if (first_group + i >= EXT4_SB(sb)->s_groups_count)
+ for (i = 0, group = first_group; i < groups_per_page; i++, group++) {
+ if (group >= ngroups)
break;
- err = -EIO;
- desc = ext4_get_group_desc(sb, first_group + i, NULL);
- if (desc == NULL)
- goto out;
-
- err = -ENOMEM;
- bh[i] = sb_getblk(sb, ext4_block_bitmap(sb, desc));
- if (bh[i] == NULL)
- goto out;
-
- if (bh_uptodate_or_lock(bh[i]))
- continue;
-
- if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
- ext4_init_block_bitmap(sb, bh[i],
- first_group + i, desc);
- set_buffer_uptodate(bh[i]);
- unlock_buffer(bh[i]);
+ grinfo = ext4_get_group_info(sb, group);
+ /*
+ * If page is uptodate then we came here after online resize
+ * which added some new uninitialized group info structs, so
+ * we must skip all initialized uptodate buddies on the page,
+ * which may be currently in use by an allocating task.
+ */
+ if (PageUptodate(page) && !EXT4_MB_GRP_NEED_INIT(grinfo)) {
+ bh[i] = NULL;
continue;
}
- get_bh(bh[i]);
- bh[i]->b_end_io = end_buffer_read_sync;
- submit_bh(READ, bh[i]);
- mb_debug("read bitmap for group %lu\n", first_group + i);
+ if (!(bh[i] = ext4_read_block_bitmap_nowait(sb, group))) {
+ err = -ENOMEM;
+ goto out;
+ }
+ mb_debug(1, "read bitmap for group %u\n", group);
}
/* wait for I/O completion */
- for (i = 0; i < groups_per_page && bh[i]; i++)
- wait_on_buffer(bh[i]);
-
- err = -EIO;
- for (i = 0; i < groups_per_page && bh[i]; i++)
- if (!buffer_uptodate(bh[i]))
+ for (i = 0, group = first_group; i < groups_per_page; i++, group++) {
+ if (bh[i] && ext4_wait_block_bitmap(sb, group, bh[i])) {
+ err = -EIO;
goto out;
+ }
+ }
first_block = page->index * blocks_per_page;
for (i = 0; i < blocks_per_page; i++) {
- int group;
- struct ext4_group_info *grinfo;
-
group = (first_block + i) >> 1;
- if (group >= EXT4_SB(sb)->s_groups_count)
+ if (group >= ngroups)
break;
+ if (!bh[group - first_group])
+ /* skip initialized uptodate buddy */
+ continue;
+
/*
* data carry information regarding this
* particular group in the format specified
@@ -1124,23 +914,29 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
if ((first_block + i) & 1) {
/* this is block of buddy */
BUG_ON(incore == NULL);
- mb_debug("put buddy for group %u in page %lu/%x\n",
+ mb_debug(1, "put buddy for group %u in page %lu/%x\n",
group, page->index, i * blocksize);
- memset(data, 0xff, blocksize);
+ trace_ext4_mb_buddy_bitmap_load(sb, group);
grinfo = ext4_get_group_info(sb, group);
grinfo->bb_fragments = 0;
memset(grinfo->bb_counters, 0,
- sizeof(unsigned short)*(sb->s_blocksize_bits+2));
+ sizeof(*grinfo->bb_counters) *
+ (sb->s_blocksize_bits+2));
/*
* incore got set to the group block bitmap below
*/
+ ext4_lock_group(sb, group);
+ /* init the buddy */
+ memset(data, 0xff, blocksize);
ext4_mb_generate_buddy(sb, data, incore, group);
+ ext4_unlock_group(sb, group);
incore = NULL;
} else {
/* this is block of bitmap */
BUG_ON(incore != NULL);
- mb_debug("put bitmap for group %u in page %lu/%x\n",
+ mb_debug(1, "put bitmap for group %u in page %lu/%x\n",
group, page->index, i * blocksize);
+ trace_ext4_mb_bitmap_load(sb, group);
/* see comments in ext4_mb_put_pa() */
ext4_lock_group(sb, group);
@@ -1148,6 +944,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
/* mark all preallocated blks used in in-core bitmap */
ext4_mb_generate_from_pa(sb, data, group);
+ ext4_mb_generate_from_freelist(sb, data, group);
ext4_unlock_group(sb, group);
/* set incore so that the buddy information can be
@@ -1160,7 +957,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
out:
if (bh) {
- for (i = 0; i < groups_per_page && bh[i]; i++)
+ for (i = 0; i < groups_per_page; i++)
brelse(bh[i]);
if (bh != &bhs)
kfree(bh);
@@ -1168,28 +965,175 @@ out:
return err;
}
-static int ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
- struct ext4_buddy *e4b)
+/*
+ * Lock the buddy and bitmap pages. This make sure other parallel init_group
+ * on the same buddy page doesn't happen whild holding the buddy page lock.
+ * Return locked buddy and bitmap pages on e4b struct. If buddy and bitmap
+ * are on the same page e4b->bd_buddy_page is NULL and return value is 0.
+ */
+static int ext4_mb_get_buddy_page_lock(struct super_block *sb,
+ ext4_group_t group, struct ext4_buddy *e4b)
+{
+ struct inode *inode = EXT4_SB(sb)->s_buddy_cache;
+ int block, pnum, poff;
+ int blocks_per_page;
+ struct page *page;
+
+ e4b->bd_buddy_page = NULL;
+ e4b->bd_bitmap_page = NULL;
+
+ blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
+ /*
+ * the buddy cache inode stores the block bitmap
+ * and buddy information in consecutive blocks.
+ * So for each group we need two blocks.
+ */
+ block = group * 2;
+ pnum = block / blocks_per_page;
+ poff = block % blocks_per_page;
+ page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
+ if (!page)
+ return -ENOMEM;
+ BUG_ON(page->mapping != inode->i_mapping);
+ e4b->bd_bitmap_page = page;
+ e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize);
+
+ if (blocks_per_page >= 2) {
+ /* buddy and bitmap are on the same page */
+ return 0;
+ }
+
+ block++;
+ pnum = block / blocks_per_page;
+ page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
+ if (!page)
+ return -ENOMEM;
+ BUG_ON(page->mapping != inode->i_mapping);
+ e4b->bd_buddy_page = page;
+ return 0;
+}
+
+static void ext4_mb_put_buddy_page_lock(struct ext4_buddy *e4b)
+{
+ if (e4b->bd_bitmap_page) {
+ unlock_page(e4b->bd_bitmap_page);
+ page_cache_release(e4b->bd_bitmap_page);
+ }
+ if (e4b->bd_buddy_page) {
+ unlock_page(e4b->bd_buddy_page);
+ page_cache_release(e4b->bd_buddy_page);
+ }
+}
+
+/*
+ * Locking note: This routine calls ext4_mb_init_cache(), which takes the
+ * block group lock of all groups for this page; do not hold the BG lock when
+ * calling this routine!
+ */
+static noinline_for_stack
+int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
+{
+
+ struct ext4_group_info *this_grp;
+ struct ext4_buddy e4b;
+ struct page *page;
+ int ret = 0;
+
+ might_sleep();
+ mb_debug(1, "init group %u\n", group);
+ this_grp = ext4_get_group_info(sb, group);
+ /*
+ * This ensures that we don't reinit the buddy cache
+ * page which map to the group from which we are already
+ * allocating. If we are looking at the buddy cache we would
+ * have taken a reference using ext4_mb_load_buddy and that
+ * would have pinned buddy page to page cache.
+ * The call to ext4_mb_get_buddy_page_lock will mark the
+ * page accessed.
+ */
+ ret = ext4_mb_get_buddy_page_lock(sb, group, &e4b);
+ if (ret || !EXT4_MB_GRP_NEED_INIT(this_grp)) {
+ /*
+ * somebody initialized the group
+ * return without doing anything
+ */
+ goto err;
+ }
+
+ page = e4b.bd_bitmap_page;
+ ret = ext4_mb_init_cache(page, NULL);
+ if (ret)
+ goto err;
+ if (!PageUptodate(page)) {
+ ret = -EIO;
+ goto err;
+ }
+
+ if (e4b.bd_buddy_page == NULL) {
+ /*
+ * If both the bitmap and buddy are in
+ * the same page we don't need to force
+ * init the buddy
+ */
+ ret = 0;
+ goto err;
+ }
+ /* init buddy cache */
+ page = e4b.bd_buddy_page;
+ ret = ext4_mb_init_cache(page, e4b.bd_bitmap);
+ if (ret)
+ goto err;
+ if (!PageUptodate(page)) {
+ ret = -EIO;
+ goto err;
+ }
+err:
+ ext4_mb_put_buddy_page_lock(&e4b);
+ return ret;
+}
+
+/*
+ * Locking note: This routine calls ext4_mb_init_cache(), which takes the
+ * block group lock of all groups for this page; do not hold the BG lock when
+ * calling this routine!
+ */
+static noinline_for_stack int
+ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
+ struct ext4_buddy *e4b)
{
- struct ext4_sb_info *sbi = EXT4_SB(sb);
- struct inode *inode = sbi->s_buddy_cache;
int blocks_per_page;
int block;
int pnum;
int poff;
struct page *page;
+ int ret;
+ struct ext4_group_info *grp;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct inode *inode = sbi->s_buddy_cache;
- mb_debug("load group %lu\n", group);
+ might_sleep();
+ mb_debug(1, "load group %u\n", group);
blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
+ grp = ext4_get_group_info(sb, group);
e4b->bd_blkbits = sb->s_blocksize_bits;
- e4b->bd_info = ext4_get_group_info(sb, group);
+ e4b->bd_info = grp;
e4b->bd_sb = sb;
e4b->bd_group = group;
e4b->bd_buddy_page = NULL;
e4b->bd_bitmap_page = NULL;
+ if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
+ /*
+ * we need full data about the group
+ * to make a good selection
+ */
+ ret = ext4_mb_init_group(sb, group);
+ if (ret)
+ return ret;
+ }
+
/*
* the buddy cache inode stores the block bitmap
* and buddy information in consecutive blocks.
@@ -1201,49 +1145,79 @@ static int ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
/* we could use find_or_create_page(), but it locks page
* what we'd like to avoid in fast path ... */
- page = find_get_page(inode->i_mapping, pnum);
+ page = find_get_page_flags(inode->i_mapping, pnum, FGP_ACCESSED);
if (page == NULL || !PageUptodate(page)) {
if (page)
+ /*
+ * drop the page reference and try
+ * to get the page with lock. If we
+ * are not uptodate that implies
+ * somebody just created the page but
+ * is yet to initialize the same. So
+ * wait for it to initialize.
+ */
page_cache_release(page);
page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
if (page) {
BUG_ON(page->mapping != inode->i_mapping);
if (!PageUptodate(page)) {
- ext4_mb_init_cache(page, NULL);
+ ret = ext4_mb_init_cache(page, NULL);
+ if (ret) {
+ unlock_page(page);
+ goto err;
+ }
mb_cmp_bitmaps(e4b, page_address(page) +
(poff * sb->s_blocksize));
}
unlock_page(page);
}
}
- if (page == NULL || !PageUptodate(page))
+ if (page == NULL) {
+ ret = -ENOMEM;
goto err;
+ }
+ if (!PageUptodate(page)) {
+ ret = -EIO;
+ goto err;
+ }
+
+ /* Pages marked accessed already */
e4b->bd_bitmap_page = page;
e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize);
- mark_page_accessed(page);
block++;
pnum = block / blocks_per_page;
poff = block % blocks_per_page;
- page = find_get_page(inode->i_mapping, pnum);
+ page = find_get_page_flags(inode->i_mapping, pnum, FGP_ACCESSED);
if (page == NULL || !PageUptodate(page)) {
if (page)
page_cache_release(page);
page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
if (page) {
BUG_ON(page->mapping != inode->i_mapping);
- if (!PageUptodate(page))
- ext4_mb_init_cache(page, e4b->bd_bitmap);
-
+ if (!PageUptodate(page)) {
+ ret = ext4_mb_init_cache(page, e4b->bd_bitmap);
+ if (ret) {
+ unlock_page(page);
+ goto err;
+ }
+ }
unlock_page(page);
}
}
- if (page == NULL || !PageUptodate(page))
+ if (page == NULL) {
+ ret = -ENOMEM;
goto err;
+ }
+ if (!PageUptodate(page)) {
+ ret = -EIO;
+ goto err;
+ }
+
+ /* Pages marked accessed already */
e4b->bd_buddy_page = page;
e4b->bd_buddy = page_address(page) + (poff * sb->s_blocksize);
- mark_page_accessed(page);
BUG_ON(e4b->bd_bitmap_page == NULL);
BUG_ON(e4b->bd_buddy_page == NULL);
@@ -1251,16 +1225,18 @@ static int ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
return 0;
err:
+ if (page)
+ page_cache_release(page);
if (e4b->bd_bitmap_page)
page_cache_release(e4b->bd_bitmap_page);
if (e4b->bd_buddy_page)
page_cache_release(e4b->bd_buddy_page);
e4b->bd_buddy = NULL;
e4b->bd_bitmap = NULL;
- return -EIO;
+ return ret;
}
-static void ext4_mb_release_desc(struct ext4_buddy *e4b)
+static void ext4_mb_unload_buddy(struct ext4_buddy *e4b)
{
if (e4b->bd_bitmap_page)
page_cache_release(e4b->bd_bitmap_page);
@@ -1274,10 +1250,10 @@ static int mb_find_order_for_block(struct ext4_buddy *e4b, int block)
int order = 1;
void *bb;
- BUG_ON(EXT4_MB_BITMAP(e4b) == EXT4_MB_BUDDY(e4b));
+ BUG_ON(e4b->bd_bitmap == e4b->bd_buddy);
BUG_ON(block >= (1 << (e4b->bd_blkbits + 3)));
- bb = EXT4_MB_BUDDY(e4b);
+ bb = e4b->bd_buddy;
while (order <= e4b->bd_blkbits + 1) {
block = block >> 1;
if (!mb_test_bit(block, bb)) {
@@ -1290,7 +1266,7 @@ static int mb_find_order_for_block(struct ext4_buddy *e4b, int block)
return 0;
}
-static void mb_clear_bits(spinlock_t *lock, void *bm, int cur, int len)
+static void mb_clear_bits(void *bm, int cur, int len)
{
__u32 *addr;
@@ -1303,12 +1279,39 @@ static void mb_clear_bits(spinlock_t *lock, void *bm, int cur, int len)
cur += 32;
continue;
}
- mb_clear_bit_atomic(lock, cur, bm);
+ mb_clear_bit(cur, bm);
cur++;
}
}
-static void mb_set_bits(spinlock_t *lock, void *bm, int cur, int len)
+/* clear bits in given range
+ * will return first found zero bit if any, -1 otherwise
+ */
+static int mb_test_and_clear_bits(void *bm, int cur, int len)
+{
+ __u32 *addr;
+ int zero_bit = -1;
+
+ len = cur + len;
+ while (cur < len) {
+ if ((cur & 31) == 0 && (len - cur) >= 32) {
+ /* fast path: clear whole word at once */
+ addr = bm + (cur >> 3);
+ if (*addr != (__u32)(-1) && zero_bit == -1)
+ zero_bit = cur + mb_find_next_zero_bit(addr, 32, 0);
+ *addr = 0;
+ cur += 32;
+ continue;
+ }
+ if (!mb_test_and_clear_bit(cur, bm) && zero_bit == -1)
+ zero_bit = cur;
+ cur++;
+ }
+
+ return zero_bit;
+}
+
+void ext4_set_bits(void *bm, int cur, int len)
{
__u32 *addr;
@@ -1321,23 +1324,100 @@ static void mb_set_bits(spinlock_t *lock, void *bm, int cur, int len)
cur += 32;
continue;
}
- mb_set_bit_atomic(lock, cur, bm);
+ mb_set_bit(cur, bm);
cur++;
}
}
-static int mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
- int first, int count)
+/*
+ * _________________________________________________________________ */
+
+static inline int mb_buddy_adjust_border(int* bit, void* bitmap, int side)
{
- int block = 0;
- int max = 0;
- int order;
- void *buddy;
- void *buddy2;
+ if (mb_test_bit(*bit + side, bitmap)) {
+ mb_clear_bit(*bit, bitmap);
+ (*bit) -= side;
+ return 1;
+ }
+ else {
+ (*bit) += side;
+ mb_set_bit(*bit, bitmap);
+ return -1;
+ }
+}
+
+static void mb_buddy_mark_free(struct ext4_buddy *e4b, int first, int last)
+{
+ int max;
+ int order = 1;
+ void *buddy = mb_find_buddy(e4b, order, &max);
+
+ while (buddy) {
+ void *buddy2;
+
+ /* Bits in range [first; last] are known to be set since
+ * corresponding blocks were allocated. Bits in range
+ * (first; last) will stay set because they form buddies on
+ * upper layer. We just deal with borders if they don't
+ * align with upper layer and then go up.
+ * Releasing entire group is all about clearing
+ * single bit of highest order buddy.
+ */
+
+ /* Example:
+ * ---------------------------------
+ * | 1 | 1 | 1 | 1 |
+ * ---------------------------------
+ * | 0 | 1 | 1 | 1 | 1 | 1 | 1 | 1 |
+ * ---------------------------------
+ * 0 1 2 3 4 5 6 7
+ * \_____________________/
+ *
+ * Neither [1] nor [6] is aligned to above layer.
+ * Left neighbour [0] is free, so mark it busy,
+ * decrease bb_counters and extend range to
+ * [0; 6]
+ * Right neighbour [7] is busy. It can't be coaleasced with [6], so
+ * mark [6] free, increase bb_counters and shrink range to
+ * [0; 5].
+ * Then shift range to [0; 2], go up and do the same.
+ */
+
+
+ if (first & 1)
+ e4b->bd_info->bb_counters[order] += mb_buddy_adjust_border(&first, buddy, -1);
+ if (!(last & 1))
+ e4b->bd_info->bb_counters[order] += mb_buddy_adjust_border(&last, buddy, 1);
+ if (first > last)
+ break;
+ order++;
+
+ if (first == last || !(buddy2 = mb_find_buddy(e4b, order, &max))) {
+ mb_clear_bits(buddy, first, last - first + 1);
+ e4b->bd_info->bb_counters[order - 1] += last - first + 1;
+ break;
+ }
+ first >>= 1;
+ last >>= 1;
+ buddy = buddy2;
+ }
+}
+
+static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
+ int first, int count)
+{
+ int left_is_free = 0;
+ int right_is_free = 0;
+ int block;
+ int last = first + count - 1;
struct super_block *sb = e4b->bd_sb;
- BUG_ON(first + count > (sb->s_blocksize << 3));
- BUG_ON(!ext4_is_group_locked(sb, e4b->bd_group));
+ BUG_ON(last >= (sb->s_blocksize << 3));
+ assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group));
+ /* Don't bother if the block group is corrupt. */
+ if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info)))
+ return;
+
mb_check_buddy(e4b);
mb_free_blocks_double(inode, e4b, first, count);
@@ -1345,85 +1425,77 @@ static int mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
if (first < e4b->bd_info->bb_first_free)
e4b->bd_info->bb_first_free = first;
- /* let's maintain fragments counter */
+ /* access memory sequentially: check left neighbour,
+ * clear range and then check right neighbour
+ */
if (first != 0)
- block = !mb_test_bit(first - 1, EXT4_MB_BITMAP(e4b));
- if (first + count < EXT4_SB(sb)->s_mb_maxs[0])
- max = !mb_test_bit(first + count, EXT4_MB_BITMAP(e4b));
- if (block && max)
+ left_is_free = !mb_test_bit(first - 1, e4b->bd_bitmap);
+ block = mb_test_and_clear_bits(e4b->bd_bitmap, first, count);
+ if (last + 1 < EXT4_SB(sb)->s_mb_maxs[0])
+ right_is_free = !mb_test_bit(last + 1, e4b->bd_bitmap);
+
+ if (unlikely(block != -1)) {
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ ext4_fsblk_t blocknr;
+
+ blocknr = ext4_group_first_block_no(sb, e4b->bd_group);
+ blocknr += EXT4_C2B(EXT4_SB(sb), block);
+ ext4_grp_locked_error(sb, e4b->bd_group,
+ inode ? inode->i_ino : 0,
+ blocknr,
+ "freeing already freed block "
+ "(bit %u); block bitmap corrupt.",
+ block);
+ if (!EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info))
+ percpu_counter_sub(&sbi->s_freeclusters_counter,
+ e4b->bd_info->bb_free);
+ /* Mark the block group as corrupt. */
+ set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT,
+ &e4b->bd_info->bb_state);
+ mb_regenerate_buddy(e4b);
+ goto done;
+ }
+
+ /* let's maintain fragments counter */
+ if (left_is_free && right_is_free)
e4b->bd_info->bb_fragments--;
- else if (!block && !max)
+ else if (!left_is_free && !right_is_free)
e4b->bd_info->bb_fragments++;
- /* let's maintain buddy itself */
- while (count-- > 0) {
- block = first++;
- order = 0;
-
- if (!mb_test_bit(block, EXT4_MB_BITMAP(e4b))) {
- ext4_fsblk_t blocknr;
- blocknr = e4b->bd_group * EXT4_BLOCKS_PER_GROUP(sb);
- blocknr += block;
- blocknr +=
- le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
-
- ext4_error(sb, __FUNCTION__, "double-free of inode"
- " %lu's block %llu(bit %u in group %lu)\n",
- inode ? inode->i_ino : 0, blocknr, block,
- e4b->bd_group);
- }
- mb_clear_bit(block, EXT4_MB_BITMAP(e4b));
- e4b->bd_info->bb_counters[order]++;
-
- /* start of the buddy */
- buddy = mb_find_buddy(e4b, order, &max);
-
- do {
- block &= ~1UL;
- if (mb_test_bit(block, buddy) ||
- mb_test_bit(block + 1, buddy))
- break;
-
- /* both the buddies are free, try to coalesce them */
- buddy2 = mb_find_buddy(e4b, order + 1, &max);
-
- if (!buddy2)
- break;
-
- if (order > 0) {
- /* for special purposes, we don't set
- * free bits in bitmap */
- mb_set_bit(block, buddy);
- mb_set_bit(block + 1, buddy);
- }
- e4b->bd_info->bb_counters[order]--;
- e4b->bd_info->bb_counters[order]--;
+ /* buddy[0] == bd_bitmap is a special case, so handle
+ * it right away and let mb_buddy_mark_free stay free of
+ * zero order checks.
+ * Check if neighbours are to be coaleasced,
+ * adjust bitmap bb_counters and borders appropriately.
+ */
+ if (first & 1) {
+ first += !left_is_free;
+ e4b->bd_info->bb_counters[0] += left_is_free ? -1 : 1;
+ }
+ if (!(last & 1)) {
+ last -= !right_is_free;
+ e4b->bd_info->bb_counters[0] += right_is_free ? -1 : 1;
+ }
- block = block >> 1;
- order++;
- e4b->bd_info->bb_counters[order]++;
+ if (first <= last)
+ mb_buddy_mark_free(e4b, first >> 1, last >> 1);
- mb_clear_bit(block, buddy2);
- buddy = buddy2;
- } while (1);
- }
+done:
+ mb_set_largest_free_order(sb, e4b->bd_info);
mb_check_buddy(e4b);
-
- return 0;
}
-static int mb_find_extent(struct ext4_buddy *e4b, int order, int block,
+static int mb_find_extent(struct ext4_buddy *e4b, int block,
int needed, struct ext4_free_extent *ex)
{
int next = block;
- int max;
- int ord;
+ int max, order;
void *buddy;
- BUG_ON(!ext4_is_group_locked(e4b->bd_sb, e4b->bd_group));
+ assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group));
BUG_ON(ex == NULL);
- buddy = mb_find_buddy(e4b, order, &max);
+ buddy = mb_find_buddy(e4b, 0, &max);
BUG_ON(buddy == NULL);
BUG_ON(block >= max);
if (mb_test_bit(block, buddy)) {
@@ -1433,12 +1505,9 @@ static int mb_find_extent(struct ext4_buddy *e4b, int order, int block,
return 0;
}
- /* FIXME dorp order completely ? */
- if (likely(order == 0)) {
- /* find actual order */
- order = mb_find_order_for_block(e4b, block);
- block = block >> order;
- }
+ /* find actual order */
+ order = mb_find_order_for_block(e4b, block);
+ block = block >> order;
ex->fe_len = 1 << order;
ex->fe_start = block << order;
@@ -1450,18 +1519,17 @@ static int mb_find_extent(struct ext4_buddy *e4b, int order, int block,
ex->fe_start += next;
while (needed > ex->fe_len &&
- (buddy = mb_find_buddy(e4b, order, &max))) {
+ mb_find_buddy(e4b, order, &max)) {
if (block + 1 >= max)
break;
next = (block + 1) * (1 << order);
- if (mb_test_bit(next, EXT4_MB_BITMAP(e4b)))
+ if (mb_test_bit(next, e4b->bd_bitmap))
break;
- ord = mb_find_order_for_block(e4b, next);
+ order = mb_find_order_for_block(e4b, next);
- order = ord;
block = next >> order;
ex->fe_len += 1 << order;
}
@@ -1484,7 +1552,7 @@ static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex)
BUG_ON(start + len > (e4b->bd_sb->s_blocksize << 3));
BUG_ON(e4b->bd_group != ex->fe_group);
- BUG_ON(!ext4_is_group_locked(e4b->bd_sb, e4b->bd_group));
+ assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group));
mb_check_buddy(e4b);
mb_mark_used_double(e4b, start, len);
@@ -1494,9 +1562,9 @@ static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex)
/* let's maintain fragments counter */
if (start != 0)
- mlen = !mb_test_bit(start - 1, EXT4_MB_BITMAP(e4b));
+ mlen = !mb_test_bit(start - 1, e4b->bd_bitmap);
if (start + len < EXT4_SB(e4b->bd_sb)->s_mb_maxs[0])
- max = !mb_test_bit(start + len, EXT4_MB_BITMAP(e4b));
+ max = !mb_test_bit(start + len, e4b->bd_bitmap);
if (mlen && max)
e4b->bd_info->bb_fragments++;
else if (!mlen && !max)
@@ -1537,9 +1605,9 @@ static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex)
e4b->bd_info->bb_counters[ord]++;
e4b->bd_info->bb_counters[ord]++;
}
+ mb_set_largest_free_order(e4b->bd_sb, e4b->bd_info);
- mb_set_bits(sb_bgl_lock(EXT4_SB(e4b->bd_sb), ex->fe_group),
- EXT4_MB_BITMAP(e4b), ex->fe_start, len0);
+ ext4_set_bits(e4b->bd_bitmap, ex->fe_start, len0);
mb_check_buddy(e4b);
return ret;
@@ -1569,15 +1637,19 @@ static void ext4_mb_use_best_found(struct ext4_allocation_context *ac,
ac->ac_tail = ret & 0xffff;
ac->ac_buddy = ret >> 16;
- /* XXXXXXX: SUCH A HORRIBLE **CK */
- /*FIXME!! Why ? */
+ /*
+ * take the page reference. We want the page to be pinned
+ * so that we don't get a ext4_mb_init_cache_call for this
+ * group until we update the bitmap. That would mean we
+ * double allocate blocks. The reference is dropped
+ * in ext4_mb_release_context
+ */
ac->ac_bitmap_page = e4b->bd_bitmap_page;
get_page(ac->ac_bitmap_page);
ac->ac_buddy_page = e4b->bd_buddy_page;
get_page(ac->ac_buddy_page);
-
/* store last allocated for subsequent stream allocation */
- if ((ac->ac_flags & EXT4_MB_HINT_DATA)) {
+ if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) {
spin_lock(&sbi->s_md_lock);
sbi->s_mb_last_group = ac->ac_f_ex.fe_group;
sbi->s_mb_last_start = ac->ac_f_ex.fe_start;
@@ -1599,6 +1671,8 @@ static void ext4_mb_check_limits(struct ext4_allocation_context *ac,
struct ext4_free_extent ex;
int max;
+ if (ac->ac_status == AC_STATUS_FOUND)
+ return;
/*
* We don't want to scan for a whole year
*/
@@ -1619,7 +1693,7 @@ static void ext4_mb_check_limits(struct ext4_allocation_context *ac,
/* recheck chunk's availability - we don't know
* when it was found (within this lock-unlock
* period or not) */
- max = mb_find_extent(e4b, 0, bex->fe_start, gex->fe_len, &ex);
+ max = mb_find_extent(e4b, bex->fe_start, gex->fe_len, &ex);
if (max >= gex->fe_len) {
ext4_mb_use_best_found(ac, e4b);
return;
@@ -1645,8 +1719,8 @@ static void ext4_mb_measure_extent(struct ext4_allocation_context *ac,
struct ext4_free_extent *gex = &ac->ac_g_ex;
BUG_ON(ex->fe_len <= 0);
- BUG_ON(ex->fe_len >= EXT4_BLOCKS_PER_GROUP(ac->ac_sb));
- BUG_ON(ex->fe_start >= EXT4_BLOCKS_PER_GROUP(ac->ac_sb));
+ BUG_ON(ex->fe_len > EXT4_CLUSTERS_PER_GROUP(ac->ac_sb));
+ BUG_ON(ex->fe_start >= EXT4_CLUSTERS_PER_GROUP(ac->ac_sb));
BUG_ON(ac->ac_status != AC_STATUS_CONTINUE);
ac->ac_found++;
@@ -1696,7 +1770,8 @@ static void ext4_mb_measure_extent(struct ext4_allocation_context *ac,
ext4_mb_check_limits(ac, e4b, 0);
}
-static int ext4_mb_try_best_found(struct ext4_allocation_context *ac,
+static noinline_for_stack
+int ext4_mb_try_best_found(struct ext4_allocation_context *ac,
struct ext4_buddy *e4b)
{
struct ext4_free_extent ex = ac->ac_b_ex;
@@ -1710,7 +1785,7 @@ static int ext4_mb_try_best_found(struct ext4_allocation_context *ac,
return err;
ext4_lock_group(ac->ac_sb, group);
- max = mb_find_extent(e4b, 0, ex.fe_start, ex.fe_len, &ex);
+ max = mb_find_extent(e4b, ex.fe_start, ex.fe_len, &ex);
if (max > 0) {
ac->ac_b_ex = ex;
@@ -1718,37 +1793,46 @@ static int ext4_mb_try_best_found(struct ext4_allocation_context *ac,
}
ext4_unlock_group(ac->ac_sb, group);
- ext4_mb_release_desc(e4b);
+ ext4_mb_unload_buddy(e4b);
return 0;
}
-static int ext4_mb_find_by_goal(struct ext4_allocation_context *ac,
+static noinline_for_stack
+int ext4_mb_find_by_goal(struct ext4_allocation_context *ac,
struct ext4_buddy *e4b)
{
ext4_group_t group = ac->ac_g_ex.fe_group;
int max;
int err;
struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
- struct ext4_super_block *es = sbi->s_es;
+ struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group);
struct ext4_free_extent ex;
if (!(ac->ac_flags & EXT4_MB_HINT_TRY_GOAL))
return 0;
+ if (grp->bb_free == 0)
+ return 0;
err = ext4_mb_load_buddy(ac->ac_sb, group, e4b);
if (err)
return err;
+ if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info))) {
+ ext4_mb_unload_buddy(e4b);
+ return 0;
+ }
+
ext4_lock_group(ac->ac_sb, group);
- max = mb_find_extent(e4b, 0, ac->ac_g_ex.fe_start,
+ max = mb_find_extent(e4b, ac->ac_g_ex.fe_start,
ac->ac_g_ex.fe_len, &ex);
+ ex.fe_logical = 0xDEADFA11; /* debug value */
if (max >= ac->ac_g_ex.fe_len && ac->ac_g_ex.fe_len == sbi->s_stripe) {
ext4_fsblk_t start;
- start = (e4b->bd_group * EXT4_BLOCKS_PER_GROUP(ac->ac_sb)) +
- ex.fe_start + le32_to_cpu(es->s_first_data_block);
+ start = ext4_group_first_block_no(ac->ac_sb, e4b->bd_group) +
+ ex.fe_start;
/* use do_div to get remainder (would be 64-bit modulo) */
if (do_div(start, sbi->s_stripe) == 0) {
ac->ac_found++;
@@ -1773,7 +1857,7 @@ static int ext4_mb_find_by_goal(struct ext4_allocation_context *ac,
ext4_mb_use_best_found(ac, e4b);
}
ext4_unlock_group(ac->ac_sb, group);
- ext4_mb_release_desc(e4b);
+ ext4_mb_unload_buddy(e4b);
return 0;
}
@@ -1782,7 +1866,8 @@ static int ext4_mb_find_by_goal(struct ext4_allocation_context *ac,
* The routine scans buddy structures (not bitmap!) from given order
* to max order and tries to find big enough chunk to satisfy the req
*/
-static void ext4_mb_simple_scan_group(struct ext4_allocation_context *ac,
+static noinline_for_stack
+void ext4_mb_simple_scan_group(struct ext4_allocation_context *ac,
struct ext4_buddy *e4b)
{
struct super_block *sb = ac->ac_sb;
@@ -1825,11 +1910,12 @@ static void ext4_mb_simple_scan_group(struct ext4_allocation_context *ac,
* In order to optimize scanning, caller must pass number of
* free blocks in the group, so the routine can know upper limit.
*/
-static void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
+static noinline_for_stack
+void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
struct ext4_buddy *e4b)
{
struct super_block *sb = ac->ac_sb;
- void *bitmap = EXT4_MB_BITMAP(e4b);
+ void *bitmap = e4b->bd_bitmap;
struct ext4_free_extent ex;
int i;
int free;
@@ -1841,24 +1927,26 @@ static void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
while (free && ac->ac_status == AC_STATUS_CONTINUE) {
i = mb_find_next_zero_bit(bitmap,
- EXT4_BLOCKS_PER_GROUP(sb), i);
- if (i >= EXT4_BLOCKS_PER_GROUP(sb)) {
+ EXT4_CLUSTERS_PER_GROUP(sb), i);
+ if (i >= EXT4_CLUSTERS_PER_GROUP(sb)) {
/*
* IF we have corrupt bitmap, we won't find any
* free blocks even though group info says we
* we have free blocks
*/
- ext4_error(sb, __FUNCTION__, "%d free blocks as per "
- "group info. But bitmap says 0\n",
+ ext4_grp_locked_error(sb, e4b->bd_group, 0, 0,
+ "%d free clusters as per "
+ "group info. But bitmap says 0",
free);
break;
}
- mb_find_extent(e4b, 0, i, ac->ac_g_ex.fe_len, &ex);
+ mb_find_extent(e4b, i, ac->ac_g_ex.fe_len, &ex);
BUG_ON(ex.fe_len <= 0);
if (free < ex.fe_len) {
- ext4_error(sb, __FUNCTION__, "%d free blocks as per "
- "group info. But got %d blocks\n",
+ ext4_grp_locked_error(sb, e4b->bd_group, 0, 0,
+ "%d free clusters as per "
+ "group info. But got %d blocks",
free, ex.fe_len);
/*
* The number of free blocks differs. This mostly
@@ -1867,7 +1955,7 @@ static void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
*/
break;
}
-
+ ex.fe_logical = 0xDEADC0DE; /* debug value */
ext4_mb_measure_extent(ac, &ex, e4b);
i += ex.fe_len;
@@ -1879,15 +1967,15 @@ static void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
/*
* This is a special case for storages like raid5
- * we try to find stripe-aligned chunks for stripe-size requests
- * XXX should do so at least for multiples of stripe size as well
+ * we try to find stripe-aligned chunks for stripe-size-multiple requests
*/
-static void ext4_mb_scan_aligned(struct ext4_allocation_context *ac,
+static noinline_for_stack
+void ext4_mb_scan_aligned(struct ext4_allocation_context *ac,
struct ext4_buddy *e4b)
{
struct super_block *sb = ac->ac_sb;
struct ext4_sb_info *sbi = EXT4_SB(sb);
- void *bitmap = EXT4_MB_BITMAP(e4b);
+ void *bitmap = e4b->bd_bitmap;
struct ext4_free_extent ex;
ext4_fsblk_t first_group_block;
ext4_fsblk_t a;
@@ -1897,17 +1985,18 @@ static void ext4_mb_scan_aligned(struct ext4_allocation_context *ac,
BUG_ON(sbi->s_stripe == 0);
/* find first stripe-aligned block in group */
- first_group_block = e4b->bd_group * EXT4_BLOCKS_PER_GROUP(sb)
- + le32_to_cpu(sbi->s_es->s_first_data_block);
+ first_group_block = ext4_group_first_block_no(sb, e4b->bd_group);
+
a = first_group_block + sbi->s_stripe - 1;
do_div(a, sbi->s_stripe);
i = (a * sbi->s_stripe) - first_group_block;
- while (i < EXT4_BLOCKS_PER_GROUP(sb)) {
+ while (i < EXT4_CLUSTERS_PER_GROUP(sb)) {
if (!mb_test_bit(i, bitmap)) {
- max = mb_find_extent(e4b, 0, i, sbi->s_stripe, &ex);
+ max = mb_find_extent(e4b, i, sbi->s_stripe, &ex);
if (max >= sbi->s_stripe) {
ac->ac_found++;
+ ex.fe_logical = 0xDEADF00D; /* debug value */
ac->ac_b_ex = ex;
ext4_mb_use_best_found(ac, e4b);
break;
@@ -1917,37 +2006,54 @@ static void ext4_mb_scan_aligned(struct ext4_allocation_context *ac,
}
}
+/* This is now called BEFORE we load the buddy bitmap. */
static int ext4_mb_good_group(struct ext4_allocation_context *ac,
ext4_group_t group, int cr)
{
unsigned free, fragments;
- unsigned i, bits;
- struct ext4_group_desc *desc;
+ int flex_size = ext4_flex_bg_size(EXT4_SB(ac->ac_sb));
struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group);
BUG_ON(cr < 0 || cr >= 4);
- BUG_ON(EXT4_MB_GRP_NEED_INIT(grp));
free = grp->bb_free;
- fragments = grp->bb_fragments;
if (free == 0)
return 0;
+ if (cr <= 2 && free < ac->ac_g_ex.fe_len)
+ return 0;
+
+ if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(grp)))
+ return 0;
+
+ /* We only do this if the grp has never been initialized */
+ if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
+ int ret = ext4_mb_init_group(ac->ac_sb, group);
+ if (ret)
+ return 0;
+ }
+
+ fragments = grp->bb_fragments;
if (fragments == 0)
return 0;
switch (cr) {
case 0:
BUG_ON(ac->ac_2order == 0);
- /* If this group is uninitialized, skip it initially */
- desc = ext4_get_group_desc(ac->ac_sb, group, NULL);
- if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))
+
+ /* Avoid using the first bg of a flexgroup for data files */
+ if ((ac->ac_flags & EXT4_MB_HINT_DATA) &&
+ (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) &&
+ ((group % flex_size) == 0))
return 0;
- bits = ac->ac_sb->s_blocksize_bits + 1;
- for (i = ac->ac_2order; i <= bits; i++)
- if (grp->bb_counters[i] > 0)
- return 1;
- break;
+ if ((ac->ac_2order > ac->ac_sb->s_blocksize_bits+1) ||
+ (free / fragments) >= ac->ac_g_ex.fe_len)
+ return 1;
+
+ if (grp->bb_largest_free_order < ac->ac_2order)
+ return 0;
+
+ return 1;
case 1:
if ((free / fragments) >= ac->ac_g_ex.fe_len)
return 1;
@@ -1965,20 +2071,23 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac,
return 0;
}
-static int ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
+static noinline_for_stack int
+ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
{
- ext4_group_t group;
- ext4_group_t i;
+ ext4_group_t ngroups, group, i;
int cr;
int err = 0;
- int bsbits;
struct ext4_sb_info *sbi;
struct super_block *sb;
struct ext4_buddy e4b;
- loff_t size, isize;
sb = ac->ac_sb;
sbi = EXT4_SB(sb);
+ ngroups = ext4_get_groups_count(sb);
+ /* non-extent files are limited to low blocks/groups */
+ if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)))
+ ngroups = sbi->s_blockfile_groups;
+
BUG_ON(ac->ac_status == AC_STATUS_FOUND);
/* first, try the goal */
@@ -1999,7 +2108,7 @@ static int ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
/*
* We search using buddy data only if the order of the request
* is greater than equal to the sbi_s_mb_order2_reqs
- * You can tune it via /proc/fs/ext4/<partition>/order2_req
+ * You can tune it via /sys/fs/ext4/<partition>/mb_order2_req
*/
if (i >= sbi->s_mb_order2_reqs) {
/*
@@ -2009,15 +2118,8 @@ static int ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
ac->ac_2order = i - 1;
}
- bsbits = ac->ac_sb->s_blocksize_bits;
/* if stream allocation is enabled, use global goal */
- size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len;
- isize = i_size_read(ac->ac_inode) >> bsbits;
- if (size < isize)
- size = isize;
-
- if (size < sbi->s_mb_stream_request &&
- (ac->ac_flags & EXT4_MB_HINT_DATA)) {
+ if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) {
/* TBD: may be hot point */
spin_lock(&sbi->s_md_lock);
ac->ac_g_ex.fe_group = sbi->s_mb_last_group;
@@ -2025,9 +2127,6 @@ static int ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
spin_unlock(&sbi->s_md_lock);
}
- /* searching for the right group start from the goal value specified */
- group = ac->ac_g_ex.fe_group;
-
/* Let's just scan groups to find more-less suitable blocks */
cr = ac->ac_2order ? 0 : 1;
/*
@@ -2037,37 +2136,22 @@ static int ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
repeat:
for (; cr < 4 && ac->ac_status == AC_STATUS_CONTINUE; cr++) {
ac->ac_criteria = cr;
- for (i = 0; i < EXT4_SB(sb)->s_groups_count; group++, i++) {
- struct ext4_group_info *grp;
- struct ext4_group_desc *desc;
-
- if (group == EXT4_SB(sb)->s_groups_count)
- group = 0;
-
- /* quick check to skip empty groups */
- grp = ext4_get_group_info(ac->ac_sb, group);
- if (grp->bb_free == 0)
- continue;
+ /*
+ * searching for the right group start
+ * from the goal value specified
+ */
+ group = ac->ac_g_ex.fe_group;
+ for (i = 0; i < ngroups; group++, i++) {
+ cond_resched();
/*
- * if the group is already init we check whether it is
- * a good group and if not we don't load the buddy
+ * Artificially restricted ngroups for non-extent
+ * files makes group > ngroups possible on first loop.
*/
- if (EXT4_MB_GRP_NEED_INIT(grp)) {
- /*
- * we need full data about the group
- * to make a good selection
- */
- err = ext4_mb_load_buddy(sb, group, &e4b);
- if (err)
- goto out;
- ext4_mb_release_desc(&e4b);
- }
+ if (group >= ngroups)
+ group = 0;
- /*
- * If the particular group doesn't satisfy our
- * criteria we continue with the next group
- */
+ /* This now checks without needing the buddy page */
if (!ext4_mb_good_group(ac, group, cr))
continue;
@@ -2076,27 +2160,28 @@ repeat:
goto out;
ext4_lock_group(sb, group);
+
+ /*
+ * We need to check again after locking the
+ * block group
+ */
if (!ext4_mb_good_group(ac, group, cr)) {
- /* someone did allocation from this group */
ext4_unlock_group(sb, group);
- ext4_mb_release_desc(&e4b);
+ ext4_mb_unload_buddy(&e4b);
continue;
}
ac->ac_groups_scanned++;
- desc = ext4_get_group_desc(sb, group, NULL);
- if (cr == 0 || (desc->bg_flags &
- cpu_to_le16(EXT4_BG_BLOCK_UNINIT) &&
- ac->ac_2order != 0))
+ if (cr == 0 && ac->ac_2order < sb->s_blocksize_bits+2)
ext4_mb_simple_scan_group(ac, &e4b);
- else if (cr == 1 &&
- ac->ac_g_ex.fe_len == sbi->s_stripe)
+ else if (cr == 1 && sbi->s_stripe &&
+ !(ac->ac_g_ex.fe_len % sbi->s_stripe))
ext4_mb_scan_aligned(ac, &e4b);
else
ext4_mb_complex_scan_group(ac, &e4b);
ext4_unlock_group(sb, group);
- ext4_mb_release_desc(&e4b);
+ ext4_mb_unload_buddy(&e4b);
if (ac->ac_status != AC_STATUS_CONTINUE)
break;
@@ -2132,241 +2217,40 @@ out:
return err;
}
-#ifdef EXT4_MB_HISTORY
-struct ext4_mb_proc_session {
- struct ext4_mb_history *history;
- struct super_block *sb;
- int start;
- int max;
-};
-
-static void *ext4_mb_history_skip_empty(struct ext4_mb_proc_session *s,
- struct ext4_mb_history *hs,
- int first)
-{
- if (hs == s->history + s->max)
- hs = s->history;
- if (!first && hs == s->history + s->start)
- return NULL;
- while (hs->orig.fe_len == 0) {
- hs++;
- if (hs == s->history + s->max)
- hs = s->history;
- if (hs == s->history + s->start)
- return NULL;
- }
- return hs;
-}
-
-static void *ext4_mb_seq_history_start(struct seq_file *seq, loff_t *pos)
-{
- struct ext4_mb_proc_session *s = seq->private;
- struct ext4_mb_history *hs;
- int l = *pos;
-
- if (l == 0)
- return SEQ_START_TOKEN;
- hs = ext4_mb_history_skip_empty(s, s->history + s->start, 1);
- if (!hs)
- return NULL;
- while (--l && (hs = ext4_mb_history_skip_empty(s, ++hs, 0)) != NULL);
- return hs;
-}
-
-static void *ext4_mb_seq_history_next(struct seq_file *seq, void *v,
- loff_t *pos)
-{
- struct ext4_mb_proc_session *s = seq->private;
- struct ext4_mb_history *hs = v;
-
- ++*pos;
- if (v == SEQ_START_TOKEN)
- return ext4_mb_history_skip_empty(s, s->history + s->start, 1);
- else
- return ext4_mb_history_skip_empty(s, ++hs, 0);
-}
-
-static int ext4_mb_seq_history_show(struct seq_file *seq, void *v)
-{
- char buf[25], buf2[25], buf3[25], *fmt;
- struct ext4_mb_history *hs = v;
-
- if (v == SEQ_START_TOKEN) {
- seq_printf(seq, "%-5s %-8s %-23s %-23s %-23s %-5s "
- "%-5s %-2s %-5s %-5s %-5s %-6s\n",
- "pid", "inode", "original", "goal", "result", "found",
- "grps", "cr", "flags", "merge", "tail", "broken");
- return 0;
- }
-
- if (hs->op == EXT4_MB_HISTORY_ALLOC) {
- fmt = "%-5u %-8u %-23s %-23s %-23s %-5u %-5u %-2u "
- "%-5u %-5s %-5u %-6u\n";
- sprintf(buf2, "%lu/%d/%u@%u", hs->result.fe_group,
- hs->result.fe_start, hs->result.fe_len,
- hs->result.fe_logical);
- sprintf(buf, "%lu/%d/%u@%u", hs->orig.fe_group,
- hs->orig.fe_start, hs->orig.fe_len,
- hs->orig.fe_logical);
- sprintf(buf3, "%lu/%d/%u@%u", hs->goal.fe_group,
- hs->goal.fe_start, hs->goal.fe_len,
- hs->goal.fe_logical);
- seq_printf(seq, fmt, hs->pid, hs->ino, buf, buf3, buf2,
- hs->found, hs->groups, hs->cr, hs->flags,
- hs->merged ? "M" : "", hs->tail,
- hs->buddy ? 1 << hs->buddy : 0);
- } else if (hs->op == EXT4_MB_HISTORY_PREALLOC) {
- fmt = "%-5u %-8u %-23s %-23s %-23s\n";
- sprintf(buf2, "%lu/%d/%u@%u", hs->result.fe_group,
- hs->result.fe_start, hs->result.fe_len,
- hs->result.fe_logical);
- sprintf(buf, "%lu/%d/%u@%u", hs->orig.fe_group,
- hs->orig.fe_start, hs->orig.fe_len,
- hs->orig.fe_logical);
- seq_printf(seq, fmt, hs->pid, hs->ino, buf, "", buf2);
- } else if (hs->op == EXT4_MB_HISTORY_DISCARD) {
- sprintf(buf2, "%lu/%d/%u", hs->result.fe_group,
- hs->result.fe_start, hs->result.fe_len);
- seq_printf(seq, "%-5u %-8u %-23s discard\n",
- hs->pid, hs->ino, buf2);
- } else if (hs->op == EXT4_MB_HISTORY_FREE) {
- sprintf(buf2, "%lu/%d/%u", hs->result.fe_group,
- hs->result.fe_start, hs->result.fe_len);
- seq_printf(seq, "%-5u %-8u %-23s free\n",
- hs->pid, hs->ino, buf2);
- }
- return 0;
-}
-
-static void ext4_mb_seq_history_stop(struct seq_file *seq, void *v)
-{
-}
-
-static struct seq_operations ext4_mb_seq_history_ops = {
- .start = ext4_mb_seq_history_start,
- .next = ext4_mb_seq_history_next,
- .stop = ext4_mb_seq_history_stop,
- .show = ext4_mb_seq_history_show,
-};
-
-static int ext4_mb_seq_history_open(struct inode *inode, struct file *file)
-{
- struct super_block *sb = PDE(inode)->data;
- struct ext4_sb_info *sbi = EXT4_SB(sb);
- struct ext4_mb_proc_session *s;
- int rc;
- int size;
-
- s = kmalloc(sizeof(*s), GFP_KERNEL);
- if (s == NULL)
- return -ENOMEM;
- s->sb = sb;
- size = sizeof(struct ext4_mb_history) * sbi->s_mb_history_max;
- s->history = kmalloc(size, GFP_KERNEL);
- if (s->history == NULL) {
- kfree(s);
- return -ENOMEM;
- }
-
- spin_lock(&sbi->s_mb_history_lock);
- memcpy(s->history, sbi->s_mb_history, size);
- s->max = sbi->s_mb_history_max;
- s->start = sbi->s_mb_history_cur % s->max;
- spin_unlock(&sbi->s_mb_history_lock);
-
- rc = seq_open(file, &ext4_mb_seq_history_ops);
- if (rc == 0) {
- struct seq_file *m = (struct seq_file *)file->private_data;
- m->private = s;
- } else {
- kfree(s->history);
- kfree(s);
- }
- return rc;
-
-}
-
-static int ext4_mb_seq_history_release(struct inode *inode, struct file *file)
-{
- struct seq_file *seq = (struct seq_file *)file->private_data;
- struct ext4_mb_proc_session *s = seq->private;
- kfree(s->history);
- kfree(s);
- return seq_release(inode, file);
-}
-
-static ssize_t ext4_mb_seq_history_write(struct file *file,
- const char __user *buffer,
- size_t count, loff_t *ppos)
-{
- struct seq_file *seq = (struct seq_file *)file->private_data;
- struct ext4_mb_proc_session *s = seq->private;
- struct super_block *sb = s->sb;
- char str[32];
- int value;
-
- if (count >= sizeof(str)) {
- printk(KERN_ERR "EXT4-fs: %s string too long, max %u bytes\n",
- "mb_history", (int)sizeof(str));
- return -EOVERFLOW;
- }
-
- if (copy_from_user(str, buffer, count))
- return -EFAULT;
-
- value = simple_strtol(str, NULL, 0);
- if (value < 0)
- return -ERANGE;
- EXT4_SB(sb)->s_mb_history_filter = value;
-
- return count;
-}
-
-static struct file_operations ext4_mb_seq_history_fops = {
- .owner = THIS_MODULE,
- .open = ext4_mb_seq_history_open,
- .read = seq_read,
- .write = ext4_mb_seq_history_write,
- .llseek = seq_lseek,
- .release = ext4_mb_seq_history_release,
-};
-
static void *ext4_mb_seq_groups_start(struct seq_file *seq, loff_t *pos)
{
struct super_block *sb = seq->private;
- struct ext4_sb_info *sbi = EXT4_SB(sb);
ext4_group_t group;
- if (*pos < 0 || *pos >= sbi->s_groups_count)
+ if (*pos < 0 || *pos >= ext4_get_groups_count(sb))
return NULL;
-
group = *pos + 1;
- return (void *) group;
+ return (void *) ((unsigned long) group);
}
static void *ext4_mb_seq_groups_next(struct seq_file *seq, void *v, loff_t *pos)
{
struct super_block *sb = seq->private;
- struct ext4_sb_info *sbi = EXT4_SB(sb);
ext4_group_t group;
++*pos;
- if (*pos < 0 || *pos >= sbi->s_groups_count)
+ if (*pos < 0 || *pos >= ext4_get_groups_count(sb))
return NULL;
group = *pos + 1;
- return (void *) group;;
+ return (void *) ((unsigned long) group);
}
static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v)
{
struct super_block *sb = seq->private;
- long group = (long) v;
+ ext4_group_t group = (ext4_group_t) ((unsigned long) v);
int i;
- int err;
+ int err, buddy_loaded = 0;
struct ext4_buddy e4b;
+ struct ext4_group_info *grinfo;
struct sg {
struct ext4_group_info info;
- unsigned short counters[16];
+ ext4_grpblk_t counters[16];
} sg;
group--;
@@ -2380,17 +2264,23 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v)
i = (sb->s_blocksize_bits + 2) * sizeof(sg.info.bb_counters[0]) +
sizeof(struct ext4_group_info);
- err = ext4_mb_load_buddy(sb, group, &e4b);
- if (err) {
- seq_printf(seq, "#%-5lu: I/O error\n", group);
- return 0;
+ grinfo = ext4_get_group_info(sb, group);
+ /* Load the group info in memory only if not already loaded. */
+ if (unlikely(EXT4_MB_GRP_NEED_INIT(grinfo))) {
+ err = ext4_mb_load_buddy(sb, group, &e4b);
+ if (err) {
+ seq_printf(seq, "#%-5u: I/O error\n", group);
+ return 0;
+ }
+ buddy_loaded = 1;
}
- ext4_lock_group(sb, group);
+
memcpy(&sg, ext4_get_group_info(sb, group), i);
- ext4_unlock_group(sb, group);
- ext4_mb_release_desc(&e4b);
- seq_printf(seq, "#%-5lu: %-5u %-5u %-5u [", group, sg.info.bb_free,
+ if (buddy_loaded)
+ ext4_mb_unload_buddy(&e4b);
+
+ seq_printf(seq, "#%-5u: %-5u %-5u %-5u [", group, sg.info.bb_free,
sg.info.bb_fragments, sg.info.bb_first_free);
for (i = 0; i <= 13; i++)
seq_printf(seq, " %-5u", i <= sb->s_blocksize_bits + 1 ?
@@ -2404,7 +2294,7 @@ static void ext4_mb_seq_groups_stop(struct seq_file *seq, void *v)
{
}
-static struct seq_operations ext4_mb_seq_groups_ops = {
+static const struct seq_operations ext4_mb_seq_groups_ops = {
.start = ext4_mb_seq_groups_start,
.next = ext4_mb_seq_groups_next,
.stop = ext4_mb_seq_groups_stop,
@@ -2413,19 +2303,19 @@ static struct seq_operations ext4_mb_seq_groups_ops = {
static int ext4_mb_seq_groups_open(struct inode *inode, struct file *file)
{
- struct super_block *sb = PDE(inode)->data;
+ struct super_block *sb = PDE_DATA(inode);
int rc;
rc = seq_open(file, &ext4_mb_seq_groups_ops);
if (rc == 0) {
- struct seq_file *m = (struct seq_file *)file->private_data;
+ struct seq_file *m = file->private_data;
m->private = sb;
}
return rc;
}
-static struct file_operations ext4_mb_seq_groups_fops = {
+static const struct file_operations ext4_mb_seq_groups_fops = {
.owner = THIS_MODULE,
.open = ext4_mb_seq_groups_open,
.read = seq_read,
@@ -2433,229 +2323,257 @@ static struct file_operations ext4_mb_seq_groups_fops = {
.release = seq_release,
};
-static void ext4_mb_history_release(struct super_block *sb)
+static struct kmem_cache *get_groupinfo_cache(int blocksize_bits)
+{
+ int cache_index = blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE;
+ struct kmem_cache *cachep = ext4_groupinfo_caches[cache_index];
+
+ BUG_ON(!cachep);
+ return cachep;
+}
+
+/*
+ * Allocate the top-level s_group_info array for the specified number
+ * of groups
+ */
+int ext4_mb_alloc_groupinfo(struct super_block *sb, ext4_group_t ngroups)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
+ unsigned size;
+ struct ext4_group_info ***new_groupinfo;
- remove_proc_entry("mb_groups", sbi->s_mb_proc);
- remove_proc_entry("mb_history", sbi->s_mb_proc);
+ size = (ngroups + EXT4_DESC_PER_BLOCK(sb) - 1) >>
+ EXT4_DESC_PER_BLOCK_BITS(sb);
+ if (size <= sbi->s_group_info_size)
+ return 0;
- kfree(sbi->s_mb_history);
+ size = roundup_pow_of_two(sizeof(*sbi->s_group_info) * size);
+ new_groupinfo = ext4_kvzalloc(size, GFP_KERNEL);
+ if (!new_groupinfo) {
+ ext4_msg(sb, KERN_ERR, "can't allocate buddy meta group");
+ return -ENOMEM;
+ }
+ if (sbi->s_group_info) {
+ memcpy(new_groupinfo, sbi->s_group_info,
+ sbi->s_group_info_size * sizeof(*sbi->s_group_info));
+ ext4_kvfree(sbi->s_group_info);
+ }
+ sbi->s_group_info = new_groupinfo;
+ sbi->s_group_info_size = size / sizeof(*sbi->s_group_info);
+ ext4_debug("allocated s_groupinfo array for %d meta_bg's\n",
+ sbi->s_group_info_size);
+ return 0;
}
-static void ext4_mb_history_init(struct super_block *sb)
+/* Create and initialize ext4_group_info data for the given group. */
+int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
+ struct ext4_group_desc *desc)
{
- struct ext4_sb_info *sbi = EXT4_SB(sb);
int i;
+ int metalen = 0;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_group_info **meta_group_info;
+ struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits);
- if (sbi->s_mb_proc != NULL) {
- struct proc_dir_entry *p;
- p = create_proc_entry("mb_history", S_IRUGO, sbi->s_mb_proc);
- if (p) {
- p->proc_fops = &ext4_mb_seq_history_fops;
- p->data = sb;
- }
- p = create_proc_entry("mb_groups", S_IRUGO, sbi->s_mb_proc);
- if (p) {
- p->proc_fops = &ext4_mb_seq_groups_fops;
- p->data = sb;
+ /*
+ * First check if this group is the first of a reserved block.
+ * If it's true, we have to allocate a new table of pointers
+ * to ext4_group_info structures
+ */
+ if (group % EXT4_DESC_PER_BLOCK(sb) == 0) {
+ metalen = sizeof(*meta_group_info) <<
+ EXT4_DESC_PER_BLOCK_BITS(sb);
+ meta_group_info = kmalloc(metalen, GFP_KERNEL);
+ if (meta_group_info == NULL) {
+ ext4_msg(sb, KERN_ERR, "can't allocate mem "
+ "for a buddy group");
+ goto exit_meta_group_info;
}
+ sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)] =
+ meta_group_info;
}
- sbi->s_mb_history_max = 1000;
- sbi->s_mb_history_cur = 0;
- spin_lock_init(&sbi->s_mb_history_lock);
- i = sbi->s_mb_history_max * sizeof(struct ext4_mb_history);
- sbi->s_mb_history = kmalloc(i, GFP_KERNEL);
- if (likely(sbi->s_mb_history != NULL))
- memset(sbi->s_mb_history, 0, i);
- /* if we can't allocate history, then we simple won't use it */
-}
+ meta_group_info =
+ sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)];
+ i = group & (EXT4_DESC_PER_BLOCK(sb) - 1);
-static void ext4_mb_store_history(struct ext4_allocation_context *ac)
-{
- struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
- struct ext4_mb_history h;
+ meta_group_info[i] = kmem_cache_zalloc(cachep, GFP_KERNEL);
+ if (meta_group_info[i] == NULL) {
+ ext4_msg(sb, KERN_ERR, "can't allocate buddy mem");
+ goto exit_group_info;
+ }
+ set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT,
+ &(meta_group_info[i]->bb_state));
- if (unlikely(sbi->s_mb_history == NULL))
- return;
+ /*
+ * initialize bb_free to be able to skip
+ * empty groups without initialization
+ */
+ if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
+ meta_group_info[i]->bb_free =
+ ext4_free_clusters_after_init(sb, group, desc);
+ } else {
+ meta_group_info[i]->bb_free =
+ ext4_free_group_clusters(sb, desc);
+ }
- if (!(ac->ac_op & sbi->s_mb_history_filter))
- return;
+ INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list);
+ init_rwsem(&meta_group_info[i]->alloc_sem);
+ meta_group_info[i]->bb_free_root = RB_ROOT;
+ meta_group_info[i]->bb_largest_free_order = -1; /* uninit */
- h.op = ac->ac_op;
- h.pid = current->pid;
- h.ino = ac->ac_inode ? ac->ac_inode->i_ino : 0;
- h.orig = ac->ac_o_ex;
- h.result = ac->ac_b_ex;
- h.flags = ac->ac_flags;
- h.found = ac->ac_found;
- h.groups = ac->ac_groups_scanned;
- h.cr = ac->ac_criteria;
- h.tail = ac->ac_tail;
- h.buddy = ac->ac_buddy;
- h.merged = 0;
- if (ac->ac_op == EXT4_MB_HISTORY_ALLOC) {
- if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start &&
- ac->ac_g_ex.fe_group == ac->ac_b_ex.fe_group)
- h.merged = 1;
- h.goal = ac->ac_g_ex;
- h.result = ac->ac_f_ex;
+#ifdef DOUBLE_CHECK
+ {
+ struct buffer_head *bh;
+ meta_group_info[i]->bb_bitmap =
+ kmalloc(sb->s_blocksize, GFP_KERNEL);
+ BUG_ON(meta_group_info[i]->bb_bitmap == NULL);
+ bh = ext4_read_block_bitmap(sb, group);
+ BUG_ON(bh == NULL);
+ memcpy(meta_group_info[i]->bb_bitmap, bh->b_data,
+ sb->s_blocksize);
+ put_bh(bh);
}
+#endif
- spin_lock(&sbi->s_mb_history_lock);
- memcpy(sbi->s_mb_history + sbi->s_mb_history_cur, &h, sizeof(h));
- if (++sbi->s_mb_history_cur >= sbi->s_mb_history_max)
- sbi->s_mb_history_cur = 0;
- spin_unlock(&sbi->s_mb_history_lock);
-}
+ return 0;
-#else
-#define ext4_mb_history_release(sb)
-#define ext4_mb_history_init(sb)
-#endif
+exit_group_info:
+ /* If a meta_group_info table has been allocated, release it now */
+ if (group % EXT4_DESC_PER_BLOCK(sb) == 0) {
+ kfree(sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)]);
+ sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)] = NULL;
+ }
+exit_meta_group_info:
+ return -ENOMEM;
+} /* ext4_mb_add_groupinfo */
static int ext4_mb_init_backend(struct super_block *sb)
{
+ ext4_group_t ngroups = ext4_get_groups_count(sb);
ext4_group_t i;
- int j, len, metalen;
struct ext4_sb_info *sbi = EXT4_SB(sb);
- int num_meta_group_infos =
- (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) >>
- EXT4_DESC_PER_BLOCK_BITS(sb);
- struct ext4_group_info **meta_group_info;
+ int err;
+ struct ext4_group_desc *desc;
+ struct kmem_cache *cachep;
+
+ err = ext4_mb_alloc_groupinfo(sb, ngroups);
+ if (err)
+ return err;
- /* An 8TB filesystem with 64-bit pointers requires a 4096 byte
- * kmalloc. A 128kb malloc should suffice for a 256TB filesystem.
- * So a two level scheme suffices for now. */
- sbi->s_group_info = kmalloc(sizeof(*sbi->s_group_info) *
- num_meta_group_infos, GFP_KERNEL);
- if (sbi->s_group_info == NULL) {
- printk(KERN_ERR "EXT4-fs: can't allocate buddy meta group\n");
- return -ENOMEM;
- }
sbi->s_buddy_cache = new_inode(sb);
if (sbi->s_buddy_cache == NULL) {
- printk(KERN_ERR "EXT4-fs: can't get new inode\n");
+ ext4_msg(sb, KERN_ERR, "can't get new inode");
goto err_freesgi;
}
+ /* To avoid potentially colliding with an valid on-disk inode number,
+ * use EXT4_BAD_INO for the buddy cache inode number. This inode is
+ * not in the inode hash, so it should never be found by iget(), but
+ * this will avoid confusion if it ever shows up during debugging. */
+ sbi->s_buddy_cache->i_ino = EXT4_BAD_INO;
EXT4_I(sbi->s_buddy_cache)->i_disksize = 0;
-
- metalen = sizeof(*meta_group_info) << EXT4_DESC_PER_BLOCK_BITS(sb);
- for (i = 0; i < num_meta_group_infos; i++) {
- if ((i + 1) == num_meta_group_infos)
- metalen = sizeof(*meta_group_info) *
- (sbi->s_groups_count -
- (i << EXT4_DESC_PER_BLOCK_BITS(sb)));
- meta_group_info = kmalloc(metalen, GFP_KERNEL);
- if (meta_group_info == NULL) {
- printk(KERN_ERR "EXT4-fs: can't allocate mem for a "
- "buddy group\n");
- goto err_freemeta;
- }
- sbi->s_group_info[i] = meta_group_info;
- }
-
- /*
- * calculate needed size. if change bb_counters size,
- * don't forget about ext4_mb_generate_buddy()
- */
- len = sizeof(struct ext4_group_info);
- len += sizeof(unsigned short) * (sb->s_blocksize_bits + 2);
- for (i = 0; i < sbi->s_groups_count; i++) {
- struct ext4_group_desc *desc;
-
- meta_group_info =
- sbi->s_group_info[i >> EXT4_DESC_PER_BLOCK_BITS(sb)];
- j = i & (EXT4_DESC_PER_BLOCK(sb) - 1);
-
- meta_group_info[j] = kzalloc(len, GFP_KERNEL);
- if (meta_group_info[j] == NULL) {
- printk(KERN_ERR "EXT4-fs: can't allocate buddy mem\n");
- i--;
- goto err_freebuddy;
- }
+ for (i = 0; i < ngroups; i++) {
desc = ext4_get_group_desc(sb, i, NULL);
if (desc == NULL) {
- printk(KERN_ERR
- "EXT4-fs: can't read descriptor %lu\n", i);
+ ext4_msg(sb, KERN_ERR, "can't read descriptor %u", i);
goto err_freebuddy;
}
- memset(meta_group_info[j], 0, len);
- set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT,
- &(meta_group_info[j]->bb_state));
-
- /*
- * initialize bb_free to be able to skip
- * empty groups without initialization
- */
- if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
- meta_group_info[j]->bb_free =
- ext4_free_blocks_after_init(sb, i, desc);
- } else {
- meta_group_info[j]->bb_free =
- le16_to_cpu(desc->bg_free_blocks_count);
- }
-
- INIT_LIST_HEAD(&meta_group_info[j]->bb_prealloc_list);
-
-#ifdef DOUBLE_CHECK
- {
- struct buffer_head *bh;
- meta_group_info[j]->bb_bitmap =
- kmalloc(sb->s_blocksize, GFP_KERNEL);
- BUG_ON(meta_group_info[j]->bb_bitmap == NULL);
- bh = read_block_bitmap(sb, i);
- BUG_ON(bh == NULL);
- memcpy(meta_group_info[j]->bb_bitmap, bh->b_data,
- sb->s_blocksize);
- put_bh(bh);
- }
-#endif
-
+ if (ext4_mb_add_groupinfo(sb, i, desc) != 0)
+ goto err_freebuddy;
}
return 0;
err_freebuddy:
- while (i >= 0) {
- kfree(ext4_get_group_info(sb, i));
- i--;
- }
- i = num_meta_group_infos;
-err_freemeta:
- while (--i >= 0)
+ cachep = get_groupinfo_cache(sb->s_blocksize_bits);
+ while (i-- > 0)
+ kmem_cache_free(cachep, ext4_get_group_info(sb, i));
+ i = sbi->s_group_info_size;
+ while (i-- > 0)
kfree(sbi->s_group_info[i]);
iput(sbi->s_buddy_cache);
err_freesgi:
- kfree(sbi->s_group_info);
+ ext4_kvfree(sbi->s_group_info);
return -ENOMEM;
}
-int ext4_mb_init(struct super_block *sb, int needs_recovery)
+static void ext4_groupinfo_destroy_slabs(void)
+{
+ int i;
+
+ for (i = 0; i < NR_GRPINFO_CACHES; i++) {
+ if (ext4_groupinfo_caches[i])
+ kmem_cache_destroy(ext4_groupinfo_caches[i]);
+ ext4_groupinfo_caches[i] = NULL;
+ }
+}
+
+static int ext4_groupinfo_create_slab(size_t size)
+{
+ static DEFINE_MUTEX(ext4_grpinfo_slab_create_mutex);
+ int slab_size;
+ int blocksize_bits = order_base_2(size);
+ int cache_index = blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE;
+ struct kmem_cache *cachep;
+
+ if (cache_index >= NR_GRPINFO_CACHES)
+ return -EINVAL;
+
+ if (unlikely(cache_index < 0))
+ cache_index = 0;
+
+ mutex_lock(&ext4_grpinfo_slab_create_mutex);
+ if (ext4_groupinfo_caches[cache_index]) {
+ mutex_unlock(&ext4_grpinfo_slab_create_mutex);
+ return 0; /* Already created */
+ }
+
+ slab_size = offsetof(struct ext4_group_info,
+ bb_counters[blocksize_bits + 2]);
+
+ cachep = kmem_cache_create(ext4_groupinfo_slab_names[cache_index],
+ slab_size, 0, SLAB_RECLAIM_ACCOUNT,
+ NULL);
+
+ ext4_groupinfo_caches[cache_index] = cachep;
+
+ mutex_unlock(&ext4_grpinfo_slab_create_mutex);
+ if (!cachep) {
+ printk(KERN_EMERG
+ "EXT4-fs: no memory for groupinfo slab cache\n");
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+int ext4_mb_init(struct super_block *sb)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
- unsigned i;
+ unsigned i, j;
unsigned offset;
unsigned max;
+ int ret;
- if (!test_opt(sb, MBALLOC))
- return 0;
-
- i = (sb->s_blocksize_bits + 2) * sizeof(unsigned short);
+ i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_offsets);
sbi->s_mb_offsets = kmalloc(i, GFP_KERNEL);
if (sbi->s_mb_offsets == NULL) {
- clear_opt(sbi->s_mount_opt, MBALLOC);
- return -ENOMEM;
+ ret = -ENOMEM;
+ goto out;
}
+
+ i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_maxs);
sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL);
if (sbi->s_mb_maxs == NULL) {
- clear_opt(sbi->s_mount_opt, MBALLOC);
- kfree(sbi->s_mb_maxs);
- return -ENOMEM;
+ ret = -ENOMEM;
+ goto out;
}
+ ret = ext4_groupinfo_create_slab(sb->s_blocksize);
+ if (ret < 0)
+ goto out;
+
/* order 0 is regular bitmap */
sbi->s_mb_maxs[0] = sb->s_blocksize << 3;
sbi->s_mb_offsets[0] = 0;
@@ -2671,19 +2589,7 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
i++;
} while (i <= sb->s_blocksize_bits + 1);
- /* init file for buddy data */
- i = ext4_mb_init_backend(sb);
- if (i) {
- clear_opt(sbi->s_mount_opt, MBALLOC);
- kfree(sbi->s_mb_offsets);
- kfree(sbi->s_mb_maxs);
- return i;
- }
-
spin_lock_init(&sbi->s_md_lock);
- INIT_LIST_HEAD(&sbi->s_active_transaction);
- INIT_LIST_HEAD(&sbi->s_closed_transaction);
- INIT_LIST_HEAD(&sbi->s_committed_transaction);
spin_lock_init(&sbi->s_bal_lock);
sbi->s_mb_max_to_scan = MB_DEFAULT_MAX_TO_SCAN;
@@ -2691,33 +2597,70 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
sbi->s_mb_stats = MB_DEFAULT_STATS;
sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD;
sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS;
- sbi->s_mb_history_filter = EXT4_MB_HISTORY_DEFAULT;
- sbi->s_mb_group_prealloc = MB_DEFAULT_GROUP_PREALLOC;
+ /*
+ * The default group preallocation is 512, which for 4k block
+ * sizes translates to 2 megabytes. However for bigalloc file
+ * systems, this is probably too big (i.e, if the cluster size
+ * is 1 megabyte, then group preallocation size becomes half a
+ * gigabyte!). As a default, we will keep a two megabyte
+ * group pralloc size for cluster sizes up to 64k, and after
+ * that, we will force a minimum group preallocation size of
+ * 32 clusters. This translates to 8 megs when the cluster
+ * size is 256k, and 32 megs when the cluster size is 1 meg,
+ * which seems reasonable as a default.
+ */
+ sbi->s_mb_group_prealloc = max(MB_DEFAULT_GROUP_PREALLOC >>
+ sbi->s_cluster_bits, 32);
+ /*
+ * If there is a s_stripe > 1, then we set the s_mb_group_prealloc
+ * to the lowest multiple of s_stripe which is bigger than
+ * the s_mb_group_prealloc as determined above. We want
+ * the preallocation size to be an exact multiple of the
+ * RAID stripe size so that preallocations don't fragment
+ * the stripes.
+ */
+ if (sbi->s_stripe > 1) {
+ sbi->s_mb_group_prealloc = roundup(
+ sbi->s_mb_group_prealloc, sbi->s_stripe);
+ }
- i = sizeof(struct ext4_locality_group) * NR_CPUS;
- sbi->s_locality_groups = kmalloc(i, GFP_KERNEL);
+ sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group);
if (sbi->s_locality_groups == NULL) {
- clear_opt(sbi->s_mount_opt, MBALLOC);
- kfree(sbi->s_mb_offsets);
- kfree(sbi->s_mb_maxs);
- return -ENOMEM;
+ ret = -ENOMEM;
+ goto out;
}
- for (i = 0; i < NR_CPUS; i++) {
+ for_each_possible_cpu(i) {
struct ext4_locality_group *lg;
- lg = &sbi->s_locality_groups[i];
+ lg = per_cpu_ptr(sbi->s_locality_groups, i);
mutex_init(&lg->lg_mutex);
- INIT_LIST_HEAD(&lg->lg_prealloc_list);
+ for (j = 0; j < PREALLOC_TB_SIZE; j++)
+ INIT_LIST_HEAD(&lg->lg_prealloc_list[j]);
spin_lock_init(&lg->lg_prealloc_lock);
}
- ext4_mb_init_per_dev_proc(sb);
- ext4_mb_history_init(sb);
+ /* init file for buddy data */
+ ret = ext4_mb_init_backend(sb);
+ if (ret != 0)
+ goto out_free_locality_groups;
+
+ if (sbi->s_proc)
+ proc_create_data("mb_groups", S_IRUGO, sbi->s_proc,
+ &ext4_mb_seq_groups_fops, sb);
- printk("EXT4-fs: mballoc enabled\n");
return 0;
+
+out_free_locality_groups:
+ free_percpu(sbi->s_locality_groups);
+ sbi->s_locality_groups = NULL;
+out:
+ kfree(sbi->s_mb_offsets);
+ sbi->s_mb_offsets = NULL;
+ kfree(sbi->s_mb_maxs);
+ sbi->s_mb_maxs = NULL;
+ return ret;
}
-/* need to called with ext4 group lock (ext4_lock_group) */
+/* need to called with the ext4 group lock held */
static void ext4_mb_cleanup_pa(struct ext4_group_info *grp)
{
struct ext4_prealloc_space *pa;
@@ -2728,34 +2671,27 @@ static void ext4_mb_cleanup_pa(struct ext4_group_info *grp)
pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list);
list_del(&pa->pa_group_list);
count++;
- kfree(pa);
+ kmem_cache_free(ext4_pspace_cachep, pa);
}
if (count)
- mb_debug("mballoc: %u PAs left\n", count);
+ mb_debug(1, "mballoc: %u PAs left\n", count);
}
int ext4_mb_release(struct super_block *sb)
{
+ ext4_group_t ngroups = ext4_get_groups_count(sb);
ext4_group_t i;
int num_meta_group_infos;
struct ext4_group_info *grinfo;
struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits);
- if (!test_opt(sb, MBALLOC))
- return 0;
-
- /* release freed, non-committed blocks */
- spin_lock(&sbi->s_md_lock);
- list_splice_init(&sbi->s_closed_transaction,
- &sbi->s_committed_transaction);
- list_splice_init(&sbi->s_active_transaction,
- &sbi->s_committed_transaction);
- spin_unlock(&sbi->s_md_lock);
- ext4_mb_free_committed_blocks(sb);
+ if (sbi->s_proc)
+ remove_proc_entry("mb_groups", sbi->s_proc);
if (sbi->s_group_info) {
- for (i = 0; i < sbi->s_groups_count; i++) {
+ for (i = 0; i < ngroups; i++) {
grinfo = ext4_get_group_info(sb, i);
#ifdef DOUBLE_CHECK
kfree(grinfo->bb_bitmap);
@@ -2763,299 +2699,192 @@ int ext4_mb_release(struct super_block *sb)
ext4_lock_group(sb, i);
ext4_mb_cleanup_pa(grinfo);
ext4_unlock_group(sb, i);
- kfree(grinfo);
+ kmem_cache_free(cachep, grinfo);
}
- num_meta_group_infos = (sbi->s_groups_count +
+ num_meta_group_infos = (ngroups +
EXT4_DESC_PER_BLOCK(sb) - 1) >>
EXT4_DESC_PER_BLOCK_BITS(sb);
for (i = 0; i < num_meta_group_infos; i++)
kfree(sbi->s_group_info[i]);
- kfree(sbi->s_group_info);
+ ext4_kvfree(sbi->s_group_info);
}
kfree(sbi->s_mb_offsets);
kfree(sbi->s_mb_maxs);
if (sbi->s_buddy_cache)
iput(sbi->s_buddy_cache);
if (sbi->s_mb_stats) {
- printk(KERN_INFO
- "EXT4-fs: mballoc: %u blocks %u reqs (%u success)\n",
+ ext4_msg(sb, KERN_INFO,
+ "mballoc: %u blocks %u reqs (%u success)",
atomic_read(&sbi->s_bal_allocated),
atomic_read(&sbi->s_bal_reqs),
atomic_read(&sbi->s_bal_success));
- printk(KERN_INFO
- "EXT4-fs: mballoc: %u extents scanned, %u goal hits, "
- "%u 2^N hits, %u breaks, %u lost\n",
+ ext4_msg(sb, KERN_INFO,
+ "mballoc: %u extents scanned, %u goal hits, "
+ "%u 2^N hits, %u breaks, %u lost",
atomic_read(&sbi->s_bal_ex_scanned),
atomic_read(&sbi->s_bal_goals),
atomic_read(&sbi->s_bal_2orders),
atomic_read(&sbi->s_bal_breaks),
atomic_read(&sbi->s_mb_lost_chunks));
- printk(KERN_INFO
- "EXT4-fs: mballoc: %lu generated and it took %Lu\n",
- sbi->s_mb_buddies_generated++,
+ ext4_msg(sb, KERN_INFO,
+ "mballoc: %lu generated and it took %Lu",
+ sbi->s_mb_buddies_generated,
sbi->s_mb_generation_time);
- printk(KERN_INFO
- "EXT4-fs: mballoc: %u preallocated, %u discarded\n",
+ ext4_msg(sb, KERN_INFO,
+ "mballoc: %u preallocated, %u discarded",
atomic_read(&sbi->s_mb_preallocated),
atomic_read(&sbi->s_mb_discarded));
}
- kfree(sbi->s_locality_groups);
-
- ext4_mb_history_release(sb);
- ext4_mb_destroy_per_dev_proc(sb);
+ free_percpu(sbi->s_locality_groups);
return 0;
}
-static void ext4_mb_free_committed_blocks(struct super_block *sb)
+static inline int ext4_issue_discard(struct super_block *sb,
+ ext4_group_t block_group, ext4_grpblk_t cluster, int count)
{
- struct ext4_sb_info *sbi = EXT4_SB(sb);
- int err;
- int i;
- int count = 0;
- int count2 = 0;
- struct ext4_free_metadata *md;
- struct ext4_buddy e4b;
-
- if (list_empty(&sbi->s_committed_transaction))
- return;
-
- /* there is committed blocks to be freed yet */
- do {
- /* get next array of blocks */
- md = NULL;
- spin_lock(&sbi->s_md_lock);
- if (!list_empty(&sbi->s_committed_transaction)) {
- md = list_entry(sbi->s_committed_transaction.next,
- struct ext4_free_metadata, list);
- list_del(&md->list);
- }
- spin_unlock(&sbi->s_md_lock);
-
- if (md == NULL)
- break;
-
- mb_debug("gonna free %u blocks in group %lu (0x%p):",
- md->num, md->group, md);
-
- err = ext4_mb_load_buddy(sb, md->group, &e4b);
- /* we expect to find existing buddy because it's pinned */
- BUG_ON(err != 0);
-
- /* there are blocks to put in buddy to make them really free */
- count += md->num;
- count2++;
- ext4_lock_group(sb, md->group);
- for (i = 0; i < md->num; i++) {
- mb_debug(" %u", md->blocks[i]);
- err = mb_free_blocks(NULL, &e4b, md->blocks[i], 1);
- BUG_ON(err != 0);
- }
- mb_debug("\n");
- ext4_unlock_group(sb, md->group);
-
- /* balance refcounts from ext4_mb_free_metadata() */
- page_cache_release(e4b.bd_buddy_page);
- page_cache_release(e4b.bd_bitmap_page);
-
- kfree(md);
- ext4_mb_release_desc(&e4b);
-
- } while (md);
-
- mb_debug("freed %u blocks in %u structures\n", count, count2);
-}
-
-#define EXT4_ROOT "ext4"
-#define EXT4_MB_STATS_NAME "stats"
-#define EXT4_MB_MAX_TO_SCAN_NAME "max_to_scan"
-#define EXT4_MB_MIN_TO_SCAN_NAME "min_to_scan"
-#define EXT4_MB_ORDER2_REQ "order2_req"
-#define EXT4_MB_STREAM_REQ "stream_req"
-#define EXT4_MB_GROUP_PREALLOC "group_prealloc"
-
-
-
-#define MB_PROC_VALUE_READ(name) \
-static int ext4_mb_read_##name(char *page, char **start, \
- off_t off, int count, int *eof, void *data) \
-{ \
- struct ext4_sb_info *sbi = data; \
- int len; \
- *eof = 1; \
- if (off != 0) \
- return 0; \
- len = sprintf(page, "%ld\n", sbi->s_mb_##name); \
- *start = page; \
- return len; \
+ ext4_fsblk_t discard_block;
+
+ discard_block = (EXT4_C2B(EXT4_SB(sb), cluster) +
+ ext4_group_first_block_no(sb, block_group));
+ count = EXT4_C2B(EXT4_SB(sb), count);
+ trace_ext4_discard_blocks(sb,
+ (unsigned long long) discard_block, count);
+ return sb_issue_discard(sb, discard_block, count, GFP_NOFS, 0);
}
-#define MB_PROC_VALUE_WRITE(name) \
-static int ext4_mb_write_##name(struct file *file, \
- const char __user *buf, unsigned long cnt, void *data) \
-{ \
- struct ext4_sb_info *sbi = data; \
- char str[32]; \
- long value; \
- if (cnt >= sizeof(str)) \
- return -EINVAL; \
- if (copy_from_user(str, buf, cnt)) \
- return -EFAULT; \
- value = simple_strtol(str, NULL, 0); \
- if (value <= 0) \
- return -ERANGE; \
- sbi->s_mb_##name = value; \
- return cnt; \
-}
-
-MB_PROC_VALUE_READ(stats);
-MB_PROC_VALUE_WRITE(stats);
-MB_PROC_VALUE_READ(max_to_scan);
-MB_PROC_VALUE_WRITE(max_to_scan);
-MB_PROC_VALUE_READ(min_to_scan);
-MB_PROC_VALUE_WRITE(min_to_scan);
-MB_PROC_VALUE_READ(order2_reqs);
-MB_PROC_VALUE_WRITE(order2_reqs);
-MB_PROC_VALUE_READ(stream_request);
-MB_PROC_VALUE_WRITE(stream_request);
-MB_PROC_VALUE_READ(group_prealloc);
-MB_PROC_VALUE_WRITE(group_prealloc);
-
-#define MB_PROC_HANDLER(name, var) \
-do { \
- proc = create_proc_entry(name, mode, sbi->s_mb_proc); \
- if (proc == NULL) { \
- printk(KERN_ERR "EXT4-fs: can't to create %s\n", name); \
- goto err_out; \
- } \
- proc->data = sbi; \
- proc->read_proc = ext4_mb_read_##var ; \
- proc->write_proc = ext4_mb_write_##var; \
-} while (0)
-
-static int ext4_mb_init_per_dev_proc(struct super_block *sb)
+/*
+ * This function is called by the jbd2 layer once the commit has finished,
+ * so we know we can free the blocks that were released with that commit.
+ */
+static void ext4_free_data_callback(struct super_block *sb,
+ struct ext4_journal_cb_entry *jce,
+ int rc)
{
- mode_t mode = S_IFREG | S_IRUGO | S_IWUSR;
- struct ext4_sb_info *sbi = EXT4_SB(sb);
- struct proc_dir_entry *proc;
- char devname[64];
+ struct ext4_free_data *entry = (struct ext4_free_data *)jce;
+ struct ext4_buddy e4b;
+ struct ext4_group_info *db;
+ int err, count = 0, count2 = 0;
+
+ mb_debug(1, "gonna free %u blocks in group %u (0x%p):",
+ entry->efd_count, entry->efd_group, entry);
+
+ if (test_opt(sb, DISCARD)) {
+ err = ext4_issue_discard(sb, entry->efd_group,
+ entry->efd_start_cluster,
+ entry->efd_count);
+ if (err && err != -EOPNOTSUPP)
+ ext4_msg(sb, KERN_WARNING, "discard request in"
+ " group:%d block:%d count:%d failed"
+ " with %d", entry->efd_group,
+ entry->efd_start_cluster,
+ entry->efd_count, err);
+ }
- snprintf(devname, sizeof(devname) - 1, "%s",
- bdevname(sb->s_bdev, devname));
- sbi->s_mb_proc = proc_mkdir(devname, proc_root_ext4);
+ err = ext4_mb_load_buddy(sb, entry->efd_group, &e4b);
+ /* we expect to find existing buddy because it's pinned */
+ BUG_ON(err != 0);
- MB_PROC_HANDLER(EXT4_MB_STATS_NAME, stats);
- MB_PROC_HANDLER(EXT4_MB_MAX_TO_SCAN_NAME, max_to_scan);
- MB_PROC_HANDLER(EXT4_MB_MIN_TO_SCAN_NAME, min_to_scan);
- MB_PROC_HANDLER(EXT4_MB_ORDER2_REQ, order2_reqs);
- MB_PROC_HANDLER(EXT4_MB_STREAM_REQ, stream_request);
- MB_PROC_HANDLER(EXT4_MB_GROUP_PREALLOC, group_prealloc);
- return 0;
+ db = e4b.bd_info;
+ /* there are blocks to put in buddy to make them really free */
+ count += entry->efd_count;
+ count2++;
+ ext4_lock_group(sb, entry->efd_group);
+ /* Take it out of per group rb tree */
+ rb_erase(&entry->efd_node, &(db->bb_free_root));
+ mb_free_blocks(NULL, &e4b, entry->efd_start_cluster, entry->efd_count);
-err_out:
- printk(KERN_ERR "EXT4-fs: Unable to create %s\n", devname);
- remove_proc_entry(EXT4_MB_GROUP_PREALLOC, sbi->s_mb_proc);
- remove_proc_entry(EXT4_MB_STREAM_REQ, sbi->s_mb_proc);
- remove_proc_entry(EXT4_MB_ORDER2_REQ, sbi->s_mb_proc);
- remove_proc_entry(EXT4_MB_MIN_TO_SCAN_NAME, sbi->s_mb_proc);
- remove_proc_entry(EXT4_MB_MAX_TO_SCAN_NAME, sbi->s_mb_proc);
- remove_proc_entry(EXT4_MB_STATS_NAME, sbi->s_mb_proc);
- remove_proc_entry(devname, proc_root_ext4);
- sbi->s_mb_proc = NULL;
-
- return -ENOMEM;
-}
-
-static int ext4_mb_destroy_per_dev_proc(struct super_block *sb)
-{
- struct ext4_sb_info *sbi = EXT4_SB(sb);
- char devname[64];
-
- if (sbi->s_mb_proc == NULL)
- return -EINVAL;
+ /*
+ * Clear the trimmed flag for the group so that the next
+ * ext4_trim_fs can trim it.
+ * If the volume is mounted with -o discard, online discard
+ * is supported and the free blocks will be trimmed online.
+ */
+ if (!test_opt(sb, DISCARD))
+ EXT4_MB_GRP_CLEAR_TRIMMED(db);
- snprintf(devname, sizeof(devname) - 1, "%s",
- bdevname(sb->s_bdev, devname));
- remove_proc_entry(EXT4_MB_GROUP_PREALLOC, sbi->s_mb_proc);
- remove_proc_entry(EXT4_MB_STREAM_REQ, sbi->s_mb_proc);
- remove_proc_entry(EXT4_MB_ORDER2_REQ, sbi->s_mb_proc);
- remove_proc_entry(EXT4_MB_MIN_TO_SCAN_NAME, sbi->s_mb_proc);
- remove_proc_entry(EXT4_MB_MAX_TO_SCAN_NAME, sbi->s_mb_proc);
- remove_proc_entry(EXT4_MB_STATS_NAME, sbi->s_mb_proc);
- remove_proc_entry(devname, proc_root_ext4);
+ if (!db->bb_free_root.rb_node) {
+ /* No more items in the per group rb tree
+ * balance refcounts from ext4_mb_free_metadata()
+ */
+ page_cache_release(e4b.bd_buddy_page);
+ page_cache_release(e4b.bd_bitmap_page);
+ }
+ ext4_unlock_group(sb, entry->efd_group);
+ kmem_cache_free(ext4_free_data_cachep, entry);
+ ext4_mb_unload_buddy(&e4b);
- return 0;
+ mb_debug(1, "freed %u blocks in %u structures\n", count, count2);
}
-int __init init_ext4_mballoc(void)
+int __init ext4_init_mballoc(void)
{
- ext4_pspace_cachep =
- kmem_cache_create("ext4_prealloc_space",
- sizeof(struct ext4_prealloc_space),
- 0, SLAB_RECLAIM_ACCOUNT, NULL);
+ ext4_pspace_cachep = KMEM_CACHE(ext4_prealloc_space,
+ SLAB_RECLAIM_ACCOUNT);
if (ext4_pspace_cachep == NULL)
return -ENOMEM;
- ext4_ac_cachep =
- kmem_cache_create("ext4_alloc_context",
- sizeof(struct ext4_allocation_context),
- 0, SLAB_RECLAIM_ACCOUNT, NULL);
+ ext4_ac_cachep = KMEM_CACHE(ext4_allocation_context,
+ SLAB_RECLAIM_ACCOUNT);
if (ext4_ac_cachep == NULL) {
kmem_cache_destroy(ext4_pspace_cachep);
return -ENOMEM;
}
-#ifdef CONFIG_PROC_FS
- proc_root_ext4 = proc_mkdir(EXT4_ROOT, proc_root_fs);
- if (proc_root_ext4 == NULL)
- printk(KERN_ERR "EXT4-fs: Unable to create %s\n", EXT4_ROOT);
-#endif
+
+ ext4_free_data_cachep = KMEM_CACHE(ext4_free_data,
+ SLAB_RECLAIM_ACCOUNT);
+ if (ext4_free_data_cachep == NULL) {
+ kmem_cache_destroy(ext4_pspace_cachep);
+ kmem_cache_destroy(ext4_ac_cachep);
+ return -ENOMEM;
+ }
return 0;
}
-void exit_ext4_mballoc(void)
+void ext4_exit_mballoc(void)
{
- /* XXX: synchronize_rcu(); */
+ /*
+ * Wait for completion of call_rcu()'s on ext4_pspace_cachep
+ * before destroying the slab cache.
+ */
+ rcu_barrier();
kmem_cache_destroy(ext4_pspace_cachep);
kmem_cache_destroy(ext4_ac_cachep);
-#ifdef CONFIG_PROC_FS
- remove_proc_entry(EXT4_ROOT, proc_root_fs);
-#endif
+ kmem_cache_destroy(ext4_free_data_cachep);
+ ext4_groupinfo_destroy_slabs();
}
/*
- * Check quota and mark choosed space (ac->ac_b_ex) non-free in bitmaps
+ * Check quota and mark chosen space (ac->ac_b_ex) non-free in bitmaps
* Returns 0 if success or error code
*/
-static int ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
- handle_t *handle)
+static noinline_for_stack int
+ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
+ handle_t *handle, unsigned int reserv_clstrs)
{
struct buffer_head *bitmap_bh = NULL;
- struct ext4_super_block *es;
struct ext4_group_desc *gdp;
struct buffer_head *gdp_bh;
struct ext4_sb_info *sbi;
struct super_block *sb;
ext4_fsblk_t block;
- int err;
+ int err, len;
BUG_ON(ac->ac_status != AC_STATUS_FOUND);
BUG_ON(ac->ac_b_ex.fe_len <= 0);
sb = ac->ac_sb;
sbi = EXT4_SB(sb);
- es = sbi->s_es;
-
- ext4_debug("using block group %lu(%d)\n", ac->ac_b_ex.fe_group,
- gdp->bg_free_blocks_count);
err = -EIO;
- bitmap_bh = read_block_bitmap(sb, ac->ac_b_ex.fe_group);
+ bitmap_bh = ext4_read_block_bitmap(sb, ac->ac_b_ex.fe_group);
if (!bitmap_bh)
goto out_err;
+ BUFFER_TRACE(bitmap_bh, "getting write access");
err = ext4_journal_get_write_access(handle, bitmap_bh);
if (err)
goto out_err;
@@ -3065,23 +2894,35 @@ static int ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
if (!gdp)
goto out_err;
+ ext4_debug("using block group %u(%d)\n", ac->ac_b_ex.fe_group,
+ ext4_free_group_clusters(sb, gdp));
+
+ BUFFER_TRACE(gdp_bh, "get_write_access");
err = ext4_journal_get_write_access(handle, gdp_bh);
if (err)
goto out_err;
- block = ac->ac_b_ex.fe_group * EXT4_BLOCKS_PER_GROUP(sb)
- + ac->ac_b_ex.fe_start
- + le32_to_cpu(es->s_first_data_block);
-
- if (block == ext4_block_bitmap(sb, gdp) ||
- block == ext4_inode_bitmap(sb, gdp) ||
- in_range(block, ext4_inode_table(sb, gdp),
- EXT4_SB(sb)->s_itb_per_group)) {
+ block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
- ext4_error(sb, __FUNCTION__,
- "Allocating block in system zone - block = %llu",
- block);
+ len = EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
+ if (!ext4_data_block_valid(sbi, block, len)) {
+ ext4_error(sb, "Allocating blocks %llu-%llu which overlap "
+ "fs metadata", block, block+len);
+ /* File system mounted not to panic on error
+ * Fix the bitmap and repeat the block allocation
+ * We leak some of the blocks here.
+ */
+ ext4_lock_group(sb, ac->ac_b_ex.fe_group);
+ ext4_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start,
+ ac->ac_b_ex.fe_len);
+ ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
+ err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
+ if (!err)
+ err = -EAGAIN;
+ goto out_err;
}
+
+ ext4_lock_group(sb, ac->ac_b_ex.fe_group);
#ifdef AGGRESSIVE_CHECK
{
int i;
@@ -3091,40 +2932,52 @@ static int ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
}
}
#endif
- mb_set_bits(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group), bitmap_bh->b_data,
- ac->ac_b_ex.fe_start, ac->ac_b_ex.fe_len);
-
- spin_lock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group));
+ ext4_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start,
+ ac->ac_b_ex.fe_len);
if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
- gdp->bg_free_blocks_count =
- cpu_to_le16(ext4_free_blocks_after_init(sb,
- ac->ac_b_ex.fe_group,
- gdp));
- }
- gdp->bg_free_blocks_count =
- cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count)
- - ac->ac_b_ex.fe_len);
- gdp->bg_checksum = ext4_group_desc_csum(sbi, ac->ac_b_ex.fe_group, gdp);
- spin_unlock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group));
- percpu_counter_sub(&sbi->s_freeblocks_counter, ac->ac_b_ex.fe_len);
-
- err = ext4_journal_dirty_metadata(handle, bitmap_bh);
+ ext4_free_group_clusters_set(sb, gdp,
+ ext4_free_clusters_after_init(sb,
+ ac->ac_b_ex.fe_group, gdp));
+ }
+ len = ext4_free_group_clusters(sb, gdp) - ac->ac_b_ex.fe_len;
+ ext4_free_group_clusters_set(sb, gdp, len);
+ ext4_block_bitmap_csum_set(sb, ac->ac_b_ex.fe_group, gdp, bitmap_bh);
+ ext4_group_desc_csum_set(sb, ac->ac_b_ex.fe_group, gdp);
+
+ ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
+ percpu_counter_sub(&sbi->s_freeclusters_counter, ac->ac_b_ex.fe_len);
+ /*
+ * Now reduce the dirty block count also. Should not go negative
+ */
+ if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED))
+ /* release all the reserved blocks if non delalloc */
+ percpu_counter_sub(&sbi->s_dirtyclusters_counter,
+ reserv_clstrs);
+
+ if (sbi->s_log_groups_per_flex) {
+ ext4_group_t flex_group = ext4_flex_group(sbi,
+ ac->ac_b_ex.fe_group);
+ atomic64_sub(ac->ac_b_ex.fe_len,
+ &sbi->s_flex_groups[flex_group].free_clusters);
+ }
+
+ err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
if (err)
goto out_err;
- err = ext4_journal_dirty_metadata(handle, gdp_bh);
+ err = ext4_handle_dirty_metadata(handle, NULL, gdp_bh);
out_err:
- sb->s_dirt = 1;
brelse(bitmap_bh);
return err;
}
/*
* here we normalize request for locality group
- * Group request are normalized to s_strip size if we set the same via mount
- * option. If not we set it to s_mb_group_prealloc which can be configured via
- * /proc/fs/ext4/<partition>/group_prealloc
+ * Group request are normalized to s_mb_group_prealloc, which goes to
+ * s_strip if we set the same via mount option.
+ * s_mb_group_prealloc can be configured via
+ * /sys/fs/ext4/<partition>/mb_group_prealloc
*
* XXX: should we try to preallocate more than the group has now?
*/
@@ -3134,11 +2987,8 @@ static void ext4_mb_normalize_group_request(struct ext4_allocation_context *ac)
struct ext4_locality_group *lg = ac->ac_lg;
BUG_ON(lg == NULL);
- if (EXT4_SB(sb)->s_stripe)
- ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_stripe;
- else
- ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_mb_group_prealloc;
- mb_debug("#%u: goal %lu blocks for locality group\n",
+ ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_mb_group_prealloc;
+ mb_debug(1, "#%u: goal %u blocks for locality group\n",
current->pid, ac->ac_g_ex.fe_len);
}
@@ -3146,15 +2996,18 @@ static void ext4_mb_normalize_group_request(struct ext4_allocation_context *ac)
* Normalization means making request better in terms of
* size and alignment
*/
-static void ext4_mb_normalize_request(struct ext4_allocation_context *ac,
+static noinline_for_stack void
+ext4_mb_normalize_request(struct ext4_allocation_context *ac,
struct ext4_allocation_request *ar)
{
+ struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
int bsbits, max;
ext4_lblk_t end;
- struct list_head *cur;
- loff_t size, orig_size, start_off;
- ext4_lblk_t start, orig_start;
+ loff_t size, start_off;
+ loff_t orig_size __maybe_unused;
+ ext4_lblk_t start;
struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);
+ struct ext4_prealloc_space *pa;
/* do normalize only data requests, metadata requests
do not need preallocation */
@@ -3179,17 +3032,17 @@ static void ext4_mb_normalize_request(struct ext4_allocation_context *ac,
/* first, let's learn actual file size
* given current request is allocated */
- size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len;
+ size = ac->ac_o_ex.fe_logical + EXT4_C2B(sbi, ac->ac_o_ex.fe_len);
size = size << bsbits;
if (size < i_size_read(ac->ac_inode))
size = i_size_read(ac->ac_inode);
+ orig_size = size;
- /* max available blocks in a free group */
- max = EXT4_BLOCKS_PER_GROUP(ac->ac_sb) - 1 - 1 -
- EXT4_SB(ac->ac_sb)->s_itb_per_group;
+ /* max size of free chunks */
+ max = 2 << bsbits;
-#define NRL_CHECK_SIZE(req, size, max,bits) \
- (req <= (size) || max <= ((size) >> bits))
+#define NRL_CHECK_SIZE(req, size, max, chunk_size) \
+ (req <= (size) || max <= (chunk_size))
/* first, try to predict filesize */
/* XXX: should this table be tunable? */
@@ -3208,16 +3061,16 @@ static void ext4_mb_normalize_request(struct ext4_allocation_context *ac,
size = 512 * 1024;
} else if (size <= 1024 * 1024) {
size = 1024 * 1024;
- } else if (NRL_CHECK_SIZE(size, 4 * 1024 * 1024, max, bsbits)) {
+ } else if (NRL_CHECK_SIZE(size, 4 * 1024 * 1024, max, 2 * 1024)) {
start_off = ((loff_t)ac->ac_o_ex.fe_logical >>
- (20 - bsbits)) << 20;
- size = 1024 * 1024;
- } else if (NRL_CHECK_SIZE(size, 8 * 1024 * 1024, max, bsbits)) {
+ (21 - bsbits)) << 21;
+ size = 2 * 1024 * 1024;
+ } else if (NRL_CHECK_SIZE(size, 8 * 1024 * 1024, max, 4 * 1024)) {
start_off = ((loff_t)ac->ac_o_ex.fe_logical >>
(22 - bsbits)) << 22;
size = 4 * 1024 * 1024;
} else if (NRL_CHECK_SIZE(ac->ac_o_ex.fe_len,
- (8<<20)>>bsbits, max, bsbits)) {
+ (8<<20)>>bsbits, max, 8 * 1024)) {
start_off = ((loff_t)ac->ac_o_ex.fe_logical >>
(23 - bsbits)) << 23;
size = 8 * 1024 * 1024;
@@ -3225,8 +3078,8 @@ static void ext4_mb_normalize_request(struct ext4_allocation_context *ac,
start_off = (loff_t)ac->ac_o_ex.fe_logical << bsbits;
size = ac->ac_o_ex.fe_len << bsbits;
}
- orig_size = size = size >> bsbits;
- orig_start = start = start_off >> bsbits;
+ size = size >> bsbits;
+ start = start_off >> bsbits;
/* don't cover already allocated blocks in selected range */
if (ar->pleft && start <= ar->lleft) {
@@ -3240,11 +3093,8 @@ static void ext4_mb_normalize_request(struct ext4_allocation_context *ac,
/* check we don't cross already preallocated blocks */
rcu_read_lock();
- list_for_each_rcu(cur, &ei->i_prealloc_list) {
- struct ext4_prealloc_space *pa;
- unsigned long pa_end;
-
- pa = list_entry(cur, struct ext4_prealloc_space, pa_inode_list);
+ list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) {
+ ext4_lblk_t pa_end;
if (pa->pa_deleted)
continue;
@@ -3254,29 +3104,25 @@ static void ext4_mb_normalize_request(struct ext4_allocation_context *ac,
continue;
}
- pa_end = pa->pa_lstart + pa->pa_len;
+ pa_end = pa->pa_lstart + EXT4_C2B(EXT4_SB(ac->ac_sb),
+ pa->pa_len);
/* PA must not overlap original request */
BUG_ON(!(ac->ac_o_ex.fe_logical >= pa_end ||
ac->ac_o_ex.fe_logical < pa->pa_lstart));
- /* skip PA normalized request doesn't overlap with */
- if (pa->pa_lstart >= end) {
- spin_unlock(&pa->pa_lock);
- continue;
- }
- if (pa_end <= start) {
+ /* skip PAs this normalized request doesn't overlap with */
+ if (pa->pa_lstart >= end || pa_end <= start) {
spin_unlock(&pa->pa_lock);
continue;
}
BUG_ON(pa->pa_lstart <= start && pa_end >= end);
+ /* adjust start or end to be adjacent to this pa */
if (pa_end <= ac->ac_o_ex.fe_logical) {
BUG_ON(pa_end < start);
start = pa_end;
- }
-
- if (pa->pa_lstart > ac->ac_o_ex.fe_logical) {
+ } else if (pa->pa_lstart > ac->ac_o_ex.fe_logical) {
BUG_ON(pa->pa_lstart > end);
end = pa->pa_lstart;
}
@@ -3287,13 +3133,13 @@ static void ext4_mb_normalize_request(struct ext4_allocation_context *ac,
/* XXX: extra loop to check we really don't overlap preallocations */
rcu_read_lock();
- list_for_each_rcu(cur, &ei->i_prealloc_list) {
- struct ext4_prealloc_space *pa;
- unsigned long pa_end;
- pa = list_entry(cur, struct ext4_prealloc_space, pa_inode_list);
+ list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) {
+ ext4_lblk_t pa_end;
+
spin_lock(&pa->pa_lock);
if (pa->pa_deleted == 0) {
- pa_end = pa->pa_lstart + pa->pa_len;
+ pa_end = pa->pa_lstart + EXT4_C2B(EXT4_SB(ac->ac_sb),
+ pa->pa_len);
BUG_ON(!(start >= pa_end || end <= pa->pa_lstart));
}
spin_unlock(&pa->pa_lock);
@@ -3302,20 +3148,21 @@ static void ext4_mb_normalize_request(struct ext4_allocation_context *ac,
if (start + size <= ac->ac_o_ex.fe_logical &&
start > ac->ac_o_ex.fe_logical) {
- printk(KERN_ERR "start %lu, size %lu, fe_logical %lu\n",
- (unsigned long) start, (unsigned long) size,
- (unsigned long) ac->ac_o_ex.fe_logical);
+ ext4_msg(ac->ac_sb, KERN_ERR,
+ "start %lu, size %lu, fe_logical %lu",
+ (unsigned long) start, (unsigned long) size,
+ (unsigned long) ac->ac_o_ex.fe_logical);
}
BUG_ON(start + size <= ac->ac_o_ex.fe_logical &&
start > ac->ac_o_ex.fe_logical);
- BUG_ON(size <= 0 || size >= EXT4_BLOCKS_PER_GROUP(ac->ac_sb));
+ BUG_ON(size <= 0 || size > EXT4_BLOCKS_PER_GROUP(ac->ac_sb));
/* now prepare goal request */
/* XXX: is it better to align blocks WRT to logical
* placement or satisfy big request as is */
ac->ac_g_ex.fe_logical = start;
- ac->ac_g_ex.fe_len = size;
+ ac->ac_g_ex.fe_len = EXT4_NUM_B2C(sbi, size);
/* define goal start in order to merge */
if (ar->pright && (ar->lright == (start + size))) {
@@ -3333,7 +3180,7 @@ static void ext4_mb_normalize_request(struct ext4_allocation_context *ac,
ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL;
}
- mb_debug("goal: %u(was %u) blocks at %u\n", (unsigned) size,
+ mb_debug(1, "goal: %u(was %u) blocks at %u\n", (unsigned) size,
(unsigned) orig_size, (unsigned) start);
}
@@ -3344,7 +3191,7 @@ static void ext4_mb_collect_stats(struct ext4_allocation_context *ac)
if (sbi->s_mb_stats && ac->ac_g_ex.fe_len > 1) {
atomic_inc(&sbi->s_bal_reqs);
atomic_add(ac->ac_b_ex.fe_len, &sbi->s_bal_allocated);
- if (ac->ac_o_ex.fe_len >= ac->ac_g_ex.fe_len)
+ if (ac->ac_b_ex.fe_len >= ac->ac_o_ex.fe_len)
atomic_inc(&sbi->s_bal_success);
atomic_add(ac->ac_found, &sbi->s_bal_ex_scanned);
if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start &&
@@ -3354,7 +3201,24 @@ static void ext4_mb_collect_stats(struct ext4_allocation_context *ac)
atomic_inc(&sbi->s_bal_breaks);
}
- ext4_mb_store_history(ac);
+ if (ac->ac_op == EXT4_MB_HISTORY_ALLOC)
+ trace_ext4_mballoc_alloc(ac);
+ else
+ trace_ext4_mballoc_prealloc(ac);
+}
+
+/*
+ * Called on failure; free up any blocks from the inode PA for this
+ * context. We don't need this for MB_GROUP_PA because we only change
+ * pa_free in ext4_mb_release_context(), but on failure, we've already
+ * zeroed out ac->ac_b_ex.fe_len, so group_pa->pa_free is not changed.
+ */
+static void ext4_discard_allocated_blocks(struct ext4_allocation_context *ac)
+{
+ struct ext4_prealloc_space *pa = ac->ac_pa;
+
+ if (pa && pa->pa_type == MB_INODE_PA)
+ pa->pa_free += ac->ac_b_ex.fe_len;
}
/*
@@ -3363,14 +3227,16 @@ static void ext4_mb_collect_stats(struct ext4_allocation_context *ac)
static void ext4_mb_use_inode_pa(struct ext4_allocation_context *ac,
struct ext4_prealloc_space *pa)
{
+ struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
ext4_fsblk_t start;
ext4_fsblk_t end;
int len;
/* found preallocated blocks, use them */
start = pa->pa_pstart + (ac->ac_o_ex.fe_logical - pa->pa_lstart);
- end = min(pa->pa_pstart + pa->pa_len, start + ac->ac_o_ex.fe_len);
- len = end - start;
+ end = min(pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len),
+ start + EXT4_C2B(sbi, ac->ac_o_ex.fe_len));
+ len = EXT4_NUM_B2C(sbi, end - start);
ext4_get_group_no_and_offset(ac->ac_sb, start, &ac->ac_b_ex.fe_group,
&ac->ac_b_ex.fe_start);
ac->ac_b_ex.fe_len = len;
@@ -3378,11 +3244,11 @@ static void ext4_mb_use_inode_pa(struct ext4_allocation_context *ac,
ac->ac_pa = pa;
BUG_ON(start < pa->pa_pstart);
- BUG_ON(start + len > pa->pa_pstart + pa->pa_len);
+ BUG_ON(end > pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len));
BUG_ON(pa->pa_free < len);
pa->pa_free -= len;
- mb_debug("use %llu/%lu from inode pa %p\n", start, len, pa);
+ mb_debug(1, "use %llu/%u from inode pa %p\n", start, len, pa);
}
/*
@@ -3391,7 +3257,7 @@ static void ext4_mb_use_inode_pa(struct ext4_allocation_context *ac,
static void ext4_mb_use_group_pa(struct ext4_allocation_context *ac,
struct ext4_prealloc_space *pa)
{
- unsigned len = ac->ac_o_ex.fe_len;
+ unsigned int len = ac->ac_o_ex.fe_len;
ext4_get_group_no_and_offset(ac->ac_sb, pa->pa_pstart,
&ac->ac_b_ex.fe_group,
@@ -3406,18 +3272,50 @@ static void ext4_mb_use_group_pa(struct ext4_allocation_context *ac,
* in on-disk bitmap -- see ext4_mb_release_context()
* Other CPUs are prevented from allocating from this pa by lg_mutex
*/
- mb_debug("use %u/%u from group pa %p\n", pa->pa_lstart-len, len, pa);
+ mb_debug(1, "use %u/%u from group pa %p\n", pa->pa_lstart-len, len, pa);
+}
+
+/*
+ * Return the prealloc space that have minimal distance
+ * from the goal block. @cpa is the prealloc
+ * space that is having currently known minimal distance
+ * from the goal block.
+ */
+static struct ext4_prealloc_space *
+ext4_mb_check_group_pa(ext4_fsblk_t goal_block,
+ struct ext4_prealloc_space *pa,
+ struct ext4_prealloc_space *cpa)
+{
+ ext4_fsblk_t cur_distance, new_distance;
+
+ if (cpa == NULL) {
+ atomic_inc(&pa->pa_count);
+ return pa;
+ }
+ cur_distance = abs(goal_block - cpa->pa_pstart);
+ new_distance = abs(goal_block - pa->pa_pstart);
+
+ if (cur_distance <= new_distance)
+ return cpa;
+
+ /* drop the previous reference */
+ atomic_dec(&cpa->pa_count);
+ atomic_inc(&pa->pa_count);
+ return pa;
}
/*
* search goal blocks in preallocated space
*/
-static int ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
+static noinline_for_stack int
+ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
{
+ struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
+ int order, i;
struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);
struct ext4_locality_group *lg;
- struct ext4_prealloc_space *pa;
- struct list_head *cur;
+ struct ext4_prealloc_space *pa, *cpa = NULL;
+ ext4_fsblk_t goal_block;
/* only data can be preallocated */
if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
@@ -3425,13 +3323,19 @@ static int ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
/* first, try per-file preallocation */
rcu_read_lock();
- list_for_each_rcu(cur, &ei->i_prealloc_list) {
- pa = list_entry(cur, struct ext4_prealloc_space, pa_inode_list);
+ list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) {
/* all fields in this condition don't change,
* so we can skip locking for them */
if (ac->ac_o_ex.fe_logical < pa->pa_lstart ||
- ac->ac_o_ex.fe_logical >= pa->pa_lstart + pa->pa_len)
+ ac->ac_o_ex.fe_logical >= (pa->pa_lstart +
+ EXT4_C2B(sbi, pa->pa_len)))
+ continue;
+
+ /* non-extent files can't have physical blocks past 2^32 */
+ if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)) &&
+ (pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len) >
+ EXT4_MAX_BLOCK_FILE_PHYS))
continue;
/* found preallocated blocks, use them */
@@ -3456,32 +3360,70 @@ static int ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
lg = ac->ac_lg;
if (lg == NULL)
return 0;
+ order = fls(ac->ac_o_ex.fe_len) - 1;
+ if (order > PREALLOC_TB_SIZE - 1)
+ /* The max size of hash table is PREALLOC_TB_SIZE */
+ order = PREALLOC_TB_SIZE - 1;
- rcu_read_lock();
- list_for_each_rcu(cur, &lg->lg_prealloc_list) {
- pa = list_entry(cur, struct ext4_prealloc_space, pa_inode_list);
- spin_lock(&pa->pa_lock);
- if (pa->pa_deleted == 0 && pa->pa_free >= ac->ac_o_ex.fe_len) {
- atomic_inc(&pa->pa_count);
- ext4_mb_use_group_pa(ac, pa);
+ goal_block = ext4_grp_offs_to_block(ac->ac_sb, &ac->ac_g_ex);
+ /*
+ * search for the prealloc space that is having
+ * minimal distance from the goal block.
+ */
+ for (i = order; i < PREALLOC_TB_SIZE; i++) {
+ rcu_read_lock();
+ list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[i],
+ pa_inode_list) {
+ spin_lock(&pa->pa_lock);
+ if (pa->pa_deleted == 0 &&
+ pa->pa_free >= ac->ac_o_ex.fe_len) {
+
+ cpa = ext4_mb_check_group_pa(goal_block,
+ pa, cpa);
+ }
spin_unlock(&pa->pa_lock);
- ac->ac_criteria = 20;
- rcu_read_unlock();
- return 1;
}
- spin_unlock(&pa->pa_lock);
+ rcu_read_unlock();
+ }
+ if (cpa) {
+ ext4_mb_use_group_pa(ac, cpa);
+ ac->ac_criteria = 20;
+ return 1;
}
- rcu_read_unlock();
-
return 0;
}
/*
+ * the function goes through all block freed in the group
+ * but not yet committed and marks them used in in-core bitmap.
+ * buddy must be generated from this bitmap
+ * Need to be called with the ext4 group lock held
+ */
+static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
+ ext4_group_t group)
+{
+ struct rb_node *n;
+ struct ext4_group_info *grp;
+ struct ext4_free_data *entry;
+
+ grp = ext4_get_group_info(sb, group);
+ n = rb_first(&(grp->bb_free_root));
+
+ while (n) {
+ entry = rb_entry(n, struct ext4_free_data, efd_node);
+ ext4_set_bits(bitmap, entry->efd_start_cluster, entry->efd_count);
+ n = rb_next(n);
+ }
+ return;
+}
+
+/*
* the function goes through all preallocation in this group and marks them
* used in in-core bitmap. buddy must be generated from this bitmap
- * Need to be called with ext4 group lock (ext4_lock_group)
+ * Need to be called with ext4 group lock held
*/
-static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
+static noinline_for_stack
+void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
ext4_group_t group)
{
struct ext4_group_info *grp = ext4_get_group_info(sb, group);
@@ -3490,7 +3432,6 @@ static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
ext4_group_t groupnr;
ext4_grpblk_t start;
int preallocated = 0;
- int count = 0;
int len;
/* all form of preallocation discards first load group,
@@ -3511,18 +3452,19 @@ static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
if (unlikely(len == 0))
continue;
BUG_ON(groupnr != group);
- mb_set_bits(sb_bgl_lock(EXT4_SB(sb), group),
- bitmap, start, len);
+ ext4_set_bits(bitmap, start, len);
preallocated += len;
- count++;
}
- mb_debug("prellocated %u for group %lu\n", preallocated, group);
+ mb_debug(1, "prellocated %u for group %u\n", preallocated, group);
}
static void ext4_mb_pa_callback(struct rcu_head *head)
{
struct ext4_prealloc_space *pa;
pa = container_of(head, struct ext4_prealloc_space, u.pa_rcu);
+
+ BUG_ON(atomic_read(&pa->pa_count));
+ BUG_ON(pa->pa_deleted == 0);
kmem_cache_free(ext4_pspace_cachep, pa);
}
@@ -3533,13 +3475,16 @@ static void ext4_mb_pa_callback(struct rcu_head *head)
static void ext4_mb_put_pa(struct ext4_allocation_context *ac,
struct super_block *sb, struct ext4_prealloc_space *pa)
{
- unsigned long grp;
-
- if (!atomic_dec_and_test(&pa->pa_count) || pa->pa_free != 0)
- return;
+ ext4_group_t grp;
+ ext4_fsblk_t grp_blk;
/* in this short window concurrent discard can set pa_deleted */
spin_lock(&pa->pa_lock);
+ if (!atomic_dec_and_test(&pa->pa_count) || pa->pa_free != 0) {
+ spin_unlock(&pa->pa_lock);
+ return;
+ }
+
if (pa->pa_deleted == 1) {
spin_unlock(&pa->pa_lock);
return;
@@ -3548,8 +3493,15 @@ static void ext4_mb_put_pa(struct ext4_allocation_context *ac,
pa->pa_deleted = 1;
spin_unlock(&pa->pa_lock);
- /* -1 is to protect from crossing allocation group */
- ext4_get_group_no_and_offset(sb, pa->pa_pstart - 1, &grp, NULL);
+ grp_blk = pa->pa_pstart;
+ /*
+ * If doing group-based preallocation, pa_pstart may be in the
+ * next group when pa is used up
+ */
+ if (pa->pa_type == MB_GROUP_PA)
+ grp_blk--;
+
+ grp = ext4_get_group_number(sb, grp_blk);
/*
* possible race:
@@ -3579,9 +3531,11 @@ static void ext4_mb_put_pa(struct ext4_allocation_context *ac,
/*
* creates new preallocated space for given inode
*/
-static int ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
+static noinline_for_stack int
+ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
{
struct super_block *sb = ac->ac_sb;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
struct ext4_prealloc_space *pa;
struct ext4_group_info *grp;
struct ext4_inode_info *ei;
@@ -3613,16 +3567,18 @@ static int ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
winl = ac->ac_o_ex.fe_logical - ac->ac_g_ex.fe_logical;
/* also, we should cover whole original request */
- wins = ac->ac_b_ex.fe_len - ac->ac_o_ex.fe_len;
+ wins = EXT4_C2B(sbi, ac->ac_b_ex.fe_len - ac->ac_o_ex.fe_len);
/* the smallest one defines real window */
win = min(winl, wins);
- offs = ac->ac_o_ex.fe_logical % ac->ac_b_ex.fe_len;
+ offs = ac->ac_o_ex.fe_logical %
+ EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
if (offs && offs < win)
win = offs;
- ac->ac_b_ex.fe_logical = ac->ac_o_ex.fe_logical - win;
+ ac->ac_b_ex.fe_logical = ac->ac_o_ex.fe_logical -
+ EXT4_NUM_B2C(sbi, win);
BUG_ON(ac->ac_o_ex.fe_logical < ac->ac_b_ex.fe_logical);
BUG_ON(ac->ac_o_ex.fe_len > ac->ac_b_ex.fe_len);
}
@@ -3637,14 +3593,17 @@ static int ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
pa->pa_free = pa->pa_len;
atomic_set(&pa->pa_count, 1);
spin_lock_init(&pa->pa_lock);
+ INIT_LIST_HEAD(&pa->pa_inode_list);
+ INIT_LIST_HEAD(&pa->pa_group_list);
pa->pa_deleted = 0;
- pa->pa_linear = 0;
+ pa->pa_type = MB_INODE_PA;
- mb_debug("new inode pa %p: %llu/%u for %u\n", pa,
+ mb_debug(1, "new inode pa %p: %llu/%u for %u\n", pa,
pa->pa_pstart, pa->pa_len, pa->pa_lstart);
+ trace_ext4_mb_new_inode_pa(ac, pa);
ext4_mb_use_inode_pa(ac, pa);
- atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated);
+ atomic_add(pa->pa_free, &sbi->s_mb_preallocated);
ei = EXT4_I(ac->ac_inode);
grp = ext4_get_group_info(sb, ac->ac_b_ex.fe_group);
@@ -3666,7 +3625,8 @@ static int ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
/*
* creates new preallocated space for locality group inodes belongs to
*/
-static int ext4_mb_new_group_pa(struct ext4_allocation_context *ac)
+static noinline_for_stack int
+ext4_mb_new_group_pa(struct ext4_allocation_context *ac)
{
struct super_block *sb = ac->ac_sb;
struct ext4_locality_group *lg;
@@ -3693,11 +3653,14 @@ static int ext4_mb_new_group_pa(struct ext4_allocation_context *ac)
pa->pa_free = pa->pa_len;
atomic_set(&pa->pa_count, 1);
spin_lock_init(&pa->pa_lock);
+ INIT_LIST_HEAD(&pa->pa_inode_list);
+ INIT_LIST_HEAD(&pa->pa_group_list);
pa->pa_deleted = 0;
- pa->pa_linear = 1;
+ pa->pa_type = MB_GROUP_PA;
- mb_debug("new group pa %p: %llu/%u for %u\n", pa,
+ mb_debug(1, "new group pa %p: %llu/%u for %u\n", pa,
pa->pa_pstart, pa->pa_len, pa->pa_lstart);
+ trace_ext4_mb_new_group_pa(ac, pa);
ext4_mb_use_group_pa(ac, pa);
atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated);
@@ -3713,10 +3676,10 @@ static int ext4_mb_new_group_pa(struct ext4_allocation_context *ac)
list_add(&pa->pa_group_list, &grp->bb_prealloc_list);
ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
- spin_lock(pa->pa_obj_lock);
- list_add_tail_rcu(&pa->pa_inode_list, &lg->lg_prealloc_list);
- spin_unlock(pa->pa_obj_lock);
-
+ /*
+ * We will later add the new pa to the right bucket
+ * after updating the pa_free in ext4_mb_release_context
+ */
return 0;
}
@@ -3739,107 +3702,76 @@ static int ext4_mb_new_preallocation(struct ext4_allocation_context *ac)
* the caller MUST hold group/inode locks.
* TODO: optimize the case when there are no in-core structures yet
*/
-static int ext4_mb_release_inode_pa(struct ext4_buddy *e4b,
- struct buffer_head *bitmap_bh,
- struct ext4_prealloc_space *pa)
+static noinline_for_stack int
+ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
+ struct ext4_prealloc_space *pa)
{
- struct ext4_allocation_context *ac;
struct super_block *sb = e4b->bd_sb;
struct ext4_sb_info *sbi = EXT4_SB(sb);
- unsigned long end;
- unsigned long next;
+ unsigned int end;
+ unsigned int next;
ext4_group_t group;
ext4_grpblk_t bit;
- sector_t start;
+ unsigned long long grp_blk_start;
int err = 0;
int free = 0;
BUG_ON(pa->pa_deleted == 0);
ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit);
+ grp_blk_start = pa->pa_pstart - EXT4_C2B(sbi, bit);
BUG_ON(group != e4b->bd_group && pa->pa_len != 0);
end = bit + pa->pa_len;
- ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
-
- if (ac) {
- ac->ac_sb = sb;
- ac->ac_inode = pa->pa_inode;
- ac->ac_op = EXT4_MB_HISTORY_DISCARD;
- }
-
while (bit < end) {
bit = mb_find_next_zero_bit(bitmap_bh->b_data, end, bit);
if (bit >= end)
break;
next = mb_find_next_bit(bitmap_bh->b_data, end, bit);
- if (next > end)
- next = end;
- start = group * EXT4_BLOCKS_PER_GROUP(sb) + bit +
- le32_to_cpu(sbi->s_es->s_first_data_block);
- mb_debug(" free preallocated %u/%u in group %u\n",
- (unsigned) start, (unsigned) next - bit,
- (unsigned) group);
+ mb_debug(1, " free preallocated %u/%u in group %u\n",
+ (unsigned) ext4_group_first_block_no(sb, group) + bit,
+ (unsigned) next - bit, (unsigned) group);
free += next - bit;
- if (ac) {
- ac->ac_b_ex.fe_group = group;
- ac->ac_b_ex.fe_start = bit;
- ac->ac_b_ex.fe_len = next - bit;
- ac->ac_b_ex.fe_logical = 0;
- ext4_mb_store_history(ac);
- }
-
+ trace_ext4_mballoc_discard(sb, NULL, group, bit, next - bit);
+ trace_ext4_mb_release_inode_pa(pa, (grp_blk_start +
+ EXT4_C2B(sbi, bit)),
+ next - bit);
mb_free_blocks(pa->pa_inode, e4b, bit, next - bit);
bit = next + 1;
}
if (free != pa->pa_free) {
- printk(KERN_CRIT "pa %p: logic %lu, phys. %lu, len %lu\n",
- pa, (unsigned long) pa->pa_lstart,
- (unsigned long) pa->pa_pstart,
- (unsigned long) pa->pa_len);
- ext4_error(sb, __FUNCTION__, "free %u, pa_free %u\n",
- free, pa->pa_free);
+ ext4_msg(e4b->bd_sb, KERN_CRIT,
+ "pa %p: logic %lu, phys. %lu, len %lu",
+ pa, (unsigned long) pa->pa_lstart,
+ (unsigned long) pa->pa_pstart,
+ (unsigned long) pa->pa_len);
+ ext4_grp_locked_error(sb, group, 0, 0, "free %u, pa_free %u",
+ free, pa->pa_free);
/*
* pa is already deleted so we use the value obtained
* from the bitmap and continue.
*/
}
atomic_add(free, &sbi->s_mb_discarded);
- if (ac)
- kmem_cache_free(ext4_ac_cachep, ac);
return err;
}
-static int ext4_mb_release_group_pa(struct ext4_buddy *e4b,
+static noinline_for_stack int
+ext4_mb_release_group_pa(struct ext4_buddy *e4b,
struct ext4_prealloc_space *pa)
{
- struct ext4_allocation_context *ac;
struct super_block *sb = e4b->bd_sb;
ext4_group_t group;
ext4_grpblk_t bit;
- ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
-
- if (ac)
- ac->ac_op = EXT4_MB_HISTORY_DISCARD;
-
+ trace_ext4_mb_release_group_pa(sb, pa);
BUG_ON(pa->pa_deleted == 0);
ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit);
BUG_ON(group != e4b->bd_group && pa->pa_len != 0);
mb_free_blocks(pa->pa_inode, e4b, bit, pa->pa_len);
atomic_add(pa->pa_len, &EXT4_SB(sb)->s_mb_discarded);
-
- if (ac) {
- ac->ac_sb = sb;
- ac->ac_inode = NULL;
- ac->ac_b_ex.fe_group = group;
- ac->ac_b_ex.fe_start = bit;
- ac->ac_b_ex.fe_len = pa->pa_len;
- ac->ac_b_ex.fe_logical = 0;
- ext4_mb_store_history(ac);
- kmem_cache_free(ext4_ac_cachep, ac);
- }
+ trace_ext4_mballoc_discard(sb, NULL, group, bit, pa->pa_len);
return 0;
}
@@ -3853,7 +3785,8 @@ static int ext4_mb_release_group_pa(struct ext4_buddy *e4b,
* - how many do we discard
* 1) how many requested
*/
-static int ext4_mb_discard_group_preallocations(struct super_block *sb,
+static noinline_for_stack int
+ext4_mb_discard_group_preallocations(struct super_block *sb,
ext4_group_t group, int needed)
{
struct ext4_group_info *grp = ext4_get_group_info(sb, group);
@@ -3865,27 +3798,28 @@ static int ext4_mb_discard_group_preallocations(struct super_block *sb,
int busy = 0;
int free = 0;
- mb_debug("discard preallocation for group %lu\n", group);
+ mb_debug(1, "discard preallocation for group %u\n", group);
if (list_empty(&grp->bb_prealloc_list))
return 0;
- bitmap_bh = read_block_bitmap(sb, group);
+ bitmap_bh = ext4_read_block_bitmap(sb, group);
if (bitmap_bh == NULL) {
- /* error handling here */
- ext4_mb_release_desc(&e4b);
- BUG_ON(bitmap_bh == NULL);
+ ext4_error(sb, "Error reading block bitmap for %u", group);
+ return 0;
}
err = ext4_mb_load_buddy(sb, group, &e4b);
- BUG_ON(err != 0); /* error handling here */
+ if (err) {
+ ext4_error(sb, "Error loading buddy information for %u", group);
+ put_bh(bitmap_bh);
+ return 0;
+ }
if (needed == 0)
- needed = EXT4_BLOCKS_PER_GROUP(sb) + 1;
+ needed = EXT4_CLUSTERS_PER_GROUP(sb) + 1;
- grp = ext4_get_group_info(sb, group);
INIT_LIST_HEAD(&list);
-
repeat:
ext4_lock_group(sb, group);
list_for_each_entry_safe(pa, tmp,
@@ -3917,11 +3851,7 @@ repeat:
if (free < needed && busy) {
busy = 0;
ext4_unlock_group(sb, group);
- /*
- * Yield the CPU here so that we don't get soft lockup
- * in non preempt case.
- */
- yield();
+ cond_resched();
goto repeat;
}
@@ -3939,7 +3869,7 @@ repeat:
list_del_rcu(&pa->pa_inode_list);
spin_unlock(pa->pa_obj_lock);
- if (pa->pa_linear)
+ if (pa->pa_type == MB_GROUP_PA)
ext4_mb_release_group_pa(&e4b, pa);
else
ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa);
@@ -3950,7 +3880,7 @@ repeat:
out:
ext4_unlock_group(sb, group);
- ext4_mb_release_desc(&e4b);
+ ext4_mb_unload_buddy(&e4b);
put_bh(bitmap_bh);
return free;
}
@@ -3964,7 +3894,7 @@ out:
*
* FIXME!! Make sure it is valid at all the call sites
*/
-void ext4_mb_discard_inode_preallocations(struct inode *inode)
+void ext4_discard_preallocations(struct inode *inode)
{
struct ext4_inode_info *ei = EXT4_I(inode);
struct super_block *sb = inode->i_sb;
@@ -3975,12 +3905,13 @@ void ext4_mb_discard_inode_preallocations(struct inode *inode)
struct ext4_buddy e4b;
int err;
- if (!test_opt(sb, MBALLOC) || !S_ISREG(inode->i_mode)) {
+ if (!S_ISREG(inode->i_mode)) {
/*BUG_ON(!list_empty(&ei->i_prealloc_list));*/
return;
}
- mb_debug("discard preallocation for inode %lu\n", inode->i_ino);
+ mb_debug(1, "discard preallocation for inode %lu\n", inode->i_ino);
+ trace_ext4_discard_preallocations(inode);
INIT_LIST_HEAD(&list);
@@ -3997,7 +3928,8 @@ repeat:
* use preallocation while we're discarding it */
spin_unlock(&pa->pa_lock);
spin_unlock(&ei->i_prealloc_lock);
- printk(KERN_ERR "uh-oh! used pa while discarding\n");
+ ext4_msg(sb, KERN_ERR,
+ "uh-oh! used pa while discarding");
WARN_ON(1);
schedule_timeout_uninterruptible(HZ);
goto repeat;
@@ -4033,17 +3965,22 @@ repeat:
spin_unlock(&ei->i_prealloc_lock);
list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) {
- BUG_ON(pa->pa_linear != 0);
- ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL);
+ BUG_ON(pa->pa_type != MB_INODE_PA);
+ group = ext4_get_group_number(sb, pa->pa_pstart);
err = ext4_mb_load_buddy(sb, group, &e4b);
- BUG_ON(err != 0); /* error handling here */
+ if (err) {
+ ext4_error(sb, "Error loading buddy information for %u",
+ group);
+ continue;
+ }
- bitmap_bh = read_block_bitmap(sb, group);
+ bitmap_bh = ext4_read_block_bitmap(sb, group);
if (bitmap_bh == NULL) {
- /* error handling here */
- ext4_mb_release_desc(&e4b);
- BUG_ON(bitmap_bh == NULL);
+ ext4_error(sb, "Error reading block bitmap for %u",
+ group);
+ ext4_mb_unload_buddy(&e4b);
+ continue;
}
ext4_lock_group(sb, group);
@@ -4051,7 +3988,7 @@ repeat:
ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa);
ext4_unlock_group(sb, group);
- ext4_mb_release_desc(&e4b);
+ ext4_mb_unload_buddy(&e4b);
put_bh(bitmap_bh);
list_del(&pa->u.pa_tmp_list);
@@ -4059,31 +3996,23 @@ repeat:
}
}
-/*
- * finds all preallocated spaces and return blocks being freed to them
- * if preallocated space becomes full (no block is used from the space)
- * then the function frees space in buddy
- * XXX: at the moment, truncate (which is the only way to free blocks)
- * discards all preallocations
- */
-static void ext4_mb_return_to_preallocation(struct inode *inode,
- struct ext4_buddy *e4b,
- sector_t block, int count)
-{
- BUG_ON(!list_empty(&EXT4_I(inode)->i_prealloc_list));
-}
-#ifdef MB_DEBUG
+#ifdef CONFIG_EXT4_DEBUG
static void ext4_mb_show_ac(struct ext4_allocation_context *ac)
{
struct super_block *sb = ac->ac_sb;
- ext4_group_t i;
+ ext4_group_t ngroups, i;
+
+ if (!ext4_mballoc_debug ||
+ (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED))
+ return;
- printk(KERN_ERR "EXT4-fs: Can't allocate:"
- " Allocation context details:\n");
- printk(KERN_ERR "EXT4-fs: status %d flags %d\n",
+ ext4_msg(ac->ac_sb, KERN_ERR, "Can't allocate:"
+ " Allocation context details:");
+ ext4_msg(ac->ac_sb, KERN_ERR, "status %d flags %d",
ac->ac_status, ac->ac_flags);
- printk(KERN_ERR "EXT4-fs: orig %lu/%lu/%lu@%lu, goal %lu/%lu/%lu@%lu, "
- "best %lu/%lu/%lu@%lu cr %d\n",
+ ext4_msg(ac->ac_sb, KERN_ERR, "orig %lu/%lu/%lu@%lu, "
+ "goal %lu/%lu/%lu@%lu, "
+ "best %lu/%lu/%lu@%lu cr %d",
(unsigned long)ac->ac_o_ex.fe_group,
(unsigned long)ac->ac_o_ex.fe_start,
(unsigned long)ac->ac_o_ex.fe_len,
@@ -4097,10 +4026,10 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac)
(unsigned long)ac->ac_b_ex.fe_len,
(unsigned long)ac->ac_b_ex.fe_logical,
(int)ac->ac_criteria);
- printk(KERN_ERR "EXT4-fs: %lu scanned, %d found\n", ac->ac_ex_scanned,
- ac->ac_found);
- printk(KERN_ERR "EXT4-fs: groups: \n");
- for (i = 0; i < EXT4_SB(sb)->s_groups_count; i++) {
+ ext4_msg(ac->ac_sb, KERN_ERR, "%d found", ac->ac_found);
+ ext4_msg(ac->ac_sb, KERN_ERR, "groups: ");
+ ngroups = ext4_get_groups_count(sb);
+ for (i = 0; i < ngroups; i++) {
struct ext4_group_info *grp = ext4_get_group_info(sb, i);
struct ext4_prealloc_space *pa;
ext4_grpblk_t start;
@@ -4113,14 +4042,14 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac)
ext4_get_group_no_and_offset(sb, pa->pa_pstart,
NULL, &start);
spin_unlock(&pa->pa_lock);
- printk(KERN_ERR "PA:%lu:%d:%u \n", i,
- start, pa->pa_len);
+ printk(KERN_ERR "PA:%u:%d:%u \n", i,
+ start, pa->pa_len);
}
- ext4_lock_group(sb, i);
+ ext4_unlock_group(sb, i);
if (grp->bb_free == 0)
continue;
- printk(KERN_ERR "%lu: %d/%d \n",
+ printk(KERN_ERR "%u: %d/%d \n",
i, grp->bb_free, grp->bb_fragments);
}
printk(KERN_ERR "\n");
@@ -4137,7 +4066,7 @@ static inline void ext4_mb_show_ac(struct ext4_allocation_context *ac)
* file is determined by the current size or the resulting size after
* allocation which ever is larger
*
- * One can tune this size via /proc/fs/ext4/<partition>/stream_req
+ * One can tune this size via /sys/fs/ext4/<partition>/mb_stream_req
*/
static void ext4_mb_group_or_file(struct ext4_allocation_context *ac)
{
@@ -4148,16 +4077,31 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac)
if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
return;
- size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len;
- isize = i_size_read(ac->ac_inode) >> bsbits;
- size = max(size, isize);
+ if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
+ return;
- /* don't use group allocation for large files */
- if (size >= sbi->s_mb_stream_request)
+ size = ac->ac_o_ex.fe_logical + EXT4_C2B(sbi, ac->ac_o_ex.fe_len);
+ isize = (i_size_read(ac->ac_inode) + ac->ac_sb->s_blocksize - 1)
+ >> bsbits;
+
+ if ((size == isize) &&
+ !ext4_fs_is_busy(sbi) &&
+ (atomic_read(&ac->ac_inode->i_writecount) == 0)) {
+ ac->ac_flags |= EXT4_MB_HINT_NOPREALLOC;
return;
+ }
- if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
+ if (sbi->s_mb_group_prealloc <= 0) {
+ ac->ac_flags |= EXT4_MB_STREAM_ALLOC;
return;
+ }
+
+ /* don't use group allocation for large files */
+ size = max(size, isize);
+ if (size > sbi->s_mb_stream_request) {
+ ac->ac_flags |= EXT4_MB_STREAM_ALLOC;
+ return;
+ }
BUG_ON(ac->ac_lg != NULL);
/*
@@ -4165,8 +4109,7 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac)
* per cpu locality group is to reduce the contention between block
* request from multiple CPUs.
*/
- ac->ac_lg = &sbi->s_locality_groups[get_cpu()];
- put_cpu();
+ ac->ac_lg = __this_cpu_ptr(sbi->s_locality_groups);
/* we're going to use group allocation */
ac->ac_flags |= EXT4_MB_HINT_GROUP_ALLOC;
@@ -4175,23 +4118,24 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac)
mutex_lock(&ac->ac_lg->lg_mutex);
}
-static int ext4_mb_initialize_context(struct ext4_allocation_context *ac,
+static noinline_for_stack int
+ext4_mb_initialize_context(struct ext4_allocation_context *ac,
struct ext4_allocation_request *ar)
{
struct super_block *sb = ar->inode->i_sb;
struct ext4_sb_info *sbi = EXT4_SB(sb);
struct ext4_super_block *es = sbi->s_es;
ext4_group_t group;
- unsigned long len;
- unsigned long goal;
+ unsigned int len;
+ ext4_fsblk_t goal;
ext4_grpblk_t block;
/* we can't allocate > group size */
len = ar->len;
/* just a dirty hack to filter too big requests */
- if (len >= EXT4_BLOCKS_PER_GROUP(sb) - 10)
- len = EXT4_BLOCKS_PER_GROUP(sb) - 10;
+ if (len >= EXT4_CLUSTERS_PER_GROUP(sb))
+ len = EXT4_CLUSTERS_PER_GROUP(sb);
/* start searching from the goal */
goal = ar->goal;
@@ -4201,38 +4145,22 @@ static int ext4_mb_initialize_context(struct ext4_allocation_context *ac,
ext4_get_group_no_and_offset(sb, goal, &group, &block);
/* set up allocation goals */
- ac->ac_b_ex.fe_logical = ar->logical;
- ac->ac_b_ex.fe_group = 0;
- ac->ac_b_ex.fe_start = 0;
- ac->ac_b_ex.fe_len = 0;
+ ac->ac_b_ex.fe_logical = EXT4_LBLK_CMASK(sbi, ar->logical);
ac->ac_status = AC_STATUS_CONTINUE;
- ac->ac_groups_scanned = 0;
- ac->ac_ex_scanned = 0;
- ac->ac_found = 0;
ac->ac_sb = sb;
ac->ac_inode = ar->inode;
- ac->ac_o_ex.fe_logical = ar->logical;
+ ac->ac_o_ex.fe_logical = ac->ac_b_ex.fe_logical;
ac->ac_o_ex.fe_group = group;
ac->ac_o_ex.fe_start = block;
ac->ac_o_ex.fe_len = len;
- ac->ac_g_ex.fe_logical = ar->logical;
- ac->ac_g_ex.fe_group = group;
- ac->ac_g_ex.fe_start = block;
- ac->ac_g_ex.fe_len = len;
- ac->ac_f_ex.fe_len = 0;
+ ac->ac_g_ex = ac->ac_o_ex;
ac->ac_flags = ar->flags;
- ac->ac_2order = 0;
- ac->ac_criteria = 0;
- ac->ac_pa = NULL;
- ac->ac_bitmap_page = NULL;
- ac->ac_buddy_page = NULL;
- ac->ac_lg = NULL;
/* we have to define context: we'll we work with a file or
* locality group. this is a policy, actually */
ext4_mb_group_or_file(ac);
- mb_debug("init ac: %u blocks @ %u, goal %u, flags %x, 2^%d, "
+ mb_debug(1, "init ac: %u blocks @ %u, goal %u, flags %x, 2^%d, "
"left: %u/%u, right %u/%u to %swritable\n",
(unsigned) ar->len, (unsigned) ar->logical,
(unsigned) ar->goal, ac->ac_flags, ac->ac_2order,
@@ -4243,22 +4171,167 @@ static int ext4_mb_initialize_context(struct ext4_allocation_context *ac,
}
+static noinline_for_stack void
+ext4_mb_discard_lg_preallocations(struct super_block *sb,
+ struct ext4_locality_group *lg,
+ int order, int total_entries)
+{
+ ext4_group_t group = 0;
+ struct ext4_buddy e4b;
+ struct list_head discard_list;
+ struct ext4_prealloc_space *pa, *tmp;
+
+ mb_debug(1, "discard locality group preallocation\n");
+
+ INIT_LIST_HEAD(&discard_list);
+
+ spin_lock(&lg->lg_prealloc_lock);
+ list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[order],
+ pa_inode_list) {
+ spin_lock(&pa->pa_lock);
+ if (atomic_read(&pa->pa_count)) {
+ /*
+ * This is the pa that we just used
+ * for block allocation. So don't
+ * free that
+ */
+ spin_unlock(&pa->pa_lock);
+ continue;
+ }
+ if (pa->pa_deleted) {
+ spin_unlock(&pa->pa_lock);
+ continue;
+ }
+ /* only lg prealloc space */
+ BUG_ON(pa->pa_type != MB_GROUP_PA);
+
+ /* seems this one can be freed ... */
+ pa->pa_deleted = 1;
+ spin_unlock(&pa->pa_lock);
+
+ list_del_rcu(&pa->pa_inode_list);
+ list_add(&pa->u.pa_tmp_list, &discard_list);
+
+ total_entries--;
+ if (total_entries <= 5) {
+ /*
+ * we want to keep only 5 entries
+ * allowing it to grow to 8. This
+ * mak sure we don't call discard
+ * soon for this list.
+ */
+ break;
+ }
+ }
+ spin_unlock(&lg->lg_prealloc_lock);
+
+ list_for_each_entry_safe(pa, tmp, &discard_list, u.pa_tmp_list) {
+
+ group = ext4_get_group_number(sb, pa->pa_pstart);
+ if (ext4_mb_load_buddy(sb, group, &e4b)) {
+ ext4_error(sb, "Error loading buddy information for %u",
+ group);
+ continue;
+ }
+ ext4_lock_group(sb, group);
+ list_del(&pa->pa_group_list);
+ ext4_mb_release_group_pa(&e4b, pa);
+ ext4_unlock_group(sb, group);
+
+ ext4_mb_unload_buddy(&e4b);
+ list_del(&pa->u.pa_tmp_list);
+ call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
+ }
+}
+
+/*
+ * We have incremented pa_count. So it cannot be freed at this
+ * point. Also we hold lg_mutex. So no parallel allocation is
+ * possible from this lg. That means pa_free cannot be updated.
+ *
+ * A parallel ext4_mb_discard_group_preallocations is possible.
+ * which can cause the lg_prealloc_list to be updated.
+ */
+
+static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac)
+{
+ int order, added = 0, lg_prealloc_count = 1;
+ struct super_block *sb = ac->ac_sb;
+ struct ext4_locality_group *lg = ac->ac_lg;
+ struct ext4_prealloc_space *tmp_pa, *pa = ac->ac_pa;
+
+ order = fls(pa->pa_free) - 1;
+ if (order > PREALLOC_TB_SIZE - 1)
+ /* The max size of hash table is PREALLOC_TB_SIZE */
+ order = PREALLOC_TB_SIZE - 1;
+ /* Add the prealloc space to lg */
+ spin_lock(&lg->lg_prealloc_lock);
+ list_for_each_entry_rcu(tmp_pa, &lg->lg_prealloc_list[order],
+ pa_inode_list) {
+ spin_lock(&tmp_pa->pa_lock);
+ if (tmp_pa->pa_deleted) {
+ spin_unlock(&tmp_pa->pa_lock);
+ continue;
+ }
+ if (!added && pa->pa_free < tmp_pa->pa_free) {
+ /* Add to the tail of the previous entry */
+ list_add_tail_rcu(&pa->pa_inode_list,
+ &tmp_pa->pa_inode_list);
+ added = 1;
+ /*
+ * we want to count the total
+ * number of entries in the list
+ */
+ }
+ spin_unlock(&tmp_pa->pa_lock);
+ lg_prealloc_count++;
+ }
+ if (!added)
+ list_add_tail_rcu(&pa->pa_inode_list,
+ &lg->lg_prealloc_list[order]);
+ spin_unlock(&lg->lg_prealloc_lock);
+
+ /* Now trim the list to be not more than 8 elements */
+ if (lg_prealloc_count > 8) {
+ ext4_mb_discard_lg_preallocations(sb, lg,
+ order, lg_prealloc_count);
+ return;
+ }
+ return ;
+}
+
/*
* release all resource we used in allocation
*/
static int ext4_mb_release_context(struct ext4_allocation_context *ac)
{
- if (ac->ac_pa) {
- if (ac->ac_pa->pa_linear) {
+ struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
+ struct ext4_prealloc_space *pa = ac->ac_pa;
+ if (pa) {
+ if (pa->pa_type == MB_GROUP_PA) {
/* see comment in ext4_mb_use_group_pa() */
- spin_lock(&ac->ac_pa->pa_lock);
- ac->ac_pa->pa_pstart += ac->ac_b_ex.fe_len;
- ac->ac_pa->pa_lstart += ac->ac_b_ex.fe_len;
- ac->ac_pa->pa_free -= ac->ac_b_ex.fe_len;
- ac->ac_pa->pa_len -= ac->ac_b_ex.fe_len;
- spin_unlock(&ac->ac_pa->pa_lock);
+ spin_lock(&pa->pa_lock);
+ pa->pa_pstart += EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
+ pa->pa_lstart += EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
+ pa->pa_free -= ac->ac_b_ex.fe_len;
+ pa->pa_len -= ac->ac_b_ex.fe_len;
+ spin_unlock(&pa->pa_lock);
+ }
+ }
+ if (pa) {
+ /*
+ * We want to add the pa to the right bucket.
+ * Remove it from the list and while adding
+ * make sure the list to which we are adding
+ * doesn't grow big.
+ */
+ if ((pa->pa_type == MB_GROUP_PA) && likely(pa->pa_free)) {
+ spin_lock(pa->pa_obj_lock);
+ list_del_rcu(&pa->pa_inode_list);
+ spin_unlock(pa->pa_obj_lock);
+ ext4_mb_add_n_trim(ac);
}
- ext4_mb_put_pa(ac, ac->ac_sb, ac->ac_pa);
+ ext4_mb_put_pa(ac, ac->ac_sb, pa);
}
if (ac->ac_bitmap_page)
page_cache_release(ac->ac_bitmap_page);
@@ -4272,11 +4345,12 @@ static int ext4_mb_release_context(struct ext4_allocation_context *ac)
static int ext4_mb_discard_preallocations(struct super_block *sb, int needed)
{
- ext4_group_t i;
+ ext4_group_t i, ngroups = ext4_get_groups_count(sb);
int ret;
int freed = 0;
- for (i = 0; i < EXT4_SB(sb)->s_groups_count && needed > 0; i++) {
+ trace_ext4_mb_discard_preallocations(sb, needed);
+ for (i = 0; i < ngroups && needed > 0; i++) {
ret = ext4_mb_discard_group_preallocations(sb, i, needed);
freed += ret;
needed -= ret;
@@ -4291,42 +4365,76 @@ static int ext4_mb_discard_preallocations(struct super_block *sb, int needed)
* to usual allocation
*/
ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
- struct ext4_allocation_request *ar, int *errp)
+ struct ext4_allocation_request *ar, int *errp)
{
+ int freed;
struct ext4_allocation_context *ac = NULL;
struct ext4_sb_info *sbi;
struct super_block *sb;
ext4_fsblk_t block = 0;
- int freed;
- int inquota;
+ unsigned int inquota = 0;
+ unsigned int reserv_clstrs = 0;
+ might_sleep();
sb = ar->inode->i_sb;
sbi = EXT4_SB(sb);
- if (!test_opt(sb, MBALLOC)) {
- block = ext4_new_blocks_old(handle, ar->inode, ar->goal,
- &(ar->len), errp);
- return block;
- }
+ trace_ext4_request_blocks(ar);
- while (ar->len && DQUOT_ALLOC_BLOCK(ar->inode, ar->len)) {
- ar->flags |= EXT4_MB_HINT_NOPREALLOC;
- ar->len--;
- }
- if (ar->len == 0) {
- *errp = -EDQUOT;
- return 0;
+ /* Allow to use superuser reservation for quota file */
+ if (IS_NOQUOTA(ar->inode))
+ ar->flags |= EXT4_MB_USE_ROOT_BLOCKS;
+
+ /*
+ * For delayed allocation, we could skip the ENOSPC and
+ * EDQUOT check, as blocks and quotas have been already
+ * reserved when data being copied into pagecache.
+ */
+ if (ext4_test_inode_state(ar->inode, EXT4_STATE_DELALLOC_RESERVED))
+ ar->flags |= EXT4_MB_DELALLOC_RESERVED;
+ else {
+ /* Without delayed allocation we need to verify
+ * there is enough free blocks to do block allocation
+ * and verify allocation doesn't exceed the quota limits.
+ */
+ while (ar->len &&
+ ext4_claim_free_clusters(sbi, ar->len, ar->flags)) {
+
+ /* let others to free the space */
+ cond_resched();
+ ar->len = ar->len >> 1;
+ }
+ if (!ar->len) {
+ *errp = -ENOSPC;
+ return 0;
+ }
+ reserv_clstrs = ar->len;
+ if (ar->flags & EXT4_MB_USE_ROOT_BLOCKS) {
+ dquot_alloc_block_nofail(ar->inode,
+ EXT4_C2B(sbi, ar->len));
+ } else {
+ while (ar->len &&
+ dquot_alloc_block(ar->inode,
+ EXT4_C2B(sbi, ar->len))) {
+
+ ar->flags |= EXT4_MB_HINT_NOPREALLOC;
+ ar->len--;
+ }
+ }
+ inquota = ar->len;
+ if (ar->len == 0) {
+ *errp = -EDQUOT;
+ goto out;
+ }
}
- inquota = ar->len;
- ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
+ ac = kmem_cache_zalloc(ext4_ac_cachep, GFP_NOFS);
if (!ac) {
+ ar->len = 0;
*errp = -ENOMEM;
- return 0;
+ goto out;
}
- ext4_mb_poll_new_transaction(sb, handle);
-
*errp = ext4_mb_initialize_context(ac, ar);
if (*errp) {
ar->len = 0;
@@ -4335,212 +4443,315 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
ac->ac_op = EXT4_MB_HISTORY_PREALLOC;
if (!ext4_mb_use_preallocated(ac)) {
-
ac->ac_op = EXT4_MB_HISTORY_ALLOC;
ext4_mb_normalize_request(ac, ar);
-
repeat:
/* allocate space in core */
- ext4_mb_regular_allocator(ac);
+ *errp = ext4_mb_regular_allocator(ac);
+ if (*errp)
+ goto discard_and_exit;
/* as we've just preallocated more space than
- * user requested orinally, we store allocated
+ * user requested originally, we store allocated
* space in a special descriptor */
if (ac->ac_status == AC_STATUS_FOUND &&
- ac->ac_o_ex.fe_len < ac->ac_b_ex.fe_len)
- ext4_mb_new_preallocation(ac);
+ ac->ac_o_ex.fe_len < ac->ac_b_ex.fe_len)
+ *errp = ext4_mb_new_preallocation(ac);
+ if (*errp) {
+ discard_and_exit:
+ ext4_discard_allocated_blocks(ac);
+ goto errout;
+ }
}
-
if (likely(ac->ac_status == AC_STATUS_FOUND)) {
- ext4_mb_mark_diskspace_used(ac, handle);
- *errp = 0;
- block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
- ar->len = ac->ac_b_ex.fe_len;
+ *errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_clstrs);
+ if (*errp == -EAGAIN) {
+ /*
+ * drop the reference that we took
+ * in ext4_mb_use_best_found
+ */
+ ext4_mb_release_context(ac);
+ ac->ac_b_ex.fe_group = 0;
+ ac->ac_b_ex.fe_start = 0;
+ ac->ac_b_ex.fe_len = 0;
+ ac->ac_status = AC_STATUS_CONTINUE;
+ goto repeat;
+ } else if (*errp) {
+ ext4_discard_allocated_blocks(ac);
+ goto errout;
+ } else {
+ block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
+ ar->len = ac->ac_b_ex.fe_len;
+ }
} else {
freed = ext4_mb_discard_preallocations(sb, ac->ac_o_ex.fe_len);
if (freed)
goto repeat;
*errp = -ENOSPC;
+ }
+
+errout:
+ if (*errp) {
ac->ac_b_ex.fe_len = 0;
ar->len = 0;
ext4_mb_show_ac(ac);
}
-
ext4_mb_release_context(ac);
-
out:
- if (ar->len < inquota)
- DQUOT_FREE_BLOCK(ar->inode, inquota - ar->len);
+ if (ac)
+ kmem_cache_free(ext4_ac_cachep, ac);
+ if (inquota && ar->len < inquota)
+ dquot_free_block(ar->inode, EXT4_C2B(sbi, inquota - ar->len));
+ if (!ar->len) {
+ if (!ext4_test_inode_state(ar->inode,
+ EXT4_STATE_DELALLOC_RESERVED))
+ /* release all the reserved blocks if non delalloc */
+ percpu_counter_sub(&sbi->s_dirtyclusters_counter,
+ reserv_clstrs);
+ }
+
+ trace_ext4_allocate_blocks(ar, (unsigned long long)block);
- kmem_cache_free(ext4_ac_cachep, ac);
return block;
}
-static void ext4_mb_poll_new_transaction(struct super_block *sb,
- handle_t *handle)
-{
- struct ext4_sb_info *sbi = EXT4_SB(sb);
-
- if (sbi->s_last_transaction == handle->h_transaction->t_tid)
- return;
- /* new transaction! time to close last one and free blocks for
- * committed transaction. we know that only transaction can be
- * active, so previos transaction can be being logged and we
- * know that transaction before previous is known to be already
- * logged. this means that now we may free blocks freed in all
- * transactions before previous one. hope I'm clear enough ... */
-
- spin_lock(&sbi->s_md_lock);
- if (sbi->s_last_transaction != handle->h_transaction->t_tid) {
- mb_debug("new transaction %lu, old %lu\n",
- (unsigned long) handle->h_transaction->t_tid,
- (unsigned long) sbi->s_last_transaction);
- list_splice_init(&sbi->s_closed_transaction,
- &sbi->s_committed_transaction);
- list_splice_init(&sbi->s_active_transaction,
- &sbi->s_closed_transaction);
- sbi->s_last_transaction = handle->h_transaction->t_tid;
- }
- spin_unlock(&sbi->s_md_lock);
-
- ext4_mb_free_committed_blocks(sb);
+/*
+ * We can merge two free data extents only if the physical blocks
+ * are contiguous, AND the extents were freed by the same transaction,
+ * AND the blocks are associated with the same group.
+ */
+static int can_merge(struct ext4_free_data *entry1,
+ struct ext4_free_data *entry2)
+{
+ if ((entry1->efd_tid == entry2->efd_tid) &&
+ (entry1->efd_group == entry2->efd_group) &&
+ ((entry1->efd_start_cluster + entry1->efd_count) == entry2->efd_start_cluster))
+ return 1;
+ return 0;
}
-static int ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
- ext4_group_t group, ext4_grpblk_t block, int count)
+static noinline_for_stack int
+ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
+ struct ext4_free_data *new_entry)
{
+ ext4_group_t group = e4b->bd_group;
+ ext4_grpblk_t cluster;
+ struct ext4_free_data *entry;
struct ext4_group_info *db = e4b->bd_info;
struct super_block *sb = e4b->bd_sb;
struct ext4_sb_info *sbi = EXT4_SB(sb);
- struct ext4_free_metadata *md;
- int i;
+ struct rb_node **n = &db->bb_free_root.rb_node, *node;
+ struct rb_node *parent = NULL, *new_node;
+ BUG_ON(!ext4_handle_valid(handle));
BUG_ON(e4b->bd_bitmap_page == NULL);
BUG_ON(e4b->bd_buddy_page == NULL);
- ext4_lock_group(sb, group);
- for (i = 0; i < count; i++) {
- md = db->bb_md_cur;
- if (md && db->bb_tid != handle->h_transaction->t_tid) {
- db->bb_md_cur = NULL;
- md = NULL;
+ new_node = &new_entry->efd_node;
+ cluster = new_entry->efd_start_cluster;
+
+ if (!*n) {
+ /* first free block exent. We need to
+ protect buddy cache from being freed,
+ * otherwise we'll refresh it from
+ * on-disk bitmap and lose not-yet-available
+ * blocks */
+ page_cache_get(e4b->bd_buddy_page);
+ page_cache_get(e4b->bd_bitmap_page);
+ }
+ while (*n) {
+ parent = *n;
+ entry = rb_entry(parent, struct ext4_free_data, efd_node);
+ if (cluster < entry->efd_start_cluster)
+ n = &(*n)->rb_left;
+ else if (cluster >= (entry->efd_start_cluster + entry->efd_count))
+ n = &(*n)->rb_right;
+ else {
+ ext4_grp_locked_error(sb, group, 0,
+ ext4_group_first_block_no(sb, group) +
+ EXT4_C2B(sbi, cluster),
+ "Block already on to-be-freed list");
+ return 0;
}
+ }
- if (md == NULL) {
- ext4_unlock_group(sb, group);
- md = kmalloc(sizeof(*md), GFP_NOFS);
- if (md == NULL)
- return -ENOMEM;
- md->num = 0;
- md->group = group;
-
- ext4_lock_group(sb, group);
- if (db->bb_md_cur == NULL) {
- spin_lock(&sbi->s_md_lock);
- list_add(&md->list, &sbi->s_active_transaction);
- spin_unlock(&sbi->s_md_lock);
- /* protect buddy cache from being freed,
- * otherwise we'll refresh it from
- * on-disk bitmap and lose not-yet-available
- * blocks */
- page_cache_get(e4b->bd_buddy_page);
- page_cache_get(e4b->bd_bitmap_page);
- db->bb_md_cur = md;
- db->bb_tid = handle->h_transaction->t_tid;
- mb_debug("new md 0x%p for group %lu\n",
- md, md->group);
- } else {
- kfree(md);
- md = db->bb_md_cur;
- }
+ rb_link_node(new_node, parent, n);
+ rb_insert_color(new_node, &db->bb_free_root);
+
+ /* Now try to see the extent can be merged to left and right */
+ node = rb_prev(new_node);
+ if (node) {
+ entry = rb_entry(node, struct ext4_free_data, efd_node);
+ if (can_merge(entry, new_entry) &&
+ ext4_journal_callback_try_del(handle, &entry->efd_jce)) {
+ new_entry->efd_start_cluster = entry->efd_start_cluster;
+ new_entry->efd_count += entry->efd_count;
+ rb_erase(node, &(db->bb_free_root));
+ kmem_cache_free(ext4_free_data_cachep, entry);
}
+ }
- BUG_ON(md->num >= EXT4_BB_MAX_BLOCKS);
- md->blocks[md->num] = block + i;
- md->num++;
- if (md->num == EXT4_BB_MAX_BLOCKS) {
- /* no more space, put full container on a sb's list */
- db->bb_md_cur = NULL;
+ node = rb_next(new_node);
+ if (node) {
+ entry = rb_entry(node, struct ext4_free_data, efd_node);
+ if (can_merge(new_entry, entry) &&
+ ext4_journal_callback_try_del(handle, &entry->efd_jce)) {
+ new_entry->efd_count += entry->efd_count;
+ rb_erase(node, &(db->bb_free_root));
+ kmem_cache_free(ext4_free_data_cachep, entry);
}
}
- ext4_unlock_group(sb, group);
+ /* Add the extent to transaction's private list */
+ ext4_journal_callback_add(handle, ext4_free_data_callback,
+ &new_entry->efd_jce);
return 0;
}
-/*
- * Main entry point into mballoc to free blocks
+/**
+ * ext4_free_blocks() -- Free given blocks and update quota
+ * @handle: handle for this transaction
+ * @inode: inode
+ * @block: start physical block to free
+ * @count: number of blocks to count
+ * @flags: flags used by ext4_free_blocks
*/
-void ext4_mb_free_blocks(handle_t *handle, struct inode *inode,
- unsigned long block, unsigned long count,
- int metadata, unsigned long *freed)
+void ext4_free_blocks(handle_t *handle, struct inode *inode,
+ struct buffer_head *bh, ext4_fsblk_t block,
+ unsigned long count, int flags)
{
struct buffer_head *bitmap_bh = NULL;
struct super_block *sb = inode->i_sb;
- struct ext4_allocation_context *ac = NULL;
struct ext4_group_desc *gdp;
- struct ext4_super_block *es;
- unsigned long overflow;
+ unsigned int overflow;
ext4_grpblk_t bit;
struct buffer_head *gd_bh;
ext4_group_t block_group;
struct ext4_sb_info *sbi;
+ struct ext4_inode_info *ei = EXT4_I(inode);
struct ext4_buddy e4b;
+ unsigned int count_clusters;
int err = 0;
int ret;
- *freed = 0;
-
- ext4_mb_poll_new_transaction(sb, handle);
+ might_sleep();
+ if (bh) {
+ if (block)
+ BUG_ON(block != bh->b_blocknr);
+ else
+ block = bh->b_blocknr;
+ }
sbi = EXT4_SB(sb);
- es = EXT4_SB(sb)->s_es;
- if (block < le32_to_cpu(es->s_first_data_block) ||
- block + count < block ||
- block + count > ext4_blocks_count(es)) {
- ext4_error(sb, __FUNCTION__,
- "Freeing blocks not in datazone - "
- "block = %lu, count = %lu", block, count);
+ if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) &&
+ !ext4_data_block_valid(sbi, block, count)) {
+ ext4_error(sb, "Freeing blocks not in datazone - "
+ "block = %llu, count = %lu", block, count);
goto error_return;
}
- ext4_debug("freeing block %lu\n", block);
+ ext4_debug("freeing block %llu\n", block);
+ trace_ext4_free_blocks(inode, block, count, flags);
+
+ if (flags & EXT4_FREE_BLOCKS_FORGET) {
+ struct buffer_head *tbh = bh;
+ int i;
+
+ BUG_ON(bh && (count > 1));
+
+ for (i = 0; i < count; i++) {
+ cond_resched();
+ if (!bh)
+ tbh = sb_find_get_block(inode->i_sb,
+ block + i);
+ if (!tbh)
+ continue;
+ ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA,
+ inode, tbh, block + i);
+ }
+ }
+
+ /*
+ * We need to make sure we don't reuse the freed block until
+ * after the transaction is committed, which we can do by
+ * treating the block as metadata, below. We make an
+ * exception if the inode is to be written in writeback mode
+ * since writeback mode has weak data consistency guarantees.
+ */
+ if (!ext4_should_writeback_data(inode))
+ flags |= EXT4_FREE_BLOCKS_METADATA;
- ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
- if (ac) {
- ac->ac_op = EXT4_MB_HISTORY_FREE;
- ac->ac_inode = inode;
- ac->ac_sb = sb;
+ /*
+ * If the extent to be freed does not begin on a cluster
+ * boundary, we need to deal with partial clusters at the
+ * beginning and end of the extent. Normally we will free
+ * blocks at the beginning or the end unless we are explicitly
+ * requested to avoid doing so.
+ */
+ overflow = EXT4_PBLK_COFF(sbi, block);
+ if (overflow) {
+ if (flags & EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER) {
+ overflow = sbi->s_cluster_ratio - overflow;
+ block += overflow;
+ if (count > overflow)
+ count -= overflow;
+ else
+ return;
+ } else {
+ block -= overflow;
+ count += overflow;
+ }
+ }
+ overflow = EXT4_LBLK_COFF(sbi, count);
+ if (overflow) {
+ if (flags & EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER) {
+ if (count > overflow)
+ count -= overflow;
+ else
+ return;
+ } else
+ count += sbi->s_cluster_ratio - overflow;
}
do_more:
overflow = 0;
ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
+ if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(
+ ext4_get_group_info(sb, block_group))))
+ return;
+
/*
* Check to see if we are freeing blocks across a group
* boundary.
*/
- if (bit + count > EXT4_BLOCKS_PER_GROUP(sb)) {
- overflow = bit + count - EXT4_BLOCKS_PER_GROUP(sb);
+ if (EXT4_C2B(sbi, bit) + count > EXT4_BLOCKS_PER_GROUP(sb)) {
+ overflow = EXT4_C2B(sbi, bit) + count -
+ EXT4_BLOCKS_PER_GROUP(sb);
count -= overflow;
}
- bitmap_bh = read_block_bitmap(sb, block_group);
- if (!bitmap_bh)
+ count_clusters = EXT4_NUM_B2C(sbi, count);
+ bitmap_bh = ext4_read_block_bitmap(sb, block_group);
+ if (!bitmap_bh) {
+ err = -EIO;
goto error_return;
+ }
gdp = ext4_get_group_desc(sb, block_group, &gd_bh);
- if (!gdp)
+ if (!gdp) {
+ err = -EIO;
goto error_return;
+ }
if (in_range(ext4_block_bitmap(sb, gdp), block, count) ||
in_range(ext4_inode_bitmap(sb, gdp), block, count) ||
in_range(block, ext4_inode_table(sb, gdp),
- EXT4_SB(sb)->s_itb_per_group) ||
+ EXT4_SB(sb)->s_itb_per_group) ||
in_range(block + count - 1, ext4_inode_table(sb, gdp),
- EXT4_SB(sb)->s_itb_per_group)) {
+ EXT4_SB(sb)->s_itb_per_group)) {
- ext4_error(sb, __FUNCTION__,
- "Freeing blocks in system zone - "
- "Block = %lu, count = %lu", block, count);
+ ext4_error(sb, "Freeing blocks in system zone - "
+ "Block = %llu, count = %lu", block, count);
+ /* err = 0. ext4_std_error should be a no op */
+ goto error_return;
}
BUFFER_TRACE(bitmap_bh, "getting write access");
@@ -4557,58 +4768,101 @@ do_more:
err = ext4_journal_get_write_access(handle, gd_bh);
if (err)
goto error_return;
-
- err = ext4_mb_load_buddy(sb, block_group, &e4b);
- if (err)
- goto error_return;
-
#ifdef AGGRESSIVE_CHECK
{
int i;
- for (i = 0; i < count; i++)
+ for (i = 0; i < count_clusters; i++)
BUG_ON(!mb_test_bit(bit + i, bitmap_bh->b_data));
}
#endif
- mb_clear_bits(sb_bgl_lock(sbi, block_group), bitmap_bh->b_data,
- bit, count);
+ trace_ext4_mballoc_free(sb, inode, block_group, bit, count_clusters);
- /* We dirtied the bitmap block */
- BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
- err = ext4_journal_dirty_metadata(handle, bitmap_bh);
+ err = ext4_mb_load_buddy(sb, block_group, &e4b);
+ if (err)
+ goto error_return;
- if (ac) {
- ac->ac_b_ex.fe_group = block_group;
- ac->ac_b_ex.fe_start = bit;
- ac->ac_b_ex.fe_len = count;
- ext4_mb_store_history(ac);
- }
+ if ((flags & EXT4_FREE_BLOCKS_METADATA) && ext4_handle_valid(handle)) {
+ struct ext4_free_data *new_entry;
+ /*
+ * blocks being freed are metadata. these blocks shouldn't
+ * be used until this transaction is committed
+ */
+ retry:
+ new_entry = kmem_cache_alloc(ext4_free_data_cachep, GFP_NOFS);
+ if (!new_entry) {
+ /*
+ * We use a retry loop because
+ * ext4_free_blocks() is not allowed to fail.
+ */
+ cond_resched();
+ congestion_wait(BLK_RW_ASYNC, HZ/50);
+ goto retry;
+ }
+ new_entry->efd_start_cluster = bit;
+ new_entry->efd_group = block_group;
+ new_entry->efd_count = count_clusters;
+ new_entry->efd_tid = handle->h_transaction->t_tid;
- if (metadata) {
- /* blocks being freed are metadata. these blocks shouldn't
- * be used until this transaction is committed */
- ext4_mb_free_metadata(handle, &e4b, block_group, bit, count);
+ ext4_lock_group(sb, block_group);
+ mb_clear_bits(bitmap_bh->b_data, bit, count_clusters);
+ ext4_mb_free_metadata(handle, &e4b, new_entry);
} else {
+ /* need to update group_info->bb_free and bitmap
+ * with group lock held. generate_buddy look at
+ * them with group lock_held
+ */
+ if (test_opt(sb, DISCARD)) {
+ err = ext4_issue_discard(sb, block_group, bit, count);
+ if (err && err != -EOPNOTSUPP)
+ ext4_msg(sb, KERN_WARNING, "discard request in"
+ " group:%d block:%d count:%lu failed"
+ " with %d", block_group, bit, count,
+ err);
+ } else
+ EXT4_MB_GRP_CLEAR_TRIMMED(e4b.bd_info);
+
ext4_lock_group(sb, block_group);
- err = mb_free_blocks(inode, &e4b, bit, count);
- ext4_mb_return_to_preallocation(inode, &e4b, block, count);
- ext4_unlock_group(sb, block_group);
- BUG_ON(err != 0);
+ mb_clear_bits(bitmap_bh->b_data, bit, count_clusters);
+ mb_free_blocks(inode, &e4b, bit, count_clusters);
}
- spin_lock(sb_bgl_lock(sbi, block_group));
- gdp->bg_free_blocks_count =
- cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) + count);
- gdp->bg_checksum = ext4_group_desc_csum(sbi, block_group, gdp);
- spin_unlock(sb_bgl_lock(sbi, block_group));
- percpu_counter_add(&sbi->s_freeblocks_counter, count);
+ ret = ext4_free_group_clusters(sb, gdp) + count_clusters;
+ ext4_free_group_clusters_set(sb, gdp, ret);
+ ext4_block_bitmap_csum_set(sb, block_group, gdp, bitmap_bh);
+ ext4_group_desc_csum_set(sb, block_group, gdp);
+ ext4_unlock_group(sb, block_group);
- ext4_mb_release_desc(&e4b);
+ if (sbi->s_log_groups_per_flex) {
+ ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
+ atomic64_add(count_clusters,
+ &sbi->s_flex_groups[flex_group].free_clusters);
+ }
- *freed += count;
+ if (flags & EXT4_FREE_BLOCKS_RESERVE && ei->i_reserved_data_blocks) {
+ percpu_counter_add(&sbi->s_dirtyclusters_counter,
+ count_clusters);
+ spin_lock(&ei->i_block_reservation_lock);
+ if (flags & EXT4_FREE_BLOCKS_METADATA)
+ ei->i_reserved_meta_blocks += count_clusters;
+ else
+ ei->i_reserved_data_blocks += count_clusters;
+ spin_unlock(&ei->i_block_reservation_lock);
+ if (!(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE))
+ dquot_reclaim_block(inode,
+ EXT4_C2B(sbi, count_clusters));
+ } else if (!(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE))
+ dquot_free_block(inode, EXT4_C2B(sbi, count_clusters));
+ percpu_counter_add(&sbi->s_freeclusters_counter, count_clusters);
+
+ ext4_mb_unload_buddy(&e4b);
+
+ /* We dirtied the bitmap block */
+ BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
+ err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
/* And the group descriptor block */
BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
- ret = ext4_journal_dirty_metadata(handle, gd_bh);
+ ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh);
if (!err)
err = ret;
@@ -4618,11 +4872,365 @@ do_more:
put_bh(bitmap_bh);
goto do_more;
}
- sb->s_dirt = 1;
error_return:
brelse(bitmap_bh);
ext4_std_error(sb, err);
- if (ac)
- kmem_cache_free(ext4_ac_cachep, ac);
return;
}
+
+/**
+ * ext4_group_add_blocks() -- Add given blocks to an existing group
+ * @handle: handle to this transaction
+ * @sb: super block
+ * @block: start physical block to add to the block group
+ * @count: number of blocks to free
+ *
+ * This marks the blocks as free in the bitmap and buddy.
+ */
+int ext4_group_add_blocks(handle_t *handle, struct super_block *sb,
+ ext4_fsblk_t block, unsigned long count)
+{
+ struct buffer_head *bitmap_bh = NULL;
+ struct buffer_head *gd_bh;
+ ext4_group_t block_group;
+ ext4_grpblk_t bit;
+ unsigned int i;
+ struct ext4_group_desc *desc;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_buddy e4b;
+ int err = 0, ret, blk_free_count;
+ ext4_grpblk_t blocks_freed;
+
+ ext4_debug("Adding block(s) %llu-%llu\n", block, block + count - 1);
+
+ if (count == 0)
+ return 0;
+
+ ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
+ /*
+ * Check to see if we are freeing blocks across a group
+ * boundary.
+ */
+ if (bit + count > EXT4_BLOCKS_PER_GROUP(sb)) {
+ ext4_warning(sb, "too much blocks added to group %u\n",
+ block_group);
+ err = -EINVAL;
+ goto error_return;
+ }
+
+ bitmap_bh = ext4_read_block_bitmap(sb, block_group);
+ if (!bitmap_bh) {
+ err = -EIO;
+ goto error_return;
+ }
+
+ desc = ext4_get_group_desc(sb, block_group, &gd_bh);
+ if (!desc) {
+ err = -EIO;
+ goto error_return;
+ }
+
+ if (in_range(ext4_block_bitmap(sb, desc), block, count) ||
+ in_range(ext4_inode_bitmap(sb, desc), block, count) ||
+ in_range(block, ext4_inode_table(sb, desc), sbi->s_itb_per_group) ||
+ in_range(block + count - 1, ext4_inode_table(sb, desc),
+ sbi->s_itb_per_group)) {
+ ext4_error(sb, "Adding blocks in system zones - "
+ "Block = %llu, count = %lu",
+ block, count);
+ err = -EINVAL;
+ goto error_return;
+ }
+
+ BUFFER_TRACE(bitmap_bh, "getting write access");
+ err = ext4_journal_get_write_access(handle, bitmap_bh);
+ if (err)
+ goto error_return;
+
+ /*
+ * We are about to modify some metadata. Call the journal APIs
+ * to unshare ->b_data if a currently-committing transaction is
+ * using it
+ */
+ BUFFER_TRACE(gd_bh, "get_write_access");
+ err = ext4_journal_get_write_access(handle, gd_bh);
+ if (err)
+ goto error_return;
+
+ for (i = 0, blocks_freed = 0; i < count; i++) {
+ BUFFER_TRACE(bitmap_bh, "clear bit");
+ if (!mb_test_bit(bit + i, bitmap_bh->b_data)) {
+ ext4_error(sb, "bit already cleared for block %llu",
+ (ext4_fsblk_t)(block + i));
+ BUFFER_TRACE(bitmap_bh, "bit already cleared");
+ } else {
+ blocks_freed++;
+ }
+ }
+
+ err = ext4_mb_load_buddy(sb, block_group, &e4b);
+ if (err)
+ goto error_return;
+
+ /*
+ * need to update group_info->bb_free and bitmap
+ * with group lock held. generate_buddy look at
+ * them with group lock_held
+ */
+ ext4_lock_group(sb, block_group);
+ mb_clear_bits(bitmap_bh->b_data, bit, count);
+ mb_free_blocks(NULL, &e4b, bit, count);
+ blk_free_count = blocks_freed + ext4_free_group_clusters(sb, desc);
+ ext4_free_group_clusters_set(sb, desc, blk_free_count);
+ ext4_block_bitmap_csum_set(sb, block_group, desc, bitmap_bh);
+ ext4_group_desc_csum_set(sb, block_group, desc);
+ ext4_unlock_group(sb, block_group);
+ percpu_counter_add(&sbi->s_freeclusters_counter,
+ EXT4_NUM_B2C(sbi, blocks_freed));
+
+ if (sbi->s_log_groups_per_flex) {
+ ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
+ atomic64_add(EXT4_NUM_B2C(sbi, blocks_freed),
+ &sbi->s_flex_groups[flex_group].free_clusters);
+ }
+
+ ext4_mb_unload_buddy(&e4b);
+
+ /* We dirtied the bitmap block */
+ BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
+ err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
+
+ /* And the group descriptor block */
+ BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
+ ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh);
+ if (!err)
+ err = ret;
+
+error_return:
+ brelse(bitmap_bh);
+ ext4_std_error(sb, err);
+ return err;
+}
+
+/**
+ * ext4_trim_extent -- function to TRIM one single free extent in the group
+ * @sb: super block for the file system
+ * @start: starting block of the free extent in the alloc. group
+ * @count: number of blocks to TRIM
+ * @group: alloc. group we are working with
+ * @e4b: ext4 buddy for the group
+ *
+ * Trim "count" blocks starting at "start" in the "group". To assure that no
+ * one will allocate those blocks, mark it as used in buddy bitmap. This must
+ * be called with under the group lock.
+ */
+static int ext4_trim_extent(struct super_block *sb, int start, int count,
+ ext4_group_t group, struct ext4_buddy *e4b)
+__releases(bitlock)
+__acquires(bitlock)
+{
+ struct ext4_free_extent ex;
+ int ret = 0;
+
+ trace_ext4_trim_extent(sb, group, start, count);
+
+ assert_spin_locked(ext4_group_lock_ptr(sb, group));
+
+ ex.fe_start = start;
+ ex.fe_group = group;
+ ex.fe_len = count;
+
+ /*
+ * Mark blocks used, so no one can reuse them while
+ * being trimmed.
+ */
+ mb_mark_used(e4b, &ex);
+ ext4_unlock_group(sb, group);
+ ret = ext4_issue_discard(sb, group, start, count);
+ ext4_lock_group(sb, group);
+ mb_free_blocks(NULL, e4b, start, ex.fe_len);
+ return ret;
+}
+
+/**
+ * ext4_trim_all_free -- function to trim all free space in alloc. group
+ * @sb: super block for file system
+ * @group: group to be trimmed
+ * @start: first group block to examine
+ * @max: last group block to examine
+ * @minblocks: minimum extent block count
+ *
+ * ext4_trim_all_free walks through group's buddy bitmap searching for free
+ * extents. When the free block is found, ext4_trim_extent is called to TRIM
+ * the extent.
+ *
+ *
+ * ext4_trim_all_free walks through group's block bitmap searching for free
+ * extents. When the free extent is found, mark it as used in group buddy
+ * bitmap. Then issue a TRIM command on this extent and free the extent in
+ * the group buddy bitmap. This is done until whole group is scanned.
+ */
+static ext4_grpblk_t
+ext4_trim_all_free(struct super_block *sb, ext4_group_t group,
+ ext4_grpblk_t start, ext4_grpblk_t max,
+ ext4_grpblk_t minblocks)
+{
+ void *bitmap;
+ ext4_grpblk_t next, count = 0, free_count = 0;
+ struct ext4_buddy e4b;
+ int ret = 0;
+
+ trace_ext4_trim_all_free(sb, group, start, max);
+
+ ret = ext4_mb_load_buddy(sb, group, &e4b);
+ if (ret) {
+ ext4_error(sb, "Error in loading buddy "
+ "information for %u", group);
+ return ret;
+ }
+ bitmap = e4b.bd_bitmap;
+
+ ext4_lock_group(sb, group);
+ if (EXT4_MB_GRP_WAS_TRIMMED(e4b.bd_info) &&
+ minblocks >= atomic_read(&EXT4_SB(sb)->s_last_trim_minblks))
+ goto out;
+
+ start = (e4b.bd_info->bb_first_free > start) ?
+ e4b.bd_info->bb_first_free : start;
+
+ while (start <= max) {
+ start = mb_find_next_zero_bit(bitmap, max + 1, start);
+ if (start > max)
+ break;
+ next = mb_find_next_bit(bitmap, max + 1, start);
+
+ if ((next - start) >= minblocks) {
+ ret = ext4_trim_extent(sb, start,
+ next - start, group, &e4b);
+ if (ret && ret != -EOPNOTSUPP)
+ break;
+ ret = 0;
+ count += next - start;
+ }
+ free_count += next - start;
+ start = next + 1;
+
+ if (fatal_signal_pending(current)) {
+ count = -ERESTARTSYS;
+ break;
+ }
+
+ if (need_resched()) {
+ ext4_unlock_group(sb, group);
+ cond_resched();
+ ext4_lock_group(sb, group);
+ }
+
+ if ((e4b.bd_info->bb_free - free_count) < minblocks)
+ break;
+ }
+
+ if (!ret) {
+ ret = count;
+ EXT4_MB_GRP_SET_TRIMMED(e4b.bd_info);
+ }
+out:
+ ext4_unlock_group(sb, group);
+ ext4_mb_unload_buddy(&e4b);
+
+ ext4_debug("trimmed %d blocks in the group %d\n",
+ count, group);
+
+ return ret;
+}
+
+/**
+ * ext4_trim_fs() -- trim ioctl handle function
+ * @sb: superblock for filesystem
+ * @range: fstrim_range structure
+ *
+ * start: First Byte to trim
+ * len: number of Bytes to trim from start
+ * minlen: minimum extent length in Bytes
+ * ext4_trim_fs goes through all allocation groups containing Bytes from
+ * start to start+len. For each such a group ext4_trim_all_free function
+ * is invoked to trim all free space.
+ */
+int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
+{
+ struct ext4_group_info *grp;
+ ext4_group_t group, first_group, last_group;
+ ext4_grpblk_t cnt = 0, first_cluster, last_cluster;
+ uint64_t start, end, minlen, trimmed = 0;
+ ext4_fsblk_t first_data_blk =
+ le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
+ ext4_fsblk_t max_blks = ext4_blocks_count(EXT4_SB(sb)->s_es);
+ int ret = 0;
+
+ start = range->start >> sb->s_blocksize_bits;
+ end = start + (range->len >> sb->s_blocksize_bits) - 1;
+ minlen = EXT4_NUM_B2C(EXT4_SB(sb),
+ range->minlen >> sb->s_blocksize_bits);
+
+ if (minlen > EXT4_CLUSTERS_PER_GROUP(sb) ||
+ start >= max_blks ||
+ range->len < sb->s_blocksize)
+ return -EINVAL;
+ if (end >= max_blks)
+ end = max_blks - 1;
+ if (end <= first_data_blk)
+ goto out;
+ if (start < first_data_blk)
+ start = first_data_blk;
+
+ /* Determine first and last group to examine based on start and end */
+ ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) start,
+ &first_group, &first_cluster);
+ ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) end,
+ &last_group, &last_cluster);
+
+ /* end now represents the last cluster to discard in this group */
+ end = EXT4_CLUSTERS_PER_GROUP(sb) - 1;
+
+ for (group = first_group; group <= last_group; group++) {
+ grp = ext4_get_group_info(sb, group);
+ /* We only do this if the grp has never been initialized */
+ if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
+ ret = ext4_mb_init_group(sb, group);
+ if (ret)
+ break;
+ }
+
+ /*
+ * For all the groups except the last one, last cluster will
+ * always be EXT4_CLUSTERS_PER_GROUP(sb)-1, so we only need to
+ * change it for the last group, note that last_cluster is
+ * already computed earlier by ext4_get_group_no_and_offset()
+ */
+ if (group == last_group)
+ end = last_cluster;
+
+ if (grp->bb_free >= minlen) {
+ cnt = ext4_trim_all_free(sb, group, first_cluster,
+ end, minlen);
+ if (cnt < 0) {
+ ret = cnt;
+ break;
+ }
+ trimmed += cnt;
+ }
+
+ /*
+ * For every group except the first one, we are sure
+ * that the first cluster to discard will be cluster #0.
+ */
+ first_cluster = 0;
+ }
+
+ if (!ret)
+ atomic_set(&EXT4_SB(sb)->s_last_trim_minblks, minlen);
+
+out:
+ range->len = EXT4_C2B(EXT4_SB(sb), trimmed) << sb->s_blocksize_bits;
+ return ret;
+}
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
new file mode 100644
index 00000000000..d634e183b4d
--- /dev/null
+++ b/fs/ext4/mballoc.h
@@ -0,0 +1,215 @@
+/*
+ * fs/ext4/mballoc.h
+ *
+ * Written by: Alex Tomas <alex@clusterfs.com>
+ *
+ */
+#ifndef _EXT4_MBALLOC_H
+#define _EXT4_MBALLOC_H
+
+#include <linux/time.h>
+#include <linux/fs.h>
+#include <linux/namei.h>
+#include <linux/quotaops.h>
+#include <linux/buffer_head.h>
+#include <linux/module.h>
+#include <linux/swap.h>
+#include <linux/proc_fs.h>
+#include <linux/pagemap.h>
+#include <linux/seq_file.h>
+#include <linux/blkdev.h>
+#include <linux/mutex.h>
+#include "ext4_jbd2.h"
+#include "ext4.h"
+
+/*
+ * with AGGRESSIVE_CHECK allocator runs consistency checks over
+ * structures. these checks slow things down a lot
+ */
+#define AGGRESSIVE_CHECK__
+
+/*
+ * with DOUBLE_CHECK defined mballoc creates persistent in-core
+ * bitmaps, maintains and uses them to check for double allocations
+ */
+#define DOUBLE_CHECK__
+
+/*
+ */
+#ifdef CONFIG_EXT4_DEBUG
+extern ushort ext4_mballoc_debug;
+
+#define mb_debug(n, fmt, a...) \
+ do { \
+ if ((n) <= ext4_mballoc_debug) { \
+ printk(KERN_DEBUG "(%s, %d): %s: ", \
+ __FILE__, __LINE__, __func__); \
+ printk(fmt, ## a); \
+ } \
+ } while (0)
+#else
+#define mb_debug(n, fmt, a...) no_printk(fmt, ## a)
+#endif
+
+#define EXT4_MB_HISTORY_ALLOC 1 /* allocation */
+#define EXT4_MB_HISTORY_PREALLOC 2 /* preallocated blocks used */
+
+/*
+ * How long mballoc can look for a best extent (in found extents)
+ */
+#define MB_DEFAULT_MAX_TO_SCAN 200
+
+/*
+ * How long mballoc must look for a best extent
+ */
+#define MB_DEFAULT_MIN_TO_SCAN 10
+
+/*
+ * with 'ext4_mb_stats' allocator will collect stats that will be
+ * shown at umount. The collecting costs though!
+ */
+#define MB_DEFAULT_STATS 0
+
+/*
+ * files smaller than MB_DEFAULT_STREAM_THRESHOLD are served
+ * by the stream allocator, which purpose is to pack requests
+ * as close each to other as possible to produce smooth I/O traffic
+ * We use locality group prealloc space for stream request.
+ * We can tune the same via /proc/fs/ext4/<parition>/stream_req
+ */
+#define MB_DEFAULT_STREAM_THRESHOLD 16 /* 64K */
+
+/*
+ * for which requests use 2^N search using buddies
+ */
+#define MB_DEFAULT_ORDER2_REQS 2
+
+/*
+ * default group prealloc size 512 blocks
+ */
+#define MB_DEFAULT_GROUP_PREALLOC 512
+
+
+struct ext4_free_data {
+ /* MUST be the first member */
+ struct ext4_journal_cb_entry efd_jce;
+
+ /* ext4_free_data private data starts from here */
+
+ /* this links the free block information from group_info */
+ struct rb_node efd_node;
+
+ /* group which free block extent belongs */
+ ext4_group_t efd_group;
+
+ /* free block extent */
+ ext4_grpblk_t efd_start_cluster;
+ ext4_grpblk_t efd_count;
+
+ /* transaction which freed this extent */
+ tid_t efd_tid;
+};
+
+struct ext4_prealloc_space {
+ struct list_head pa_inode_list;
+ struct list_head pa_group_list;
+ union {
+ struct list_head pa_tmp_list;
+ struct rcu_head pa_rcu;
+ } u;
+ spinlock_t pa_lock;
+ atomic_t pa_count;
+ unsigned pa_deleted;
+ ext4_fsblk_t pa_pstart; /* phys. block */
+ ext4_lblk_t pa_lstart; /* log. block */
+ ext4_grpblk_t pa_len; /* len of preallocated chunk */
+ ext4_grpblk_t pa_free; /* how many blocks are free */
+ unsigned short pa_type; /* pa type. inode or group */
+ spinlock_t *pa_obj_lock;
+ struct inode *pa_inode; /* hack, for history only */
+};
+
+enum {
+ MB_INODE_PA = 0,
+ MB_GROUP_PA = 1
+};
+
+struct ext4_free_extent {
+ ext4_lblk_t fe_logical;
+ ext4_grpblk_t fe_start; /* In cluster units */
+ ext4_group_t fe_group;
+ ext4_grpblk_t fe_len; /* In cluster units */
+};
+
+/*
+ * Locality group:
+ * we try to group all related changes together
+ * so that writeback can flush/allocate them together as well
+ * Size of lg_prealloc_list hash is determined by MB_DEFAULT_GROUP_PREALLOC
+ * (512). We store prealloc space into the hash based on the pa_free blocks
+ * order value.ie, fls(pa_free)-1;
+ */
+#define PREALLOC_TB_SIZE 10
+struct ext4_locality_group {
+ /* for allocator */
+ /* to serialize allocates */
+ struct mutex lg_mutex;
+ /* list of preallocations */
+ struct list_head lg_prealloc_list[PREALLOC_TB_SIZE];
+ spinlock_t lg_prealloc_lock;
+};
+
+struct ext4_allocation_context {
+ struct inode *ac_inode;
+ struct super_block *ac_sb;
+
+ /* original request */
+ struct ext4_free_extent ac_o_ex;
+
+ /* goal request (normalized ac_o_ex) */
+ struct ext4_free_extent ac_g_ex;
+
+ /* the best found extent */
+ struct ext4_free_extent ac_b_ex;
+
+ /* copy of the best found extent taken before preallocation efforts */
+ struct ext4_free_extent ac_f_ex;
+
+ __u16 ac_groups_scanned;
+ __u16 ac_found;
+ __u16 ac_tail;
+ __u16 ac_buddy;
+ __u16 ac_flags; /* allocation hints */
+ __u8 ac_status;
+ __u8 ac_criteria;
+ __u8 ac_2order; /* if request is to allocate 2^N blocks and
+ * N > 0, the field stores N, otherwise 0 */
+ __u8 ac_op; /* operation, for history only */
+ struct page *ac_bitmap_page;
+ struct page *ac_buddy_page;
+ struct ext4_prealloc_space *ac_pa;
+ struct ext4_locality_group *ac_lg;
+};
+
+#define AC_STATUS_CONTINUE 1
+#define AC_STATUS_FOUND 2
+#define AC_STATUS_BREAK 3
+
+struct ext4_buddy {
+ struct page *bd_buddy_page;
+ void *bd_buddy;
+ struct page *bd_bitmap_page;
+ void *bd_bitmap;
+ struct ext4_group_info *bd_info;
+ struct super_block *bd_sb;
+ __u16 bd_blkbits;
+ ext4_group_t bd_group;
+};
+
+static inline ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb,
+ struct ext4_free_extent *fex)
+{
+ return ext4_group_first_block_no(sb, fex->fe_group) +
+ (fex->fe_start << EXT4_SB(sb)->s_cluster_bits);
+}
+#endif
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index 5c1e27de775..ec092437d3e 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -12,21 +12,21 @@
*
*/
-#include <linux/module.h>
-#include <linux/ext4_jbd2.h>
-#include <linux/ext4_fs_extents.h>
+#include <linux/slab.h>
+#include "ext4_jbd2.h"
+#include "ext4_extents.h"
/*
* The contiguous blocks details which can be
* represented by a single extent
*/
-struct list_blocks_struct {
- ext4_lblk_t first_block, last_block;
+struct migrate_struct {
+ ext4_lblk_t first_block, last_block, curr_block;
ext4_fsblk_t first_pblock, last_pblock;
};
static int finish_range(handle_t *handle, struct inode *inode,
- struct list_blocks_struct *lb)
+ struct migrate_struct *lb)
{
int retval = 0, needed;
@@ -39,7 +39,7 @@ static int finish_range(handle_t *handle, struct inode *inode,
newext.ee_block = cpu_to_le32(lb->first_block);
newext.ee_len = cpu_to_le16(lb->last_block - lb->first_block + 1);
ext4_ext_store_pblock(&newext, lb->first_pblock);
- path = ext4_ext_find_extent(inode, lb->first_block, NULL);
+ path = ext4_ext_find_extent(inode, lb->first_block, NULL, 0);
if (IS_ERR(path)) {
retval = PTR_ERR(path);
@@ -53,12 +53,14 @@ static int finish_range(handle_t *handle, struct inode *inode,
* credit. But below we try to not accumalate too much
* of them by restarting the journal.
*/
- needed = ext4_ext_calc_credits_for_insert(inode, path);
+ needed = ext4_ext_calc_credits_for_single_extent(inode,
+ lb->last_block - lb->first_block + 1, path);
/*
* Make sure the credit we accumalated is not really high
*/
- if (needed && handle->h_buffer_credits >= EXT4_RESERVE_TRANS_BLOCKS) {
+ if (needed && ext4_handle_has_enough_credits(handle,
+ EXT4_RESERVE_TRANS_BLOCKS)) {
retval = ext4_journal_restart(handle, needed);
if (retval)
goto err_out;
@@ -73,7 +75,7 @@ static int finish_range(handle_t *handle, struct inode *inode,
goto err_out;
}
}
- retval = ext4_ext_insert_extent(handle, inode, path, &newext);
+ retval = ext4_ext_insert_extent(handle, inode, path, &newext, 0);
err_out:
if (path) {
ext4_ext_drop_refs(path);
@@ -84,8 +86,7 @@ err_out:
}
static int update_extent_range(handle_t *handle, struct inode *inode,
- ext4_fsblk_t pblock, ext4_lblk_t blk_num,
- struct list_blocks_struct *lb)
+ ext4_fsblk_t pblock, struct migrate_struct *lb)
{
int retval;
/*
@@ -93,9 +94,10 @@ static int update_extent_range(handle_t *handle, struct inode *inode,
*/
if (lb->first_pblock &&
(lb->last_pblock+1 == pblock) &&
- (lb->last_block+1 == blk_num)) {
+ (lb->last_block+1 == lb->curr_block)) {
lb->last_pblock = pblock;
- lb->last_block = blk_num;
+ lb->last_block = lb->curr_block;
+ lb->curr_block++;
return 0;
}
/*
@@ -103,64 +105,49 @@ static int update_extent_range(handle_t *handle, struct inode *inode,
*/
retval = finish_range(handle, inode, lb);
lb->first_pblock = lb->last_pblock = pblock;
- lb->first_block = lb->last_block = blk_num;
-
+ lb->first_block = lb->last_block = lb->curr_block;
+ lb->curr_block++;
return retval;
}
static int update_ind_extent_range(handle_t *handle, struct inode *inode,
- ext4_fsblk_t pblock, ext4_lblk_t *blk_nump,
- struct list_blocks_struct *lb)
+ ext4_fsblk_t pblock,
+ struct migrate_struct *lb)
{
struct buffer_head *bh;
__le32 *i_data;
int i, retval = 0;
- ext4_lblk_t blk_count = *blk_nump;
unsigned long max_entries = inode->i_sb->s_blocksize >> 2;
- if (!pblock) {
- /* Only update the file block number */
- *blk_nump += max_entries;
- return 0;
- }
-
bh = sb_bread(inode->i_sb, pblock);
if (!bh)
return -EIO;
i_data = (__le32 *)bh->b_data;
- for (i = 0; i < max_entries; i++, blk_count++) {
+ for (i = 0; i < max_entries; i++) {
if (i_data[i]) {
retval = update_extent_range(handle, inode,
- le32_to_cpu(i_data[i]),
- blk_count, lb);
+ le32_to_cpu(i_data[i]), lb);
if (retval)
break;
+ } else {
+ lb->curr_block++;
}
}
-
- /* Update the file block number */
- *blk_nump = blk_count;
put_bh(bh);
return retval;
}
static int update_dind_extent_range(handle_t *handle, struct inode *inode,
- ext4_fsblk_t pblock, ext4_lblk_t *blk_nump,
- struct list_blocks_struct *lb)
+ ext4_fsblk_t pblock,
+ struct migrate_struct *lb)
{
struct buffer_head *bh;
__le32 *i_data;
int i, retval = 0;
- ext4_lblk_t blk_count = *blk_nump;
unsigned long max_entries = inode->i_sb->s_blocksize >> 2;
- if (!pblock) {
- /* Only update the file block number */
- *blk_nump += max_entries * max_entries;
- return 0;
- }
bh = sb_bread(inode->i_sb, pblock);
if (!bh)
return -EIO;
@@ -169,38 +156,28 @@ static int update_dind_extent_range(handle_t *handle, struct inode *inode,
for (i = 0; i < max_entries; i++) {
if (i_data[i]) {
retval = update_ind_extent_range(handle, inode,
- le32_to_cpu(i_data[i]),
- &blk_count, lb);
+ le32_to_cpu(i_data[i]), lb);
if (retval)
break;
} else {
/* Only update the file block number */
- blk_count += max_entries;
+ lb->curr_block += max_entries;
}
}
-
- /* Update the file block number */
- *blk_nump = blk_count;
put_bh(bh);
return retval;
}
static int update_tind_extent_range(handle_t *handle, struct inode *inode,
- ext4_fsblk_t pblock, ext4_lblk_t *blk_nump,
- struct list_blocks_struct *lb)
+ ext4_fsblk_t pblock,
+ struct migrate_struct *lb)
{
struct buffer_head *bh;
__le32 *i_data;
int i, retval = 0;
- ext4_lblk_t blk_count = *blk_nump;
unsigned long max_entries = inode->i_sb->s_blocksize >> 2;
- if (!pblock) {
- /* Only update the file block number */
- *blk_nump += max_entries * max_entries * max_entries;
- return 0;
- }
bh = sb_bread(inode->i_sb, pblock);
if (!bh)
return -EIO;
@@ -209,16 +186,14 @@ static int update_tind_extent_range(handle_t *handle, struct inode *inode,
for (i = 0; i < max_entries; i++) {
if (i_data[i]) {
retval = update_dind_extent_range(handle, inode,
- le32_to_cpu(i_data[i]),
- &blk_count, lb);
+ le32_to_cpu(i_data[i]), lb);
if (retval)
break;
- } else
+ } else {
/* Only update the file block number */
- blk_count += max_entries * max_entries;
+ lb->curr_block += max_entries * max_entries;
+ }
}
- /* Update the file block number */
- *blk_nump = blk_count;
put_bh(bh);
return retval;
@@ -228,7 +203,7 @@ static int extend_credit_for_blkdel(handle_t *handle, struct inode *inode)
{
int retval = 0, needed;
- if (handle->h_buffer_credits > EXT4_RESERVE_TRANS_BLOCKS)
+ if (ext4_handle_has_enough_credits(handle, EXT4_RESERVE_TRANS_BLOCKS+1))
return 0;
/*
* We are freeing a blocks. During this we touch
@@ -236,7 +211,7 @@ static int extend_credit_for_blkdel(handle_t *handle, struct inode *inode)
* So allocate a credit of 3. We may update
* quota (user and group).
*/
- needed = 3 + 2*EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb);
+ needed = 3 + EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb);
if (ext4_journal_extend(handle, needed) != 0)
retval = ext4_journal_restart(handle, needed);
@@ -260,13 +235,17 @@ static int free_dind_blocks(handle_t *handle,
for (i = 0; i < max_entries; i++) {
if (tmp_idata[i]) {
extend_credit_for_blkdel(handle, inode);
- ext4_free_blocks(handle, inode,
- le32_to_cpu(tmp_idata[i]), 1, 1);
+ ext4_free_blocks(handle, inode, NULL,
+ le32_to_cpu(tmp_idata[i]), 1,
+ EXT4_FREE_BLOCKS_METADATA |
+ EXT4_FREE_BLOCKS_FORGET);
}
}
put_bh(bh);
extend_credit_for_blkdel(handle, inode);
- ext4_free_blocks(handle, inode, le32_to_cpu(i_data), 1, 1);
+ ext4_free_blocks(handle, inode, NULL, le32_to_cpu(i_data), 1,
+ EXT4_FREE_BLOCKS_METADATA |
+ EXT4_FREE_BLOCKS_FORGET);
return 0;
}
@@ -295,7 +274,9 @@ static int free_tind_blocks(handle_t *handle,
}
put_bh(bh);
extend_credit_for_blkdel(handle, inode);
- ext4_free_blocks(handle, inode, le32_to_cpu(i_data), 1, 1);
+ ext4_free_blocks(handle, inode, NULL, le32_to_cpu(i_data), 1,
+ EXT4_FREE_BLOCKS_METADATA |
+ EXT4_FREE_BLOCKS_FORGET);
return 0;
}
@@ -306,8 +287,10 @@ static int free_ind_block(handle_t *handle, struct inode *inode, __le32 *i_data)
/* ei->i_data[EXT4_IND_BLOCK] */
if (i_data[0]) {
extend_credit_for_blkdel(handle, inode);
- ext4_free_blocks(handle, inode,
- le32_to_cpu(i_data[0]), 1, 1);
+ ext4_free_blocks(handle, inode, NULL,
+ le32_to_cpu(i_data[0]), 1,
+ EXT4_FREE_BLOCKS_METADATA |
+ EXT4_FREE_BLOCKS_FORGET);
}
/* ei->i_data[EXT4_DIND_BLOCK] */
@@ -327,7 +310,7 @@ static int free_ind_block(handle_t *handle, struct inode *inode, __le32 *i_data)
}
static int ext4_ext_swap_inode_data(handle_t *handle, struct inode *inode,
- struct inode *tmp_inode)
+ struct inode *tmp_inode)
{
int retval;
__le32 i_data[3];
@@ -339,7 +322,7 @@ static int ext4_ext_swap_inode_data(handle_t *handle, struct inode *inode,
* i_data field of the original inode
*/
retval = ext4_journal_extend(handle, 1);
- if (retval != 0) {
+ if (retval) {
retval = ext4_journal_restart(handle, 1);
if (retval)
goto err_out;
@@ -351,10 +334,21 @@ static int ext4_ext_swap_inode_data(handle_t *handle, struct inode *inode,
down_write(&EXT4_I(inode)->i_data_sem);
/*
+ * if EXT4_STATE_EXT_MIGRATE is cleared a block allocation
+ * happened after we started the migrate. We need to
+ * fail the migrate
+ */
+ if (!ext4_test_inode_state(inode, EXT4_STATE_EXT_MIGRATE)) {
+ retval = -EAGAIN;
+ up_write(&EXT4_I(inode)->i_data_sem);
+ goto err_out;
+ } else
+ ext4_clear_inode_state(inode, EXT4_STATE_EXT_MIGRATE);
+ /*
* We have the extent map build with the tmp inode.
* Now copy the i_data across
*/
- ei->i_flags |= EXT4_EXTENTS_FL;
+ ext4_set_inode_flag(inode, EXT4_INODE_EXTENTS);
memcpy(ei->i_data, tmp_ei->i_data, sizeof(ei->i_data));
/*
@@ -390,7 +384,7 @@ static int free_ext_idx(handle_t *handle, struct inode *inode,
struct buffer_head *bh;
struct ext4_extent_header *eh;
- block = idx_pblock(ix);
+ block = ext4_idx_pblock(ix);
bh = sb_bread(inode->i_sb, block);
if (!bh)
return -EIO;
@@ -406,7 +400,8 @@ static int free_ext_idx(handle_t *handle, struct inode *inode,
}
put_bh(bh);
extend_credit_for_blkdel(handle, inode);
- ext4_free_blocks(handle, inode, block, 1, 1);
+ ext4_free_blocks(handle, inode, NULL, block, 1,
+ EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET);
return retval;
}
@@ -431,28 +426,27 @@ static int free_ext_block(handle_t *handle, struct inode *inode)
return retval;
}
return retval;
-
}
-int ext4_ext_migrate(struct inode *inode, struct file *filp,
- unsigned int cmd, unsigned long arg)
+int ext4_ext_migrate(struct inode *inode)
{
handle_t *handle;
int retval = 0, i;
__le32 *i_data;
- ext4_lblk_t blk_count = 0;
struct ext4_inode_info *ei;
struct inode *tmp_inode = NULL;
- struct list_blocks_struct lb;
+ struct migrate_struct lb;
unsigned long max_entries;
+ __u32 goal;
+ uid_t owner[2];
- if (!test_opt(inode->i_sb, EXTENTS))
- /*
- * if mounted with noextents we don't allow the migrate
- */
- return -EINVAL;
-
- if ((EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
+ /*
+ * If the filesystem does not support extents, or the inode
+ * already is extent-based, error out.
+ */
+ if (!EXT4_HAS_INCOMPAT_FEATURE(inode->i_sb,
+ EXT4_FEATURE_INCOMPAT_EXTENTS) ||
+ (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
return -EINVAL;
if (S_ISLNK(inode->i_mode) && inode->i_blocks == 0)
@@ -461,34 +455,35 @@ int ext4_ext_migrate(struct inode *inode, struct file *filp,
*/
return retval;
- handle = ext4_journal_start(inode,
- EXT4_DATA_TRANS_BLOCKS(inode->i_sb) +
- EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
- 2 * EXT4_QUOTA_INIT_BLOCKS(inode->i_sb)
- + 1);
+ /*
+ * Worst case we can touch the allocation bitmaps, a bgd
+ * block, and a block to link in the orphan list. We do need
+ * need to worry about credits for modifying the quota inode.
+ */
+ handle = ext4_journal_start(inode, EXT4_HT_MIGRATE,
+ 4 + EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb));
+
if (IS_ERR(handle)) {
retval = PTR_ERR(handle);
- goto err_out;
+ return retval;
}
- tmp_inode = ext4_new_inode(handle,
- inode->i_sb->s_root->d_inode,
- S_IFREG);
+ goal = (((inode->i_ino - 1) / EXT4_INODES_PER_GROUP(inode->i_sb)) *
+ EXT4_INODES_PER_GROUP(inode->i_sb)) + 1;
+ owner[0] = i_uid_read(inode);
+ owner[1] = i_gid_read(inode);
+ tmp_inode = ext4_new_inode(handle, inode->i_sb->s_root->d_inode,
+ S_IFREG, NULL, goal, owner);
if (IS_ERR(tmp_inode)) {
- retval = -ENOMEM;
+ retval = PTR_ERR(tmp_inode);
ext4_journal_stop(handle);
- tmp_inode = NULL;
- goto err_out;
+ return retval;
}
i_size_write(tmp_inode, i_size_read(inode));
/*
- * We don't want the inode to be reclaimed
- * if we got interrupted in between. We have
- * this tmp inode carrying reference to the
- * data blocks of the original file. We set
- * the i_nlink to zero at the last stage after
- * switching the original file to extent format
+ * Set the i_nlink to zero so it will be deleted later
+ * when we drop inode reference.
*/
- tmp_inode->i_nlink = 1;
+ clear_nlink(tmp_inode);
ext4_ext_tree_init(handle, tmp_inode);
ext4_orphan_add(handle, tmp_inode);
@@ -498,17 +493,33 @@ int ext4_ext_migrate(struct inode *inode, struct file *filp,
* start with one credit accounted for
* superblock modification.
*
- * For the tmp_inode we already have commited the
- * trascation that created the inode. Later as and
+ * For the tmp_inode we already have committed the
+ * transaction that created the inode. Later as and
* when we add extents we extent the journal
*/
/*
- * inode_mutex prevent write and truncate on the file. Read still goes
- * through. We take i_data_sem in ext4_ext_swap_inode_data before we
- * switch the inode format to prevent read.
+ * Even though we take i_mutex we can still cause block
+ * allocation via mmap write to holes. If we have allocated
+ * new blocks we fail migrate. New block allocation will
+ * clear EXT4_STATE_EXT_MIGRATE flag. The flag is updated
+ * with i_data_sem held to prevent racing with block
+ * allocation.
*/
- mutex_lock(&(inode->i_mutex));
- handle = ext4_journal_start(inode, 1);
+ down_read(&EXT4_I(inode)->i_data_sem);
+ ext4_set_inode_state(inode, EXT4_STATE_EXT_MIGRATE);
+ up_read((&EXT4_I(inode)->i_data_sem));
+
+ handle = ext4_journal_start(inode, EXT4_HT_MIGRATE, 1);
+ if (IS_ERR(handle)) {
+ /*
+ * It is impossible to update on-disk structures without
+ * a handle, so just rollback in-core changes and live other
+ * work to orphan_list_cleanup()
+ */
+ ext4_orphan_del(NULL, tmp_inode);
+ retval = PTR_ERR(handle);
+ goto out;
+ }
ei = EXT4_I(inode);
i_data = ei->i_data;
@@ -516,35 +527,32 @@ int ext4_ext_migrate(struct inode *inode, struct file *filp,
/* 32 bit block address 4 bytes */
max_entries = inode->i_sb->s_blocksize >> 2;
- for (i = 0; i < EXT4_NDIR_BLOCKS; i++, blk_count++) {
+ for (i = 0; i < EXT4_NDIR_BLOCKS; i++) {
if (i_data[i]) {
retval = update_extent_range(handle, tmp_inode,
- le32_to_cpu(i_data[i]),
- blk_count, &lb);
+ le32_to_cpu(i_data[i]), &lb);
if (retval)
goto err_out;
- }
+ } else
+ lb.curr_block++;
}
if (i_data[EXT4_IND_BLOCK]) {
retval = update_ind_extent_range(handle, tmp_inode,
- le32_to_cpu(i_data[EXT4_IND_BLOCK]),
- &blk_count, &lb);
+ le32_to_cpu(i_data[EXT4_IND_BLOCK]), &lb);
if (retval)
goto err_out;
} else
- blk_count += max_entries;
+ lb.curr_block += max_entries;
if (i_data[EXT4_DIND_BLOCK]) {
retval = update_dind_extent_range(handle, tmp_inode,
- le32_to_cpu(i_data[EXT4_DIND_BLOCK]),
- &blk_count, &lb);
+ le32_to_cpu(i_data[EXT4_DIND_BLOCK]), &lb);
if (retval)
goto err_out;
} else
- blk_count += max_entries * max_entries;
+ lb.curr_block += max_entries * max_entries;
if (i_data[EXT4_TIND_BLOCK]) {
retval = update_tind_extent_range(handle, tmp_inode,
- le32_to_cpu(i_data[EXT4_TIND_BLOCK]),
- &blk_count, &lb);
+ le32_to_cpu(i_data[EXT4_TIND_BLOCK]), &lb);
if (retval)
goto err_out;
}
@@ -559,9 +567,15 @@ err_out:
* tmp_inode
*/
free_ext_block(handle, tmp_inode);
- else
- retval = ext4_ext_swap_inode_data(handle, inode,
- tmp_inode);
+ else {
+ retval = ext4_ext_swap_inode_data(handle, inode, tmp_inode);
+ if (retval)
+ /*
+ * if we fail to swap inode data free the extent
+ * details of the tmp inode
+ */
+ free_ext_block(handle, tmp_inode);
+ }
/* We mark the tmp_inode dirty via ext4_ext_tree_init. */
if (ext4_journal_extend(handle, 1) != 0)
@@ -584,19 +598,71 @@ err_out:
/* Reset the extent details */
ext4_ext_tree_init(handle, tmp_inode);
+ ext4_journal_stop(handle);
+out:
+ unlock_new_inode(tmp_inode);
+ iput(tmp_inode);
- /*
- * Set the i_nlink to zero so that
- * generic_drop_inode really deletes the
- * inode
- */
- tmp_inode->i_nlink = 0;
+ return retval;
+}
- ext4_journal_stop(handle);
- mutex_unlock(&(inode->i_mutex));
+/*
+ * Migrate a simple extent-based inode to use the i_blocks[] array
+ */
+int ext4_ind_migrate(struct inode *inode)
+{
+ struct ext4_extent_header *eh;
+ struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ struct ext4_extent *ex;
+ unsigned int i, len;
+ ext4_fsblk_t blk;
+ handle_t *handle;
+ int ret;
+
+ if (!EXT4_HAS_INCOMPAT_FEATURE(inode->i_sb,
+ EXT4_FEATURE_INCOMPAT_EXTENTS) ||
+ (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
+ return -EINVAL;
- if (tmp_inode)
- iput(tmp_inode);
+ if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
+ EXT4_FEATURE_RO_COMPAT_BIGALLOC))
+ return -EOPNOTSUPP;
- return retval;
+ handle = ext4_journal_start(inode, EXT4_HT_MIGRATE, 1);
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
+
+ down_write(&EXT4_I(inode)->i_data_sem);
+ ret = ext4_ext_check_inode(inode);
+ if (ret)
+ goto errout;
+
+ eh = ext_inode_hdr(inode);
+ ex = EXT_FIRST_EXTENT(eh);
+ if (ext4_blocks_count(es) > EXT4_MAX_BLOCK_FILE_PHYS ||
+ eh->eh_depth != 0 || le16_to_cpu(eh->eh_entries) > 1) {
+ ret = -EOPNOTSUPP;
+ goto errout;
+ }
+ if (eh->eh_entries == 0)
+ blk = len = 0;
+ else {
+ len = le16_to_cpu(ex->ee_len);
+ blk = ext4_ext_pblock(ex);
+ if (len > EXT4_NDIR_BLOCKS) {
+ ret = -EOPNOTSUPP;
+ goto errout;
+ }
+ }
+
+ ext4_clear_inode_flag(inode, EXT4_INODE_EXTENTS);
+ memset(ei->i_data, 0, sizeof(ei->i_data));
+ for (i=0; i < len; i++)
+ ei->i_data[i] = cpu_to_le32(blk++);
+ ext4_mark_inode_dirty(handle, inode);
+errout:
+ ext4_journal_stop(handle);
+ up_write(&EXT4_I(inode)->i_data_sem);
+ return ret;
}
diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c
new file mode 100644
index 00000000000..32bce844c2e
--- /dev/null
+++ b/fs/ext4/mmp.c
@@ -0,0 +1,395 @@
+#include <linux/fs.h>
+#include <linux/random.h>
+#include <linux/buffer_head.h>
+#include <linux/utsname.h>
+#include <linux/kthread.h>
+
+#include "ext4.h"
+
+/* Checksumming functions */
+static __le32 ext4_mmp_csum(struct super_block *sb, struct mmp_struct *mmp)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ int offset = offsetof(struct mmp_struct, mmp_checksum);
+ __u32 csum;
+
+ csum = ext4_chksum(sbi, sbi->s_csum_seed, (char *)mmp, offset);
+
+ return cpu_to_le32(csum);
+}
+
+static int ext4_mmp_csum_verify(struct super_block *sb, struct mmp_struct *mmp)
+{
+ if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
+ EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+ return 1;
+
+ return mmp->mmp_checksum == ext4_mmp_csum(sb, mmp);
+}
+
+static void ext4_mmp_csum_set(struct super_block *sb, struct mmp_struct *mmp)
+{
+ if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
+ EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+ return;
+
+ mmp->mmp_checksum = ext4_mmp_csum(sb, mmp);
+}
+
+/*
+ * Write the MMP block using WRITE_SYNC to try to get the block on-disk
+ * faster.
+ */
+static int write_mmp_block(struct super_block *sb, struct buffer_head *bh)
+{
+ struct mmp_struct *mmp = (struct mmp_struct *)(bh->b_data);
+
+ /*
+ * We protect against freezing so that we don't create dirty buffers
+ * on frozen filesystem.
+ */
+ sb_start_write(sb);
+ ext4_mmp_csum_set(sb, mmp);
+ mark_buffer_dirty(bh);
+ lock_buffer(bh);
+ bh->b_end_io = end_buffer_write_sync;
+ get_bh(bh);
+ submit_bh(WRITE_SYNC | REQ_META | REQ_PRIO, bh);
+ wait_on_buffer(bh);
+ sb_end_write(sb);
+ if (unlikely(!buffer_uptodate(bh)))
+ return 1;
+
+ return 0;
+}
+
+/*
+ * Read the MMP block. It _must_ be read from disk and hence we clear the
+ * uptodate flag on the buffer.
+ */
+static int read_mmp_block(struct super_block *sb, struct buffer_head **bh,
+ ext4_fsblk_t mmp_block)
+{
+ struct mmp_struct *mmp;
+
+ if (*bh)
+ clear_buffer_uptodate(*bh);
+
+ /* This would be sb_bread(sb, mmp_block), except we need to be sure
+ * that the MD RAID device cache has been bypassed, and that the read
+ * is not blocked in the elevator. */
+ if (!*bh)
+ *bh = sb_getblk(sb, mmp_block);
+ if (!*bh)
+ return -ENOMEM;
+ if (*bh) {
+ get_bh(*bh);
+ lock_buffer(*bh);
+ (*bh)->b_end_io = end_buffer_read_sync;
+ submit_bh(READ_SYNC | REQ_META | REQ_PRIO, *bh);
+ wait_on_buffer(*bh);
+ if (!buffer_uptodate(*bh)) {
+ brelse(*bh);
+ *bh = NULL;
+ }
+ }
+ if (unlikely(!*bh)) {
+ ext4_warning(sb, "Error while reading MMP block %llu",
+ mmp_block);
+ return -EIO;
+ }
+
+ mmp = (struct mmp_struct *)((*bh)->b_data);
+ if (le32_to_cpu(mmp->mmp_magic) != EXT4_MMP_MAGIC ||
+ !ext4_mmp_csum_verify(sb, mmp))
+ return -EINVAL;
+
+ return 0;
+}
+
+/*
+ * Dump as much information as possible to help the admin.
+ */
+void __dump_mmp_msg(struct super_block *sb, struct mmp_struct *mmp,
+ const char *function, unsigned int line, const char *msg)
+{
+ __ext4_warning(sb, function, line, msg);
+ __ext4_warning(sb, function, line,
+ "MMP failure info: last update time: %llu, last update "
+ "node: %s, last update device: %s\n",
+ (long long unsigned int) le64_to_cpu(mmp->mmp_time),
+ mmp->mmp_nodename, mmp->mmp_bdevname);
+}
+
+/*
+ * kmmpd will update the MMP sequence every s_mmp_update_interval seconds
+ */
+static int kmmpd(void *data)
+{
+ struct super_block *sb = ((struct mmpd_data *) data)->sb;
+ struct buffer_head *bh = ((struct mmpd_data *) data)->bh;
+ struct ext4_super_block *es = EXT4_SB(sb)->s_es;
+ struct mmp_struct *mmp;
+ ext4_fsblk_t mmp_block;
+ u32 seq = 0;
+ unsigned long failed_writes = 0;
+ int mmp_update_interval = le16_to_cpu(es->s_mmp_update_interval);
+ unsigned mmp_check_interval;
+ unsigned long last_update_time;
+ unsigned long diff;
+ int retval;
+
+ mmp_block = le64_to_cpu(es->s_mmp_block);
+ mmp = (struct mmp_struct *)(bh->b_data);
+ mmp->mmp_time = cpu_to_le64(get_seconds());
+ /*
+ * Start with the higher mmp_check_interval and reduce it if
+ * the MMP block is being updated on time.
+ */
+ mmp_check_interval = max(EXT4_MMP_CHECK_MULT * mmp_update_interval,
+ EXT4_MMP_MIN_CHECK_INTERVAL);
+ mmp->mmp_check_interval = cpu_to_le16(mmp_check_interval);
+ bdevname(bh->b_bdev, mmp->mmp_bdevname);
+
+ memcpy(mmp->mmp_nodename, init_utsname()->nodename,
+ sizeof(mmp->mmp_nodename));
+
+ while (!kthread_should_stop()) {
+ if (++seq > EXT4_MMP_SEQ_MAX)
+ seq = 1;
+
+ mmp->mmp_seq = cpu_to_le32(seq);
+ mmp->mmp_time = cpu_to_le64(get_seconds());
+ last_update_time = jiffies;
+
+ retval = write_mmp_block(sb, bh);
+ /*
+ * Don't spew too many error messages. Print one every
+ * (s_mmp_update_interval * 60) seconds.
+ */
+ if (retval) {
+ if ((failed_writes % 60) == 0)
+ ext4_error(sb, "Error writing to MMP block");
+ failed_writes++;
+ }
+
+ if (!(le32_to_cpu(es->s_feature_incompat) &
+ EXT4_FEATURE_INCOMPAT_MMP)) {
+ ext4_warning(sb, "kmmpd being stopped since MMP feature"
+ " has been disabled.");
+ EXT4_SB(sb)->s_mmp_tsk = NULL;
+ goto failed;
+ }
+
+ if (sb->s_flags & MS_RDONLY) {
+ ext4_warning(sb, "kmmpd being stopped since filesystem "
+ "has been remounted as readonly.");
+ EXT4_SB(sb)->s_mmp_tsk = NULL;
+ goto failed;
+ }
+
+ diff = jiffies - last_update_time;
+ if (diff < mmp_update_interval * HZ)
+ schedule_timeout_interruptible(mmp_update_interval *
+ HZ - diff);
+
+ /*
+ * We need to make sure that more than mmp_check_interval
+ * seconds have not passed since writing. If that has happened
+ * we need to check if the MMP block is as we left it.
+ */
+ diff = jiffies - last_update_time;
+ if (diff > mmp_check_interval * HZ) {
+ struct buffer_head *bh_check = NULL;
+ struct mmp_struct *mmp_check;
+
+ retval = read_mmp_block(sb, &bh_check, mmp_block);
+ if (retval) {
+ ext4_error(sb, "error reading MMP data: %d",
+ retval);
+
+ EXT4_SB(sb)->s_mmp_tsk = NULL;
+ goto failed;
+ }
+
+ mmp_check = (struct mmp_struct *)(bh_check->b_data);
+ if (mmp->mmp_seq != mmp_check->mmp_seq ||
+ memcmp(mmp->mmp_nodename, mmp_check->mmp_nodename,
+ sizeof(mmp->mmp_nodename))) {
+ dump_mmp_msg(sb, mmp_check,
+ "Error while updating MMP info. "
+ "The filesystem seems to have been"
+ " multiply mounted.");
+ ext4_error(sb, "abort");
+ goto failed;
+ }
+ put_bh(bh_check);
+ }
+
+ /*
+ * Adjust the mmp_check_interval depending on how much time
+ * it took for the MMP block to be written.
+ */
+ mmp_check_interval = max(min(EXT4_MMP_CHECK_MULT * diff / HZ,
+ EXT4_MMP_MAX_CHECK_INTERVAL),
+ EXT4_MMP_MIN_CHECK_INTERVAL);
+ mmp->mmp_check_interval = cpu_to_le16(mmp_check_interval);
+ }
+
+ /*
+ * Unmount seems to be clean.
+ */
+ mmp->mmp_seq = cpu_to_le32(EXT4_MMP_SEQ_CLEAN);
+ mmp->mmp_time = cpu_to_le64(get_seconds());
+
+ retval = write_mmp_block(sb, bh);
+
+failed:
+ kfree(data);
+ brelse(bh);
+ return retval;
+}
+
+/*
+ * Get a random new sequence number but make sure it is not greater than
+ * EXT4_MMP_SEQ_MAX.
+ */
+static unsigned int mmp_new_seq(void)
+{
+ u32 new_seq;
+
+ do {
+ new_seq = prandom_u32();
+ } while (new_seq > EXT4_MMP_SEQ_MAX);
+
+ return new_seq;
+}
+
+/*
+ * Protect the filesystem from being mounted more than once.
+ */
+int ext4_multi_mount_protect(struct super_block *sb,
+ ext4_fsblk_t mmp_block)
+{
+ struct ext4_super_block *es = EXT4_SB(sb)->s_es;
+ struct buffer_head *bh = NULL;
+ struct mmp_struct *mmp = NULL;
+ struct mmpd_data *mmpd_data;
+ u32 seq;
+ unsigned int mmp_check_interval = le16_to_cpu(es->s_mmp_update_interval);
+ unsigned int wait_time = 0;
+ int retval;
+
+ if (mmp_block < le32_to_cpu(es->s_first_data_block) ||
+ mmp_block >= ext4_blocks_count(es)) {
+ ext4_warning(sb, "Invalid MMP block in superblock");
+ goto failed;
+ }
+
+ retval = read_mmp_block(sb, &bh, mmp_block);
+ if (retval)
+ goto failed;
+
+ mmp = (struct mmp_struct *)(bh->b_data);
+
+ if (mmp_check_interval < EXT4_MMP_MIN_CHECK_INTERVAL)
+ mmp_check_interval = EXT4_MMP_MIN_CHECK_INTERVAL;
+
+ /*
+ * If check_interval in MMP block is larger, use that instead of
+ * update_interval from the superblock.
+ */
+ if (le16_to_cpu(mmp->mmp_check_interval) > mmp_check_interval)
+ mmp_check_interval = le16_to_cpu(mmp->mmp_check_interval);
+
+ seq = le32_to_cpu(mmp->mmp_seq);
+ if (seq == EXT4_MMP_SEQ_CLEAN)
+ goto skip;
+
+ if (seq == EXT4_MMP_SEQ_FSCK) {
+ dump_mmp_msg(sb, mmp, "fsck is running on the filesystem");
+ goto failed;
+ }
+
+ wait_time = min(mmp_check_interval * 2 + 1,
+ mmp_check_interval + 60);
+
+ /* Print MMP interval if more than 20 secs. */
+ if (wait_time > EXT4_MMP_MIN_CHECK_INTERVAL * 4)
+ ext4_warning(sb, "MMP interval %u higher than expected, please"
+ " wait.\n", wait_time * 2);
+
+ if (schedule_timeout_interruptible(HZ * wait_time) != 0) {
+ ext4_warning(sb, "MMP startup interrupted, failing mount\n");
+ goto failed;
+ }
+
+ retval = read_mmp_block(sb, &bh, mmp_block);
+ if (retval)
+ goto failed;
+ mmp = (struct mmp_struct *)(bh->b_data);
+ if (seq != le32_to_cpu(mmp->mmp_seq)) {
+ dump_mmp_msg(sb, mmp,
+ "Device is already active on another node.");
+ goto failed;
+ }
+
+skip:
+ /*
+ * write a new random sequence number.
+ */
+ seq = mmp_new_seq();
+ mmp->mmp_seq = cpu_to_le32(seq);
+
+ retval = write_mmp_block(sb, bh);
+ if (retval)
+ goto failed;
+
+ /*
+ * wait for MMP interval and check mmp_seq.
+ */
+ if (schedule_timeout_interruptible(HZ * wait_time) != 0) {
+ ext4_warning(sb, "MMP startup interrupted, failing mount\n");
+ goto failed;
+ }
+
+ retval = read_mmp_block(sb, &bh, mmp_block);
+ if (retval)
+ goto failed;
+ mmp = (struct mmp_struct *)(bh->b_data);
+ if (seq != le32_to_cpu(mmp->mmp_seq)) {
+ dump_mmp_msg(sb, mmp,
+ "Device is already active on another node.");
+ goto failed;
+ }
+
+ mmpd_data = kmalloc(sizeof(struct mmpd_data), GFP_KERNEL);
+ if (!mmpd_data) {
+ ext4_warning(sb, "not enough memory for mmpd_data");
+ goto failed;
+ }
+ mmpd_data->sb = sb;
+ mmpd_data->bh = bh;
+
+ /*
+ * Start a kernel thread to update the MMP block periodically.
+ */
+ EXT4_SB(sb)->s_mmp_tsk = kthread_run(kmmpd, mmpd_data, "kmmpd-%s",
+ bdevname(bh->b_bdev,
+ mmp->mmp_bdevname));
+ if (IS_ERR(EXT4_SB(sb)->s_mmp_tsk)) {
+ EXT4_SB(sb)->s_mmp_tsk = NULL;
+ kfree(mmpd_data);
+ ext4_warning(sb, "Unable to create kmmpd thread for %s.",
+ sb->s_id);
+ goto failed;
+ }
+
+ return 0;
+
+failed:
+ brelse(bh);
+ return 1;
+}
+
+
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
new file mode 100644
index 00000000000..2484c7ec6a7
--- /dev/null
+++ b/fs/ext4/move_extent.c
@@ -0,0 +1,1505 @@
+/*
+ * Copyright (c) 2008,2009 NEC Software Tohoku, Ltd.
+ * Written by Takashi Sato <t-sato@yk.jp.nec.com>
+ * Akira Fujita <a-fujita@rs.jp.nec.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2.1 of the GNU Lesser General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/fs.h>
+#include <linux/quotaops.h>
+#include <linux/slab.h>
+#include "ext4_jbd2.h"
+#include "ext4.h"
+#include "ext4_extents.h"
+
+/**
+ * get_ext_path - Find an extent path for designated logical block number.
+ *
+ * @inode: an inode which is searched
+ * @lblock: logical block number to find an extent path
+ * @path: pointer to an extent path pointer (for output)
+ *
+ * ext4_ext_find_extent wrapper. Return 0 on success, or a negative error value
+ * on failure.
+ */
+static inline int
+get_ext_path(struct inode *inode, ext4_lblk_t lblock,
+ struct ext4_ext_path **orig_path)
+{
+ int ret = 0;
+ struct ext4_ext_path *path;
+
+ path = ext4_ext_find_extent(inode, lblock, *orig_path, EXT4_EX_NOCACHE);
+ if (IS_ERR(path))
+ ret = PTR_ERR(path);
+ else if (path[ext_depth(inode)].p_ext == NULL)
+ ret = -ENODATA;
+ else
+ *orig_path = path;
+
+ return ret;
+}
+
+/**
+ * copy_extent_status - Copy the extent's initialization status
+ *
+ * @src: an extent for getting initialize status
+ * @dest: an extent to be set the status
+ */
+static void
+copy_extent_status(struct ext4_extent *src, struct ext4_extent *dest)
+{
+ if (ext4_ext_is_unwritten(src))
+ ext4_ext_mark_unwritten(dest);
+ else
+ dest->ee_len = cpu_to_le16(ext4_ext_get_actual_len(dest));
+}
+
+/**
+ * mext_next_extent - Search for the next extent and set it to "extent"
+ *
+ * @inode: inode which is searched
+ * @path: this will obtain data for the next extent
+ * @extent: pointer to the next extent we have just gotten
+ *
+ * Search the next extent in the array of ext4_ext_path structure (@path)
+ * and set it to ext4_extent structure (@extent). In addition, the member of
+ * @path (->p_ext) also points the next extent. Return 0 on success, 1 if
+ * ext4_ext_path structure refers to the last extent, or a negative error
+ * value on failure.
+ */
+int
+mext_next_extent(struct inode *inode, struct ext4_ext_path *path,
+ struct ext4_extent **extent)
+{
+ struct ext4_extent_header *eh;
+ int ppos, leaf_ppos = path->p_depth;
+
+ ppos = leaf_ppos;
+ if (EXT_LAST_EXTENT(path[ppos].p_hdr) > path[ppos].p_ext) {
+ /* leaf block */
+ *extent = ++path[ppos].p_ext;
+ path[ppos].p_block = ext4_ext_pblock(path[ppos].p_ext);
+ return 0;
+ }
+
+ while (--ppos >= 0) {
+ if (EXT_LAST_INDEX(path[ppos].p_hdr) >
+ path[ppos].p_idx) {
+ int cur_ppos = ppos;
+
+ /* index block */
+ path[ppos].p_idx++;
+ path[ppos].p_block = ext4_idx_pblock(path[ppos].p_idx);
+ if (path[ppos+1].p_bh)
+ brelse(path[ppos+1].p_bh);
+ path[ppos+1].p_bh =
+ sb_bread(inode->i_sb, path[ppos].p_block);
+ if (!path[ppos+1].p_bh)
+ return -EIO;
+ path[ppos+1].p_hdr =
+ ext_block_hdr(path[ppos+1].p_bh);
+
+ /* Halfway index block */
+ while (++cur_ppos < leaf_ppos) {
+ path[cur_ppos].p_idx =
+ EXT_FIRST_INDEX(path[cur_ppos].p_hdr);
+ path[cur_ppos].p_block =
+ ext4_idx_pblock(path[cur_ppos].p_idx);
+ if (path[cur_ppos+1].p_bh)
+ brelse(path[cur_ppos+1].p_bh);
+ path[cur_ppos+1].p_bh = sb_bread(inode->i_sb,
+ path[cur_ppos].p_block);
+ if (!path[cur_ppos+1].p_bh)
+ return -EIO;
+ path[cur_ppos+1].p_hdr =
+ ext_block_hdr(path[cur_ppos+1].p_bh);
+ }
+
+ path[leaf_ppos].p_ext = *extent = NULL;
+
+ eh = path[leaf_ppos].p_hdr;
+ if (le16_to_cpu(eh->eh_entries) == 0)
+ /* empty leaf is found */
+ return -ENODATA;
+
+ /* leaf block */
+ path[leaf_ppos].p_ext = *extent =
+ EXT_FIRST_EXTENT(path[leaf_ppos].p_hdr);
+ path[leaf_ppos].p_block =
+ ext4_ext_pblock(path[leaf_ppos].p_ext);
+ return 0;
+ }
+ }
+ /* We found the last extent */
+ return 1;
+}
+
+/**
+ * ext4_double_down_write_data_sem - Acquire two inodes' write lock
+ * of i_data_sem
+ *
+ * Acquire write lock of i_data_sem of the two inodes
+ */
+void
+ext4_double_down_write_data_sem(struct inode *first, struct inode *second)
+{
+ if (first < second) {
+ down_write(&EXT4_I(first)->i_data_sem);
+ down_write_nested(&EXT4_I(second)->i_data_sem, SINGLE_DEPTH_NESTING);
+ } else {
+ down_write(&EXT4_I(second)->i_data_sem);
+ down_write_nested(&EXT4_I(first)->i_data_sem, SINGLE_DEPTH_NESTING);
+
+ }
+}
+
+/**
+ * ext4_double_up_write_data_sem - Release two inodes' write lock of i_data_sem
+ *
+ * @orig_inode: original inode structure to be released its lock first
+ * @donor_inode: donor inode structure to be released its lock second
+ * Release write lock of i_data_sem of two inodes (orig and donor).
+ */
+void
+ext4_double_up_write_data_sem(struct inode *orig_inode,
+ struct inode *donor_inode)
+{
+ up_write(&EXT4_I(orig_inode)->i_data_sem);
+ up_write(&EXT4_I(donor_inode)->i_data_sem);
+}
+
+/**
+ * mext_insert_across_blocks - Insert extents across leaf block
+ *
+ * @handle: journal handle
+ * @orig_inode: original inode
+ * @o_start: first original extent to be changed
+ * @o_end: last original extent to be changed
+ * @start_ext: first new extent to be inserted
+ * @new_ext: middle of new extent to be inserted
+ * @end_ext: last new extent to be inserted
+ *
+ * Allocate a new leaf block and insert extents into it. Return 0 on success,
+ * or a negative error value on failure.
+ */
+static int
+mext_insert_across_blocks(handle_t *handle, struct inode *orig_inode,
+ struct ext4_extent *o_start, struct ext4_extent *o_end,
+ struct ext4_extent *start_ext, struct ext4_extent *new_ext,
+ struct ext4_extent *end_ext)
+{
+ struct ext4_ext_path *orig_path = NULL;
+ ext4_lblk_t eblock = 0;
+ int new_flag = 0;
+ int end_flag = 0;
+ int err = 0;
+
+ if (start_ext->ee_len && new_ext->ee_len && end_ext->ee_len) {
+ if (o_start == o_end) {
+
+ /* start_ext new_ext end_ext
+ * donor |---------|-----------|--------|
+ * orig |------------------------------|
+ */
+ end_flag = 1;
+ } else {
+
+ /* start_ext new_ext end_ext
+ * donor |---------|----------|---------|
+ * orig |---------------|--------------|
+ */
+ o_end->ee_block = end_ext->ee_block;
+ o_end->ee_len = end_ext->ee_len;
+ ext4_ext_store_pblock(o_end, ext4_ext_pblock(end_ext));
+ }
+
+ o_start->ee_len = start_ext->ee_len;
+ eblock = le32_to_cpu(start_ext->ee_block);
+ new_flag = 1;
+
+ } else if (start_ext->ee_len && new_ext->ee_len &&
+ !end_ext->ee_len && o_start == o_end) {
+
+ /* start_ext new_ext
+ * donor |--------------|---------------|
+ * orig |------------------------------|
+ */
+ o_start->ee_len = start_ext->ee_len;
+ eblock = le32_to_cpu(start_ext->ee_block);
+ new_flag = 1;
+
+ } else if (!start_ext->ee_len && new_ext->ee_len &&
+ end_ext->ee_len && o_start == o_end) {
+
+ /* new_ext end_ext
+ * donor |--------------|---------------|
+ * orig |------------------------------|
+ */
+ o_end->ee_block = end_ext->ee_block;
+ o_end->ee_len = end_ext->ee_len;
+ ext4_ext_store_pblock(o_end, ext4_ext_pblock(end_ext));
+
+ /*
+ * Set 0 to the extent block if new_ext was
+ * the first block.
+ */
+ if (new_ext->ee_block)
+ eblock = le32_to_cpu(new_ext->ee_block);
+
+ new_flag = 1;
+ } else {
+ ext4_debug("ext4 move extent: Unexpected insert case\n");
+ return -EIO;
+ }
+
+ if (new_flag) {
+ err = get_ext_path(orig_inode, eblock, &orig_path);
+ if (err)
+ goto out;
+
+ if (ext4_ext_insert_extent(handle, orig_inode,
+ orig_path, new_ext, 0))
+ goto out;
+ }
+
+ if (end_flag) {
+ err = get_ext_path(orig_inode,
+ le32_to_cpu(end_ext->ee_block) - 1, &orig_path);
+ if (err)
+ goto out;
+
+ if (ext4_ext_insert_extent(handle, orig_inode,
+ orig_path, end_ext, 0))
+ goto out;
+ }
+out:
+ if (orig_path) {
+ ext4_ext_drop_refs(orig_path);
+ kfree(orig_path);
+ }
+
+ return err;
+
+}
+
+/**
+ * mext_insert_inside_block - Insert new extent to the extent block
+ *
+ * @o_start: first original extent to be moved
+ * @o_end: last original extent to be moved
+ * @start_ext: first new extent to be inserted
+ * @new_ext: middle of new extent to be inserted
+ * @end_ext: last new extent to be inserted
+ * @eh: extent header of target leaf block
+ * @range_to_move: used to decide how to insert extent
+ *
+ * Insert extents into the leaf block. The extent (@o_start) is overwritten
+ * by inserted extents.
+ */
+static void
+mext_insert_inside_block(struct ext4_extent *o_start,
+ struct ext4_extent *o_end,
+ struct ext4_extent *start_ext,
+ struct ext4_extent *new_ext,
+ struct ext4_extent *end_ext,
+ struct ext4_extent_header *eh,
+ int range_to_move)
+{
+ int i = 0;
+ unsigned long len;
+
+ /* Move the existing extents */
+ if (range_to_move && o_end < EXT_LAST_EXTENT(eh)) {
+ len = (unsigned long)(EXT_LAST_EXTENT(eh) + 1) -
+ (unsigned long)(o_end + 1);
+ memmove(o_end + 1 + range_to_move, o_end + 1, len);
+ }
+
+ /* Insert start entry */
+ if (start_ext->ee_len)
+ o_start[i++].ee_len = start_ext->ee_len;
+
+ /* Insert new entry */
+ if (new_ext->ee_len) {
+ o_start[i] = *new_ext;
+ ext4_ext_store_pblock(&o_start[i++], ext4_ext_pblock(new_ext));
+ }
+
+ /* Insert end entry */
+ if (end_ext->ee_len)
+ o_start[i] = *end_ext;
+
+ /* Increment the total entries counter on the extent block */
+ le16_add_cpu(&eh->eh_entries, range_to_move);
+}
+
+/**
+ * mext_insert_extents - Insert new extent
+ *
+ * @handle: journal handle
+ * @orig_inode: original inode
+ * @orig_path: path indicates first extent to be changed
+ * @o_start: first original extent to be changed
+ * @o_end: last original extent to be changed
+ * @start_ext: first new extent to be inserted
+ * @new_ext: middle of new extent to be inserted
+ * @end_ext: last new extent to be inserted
+ *
+ * Call the function to insert extents. If we cannot add more extents into
+ * the leaf block, we call mext_insert_across_blocks() to create a
+ * new leaf block. Otherwise call mext_insert_inside_block(). Return 0
+ * on success, or a negative error value on failure.
+ */
+static int
+mext_insert_extents(handle_t *handle, struct inode *orig_inode,
+ struct ext4_ext_path *orig_path,
+ struct ext4_extent *o_start,
+ struct ext4_extent *o_end,
+ struct ext4_extent *start_ext,
+ struct ext4_extent *new_ext,
+ struct ext4_extent *end_ext)
+{
+ struct ext4_extent_header *eh;
+ unsigned long need_slots, slots_range;
+ int range_to_move, depth, ret;
+
+ /*
+ * The extents need to be inserted
+ * start_extent + new_extent + end_extent.
+ */
+ need_slots = (start_ext->ee_len ? 1 : 0) + (end_ext->ee_len ? 1 : 0) +
+ (new_ext->ee_len ? 1 : 0);
+
+ /* The number of slots between start and end */
+ slots_range = ((unsigned long)(o_end + 1) - (unsigned long)o_start + 1)
+ / sizeof(struct ext4_extent);
+
+ /* Range to move the end of extent */
+ range_to_move = need_slots - slots_range;
+ depth = orig_path->p_depth;
+ orig_path += depth;
+ eh = orig_path->p_hdr;
+
+ if (depth) {
+ /* Register to journal */
+ BUFFER_TRACE(orig_path->p_bh, "get_write_access");
+ ret = ext4_journal_get_write_access(handle, orig_path->p_bh);
+ if (ret)
+ return ret;
+ }
+
+ /* Expansion */
+ if (range_to_move > 0 &&
+ (range_to_move > le16_to_cpu(eh->eh_max)
+ - le16_to_cpu(eh->eh_entries))) {
+
+ ret = mext_insert_across_blocks(handle, orig_inode, o_start,
+ o_end, start_ext, new_ext, end_ext);
+ if (ret < 0)
+ return ret;
+ } else
+ mext_insert_inside_block(o_start, o_end, start_ext, new_ext,
+ end_ext, eh, range_to_move);
+
+ return ext4_ext_dirty(handle, orig_inode, orig_path);
+}
+
+/**
+ * mext_leaf_block - Move one leaf extent block into the inode.
+ *
+ * @handle: journal handle
+ * @orig_inode: original inode
+ * @orig_path: path indicates first extent to be changed
+ * @dext: donor extent
+ * @from: start offset on the target file
+ *
+ * In order to insert extents into the leaf block, we must divide the extent
+ * in the leaf block into three extents. The one is located to be inserted
+ * extents, and the others are located around it.
+ *
+ * Therefore, this function creates structures to save extents of the leaf
+ * block, and inserts extents by calling mext_insert_extents() with
+ * created extents. Return 0 on success, or a negative error value on failure.
+ */
+static int
+mext_leaf_block(handle_t *handle, struct inode *orig_inode,
+ struct ext4_ext_path *orig_path, struct ext4_extent *dext,
+ ext4_lblk_t *from)
+{
+ struct ext4_extent *oext, *o_start, *o_end, *prev_ext;
+ struct ext4_extent new_ext, start_ext, end_ext;
+ ext4_lblk_t new_ext_end;
+ int oext_alen, new_ext_alen, end_ext_alen;
+ int depth = ext_depth(orig_inode);
+ int ret;
+
+ start_ext.ee_block = end_ext.ee_block = 0;
+ o_start = o_end = oext = orig_path[depth].p_ext;
+ oext_alen = ext4_ext_get_actual_len(oext);
+ start_ext.ee_len = end_ext.ee_len = 0;
+
+ new_ext.ee_block = cpu_to_le32(*from);
+ ext4_ext_store_pblock(&new_ext, ext4_ext_pblock(dext));
+ new_ext.ee_len = dext->ee_len;
+ new_ext_alen = ext4_ext_get_actual_len(&new_ext);
+ new_ext_end = le32_to_cpu(new_ext.ee_block) + new_ext_alen - 1;
+
+ /*
+ * Case: original extent is first
+ * oext |--------|
+ * new_ext |--|
+ * start_ext |--|
+ */
+ if (le32_to_cpu(oext->ee_block) < le32_to_cpu(new_ext.ee_block) &&
+ le32_to_cpu(new_ext.ee_block) <
+ le32_to_cpu(oext->ee_block) + oext_alen) {
+ start_ext.ee_len = cpu_to_le16(le32_to_cpu(new_ext.ee_block) -
+ le32_to_cpu(oext->ee_block));
+ start_ext.ee_block = oext->ee_block;
+ copy_extent_status(oext, &start_ext);
+ } else if (oext > EXT_FIRST_EXTENT(orig_path[depth].p_hdr)) {
+ prev_ext = oext - 1;
+ /*
+ * We can merge new_ext into previous extent,
+ * if these are contiguous and same extent type.
+ */
+ if (ext4_can_extents_be_merged(orig_inode, prev_ext,
+ &new_ext)) {
+ o_start = prev_ext;
+ start_ext.ee_len = cpu_to_le16(
+ ext4_ext_get_actual_len(prev_ext) +
+ new_ext_alen);
+ start_ext.ee_block = oext->ee_block;
+ copy_extent_status(prev_ext, &start_ext);
+ new_ext.ee_len = 0;
+ }
+ }
+
+ /*
+ * Case: new_ext_end must be less than oext
+ * oext |-----------|
+ * new_ext |-------|
+ */
+ if (le32_to_cpu(oext->ee_block) + oext_alen - 1 < new_ext_end) {
+ EXT4_ERROR_INODE(orig_inode,
+ "new_ext_end(%u) should be less than or equal to "
+ "oext->ee_block(%u) + oext_alen(%d) - 1",
+ new_ext_end, le32_to_cpu(oext->ee_block),
+ oext_alen);
+ ret = -EIO;
+ goto out;
+ }
+
+ /*
+ * Case: new_ext is smaller than original extent
+ * oext |---------------|
+ * new_ext |-----------|
+ * end_ext |---|
+ */
+ if (le32_to_cpu(oext->ee_block) <= new_ext_end &&
+ new_ext_end < le32_to_cpu(oext->ee_block) + oext_alen - 1) {
+ end_ext.ee_len =
+ cpu_to_le16(le32_to_cpu(oext->ee_block) +
+ oext_alen - 1 - new_ext_end);
+ copy_extent_status(oext, &end_ext);
+ end_ext_alen = ext4_ext_get_actual_len(&end_ext);
+ ext4_ext_store_pblock(&end_ext,
+ (ext4_ext_pblock(o_end) + oext_alen - end_ext_alen));
+ end_ext.ee_block =
+ cpu_to_le32(le32_to_cpu(o_end->ee_block) +
+ oext_alen - end_ext_alen);
+ }
+
+ ret = mext_insert_extents(handle, orig_inode, orig_path, o_start,
+ o_end, &start_ext, &new_ext, &end_ext);
+out:
+ return ret;
+}
+
+/**
+ * mext_calc_swap_extents - Calculate extents for extent swapping.
+ *
+ * @tmp_dext: the extent that will belong to the original inode
+ * @tmp_oext: the extent that will belong to the donor inode
+ * @orig_off: block offset of original inode
+ * @donor_off: block offset of donor inode
+ * @max_count: the maximum length of extents
+ *
+ * Return 0 on success, or a negative error value on failure.
+ */
+static int
+mext_calc_swap_extents(struct ext4_extent *tmp_dext,
+ struct ext4_extent *tmp_oext,
+ ext4_lblk_t orig_off, ext4_lblk_t donor_off,
+ ext4_lblk_t max_count)
+{
+ ext4_lblk_t diff, orig_diff;
+ struct ext4_extent dext_old, oext_old;
+
+ BUG_ON(orig_off != donor_off);
+
+ /* original and donor extents have to cover the same block offset */
+ if (orig_off < le32_to_cpu(tmp_oext->ee_block) ||
+ le32_to_cpu(tmp_oext->ee_block) +
+ ext4_ext_get_actual_len(tmp_oext) - 1 < orig_off)
+ return -ENODATA;
+
+ if (orig_off < le32_to_cpu(tmp_dext->ee_block) ||
+ le32_to_cpu(tmp_dext->ee_block) +
+ ext4_ext_get_actual_len(tmp_dext) - 1 < orig_off)
+ return -ENODATA;
+
+ dext_old = *tmp_dext;
+ oext_old = *tmp_oext;
+
+ /* When tmp_dext is too large, pick up the target range. */
+ diff = donor_off - le32_to_cpu(tmp_dext->ee_block);
+
+ ext4_ext_store_pblock(tmp_dext, ext4_ext_pblock(tmp_dext) + diff);
+ le32_add_cpu(&tmp_dext->ee_block, diff);
+ le16_add_cpu(&tmp_dext->ee_len, -diff);
+
+ if (max_count < ext4_ext_get_actual_len(tmp_dext))
+ tmp_dext->ee_len = cpu_to_le16(max_count);
+
+ orig_diff = orig_off - le32_to_cpu(tmp_oext->ee_block);
+ ext4_ext_store_pblock(tmp_oext, ext4_ext_pblock(tmp_oext) + orig_diff);
+
+ /* Adjust extent length if donor extent is larger than orig */
+ if (ext4_ext_get_actual_len(tmp_dext) >
+ ext4_ext_get_actual_len(tmp_oext) - orig_diff)
+ tmp_dext->ee_len = cpu_to_le16(le16_to_cpu(tmp_oext->ee_len) -
+ orig_diff);
+
+ tmp_oext->ee_len = cpu_to_le16(ext4_ext_get_actual_len(tmp_dext));
+
+ copy_extent_status(&oext_old, tmp_dext);
+ copy_extent_status(&dext_old, tmp_oext);
+
+ return 0;
+}
+
+/**
+ * mext_check_coverage - Check that all extents in range has the same type
+ *
+ * @inode: inode in question
+ * @from: block offset of inode
+ * @count: block count to be checked
+ * @unwritten: extents expected to be unwritten
+ * @err: pointer to save error value
+ *
+ * Return 1 if all extents in range has expected type, and zero otherwise.
+ */
+static int
+mext_check_coverage(struct inode *inode, ext4_lblk_t from, ext4_lblk_t count,
+ int unwritten, int *err)
+{
+ struct ext4_ext_path *path = NULL;
+ struct ext4_extent *ext;
+ int ret = 0;
+ ext4_lblk_t last = from + count;
+ while (from < last) {
+ *err = get_ext_path(inode, from, &path);
+ if (*err)
+ goto out;
+ ext = path[ext_depth(inode)].p_ext;
+ if (unwritten != ext4_ext_is_unwritten(ext))
+ goto out;
+ from += ext4_ext_get_actual_len(ext);
+ ext4_ext_drop_refs(path);
+ }
+ ret = 1;
+out:
+ if (path) {
+ ext4_ext_drop_refs(path);
+ kfree(path);
+ }
+ return ret;
+}
+
+/**
+ * mext_replace_branches - Replace original extents with new extents
+ *
+ * @handle: journal handle
+ * @orig_inode: original inode
+ * @donor_inode: donor inode
+ * @from: block offset of orig_inode
+ * @count: block count to be replaced
+ * @err: pointer to save return value
+ *
+ * Replace original inode extents and donor inode extents page by page.
+ * We implement this replacement in the following three steps:
+ * 1. Save the block information of original and donor inodes into
+ * dummy extents.
+ * 2. Change the block information of original inode to point at the
+ * donor inode blocks.
+ * 3. Change the block information of donor inode to point at the saved
+ * original inode blocks in the dummy extents.
+ *
+ * Return replaced block count.
+ */
+static int
+mext_replace_branches(handle_t *handle, struct inode *orig_inode,
+ struct inode *donor_inode, ext4_lblk_t from,
+ ext4_lblk_t count, int *err)
+{
+ struct ext4_ext_path *orig_path = NULL;
+ struct ext4_ext_path *donor_path = NULL;
+ struct ext4_extent *oext, *dext;
+ struct ext4_extent tmp_dext, tmp_oext;
+ ext4_lblk_t orig_off = from, donor_off = from;
+ int depth;
+ int replaced_count = 0;
+ int dext_alen;
+
+ *err = ext4_es_remove_extent(orig_inode, from, count);
+ if (*err)
+ goto out;
+
+ *err = ext4_es_remove_extent(donor_inode, from, count);
+ if (*err)
+ goto out;
+
+ /* Get the original extent for the block "orig_off" */
+ *err = get_ext_path(orig_inode, orig_off, &orig_path);
+ if (*err)
+ goto out;
+
+ /* Get the donor extent for the head */
+ *err = get_ext_path(donor_inode, donor_off, &donor_path);
+ if (*err)
+ goto out;
+ depth = ext_depth(orig_inode);
+ oext = orig_path[depth].p_ext;
+ tmp_oext = *oext;
+
+ depth = ext_depth(donor_inode);
+ dext = donor_path[depth].p_ext;
+ if (unlikely(!dext))
+ goto missing_donor_extent;
+ tmp_dext = *dext;
+
+ *err = mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off,
+ donor_off, count);
+ if (*err)
+ goto out;
+
+ /* Loop for the donor extents */
+ while (1) {
+ /* The extent for donor must be found. */
+ if (unlikely(!dext)) {
+ missing_donor_extent:
+ EXT4_ERROR_INODE(donor_inode,
+ "The extent for donor must be found");
+ *err = -EIO;
+ goto out;
+ } else if (donor_off != le32_to_cpu(tmp_dext.ee_block)) {
+ EXT4_ERROR_INODE(donor_inode,
+ "Donor offset(%u) and the first block of donor "
+ "extent(%u) should be equal",
+ donor_off,
+ le32_to_cpu(tmp_dext.ee_block));
+ *err = -EIO;
+ goto out;
+ }
+
+ /* Set donor extent to orig extent */
+ *err = mext_leaf_block(handle, orig_inode,
+ orig_path, &tmp_dext, &orig_off);
+ if (*err)
+ goto out;
+
+ /* Set orig extent to donor extent */
+ *err = mext_leaf_block(handle, donor_inode,
+ donor_path, &tmp_oext, &donor_off);
+ if (*err)
+ goto out;
+
+ dext_alen = ext4_ext_get_actual_len(&tmp_dext);
+ replaced_count += dext_alen;
+ donor_off += dext_alen;
+ orig_off += dext_alen;
+
+ BUG_ON(replaced_count > count);
+ /* Already moved the expected blocks */
+ if (replaced_count >= count)
+ break;
+
+ if (orig_path)
+ ext4_ext_drop_refs(orig_path);
+ *err = get_ext_path(orig_inode, orig_off, &orig_path);
+ if (*err)
+ goto out;
+ depth = ext_depth(orig_inode);
+ oext = orig_path[depth].p_ext;
+ tmp_oext = *oext;
+
+ if (donor_path)
+ ext4_ext_drop_refs(donor_path);
+ *err = get_ext_path(donor_inode, donor_off, &donor_path);
+ if (*err)
+ goto out;
+ depth = ext_depth(donor_inode);
+ dext = donor_path[depth].p_ext;
+ tmp_dext = *dext;
+
+ *err = mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off,
+ donor_off, count - replaced_count);
+ if (*err)
+ goto out;
+ }
+
+out:
+ if (orig_path) {
+ ext4_ext_drop_refs(orig_path);
+ kfree(orig_path);
+ }
+ if (donor_path) {
+ ext4_ext_drop_refs(donor_path);
+ kfree(donor_path);
+ }
+
+ return replaced_count;
+}
+
+/**
+ * mext_page_double_lock - Grab and lock pages on both @inode1 and @inode2
+ *
+ * @inode1: the inode structure
+ * @inode2: the inode structure
+ * @index: page index
+ * @page: result page vector
+ *
+ * Grab two locked pages for inode's by inode order
+ */
+static int
+mext_page_double_lock(struct inode *inode1, struct inode *inode2,
+ pgoff_t index, struct page *page[2])
+{
+ struct address_space *mapping[2];
+ unsigned fl = AOP_FLAG_NOFS;
+
+ BUG_ON(!inode1 || !inode2);
+ if (inode1 < inode2) {
+ mapping[0] = inode1->i_mapping;
+ mapping[1] = inode2->i_mapping;
+ } else {
+ mapping[0] = inode2->i_mapping;
+ mapping[1] = inode1->i_mapping;
+ }
+
+ page[0] = grab_cache_page_write_begin(mapping[0], index, fl);
+ if (!page[0])
+ return -ENOMEM;
+
+ page[1] = grab_cache_page_write_begin(mapping[1], index, fl);
+ if (!page[1]) {
+ unlock_page(page[0]);
+ page_cache_release(page[0]);
+ return -ENOMEM;
+ }
+ /*
+ * grab_cache_page_write_begin() may not wait on page's writeback if
+ * BDI not demand that. But it is reasonable to be very conservative
+ * here and explicitly wait on page's writeback
+ */
+ wait_on_page_writeback(page[0]);
+ wait_on_page_writeback(page[1]);
+ if (inode1 > inode2) {
+ struct page *tmp;
+ tmp = page[0];
+ page[0] = page[1];
+ page[1] = tmp;
+ }
+ return 0;
+}
+
+/* Force page buffers uptodate w/o dropping page's lock */
+static int
+mext_page_mkuptodate(struct page *page, unsigned from, unsigned to)
+{
+ struct inode *inode = page->mapping->host;
+ sector_t block;
+ struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
+ unsigned int blocksize, block_start, block_end;
+ int i, err, nr = 0, partial = 0;
+ BUG_ON(!PageLocked(page));
+ BUG_ON(PageWriteback(page));
+
+ if (PageUptodate(page))
+ return 0;
+
+ blocksize = 1 << inode->i_blkbits;
+ if (!page_has_buffers(page))
+ create_empty_buffers(page, blocksize, 0);
+
+ head = page_buffers(page);
+ block = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
+ for (bh = head, block_start = 0; bh != head || !block_start;
+ block++, block_start = block_end, bh = bh->b_this_page) {
+ block_end = block_start + blocksize;
+ if (block_end <= from || block_start >= to) {
+ if (!buffer_uptodate(bh))
+ partial = 1;
+ continue;
+ }
+ if (buffer_uptodate(bh))
+ continue;
+ if (!buffer_mapped(bh)) {
+ err = ext4_get_block(inode, block, bh, 0);
+ if (err) {
+ SetPageError(page);
+ return err;
+ }
+ if (!buffer_mapped(bh)) {
+ zero_user(page, block_start, blocksize);
+ set_buffer_uptodate(bh);
+ continue;
+ }
+ }
+ BUG_ON(nr >= MAX_BUF_PER_PAGE);
+ arr[nr++] = bh;
+ }
+ /* No io required */
+ if (!nr)
+ goto out;
+
+ for (i = 0; i < nr; i++) {
+ bh = arr[i];
+ if (!bh_uptodate_or_lock(bh)) {
+ err = bh_submit_read(bh);
+ if (err)
+ return err;
+ }
+ }
+out:
+ if (!partial)
+ SetPageUptodate(page);
+ return 0;
+}
+
+/**
+ * move_extent_per_page - Move extent data per page
+ *
+ * @o_filp: file structure of original file
+ * @donor_inode: donor inode
+ * @orig_page_offset: page index on original file
+ * @data_offset_in_page: block index where data swapping starts
+ * @block_len_in_page: the number of blocks to be swapped
+ * @unwritten: orig extent is unwritten or not
+ * @err: pointer to save return value
+ *
+ * Save the data in original inode blocks and replace original inode extents
+ * with donor inode extents by calling mext_replace_branches().
+ * Finally, write out the saved data in new original inode blocks. Return
+ * replaced block count.
+ */
+static int
+move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
+ pgoff_t orig_page_offset, int data_offset_in_page,
+ int block_len_in_page, int unwritten, int *err)
+{
+ struct inode *orig_inode = file_inode(o_filp);
+ struct page *pagep[2] = {NULL, NULL};
+ handle_t *handle;
+ ext4_lblk_t orig_blk_offset;
+ unsigned long blocksize = orig_inode->i_sb->s_blocksize;
+ unsigned int w_flags = 0;
+ unsigned int tmp_data_size, data_size, replaced_size;
+ int err2, jblocks, retries = 0;
+ int replaced_count = 0;
+ int from = data_offset_in_page << orig_inode->i_blkbits;
+ int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits;
+
+ /*
+ * It needs twice the amount of ordinary journal buffers because
+ * inode and donor_inode may change each different metadata blocks.
+ */
+again:
+ *err = 0;
+ jblocks = ext4_writepage_trans_blocks(orig_inode) * 2;
+ handle = ext4_journal_start(orig_inode, EXT4_HT_MOVE_EXTENTS, jblocks);
+ if (IS_ERR(handle)) {
+ *err = PTR_ERR(handle);
+ return 0;
+ }
+
+ if (segment_eq(get_fs(), KERNEL_DS))
+ w_flags |= AOP_FLAG_UNINTERRUPTIBLE;
+
+ orig_blk_offset = orig_page_offset * blocks_per_page +
+ data_offset_in_page;
+
+ /* Calculate data_size */
+ if ((orig_blk_offset + block_len_in_page - 1) ==
+ ((orig_inode->i_size - 1) >> orig_inode->i_blkbits)) {
+ /* Replace the last block */
+ tmp_data_size = orig_inode->i_size & (blocksize - 1);
+ /*
+ * If data_size equal zero, it shows data_size is multiples of
+ * blocksize. So we set appropriate value.
+ */
+ if (tmp_data_size == 0)
+ tmp_data_size = blocksize;
+
+ data_size = tmp_data_size +
+ ((block_len_in_page - 1) << orig_inode->i_blkbits);
+ } else
+ data_size = block_len_in_page << orig_inode->i_blkbits;
+
+ replaced_size = data_size;
+
+ *err = mext_page_double_lock(orig_inode, donor_inode, orig_page_offset,
+ pagep);
+ if (unlikely(*err < 0))
+ goto stop_journal;
+ /*
+ * If orig extent was unwritten it can become initialized
+ * at any time after i_data_sem was dropped, in order to
+ * serialize with delalloc we have recheck extent while we
+ * hold page's lock, if it is still the case data copy is not
+ * necessary, just swap data blocks between orig and donor.
+ */
+ if (unwritten) {
+ ext4_double_down_write_data_sem(orig_inode, donor_inode);
+ /* If any of extents in range became initialized we have to
+ * fallback to data copying */
+ unwritten = mext_check_coverage(orig_inode, orig_blk_offset,
+ block_len_in_page, 1, err);
+ if (*err)
+ goto drop_data_sem;
+
+ unwritten &= mext_check_coverage(donor_inode, orig_blk_offset,
+ block_len_in_page, 1, err);
+ if (*err)
+ goto drop_data_sem;
+
+ if (!unwritten) {
+ ext4_double_up_write_data_sem(orig_inode, donor_inode);
+ goto data_copy;
+ }
+ if ((page_has_private(pagep[0]) &&
+ !try_to_release_page(pagep[0], 0)) ||
+ (page_has_private(pagep[1]) &&
+ !try_to_release_page(pagep[1], 0))) {
+ *err = -EBUSY;
+ goto drop_data_sem;
+ }
+ replaced_count = mext_replace_branches(handle, orig_inode,
+ donor_inode, orig_blk_offset,
+ block_len_in_page, err);
+ drop_data_sem:
+ ext4_double_up_write_data_sem(orig_inode, donor_inode);
+ goto unlock_pages;
+ }
+data_copy:
+ *err = mext_page_mkuptodate(pagep[0], from, from + replaced_size);
+ if (*err)
+ goto unlock_pages;
+
+ /* At this point all buffers in range are uptodate, old mapping layout
+ * is no longer required, try to drop it now. */
+ if ((page_has_private(pagep[0]) && !try_to_release_page(pagep[0], 0)) ||
+ (page_has_private(pagep[1]) && !try_to_release_page(pagep[1], 0))) {
+ *err = -EBUSY;
+ goto unlock_pages;
+ }
+
+ replaced_count = mext_replace_branches(handle, orig_inode, donor_inode,
+ orig_blk_offset,
+ block_len_in_page, err);
+ if (*err) {
+ if (replaced_count) {
+ block_len_in_page = replaced_count;
+ replaced_size =
+ block_len_in_page << orig_inode->i_blkbits;
+ } else
+ goto unlock_pages;
+ }
+ /* Perform all necessary steps similar write_begin()/write_end()
+ * but keeping in mind that i_size will not change */
+ *err = __block_write_begin(pagep[0], from, replaced_size,
+ ext4_get_block);
+ if (!*err)
+ *err = block_commit_write(pagep[0], from, from + replaced_size);
+
+ if (unlikely(*err < 0))
+ goto repair_branches;
+
+ /* Even in case of data=writeback it is reasonable to pin
+ * inode to transaction, to prevent unexpected data loss */
+ *err = ext4_jbd2_file_inode(handle, orig_inode);
+
+unlock_pages:
+ unlock_page(pagep[0]);
+ page_cache_release(pagep[0]);
+ unlock_page(pagep[1]);
+ page_cache_release(pagep[1]);
+stop_journal:
+ ext4_journal_stop(handle);
+ /* Buffer was busy because probably is pinned to journal transaction,
+ * force transaction commit may help to free it. */
+ if (*err == -EBUSY && ext4_should_retry_alloc(orig_inode->i_sb,
+ &retries))
+ goto again;
+ return replaced_count;
+
+repair_branches:
+ /*
+ * This should never ever happen!
+ * Extents are swapped already, but we are not able to copy data.
+ * Try to swap extents to it's original places
+ */
+ ext4_double_down_write_data_sem(orig_inode, donor_inode);
+ replaced_count = mext_replace_branches(handle, donor_inode, orig_inode,
+ orig_blk_offset,
+ block_len_in_page, &err2);
+ ext4_double_up_write_data_sem(orig_inode, donor_inode);
+ if (replaced_count != block_len_in_page) {
+ EXT4_ERROR_INODE_BLOCK(orig_inode, (sector_t)(orig_blk_offset),
+ "Unable to copy data block,"
+ " data will be lost.");
+ *err = -EIO;
+ }
+ replaced_count = 0;
+ goto unlock_pages;
+}
+
+/**
+ * mext_check_arguments - Check whether move extent can be done
+ *
+ * @orig_inode: original inode
+ * @donor_inode: donor inode
+ * @orig_start: logical start offset in block for orig
+ * @donor_start: logical start offset in block for donor
+ * @len: the number of blocks to be moved
+ *
+ * Check the arguments of ext4_move_extents() whether the files can be
+ * exchanged with each other.
+ * Return 0 on success, or a negative error value on failure.
+ */
+static int
+mext_check_arguments(struct inode *orig_inode,
+ struct inode *donor_inode, __u64 orig_start,
+ __u64 donor_start, __u64 *len)
+{
+ ext4_lblk_t orig_blocks, donor_blocks;
+ unsigned int blkbits = orig_inode->i_blkbits;
+ unsigned int blocksize = 1 << blkbits;
+
+ if (donor_inode->i_mode & (S_ISUID|S_ISGID)) {
+ ext4_debug("ext4 move extent: suid or sgid is set"
+ " to donor file [ino:orig %lu, donor %lu]\n",
+ orig_inode->i_ino, donor_inode->i_ino);
+ return -EINVAL;
+ }
+
+ if (IS_IMMUTABLE(donor_inode) || IS_APPEND(donor_inode))
+ return -EPERM;
+
+ /* Ext4 move extent does not support swapfile */
+ if (IS_SWAPFILE(orig_inode) || IS_SWAPFILE(donor_inode)) {
+ ext4_debug("ext4 move extent: The argument files should "
+ "not be swapfile [ino:orig %lu, donor %lu]\n",
+ orig_inode->i_ino, donor_inode->i_ino);
+ return -EINVAL;
+ }
+
+ /* Ext4 move extent supports only extent based file */
+ if (!(ext4_test_inode_flag(orig_inode, EXT4_INODE_EXTENTS))) {
+ ext4_debug("ext4 move extent: orig file is not extents "
+ "based file [ino:orig %lu]\n", orig_inode->i_ino);
+ return -EOPNOTSUPP;
+ } else if (!(ext4_test_inode_flag(donor_inode, EXT4_INODE_EXTENTS))) {
+ ext4_debug("ext4 move extent: donor file is not extents "
+ "based file [ino:donor %lu]\n", donor_inode->i_ino);
+ return -EOPNOTSUPP;
+ }
+
+ if ((!orig_inode->i_size) || (!donor_inode->i_size)) {
+ ext4_debug("ext4 move extent: File size is 0 byte\n");
+ return -EINVAL;
+ }
+
+ /* Start offset should be same */
+ if (orig_start != donor_start) {
+ ext4_debug("ext4 move extent: orig and donor's start "
+ "offset are not same [ino:orig %lu, donor %lu]\n",
+ orig_inode->i_ino, donor_inode->i_ino);
+ return -EINVAL;
+ }
+
+ if ((orig_start >= EXT_MAX_BLOCKS) ||
+ (*len > EXT_MAX_BLOCKS) ||
+ (orig_start + *len >= EXT_MAX_BLOCKS)) {
+ ext4_debug("ext4 move extent: Can't handle over [%u] blocks "
+ "[ino:orig %lu, donor %lu]\n", EXT_MAX_BLOCKS,
+ orig_inode->i_ino, donor_inode->i_ino);
+ return -EINVAL;
+ }
+
+ if (orig_inode->i_size > donor_inode->i_size) {
+ donor_blocks = (donor_inode->i_size + blocksize - 1) >> blkbits;
+ /* TODO: eliminate this artificial restriction */
+ if (orig_start >= donor_blocks) {
+ ext4_debug("ext4 move extent: orig start offset "
+ "[%llu] should be less than donor file blocks "
+ "[%u] [ino:orig %lu, donor %lu]\n",
+ orig_start, donor_blocks,
+ orig_inode->i_ino, donor_inode->i_ino);
+ return -EINVAL;
+ }
+
+ /* TODO: eliminate this artificial restriction */
+ if (orig_start + *len > donor_blocks) {
+ ext4_debug("ext4 move extent: End offset [%llu] should "
+ "be less than donor file blocks [%u]."
+ "So adjust length from %llu to %llu "
+ "[ino:orig %lu, donor %lu]\n",
+ orig_start + *len, donor_blocks,
+ *len, donor_blocks - orig_start,
+ orig_inode->i_ino, donor_inode->i_ino);
+ *len = donor_blocks - orig_start;
+ }
+ } else {
+ orig_blocks = (orig_inode->i_size + blocksize - 1) >> blkbits;
+ if (orig_start >= orig_blocks) {
+ ext4_debug("ext4 move extent: start offset [%llu] "
+ "should be less than original file blocks "
+ "[%u] [ino:orig %lu, donor %lu]\n",
+ orig_start, orig_blocks,
+ orig_inode->i_ino, donor_inode->i_ino);
+ return -EINVAL;
+ }
+
+ if (orig_start + *len > orig_blocks) {
+ ext4_debug("ext4 move extent: Adjust length "
+ "from %llu to %llu. Because it should be "
+ "less than original file blocks "
+ "[ino:orig %lu, donor %lu]\n",
+ *len, orig_blocks - orig_start,
+ orig_inode->i_ino, donor_inode->i_ino);
+ *len = orig_blocks - orig_start;
+ }
+ }
+
+ if (!*len) {
+ ext4_debug("ext4 move extent: len should not be 0 "
+ "[ino:orig %lu, donor %lu]\n", orig_inode->i_ino,
+ donor_inode->i_ino);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+/**
+ * ext4_move_extents - Exchange the specified range of a file
+ *
+ * @o_filp: file structure of the original file
+ * @d_filp: file structure of the donor file
+ * @orig_start: start offset in block for orig
+ * @donor_start: start offset in block for donor
+ * @len: the number of blocks to be moved
+ * @moved_len: moved block length
+ *
+ * This function returns 0 and moved block length is set in moved_len
+ * if succeed, otherwise returns error value.
+ *
+ * Note: ext4_move_extents() proceeds the following order.
+ * 1:ext4_move_extents() calculates the last block number of moving extent
+ * function by the start block number (orig_start) and the number of blocks
+ * to be moved (len) specified as arguments.
+ * If the {orig, donor}_start points a hole, the extent's start offset
+ * pointed by ext_cur (current extent), holecheck_path, orig_path are set
+ * after hole behind.
+ * 2:Continue step 3 to step 5, until the holecheck_path points to last_extent
+ * or the ext_cur exceeds the block_end which is last logical block number.
+ * 3:To get the length of continues area, call mext_next_extent()
+ * specified with the ext_cur (initial value is holecheck_path) re-cursive,
+ * until find un-continuous extent, the start logical block number exceeds
+ * the block_end or the extent points to the last extent.
+ * 4:Exchange the original inode data with donor inode data
+ * from orig_page_offset to seq_end_page.
+ * The start indexes of data are specified as arguments.
+ * That of the original inode is orig_page_offset,
+ * and the donor inode is also orig_page_offset
+ * (To easily handle blocksize != pagesize case, the offset for the
+ * donor inode is block unit).
+ * 5:Update holecheck_path and orig_path to points a next proceeding extent,
+ * then returns to step 2.
+ * 6:Release holecheck_path, orig_path and set the len to moved_len
+ * which shows the number of moved blocks.
+ * The moved_len is useful for the command to calculate the file offset
+ * for starting next move extent ioctl.
+ * 7:Return 0 on success, or a negative error value on failure.
+ */
+int
+ext4_move_extents(struct file *o_filp, struct file *d_filp,
+ __u64 orig_start, __u64 donor_start, __u64 len,
+ __u64 *moved_len)
+{
+ struct inode *orig_inode = file_inode(o_filp);
+ struct inode *donor_inode = file_inode(d_filp);
+ struct ext4_ext_path *orig_path = NULL, *holecheck_path = NULL;
+ struct ext4_extent *ext_prev, *ext_cur, *ext_dummy;
+ ext4_lblk_t block_start = orig_start;
+ ext4_lblk_t block_end, seq_start, add_blocks, file_end, seq_blocks = 0;
+ ext4_lblk_t rest_blocks;
+ pgoff_t orig_page_offset = 0, seq_end_page;
+ int ret, depth, last_extent = 0;
+ int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits;
+ int data_offset_in_page;
+ int block_len_in_page;
+ int unwritten;
+
+ if (orig_inode->i_sb != donor_inode->i_sb) {
+ ext4_debug("ext4 move extent: The argument files "
+ "should be in same FS [ino:orig %lu, donor %lu]\n",
+ orig_inode->i_ino, donor_inode->i_ino);
+ return -EINVAL;
+ }
+
+ /* orig and donor should be different inodes */
+ if (orig_inode == donor_inode) {
+ ext4_debug("ext4 move extent: The argument files should not "
+ "be same inode [ino:orig %lu, donor %lu]\n",
+ orig_inode->i_ino, donor_inode->i_ino);
+ return -EINVAL;
+ }
+
+ /* Regular file check */
+ if (!S_ISREG(orig_inode->i_mode) || !S_ISREG(donor_inode->i_mode)) {
+ ext4_debug("ext4 move extent: The argument files should be "
+ "regular file [ino:orig %lu, donor %lu]\n",
+ orig_inode->i_ino, donor_inode->i_ino);
+ return -EINVAL;
+ }
+ /* TODO: This is non obvious task to swap blocks for inodes with full
+ jornaling enabled */
+ if (ext4_should_journal_data(orig_inode) ||
+ ext4_should_journal_data(donor_inode)) {
+ return -EINVAL;
+ }
+ /* Protect orig and donor inodes against a truncate */
+ lock_two_nondirectories(orig_inode, donor_inode);
+
+ /* Wait for all existing dio workers */
+ ext4_inode_block_unlocked_dio(orig_inode);
+ ext4_inode_block_unlocked_dio(donor_inode);
+ inode_dio_wait(orig_inode);
+ inode_dio_wait(donor_inode);
+
+ /* Protect extent tree against block allocations via delalloc */
+ ext4_double_down_write_data_sem(orig_inode, donor_inode);
+ /* Check the filesystem environment whether move_extent can be done */
+ ret = mext_check_arguments(orig_inode, donor_inode, orig_start,
+ donor_start, &len);
+ if (ret)
+ goto out;
+
+ file_end = (i_size_read(orig_inode) - 1) >> orig_inode->i_blkbits;
+ block_end = block_start + len - 1;
+ if (file_end < block_end)
+ len -= block_end - file_end;
+
+ ret = get_ext_path(orig_inode, block_start, &orig_path);
+ if (ret)
+ goto out;
+
+ /* Get path structure to check the hole */
+ ret = get_ext_path(orig_inode, block_start, &holecheck_path);
+ if (ret)
+ goto out;
+
+ depth = ext_depth(orig_inode);
+ ext_cur = holecheck_path[depth].p_ext;
+
+ /*
+ * Get proper starting location of block replacement if block_start was
+ * within the hole.
+ */
+ if (le32_to_cpu(ext_cur->ee_block) +
+ ext4_ext_get_actual_len(ext_cur) - 1 < block_start) {
+ /*
+ * The hole exists between extents or the tail of
+ * original file.
+ */
+ last_extent = mext_next_extent(orig_inode,
+ holecheck_path, &ext_cur);
+ if (last_extent < 0) {
+ ret = last_extent;
+ goto out;
+ }
+ last_extent = mext_next_extent(orig_inode, orig_path,
+ &ext_dummy);
+ if (last_extent < 0) {
+ ret = last_extent;
+ goto out;
+ }
+ seq_start = le32_to_cpu(ext_cur->ee_block);
+ } else if (le32_to_cpu(ext_cur->ee_block) > block_start)
+ /* The hole exists at the beginning of original file. */
+ seq_start = le32_to_cpu(ext_cur->ee_block);
+ else
+ seq_start = block_start;
+
+ /* No blocks within the specified range. */
+ if (le32_to_cpu(ext_cur->ee_block) > block_end) {
+ ext4_debug("ext4 move extent: The specified range of file "
+ "may be the hole\n");
+ ret = -EINVAL;
+ goto out;
+ }
+
+ /* Adjust start blocks */
+ add_blocks = min(le32_to_cpu(ext_cur->ee_block) +
+ ext4_ext_get_actual_len(ext_cur), block_end + 1) -
+ max(le32_to_cpu(ext_cur->ee_block), block_start);
+
+ while (!last_extent && le32_to_cpu(ext_cur->ee_block) <= block_end) {
+ seq_blocks += add_blocks;
+
+ /* Adjust tail blocks */
+ if (seq_start + seq_blocks - 1 > block_end)
+ seq_blocks = block_end - seq_start + 1;
+
+ ext_prev = ext_cur;
+ last_extent = mext_next_extent(orig_inode, holecheck_path,
+ &ext_cur);
+ if (last_extent < 0) {
+ ret = last_extent;
+ break;
+ }
+ add_blocks = ext4_ext_get_actual_len(ext_cur);
+
+ /*
+ * Extend the length of contiguous block (seq_blocks)
+ * if extents are contiguous.
+ */
+ if (ext4_can_extents_be_merged(orig_inode,
+ ext_prev, ext_cur) &&
+ block_end >= le32_to_cpu(ext_cur->ee_block) &&
+ !last_extent)
+ continue;
+
+ /* Is original extent is unwritten */
+ unwritten = ext4_ext_is_unwritten(ext_prev);
+
+ data_offset_in_page = seq_start % blocks_per_page;
+
+ /*
+ * Calculate data blocks count that should be swapped
+ * at the first page.
+ */
+ if (data_offset_in_page + seq_blocks > blocks_per_page) {
+ /* Swapped blocks are across pages */
+ block_len_in_page =
+ blocks_per_page - data_offset_in_page;
+ } else {
+ /* Swapped blocks are in a page */
+ block_len_in_page = seq_blocks;
+ }
+
+ orig_page_offset = seq_start >>
+ (PAGE_CACHE_SHIFT - orig_inode->i_blkbits);
+ seq_end_page = (seq_start + seq_blocks - 1) >>
+ (PAGE_CACHE_SHIFT - orig_inode->i_blkbits);
+ seq_start = le32_to_cpu(ext_cur->ee_block);
+ rest_blocks = seq_blocks;
+
+ /*
+ * Up semaphore to avoid following problems:
+ * a. transaction deadlock among ext4_journal_start,
+ * ->write_begin via pagefault, and jbd2_journal_commit
+ * b. racing with ->readpage, ->write_begin, and ext4_get_block
+ * in move_extent_per_page
+ */
+ ext4_double_up_write_data_sem(orig_inode, donor_inode);
+
+ while (orig_page_offset <= seq_end_page) {
+
+ /* Swap original branches with new branches */
+ block_len_in_page = move_extent_per_page(
+ o_filp, donor_inode,
+ orig_page_offset,
+ data_offset_in_page,
+ block_len_in_page,
+ unwritten, &ret);
+
+ /* Count how many blocks we have exchanged */
+ *moved_len += block_len_in_page;
+ if (ret < 0)
+ break;
+ if (*moved_len > len) {
+ EXT4_ERROR_INODE(orig_inode,
+ "We replaced blocks too much! "
+ "sum of replaced: %llu requested: %llu",
+ *moved_len, len);
+ ret = -EIO;
+ break;
+ }
+
+ orig_page_offset++;
+ data_offset_in_page = 0;
+ rest_blocks -= block_len_in_page;
+ if (rest_blocks > blocks_per_page)
+ block_len_in_page = blocks_per_page;
+ else
+ block_len_in_page = rest_blocks;
+ }
+
+ ext4_double_down_write_data_sem(orig_inode, donor_inode);
+ if (ret < 0)
+ break;
+
+ /* Decrease buffer counter */
+ if (holecheck_path)
+ ext4_ext_drop_refs(holecheck_path);
+ ret = get_ext_path(orig_inode, seq_start, &holecheck_path);
+ if (ret)
+ break;
+ depth = holecheck_path->p_depth;
+
+ /* Decrease buffer counter */
+ if (orig_path)
+ ext4_ext_drop_refs(orig_path);
+ ret = get_ext_path(orig_inode, seq_start, &orig_path);
+ if (ret)
+ break;
+
+ ext_cur = holecheck_path[depth].p_ext;
+ add_blocks = ext4_ext_get_actual_len(ext_cur);
+ seq_blocks = 0;
+
+ }
+out:
+ if (*moved_len) {
+ ext4_discard_preallocations(orig_inode);
+ ext4_discard_preallocations(donor_inode);
+ }
+
+ if (orig_path) {
+ ext4_ext_drop_refs(orig_path);
+ kfree(orig_path);
+ }
+ if (holecheck_path) {
+ ext4_ext_drop_refs(holecheck_path);
+ kfree(holecheck_path);
+ }
+ ext4_double_up_write_data_sem(orig_inode, donor_inode);
+ ext4_inode_resume_unlocked_dio(orig_inode);
+ ext4_inode_resume_unlocked_dio(donor_inode);
+ unlock_two_nondirectories(orig_inode, donor_inode);
+
+ return ret;
+}
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 28aa2ed4297..3520ab8a663 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -28,39 +28,131 @@
#include <linux/pagemap.h>
#include <linux/jbd2.h>
#include <linux/time.h>
-#include <linux/ext4_fs.h>
-#include <linux/ext4_jbd2.h>
#include <linux/fcntl.h>
#include <linux/stat.h>
#include <linux/string.h>
#include <linux/quotaops.h>
#include <linux/buffer_head.h>
#include <linux/bio.h>
+#include "ext4.h"
+#include "ext4_jbd2.h"
-#include "namei.h"
#include "xattr.h"
#include "acl.h"
+#include <trace/events/ext4.h>
/*
* define how far ahead to read directories while searching them.
*/
#define NAMEI_RA_CHUNKS 2
#define NAMEI_RA_BLOCKS 4
#define NAMEI_RA_SIZE (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS)
-#define NAMEI_RA_INDEX(c,b) (((c) * NAMEI_RA_BLOCKS) + (b))
static struct buffer_head *ext4_append(handle_t *handle,
struct inode *inode,
- ext4_lblk_t *block, int *err)
+ ext4_lblk_t *block)
{
struct buffer_head *bh;
+ int err = 0;
+
+ if (unlikely(EXT4_SB(inode->i_sb)->s_max_dir_size_kb &&
+ ((inode->i_size >> 10) >=
+ EXT4_SB(inode->i_sb)->s_max_dir_size_kb)))
+ return ERR_PTR(-ENOSPC);
*block = inode->i_size >> inode->i_sb->s_blocksize_bits;
- if ((bh = ext4_bread(handle, inode, *block, 1, err))) {
- inode->i_size += inode->i_sb->s_blocksize;
- EXT4_I(inode)->i_disksize = inode->i_size;
- ext4_journal_get_write_access(handle,bh);
+ bh = ext4_bread(handle, inode, *block, 1, &err);
+ if (!bh)
+ return ERR_PTR(err);
+ inode->i_size += inode->i_sb->s_blocksize;
+ EXT4_I(inode)->i_disksize = inode->i_size;
+ BUFFER_TRACE(bh, "get_write_access");
+ err = ext4_journal_get_write_access(handle, bh);
+ if (err) {
+ brelse(bh);
+ ext4_std_error(inode->i_sb, err);
+ return ERR_PTR(err);
+ }
+ return bh;
+}
+
+static int ext4_dx_csum_verify(struct inode *inode,
+ struct ext4_dir_entry *dirent);
+
+typedef enum {
+ EITHER, INDEX, DIRENT
+} dirblock_type_t;
+
+#define ext4_read_dirblock(inode, block, type) \
+ __ext4_read_dirblock((inode), (block), (type), __LINE__)
+
+static struct buffer_head *__ext4_read_dirblock(struct inode *inode,
+ ext4_lblk_t block,
+ dirblock_type_t type,
+ unsigned int line)
+{
+ struct buffer_head *bh;
+ struct ext4_dir_entry *dirent;
+ int err = 0, is_dx_block = 0;
+
+ bh = ext4_bread(NULL, inode, block, 0, &err);
+ if (!bh) {
+ if (err == 0) {
+ ext4_error_inode(inode, __func__, line, block,
+ "Directory hole found");
+ return ERR_PTR(-EIO);
+ }
+ __ext4_warning(inode->i_sb, __func__, line,
+ "error reading directory block "
+ "(ino %lu, block %lu)", inode->i_ino,
+ (unsigned long) block);
+ return ERR_PTR(err);
+ }
+ dirent = (struct ext4_dir_entry *) bh->b_data;
+ /* Determine whether or not we have an index block */
+ if (is_dx(inode)) {
+ if (block == 0)
+ is_dx_block = 1;
+ else if (ext4_rec_len_from_disk(dirent->rec_len,
+ inode->i_sb->s_blocksize) ==
+ inode->i_sb->s_blocksize)
+ is_dx_block = 1;
+ }
+ if (!is_dx_block && type == INDEX) {
+ ext4_error_inode(inode, __func__, line, block,
+ "directory leaf block found instead of index block");
+ return ERR_PTR(-EIO);
+ }
+ if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
+ EXT4_FEATURE_RO_COMPAT_METADATA_CSUM) ||
+ buffer_verified(bh))
+ return bh;
+
+ /*
+ * An empty leaf block can get mistaken for a index block; for
+ * this reason, we can only check the index checksum when the
+ * caller is sure it should be an index block.
+ */
+ if (is_dx_block && type == INDEX) {
+ if (ext4_dx_csum_verify(inode, dirent))
+ set_buffer_verified(bh);
+ else {
+ ext4_error_inode(inode, __func__, line, block,
+ "Directory index failed checksum");
+ brelse(bh);
+ return ERR_PTR(-EIO);
+ }
+ }
+ if (!is_dx_block) {
+ if (ext4_dirent_csum_verify(inode, dirent))
+ set_buffer_verified(bh);
+ else {
+ ext4_error_inode(inode, __func__, line, block,
+ "Directory block failed checksum");
+ brelse(bh);
+ return ERR_PTR(-EIO);
+ }
}
return bh;
}
@@ -69,10 +161,6 @@ static struct buffer_head *ext4_append(handle_t *handle,
#define assert(test) J_ASSERT(test)
#endif
-#ifndef swap
-#define swap(x, y) do { typeof(x) z = x; x = y; y = z; } while (0)
-#endif
-
#ifdef DX_DEBUG
#define dxtrace(command) command
#else
@@ -144,39 +232,281 @@ struct dx_map_entry
u16 size;
};
+/*
+ * This goes at the end of each htree block.
+ */
+struct dx_tail {
+ u32 dt_reserved;
+ __le32 dt_checksum; /* crc32c(uuid+inum+dirblock) */
+};
+
static inline ext4_lblk_t dx_get_block(struct dx_entry *entry);
static void dx_set_block(struct dx_entry *entry, ext4_lblk_t value);
-static inline unsigned dx_get_hash (struct dx_entry *entry);
-static void dx_set_hash (struct dx_entry *entry, unsigned value);
-static unsigned dx_get_count (struct dx_entry *entries);
-static unsigned dx_get_limit (struct dx_entry *entries);
-static void dx_set_count (struct dx_entry *entries, unsigned value);
-static void dx_set_limit (struct dx_entry *entries, unsigned value);
-static unsigned dx_root_limit (struct inode *dir, unsigned infosize);
-static unsigned dx_node_limit (struct inode *dir);
-static struct dx_frame *dx_probe(struct dentry *dentry,
+static inline unsigned dx_get_hash(struct dx_entry *entry);
+static void dx_set_hash(struct dx_entry *entry, unsigned value);
+static unsigned dx_get_count(struct dx_entry *entries);
+static unsigned dx_get_limit(struct dx_entry *entries);
+static void dx_set_count(struct dx_entry *entries, unsigned value);
+static void dx_set_limit(struct dx_entry *entries, unsigned value);
+static unsigned dx_root_limit(struct inode *dir, unsigned infosize);
+static unsigned dx_node_limit(struct inode *dir);
+static struct dx_frame *dx_probe(const struct qstr *d_name,
struct inode *dir,
struct dx_hash_info *hinfo,
struct dx_frame *frame,
int *err);
-static void dx_release (struct dx_frame *frames);
-static int dx_make_map (struct ext4_dir_entry_2 *de, int size,
- struct dx_hash_info *hinfo, struct dx_map_entry map[]);
+static void dx_release(struct dx_frame *frames);
+static int dx_make_map(struct ext4_dir_entry_2 *de, unsigned blocksize,
+ struct dx_hash_info *hinfo, struct dx_map_entry map[]);
static void dx_sort_map(struct dx_map_entry *map, unsigned count);
-static struct ext4_dir_entry_2 *dx_move_dirents (char *from, char *to,
- struct dx_map_entry *offsets, int count);
-static struct ext4_dir_entry_2* dx_pack_dirents (char *base, int size);
+static struct ext4_dir_entry_2 *dx_move_dirents(char *from, char *to,
+ struct dx_map_entry *offsets, int count, unsigned blocksize);
+static struct ext4_dir_entry_2* dx_pack_dirents(char *base, unsigned blocksize);
static void dx_insert_block(struct dx_frame *frame,
u32 hash, ext4_lblk_t block);
static int ext4_htree_next_block(struct inode *dir, __u32 hash,
struct dx_frame *frame,
struct dx_frame *frames,
__u32 *start_hash);
-static struct buffer_head * ext4_dx_find_entry(struct dentry *dentry,
- struct ext4_dir_entry_2 **res_dir, int *err);
+static struct buffer_head * ext4_dx_find_entry(struct inode *dir,
+ const struct qstr *d_name,
+ struct ext4_dir_entry_2 **res_dir,
+ int *err);
static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
struct inode *inode);
+/* checksumming functions */
+void initialize_dirent_tail(struct ext4_dir_entry_tail *t,
+ unsigned int blocksize)
+{
+ memset(t, 0, sizeof(struct ext4_dir_entry_tail));
+ t->det_rec_len = ext4_rec_len_to_disk(
+ sizeof(struct ext4_dir_entry_tail), blocksize);
+ t->det_reserved_ft = EXT4_FT_DIR_CSUM;
+}
+
+/* Walk through a dirent block to find a checksum "dirent" at the tail */
+static struct ext4_dir_entry_tail *get_dirent_tail(struct inode *inode,
+ struct ext4_dir_entry *de)
+{
+ struct ext4_dir_entry_tail *t;
+
+#ifdef PARANOID
+ struct ext4_dir_entry *d, *top;
+
+ d = de;
+ top = (struct ext4_dir_entry *)(((void *)de) +
+ (EXT4_BLOCK_SIZE(inode->i_sb) -
+ sizeof(struct ext4_dir_entry_tail)));
+ while (d < top && d->rec_len)
+ d = (struct ext4_dir_entry *)(((void *)d) +
+ le16_to_cpu(d->rec_len));
+
+ if (d != top)
+ return NULL;
+
+ t = (struct ext4_dir_entry_tail *)d;
+#else
+ t = EXT4_DIRENT_TAIL(de, EXT4_BLOCK_SIZE(inode->i_sb));
+#endif
+
+ if (t->det_reserved_zero1 ||
+ le16_to_cpu(t->det_rec_len) != sizeof(struct ext4_dir_entry_tail) ||
+ t->det_reserved_zero2 ||
+ t->det_reserved_ft != EXT4_FT_DIR_CSUM)
+ return NULL;
+
+ return t;
+}
+
+static __le32 ext4_dirent_csum(struct inode *inode,
+ struct ext4_dir_entry *dirent, int size)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ __u32 csum;
+
+ csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)dirent, size);
+ return cpu_to_le32(csum);
+}
+
+static void warn_no_space_for_csum(struct inode *inode)
+{
+ ext4_warning(inode->i_sb, "no space in directory inode %lu leaf for "
+ "checksum. Please run e2fsck -D.", inode->i_ino);
+}
+
+int ext4_dirent_csum_verify(struct inode *inode, struct ext4_dir_entry *dirent)
+{
+ struct ext4_dir_entry_tail *t;
+
+ if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
+ EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+ return 1;
+
+ t = get_dirent_tail(inode, dirent);
+ if (!t) {
+ warn_no_space_for_csum(inode);
+ return 0;
+ }
+
+ if (t->det_checksum != ext4_dirent_csum(inode, dirent,
+ (void *)t - (void *)dirent))
+ return 0;
+
+ return 1;
+}
+
+static void ext4_dirent_csum_set(struct inode *inode,
+ struct ext4_dir_entry *dirent)
+{
+ struct ext4_dir_entry_tail *t;
+
+ if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
+ EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+ return;
+
+ t = get_dirent_tail(inode, dirent);
+ if (!t) {
+ warn_no_space_for_csum(inode);
+ return;
+ }
+
+ t->det_checksum = ext4_dirent_csum(inode, dirent,
+ (void *)t - (void *)dirent);
+}
+
+int ext4_handle_dirty_dirent_node(handle_t *handle,
+ struct inode *inode,
+ struct buffer_head *bh)
+{
+ ext4_dirent_csum_set(inode, (struct ext4_dir_entry *)bh->b_data);
+ return ext4_handle_dirty_metadata(handle, inode, bh);
+}
+
+static struct dx_countlimit *get_dx_countlimit(struct inode *inode,
+ struct ext4_dir_entry *dirent,
+ int *offset)
+{
+ struct ext4_dir_entry *dp;
+ struct dx_root_info *root;
+ int count_offset;
+
+ if (le16_to_cpu(dirent->rec_len) == EXT4_BLOCK_SIZE(inode->i_sb))
+ count_offset = 8;
+ else if (le16_to_cpu(dirent->rec_len) == 12) {
+ dp = (struct ext4_dir_entry *)(((void *)dirent) + 12);
+ if (le16_to_cpu(dp->rec_len) !=
+ EXT4_BLOCK_SIZE(inode->i_sb) - 12)
+ return NULL;
+ root = (struct dx_root_info *)(((void *)dp + 12));
+ if (root->reserved_zero ||
+ root->info_length != sizeof(struct dx_root_info))
+ return NULL;
+ count_offset = 32;
+ } else
+ return NULL;
+
+ if (offset)
+ *offset = count_offset;
+ return (struct dx_countlimit *)(((void *)dirent) + count_offset);
+}
+
+static __le32 ext4_dx_csum(struct inode *inode, struct ext4_dir_entry *dirent,
+ int count_offset, int count, struct dx_tail *t)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ __u32 csum;
+ __le32 save_csum;
+ int size;
+
+ size = count_offset + (count * sizeof(struct dx_entry));
+ save_csum = t->dt_checksum;
+ t->dt_checksum = 0;
+ csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)dirent, size);
+ csum = ext4_chksum(sbi, csum, (__u8 *)t, sizeof(struct dx_tail));
+ t->dt_checksum = save_csum;
+
+ return cpu_to_le32(csum);
+}
+
+static int ext4_dx_csum_verify(struct inode *inode,
+ struct ext4_dir_entry *dirent)
+{
+ struct dx_countlimit *c;
+ struct dx_tail *t;
+ int count_offset, limit, count;
+
+ if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
+ EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+ return 1;
+
+ c = get_dx_countlimit(inode, dirent, &count_offset);
+ if (!c) {
+ EXT4_ERROR_INODE(inode, "dir seems corrupt? Run e2fsck -D.");
+ return 1;
+ }
+ limit = le16_to_cpu(c->limit);
+ count = le16_to_cpu(c->count);
+ if (count_offset + (limit * sizeof(struct dx_entry)) >
+ EXT4_BLOCK_SIZE(inode->i_sb) - sizeof(struct dx_tail)) {
+ warn_no_space_for_csum(inode);
+ return 1;
+ }
+ t = (struct dx_tail *)(((struct dx_entry *)c) + limit);
+
+ if (t->dt_checksum != ext4_dx_csum(inode, dirent, count_offset,
+ count, t))
+ return 0;
+ return 1;
+}
+
+static void ext4_dx_csum_set(struct inode *inode, struct ext4_dir_entry *dirent)
+{
+ struct dx_countlimit *c;
+ struct dx_tail *t;
+ int count_offset, limit, count;
+
+ if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
+ EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+ return;
+
+ c = get_dx_countlimit(inode, dirent, &count_offset);
+ if (!c) {
+ EXT4_ERROR_INODE(inode, "dir seems corrupt? Run e2fsck -D.");
+ return;
+ }
+ limit = le16_to_cpu(c->limit);
+ count = le16_to_cpu(c->count);
+ if (count_offset + (limit * sizeof(struct dx_entry)) >
+ EXT4_BLOCK_SIZE(inode->i_sb) - sizeof(struct dx_tail)) {
+ warn_no_space_for_csum(inode);
+ return;
+ }
+ t = (struct dx_tail *)(((struct dx_entry *)c) + limit);
+
+ t->dt_checksum = ext4_dx_csum(inode, dirent, count_offset, count, t);
+}
+
+static inline int ext4_handle_dirty_dx_node(handle_t *handle,
+ struct inode *inode,
+ struct buffer_head *bh)
+{
+ ext4_dx_csum_set(inode, (struct ext4_dir_entry *)bh->b_data);
+ return ext4_handle_dirty_metadata(handle, inode, bh);
+}
+
+/*
+ * p is at least 6 bytes before the end of page
+ */
+static inline struct ext4_dir_entry_2 *
+ext4_next_entry(struct ext4_dir_entry_2 *p, unsigned long blocksize)
+{
+ return (struct ext4_dir_entry_2 *)((char *)p +
+ ext4_rec_len_from_disk(p->rec_len, blocksize));
+}
+
/*
* Future: use high four bits of block for coalesce-on-delete flags
* Mask them off for now.
@@ -192,59 +522,67 @@ static inline void dx_set_block(struct dx_entry *entry, ext4_lblk_t value)
entry->block = cpu_to_le32(value);
}
-static inline unsigned dx_get_hash (struct dx_entry *entry)
+static inline unsigned dx_get_hash(struct dx_entry *entry)
{
return le32_to_cpu(entry->hash);
}
-static inline void dx_set_hash (struct dx_entry *entry, unsigned value)
+static inline void dx_set_hash(struct dx_entry *entry, unsigned value)
{
entry->hash = cpu_to_le32(value);
}
-static inline unsigned dx_get_count (struct dx_entry *entries)
+static inline unsigned dx_get_count(struct dx_entry *entries)
{
return le16_to_cpu(((struct dx_countlimit *) entries)->count);
}
-static inline unsigned dx_get_limit (struct dx_entry *entries)
+static inline unsigned dx_get_limit(struct dx_entry *entries)
{
return le16_to_cpu(((struct dx_countlimit *) entries)->limit);
}
-static inline void dx_set_count (struct dx_entry *entries, unsigned value)
+static inline void dx_set_count(struct dx_entry *entries, unsigned value)
{
((struct dx_countlimit *) entries)->count = cpu_to_le16(value);
}
-static inline void dx_set_limit (struct dx_entry *entries, unsigned value)
+static inline void dx_set_limit(struct dx_entry *entries, unsigned value)
{
((struct dx_countlimit *) entries)->limit = cpu_to_le16(value);
}
-static inline unsigned dx_root_limit (struct inode *dir, unsigned infosize)
+static inline unsigned dx_root_limit(struct inode *dir, unsigned infosize)
{
unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(1) -
EXT4_DIR_REC_LEN(2) - infosize;
- return 0? 20: entry_space / sizeof(struct dx_entry);
+
+ if (EXT4_HAS_RO_COMPAT_FEATURE(dir->i_sb,
+ EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+ entry_space -= sizeof(struct dx_tail);
+ return entry_space / sizeof(struct dx_entry);
}
-static inline unsigned dx_node_limit (struct inode *dir)
+static inline unsigned dx_node_limit(struct inode *dir)
{
unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(0);
- return 0? 22: entry_space / sizeof(struct dx_entry);
+
+ if (EXT4_HAS_RO_COMPAT_FEATURE(dir->i_sb,
+ EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+ entry_space -= sizeof(struct dx_tail);
+ return entry_space / sizeof(struct dx_entry);
}
/*
* Debug
*/
#ifdef DX_DEBUG
-static void dx_show_index (char * label, struct dx_entry *entries)
+static void dx_show_index(char * label, struct dx_entry *entries)
{
int i, n = dx_get_count (entries);
- printk("%s index ", label);
+ printk(KERN_DEBUG "%s index ", label);
for (i = 0; i < n; i++) {
- printk("%x->%lu ", i? dx_get_hash(entries + i) :
+ printk("%x->%lu ", i ? dx_get_hash(entries + i) :
0, (unsigned long)dx_get_block(entries + i));
}
printk("\n");
@@ -276,12 +614,12 @@ static struct stats dx_show_leaf(struct dx_hash_info *hinfo, struct ext4_dir_ent
while (len--) printk("%c", *name++);
ext4fs_dirhash(de->name, de->name_len, &h);
printk(":%x.%u ", h.hash,
- ((char *) de - base));
+ (unsigned) ((char *) de - base));
}
space += EXT4_DIR_REC_LEN(de->name_len);
names++;
}
- de = ext4_next_entry(de);
+ de = ext4_next_entry(de, size);
}
printk("(%i)\n", names);
return (struct stats) { names, space, 1 };
@@ -291,7 +629,7 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir,
struct dx_entry *entries, int levels)
{
unsigned blocksize = dir->i_sb->s_blocksize;
- unsigned count = dx_get_count (entries), names = 0, space = 0, i;
+ unsigned count = dx_get_count(entries), names = 0, space = 0, i;
unsigned bcount = 0;
struct buffer_head *bh;
int err;
@@ -310,11 +648,12 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir,
names += stats.names;
space += stats.space;
bcount += stats.bcount;
- brelse (bh);
+ brelse(bh);
}
if (bcount)
- printk("%snames %u, fullness %u (%u%%)\n", levels?"":" ",
- names, space/bcount,(space/bcount)*100/blocksize);
+ printk(KERN_DEBUG "%snames %u, fullness %u (%u%%)\n",
+ levels ? "" : " ", names, space/bcount,
+ (space/bcount)*100/blocksize);
return (struct stats) { names, space, bcount};
}
#endif /* DX_DEBUG */
@@ -329,7 +668,7 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir,
* back to userspace.
*/
static struct dx_frame *
-dx_probe(struct dentry *dentry, struct inode *dir,
+dx_probe(const struct qstr *d_name, struct inode *dir,
struct dx_hash_info *hinfo, struct dx_frame *frame_in, int *err)
{
unsigned count, indirect;
@@ -340,30 +679,31 @@ dx_probe(struct dentry *dentry, struct inode *dir,
u32 hash;
frame->bh = NULL;
- if (dentry)
- dir = dentry->d_parent->d_inode;
- if (!(bh = ext4_bread (NULL,dir, 0, 0, err)))
+ bh = ext4_read_dirblock(dir, 0, INDEX);
+ if (IS_ERR(bh)) {
+ *err = PTR_ERR(bh);
goto fail;
+ }
root = (struct dx_root *) bh->b_data;
if (root->info.hash_version != DX_HASH_TEA &&
root->info.hash_version != DX_HASH_HALF_MD4 &&
root->info.hash_version != DX_HASH_LEGACY) {
- ext4_warning(dir->i_sb, __FUNCTION__,
- "Unrecognised inode hash code %d",
+ ext4_warning(dir->i_sb, "Unrecognised inode hash code %d",
root->info.hash_version);
brelse(bh);
*err = ERR_BAD_DX_DIR;
goto fail;
}
hinfo->hash_version = root->info.hash_version;
+ if (hinfo->hash_version <= DX_HASH_TEA)
+ hinfo->hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned;
hinfo->seed = EXT4_SB(dir->i_sb)->s_hash_seed;
- if (dentry)
- ext4fs_dirhash(dentry->d_name.name, dentry->d_name.len, hinfo);
+ if (d_name)
+ ext4fs_dirhash(d_name->name, d_name->len, hinfo);
hash = hinfo->hash;
if (root->info.unused_flags & 1) {
- ext4_warning(dir->i_sb, __FUNCTION__,
- "Unimplemented inode hash flags: %#06x",
+ ext4_warning(dir->i_sb, "Unimplemented inode hash flags: %#06x",
root->info.unused_flags);
brelse(bh);
*err = ERR_BAD_DX_DIR;
@@ -371,8 +711,7 @@ dx_probe(struct dentry *dentry, struct inode *dir,
}
if ((indirect = root->info.indirect_levels) > 1) {
- ext4_warning(dir->i_sb, __FUNCTION__,
- "Unimplemented inode hash depth: %#06x",
+ ext4_warning(dir->i_sb, "Unimplemented inode hash depth: %#06x",
root->info.indirect_levels);
brelse(bh);
*err = ERR_BAD_DX_DIR;
@@ -384,19 +723,18 @@ dx_probe(struct dentry *dentry, struct inode *dir,
if (dx_get_limit(entries) != dx_root_limit(dir,
root->info.info_length)) {
- ext4_warning(dir->i_sb, __FUNCTION__,
- "dx entry: limit != root limit");
+ ext4_warning(dir->i_sb, "dx entry: limit != root limit");
brelse(bh);
*err = ERR_BAD_DX_DIR;
goto fail;
}
- dxtrace (printk("Look up %x", hash));
+ dxtrace(printk("Look up %x", hash));
while (1)
{
count = dx_get_count(entries);
if (!count || count > dx_get_limit(entries)) {
- ext4_warning(dir->i_sb, __FUNCTION__,
+ ext4_warning(dir->i_sb,
"dx entry: no count or count > limit");
brelse(bh);
*err = ERR_BAD_DX_DIR;
@@ -437,11 +775,15 @@ dx_probe(struct dentry *dentry, struct inode *dir,
frame->entries = entries;
frame->at = at;
if (!indirect--) return frame;
- if (!(bh = ext4_bread (NULL,dir, dx_get_block(at), 0, err)))
+ bh = ext4_read_dirblock(dir, dx_get_block(at), INDEX);
+ if (IS_ERR(bh)) {
+ *err = PTR_ERR(bh);
goto fail2;
- at = entries = ((struct dx_node *) bh->b_data)->entries;
+ }
+ entries = ((struct dx_node *) bh->b_data)->entries;
+
if (dx_get_limit(entries) != dx_node_limit (dir)) {
- ext4_warning(dir->i_sb, __FUNCTION__,
+ ext4_warning(dir->i_sb,
"dx entry: limit != node limit");
brelse(bh);
*err = ERR_BAD_DX_DIR;
@@ -457,8 +799,8 @@ fail2:
}
fail:
if (*err == ERR_BAD_DX_DIR)
- ext4_warning(dir->i_sb, __FUNCTION__,
- "Corrupt dir inode %ld, running e2fsck is "
+ ext4_warning(dir->i_sb,
+ "Corrupt dir inode %lu, running e2fsck is "
"recommended.", dir->i_ino);
return NULL;
}
@@ -497,7 +839,7 @@ static int ext4_htree_next_block(struct inode *dir, __u32 hash,
{
struct dx_frame *p;
struct buffer_head *bh;
- int err, num_frames = 0;
+ int num_frames = 0;
__u32 bhash;
p = frame;
@@ -536,11 +878,11 @@ static int ext4_htree_next_block(struct inode *dir, __u32 hash,
* block so no check is necessary
*/
while (num_frames--) {
- if (!(bh = ext4_bread(NULL, dir, dx_get_block(p->at),
- 0, &err)))
- return err; /* Failure */
+ bh = ext4_read_dirblock(dir, dx_get_block(p->at), INDEX);
+ if (IS_ERR(bh))
+ return PTR_ERR(bh);
p++;
- brelse (p->bh);
+ brelse(p->bh);
p->bh = bh;
p->at = p->entries = ((struct dx_node *) bh->b_data)->entries;
}
@@ -549,15 +891,6 @@ static int ext4_htree_next_block(struct inode *dir, __u32 hash,
/*
- * p is at least 6 bytes before the end of page
- */
-static inline struct ext4_dir_entry_2 *ext4_next_entry(struct ext4_dir_entry_2 *p)
-{
- return (struct ext4_dir_entry_2 *)((char *)p +
- ext4_rec_len_from_disk(p->rec_len));
-}
-
-/*
* This function fills a red-black tree with information from a
* directory block. It returns the number directory entries loaded
* into the tree. If there is an error it is returned in err.
@@ -569,26 +902,25 @@ static int htree_dirblock_to_tree(struct file *dir_file,
{
struct buffer_head *bh;
struct ext4_dir_entry_2 *de, *top;
- int err, count = 0;
+ int err = 0, count = 0;
dxtrace(printk(KERN_INFO "In htree dirblock_to_tree: block %lu\n",
(unsigned long)block));
- if (!(bh = ext4_bread (NULL, dir, block, 0, &err)))
- return err;
+ bh = ext4_read_dirblock(dir, block, DIRENT);
+ if (IS_ERR(bh))
+ return PTR_ERR(bh);
de = (struct ext4_dir_entry_2 *) bh->b_data;
top = (struct ext4_dir_entry_2 *) ((char *) de +
dir->i_sb->s_blocksize -
EXT4_DIR_REC_LEN(0));
- for (; de < top; de = ext4_next_entry(de)) {
- if (!ext4_check_dir_entry("htree_dirblock_to_tree", dir, de, bh,
- (block<<EXT4_BLOCK_SIZE_BITS(dir->i_sb))
- +((char *)de - bh->b_data))) {
- /* On error, skip the f_pos to the next block. */
- dir_file->f_pos = (dir_file->f_pos |
- (dir->i_sb->s_blocksize - 1)) + 1;
- brelse (bh);
- return count;
+ for (; de < top; de = ext4_next_entry(de, dir->i_sb->s_blocksize)) {
+ if (ext4_check_dir_entry(dir, NULL, de, bh,
+ bh->b_data, bh->b_size,
+ (block<<EXT4_BLOCK_SIZE_BITS(dir->i_sb))
+ + ((char *)de - bh->b_data))) {
+ /* silently ignore the rest of the block */
+ break;
}
ext4fs_dirhash(de->name, de->name_len, hinfo);
if ((hinfo->hash < start_hash) ||
@@ -629,12 +961,26 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
int ret, err;
__u32 hashval;
- dxtrace(printk("In htree_fill_tree, start hash: %x:%x\n", start_hash,
- start_minor_hash));
- dir = dir_file->f_path.dentry->d_inode;
- if (!(EXT4_I(dir)->i_flags & EXT4_INDEX_FL)) {
+ dxtrace(printk(KERN_DEBUG "In htree_fill_tree, start hash: %x:%x\n",
+ start_hash, start_minor_hash));
+ dir = file_inode(dir_file);
+ if (!(ext4_test_inode_flag(dir, EXT4_INODE_INDEX))) {
hinfo.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version;
+ if (hinfo.hash_version <= DX_HASH_TEA)
+ hinfo.hash_version +=
+ EXT4_SB(dir->i_sb)->s_hash_unsigned;
hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed;
+ if (ext4_has_inline_data(dir)) {
+ int has_inline_data = 1;
+ count = htree_inlinedir_to_tree(dir_file, dir, 0,
+ &hinfo, start_hash,
+ start_minor_hash,
+ &has_inline_data);
+ if (has_inline_data) {
+ *next_hash = ~0;
+ return count;
+ }
+ }
count = htree_dirblock_to_tree(dir_file, dir, 0, &hinfo,
start_hash, start_minor_hash);
*next_hash = ~0;
@@ -642,7 +988,7 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
}
hinfo.hash = start_hash;
hinfo.minor_hash = 0;
- frame = dx_probe(NULL, dir_file->f_path.dentry->d_inode, &hinfo, frames, &err);
+ frame = dx_probe(NULL, dir, &hinfo, frames, &err);
if (!frame)
return err;
@@ -655,7 +1001,7 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
}
if (start_hash < 2 || (start_hash ==2 && start_minor_hash==0)) {
de = (struct ext4_dir_entry_2 *) frames[0].bh->b_data;
- de = ext4_next_entry(de);
+ de = ext4_next_entry(de, dir->i_sb->s_blocksize);
if ((err = ext4_htree_store_dirent(dir_file, 2, 0, de)) != 0)
goto errout;
count++;
@@ -688,14 +1034,23 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
break;
}
dx_release(frames);
- dxtrace(printk("Fill tree: returned %d entries, next hash: %x\n",
- count, *next_hash));
+ dxtrace(printk(KERN_DEBUG "Fill tree: returned %d entries, "
+ "next hash: %x\n", count, *next_hash));
return count;
errout:
dx_release(frames);
return (err);
}
+static inline int search_dirblock(struct buffer_head *bh,
+ struct inode *dir,
+ const struct qstr *d_name,
+ unsigned int offset,
+ struct ext4_dir_entry_2 **res_dir)
+{
+ return search_dir(bh, bh->b_data, dir->i_sb->s_blocksize, dir,
+ d_name, offset, res_dir);
+}
/*
* Directory block splitting, compacting
@@ -705,26 +1060,26 @@ errout:
* Create map of hash values, offsets, and sizes, stored at end of block.
* Returns number of entries mapped.
*/
-static int dx_make_map (struct ext4_dir_entry_2 *de, int size,
- struct dx_hash_info *hinfo, struct dx_map_entry *map_tail)
+static int dx_make_map(struct ext4_dir_entry_2 *de, unsigned blocksize,
+ struct dx_hash_info *hinfo,
+ struct dx_map_entry *map_tail)
{
int count = 0;
char *base = (char *) de;
struct dx_hash_info h = *hinfo;
- while ((char *) de < base + size)
- {
+ while ((char *) de < base + blocksize) {
if (de->name_len && de->inode) {
ext4fs_dirhash(de->name, de->name_len, &h);
map_tail--;
map_tail->hash = h.hash;
- map_tail->offs = (u16) ((char *) de - base);
+ map_tail->offs = ((char *) de - base)>>2;
map_tail->size = le16_to_cpu(de->rec_len);
count++;
cond_resched();
}
/* XXX: do we need to check rec_len == 0 case? -Chris */
- de = ext4_next_entry(de);
+ de = ext4_next_entry(de, blocksize);
}
return count;
}
@@ -770,13 +1125,6 @@ static void dx_insert_block(struct dx_frame *frame, u32 hash, ext4_lblk_t block)
dx_set_count(entries, count + 1);
}
-static void ext4_update_dx_flag(struct inode *inode)
-{
- if (!EXT4_HAS_COMPAT_FEATURE(inode->i_sb,
- EXT4_FEATURE_COMPAT_DIR_INDEX))
- EXT4_I(inode)->i_flags &= ~EXT4_INDEX_FL;
-}
-
/*
* NOTE! unlike strncmp, ext4_match returns 1 for success, 0 for failure.
*
@@ -796,20 +1144,22 @@ static inline int ext4_match (int len, const char * const name,
/*
* Returns 0 if not found, -1 on failure, and 1 on success
*/
-static inline int search_dirblock(struct buffer_head * bh,
- struct inode *dir,
- struct dentry *dentry,
- unsigned long offset,
- struct ext4_dir_entry_2 ** res_dir)
+int search_dir(struct buffer_head *bh,
+ char *search_buf,
+ int buf_size,
+ struct inode *dir,
+ const struct qstr *d_name,
+ unsigned int offset,
+ struct ext4_dir_entry_2 **res_dir)
{
struct ext4_dir_entry_2 * de;
char * dlimit;
int de_len;
- const char *name = dentry->d_name.name;
- int namelen = dentry->d_name.len;
+ const char *name = d_name->name;
+ int namelen = d_name->len;
- de = (struct ext4_dir_entry_2 *) bh->b_data;
- dlimit = bh->b_data + dir->i_sb->s_blocksize;
+ de = (struct ext4_dir_entry_2 *)search_buf;
+ dlimit = search_buf + buf_size;
while ((char *) de < dlimit) {
/* this code is executed quadratically often */
/* do minimal checking `by hand' */
@@ -817,14 +1167,15 @@ static inline int search_dirblock(struct buffer_head * bh,
if ((char *) de + namelen <= dlimit &&
ext4_match (namelen, name, de)) {
/* found a match - just to be sure, do a full check */
- if (!ext4_check_dir_entry("ext4_find_entry",
- dir, de, bh, offset))
+ if (ext4_check_dir_entry(dir, NULL, de, bh, bh->b_data,
+ bh->b_size, offset))
return -1;
*res_dir = de;
return 1;
}
/* prevent looping on a bad block */
- de_len = ext4_rec_len_from_disk(de->rec_len);
+ de_len = ext4_rec_len_from_disk(de->rec_len,
+ dir->i_sb->s_blocksize);
if (de_len <= 0)
return -1;
offset += de_len;
@@ -833,6 +1184,21 @@ static inline int search_dirblock(struct buffer_head * bh,
return 0;
}
+static int is_dx_internal_node(struct inode *dir, ext4_lblk_t block,
+ struct ext4_dir_entry *de)
+{
+ struct super_block *sb = dir->i_sb;
+
+ if (!is_dx(dir))
+ return 0;
+ if (block == 0)
+ return 1;
+ if (de->inode == 0 &&
+ ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize) ==
+ sb->s_blocksize)
+ return 1;
+ return 0;
+}
/*
* ext4_find_entry()
@@ -845,13 +1211,16 @@ static inline int search_dirblock(struct buffer_head * bh,
* The returned buffer_head has ->b_count elevated. The caller is expected
* to brelse() it when appropriate.
*/
-static struct buffer_head * ext4_find_entry (struct dentry *dentry,
- struct ext4_dir_entry_2 ** res_dir)
+static struct buffer_head * ext4_find_entry (struct inode *dir,
+ const struct qstr *d_name,
+ struct ext4_dir_entry_2 **res_dir,
+ int *inlined)
{
- struct super_block * sb;
- struct buffer_head * bh_use[NAMEI_RA_SIZE];
- struct buffer_head * bh, *ret = NULL;
+ struct super_block *sb;
+ struct buffer_head *bh_use[NAMEI_RA_SIZE];
+ struct buffer_head *bh, *ret = NULL;
ext4_lblk_t start, block, b;
+ const u8 *name = d_name->name;
int ra_max = 0; /* Number of bh's in the readahead
buffer, bh_use[] */
int ra_ptr = 0; /* Current index into readahead
@@ -859,16 +1228,37 @@ static struct buffer_head * ext4_find_entry (struct dentry *dentry,
int num = 0;
ext4_lblk_t nblocks;
int i, err;
- struct inode *dir = dentry->d_parent->d_inode;
int namelen;
*res_dir = NULL;
sb = dir->i_sb;
- namelen = dentry->d_name.len;
+ namelen = d_name->len;
if (namelen > EXT4_NAME_LEN)
return NULL;
+
+ if (ext4_has_inline_data(dir)) {
+ int has_inline_data = 1;
+ ret = ext4_find_inline_entry(dir, d_name, res_dir,
+ &has_inline_data);
+ if (has_inline_data) {
+ if (inlined)
+ *inlined = 1;
+ return ret;
+ }
+ }
+
+ if ((namelen <= 2) && (name[0] == '.') &&
+ (name[1] == '.' || name[1] == '\0')) {
+ /*
+ * "." or ".." will only be in the first block
+ * NFS may look up ".."; "." should be handled by the VFS
+ */
+ block = start = 0;
+ nblocks = 1;
+ goto restart;
+ }
if (is_dx(dir)) {
- bh = ext4_dx_find_entry(dentry, res_dir, &err);
+ bh = ext4_dx_find_entry(dir, d_name, res_dir, &err);
/*
* On success, or if the error was file not found,
* return. Otherwise, fall back to doing a search the
@@ -876,7 +1266,8 @@ static struct buffer_head * ext4_find_entry (struct dentry *dentry,
*/
if (bh || (err != ERR_BAD_DX_DIR))
return bh;
- dxtrace(printk("ext4_find_entry: dx failed, falling back\n"));
+ dxtrace(printk(KERN_DEBUG "ext4_find_entry: dx failed, "
+ "falling back\n"));
}
nblocks = dir->i_size >> EXT4_BLOCK_SIZE_BITS(sb);
start = EXT4_I(dir)->i_dir_start_lookup;
@@ -906,7 +1297,8 @@ restart:
bh = ext4_getblk(NULL, dir, b++, 0, &err);
bh_use[ra_max] = bh;
if (bh)
- ll_rw_block(READ_META, 1, &bh);
+ ll_rw_block(READ | REQ_META | REQ_PRIO,
+ 1, &bh);
}
}
if ((bh = bh_use[ra_ptr++]) == NULL)
@@ -914,13 +1306,23 @@ restart:
wait_on_buffer(bh);
if (!buffer_uptodate(bh)) {
/* read error, skip block & hope for the best */
- ext4_error(sb, __FUNCTION__, "reading directory #%lu "
- "offset %lu", dir->i_ino,
- (unsigned long)block);
+ EXT4_ERROR_INODE(dir, "reading directory lblock %lu",
+ (unsigned long) block);
brelse(bh);
goto next;
}
- i = search_dirblock(bh, dir, dentry,
+ if (!buffer_verified(bh) &&
+ !is_dx_internal_node(dir, block,
+ (struct ext4_dir_entry *)bh->b_data) &&
+ !ext4_dirent_csum_verify(dir,
+ (struct ext4_dir_entry *)bh->b_data)) {
+ EXT4_ERROR_INODE(dir, "checksumming directory "
+ "block %lu", (unsigned long)block);
+ brelse(bh);
+ goto next;
+ }
+ set_buffer_verified(bh);
+ i = search_dirblock(bh, dir, d_name,
block << EXT4_BLOCK_SIZE_BITS(sb), res_dir);
if (i == 1) {
EXT4_I(dir)->i_dir_start_lookup = block;
@@ -950,64 +1352,47 @@ restart:
cleanup_and_exit:
/* Clean up the read-ahead blocks */
for (; ra_ptr < ra_max; ra_ptr++)
- brelse (bh_use[ra_ptr]);
+ brelse(bh_use[ra_ptr]);
return ret;
}
-static struct buffer_head * ext4_dx_find_entry(struct dentry *dentry,
+static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct qstr *d_name,
struct ext4_dir_entry_2 **res_dir, int *err)
{
- struct super_block * sb;
+ struct super_block * sb = dir->i_sb;
struct dx_hash_info hinfo;
- u32 hash;
struct dx_frame frames[2], *frame;
- struct ext4_dir_entry_2 *de, *top;
struct buffer_head *bh;
ext4_lblk_t block;
int retval;
- int namelen = dentry->d_name.len;
- const u8 *name = dentry->d_name.name;
- struct inode *dir = dentry->d_parent->d_inode;
- sb = dir->i_sb;
- /* NFS may look up ".." - look at dx_root directory block */
- if (namelen > 2 || name[0] != '.'||(name[1] != '.' && name[1] != '\0')){
- if (!(frame = dx_probe(dentry, NULL, &hinfo, frames, err)))
- return NULL;
- } else {
- frame = frames;
- frame->bh = NULL; /* for dx_release() */
- frame->at = (struct dx_entry *)frames; /* hack for zero entry*/
- dx_set_block(frame->at, 0); /* dx_root block is 0 */
- }
- hash = hinfo.hash;
+ if (!(frame = dx_probe(d_name, dir, &hinfo, frames, err)))
+ return NULL;
do {
block = dx_get_block(frame->at);
- if (!(bh = ext4_bread (NULL,dir, block, 0, err)))
+ bh = ext4_read_dirblock(dir, block, DIRENT);
+ if (IS_ERR(bh)) {
+ *err = PTR_ERR(bh);
goto errout;
- de = (struct ext4_dir_entry_2 *) bh->b_data;
- top = (struct ext4_dir_entry_2 *) ((char *) de + sb->s_blocksize -
- EXT4_DIR_REC_LEN(0));
- for (; de < top; de = ext4_next_entry(de))
- if (ext4_match (namelen, name, de)) {
- if (!ext4_check_dir_entry("ext4_find_entry",
- dir, de, bh,
- (block<<EXT4_BLOCK_SIZE_BITS(sb))
- +((char *)de - bh->b_data))) {
- brelse (bh);
- *err = ERR_BAD_DX_DIR;
- goto errout;
- }
- *res_dir = de;
- dx_release (frames);
+ }
+ retval = search_dirblock(bh, dir, d_name,
+ block << EXT4_BLOCK_SIZE_BITS(sb),
+ res_dir);
+ if (retval == 1) { /* Success! */
+ dx_release(frames);
return bh;
}
- brelse (bh);
+ brelse(bh);
+ if (retval == -1) {
+ *err = ERR_BAD_DX_DIR;
+ goto errout;
+ }
+
/* Check to see if we should continue to search */
- retval = ext4_htree_next_block(dir, hash, frame,
+ retval = ext4_htree_next_block(dir, hinfo.hash, frame,
frames, NULL);
if (retval < 0) {
- ext4_warning(sb, __FUNCTION__,
+ ext4_warning(sb,
"error reading index page in directory #%lu",
dir->i_ino);
*err = retval;
@@ -1017,33 +1402,41 @@ static struct buffer_head * ext4_dx_find_entry(struct dentry *dentry,
*err = -ENOENT;
errout:
- dxtrace(printk("%s not found\n", name));
+ dxtrace(printk(KERN_DEBUG "%s not found\n", d_name->name));
dx_release (frames);
return NULL;
}
-static struct dentry *ext4_lookup(struct inode * dir, struct dentry *dentry, struct nameidata *nd)
+static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
{
- struct inode * inode;
- struct ext4_dir_entry_2 * de;
- struct buffer_head * bh;
+ struct inode *inode;
+ struct ext4_dir_entry_2 *de;
+ struct buffer_head *bh;
if (dentry->d_name.len > EXT4_NAME_LEN)
return ERR_PTR(-ENAMETOOLONG);
- bh = ext4_find_entry(dentry, &de);
+ bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL);
inode = NULL;
if (bh) {
- unsigned long ino = le32_to_cpu(de->inode);
- brelse (bh);
+ __u32 ino = le32_to_cpu(de->inode);
+ brelse(bh);
if (!ext4_valid_inum(dir->i_sb, ino)) {
- ext4_error(dir->i_sb, "ext4_lookup",
- "bad inode number: %lu", ino);
+ EXT4_ERROR_INODE(dir, "bad inode number: %u", ino);
+ return ERR_PTR(-EIO);
+ }
+ if (unlikely(ino == dir->i_ino)) {
+ EXT4_ERROR_INODE(dir, "'%pd' linked to parent dir",
+ dentry);
return ERR_PTR(-EIO);
}
inode = ext4_iget(dir->i_sb, ino);
- if (IS_ERR(inode))
- return ERR_CAST(inode);
+ if (inode == ERR_PTR(-ESTALE)) {
+ EXT4_ERROR_INODE(dir,
+ "deleted inode referenced: %u",
+ ino);
+ return ERR_PTR(-EIO);
+ }
}
return d_splice_alias(inode, dentry);
}
@@ -1051,58 +1444,24 @@ static struct dentry *ext4_lookup(struct inode * dir, struct dentry *dentry, str
struct dentry *ext4_get_parent(struct dentry *child)
{
- unsigned long ino;
- struct dentry *parent;
- struct inode *inode;
- struct dentry dotdot;
+ __u32 ino;
+ static const struct qstr dotdot = QSTR_INIT("..", 2);
struct ext4_dir_entry_2 * de;
struct buffer_head *bh;
- dotdot.d_name.name = "..";
- dotdot.d_name.len = 2;
- dotdot.d_parent = child; /* confusing, isn't it! */
-
- bh = ext4_find_entry(&dotdot, &de);
- inode = NULL;
+ bh = ext4_find_entry(child->d_inode, &dotdot, &de, NULL);
if (!bh)
return ERR_PTR(-ENOENT);
ino = le32_to_cpu(de->inode);
brelse(bh);
if (!ext4_valid_inum(child->d_inode->i_sb, ino)) {
- ext4_error(child->d_inode->i_sb, "ext4_get_parent",
- "bad inode number: %lu", ino);
+ EXT4_ERROR_INODE(child->d_inode,
+ "bad parent inode number: %u", ino);
return ERR_PTR(-EIO);
}
- inode = ext4_iget(child->d_inode->i_sb, ino);
- if (IS_ERR(inode))
- return ERR_CAST(inode);
-
- parent = d_alloc_anon(inode);
- if (!parent) {
- iput(inode);
- parent = ERR_PTR(-ENOMEM);
- }
- return parent;
-}
-
-#define S_SHIFT 12
-static unsigned char ext4_type_by_mode[S_IFMT >> S_SHIFT] = {
- [S_IFREG >> S_SHIFT] = EXT4_FT_REG_FILE,
- [S_IFDIR >> S_SHIFT] = EXT4_FT_DIR,
- [S_IFCHR >> S_SHIFT] = EXT4_FT_CHRDEV,
- [S_IFBLK >> S_SHIFT] = EXT4_FT_BLKDEV,
- [S_IFIFO >> S_SHIFT] = EXT4_FT_FIFO,
- [S_IFSOCK >> S_SHIFT] = EXT4_FT_SOCK,
- [S_IFLNK >> S_SHIFT] = EXT4_FT_SYMLINK,
-};
-
-static inline void ext4_set_de_type(struct super_block *sb,
- struct ext4_dir_entry_2 *de,
- umode_t mode) {
- if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FILETYPE))
- de->file_type = ext4_type_by_mode[(mode & S_IFMT)>>S_SHIFT];
+ return d_obtain_alias(ext4_iget(child->d_inode->i_sb, ino));
}
/*
@@ -1110,16 +1469,18 @@ static inline void ext4_set_de_type(struct super_block *sb,
* Returns pointer to last entry moved.
*/
static struct ext4_dir_entry_2 *
-dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count)
+dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count,
+ unsigned blocksize)
{
unsigned rec_len = 0;
while (count--) {
- struct ext4_dir_entry_2 *de = (struct ext4_dir_entry_2 *) (from + map->offs);
+ struct ext4_dir_entry_2 *de = (struct ext4_dir_entry_2 *)
+ (from + (map->offs<<2));
rec_len = EXT4_DIR_REC_LEN(de->name_len);
memcpy (to, de, rec_len);
((struct ext4_dir_entry_2 *) to)->rec_len =
- ext4_rec_len_to_disk(rec_len);
+ ext4_rec_len_to_disk(rec_len, blocksize);
de->inode = 0;
map++;
to += rec_len;
@@ -1131,19 +1492,19 @@ dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count)
* Compact each dir entry in the range to the minimal rec_len.
* Returns pointer to last entry in range.
*/
-static struct ext4_dir_entry_2* dx_pack_dirents(char *base, int size)
+static struct ext4_dir_entry_2* dx_pack_dirents(char *base, unsigned blocksize)
{
struct ext4_dir_entry_2 *next, *to, *prev, *de = (struct ext4_dir_entry_2 *) base;
unsigned rec_len = 0;
prev = to = de;
- while ((char*)de < base + size) {
- next = ext4_next_entry(de);
+ while ((char*)de < base + blocksize) {
+ next = ext4_next_entry(de, blocksize);
if (de->inode && de->name_len) {
rec_len = EXT4_DIR_REC_LEN(de->name_len);
if (de > to)
memmove(to, de, rec_len);
- to->rec_len = ext4_rec_len_to_disk(rec_len);
+ to->rec_len = ext4_rec_len_to_disk(rec_len, blocksize);
prev = to;
to = (struct ext4_dir_entry_2 *) (((char *) to) + rec_len);
}
@@ -1168,15 +1529,22 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
u32 hash2;
struct dx_map_entry *map;
char *data1 = (*bh)->b_data, *data2;
- unsigned split, move, size, i;
+ unsigned split, move, size;
struct ext4_dir_entry_2 *de = NULL, *de2;
- int err = 0;
+ struct ext4_dir_entry_tail *t;
+ int csum_size = 0;
+ int err = 0, i;
- bh2 = ext4_append (handle, dir, &newblock, &err);
- if (!(bh2)) {
+ if (EXT4_HAS_RO_COMPAT_FEATURE(dir->i_sb,
+ EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+ csum_size = sizeof(struct ext4_dir_entry_tail);
+
+ bh2 = ext4_append(handle, dir, &newblock);
+ if (IS_ERR(bh2)) {
brelse(*bh);
*bh = NULL;
- goto errout;
+ *error = PTR_ERR(bh2);
+ return NULL;
}
BUFFER_TRACE(*bh, "get_write_access");
@@ -1193,10 +1561,10 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
/* create map in the end of data2 block */
map = (struct dx_map_entry *) (data2 + blocksize);
- count = dx_make_map ((struct ext4_dir_entry_2 *) data1,
+ count = dx_make_map((struct ext4_dir_entry_2 *) data1,
blocksize, hinfo, map);
map -= count;
- dx_sort_map (map, count);
+ dx_sort_map(map, count);
/* Split the existing block in the middle, size-wise */
size = 0;
move = 0;
@@ -1216,10 +1584,22 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
hash2, split, count-split));
/* Fancy dance to stay within two buffers */
- de2 = dx_move_dirents(data1, data2, map + split, count - split);
- de = dx_pack_dirents(data1,blocksize);
- de->rec_len = ext4_rec_len_to_disk(data1 + blocksize - (char *) de);
- de2->rec_len = ext4_rec_len_to_disk(data2 + blocksize - (char *) de2);
+ de2 = dx_move_dirents(data1, data2, map + split, count - split, blocksize);
+ de = dx_pack_dirents(data1, blocksize);
+ de->rec_len = ext4_rec_len_to_disk(data1 + (blocksize - csum_size) -
+ (char *) de,
+ blocksize);
+ de2->rec_len = ext4_rec_len_to_disk(data2 + (blocksize - csum_size) -
+ (char *) de2,
+ blocksize);
+ if (csum_size) {
+ t = EXT4_DIRENT_TAIL(data2, blocksize);
+ initialize_dirent_tail(t, blocksize);
+
+ t = EXT4_DIRENT_TAIL(data1, blocksize);
+ initialize_dirent_tail(t, blocksize);
+ }
+
dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data1, blocksize, 1));
dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data2, blocksize, 1));
@@ -1229,15 +1609,15 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
swap(*bh, bh2);
de = de2;
}
- dx_insert_block (frame, hash2 + continued, newblock);
- err = ext4_journal_dirty_metadata (handle, bh2);
+ dx_insert_block(frame, hash2 + continued, newblock);
+ err = ext4_handle_dirty_dirent_node(handle, dir, bh2);
if (err)
goto journal_error;
- err = ext4_journal_dirty_metadata (handle, frame->bh);
+ err = ext4_handle_dirty_dx_node(handle, dir, frame->bh);
if (err)
goto journal_error;
- brelse (bh2);
- dxtrace(dx_show_index ("frame", frame->entries));
+ brelse(bh2);
+ dxtrace(dx_show_index("frame", frame->entries));
return de;
journal_error:
@@ -1245,11 +1625,67 @@ journal_error:
brelse(bh2);
*bh = NULL;
ext4_std_error(dir->i_sb, err);
-errout:
*error = err;
return NULL;
}
+int ext4_find_dest_de(struct inode *dir, struct inode *inode,
+ struct buffer_head *bh,
+ void *buf, int buf_size,
+ const char *name, int namelen,
+ struct ext4_dir_entry_2 **dest_de)
+{
+ struct ext4_dir_entry_2 *de;
+ unsigned short reclen = EXT4_DIR_REC_LEN(namelen);
+ int nlen, rlen;
+ unsigned int offset = 0;
+ char *top;
+
+ de = (struct ext4_dir_entry_2 *)buf;
+ top = buf + buf_size - reclen;
+ while ((char *) de <= top) {
+ if (ext4_check_dir_entry(dir, NULL, de, bh,
+ buf, buf_size, offset))
+ return -EIO;
+ if (ext4_match(namelen, name, de))
+ return -EEXIST;
+ nlen = EXT4_DIR_REC_LEN(de->name_len);
+ rlen = ext4_rec_len_from_disk(de->rec_len, buf_size);
+ if ((de->inode ? rlen - nlen : rlen) >= reclen)
+ break;
+ de = (struct ext4_dir_entry_2 *)((char *)de + rlen);
+ offset += rlen;
+ }
+ if ((char *) de > top)
+ return -ENOSPC;
+
+ *dest_de = de;
+ return 0;
+}
+
+void ext4_insert_dentry(struct inode *inode,
+ struct ext4_dir_entry_2 *de,
+ int buf_size,
+ const char *name, int namelen)
+{
+
+ int nlen, rlen;
+
+ nlen = EXT4_DIR_REC_LEN(de->name_len);
+ rlen = ext4_rec_len_from_disk(de->rec_len, buf_size);
+ if (de->inode) {
+ struct ext4_dir_entry_2 *de1 =
+ (struct ext4_dir_entry_2 *)((char *)de + nlen);
+ de1->rec_len = ext4_rec_len_to_disk(rlen - nlen, buf_size);
+ de->rec_len = ext4_rec_len_to_disk(nlen, buf_size);
+ de = de1;
+ }
+ de->file_type = EXT4_FT_UNKNOWN;
+ de->inode = cpu_to_le32(inode->i_ino);
+ ext4_set_de_type(inode->i_sb, de, inode->i_mode);
+ de->name_len = namelen;
+ memcpy(de->name, name, namelen);
+}
/*
* Add a new entry into a directory (leaf) block. If de is non-NULL,
* it points to a directory entry which is guaranteed to be large
@@ -1257,71 +1693,39 @@ errout:
* add_dirent_to_buf will attempt search the directory block for
* space. It will return -ENOSPC if no space is available, and -EIO
* and -EEXIST if directory entry already exists.
- *
- * NOTE! bh is NOT released in the case where ENOSPC is returned. In
- * all other cases bh is released.
*/
static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
struct inode *inode, struct ext4_dir_entry_2 *de,
- struct buffer_head * bh)
+ struct buffer_head *bh)
{
struct inode *dir = dentry->d_parent->d_inode;
const char *name = dentry->d_name.name;
int namelen = dentry->d_name.len;
- unsigned long offset = 0;
- unsigned short reclen;
- int nlen, rlen, err;
- char *top;
+ unsigned int blocksize = dir->i_sb->s_blocksize;
+ int csum_size = 0;
+ int err;
+
+ if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
+ EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+ csum_size = sizeof(struct ext4_dir_entry_tail);
- reclen = EXT4_DIR_REC_LEN(namelen);
if (!de) {
- de = (struct ext4_dir_entry_2 *)bh->b_data;
- top = bh->b_data + dir->i_sb->s_blocksize - reclen;
- while ((char *) de <= top) {
- if (!ext4_check_dir_entry("ext4_add_entry", dir, de,
- bh, offset)) {
- brelse (bh);
- return -EIO;
- }
- if (ext4_match (namelen, name, de)) {
- brelse (bh);
- return -EEXIST;
- }
- nlen = EXT4_DIR_REC_LEN(de->name_len);
- rlen = ext4_rec_len_from_disk(de->rec_len);
- if ((de->inode? rlen - nlen: rlen) >= reclen)
- break;
- de = (struct ext4_dir_entry_2 *)((char *)de + rlen);
- offset += rlen;
- }
- if ((char *) de > top)
- return -ENOSPC;
+ err = ext4_find_dest_de(dir, inode,
+ bh, bh->b_data, blocksize - csum_size,
+ name, namelen, &de);
+ if (err)
+ return err;
}
BUFFER_TRACE(bh, "get_write_access");
err = ext4_journal_get_write_access(handle, bh);
if (err) {
ext4_std_error(dir->i_sb, err);
- brelse(bh);
return err;
}
/* By now the buffer is marked for journaling */
- nlen = EXT4_DIR_REC_LEN(de->name_len);
- rlen = ext4_rec_len_from_disk(de->rec_len);
- if (de->inode) {
- struct ext4_dir_entry_2 *de1 = (struct ext4_dir_entry_2 *)((char *)de + nlen);
- de1->rec_len = ext4_rec_len_to_disk(rlen - nlen);
- de->rec_len = ext4_rec_len_to_disk(nlen);
- de = de1;
- }
- de->file_type = EXT4_FT_UNKNOWN;
- if (inode) {
- de->inode = cpu_to_le32(inode->i_ino);
- ext4_set_de_type(dir->i_sb, de, inode->i_mode);
- } else
- de->inode = 0;
- de->name_len = namelen;
- memcpy (de->name, name, namelen);
+ ext4_insert_dentry(inode, de, blocksize, name, namelen);
+
/*
* XXX shouldn't update any times until successful
* completion of syscall, but too many callers depend
@@ -1337,11 +1741,10 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
ext4_update_dx_flag(dir);
dir->i_version++;
ext4_mark_inode_dirty(handle, dir);
- BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata");
- err = ext4_journal_dirty_metadata(handle, bh);
+ BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
+ err = ext4_handle_dirty_dirent_node(handle, dir, bh);
if (err)
ext4_std_error(dir->i_sb, err);
- brelse(bh);
return 0;
}
@@ -1360,6 +1763,7 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
struct dx_frame frames[2], *frame;
struct dx_entry *entries;
struct ext4_dir_entry_2 *de, *de2;
+ struct ext4_dir_entry_tail *t;
char *data1, *top;
unsigned len;
int retval;
@@ -1367,9 +1771,15 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
struct dx_hash_info hinfo;
ext4_lblk_t block;
struct fake_dirent *fde;
+ int csum_size = 0;
+
+ if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
+ EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+ csum_size = sizeof(struct ext4_dir_entry_tail);
blocksize = dir->i_sb->s_blocksize;
- dxtrace(printk("Creating index\n"));
+ dxtrace(printk(KERN_DEBUG "Creating index: inode %lu\n", dir->i_ino));
+ BUFFER_TRACE(bh, "get_write_access");
retval = ext4_journal_get_write_access(handle, bh);
if (retval) {
ext4_std_error(dir->i_sb, retval);
@@ -1378,38 +1788,56 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
}
root = (struct dx_root *) bh->b_data;
- bh2 = ext4_append (handle, dir, &block, &retval);
- if (!(bh2)) {
+ /* The 0th block becomes the root, move the dirents out */
+ fde = &root->dotdot;
+ de = (struct ext4_dir_entry_2 *)((char *)fde +
+ ext4_rec_len_from_disk(fde->rec_len, blocksize));
+ if ((char *) de >= (((char *) root) + blocksize)) {
+ EXT4_ERROR_INODE(dir, "invalid rec_len for '..'");
brelse(bh);
- return retval;
+ return -EIO;
}
- EXT4_I(dir)->i_flags |= EXT4_INDEX_FL;
+ len = ((char *) root) + (blocksize - csum_size) - (char *) de;
+
+ /* Allocate new block for the 0th block's dirents */
+ bh2 = ext4_append(handle, dir, &block);
+ if (IS_ERR(bh2)) {
+ brelse(bh);
+ return PTR_ERR(bh2);
+ }
+ ext4_set_inode_flag(dir, EXT4_INODE_INDEX);
data1 = bh2->b_data;
- /* The 0th block becomes the root, move the dirents out */
- fde = &root->dotdot;
- de = (struct ext4_dir_entry_2 *)((char *)fde +
- ext4_rec_len_from_disk(fde->rec_len));
- len = ((char *) root) + blocksize - (char *) de;
memcpy (data1, de, len);
de = (struct ext4_dir_entry_2 *) data1;
top = data1 + len;
- while ((char *)(de2 = ext4_next_entry(de)) < top)
+ while ((char *)(de2 = ext4_next_entry(de, blocksize)) < top)
de = de2;
- de->rec_len = ext4_rec_len_to_disk(data1 + blocksize - (char *) de);
+ de->rec_len = ext4_rec_len_to_disk(data1 + (blocksize - csum_size) -
+ (char *) de,
+ blocksize);
+
+ if (csum_size) {
+ t = EXT4_DIRENT_TAIL(data1, blocksize);
+ initialize_dirent_tail(t, blocksize);
+ }
+
/* Initialize the root; the dot dirents already exist */
de = (struct ext4_dir_entry_2 *) (&root->dotdot);
- de->rec_len = ext4_rec_len_to_disk(blocksize - EXT4_DIR_REC_LEN(2));
+ de->rec_len = ext4_rec_len_to_disk(blocksize - EXT4_DIR_REC_LEN(2),
+ blocksize);
memset (&root->info, 0, sizeof(root->info));
root->info.info_length = sizeof(root->info);
root->info.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version;
entries = root->entries;
- dx_set_block (entries, 1);
- dx_set_count (entries, 1);
- dx_set_limit (entries, dx_root_limit(dir, sizeof(root->info)));
+ dx_set_block(entries, 1);
+ dx_set_count(entries, 1);
+ dx_set_limit(entries, dx_root_limit(dir, sizeof(root->info)));
/* Initialize as for dx_probe */
hinfo.hash_version = root->info.hash_version;
+ if (hinfo.hash_version <= DX_HASH_TEA)
+ hinfo.hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned;
hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed;
ext4fs_dirhash(name, namelen, &hinfo);
frame = frames;
@@ -1417,12 +1845,26 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
frame->at = entries;
frame->bh = bh;
bh = bh2;
+
+ ext4_handle_dirty_dx_node(handle, dir, frame->bh);
+ ext4_handle_dirty_dirent_node(handle, dir, bh);
+
de = do_split(handle,dir, &bh, frame, &hinfo, &retval);
- dx_release (frames);
- if (!(de))
+ if (!de) {
+ /*
+ * Even if the block split failed, we have to properly write
+ * out all the changes we did so far. Otherwise we can end up
+ * with corrupted filesystem.
+ */
+ ext4_mark_inode_dirty(handle, dir);
+ dx_release(frames);
return retval;
+ }
+ dx_release(frames);
- return add_dirent_to_buf(handle, dentry, inode, de, bh);
+ retval = add_dirent_to_buf(handle, dentry, inode, de, bh);
+ brelse(bh);
+ return retval;
}
/*
@@ -1435,52 +1877,81 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
* may not sleep between calling this and putting something into
* the entry, as someone else might have used it while you slept.
*/
-static int ext4_add_entry (handle_t *handle, struct dentry *dentry,
- struct inode *inode)
+static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
+ struct inode *inode)
{
struct inode *dir = dentry->d_parent->d_inode;
- unsigned long offset;
- struct buffer_head * bh;
+ struct buffer_head *bh;
struct ext4_dir_entry_2 *de;
- struct super_block * sb;
+ struct ext4_dir_entry_tail *t;
+ struct super_block *sb;
int retval;
int dx_fallback=0;
unsigned blocksize;
ext4_lblk_t block, blocks;
+ int csum_size = 0;
+
+ if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
+ EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+ csum_size = sizeof(struct ext4_dir_entry_tail);
sb = dir->i_sb;
blocksize = sb->s_blocksize;
if (!dentry->d_name.len)
return -EINVAL;
+
+ if (ext4_has_inline_data(dir)) {
+ retval = ext4_try_add_inline_entry(handle, dentry, inode);
+ if (retval < 0)
+ return retval;
+ if (retval == 1) {
+ retval = 0;
+ return retval;
+ }
+ }
+
if (is_dx(dir)) {
retval = ext4_dx_add_entry(handle, dentry, inode);
if (!retval || (retval != ERR_BAD_DX_DIR))
return retval;
- EXT4_I(dir)->i_flags &= ~EXT4_INDEX_FL;
+ ext4_clear_inode_flag(dir, EXT4_INODE_INDEX);
dx_fallback++;
ext4_mark_inode_dirty(handle, dir);
}
blocks = dir->i_size >> sb->s_blocksize_bits;
- for (block = 0, offset = 0; block < blocks; block++) {
- bh = ext4_bread(handle, dir, block, 0, &retval);
- if(!bh)
- return retval;
+ for (block = 0; block < blocks; block++) {
+ bh = ext4_read_dirblock(dir, block, DIRENT);
+ if (IS_ERR(bh))
+ return PTR_ERR(bh);
+
retval = add_dirent_to_buf(handle, dentry, inode, NULL, bh);
- if (retval != -ENOSPC)
+ if (retval != -ENOSPC) {
+ brelse(bh);
return retval;
+ }
if (blocks == 1 && !dx_fallback &&
EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_DIR_INDEX))
return make_indexed_dir(handle, dentry, inode, bh);
brelse(bh);
}
- bh = ext4_append(handle, dir, &block, &retval);
- if (!bh)
- return retval;
+ bh = ext4_append(handle, dir, &block);
+ if (IS_ERR(bh))
+ return PTR_ERR(bh);
de = (struct ext4_dir_entry_2 *) bh->b_data;
de->inode = 0;
- de->rec_len = ext4_rec_len_to_disk(blocksize);
- return add_dirent_to_buf(handle, dentry, inode, de, bh);
+ de->rec_len = ext4_rec_len_to_disk(blocksize - csum_size, blocksize);
+
+ if (csum_size) {
+ t = EXT4_DIRENT_TAIL(bh->b_data, blocksize);
+ initialize_dirent_tail(t, blocksize);
+ }
+
+ retval = add_dirent_to_buf(handle, dentry, inode, de, bh);
+ brelse(bh);
+ if (retval == 0)
+ ext4_set_inode_state(inode, EXT4_STATE_NEWENTRY);
+ return retval;
}
/*
@@ -1492,20 +1963,23 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
struct dx_frame frames[2], *frame;
struct dx_entry *entries, *at;
struct dx_hash_info hinfo;
- struct buffer_head * bh;
+ struct buffer_head *bh;
struct inode *dir = dentry->d_parent->d_inode;
- struct super_block * sb = dir->i_sb;
+ struct super_block *sb = dir->i_sb;
struct ext4_dir_entry_2 *de;
int err;
- frame = dx_probe(dentry, NULL, &hinfo, frames, &err);
+ frame = dx_probe(&dentry->d_name, dir, &hinfo, frames, &err);
if (!frame)
return err;
entries = frame->entries;
at = frame->at;
-
- if (!(bh = ext4_bread(handle,dir, dx_get_block(frame->at), 0, &err)))
+ bh = ext4_read_dirblock(dir, dx_get_block(frame->at), DIRENT);
+ if (IS_ERR(bh)) {
+ err = PTR_ERR(bh);
+ bh = NULL;
goto cleanup;
+ }
BUFFER_TRACE(bh, "get_write_access");
err = ext4_journal_get_write_access(handle, bh);
@@ -1513,13 +1987,11 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
goto journal_error;
err = add_dirent_to_buf(handle, dentry, inode, NULL, bh);
- if (err != -ENOSPC) {
- bh = NULL;
+ if (err != -ENOSPC)
goto cleanup;
- }
/* Block full, should compress but for now just split */
- dxtrace(printk("using %u of %u node entries\n",
+ dxtrace(printk(KERN_DEBUG "using %u of %u node entries\n",
dx_get_count(entries), dx_get_limit(entries)));
/* Need to split index? */
if (dx_get_count(entries) == dx_get_limit(entries)) {
@@ -1532,18 +2004,20 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
if (levels && (dx_get_count(frames->entries) ==
dx_get_limit(frames->entries))) {
- ext4_warning(sb, __FUNCTION__,
- "Directory index full!");
+ ext4_warning(sb, "Directory index full!");
err = -ENOSPC;
goto cleanup;
}
- bh2 = ext4_append (handle, dir, &newblock, &err);
- if (!(bh2))
+ bh2 = ext4_append(handle, dir, &newblock);
+ if (IS_ERR(bh2)) {
+ err = PTR_ERR(bh2);
goto cleanup;
+ }
node2 = (struct dx_node *)(bh2->b_data);
entries2 = node2->entries;
- node2->fake.rec_len = ext4_rec_len_to_disk(sb->s_blocksize);
- node2->fake.inode = 0;
+ memset(&node2->fake, 0, sizeof(struct fake_dirent));
+ node2->fake.rec_len = ext4_rec_len_to_disk(sb->s_blocksize,
+ sb->s_blocksize);
BUFFER_TRACE(frame->bh, "get_write_access");
err = ext4_journal_get_write_access(handle, frame->bh);
if (err)
@@ -1551,7 +2025,8 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
if (levels) {
unsigned icount1 = icount/2, icount2 = icount - icount1;
unsigned hash2 = dx_get_hash(entries + icount1);
- dxtrace(printk("Split index %i/%i\n", icount1, icount2));
+ dxtrace(printk(KERN_DEBUG "Split index %i/%i\n",
+ icount1, icount2));
BUFFER_TRACE(frame->bh, "get_write_access"); /* index root */
err = ext4_journal_get_write_access(handle,
@@ -1559,11 +2034,11 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
if (err)
goto journal_error;
- memcpy ((char *) entries2, (char *) (entries + icount1),
- icount2 * sizeof(struct dx_entry));
- dx_set_count (entries, icount1);
- dx_set_count (entries2, icount2);
- dx_set_limit (entries2, dx_node_limit(dir));
+ memcpy((char *) entries2, (char *) (entries + icount1),
+ icount2 * sizeof(struct dx_entry));
+ dx_set_count(entries, icount1);
+ dx_set_count(entries2, icount2);
+ dx_set_limit(entries2, dx_node_limit(dir));
/* Which index block gets the new entry? */
if (at - entries >= icount1) {
@@ -1571,16 +2046,17 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
frame->entries = entries = entries2;
swap(frame->bh, bh2);
}
- dx_insert_block (frames + 0, hash2, newblock);
- dxtrace(dx_show_index ("node", frames[1].entries));
- dxtrace(dx_show_index ("node",
+ dx_insert_block(frames + 0, hash2, newblock);
+ dxtrace(dx_show_index("node", frames[1].entries));
+ dxtrace(dx_show_index("node",
((struct dx_node *) bh2->b_data)->entries));
- err = ext4_journal_dirty_metadata(handle, bh2);
+ err = ext4_handle_dirty_dx_node(handle, dir, bh2);
if (err)
goto journal_error;
brelse (bh2);
} else {
- dxtrace(printk("Creating second level index...\n"));
+ dxtrace(printk(KERN_DEBUG
+ "Creating second level index...\n"));
memcpy((char *) entries2, (char *) entries,
icount * sizeof(struct dx_entry));
dx_set_limit(entries2, dx_node_limit(dir));
@@ -1600,63 +2076,111 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
if (err)
goto journal_error;
}
- ext4_journal_dirty_metadata(handle, frames[0].bh);
+ err = ext4_handle_dirty_dx_node(handle, dir, frames[0].bh);
+ if (err) {
+ ext4_std_error(inode->i_sb, err);
+ goto cleanup;
+ }
}
de = do_split(handle, dir, &bh, frame, &hinfo, &err);
if (!de)
goto cleanup;
err = add_dirent_to_buf(handle, dentry, inode, de, bh);
- bh = NULL;
goto cleanup;
journal_error:
ext4_std_error(dir->i_sb, err);
cleanup:
- if (bh)
- brelse(bh);
+ brelse(bh);
dx_release(frames);
return err;
}
/*
- * ext4_delete_entry deletes a directory entry by merging it with the
- * previous entry
+ * ext4_generic_delete_entry deletes a directory entry by merging it
+ * with the previous entry
*/
-static int ext4_delete_entry (handle_t *handle,
- struct inode * dir,
- struct ext4_dir_entry_2 * de_del,
- struct buffer_head * bh)
+int ext4_generic_delete_entry(handle_t *handle,
+ struct inode *dir,
+ struct ext4_dir_entry_2 *de_del,
+ struct buffer_head *bh,
+ void *entry_buf,
+ int buf_size,
+ int csum_size)
{
- struct ext4_dir_entry_2 * de, * pde;
+ struct ext4_dir_entry_2 *de, *pde;
+ unsigned int blocksize = dir->i_sb->s_blocksize;
int i;
i = 0;
pde = NULL;
- de = (struct ext4_dir_entry_2 *) bh->b_data;
- while (i < bh->b_size) {
- if (!ext4_check_dir_entry("ext4_delete_entry", dir, de, bh, i))
+ de = (struct ext4_dir_entry_2 *)entry_buf;
+ while (i < buf_size - csum_size) {
+ if (ext4_check_dir_entry(dir, NULL, de, bh,
+ bh->b_data, bh->b_size, i))
return -EIO;
if (de == de_del) {
- BUFFER_TRACE(bh, "get_write_access");
- ext4_journal_get_write_access(handle, bh);
if (pde)
pde->rec_len = ext4_rec_len_to_disk(
- ext4_rec_len_from_disk(pde->rec_len) +
- ext4_rec_len_from_disk(de->rec_len));
+ ext4_rec_len_from_disk(pde->rec_len,
+ blocksize) +
+ ext4_rec_len_from_disk(de->rec_len,
+ blocksize),
+ blocksize);
else
de->inode = 0;
dir->i_version++;
- BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata");
- ext4_journal_dirty_metadata(handle, bh);
return 0;
}
- i += ext4_rec_len_from_disk(de->rec_len);
+ i += ext4_rec_len_from_disk(de->rec_len, blocksize);
pde = de;
- de = ext4_next_entry(de);
+ de = ext4_next_entry(de, blocksize);
}
return -ENOENT;
}
+static int ext4_delete_entry(handle_t *handle,
+ struct inode *dir,
+ struct ext4_dir_entry_2 *de_del,
+ struct buffer_head *bh)
+{
+ int err, csum_size = 0;
+
+ if (ext4_has_inline_data(dir)) {
+ int has_inline_data = 1;
+ err = ext4_delete_inline_entry(handle, dir, de_del, bh,
+ &has_inline_data);
+ if (has_inline_data)
+ return err;
+ }
+
+ if (EXT4_HAS_RO_COMPAT_FEATURE(dir->i_sb,
+ EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+ csum_size = sizeof(struct ext4_dir_entry_tail);
+
+ BUFFER_TRACE(bh, "get_write_access");
+ err = ext4_journal_get_write_access(handle, bh);
+ if (unlikely(err))
+ goto out;
+
+ err = ext4_generic_delete_entry(handle, dir, de_del,
+ bh, bh->b_data,
+ dir->i_sb->s_blocksize, csum_size);
+ if (err)
+ goto out;
+
+ BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
+ err = ext4_handle_dirty_dirent_node(handle, dir, bh);
+ if (unlikely(err))
+ goto out;
+
+ return 0;
+out:
+ if (err != -ENOENT)
+ ext4_std_error(dir->i_sb, err);
+ return err;
+}
+
/*
* DIR_NLINK feature is set if 1) nlinks > EXT4_LINK_MAX or 2) nlinks == 2,
* since this indicates that nlinks count was previously 1.
@@ -1667,7 +2191,7 @@ static void ext4_inc_count(handle_t *handle, struct inode *inode)
if (is_dx(inode) && inode->i_nlink > 1) {
/* limit is 16-bit i_links_count */
if (inode->i_nlink >= EXT4_LINK_MAX || inode->i_nlink == 2) {
- inode->i_nlink = 1;
+ set_nlink(inode, 1);
EXT4_SET_RO_COMPAT_FEATURE(inode->i_sb,
EXT4_FEATURE_RO_COMPAT_DIR_NLINK);
}
@@ -1680,9 +2204,8 @@ static void ext4_inc_count(handle_t *handle, struct inode *inode)
*/
static void ext4_dec_count(handle_t *handle, struct inode *inode)
{
- drop_nlink(inode);
- if (S_ISDIR(inode->i_mode) && inode->i_nlink == 0)
- inc_nlink(inode);
+ if (!S_ISDIR(inode->i_mode) || inode->i_nlink > 2)
+ drop_nlink(inode);
}
@@ -1692,10 +2215,12 @@ static int ext4_add_nondir(handle_t *handle,
int err = ext4_add_entry(handle, dentry, inode);
if (!err) {
ext4_mark_inode_dirty(handle, inode);
+ unlock_new_inode(inode);
d_instantiate(dentry, inode);
return 0;
}
drop_nlink(inode);
+ unlock_new_inode(inode);
iput(inode);
return err;
}
@@ -1708,138 +2233,236 @@ static int ext4_add_nondir(handle_t *handle,
* If the create succeeds, we fill in the inode information
* with d_instantiate().
*/
-static int ext4_create (struct inode * dir, struct dentry * dentry, int mode,
- struct nameidata *nd)
+static int ext4_create(struct inode *dir, struct dentry *dentry, umode_t mode,
+ bool excl)
{
handle_t *handle;
- struct inode * inode;
- int err, retries = 0;
-
-retry:
- handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
- EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
- 2*EXT4_QUOTA_INIT_BLOCKS(dir->i_sb));
- if (IS_ERR(handle))
- return PTR_ERR(handle);
+ struct inode *inode;
+ int err, credits, retries = 0;
- if (IS_DIRSYNC(dir))
- handle->h_sync = 1;
+ dquot_initialize(dir);
- inode = ext4_new_inode (handle, dir, mode);
+ credits = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
+ EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3);
+retry:
+ inode = ext4_new_inode_start_handle(dir, mode, &dentry->d_name, 0,
+ NULL, EXT4_HT_DIR, credits);
+ handle = ext4_journal_current_handle();
err = PTR_ERR(inode);
if (!IS_ERR(inode)) {
inode->i_op = &ext4_file_inode_operations;
inode->i_fop = &ext4_file_operations;
ext4_set_aops(inode);
err = ext4_add_nondir(handle, dentry, inode);
+ if (!err && IS_DIRSYNC(dir))
+ ext4_handle_sync(handle);
}
- ext4_journal_stop(handle);
+ if (handle)
+ ext4_journal_stop(handle);
if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
goto retry;
return err;
}
-static int ext4_mknod (struct inode * dir, struct dentry *dentry,
- int mode, dev_t rdev)
+static int ext4_mknod(struct inode *dir, struct dentry *dentry,
+ umode_t mode, dev_t rdev)
{
handle_t *handle;
struct inode *inode;
- int err, retries = 0;
+ int err, credits, retries = 0;
if (!new_valid_dev(rdev))
return -EINVAL;
-retry:
- handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
- EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
- 2*EXT4_QUOTA_INIT_BLOCKS(dir->i_sb));
- if (IS_ERR(handle))
- return PTR_ERR(handle);
-
- if (IS_DIRSYNC(dir))
- handle->h_sync = 1;
+ dquot_initialize(dir);
- inode = ext4_new_inode (handle, dir, mode);
+ credits = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
+ EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3);
+retry:
+ inode = ext4_new_inode_start_handle(dir, mode, &dentry->d_name, 0,
+ NULL, EXT4_HT_DIR, credits);
+ handle = ext4_journal_current_handle();
err = PTR_ERR(inode);
if (!IS_ERR(inode)) {
init_special_inode(inode, inode->i_mode, rdev);
-#ifdef CONFIG_EXT4DEV_FS_XATTR
inode->i_op = &ext4_special_inode_operations;
-#endif
err = ext4_add_nondir(handle, dentry, inode);
+ if (!err && IS_DIRSYNC(dir))
+ ext4_handle_sync(handle);
}
- ext4_journal_stop(handle);
+ if (handle)
+ ext4_journal_stop(handle);
if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
goto retry;
return err;
}
-static int ext4_mkdir(struct inode * dir, struct dentry * dentry, int mode)
+static int ext4_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
{
handle_t *handle;
- struct inode * inode;
- struct buffer_head * dir_block;
- struct ext4_dir_entry_2 * de;
+ struct inode *inode;
int err, retries = 0;
- if (EXT4_DIR_LINK_MAX(dir))
- return -EMLINK;
+ dquot_initialize(dir);
retry:
- handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
- EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
- 2*EXT4_QUOTA_INIT_BLOCKS(dir->i_sb));
- if (IS_ERR(handle))
- return PTR_ERR(handle);
+ inode = ext4_new_inode_start_handle(dir, mode,
+ NULL, 0, NULL,
+ EXT4_HT_DIR,
+ EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb) +
+ 4 + EXT4_XATTR_TRANS_BLOCKS);
+ handle = ext4_journal_current_handle();
+ err = PTR_ERR(inode);
+ if (!IS_ERR(inode)) {
+ inode->i_op = &ext4_file_inode_operations;
+ inode->i_fop = &ext4_file_operations;
+ ext4_set_aops(inode);
+ d_tmpfile(dentry, inode);
+ err = ext4_orphan_add(handle, inode);
+ if (err)
+ goto err_unlock_inode;
+ mark_inode_dirty(inode);
+ unlock_new_inode(inode);
+ }
+ if (handle)
+ ext4_journal_stop(handle);
+ if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
+ goto retry;
+ return err;
+err_unlock_inode:
+ ext4_journal_stop(handle);
+ unlock_new_inode(inode);
+ return err;
+}
- if (IS_DIRSYNC(dir))
- handle->h_sync = 1;
+struct ext4_dir_entry_2 *ext4_init_dot_dotdot(struct inode *inode,
+ struct ext4_dir_entry_2 *de,
+ int blocksize, int csum_size,
+ unsigned int parent_ino, int dotdot_real_len)
+{
+ de->inode = cpu_to_le32(inode->i_ino);
+ de->name_len = 1;
+ de->rec_len = ext4_rec_len_to_disk(EXT4_DIR_REC_LEN(de->name_len),
+ blocksize);
+ strcpy(de->name, ".");
+ ext4_set_de_type(inode->i_sb, de, S_IFDIR);
+
+ de = ext4_next_entry(de, blocksize);
+ de->inode = cpu_to_le32(parent_ino);
+ de->name_len = 2;
+ if (!dotdot_real_len)
+ de->rec_len = ext4_rec_len_to_disk(blocksize -
+ (csum_size + EXT4_DIR_REC_LEN(1)),
+ blocksize);
+ else
+ de->rec_len = ext4_rec_len_to_disk(
+ EXT4_DIR_REC_LEN(de->name_len), blocksize);
+ strcpy(de->name, "..");
+ ext4_set_de_type(inode->i_sb, de, S_IFDIR);
+
+ return ext4_next_entry(de, blocksize);
+}
+
+static int ext4_init_new_dir(handle_t *handle, struct inode *dir,
+ struct inode *inode)
+{
+ struct buffer_head *dir_block = NULL;
+ struct ext4_dir_entry_2 *de;
+ struct ext4_dir_entry_tail *t;
+ ext4_lblk_t block = 0;
+ unsigned int blocksize = dir->i_sb->s_blocksize;
+ int csum_size = 0;
+ int err;
- inode = ext4_new_inode (handle, dir, S_IFDIR | mode);
+ if (EXT4_HAS_RO_COMPAT_FEATURE(dir->i_sb,
+ EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+ csum_size = sizeof(struct ext4_dir_entry_tail);
+
+ if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) {
+ err = ext4_try_create_inline_dir(handle, dir, inode);
+ if (err < 0 && err != -ENOSPC)
+ goto out;
+ if (!err)
+ goto out;
+ }
+
+ inode->i_size = 0;
+ dir_block = ext4_append(handle, inode, &block);
+ if (IS_ERR(dir_block))
+ return PTR_ERR(dir_block);
+ BUFFER_TRACE(dir_block, "get_write_access");
+ err = ext4_journal_get_write_access(handle, dir_block);
+ if (err)
+ goto out;
+ de = (struct ext4_dir_entry_2 *)dir_block->b_data;
+ ext4_init_dot_dotdot(inode, de, blocksize, csum_size, dir->i_ino, 0);
+ set_nlink(inode, 2);
+ if (csum_size) {
+ t = EXT4_DIRENT_TAIL(dir_block->b_data, blocksize);
+ initialize_dirent_tail(t, blocksize);
+ }
+
+ BUFFER_TRACE(dir_block, "call ext4_handle_dirty_metadata");
+ err = ext4_handle_dirty_dirent_node(handle, inode, dir_block);
+ if (err)
+ goto out;
+ set_buffer_verified(dir_block);
+out:
+ brelse(dir_block);
+ return err;
+}
+
+static int ext4_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+{
+ handle_t *handle;
+ struct inode *inode;
+ int err, credits, retries = 0;
+
+ if (EXT4_DIR_LINK_MAX(dir))
+ return -EMLINK;
+
+ dquot_initialize(dir);
+
+ credits = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
+ EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3);
+retry:
+ inode = ext4_new_inode_start_handle(dir, S_IFDIR | mode,
+ &dentry->d_name,
+ 0, NULL, EXT4_HT_DIR, credits);
+ handle = ext4_journal_current_handle();
err = PTR_ERR(inode);
if (IS_ERR(inode))
goto out_stop;
inode->i_op = &ext4_dir_inode_operations;
inode->i_fop = &ext4_dir_operations;
- inode->i_size = EXT4_I(inode)->i_disksize = inode->i_sb->s_blocksize;
- dir_block = ext4_bread (handle, inode, 0, 1, &err);
- if (!dir_block)
+ err = ext4_init_new_dir(handle, dir, inode);
+ if (err)
goto out_clear_inode;
- BUFFER_TRACE(dir_block, "get_write_access");
- ext4_journal_get_write_access(handle, dir_block);
- de = (struct ext4_dir_entry_2 *) dir_block->b_data;
- de->inode = cpu_to_le32(inode->i_ino);
- de->name_len = 1;
- de->rec_len = ext4_rec_len_to_disk(EXT4_DIR_REC_LEN(de->name_len));
- strcpy (de->name, ".");
- ext4_set_de_type(dir->i_sb, de, S_IFDIR);
- de = ext4_next_entry(de);
- de->inode = cpu_to_le32(dir->i_ino);
- de->rec_len = ext4_rec_len_to_disk(inode->i_sb->s_blocksize -
- EXT4_DIR_REC_LEN(1));
- de->name_len = 2;
- strcpy (de->name, "..");
- ext4_set_de_type(dir->i_sb, de, S_IFDIR);
- inode->i_nlink = 2;
- BUFFER_TRACE(dir_block, "call ext4_journal_dirty_metadata");
- ext4_journal_dirty_metadata(handle, dir_block);
- brelse (dir_block);
- ext4_mark_inode_dirty(handle, inode);
- err = ext4_add_entry (handle, dentry, inode);
+ err = ext4_mark_inode_dirty(handle, inode);
+ if (!err)
+ err = ext4_add_entry(handle, dentry, inode);
if (err) {
out_clear_inode:
clear_nlink(inode);
+ unlock_new_inode(inode);
ext4_mark_inode_dirty(handle, inode);
- iput (inode);
+ iput(inode);
goto out_stop;
}
ext4_inc_count(handle, dir);
ext4_update_dx_flag(dir);
- ext4_mark_inode_dirty(handle, dir);
+ err = ext4_mark_inode_dirty(handle, dir);
+ if (err)
+ goto out_clear_inode;
+ unlock_new_inode(inode);
d_instantiate(dentry, inode);
+ if (IS_DIRSYNC(dir))
+ ext4_handle_sync(handle);
+
out_stop:
- ext4_journal_stop(handle);
+ if (handle)
+ ext4_journal_stop(handle);
if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
goto retry;
return err;
@@ -1848,141 +2471,161 @@ out_stop:
/*
* routine to check that the specified directory is empty (for rmdir)
*/
-static int empty_dir (struct inode * inode)
+static int empty_dir(struct inode *inode)
{
- unsigned long offset;
- struct buffer_head * bh;
- struct ext4_dir_entry_2 * de, * de1;
- struct super_block * sb;
+ unsigned int offset;
+ struct buffer_head *bh;
+ struct ext4_dir_entry_2 *de, *de1;
+ struct super_block *sb;
int err = 0;
+ if (ext4_has_inline_data(inode)) {
+ int has_inline_data = 1;
+
+ err = empty_inline_dir(inode, &has_inline_data);
+ if (has_inline_data)
+ return err;
+ }
+
sb = inode->i_sb;
- if (inode->i_size < EXT4_DIR_REC_LEN(1) + EXT4_DIR_REC_LEN(2) ||
- !(bh = ext4_bread (NULL, inode, 0, 0, &err))) {
- if (err)
- ext4_error(inode->i_sb, __FUNCTION__,
- "error %d reading directory #%lu offset 0",
- err, inode->i_ino);
- else
- ext4_warning(inode->i_sb, __FUNCTION__,
- "bad directory (dir #%lu) - no data block",
- inode->i_ino);
+ if (inode->i_size < EXT4_DIR_REC_LEN(1) + EXT4_DIR_REC_LEN(2)) {
+ EXT4_ERROR_INODE(inode, "invalid size");
return 1;
}
+ bh = ext4_read_dirblock(inode, 0, EITHER);
+ if (IS_ERR(bh))
+ return 1;
+
de = (struct ext4_dir_entry_2 *) bh->b_data;
- de1 = ext4_next_entry(de);
+ de1 = ext4_next_entry(de, sb->s_blocksize);
if (le32_to_cpu(de->inode) != inode->i_ino ||
!le32_to_cpu(de1->inode) ||
- strcmp (".", de->name) ||
- strcmp ("..", de1->name)) {
- ext4_warning (inode->i_sb, "empty_dir",
- "bad directory (dir #%lu) - no `.' or `..'",
- inode->i_ino);
- brelse (bh);
+ strcmp(".", de->name) ||
+ strcmp("..", de1->name)) {
+ ext4_warning(inode->i_sb,
+ "bad directory (dir #%lu) - no `.' or `..'",
+ inode->i_ino);
+ brelse(bh);
return 1;
}
- offset = ext4_rec_len_from_disk(de->rec_len) +
- ext4_rec_len_from_disk(de1->rec_len);
- de = ext4_next_entry(de1);
- while (offset < inode->i_size ) {
- if (!bh ||
- (void *) de >= (void *) (bh->b_data+sb->s_blocksize)) {
+ offset = ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize) +
+ ext4_rec_len_from_disk(de1->rec_len, sb->s_blocksize);
+ de = ext4_next_entry(de1, sb->s_blocksize);
+ while (offset < inode->i_size) {
+ if ((void *) de >= (void *) (bh->b_data+sb->s_blocksize)) {
+ unsigned int lblock;
err = 0;
- brelse (bh);
- bh = ext4_bread (NULL, inode,
- offset >> EXT4_BLOCK_SIZE_BITS(sb), 0, &err);
- if (!bh) {
- if (err)
- ext4_error(sb, __FUNCTION__,
- "error %d reading directory"
- " #%lu offset %lu",
- err, inode->i_ino, offset);
- offset += sb->s_blocksize;
- continue;
- }
+ brelse(bh);
+ lblock = offset >> EXT4_BLOCK_SIZE_BITS(sb);
+ bh = ext4_read_dirblock(inode, lblock, EITHER);
+ if (IS_ERR(bh))
+ return 1;
de = (struct ext4_dir_entry_2 *) bh->b_data;
}
- if (!ext4_check_dir_entry("empty_dir", inode, de, bh, offset)) {
+ if (ext4_check_dir_entry(inode, NULL, de, bh,
+ bh->b_data, bh->b_size, offset)) {
de = (struct ext4_dir_entry_2 *)(bh->b_data +
sb->s_blocksize);
offset = (offset | (sb->s_blocksize - 1)) + 1;
continue;
}
if (le32_to_cpu(de->inode)) {
- brelse (bh);
+ brelse(bh);
return 0;
}
- offset += ext4_rec_len_from_disk(de->rec_len);
- de = ext4_next_entry(de);
+ offset += ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize);
+ de = ext4_next_entry(de, sb->s_blocksize);
}
- brelse (bh);
+ brelse(bh);
return 1;
}
-/* ext4_orphan_add() links an unlinked or truncated inode into a list of
+/*
+ * ext4_orphan_add() links an unlinked or truncated inode into a list of
* such inodes, starting at the superblock, in case we crash before the
* file is closed/deleted, or in case the inode truncate spans multiple
* transactions and the last transaction is not recovered after a crash.
*
* At filesystem recovery time, we walk this list deleting unlinked
* inodes and truncating linked inodes in ext4_orphan_cleanup().
+ *
+ * Orphan list manipulation functions must be called under i_mutex unless
+ * we are just creating the inode or deleting it.
*/
int ext4_orphan_add(handle_t *handle, struct inode *inode)
{
struct super_block *sb = inode->i_sb;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
struct ext4_iloc iloc;
int err = 0, rc;
+ bool dirty = false;
- lock_super(sb);
- if (!list_empty(&EXT4_I(inode)->i_orphan))
- goto out_unlock;
+ if (!sbi->s_journal)
+ return 0;
- /* Orphan handling is only valid for files with data blocks
- * being truncated, or files being unlinked. */
+ WARN_ON_ONCE(!(inode->i_state & (I_NEW | I_FREEING)) &&
+ !mutex_is_locked(&inode->i_mutex));
+ /*
+ * Exit early if inode already is on orphan list. This is a big speedup
+ * since we don't have to contend on the global s_orphan_lock.
+ */
+ if (!list_empty(&EXT4_I(inode)->i_orphan))
+ return 0;
- /* @@@ FIXME: Observation from aviro:
- * I think I can trigger J_ASSERT in ext4_orphan_add(). We block
- * here (on lock_super()), so race with ext4_link() which might bump
- * ->i_nlink. For, say it, character device. Not a regular file,
- * not a directory, not a symlink and ->i_nlink > 0.
+ /*
+ * Orphan handling is only valid for files with data blocks
+ * being truncated, or files being unlinked. Note that we either
+ * hold i_mutex, or the inode can not be referenced from outside,
+ * so i_nlink should not be bumped due to race
*/
- J_ASSERT ((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
- S_ISLNK(inode->i_mode)) || inode->i_nlink == 0);
+ J_ASSERT((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
+ S_ISLNK(inode->i_mode)) || inode->i_nlink == 0);
- BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get_write_access");
- err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh);
+ BUFFER_TRACE(sbi->s_sbh, "get_write_access");
+ err = ext4_journal_get_write_access(handle, sbi->s_sbh);
if (err)
- goto out_unlock;
+ goto out;
err = ext4_reserve_inode_write(handle, inode, &iloc);
if (err)
- goto out_unlock;
-
- /* Insert this inode at the head of the on-disk orphan list... */
- NEXT_ORPHAN(inode) = le32_to_cpu(EXT4_SB(sb)->s_es->s_last_orphan);
- EXT4_SB(sb)->s_es->s_last_orphan = cpu_to_le32(inode->i_ino);
- err = ext4_journal_dirty_metadata(handle, EXT4_SB(sb)->s_sbh);
- rc = ext4_mark_iloc_dirty(handle, inode, &iloc);
- if (!err)
- err = rc;
-
- /* Only add to the head of the in-memory list if all the
- * previous operations succeeded. If the orphan_add is going to
- * fail (possibly taking the journal offline), we can't risk
- * leaving the inode on the orphan list: stray orphan-list
- * entries can cause panics at unmount time.
- *
- * This is safe: on error we're going to ignore the orphan list
- * anyway on the next recovery. */
- if (!err)
- list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan);
+ goto out;
+ mutex_lock(&sbi->s_orphan_lock);
+ /*
+ * Due to previous errors inode may be already a part of on-disk
+ * orphan list. If so skip on-disk list modification.
+ */
+ if (!NEXT_ORPHAN(inode) || NEXT_ORPHAN(inode) >
+ (le32_to_cpu(sbi->s_es->s_inodes_count))) {
+ /* Insert this inode at the head of the on-disk orphan list */
+ NEXT_ORPHAN(inode) = le32_to_cpu(sbi->s_es->s_last_orphan);
+ sbi->s_es->s_last_orphan = cpu_to_le32(inode->i_ino);
+ dirty = true;
+ }
+ list_add(&EXT4_I(inode)->i_orphan, &sbi->s_orphan);
+ mutex_unlock(&sbi->s_orphan_lock);
+
+ if (dirty) {
+ err = ext4_handle_dirty_super(handle, sb);
+ rc = ext4_mark_iloc_dirty(handle, inode, &iloc);
+ if (!err)
+ err = rc;
+ if (err) {
+ /*
+ * We have to remove inode from in-memory list if
+ * addition to on disk orphan list failed. Stray orphan
+ * list entries can cause panics at unmount time.
+ */
+ mutex_lock(&sbi->s_orphan_lock);
+ list_del(&EXT4_I(inode)->i_orphan);
+ mutex_unlock(&sbi->s_orphan_lock);
+ }
+ }
jbd_debug(4, "superblock will point to %lu\n", inode->i_ino);
jbd_debug(4, "orphan inode %lu will point to %d\n",
inode->i_ino, NEXT_ORPHAN(inode));
-out_unlock:
- unlock_super(sb);
- ext4_std_error(inode->i_sb, err);
+out:
+ ext4_std_error(sb, err);
return err;
}
@@ -1994,66 +2637,74 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode)
{
struct list_head *prev;
struct ext4_inode_info *ei = EXT4_I(inode);
- struct ext4_sb_info *sbi;
- unsigned long ino_next;
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ __u32 ino_next;
struct ext4_iloc iloc;
int err = 0;
- lock_super(inode->i_sb);
- if (list_empty(&ei->i_orphan)) {
- unlock_super(inode->i_sb);
+ if (!sbi->s_journal && !(sbi->s_mount_state & EXT4_ORPHAN_FS))
return 0;
- }
- ino_next = NEXT_ORPHAN(inode);
- prev = ei->i_orphan.prev;
- sbi = EXT4_SB(inode->i_sb);
+ WARN_ON_ONCE(!(inode->i_state & (I_NEW | I_FREEING)) &&
+ !mutex_is_locked(&inode->i_mutex));
+ /* Do this quick check before taking global s_orphan_lock. */
+ if (list_empty(&ei->i_orphan))
+ return 0;
+
+ if (handle) {
+ /* Grab inode buffer early before taking global s_orphan_lock */
+ err = ext4_reserve_inode_write(handle, inode, &iloc);
+ }
+ mutex_lock(&sbi->s_orphan_lock);
jbd_debug(4, "remove inode %lu from orphan list\n", inode->i_ino);
+ prev = ei->i_orphan.prev;
list_del_init(&ei->i_orphan);
/* If we're on an error path, we may not have a valid
* transaction handle with which to update the orphan list on
* disk, but we still need to remove the inode from the linked
* list in memory. */
- if (!handle)
- goto out;
-
- err = ext4_reserve_inode_write(handle, inode, &iloc);
- if (err)
+ if (!handle || err) {
+ mutex_unlock(&sbi->s_orphan_lock);
goto out_err;
+ }
+ ino_next = NEXT_ORPHAN(inode);
if (prev == &sbi->s_orphan) {
- jbd_debug(4, "superblock will point to %lu\n", ino_next);
+ jbd_debug(4, "superblock will point to %u\n", ino_next);
BUFFER_TRACE(sbi->s_sbh, "get_write_access");
err = ext4_journal_get_write_access(handle, sbi->s_sbh);
- if (err)
+ if (err) {
+ mutex_unlock(&sbi->s_orphan_lock);
goto out_brelse;
+ }
sbi->s_es->s_last_orphan = cpu_to_le32(ino_next);
- err = ext4_journal_dirty_metadata(handle, sbi->s_sbh);
+ mutex_unlock(&sbi->s_orphan_lock);
+ err = ext4_handle_dirty_super(handle, inode->i_sb);
} else {
struct ext4_iloc iloc2;
struct inode *i_prev =
&list_entry(prev, struct ext4_inode_info, i_orphan)->vfs_inode;
- jbd_debug(4, "orphan inode %lu will point to %lu\n",
+ jbd_debug(4, "orphan inode %lu will point to %u\n",
i_prev->i_ino, ino_next);
err = ext4_reserve_inode_write(handle, i_prev, &iloc2);
- if (err)
+ if (err) {
+ mutex_unlock(&sbi->s_orphan_lock);
goto out_brelse;
+ }
NEXT_ORPHAN(i_prev) = ino_next;
err = ext4_mark_iloc_dirty(handle, i_prev, &iloc2);
+ mutex_unlock(&sbi->s_orphan_lock);
}
if (err)
goto out_brelse;
NEXT_ORPHAN(inode) = 0;
err = ext4_mark_iloc_dirty(handle, inode, &iloc);
-
out_err:
ext4_std_error(inode->i_sb, err);
-out:
- unlock_super(inode->i_sb);
return err;
out_brelse:
@@ -2061,29 +2712,24 @@ out_brelse:
goto out_err;
}
-static int ext4_rmdir (struct inode * dir, struct dentry *dentry)
+static int ext4_rmdir(struct inode *dir, struct dentry *dentry)
{
int retval;
- struct inode * inode;
- struct buffer_head * bh;
- struct ext4_dir_entry_2 * de;
- handle_t *handle;
+ struct inode *inode;
+ struct buffer_head *bh;
+ struct ext4_dir_entry_2 *de;
+ handle_t *handle = NULL;
/* Initialize quotas before so that eventual writes go in
* separate transaction */
- DQUOT_INIT(dentry->d_inode);
- handle = ext4_journal_start(dir, EXT4_DELETE_TRANS_BLOCKS(dir->i_sb));
- if (IS_ERR(handle))
- return PTR_ERR(handle);
+ dquot_initialize(dir);
+ dquot_initialize(dentry->d_inode);
retval = -ENOENT;
- bh = ext4_find_entry (dentry, &de);
+ bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL);
if (!bh)
goto end_rmdir;
- if (IS_DIRSYNC(dir))
- handle->h_sync = 1;
-
inode = dentry->d_inode;
retval = -EIO;
@@ -2091,16 +2737,27 @@ static int ext4_rmdir (struct inode * dir, struct dentry *dentry)
goto end_rmdir;
retval = -ENOTEMPTY;
- if (!empty_dir (inode))
+ if (!empty_dir(inode))
goto end_rmdir;
+ handle = ext4_journal_start(dir, EXT4_HT_DIR,
+ EXT4_DATA_TRANS_BLOCKS(dir->i_sb));
+ if (IS_ERR(handle)) {
+ retval = PTR_ERR(handle);
+ handle = NULL;
+ goto end_rmdir;
+ }
+
+ if (IS_DIRSYNC(dir))
+ ext4_handle_sync(handle);
+
retval = ext4_delete_entry(handle, dir, de, bh);
if (retval)
goto end_rmdir;
if (!EXT4_DIR_LINK_EMPTY(inode))
- ext4_warning (inode->i_sb, "ext4_rmdir",
- "empty directory has too many links (%d)",
- inode->i_nlink);
+ ext4_warning(inode->i_sb,
+ "empty directory has too many links (%d)",
+ inode->i_nlink);
inode->i_version++;
clear_nlink(inode);
/* There's no need to set i_disksize: the fact that i_nlink is
@@ -2115,31 +2772,28 @@ static int ext4_rmdir (struct inode * dir, struct dentry *dentry)
ext4_mark_inode_dirty(handle, dir);
end_rmdir:
- ext4_journal_stop(handle);
- brelse (bh);
+ brelse(bh);
+ if (handle)
+ ext4_journal_stop(handle);
return retval;
}
-static int ext4_unlink(struct inode * dir, struct dentry *dentry)
+static int ext4_unlink(struct inode *dir, struct dentry *dentry)
{
int retval;
- struct inode * inode;
- struct buffer_head * bh;
- struct ext4_dir_entry_2 * de;
- handle_t *handle;
+ struct inode *inode;
+ struct buffer_head *bh;
+ struct ext4_dir_entry_2 *de;
+ handle_t *handle = NULL;
+ trace_ext4_unlink_enter(dir, dentry);
/* Initialize quotas before so that eventual writes go
* in separate transaction */
- DQUOT_INIT(dentry->d_inode);
- handle = ext4_journal_start(dir, EXT4_DELETE_TRANS_BLOCKS(dir->i_sb));
- if (IS_ERR(handle))
- return PTR_ERR(handle);
-
- if (IS_DIRSYNC(dir))
- handle->h_sync = 1;
+ dquot_initialize(dir);
+ dquot_initialize(dentry->d_inode);
retval = -ENOENT;
- bh = ext4_find_entry (dentry, &de);
+ bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL);
if (!bh)
goto end_unlink;
@@ -2149,11 +2803,22 @@ static int ext4_unlink(struct inode * dir, struct dentry *dentry)
if (le32_to_cpu(de->inode) != inode->i_ino)
goto end_unlink;
+ handle = ext4_journal_start(dir, EXT4_HT_DIR,
+ EXT4_DATA_TRANS_BLOCKS(dir->i_sb));
+ if (IS_ERR(handle)) {
+ retval = PTR_ERR(handle);
+ handle = NULL;
+ goto end_unlink;
+ }
+
+ if (IS_DIRSYNC(dir))
+ ext4_handle_sync(handle);
+
if (!inode->i_nlink) {
- ext4_warning (inode->i_sb, "ext4_unlink",
- "Deleting nonexistent file (%lu), %d",
- inode->i_ino, inode->i_nlink);
- inode->i_nlink = 1;
+ ext4_warning(inode->i_sb,
+ "Deleting nonexistent file (%lu), %d",
+ inode->i_ino, inode->i_nlink);
+ set_nlink(inode, 1);
}
retval = ext4_delete_entry(handle, dir, de, bh);
if (retval)
@@ -2169,264 +2834,614 @@ static int ext4_unlink(struct inode * dir, struct dentry *dentry)
retval = 0;
end_unlink:
- ext4_journal_stop(handle);
- brelse (bh);
+ brelse(bh);
+ if (handle)
+ ext4_journal_stop(handle);
+ trace_ext4_unlink_exit(dentry, retval);
return retval;
}
-static int ext4_symlink (struct inode * dir,
- struct dentry *dentry, const char * symname)
+static int ext4_symlink(struct inode *dir,
+ struct dentry *dentry, const char *symname)
{
handle_t *handle;
- struct inode * inode;
+ struct inode *inode;
int l, err, retries = 0;
+ int credits;
l = strlen(symname)+1;
if (l > dir->i_sb->s_blocksize)
return -ENAMETOOLONG;
-retry:
- handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
- EXT4_INDEX_EXTRA_TRANS_BLOCKS + 5 +
- 2*EXT4_QUOTA_INIT_BLOCKS(dir->i_sb));
- if (IS_ERR(handle))
- return PTR_ERR(handle);
-
- if (IS_DIRSYNC(dir))
- handle->h_sync = 1;
+ dquot_initialize(dir);
- inode = ext4_new_inode (handle, dir, S_IFLNK|S_IRWXUGO);
+ if (l > EXT4_N_BLOCKS * 4) {
+ /*
+ * For non-fast symlinks, we just allocate inode and put it on
+ * orphan list in the first transaction => we need bitmap,
+ * group descriptor, sb, inode block, quota blocks, and
+ * possibly selinux xattr blocks.
+ */
+ credits = 4 + EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb) +
+ EXT4_XATTR_TRANS_BLOCKS;
+ } else {
+ /*
+ * Fast symlink. We have to add entry to directory
+ * (EXT4_DATA_TRANS_BLOCKS + EXT4_INDEX_EXTRA_TRANS_BLOCKS),
+ * allocate new inode (bitmap, group descriptor, inode block,
+ * quota blocks, sb is already counted in previous macros).
+ */
+ credits = EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
+ EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3;
+ }
+retry:
+ inode = ext4_new_inode_start_handle(dir, S_IFLNK|S_IRWXUGO,
+ &dentry->d_name, 0, NULL,
+ EXT4_HT_DIR, credits);
+ handle = ext4_journal_current_handle();
err = PTR_ERR(inode);
if (IS_ERR(inode))
goto out_stop;
- if (l > sizeof (EXT4_I(inode)->i_data)) {
+ if (l > EXT4_N_BLOCKS * 4) {
inode->i_op = &ext4_symlink_inode_operations;
ext4_set_aops(inode);
/*
- * page_symlink() calls into ext4_prepare/commit_write.
- * We have a transaction open. All is sweetness. It also sets
- * i_size in generic_commit_write().
+ * We cannot call page_symlink() with transaction started
+ * because it calls into ext4_write_begin() which can wait
+ * for transaction commit if we are running out of space
+ * and thus we deadlock. So we have to stop transaction now
+ * and restart it when symlink contents is written.
+ *
+ * To keep fs consistent in case of crash, we have to put inode
+ * to orphan list in the mean time.
+ */
+ drop_nlink(inode);
+ err = ext4_orphan_add(handle, inode);
+ ext4_journal_stop(handle);
+ if (err)
+ goto err_drop_inode;
+ err = __page_symlink(inode, symname, l, 1);
+ if (err)
+ goto err_drop_inode;
+ /*
+ * Now inode is being linked into dir (EXT4_DATA_TRANS_BLOCKS
+ * + EXT4_INDEX_EXTRA_TRANS_BLOCKS), inode is also modified
*/
- err = __page_symlink(inode, symname, l,
- mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS);
+ handle = ext4_journal_start(dir, EXT4_HT_DIR,
+ EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
+ EXT4_INDEX_EXTRA_TRANS_BLOCKS + 1);
+ if (IS_ERR(handle)) {
+ err = PTR_ERR(handle);
+ goto err_drop_inode;
+ }
+ set_nlink(inode, 1);
+ err = ext4_orphan_del(handle, inode);
if (err) {
+ ext4_journal_stop(handle);
clear_nlink(inode);
- ext4_mark_inode_dirty(handle, inode);
- iput (inode);
- goto out_stop;
+ goto err_drop_inode;
}
} else {
+ /* clear the extent format for fast symlink */
+ ext4_clear_inode_flag(inode, EXT4_INODE_EXTENTS);
inode->i_op = &ext4_fast_symlink_inode_operations;
- memcpy((char*)&EXT4_I(inode)->i_data,symname,l);
+ memcpy((char *)&EXT4_I(inode)->i_data, symname, l);
inode->i_size = l-1;
}
EXT4_I(inode)->i_disksize = inode->i_size;
err = ext4_add_nondir(handle, dentry, inode);
+ if (!err && IS_DIRSYNC(dir))
+ ext4_handle_sync(handle);
+
out_stop:
- ext4_journal_stop(handle);
+ if (handle)
+ ext4_journal_stop(handle);
if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
goto retry;
return err;
+err_drop_inode:
+ unlock_new_inode(inode);
+ iput(inode);
+ return err;
}
-static int ext4_link (struct dentry * old_dentry,
- struct inode * dir, struct dentry *dentry)
+static int ext4_link(struct dentry *old_dentry,
+ struct inode *dir, struct dentry *dentry)
{
handle_t *handle;
struct inode *inode = old_dentry->d_inode;
int err, retries = 0;
- if (EXT4_DIR_LINK_MAX(inode))
+ if (inode->i_nlink >= EXT4_LINK_MAX)
return -EMLINK;
- /*
- * Return -ENOENT if we've raced with unlink and i_nlink is 0. Doing
- * otherwise has the potential to corrupt the orphan inode list.
- */
- if (inode->i_nlink == 0)
- return -ENOENT;
+ dquot_initialize(dir);
retry:
- handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
- EXT4_INDEX_EXTRA_TRANS_BLOCKS);
+ handle = ext4_journal_start(dir, EXT4_HT_DIR,
+ (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
+ EXT4_INDEX_EXTRA_TRANS_BLOCKS) + 1);
if (IS_ERR(handle))
return PTR_ERR(handle);
if (IS_DIRSYNC(dir))
- handle->h_sync = 1;
+ ext4_handle_sync(handle);
inode->i_ctime = ext4_current_time(inode);
ext4_inc_count(handle, inode);
- atomic_inc(&inode->i_count);
+ ihold(inode);
- err = ext4_add_nondir(handle, dentry, inode);
+ err = ext4_add_entry(handle, dentry, inode);
+ if (!err) {
+ ext4_mark_inode_dirty(handle, inode);
+ /* this can happen only for tmpfile being
+ * linked the first time
+ */
+ if (inode->i_nlink == 1)
+ ext4_orphan_del(handle, inode);
+ d_instantiate(dentry, inode);
+ } else {
+ drop_nlink(inode);
+ iput(inode);
+ }
ext4_journal_stop(handle);
if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
goto retry;
return err;
}
-#define PARENT_INO(buffer) \
- (ext4_next_entry((struct ext4_dir_entry_2 *)(buffer))->inode)
+
+/*
+ * Try to find buffer head where contains the parent block.
+ * It should be the inode block if it is inlined or the 1st block
+ * if it is a normal dir.
+ */
+static struct buffer_head *ext4_get_first_dir_block(handle_t *handle,
+ struct inode *inode,
+ int *retval,
+ struct ext4_dir_entry_2 **parent_de,
+ int *inlined)
+{
+ struct buffer_head *bh;
+
+ if (!ext4_has_inline_data(inode)) {
+ bh = ext4_read_dirblock(inode, 0, EITHER);
+ if (IS_ERR(bh)) {
+ *retval = PTR_ERR(bh);
+ return NULL;
+ }
+ *parent_de = ext4_next_entry(
+ (struct ext4_dir_entry_2 *)bh->b_data,
+ inode->i_sb->s_blocksize);
+ return bh;
+ }
+
+ *inlined = 1;
+ return ext4_get_first_inline_block(inode, parent_de, retval);
+}
+
+struct ext4_renament {
+ struct inode *dir;
+ struct dentry *dentry;
+ struct inode *inode;
+ bool is_dir;
+ int dir_nlink_delta;
+
+ /* entry for "dentry" */
+ struct buffer_head *bh;
+ struct ext4_dir_entry_2 *de;
+ int inlined;
+
+ /* entry for ".." in inode if it's a directory */
+ struct buffer_head *dir_bh;
+ struct ext4_dir_entry_2 *parent_de;
+ int dir_inlined;
+};
+
+static int ext4_rename_dir_prepare(handle_t *handle, struct ext4_renament *ent)
+{
+ int retval;
+
+ ent->dir_bh = ext4_get_first_dir_block(handle, ent->inode,
+ &retval, &ent->parent_de,
+ &ent->dir_inlined);
+ if (!ent->dir_bh)
+ return retval;
+ if (le32_to_cpu(ent->parent_de->inode) != ent->dir->i_ino)
+ return -EIO;
+ BUFFER_TRACE(ent->dir_bh, "get_write_access");
+ return ext4_journal_get_write_access(handle, ent->dir_bh);
+}
+
+static int ext4_rename_dir_finish(handle_t *handle, struct ext4_renament *ent,
+ unsigned dir_ino)
+{
+ int retval;
+
+ ent->parent_de->inode = cpu_to_le32(dir_ino);
+ BUFFER_TRACE(ent->dir_bh, "call ext4_handle_dirty_metadata");
+ if (!ent->dir_inlined) {
+ if (is_dx(ent->inode)) {
+ retval = ext4_handle_dirty_dx_node(handle,
+ ent->inode,
+ ent->dir_bh);
+ } else {
+ retval = ext4_handle_dirty_dirent_node(handle,
+ ent->inode,
+ ent->dir_bh);
+ }
+ } else {
+ retval = ext4_mark_inode_dirty(handle, ent->inode);
+ }
+ if (retval) {
+ ext4_std_error(ent->dir->i_sb, retval);
+ return retval;
+ }
+ return 0;
+}
+
+static int ext4_setent(handle_t *handle, struct ext4_renament *ent,
+ unsigned ino, unsigned file_type)
+{
+ int retval;
+
+ BUFFER_TRACE(ent->bh, "get write access");
+ retval = ext4_journal_get_write_access(handle, ent->bh);
+ if (retval)
+ return retval;
+ ent->de->inode = cpu_to_le32(ino);
+ if (EXT4_HAS_INCOMPAT_FEATURE(ent->dir->i_sb,
+ EXT4_FEATURE_INCOMPAT_FILETYPE))
+ ent->de->file_type = file_type;
+ ent->dir->i_version++;
+ ent->dir->i_ctime = ent->dir->i_mtime =
+ ext4_current_time(ent->dir);
+ ext4_mark_inode_dirty(handle, ent->dir);
+ BUFFER_TRACE(ent->bh, "call ext4_handle_dirty_metadata");
+ if (!ent->inlined) {
+ retval = ext4_handle_dirty_dirent_node(handle,
+ ent->dir, ent->bh);
+ if (unlikely(retval)) {
+ ext4_std_error(ent->dir->i_sb, retval);
+ return retval;
+ }
+ }
+ brelse(ent->bh);
+ ent->bh = NULL;
+
+ return 0;
+}
+
+static int ext4_find_delete_entry(handle_t *handle, struct inode *dir,
+ const struct qstr *d_name)
+{
+ int retval = -ENOENT;
+ struct buffer_head *bh;
+ struct ext4_dir_entry_2 *de;
+
+ bh = ext4_find_entry(dir, d_name, &de, NULL);
+ if (bh) {
+ retval = ext4_delete_entry(handle, dir, de, bh);
+ brelse(bh);
+ }
+ return retval;
+}
+
+static void ext4_rename_delete(handle_t *handle, struct ext4_renament *ent)
+{
+ int retval;
+ /*
+ * ent->de could have moved from under us during htree split, so make
+ * sure that we are deleting the right entry. We might also be pointing
+ * to a stale entry in the unused part of ent->bh so just checking inum
+ * and the name isn't enough.
+ */
+ if (le32_to_cpu(ent->de->inode) != ent->inode->i_ino ||
+ ent->de->name_len != ent->dentry->d_name.len ||
+ strncmp(ent->de->name, ent->dentry->d_name.name,
+ ent->de->name_len)) {
+ retval = ext4_find_delete_entry(handle, ent->dir,
+ &ent->dentry->d_name);
+ } else {
+ retval = ext4_delete_entry(handle, ent->dir, ent->de, ent->bh);
+ if (retval == -ENOENT) {
+ retval = ext4_find_delete_entry(handle, ent->dir,
+ &ent->dentry->d_name);
+ }
+ }
+
+ if (retval) {
+ ext4_warning(ent->dir->i_sb,
+ "Deleting old file (%lu), %d, error=%d",
+ ent->dir->i_ino, ent->dir->i_nlink, retval);
+ }
+}
+
+static void ext4_update_dir_count(handle_t *handle, struct ext4_renament *ent)
+{
+ if (ent->dir_nlink_delta) {
+ if (ent->dir_nlink_delta == -1)
+ ext4_dec_count(handle, ent->dir);
+ else
+ ext4_inc_count(handle, ent->dir);
+ ext4_mark_inode_dirty(handle, ent->dir);
+ }
+}
/*
* Anybody can rename anything with this: the permission checks are left to the
* higher-level routines.
+ *
+ * n.b. old_{dentry,inode) refers to the source dentry/inode
+ * while new_{dentry,inode) refers to the destination dentry/inode
+ * This comes from rename(const char *oldpath, const char *newpath)
*/
-static int ext4_rename (struct inode * old_dir, struct dentry *old_dentry,
- struct inode * new_dir,struct dentry *new_dentry)
+static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
+ struct inode *new_dir, struct dentry *new_dentry)
{
- handle_t *handle;
- struct inode * old_inode, * new_inode;
- struct buffer_head * old_bh, * new_bh, * dir_bh;
- struct ext4_dir_entry_2 * old_de, * new_de;
+ handle_t *handle = NULL;
+ struct ext4_renament old = {
+ .dir = old_dir,
+ .dentry = old_dentry,
+ .inode = old_dentry->d_inode,
+ };
+ struct ext4_renament new = {
+ .dir = new_dir,
+ .dentry = new_dentry,
+ .inode = new_dentry->d_inode,
+ };
int retval;
- old_bh = new_bh = dir_bh = NULL;
+ dquot_initialize(old.dir);
+ dquot_initialize(new.dir);
/* Initialize quotas before so that eventual writes go
* in separate transaction */
- if (new_dentry->d_inode)
- DQUOT_INIT(new_dentry->d_inode);
- handle = ext4_journal_start(old_dir, 2 *
- EXT4_DATA_TRANS_BLOCKS(old_dir->i_sb) +
- EXT4_INDEX_EXTRA_TRANS_BLOCKS + 2);
- if (IS_ERR(handle))
- return PTR_ERR(handle);
-
- if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir))
- handle->h_sync = 1;
+ if (new.inode)
+ dquot_initialize(new.inode);
- old_bh = ext4_find_entry (old_dentry, &old_de);
+ old.bh = ext4_find_entry(old.dir, &old.dentry->d_name, &old.de, NULL);
/*
* Check for inode number is _not_ due to possible IO errors.
* We might rmdir the source, keep it as pwd of some process
* and merrily kill the link to whatever was created under the
* same name. Goodbye sticky bit ;-<
*/
- old_inode = old_dentry->d_inode;
retval = -ENOENT;
- if (!old_bh || le32_to_cpu(old_de->inode) != old_inode->i_ino)
+ if (!old.bh || le32_to_cpu(old.de->inode) != old.inode->i_ino)
goto end_rename;
- new_inode = new_dentry->d_inode;
- new_bh = ext4_find_entry (new_dentry, &new_de);
- if (new_bh) {
- if (!new_inode) {
- brelse (new_bh);
- new_bh = NULL;
+ new.bh = ext4_find_entry(new.dir, &new.dentry->d_name,
+ &new.de, &new.inlined);
+ if (new.bh) {
+ if (!new.inode) {
+ brelse(new.bh);
+ new.bh = NULL;
}
}
- if (S_ISDIR(old_inode->i_mode)) {
- if (new_inode) {
+ if (new.inode && !test_opt(new.dir->i_sb, NO_AUTO_DA_ALLOC))
+ ext4_alloc_da_blocks(old.inode);
+
+ handle = ext4_journal_start(old.dir, EXT4_HT_DIR,
+ (2 * EXT4_DATA_TRANS_BLOCKS(old.dir->i_sb) +
+ EXT4_INDEX_EXTRA_TRANS_BLOCKS + 2));
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
+
+ if (IS_DIRSYNC(old.dir) || IS_DIRSYNC(new.dir))
+ ext4_handle_sync(handle);
+
+ if (S_ISDIR(old.inode->i_mode)) {
+ if (new.inode) {
retval = -ENOTEMPTY;
- if (!empty_dir (new_inode))
+ if (!empty_dir(new.inode))
+ goto end_rename;
+ } else {
+ retval = -EMLINK;
+ if (new.dir != old.dir && EXT4_DIR_LINK_MAX(new.dir))
goto end_rename;
}
- retval = -EIO;
- dir_bh = ext4_bread (handle, old_inode, 0, 0, &retval);
- if (!dir_bh)
- goto end_rename;
- if (le32_to_cpu(PARENT_INO(dir_bh->b_data)) != old_dir->i_ino)
- goto end_rename;
- retval = -EMLINK;
- if (!new_inode && new_dir!=old_dir &&
- new_dir->i_nlink >= EXT4_LINK_MAX)
+ retval = ext4_rename_dir_prepare(handle, &old);
+ if (retval)
goto end_rename;
}
- if (!new_bh) {
- retval = ext4_add_entry (handle, new_dentry, old_inode);
+ if (!new.bh) {
+ retval = ext4_add_entry(handle, new.dentry, old.inode);
if (retval)
goto end_rename;
} else {
- BUFFER_TRACE(new_bh, "get write access");
- ext4_journal_get_write_access(handle, new_bh);
- new_de->inode = cpu_to_le32(old_inode->i_ino);
- if (EXT4_HAS_INCOMPAT_FEATURE(new_dir->i_sb,
- EXT4_FEATURE_INCOMPAT_FILETYPE))
- new_de->file_type = old_de->file_type;
- new_dir->i_version++;
- BUFFER_TRACE(new_bh, "call ext4_journal_dirty_metadata");
- ext4_journal_dirty_metadata(handle, new_bh);
- brelse(new_bh);
- new_bh = NULL;
+ retval = ext4_setent(handle, &new,
+ old.inode->i_ino, old.de->file_type);
+ if (retval)
+ goto end_rename;
}
/*
* Like most other Unix systems, set the ctime for inodes on a
* rename.
*/
- old_inode->i_ctime = ext4_current_time(old_inode);
- ext4_mark_inode_dirty(handle, old_inode);
+ old.inode->i_ctime = ext4_current_time(old.inode);
+ ext4_mark_inode_dirty(handle, old.inode);
/*
* ok, that's it
*/
- if (le32_to_cpu(old_de->inode) != old_inode->i_ino ||
- old_de->name_len != old_dentry->d_name.len ||
- strncmp(old_de->name, old_dentry->d_name.name, old_de->name_len) ||
- (retval = ext4_delete_entry(handle, old_dir,
- old_de, old_bh)) == -ENOENT) {
- /* old_de could have moved from under us during htree split, so
- * make sure that we are deleting the right entry. We might
- * also be pointing to a stale entry in the unused part of
- * old_bh so just checking inum and the name isn't enough. */
- struct buffer_head *old_bh2;
- struct ext4_dir_entry_2 *old_de2;
-
- old_bh2 = ext4_find_entry(old_dentry, &old_de2);
- if (old_bh2) {
- retval = ext4_delete_entry(handle, old_dir,
- old_de2, old_bh2);
- brelse(old_bh2);
- }
+ ext4_rename_delete(handle, &old);
+
+ if (new.inode) {
+ ext4_dec_count(handle, new.inode);
+ new.inode->i_ctime = ext4_current_time(new.inode);
}
- if (retval) {
- ext4_warning(old_dir->i_sb, "ext4_rename",
- "Deleting old file (%lu), %d, error=%d",
- old_dir->i_ino, old_dir->i_nlink, retval);
- }
-
- if (new_inode) {
- ext4_dec_count(handle, new_inode);
- new_inode->i_ctime = ext4_current_time(new_inode);
- }
- old_dir->i_ctime = old_dir->i_mtime = ext4_current_time(old_dir);
- ext4_update_dx_flag(old_dir);
- if (dir_bh) {
- BUFFER_TRACE(dir_bh, "get_write_access");
- ext4_journal_get_write_access(handle, dir_bh);
- PARENT_INO(dir_bh->b_data) = cpu_to_le32(new_dir->i_ino);
- BUFFER_TRACE(dir_bh, "call ext4_journal_dirty_metadata");
- ext4_journal_dirty_metadata(handle, dir_bh);
- ext4_dec_count(handle, old_dir);
- if (new_inode) {
+ old.dir->i_ctime = old.dir->i_mtime = ext4_current_time(old.dir);
+ ext4_update_dx_flag(old.dir);
+ if (old.dir_bh) {
+ retval = ext4_rename_dir_finish(handle, &old, new.dir->i_ino);
+ if (retval)
+ goto end_rename;
+
+ ext4_dec_count(handle, old.dir);
+ if (new.inode) {
/* checked empty_dir above, can't have another parent,
* ext4_dec_count() won't work for many-linked dirs */
- new_inode->i_nlink = 0;
+ clear_nlink(new.inode);
} else {
- ext4_inc_count(handle, new_dir);
- ext4_update_dx_flag(new_dir);
- ext4_mark_inode_dirty(handle, new_dir);
+ ext4_inc_count(handle, new.dir);
+ ext4_update_dx_flag(new.dir);
+ ext4_mark_inode_dirty(handle, new.dir);
}
}
- ext4_mark_inode_dirty(handle, old_dir);
- if (new_inode) {
- ext4_mark_inode_dirty(handle, new_inode);
- if (!new_inode->i_nlink)
- ext4_orphan_add(handle, new_inode);
+ ext4_mark_inode_dirty(handle, old.dir);
+ if (new.inode) {
+ ext4_mark_inode_dirty(handle, new.inode);
+ if (!new.inode->i_nlink)
+ ext4_orphan_add(handle, new.inode);
}
retval = 0;
end_rename:
- brelse (dir_bh);
- brelse (old_bh);
- brelse (new_bh);
- ext4_journal_stop(handle);
+ brelse(old.dir_bh);
+ brelse(old.bh);
+ brelse(new.bh);
+ if (handle)
+ ext4_journal_stop(handle);
+ return retval;
+}
+
+static int ext4_cross_rename(struct inode *old_dir, struct dentry *old_dentry,
+ struct inode *new_dir, struct dentry *new_dentry)
+{
+ handle_t *handle = NULL;
+ struct ext4_renament old = {
+ .dir = old_dir,
+ .dentry = old_dentry,
+ .inode = old_dentry->d_inode,
+ };
+ struct ext4_renament new = {
+ .dir = new_dir,
+ .dentry = new_dentry,
+ .inode = new_dentry->d_inode,
+ };
+ u8 new_file_type;
+ int retval;
+
+ dquot_initialize(old.dir);
+ dquot_initialize(new.dir);
+
+ old.bh = ext4_find_entry(old.dir, &old.dentry->d_name,
+ &old.de, &old.inlined);
+ /*
+ * Check for inode number is _not_ due to possible IO errors.
+ * We might rmdir the source, keep it as pwd of some process
+ * and merrily kill the link to whatever was created under the
+ * same name. Goodbye sticky bit ;-<
+ */
+ retval = -ENOENT;
+ if (!old.bh || le32_to_cpu(old.de->inode) != old.inode->i_ino)
+ goto end_rename;
+
+ new.bh = ext4_find_entry(new.dir, &new.dentry->d_name,
+ &new.de, &new.inlined);
+
+ /* RENAME_EXCHANGE case: old *and* new must both exist */
+ if (!new.bh || le32_to_cpu(new.de->inode) != new.inode->i_ino)
+ goto end_rename;
+
+ handle = ext4_journal_start(old.dir, EXT4_HT_DIR,
+ (2 * EXT4_DATA_TRANS_BLOCKS(old.dir->i_sb) +
+ 2 * EXT4_INDEX_EXTRA_TRANS_BLOCKS + 2));
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
+
+ if (IS_DIRSYNC(old.dir) || IS_DIRSYNC(new.dir))
+ ext4_handle_sync(handle);
+
+ if (S_ISDIR(old.inode->i_mode)) {
+ old.is_dir = true;
+ retval = ext4_rename_dir_prepare(handle, &old);
+ if (retval)
+ goto end_rename;
+ }
+ if (S_ISDIR(new.inode->i_mode)) {
+ new.is_dir = true;
+ retval = ext4_rename_dir_prepare(handle, &new);
+ if (retval)
+ goto end_rename;
+ }
+
+ /*
+ * Other than the special case of overwriting a directory, parents'
+ * nlink only needs to be modified if this is a cross directory rename.
+ */
+ if (old.dir != new.dir && old.is_dir != new.is_dir) {
+ old.dir_nlink_delta = old.is_dir ? -1 : 1;
+ new.dir_nlink_delta = -old.dir_nlink_delta;
+ retval = -EMLINK;
+ if ((old.dir_nlink_delta > 0 && EXT4_DIR_LINK_MAX(old.dir)) ||
+ (new.dir_nlink_delta > 0 && EXT4_DIR_LINK_MAX(new.dir)))
+ goto end_rename;
+ }
+
+ new_file_type = new.de->file_type;
+ retval = ext4_setent(handle, &new, old.inode->i_ino, old.de->file_type);
+ if (retval)
+ goto end_rename;
+
+ retval = ext4_setent(handle, &old, new.inode->i_ino, new_file_type);
+ if (retval)
+ goto end_rename;
+
+ /*
+ * Like most other Unix systems, set the ctime for inodes on a
+ * rename.
+ */
+ old.inode->i_ctime = ext4_current_time(old.inode);
+ new.inode->i_ctime = ext4_current_time(new.inode);
+ ext4_mark_inode_dirty(handle, old.inode);
+ ext4_mark_inode_dirty(handle, new.inode);
+
+ if (old.dir_bh) {
+ retval = ext4_rename_dir_finish(handle, &old, new.dir->i_ino);
+ if (retval)
+ goto end_rename;
+ }
+ if (new.dir_bh) {
+ retval = ext4_rename_dir_finish(handle, &new, old.dir->i_ino);
+ if (retval)
+ goto end_rename;
+ }
+ ext4_update_dir_count(handle, &old);
+ ext4_update_dir_count(handle, &new);
+ retval = 0;
+
+end_rename:
+ brelse(old.dir_bh);
+ brelse(new.dir_bh);
+ brelse(old.bh);
+ brelse(new.bh);
+ if (handle)
+ ext4_journal_stop(handle);
return retval;
}
+static int ext4_rename2(struct inode *old_dir, struct dentry *old_dentry,
+ struct inode *new_dir, struct dentry *new_dentry,
+ unsigned int flags)
+{
+ if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE))
+ return -EINVAL;
+
+ if (flags & RENAME_EXCHANGE) {
+ return ext4_cross_rename(old_dir, old_dentry,
+ new_dir, new_dentry);
+ }
+ /*
+ * Existence checking was done by the VFS, otherwise "RENAME_NOREPLACE"
+ * is equivalent to regular rename.
+ */
+ return ext4_rename(old_dir, old_dentry, new_dir, new_dentry);
+}
+
/*
* directories can handle most operations...
*/
@@ -2439,24 +3454,25 @@ const struct inode_operations ext4_dir_inode_operations = {
.mkdir = ext4_mkdir,
.rmdir = ext4_rmdir,
.mknod = ext4_mknod,
+ .tmpfile = ext4_tmpfile,
.rename = ext4_rename,
+ .rename2 = ext4_rename2,
.setattr = ext4_setattr,
-#ifdef CONFIG_EXT4DEV_FS_XATTR
.setxattr = generic_setxattr,
.getxattr = generic_getxattr,
.listxattr = ext4_listxattr,
.removexattr = generic_removexattr,
-#endif
- .permission = ext4_permission,
+ .get_acl = ext4_get_acl,
+ .set_acl = ext4_set_acl,
+ .fiemap = ext4_fiemap,
};
const struct inode_operations ext4_special_inode_operations = {
.setattr = ext4_setattr,
-#ifdef CONFIG_EXT4DEV_FS_XATTR
.setxattr = generic_setxattr,
.getxattr = generic_getxattr,
.listxattr = ext4_listxattr,
.removexattr = generic_removexattr,
-#endif
- .permission = ext4_permission,
+ .get_acl = ext4_get_acl,
+ .set_acl = ext4_set_acl,
};
diff --git a/fs/ext4/namei.h b/fs/ext4/namei.h
deleted file mode 100644
index 5e4dfff36a0..00000000000
--- a/fs/ext4/namei.h
+++ /dev/null
@@ -1,8 +0,0 @@
-/* linux/fs/ext4/namei.h
- *
- * Copyright (C) 2005 Simtec Electronics
- * Ben Dooks <ben@simtec.co.uk>
- *
-*/
-
-extern struct dentry *ext4_get_parent(struct dentry *child);
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
new file mode 100644
index 00000000000..b24a2541a9b
--- /dev/null
+++ b/fs/ext4/page-io.c
@@ -0,0 +1,497 @@
+/*
+ * linux/fs/ext4/page-io.c
+ *
+ * This contains the new page_io functions for ext4
+ *
+ * Written by Theodore Ts'o, 2010.
+ */
+
+#include <linux/fs.h>
+#include <linux/time.h>
+#include <linux/jbd2.h>
+#include <linux/highuid.h>
+#include <linux/pagemap.h>
+#include <linux/quotaops.h>
+#include <linux/string.h>
+#include <linux/buffer_head.h>
+#include <linux/writeback.h>
+#include <linux/pagevec.h>
+#include <linux/mpage.h>
+#include <linux/namei.h>
+#include <linux/aio.h>
+#include <linux/uio.h>
+#include <linux/bio.h>
+#include <linux/workqueue.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/mm.h>
+#include <linux/ratelimit.h>
+
+#include "ext4_jbd2.h"
+#include "xattr.h"
+#include "acl.h"
+
+static struct kmem_cache *io_end_cachep;
+
+int __init ext4_init_pageio(void)
+{
+ io_end_cachep = KMEM_CACHE(ext4_io_end, SLAB_RECLAIM_ACCOUNT);
+ if (io_end_cachep == NULL)
+ return -ENOMEM;
+ return 0;
+}
+
+void ext4_exit_pageio(void)
+{
+ kmem_cache_destroy(io_end_cachep);
+}
+
+/*
+ * Print an buffer I/O error compatible with the fs/buffer.c. This
+ * provides compatibility with dmesg scrapers that look for a specific
+ * buffer I/O error message. We really need a unified error reporting
+ * structure to userspace ala Digital Unix's uerf system, but it's
+ * probably not going to happen in my lifetime, due to LKML politics...
+ */
+static void buffer_io_error(struct buffer_head *bh)
+{
+ char b[BDEVNAME_SIZE];
+ printk_ratelimited(KERN_ERR "Buffer I/O error on device %s, logical block %llu\n",
+ bdevname(bh->b_bdev, b),
+ (unsigned long long)bh->b_blocknr);
+}
+
+static void ext4_finish_bio(struct bio *bio)
+{
+ int i;
+ int error = !test_bit(BIO_UPTODATE, &bio->bi_flags);
+ struct bio_vec *bvec;
+
+ bio_for_each_segment_all(bvec, bio, i) {
+ struct page *page = bvec->bv_page;
+ struct buffer_head *bh, *head;
+ unsigned bio_start = bvec->bv_offset;
+ unsigned bio_end = bio_start + bvec->bv_len;
+ unsigned under_io = 0;
+ unsigned long flags;
+
+ if (!page)
+ continue;
+
+ if (error) {
+ SetPageError(page);
+ set_bit(AS_EIO, &page->mapping->flags);
+ }
+ bh = head = page_buffers(page);
+ /*
+ * We check all buffers in the page under BH_Uptodate_Lock
+ * to avoid races with other end io clearing async_write flags
+ */
+ local_irq_save(flags);
+ bit_spin_lock(BH_Uptodate_Lock, &head->b_state);
+ do {
+ if (bh_offset(bh) < bio_start ||
+ bh_offset(bh) + bh->b_size > bio_end) {
+ if (buffer_async_write(bh))
+ under_io++;
+ continue;
+ }
+ clear_buffer_async_write(bh);
+ if (error)
+ buffer_io_error(bh);
+ } while ((bh = bh->b_this_page) != head);
+ bit_spin_unlock(BH_Uptodate_Lock, &head->b_state);
+ local_irq_restore(flags);
+ if (!under_io)
+ end_page_writeback(page);
+ }
+}
+
+static void ext4_release_io_end(ext4_io_end_t *io_end)
+{
+ struct bio *bio, *next_bio;
+
+ BUG_ON(!list_empty(&io_end->list));
+ BUG_ON(io_end->flag & EXT4_IO_END_UNWRITTEN);
+ WARN_ON(io_end->handle);
+
+ if (atomic_dec_and_test(&EXT4_I(io_end->inode)->i_ioend_count))
+ wake_up_all(ext4_ioend_wq(io_end->inode));
+
+ for (bio = io_end->bio; bio; bio = next_bio) {
+ next_bio = bio->bi_private;
+ ext4_finish_bio(bio);
+ bio_put(bio);
+ }
+ kmem_cache_free(io_end_cachep, io_end);
+}
+
+static void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end)
+{
+ struct inode *inode = io_end->inode;
+
+ io_end->flag &= ~EXT4_IO_END_UNWRITTEN;
+ /* Wake up anyone waiting on unwritten extent conversion */
+ if (atomic_dec_and_test(&EXT4_I(inode)->i_unwritten))
+ wake_up_all(ext4_ioend_wq(inode));
+}
+
+/*
+ * Check a range of space and convert unwritten extents to written. Note that
+ * we are protected from truncate touching same part of extent tree by the
+ * fact that truncate code waits for all DIO to finish (thus exclusion from
+ * direct IO is achieved) and also waits for PageWriteback bits. Thus we
+ * cannot get to ext4_ext_truncate() before all IOs overlapping that range are
+ * completed (happens from ext4_free_ioend()).
+ */
+static int ext4_end_io(ext4_io_end_t *io)
+{
+ struct inode *inode = io->inode;
+ loff_t offset = io->offset;
+ ssize_t size = io->size;
+ handle_t *handle = io->handle;
+ int ret = 0;
+
+ ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu,list->next 0x%p,"
+ "list->prev 0x%p\n",
+ io, inode->i_ino, io->list.next, io->list.prev);
+
+ io->handle = NULL; /* Following call will use up the handle */
+ ret = ext4_convert_unwritten_extents(handle, inode, offset, size);
+ if (ret < 0) {
+ ext4_msg(inode->i_sb, KERN_EMERG,
+ "failed to convert unwritten extents to written "
+ "extents -- potential data loss! "
+ "(inode %lu, offset %llu, size %zd, error %d)",
+ inode->i_ino, offset, size, ret);
+ }
+ ext4_clear_io_unwritten_flag(io);
+ ext4_release_io_end(io);
+ return ret;
+}
+
+static void dump_completed_IO(struct inode *inode, struct list_head *head)
+{
+#ifdef EXT4FS_DEBUG
+ struct list_head *cur, *before, *after;
+ ext4_io_end_t *io, *io0, *io1;
+
+ if (list_empty(head))
+ return;
+
+ ext4_debug("Dump inode %lu completed io list\n", inode->i_ino);
+ list_for_each_entry(io, head, list) {
+ cur = &io->list;
+ before = cur->prev;
+ io0 = container_of(before, ext4_io_end_t, list);
+ after = cur->next;
+ io1 = container_of(after, ext4_io_end_t, list);
+
+ ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n",
+ io, inode->i_ino, io0, io1);
+ }
+#endif
+}
+
+/* Add the io_end to per-inode completed end_io list. */
+static void ext4_add_complete_io(ext4_io_end_t *io_end)
+{
+ struct ext4_inode_info *ei = EXT4_I(io_end->inode);
+ struct ext4_sb_info *sbi = EXT4_SB(io_end->inode->i_sb);
+ struct workqueue_struct *wq;
+ unsigned long flags;
+
+ /* Only reserved conversions from writeback should enter here */
+ WARN_ON(!(io_end->flag & EXT4_IO_END_UNWRITTEN));
+ WARN_ON(!io_end->handle && sbi->s_journal);
+ spin_lock_irqsave(&ei->i_completed_io_lock, flags);
+ wq = sbi->rsv_conversion_wq;
+ if (list_empty(&ei->i_rsv_conversion_list))
+ queue_work(wq, &ei->i_rsv_conversion_work);
+ list_add_tail(&io_end->list, &ei->i_rsv_conversion_list);
+ spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
+}
+
+static int ext4_do_flush_completed_IO(struct inode *inode,
+ struct list_head *head)
+{
+ ext4_io_end_t *io;
+ struct list_head unwritten;
+ unsigned long flags;
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ int err, ret = 0;
+
+ spin_lock_irqsave(&ei->i_completed_io_lock, flags);
+ dump_completed_IO(inode, head);
+ list_replace_init(head, &unwritten);
+ spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
+
+ while (!list_empty(&unwritten)) {
+ io = list_entry(unwritten.next, ext4_io_end_t, list);
+ BUG_ON(!(io->flag & EXT4_IO_END_UNWRITTEN));
+ list_del_init(&io->list);
+
+ err = ext4_end_io(io);
+ if (unlikely(!ret && err))
+ ret = err;
+ }
+ return ret;
+}
+
+/*
+ * work on completed IO, to convert unwritten extents to extents
+ */
+void ext4_end_io_rsv_work(struct work_struct *work)
+{
+ struct ext4_inode_info *ei = container_of(work, struct ext4_inode_info,
+ i_rsv_conversion_work);
+ ext4_do_flush_completed_IO(&ei->vfs_inode, &ei->i_rsv_conversion_list);
+}
+
+ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags)
+{
+ ext4_io_end_t *io = kmem_cache_zalloc(io_end_cachep, flags);
+ if (io) {
+ atomic_inc(&EXT4_I(inode)->i_ioend_count);
+ io->inode = inode;
+ INIT_LIST_HEAD(&io->list);
+ atomic_set(&io->count, 1);
+ }
+ return io;
+}
+
+void ext4_put_io_end_defer(ext4_io_end_t *io_end)
+{
+ if (atomic_dec_and_test(&io_end->count)) {
+ if (!(io_end->flag & EXT4_IO_END_UNWRITTEN) || !io_end->size) {
+ ext4_release_io_end(io_end);
+ return;
+ }
+ ext4_add_complete_io(io_end);
+ }
+}
+
+int ext4_put_io_end(ext4_io_end_t *io_end)
+{
+ int err = 0;
+
+ if (atomic_dec_and_test(&io_end->count)) {
+ if (io_end->flag & EXT4_IO_END_UNWRITTEN) {
+ err = ext4_convert_unwritten_extents(io_end->handle,
+ io_end->inode, io_end->offset,
+ io_end->size);
+ io_end->handle = NULL;
+ ext4_clear_io_unwritten_flag(io_end);
+ }
+ ext4_release_io_end(io_end);
+ }
+ return err;
+}
+
+ext4_io_end_t *ext4_get_io_end(ext4_io_end_t *io_end)
+{
+ atomic_inc(&io_end->count);
+ return io_end;
+}
+
+/* BIO completion function for page writeback */
+static void ext4_end_bio(struct bio *bio, int error)
+{
+ ext4_io_end_t *io_end = bio->bi_private;
+ sector_t bi_sector = bio->bi_iter.bi_sector;
+
+ BUG_ON(!io_end);
+ bio->bi_end_io = NULL;
+ if (test_bit(BIO_UPTODATE, &bio->bi_flags))
+ error = 0;
+
+ if (error) {
+ struct inode *inode = io_end->inode;
+
+ ext4_warning(inode->i_sb, "I/O error %d writing to inode %lu "
+ "(offset %llu size %ld starting block %llu)",
+ error, inode->i_ino,
+ (unsigned long long) io_end->offset,
+ (long) io_end->size,
+ (unsigned long long)
+ bi_sector >> (inode->i_blkbits - 9));
+ mapping_set_error(inode->i_mapping, error);
+ }
+
+ if (io_end->flag & EXT4_IO_END_UNWRITTEN) {
+ /*
+ * Link bio into list hanging from io_end. We have to do it
+ * atomically as bio completions can be racing against each
+ * other.
+ */
+ bio->bi_private = xchg(&io_end->bio, bio);
+ ext4_put_io_end_defer(io_end);
+ } else {
+ /*
+ * Drop io_end reference early. Inode can get freed once
+ * we finish the bio.
+ */
+ ext4_put_io_end_defer(io_end);
+ ext4_finish_bio(bio);
+ bio_put(bio);
+ }
+}
+
+void ext4_io_submit(struct ext4_io_submit *io)
+{
+ struct bio *bio = io->io_bio;
+
+ if (bio) {
+ bio_get(io->io_bio);
+ submit_bio(io->io_op, io->io_bio);
+ BUG_ON(bio_flagged(io->io_bio, BIO_EOPNOTSUPP));
+ bio_put(io->io_bio);
+ }
+ io->io_bio = NULL;
+}
+
+void ext4_io_submit_init(struct ext4_io_submit *io,
+ struct writeback_control *wbc)
+{
+ io->io_op = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE);
+ io->io_bio = NULL;
+ io->io_end = NULL;
+}
+
+static int io_submit_init_bio(struct ext4_io_submit *io,
+ struct buffer_head *bh)
+{
+ int nvecs = bio_get_nr_vecs(bh->b_bdev);
+ struct bio *bio;
+
+ bio = bio_alloc(GFP_NOIO, min(nvecs, BIO_MAX_PAGES));
+ if (!bio)
+ return -ENOMEM;
+ bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
+ bio->bi_bdev = bh->b_bdev;
+ bio->bi_end_io = ext4_end_bio;
+ bio->bi_private = ext4_get_io_end(io->io_end);
+ io->io_bio = bio;
+ io->io_next_block = bh->b_blocknr;
+ return 0;
+}
+
+static int io_submit_add_bh(struct ext4_io_submit *io,
+ struct inode *inode,
+ struct buffer_head *bh)
+{
+ int ret;
+
+ if (io->io_bio && bh->b_blocknr != io->io_next_block) {
+submit_and_retry:
+ ext4_io_submit(io);
+ }
+ if (io->io_bio == NULL) {
+ ret = io_submit_init_bio(io, bh);
+ if (ret)
+ return ret;
+ }
+ ret = bio_add_page(io->io_bio, bh->b_page, bh->b_size, bh_offset(bh));
+ if (ret != bh->b_size)
+ goto submit_and_retry;
+ io->io_next_block++;
+ return 0;
+}
+
+int ext4_bio_write_page(struct ext4_io_submit *io,
+ struct page *page,
+ int len,
+ struct writeback_control *wbc,
+ bool keep_towrite)
+{
+ struct inode *inode = page->mapping->host;
+ unsigned block_start, blocksize;
+ struct buffer_head *bh, *head;
+ int ret = 0;
+ int nr_submitted = 0;
+
+ blocksize = 1 << inode->i_blkbits;
+
+ BUG_ON(!PageLocked(page));
+ BUG_ON(PageWriteback(page));
+
+ if (keep_towrite)
+ set_page_writeback_keepwrite(page);
+ else
+ set_page_writeback(page);
+ ClearPageError(page);
+
+ /*
+ * Comments copied from block_write_full_page:
+ *
+ * The page straddles i_size. It must be zeroed out on each and every
+ * writepage invocation because it may be mmapped. "A file is mapped
+ * in multiples of the page size. For a file that is not a multiple of
+ * the page size, the remaining memory is zeroed when mapped, and
+ * writes to that region are not written out to the file."
+ */
+ if (len < PAGE_CACHE_SIZE)
+ zero_user_segment(page, len, PAGE_CACHE_SIZE);
+ /*
+ * In the first loop we prepare and mark buffers to submit. We have to
+ * mark all buffers in the page before submitting so that
+ * end_page_writeback() cannot be called from ext4_bio_end_io() when IO
+ * on the first buffer finishes and we are still working on submitting
+ * the second buffer.
+ */
+ bh = head = page_buffers(page);
+ do {
+ block_start = bh_offset(bh);
+ if (block_start >= len) {
+ clear_buffer_dirty(bh);
+ set_buffer_uptodate(bh);
+ continue;
+ }
+ if (!buffer_dirty(bh) || buffer_delay(bh) ||
+ !buffer_mapped(bh) || buffer_unwritten(bh)) {
+ /* A hole? We can safely clear the dirty bit */
+ if (!buffer_mapped(bh))
+ clear_buffer_dirty(bh);
+ if (io->io_bio)
+ ext4_io_submit(io);
+ continue;
+ }
+ if (buffer_new(bh)) {
+ clear_buffer_new(bh);
+ unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr);
+ }
+ set_buffer_async_write(bh);
+ } while ((bh = bh->b_this_page) != head);
+
+ /* Now submit buffers to write */
+ bh = head = page_buffers(page);
+ do {
+ if (!buffer_async_write(bh))
+ continue;
+ ret = io_submit_add_bh(io, inode, bh);
+ if (ret) {
+ /*
+ * We only get here on ENOMEM. Not much else
+ * we can do but mark the page as dirty, and
+ * better luck next time.
+ */
+ redirty_page_for_writepage(wbc, page);
+ break;
+ }
+ nr_submitted++;
+ clear_buffer_dirty(bh);
+ } while ((bh = bh->b_this_page) != head);
+
+ /* Error stopped previous loop? Clean up buffers... */
+ if (ret) {
+ do {
+ clear_buffer_async_write(bh);
+ bh = bh->b_this_page;
+ } while (bh != head);
+ }
+ unlock_page(page);
+ /* Nothing submitted - we have to end page writeback */
+ if (!nr_submitted)
+ end_page_writeback(page);
+ return ret;
+}
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index e29efa0f9d6..bb0e80f03e2 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -11,12 +11,61 @@
#define EXT4FS_DEBUG
-#include <linux/ext4_jbd2.h>
-
#include <linux/errno.h>
#include <linux/slab.h>
-#include "group.h"
+#include "ext4_jbd2.h"
+
+int ext4_resize_begin(struct super_block *sb)
+{
+ int ret = 0;
+
+ if (!capable(CAP_SYS_RESOURCE))
+ return -EPERM;
+
+ /*
+ * We are not allowed to do online-resizing on a filesystem mounted
+ * with error, because it can destroy the filesystem easily.
+ */
+ if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) {
+ ext4_warning(sb, "There are errors in the filesystem, "
+ "so online resizing is not allowed\n");
+ return -EPERM;
+ }
+
+ if (test_and_set_bit_lock(EXT4_RESIZING, &EXT4_SB(sb)->s_resize_flags))
+ ret = -EBUSY;
+
+ return ret;
+}
+
+void ext4_resize_end(struct super_block *sb)
+{
+ clear_bit_unlock(EXT4_RESIZING, &EXT4_SB(sb)->s_resize_flags);
+ smp_mb__after_atomic();
+}
+
+static ext4_group_t ext4_meta_bg_first_group(struct super_block *sb,
+ ext4_group_t group) {
+ return (group >> EXT4_DESC_PER_BLOCK_BITS(sb)) <<
+ EXT4_DESC_PER_BLOCK_BITS(sb);
+}
+
+static ext4_fsblk_t ext4_meta_bg_first_block_no(struct super_block *sb,
+ ext4_group_t group) {
+ group = ext4_meta_bg_first_group(sb, group);
+ return ext4_group_first_block_no(sb, group);
+}
+
+static ext4_grpblk_t ext4_group_overhead_blocks(struct super_block *sb,
+ ext4_group_t group) {
+ ext4_grpblk_t overhead;
+ overhead = ext4_bg_num_gdb(sb, group);
+ if (ext4_bg_has_super(sb, group))
+ overhead += 1 +
+ le16_to_cpu(EXT4_SB(sb)->s_es->s_reserved_gdt_blocks);
+ return overhead;
+}
#define outside(b, first, last) ((b) < (first) || (b) >= (last))
#define inside(b, first, last) ((b) >= (first) && (b) < (last))
@@ -30,14 +79,20 @@ static int verify_group_input(struct super_block *sb,
ext4_fsblk_t end = start + input->blocks_count;
ext4_group_t group = input->group;
ext4_fsblk_t itend = input->inode_table + sbi->s_itb_per_group;
- unsigned overhead = ext4_bg_has_super(sb, group) ?
- (1 + ext4_bg_num_gdb(sb, group) +
- le16_to_cpu(es->s_reserved_gdt_blocks)) : 0;
- ext4_fsblk_t metaend = start + overhead;
+ unsigned overhead;
+ ext4_fsblk_t metaend;
struct buffer_head *bh = NULL;
ext4_grpblk_t free_blocks_count, offset;
int err = -EINVAL;
+ if (group != sbi->s_groups_count) {
+ ext4_warning(sb, "Cannot add at group %u (only %u groups)",
+ input->group, sbi->s_groups_count);
+ return -EINVAL;
+ }
+
+ overhead = ext4_group_overhead_blocks(sb, group);
+ metaend = start + overhead;
input->free_blocks_count = free_blocks_count =
input->blocks_count - 2 - overhead - sbi->s_itb_per_group;
@@ -49,66 +104,52 @@ static int verify_group_input(struct super_block *sb,
free_blocks_count, input->reserved_blocks);
ext4_get_group_no_and_offset(sb, start, NULL, &offset);
- if (group != sbi->s_groups_count)
- ext4_warning(sb, __FUNCTION__,
- "Cannot add at group %u (only %lu groups)",
- input->group, sbi->s_groups_count);
- else if (offset != 0)
- ext4_warning(sb, __FUNCTION__, "Last group not full");
+ if (offset != 0)
+ ext4_warning(sb, "Last group not full");
else if (input->reserved_blocks > input->blocks_count / 5)
- ext4_warning(sb, __FUNCTION__, "Reserved blocks too high (%u)",
+ ext4_warning(sb, "Reserved blocks too high (%u)",
input->reserved_blocks);
else if (free_blocks_count < 0)
- ext4_warning(sb, __FUNCTION__, "Bad blocks count %u",
+ ext4_warning(sb, "Bad blocks count %u",
input->blocks_count);
else if (!(bh = sb_bread(sb, end - 1)))
- ext4_warning(sb, __FUNCTION__,
- "Cannot read last block (%llu)",
+ ext4_warning(sb, "Cannot read last block (%llu)",
end - 1);
else if (outside(input->block_bitmap, start, end))
- ext4_warning(sb, __FUNCTION__,
- "Block bitmap not in group (block %llu)",
+ ext4_warning(sb, "Block bitmap not in group (block %llu)",
(unsigned long long)input->block_bitmap);
else if (outside(input->inode_bitmap, start, end))
- ext4_warning(sb, __FUNCTION__,
- "Inode bitmap not in group (block %llu)",
+ ext4_warning(sb, "Inode bitmap not in group (block %llu)",
(unsigned long long)input->inode_bitmap);
else if (outside(input->inode_table, start, end) ||
- outside(itend - 1, start, end))
- ext4_warning(sb, __FUNCTION__,
- "Inode table not in group (blocks %llu-%llu)",
+ outside(itend - 1, start, end))
+ ext4_warning(sb, "Inode table not in group (blocks %llu-%llu)",
(unsigned long long)input->inode_table, itend - 1);
else if (input->inode_bitmap == input->block_bitmap)
- ext4_warning(sb, __FUNCTION__,
- "Block bitmap same as inode bitmap (%llu)",
+ ext4_warning(sb, "Block bitmap same as inode bitmap (%llu)",
(unsigned long long)input->block_bitmap);
else if (inside(input->block_bitmap, input->inode_table, itend))
- ext4_warning(sb, __FUNCTION__,
- "Block bitmap (%llu) in inode table (%llu-%llu)",
+ ext4_warning(sb, "Block bitmap (%llu) in inode table "
+ "(%llu-%llu)",
(unsigned long long)input->block_bitmap,
(unsigned long long)input->inode_table, itend - 1);
else if (inside(input->inode_bitmap, input->inode_table, itend))
- ext4_warning(sb, __FUNCTION__,
- "Inode bitmap (%llu) in inode table (%llu-%llu)",
+ ext4_warning(sb, "Inode bitmap (%llu) in inode table "
+ "(%llu-%llu)",
(unsigned long long)input->inode_bitmap,
(unsigned long long)input->inode_table, itend - 1);
else if (inside(input->block_bitmap, start, metaend))
- ext4_warning(sb, __FUNCTION__,
- "Block bitmap (%llu) in GDT table"
- " (%llu-%llu)",
+ ext4_warning(sb, "Block bitmap (%llu) in GDT table (%llu-%llu)",
(unsigned long long)input->block_bitmap,
start, metaend - 1);
else if (inside(input->inode_bitmap, start, metaend))
- ext4_warning(sb, __FUNCTION__,
- "Inode bitmap (%llu) in GDT table"
- " (%llu-%llu)",
+ ext4_warning(sb, "Inode bitmap (%llu) in GDT table (%llu-%llu)",
(unsigned long long)input->inode_bitmap,
start, metaend - 1);
else if (inside(input->inode_table, start, metaend) ||
- inside(itend - 1, start, metaend))
- ext4_warning(sb, __FUNCTION__,
- "Inode table (%llu-%llu) overlaps"
- "GDT table (%llu-%llu)",
+ inside(itend - 1, start, metaend))
+ ext4_warning(sb, "Inode table (%llu-%llu) overlaps GDT table "
+ "(%llu-%llu)",
(unsigned long long)input->inode_table,
itend - 1, start, metaend - 1);
else
@@ -118,6 +159,186 @@ static int verify_group_input(struct super_block *sb,
return err;
}
+/*
+ * ext4_new_flex_group_data is used by 64bit-resize interface to add a flex
+ * group each time.
+ */
+struct ext4_new_flex_group_data {
+ struct ext4_new_group_data *groups; /* new_group_data for groups
+ in the flex group */
+ __u16 *bg_flags; /* block group flags of groups
+ in @groups */
+ ext4_group_t count; /* number of groups in @groups
+ */
+};
+
+/*
+ * alloc_flex_gd() allocates a ext4_new_flex_group_data with size of
+ * @flexbg_size.
+ *
+ * Returns NULL on failure otherwise address of the allocated structure.
+ */
+static struct ext4_new_flex_group_data *alloc_flex_gd(unsigned long flexbg_size)
+{
+ struct ext4_new_flex_group_data *flex_gd;
+
+ flex_gd = kmalloc(sizeof(*flex_gd), GFP_NOFS);
+ if (flex_gd == NULL)
+ goto out3;
+
+ if (flexbg_size >= UINT_MAX / sizeof(struct ext4_new_flex_group_data))
+ goto out2;
+ flex_gd->count = flexbg_size;
+
+ flex_gd->groups = kmalloc(sizeof(struct ext4_new_group_data) *
+ flexbg_size, GFP_NOFS);
+ if (flex_gd->groups == NULL)
+ goto out2;
+
+ flex_gd->bg_flags = kmalloc(flexbg_size * sizeof(__u16), GFP_NOFS);
+ if (flex_gd->bg_flags == NULL)
+ goto out1;
+
+ return flex_gd;
+
+out1:
+ kfree(flex_gd->groups);
+out2:
+ kfree(flex_gd);
+out3:
+ return NULL;
+}
+
+static void free_flex_gd(struct ext4_new_flex_group_data *flex_gd)
+{
+ kfree(flex_gd->bg_flags);
+ kfree(flex_gd->groups);
+ kfree(flex_gd);
+}
+
+/*
+ * ext4_alloc_group_tables() allocates block bitmaps, inode bitmaps
+ * and inode tables for a flex group.
+ *
+ * This function is used by 64bit-resize. Note that this function allocates
+ * group tables from the 1st group of groups contained by @flexgd, which may
+ * be a partial of a flex group.
+ *
+ * @sb: super block of fs to which the groups belongs
+ *
+ * Returns 0 on a successful allocation of the metadata blocks in the
+ * block group.
+ */
+static int ext4_alloc_group_tables(struct super_block *sb,
+ struct ext4_new_flex_group_data *flex_gd,
+ int flexbg_size)
+{
+ struct ext4_new_group_data *group_data = flex_gd->groups;
+ ext4_fsblk_t start_blk;
+ ext4_fsblk_t last_blk;
+ ext4_group_t src_group;
+ ext4_group_t bb_index = 0;
+ ext4_group_t ib_index = 0;
+ ext4_group_t it_index = 0;
+ ext4_group_t group;
+ ext4_group_t last_group;
+ unsigned overhead;
+ __u16 uninit_mask = (flexbg_size > 1) ? ~EXT4_BG_BLOCK_UNINIT : ~0;
+
+ BUG_ON(flex_gd->count == 0 || group_data == NULL);
+
+ src_group = group_data[0].group;
+ last_group = src_group + flex_gd->count - 1;
+
+ BUG_ON((flexbg_size > 1) && ((src_group & ~(flexbg_size - 1)) !=
+ (last_group & ~(flexbg_size - 1))));
+next_group:
+ group = group_data[0].group;
+ if (src_group >= group_data[0].group + flex_gd->count)
+ return -ENOSPC;
+ start_blk = ext4_group_first_block_no(sb, src_group);
+ last_blk = start_blk + group_data[src_group - group].blocks_count;
+
+ overhead = ext4_group_overhead_blocks(sb, src_group);
+
+ start_blk += overhead;
+
+ /* We collect contiguous blocks as much as possible. */
+ src_group++;
+ for (; src_group <= last_group; src_group++) {
+ overhead = ext4_group_overhead_blocks(sb, src_group);
+ if (overhead == 0)
+ last_blk += group_data[src_group - group].blocks_count;
+ else
+ break;
+ }
+
+ /* Allocate block bitmaps */
+ for (; bb_index < flex_gd->count; bb_index++) {
+ if (start_blk >= last_blk)
+ goto next_group;
+ group_data[bb_index].block_bitmap = start_blk++;
+ group = ext4_get_group_number(sb, start_blk - 1);
+ group -= group_data[0].group;
+ group_data[group].free_blocks_count--;
+ flex_gd->bg_flags[group] &= uninit_mask;
+ }
+
+ /* Allocate inode bitmaps */
+ for (; ib_index < flex_gd->count; ib_index++) {
+ if (start_blk >= last_blk)
+ goto next_group;
+ group_data[ib_index].inode_bitmap = start_blk++;
+ group = ext4_get_group_number(sb, start_blk - 1);
+ group -= group_data[0].group;
+ group_data[group].free_blocks_count--;
+ flex_gd->bg_flags[group] &= uninit_mask;
+ }
+
+ /* Allocate inode tables */
+ for (; it_index < flex_gd->count; it_index++) {
+ unsigned int itb = EXT4_SB(sb)->s_itb_per_group;
+ ext4_fsblk_t next_group_start;
+
+ if (start_blk + itb > last_blk)
+ goto next_group;
+ group_data[it_index].inode_table = start_blk;
+ group = ext4_get_group_number(sb, start_blk);
+ next_group_start = ext4_group_first_block_no(sb, group + 1);
+ group -= group_data[0].group;
+
+ if (start_blk + itb > next_group_start) {
+ flex_gd->bg_flags[group + 1] &= uninit_mask;
+ overhead = start_blk + itb - next_group_start;
+ group_data[group + 1].free_blocks_count -= overhead;
+ itb -= overhead;
+ }
+
+ group_data[group].free_blocks_count -= itb;
+ flex_gd->bg_flags[group] &= uninit_mask;
+ start_blk += EXT4_SB(sb)->s_itb_per_group;
+ }
+
+ if (test_opt(sb, DEBUG)) {
+ int i;
+ group = group_data[0].group;
+
+ printk(KERN_DEBUG "EXT4-fs: adding a flex group with "
+ "%d groups, flexbg size is %d:\n", flex_gd->count,
+ flexbg_size);
+
+ for (i = 0; i < flex_gd->count; i++) {
+ printk(KERN_DEBUG "adding %s group %u: %u "
+ "blocks (%d free)\n",
+ ext4_bg_has_super(sb, group + i) ? "normal" :
+ "no-super", group + i,
+ group_data[i].blocks_count,
+ group_data[i].free_blocks_count);
+ }
+ }
+ return 0;
+}
+
static struct buffer_head *bclean(handle_t *handle, struct super_block *sb,
ext4_fsblk_t blk)
{
@@ -125,16 +346,15 @@ static struct buffer_head *bclean(handle_t *handle, struct super_block *sb,
int err;
bh = sb_getblk(sb, blk);
- if (!bh)
- return ERR_PTR(-EIO);
+ if (unlikely(!bh))
+ return ERR_PTR(-ENOMEM);
+ BUFFER_TRACE(bh, "get_write_access");
if ((err = ext4_journal_get_write_access(handle, bh))) {
brelse(bh);
bh = ERR_PTR(err);
} else {
- lock_buffer(bh);
memset(bh->b_data, 0, sb->s_blocksize);
set_buffer_uptodate(bh);
- unlock_buffer(bh);
}
return bh;
@@ -145,168 +365,288 @@ static struct buffer_head *bclean(handle_t *handle, struct super_block *sb,
* If that fails, restart the transaction & regain write access for the
* buffer head which is used for block_bitmap modifications.
*/
-static int extend_or_restart_transaction(handle_t *handle, int thresh,
- struct buffer_head *bh)
+static int extend_or_restart_transaction(handle_t *handle, int thresh)
{
int err;
- if (handle->h_buffer_credits >= thresh)
+ if (ext4_handle_has_enough_credits(handle, thresh))
return 0;
err = ext4_journal_extend(handle, EXT4_MAX_TRANS_DATA);
if (err < 0)
return err;
if (err) {
- if ((err = ext4_journal_restart(handle, EXT4_MAX_TRANS_DATA)))
+ err = ext4_journal_restart(handle, EXT4_MAX_TRANS_DATA);
+ if (err)
+ return err;
+ }
+
+ return 0;
+}
+
+/*
+ * set_flexbg_block_bitmap() mark @count blocks starting from @block used.
+ *
+ * Helper function for ext4_setup_new_group_blocks() which set .
+ *
+ * @sb: super block
+ * @handle: journal handle
+ * @flex_gd: flex group data
+ */
+static int set_flexbg_block_bitmap(struct super_block *sb, handle_t *handle,
+ struct ext4_new_flex_group_data *flex_gd,
+ ext4_fsblk_t block, ext4_group_t count)
+{
+ ext4_group_t count2;
+
+ ext4_debug("mark blocks [%llu/%u] used\n", block, count);
+ for (count2 = count; count > 0; count -= count2, block += count2) {
+ ext4_fsblk_t start;
+ struct buffer_head *bh;
+ ext4_group_t group;
+ int err;
+
+ group = ext4_get_group_number(sb, block);
+ start = ext4_group_first_block_no(sb, group);
+ group -= flex_gd->groups[0].group;
+
+ count2 = EXT4_BLOCKS_PER_GROUP(sb) - (block - start);
+ if (count2 > count)
+ count2 = count;
+
+ if (flex_gd->bg_flags[group] & EXT4_BG_BLOCK_UNINIT) {
+ BUG_ON(flex_gd->count > 1);
+ continue;
+ }
+
+ err = extend_or_restart_transaction(handle, 1);
+ if (err)
+ return err;
+
+ bh = sb_getblk(sb, flex_gd->groups[group].block_bitmap);
+ if (unlikely(!bh))
+ return -ENOMEM;
+
+ BUFFER_TRACE(bh, "get_write_access");
+ err = ext4_journal_get_write_access(handle, bh);
+ if (err)
return err;
- if ((err = ext4_journal_get_write_access(handle, bh)))
+ ext4_debug("mark block bitmap %#04llx (+%llu/%u)\n", block,
+ block - start, count2);
+ ext4_set_bits(bh->b_data, block - start, count2);
+
+ err = ext4_handle_dirty_metadata(handle, NULL, bh);
+ if (unlikely(err))
return err;
- }
+ brelse(bh);
+ }
return 0;
}
/*
- * Set up the block and inode bitmaps, and the inode table for the new group.
+ * Set up the block and inode bitmaps, and the inode table for the new groups.
* This doesn't need to be part of the main transaction, since we are only
* changing blocks outside the actual filesystem. We still do journaling to
* ensure the recovery is correct in case of a failure just after resize.
* If any part of this fails, we simply abort the resize.
+ *
+ * setup_new_flex_group_blocks handles a flex group as follow:
+ * 1. copy super block and GDT, and initialize group tables if necessary.
+ * In this step, we only set bits in blocks bitmaps for blocks taken by
+ * super block and GDT.
+ * 2. allocate group tables in block bitmaps, that is, set bits in block
+ * bitmap for blocks taken by group tables.
*/
-static int setup_new_group_blocks(struct super_block *sb,
- struct ext4_new_group_data *input)
+static int setup_new_flex_group_blocks(struct super_block *sb,
+ struct ext4_new_flex_group_data *flex_gd)
{
+ int group_table_count[] = {1, 1, EXT4_SB(sb)->s_itb_per_group};
+ ext4_fsblk_t start;
+ ext4_fsblk_t block;
struct ext4_sb_info *sbi = EXT4_SB(sb);
- ext4_fsblk_t start = ext4_group_first_block_no(sb, input->group);
- int reserved_gdb = ext4_bg_has_super(sb, input->group) ?
- le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) : 0;
- unsigned long gdblocks = ext4_bg_num_gdb(sb, input->group);
- struct buffer_head *bh;
+ struct ext4_super_block *es = sbi->s_es;
+ struct ext4_new_group_data *group_data = flex_gd->groups;
+ __u16 *bg_flags = flex_gd->bg_flags;
handle_t *handle;
- ext4_fsblk_t block;
- ext4_grpblk_t bit;
- int i;
- int err = 0, err2;
+ ext4_group_t group, count;
+ struct buffer_head *bh = NULL;
+ int reserved_gdb, i, j, err = 0, err2;
+ int meta_bg;
- /* This transaction may be extended/restarted along the way */
- handle = ext4_journal_start_sb(sb, EXT4_MAX_TRANS_DATA);
+ BUG_ON(!flex_gd->count || !group_data ||
+ group_data[0].group != sbi->s_groups_count);
+
+ reserved_gdb = le16_to_cpu(es->s_reserved_gdt_blocks);
+ meta_bg = EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_META_BG);
+ /* This transaction may be extended/restarted along the way */
+ handle = ext4_journal_start_sb(sb, EXT4_HT_RESIZE, EXT4_MAX_TRANS_DATA);
if (IS_ERR(handle))
return PTR_ERR(handle);
- lock_super(sb);
- if (input->group != sbi->s_groups_count) {
- err = -EBUSY;
- goto exit_journal;
- }
-
- if (IS_ERR(bh = bclean(handle, sb, input->block_bitmap))) {
- err = PTR_ERR(bh);
- goto exit_journal;
- }
-
- if (ext4_bg_has_super(sb, input->group)) {
- ext4_debug("mark backup superblock %#04llx (+0)\n", start);
- ext4_set_bit(0, bh->b_data);
- }
+ group = group_data[0].group;
+ for (i = 0; i < flex_gd->count; i++, group++) {
+ unsigned long gdblocks;
+ ext4_grpblk_t overhead;
- /* Copy all of the GDT blocks into the backup in this group */
- for (i = 0, bit = 1, block = start + 1;
- i < gdblocks; i++, block++, bit++) {
- struct buffer_head *gdb;
+ gdblocks = ext4_bg_num_gdb(sb, group);
+ start = ext4_group_first_block_no(sb, group);
- ext4_debug("update backup group %#04llx (+%d)\n", block, bit);
+ if (meta_bg == 0 && !ext4_bg_has_super(sb, group))
+ goto handle_itb;
- if ((err = extend_or_restart_transaction(handle, 1, bh)))
- goto exit_bh;
-
- gdb = sb_getblk(sb, block);
- if (!gdb) {
- err = -EIO;
- goto exit_bh;
+ if (meta_bg == 1) {
+ ext4_group_t first_group;
+ first_group = ext4_meta_bg_first_group(sb, group);
+ if (first_group != group + 1 &&
+ first_group != group + EXT4_DESC_PER_BLOCK(sb) - 1)
+ goto handle_itb;
}
- if ((err = ext4_journal_get_write_access(handle, gdb))) {
+
+ block = start + ext4_bg_has_super(sb, group);
+ /* Copy all of the GDT blocks into the backup in this group */
+ for (j = 0; j < gdblocks; j++, block++) {
+ struct buffer_head *gdb;
+
+ ext4_debug("update backup group %#04llx\n", block);
+ err = extend_or_restart_transaction(handle, 1);
+ if (err)
+ goto out;
+
+ gdb = sb_getblk(sb, block);
+ if (unlikely(!gdb)) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ BUFFER_TRACE(gdb, "get_write_access");
+ err = ext4_journal_get_write_access(handle, gdb);
+ if (err) {
+ brelse(gdb);
+ goto out;
+ }
+ memcpy(gdb->b_data, sbi->s_group_desc[j]->b_data,
+ gdb->b_size);
+ set_buffer_uptodate(gdb);
+
+ err = ext4_handle_dirty_metadata(handle, NULL, gdb);
+ if (unlikely(err)) {
+ brelse(gdb);
+ goto out;
+ }
brelse(gdb);
- goto exit_bh;
}
- lock_buffer(gdb);
- memcpy(gdb->b_data, sbi->s_group_desc[i]->b_data, gdb->b_size);
- set_buffer_uptodate(gdb);
- unlock_buffer(gdb);
- ext4_journal_dirty_metadata(handle, gdb);
- ext4_set_bit(bit, bh->b_data);
- brelse(gdb);
- }
- /* Zero out all of the reserved backup group descriptor table blocks */
- for (i = 0, bit = gdblocks + 1, block = start + bit;
- i < reserved_gdb; i++, block++, bit++) {
- struct buffer_head *gdb;
-
- ext4_debug("clear reserved block %#04llx (+%d)\n", block, bit);
-
- if ((err = extend_or_restart_transaction(handle, 1, bh)))
- goto exit_bh;
+ /* Zero out all of the reserved backup group descriptor
+ * table blocks
+ */
+ if (ext4_bg_has_super(sb, group)) {
+ err = sb_issue_zeroout(sb, gdblocks + start + 1,
+ reserved_gdb, GFP_NOFS);
+ if (err)
+ goto out;
+ }
- if (IS_ERR(gdb = bclean(handle, sb, block))) {
+handle_itb:
+ /* Initialize group tables of the grop @group */
+ if (!(bg_flags[i] & EXT4_BG_INODE_ZEROED))
+ goto handle_bb;
+
+ /* Zero out all of the inode table blocks */
+ block = group_data[i].inode_table;
+ ext4_debug("clear inode table blocks %#04llx -> %#04lx\n",
+ block, sbi->s_itb_per_group);
+ err = sb_issue_zeroout(sb, block, sbi->s_itb_per_group,
+ GFP_NOFS);
+ if (err)
+ goto out;
+
+handle_bb:
+ if (bg_flags[i] & EXT4_BG_BLOCK_UNINIT)
+ goto handle_ib;
+
+ /* Initialize block bitmap of the @group */
+ block = group_data[i].block_bitmap;
+ err = extend_or_restart_transaction(handle, 1);
+ if (err)
+ goto out;
+
+ bh = bclean(handle, sb, block);
+ if (IS_ERR(bh)) {
err = PTR_ERR(bh);
- goto exit_bh;
+ goto out;
}
- ext4_journal_dirty_metadata(handle, gdb);
- ext4_set_bit(bit, bh->b_data);
- brelse(gdb);
- }
- ext4_debug("mark block bitmap %#04llx (+%llu)\n", input->block_bitmap,
- input->block_bitmap - start);
- ext4_set_bit(input->block_bitmap - start, bh->b_data);
- ext4_debug("mark inode bitmap %#04llx (+%llu)\n", input->inode_bitmap,
- input->inode_bitmap - start);
- ext4_set_bit(input->inode_bitmap - start, bh->b_data);
-
- /* Zero out all of the inode table blocks */
- for (i = 0, block = input->inode_table, bit = block - start;
- i < sbi->s_itb_per_group; i++, bit++, block++) {
- struct buffer_head *it;
-
- ext4_debug("clear inode block %#04llx (+%d)\n", block, bit);
-
- if ((err = extend_or_restart_transaction(handle, 1, bh)))
- goto exit_bh;
-
- if (IS_ERR(it = bclean(handle, sb, block))) {
- err = PTR_ERR(it);
- goto exit_bh;
+ overhead = ext4_group_overhead_blocks(sb, group);
+ if (overhead != 0) {
+ ext4_debug("mark backup superblock %#04llx (+0)\n",
+ start);
+ ext4_set_bits(bh->b_data, 0, overhead);
}
- ext4_journal_dirty_metadata(handle, it);
- brelse(it);
- ext4_set_bit(bit, bh->b_data);
- }
+ ext4_mark_bitmap_end(group_data[i].blocks_count,
+ sb->s_blocksize * 8, bh->b_data);
+ err = ext4_handle_dirty_metadata(handle, NULL, bh);
+ if (err)
+ goto out;
+ brelse(bh);
- if ((err = extend_or_restart_transaction(handle, 2, bh)))
- goto exit_bh;
+handle_ib:
+ if (bg_flags[i] & EXT4_BG_INODE_UNINIT)
+ continue;
+
+ /* Initialize inode bitmap of the @group */
+ block = group_data[i].inode_bitmap;
+ err = extend_or_restart_transaction(handle, 1);
+ if (err)
+ goto out;
+ /* Mark unused entries in inode bitmap used */
+ bh = bclean(handle, sb, block);
+ if (IS_ERR(bh)) {
+ err = PTR_ERR(bh);
+ goto out;
+ }
- mark_bitmap_end(input->blocks_count, EXT4_BLOCKS_PER_GROUP(sb),
- bh->b_data);
- ext4_journal_dirty_metadata(handle, bh);
- brelse(bh);
+ ext4_mark_bitmap_end(EXT4_INODES_PER_GROUP(sb),
+ sb->s_blocksize * 8, bh->b_data);
+ err = ext4_handle_dirty_metadata(handle, NULL, bh);
+ if (err)
+ goto out;
+ brelse(bh);
+ }
+ bh = NULL;
+
+ /* Mark group tables in block bitmap */
+ for (j = 0; j < GROUP_TABLE_COUNT; j++) {
+ count = group_table_count[j];
+ start = (&group_data[0].block_bitmap)[j];
+ block = start;
+ for (i = 1; i < flex_gd->count; i++) {
+ block += group_table_count[j];
+ if (block == (&group_data[i].block_bitmap)[j]) {
+ count += group_table_count[j];
+ continue;
+ }
+ err = set_flexbg_block_bitmap(sb, handle,
+ flex_gd, start, count);
+ if (err)
+ goto out;
+ count = group_table_count[j];
+ start = (&group_data[i].block_bitmap)[j];
+ block = start;
+ }
- /* Mark unused entries in inode bitmap used */
- ext4_debug("clear inode bitmap %#04llx (+%llu)\n",
- input->inode_bitmap, input->inode_bitmap - start);
- if (IS_ERR(bh = bclean(handle, sb, input->inode_bitmap))) {
- err = PTR_ERR(bh);
- goto exit_journal;
+ if (count) {
+ err = set_flexbg_block_bitmap(sb, handle,
+ flex_gd, start, count);
+ if (err)
+ goto out;
+ }
}
- mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), EXT4_BLOCKS_PER_GROUP(sb),
- bh->b_data);
- ext4_journal_dirty_metadata(handle, bh);
-exit_bh:
+out:
brelse(bh);
-
-exit_journal:
- unlock_super(sb);
- if ((err2 = ext4_journal_stop(handle)) && !err)
+ err2 = ext4_journal_stop(handle);
+ if (err2 && !err)
err = err2;
return err;
@@ -354,10 +694,10 @@ static unsigned ext4_list_backups(struct super_block *sb, unsigned *three,
* groups in current filesystem that have BACKUPS, or -ve error code.
*/
static int verify_reserved_gdb(struct super_block *sb,
+ ext4_group_t end,
struct buffer_head *primary)
{
const ext4_fsblk_t blk = primary->b_blocknr;
- const ext4_group_t end = EXT4_SB(sb)->s_groups_count;
unsigned three = 1;
unsigned five = 5;
unsigned seven = 7;
@@ -368,8 +708,7 @@ static int verify_reserved_gdb(struct super_block *sb,
while ((grp = ext4_list_backups(sb, &three, &five, &seven)) < end) {
if (le32_to_cpu(*p++) !=
grp * EXT4_BLOCKS_PER_GROUP(sb) + blk){
- ext4_warning(sb, __FUNCTION__,
- "reserved GDT %llu"
+ ext4_warning(sb, "reserved GDT %llu"
" missing grp %d (%llu)",
blk, grp,
grp *
@@ -398,15 +737,15 @@ static int verify_reserved_gdb(struct super_block *sb,
* fail once we start modifying the data on disk, because JBD has no rollback.
*/
static int add_new_gdb(handle_t *handle, struct inode *inode,
- struct ext4_new_group_data *input,
- struct buffer_head **primary)
+ ext4_group_t group)
{
struct super_block *sb = inode->i_sb;
struct ext4_super_block *es = EXT4_SB(sb)->s_es;
- unsigned long gdb_num = input->group / EXT4_DESC_PER_BLOCK(sb);
+ unsigned long gdb_num = group / EXT4_DESC_PER_BLOCK(sb);
ext4_fsblk_t gdblock = EXT4_SB(sb)->s_sbh->b_blocknr + 1 + gdb_num;
struct buffer_head **o_group_desc, **n_group_desc;
struct buffer_head *dind;
+ struct buffer_head *gdb_bh;
int gdbackups;
struct ext4_iloc iloc;
__le32 *data;
@@ -419,22 +758,22 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
/*
* If we are not using the primary superblock/GDT copy don't resize,
- * because the user tools have no way of handling this. Probably a
- * bad time to do it anyways.
- */
+ * because the user tools have no way of handling this. Probably a
+ * bad time to do it anyways.
+ */
if (EXT4_SB(sb)->s_sbh->b_blocknr !=
le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block)) {
- ext4_warning(sb, __FUNCTION__,
- "won't resize using backup superblock at %llu",
+ ext4_warning(sb, "won't resize using backup superblock at %llu",
(unsigned long long)EXT4_SB(sb)->s_sbh->b_blocknr);
return -EPERM;
}
- *primary = sb_bread(sb, gdblock);
- if (!*primary)
+ gdb_bh = sb_bread(sb, gdblock);
+ if (!gdb_bh)
return -EIO;
- if ((gdbackups = verify_reserved_gdb(sb, *primary)) < 0) {
+ gdbackups = verify_reserved_gdb(sb, group, gdb_bh);
+ if (gdbackups < 0) {
err = gdbackups;
goto exit_bh;
}
@@ -448,32 +787,39 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
data = (__le32 *)dind->b_data;
if (le32_to_cpu(data[gdb_num % EXT4_ADDR_PER_BLOCK(sb)]) != gdblock) {
- ext4_warning(sb, __FUNCTION__,
- "new group %u GDT block %llu not reserved",
- input->group, gdblock);
+ ext4_warning(sb, "new group %u GDT block %llu not reserved",
+ group, gdblock);
err = -EINVAL;
goto exit_dind;
}
- if ((err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh)))
+ BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get_write_access");
+ err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh);
+ if (unlikely(err))
goto exit_dind;
- if ((err = ext4_journal_get_write_access(handle, *primary)))
- goto exit_sbh;
+ BUFFER_TRACE(gdb_bh, "get_write_access");
+ err = ext4_journal_get_write_access(handle, gdb_bh);
+ if (unlikely(err))
+ goto exit_dind;
- if ((err = ext4_journal_get_write_access(handle, dind)))
- goto exit_primary;
+ BUFFER_TRACE(dind, "get_write_access");
+ err = ext4_journal_get_write_access(handle, dind);
+ if (unlikely(err))
+ ext4_std_error(sb, err);
/* ext4_reserve_inode_write() gets a reference on the iloc */
- if ((err = ext4_reserve_inode_write(handle, inode, &iloc)))
- goto exit_dindj;
+ err = ext4_reserve_inode_write(handle, inode, &iloc);
+ if (unlikely(err))
+ goto exit_dind;
- n_group_desc = kmalloc((gdb_num + 1) * sizeof(struct buffer_head *),
- GFP_KERNEL);
+ n_group_desc = ext4_kvmalloc((gdb_num + 1) *
+ sizeof(struct buffer_head *),
+ GFP_NOFS);
if (!n_group_desc) {
err = -ENOMEM;
- ext4_warning (sb, __FUNCTION__,
- "not enough memory for %lu groups", gdb_num + 1);
+ ext4_warning(sb, "not enough memory for %lu groups",
+ gdb_num + 1);
goto exit_inode;
}
@@ -487,46 +833,89 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
* reserved inode, and will become GDT blocks (primary and backup).
*/
data[gdb_num % EXT4_ADDR_PER_BLOCK(sb)] = 0;
- ext4_journal_dirty_metadata(handle, dind);
- brelse(dind);
+ err = ext4_handle_dirty_metadata(handle, NULL, dind);
+ if (unlikely(err)) {
+ ext4_std_error(sb, err);
+ goto exit_inode;
+ }
inode->i_blocks -= (gdbackups + 1) * sb->s_blocksize >> 9;
ext4_mark_iloc_dirty(handle, inode, &iloc);
- memset((*primary)->b_data, 0, sb->s_blocksize);
- ext4_journal_dirty_metadata(handle, *primary);
+ memset(gdb_bh->b_data, 0, sb->s_blocksize);
+ err = ext4_handle_dirty_metadata(handle, NULL, gdb_bh);
+ if (unlikely(err)) {
+ ext4_std_error(sb, err);
+ goto exit_inode;
+ }
+ brelse(dind);
o_group_desc = EXT4_SB(sb)->s_group_desc;
memcpy(n_group_desc, o_group_desc,
EXT4_SB(sb)->s_gdb_count * sizeof(struct buffer_head *));
- n_group_desc[gdb_num] = *primary;
+ n_group_desc[gdb_num] = gdb_bh;
EXT4_SB(sb)->s_group_desc = n_group_desc;
EXT4_SB(sb)->s_gdb_count++;
- kfree(o_group_desc);
+ ext4_kvfree(o_group_desc);
- es->s_reserved_gdt_blocks =
- cpu_to_le16(le16_to_cpu(es->s_reserved_gdt_blocks) - 1);
- ext4_journal_dirty_metadata(handle, EXT4_SB(sb)->s_sbh);
+ le16_add_cpu(&es->s_reserved_gdt_blocks, -1);
+ err = ext4_handle_dirty_super(handle, sb);
+ if (err)
+ ext4_std_error(sb, err);
- return 0;
+ return err;
exit_inode:
- //ext4_journal_release_buffer(handle, iloc.bh);
+ ext4_kvfree(n_group_desc);
brelse(iloc.bh);
-exit_dindj:
- //ext4_journal_release_buffer(handle, dind);
-exit_primary:
- //ext4_journal_release_buffer(handle, *primary);
-exit_sbh:
- //ext4_journal_release_buffer(handle, *primary);
exit_dind:
brelse(dind);
exit_bh:
- brelse(*primary);
+ brelse(gdb_bh);
ext4_debug("leaving with error %d\n", err);
return err;
}
/*
+ * add_new_gdb_meta_bg is the sister of add_new_gdb.
+ */
+static int add_new_gdb_meta_bg(struct super_block *sb,
+ handle_t *handle, ext4_group_t group) {
+ ext4_fsblk_t gdblock;
+ struct buffer_head *gdb_bh;
+ struct buffer_head **o_group_desc, **n_group_desc;
+ unsigned long gdb_num = group / EXT4_DESC_PER_BLOCK(sb);
+ int err;
+
+ gdblock = ext4_meta_bg_first_block_no(sb, group) +
+ ext4_bg_has_super(sb, group);
+ gdb_bh = sb_bread(sb, gdblock);
+ if (!gdb_bh)
+ return -EIO;
+ n_group_desc = ext4_kvmalloc((gdb_num + 1) *
+ sizeof(struct buffer_head *),
+ GFP_NOFS);
+ if (!n_group_desc) {
+ err = -ENOMEM;
+ ext4_warning(sb, "not enough memory for %lu groups",
+ gdb_num + 1);
+ return err;
+ }
+
+ o_group_desc = EXT4_SB(sb)->s_group_desc;
+ memcpy(n_group_desc, o_group_desc,
+ EXT4_SB(sb)->s_gdb_count * sizeof(struct buffer_head *));
+ n_group_desc[gdb_num] = gdb_bh;
+ EXT4_SB(sb)->s_group_desc = n_group_desc;
+ EXT4_SB(sb)->s_gdb_count++;
+ ext4_kvfree(o_group_desc);
+ BUFFER_TRACE(gdb_bh, "get_write_access");
+ err = ext4_journal_get_write_access(handle, gdb_bh);
+ if (unlikely(err))
+ brelse(gdb_bh);
+ return err;
+}
+
+/*
* Called when we are adding a new group which has a backup copy of each of
* the GDT blocks (i.e. sparse group) and there are reserved GDT blocks.
* We need to add these reserved backup GDT blocks to the resize inode, so
@@ -540,7 +929,7 @@ exit_bh:
* backup GDT blocks are stored in their reserved primary GDT block.
*/
static int reserve_backup_gdb(handle_t *handle, struct inode *inode,
- struct ext4_new_group_data *input)
+ ext4_group_t group)
{
struct super_block *sb = inode->i_sb;
int reserved_gdb =le16_to_cpu(EXT4_SB(sb)->s_es->s_reserved_gdt_blocks);
@@ -553,7 +942,7 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode,
int res, i;
int err;
- primary = kmalloc(reserved_gdb * sizeof(*primary), GFP_KERNEL);
+ primary = kmalloc(reserved_gdb * sizeof(*primary), GFP_NOFS);
if (!primary)
return -ENOMEM;
@@ -565,14 +954,14 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode,
}
blk = EXT4_SB(sb)->s_sbh->b_blocknr + 1 + EXT4_SB(sb)->s_gdb_count;
- data = (__le32 *)dind->b_data + EXT4_SB(sb)->s_gdb_count;
+ data = (__le32 *)dind->b_data + (EXT4_SB(sb)->s_gdb_count %
+ EXT4_ADDR_PER_BLOCK(sb));
end = (__le32 *)dind->b_data + EXT4_ADDR_PER_BLOCK(sb);
/* Get each reserved primary GDT block and verify it holds backups */
for (res = 0; res < reserved_gdb; res++, blk++) {
if (le32_to_cpu(*data) != blk) {
- ext4_warning(sb, __FUNCTION__,
- "reserved block %llu"
+ ext4_warning(sb, "reserved block %llu"
" not at offset %ld",
blk,
(long)(data - (__le32 *)dind->b_data));
@@ -584,7 +973,8 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode,
err = -EIO;
goto exit_bh;
}
- if ((gdbackups = verify_reserved_gdb(sb, primary[res])) < 0) {
+ gdbackups = verify_reserved_gdb(sb, group, primary[res]);
+ if (gdbackups < 0) {
brelse(primary[res]);
err = gdbackups;
goto exit_bh;
@@ -594,14 +984,9 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode,
}
for (i = 0; i < reserved_gdb; i++) {
- if ((err = ext4_journal_get_write_access(handle, primary[i]))) {
- /*
- int j;
- for (j = 0; j < i; j++)
- ext4_journal_release_buffer(handle, primary[j]);
- */
+ BUFFER_TRACE(primary[i], "get_write_access");
+ if ((err = ext4_journal_get_write_access(handle, primary[i])))
goto exit_bh;
- }
}
if ((err = ext4_reserve_inode_write(handle, inode, &iloc)))
@@ -611,7 +996,7 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode,
* Finally we can add each of the reserved backup GDT blocks from
* the new group to its reserved primary GDT block.
*/
- blk = input->group * EXT4_BLOCKS_PER_GROUP(sb);
+ blk = group * EXT4_BLOCKS_PER_GROUP(sb);
for (i = 0; i < reserved_gdb; i++) {
int err2;
data = (__le32 *)primary[i]->b_data;
@@ -619,7 +1004,7 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode,
primary[i]->b_blocknr, gdbackups,
blk + primary[i]->b_blocknr); */
data[gdbackups] = cpu_to_le32(blk + primary[i]->b_blocknr);
- err2 = ext4_journal_dirty_metadata(handle, primary[i]);
+ err2 = ext4_handle_dirty_metadata(handle, NULL, primary[i]);
if (!err)
err = err2;
}
@@ -646,49 +1031,68 @@ exit_free:
* important part is that the new block and inode counts are in the backup
* superblocks, and the location of the new group metadata in the GDT backups.
*
- * We do not need lock_super() for this, because these blocks are not
- * otherwise touched by the filesystem code when it is mounted. We don't
- * need to worry about last changing from sbi->s_groups_count, because the
- * worst that can happen is that we do not copy the full number of backups
- * at this time. The resize which changed s_groups_count will backup again.
+ * We do not need take the s_resize_lock for this, because these
+ * blocks are not otherwise touched by the filesystem code when it is
+ * mounted. We don't need to worry about last changing from
+ * sbi->s_groups_count, because the worst that can happen is that we
+ * do not copy the full number of backups at this time. The resize
+ * which changed s_groups_count will backup again.
*/
-static void update_backups(struct super_block *sb,
- int blk_off, char *data, int size)
+static void update_backups(struct super_block *sb, int blk_off, char *data,
+ int size, int meta_bg)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
- const ext4_group_t last = sbi->s_groups_count;
+ ext4_group_t last;
const int bpg = EXT4_BLOCKS_PER_GROUP(sb);
unsigned three = 1;
unsigned five = 5;
unsigned seven = 7;
- ext4_group_t group;
+ ext4_group_t group = 0;
int rest = sb->s_blocksize - size;
handle_t *handle;
int err = 0, err2;
- handle = ext4_journal_start_sb(sb, EXT4_MAX_TRANS_DATA);
+ handle = ext4_journal_start_sb(sb, EXT4_HT_RESIZE, EXT4_MAX_TRANS_DATA);
if (IS_ERR(handle)) {
group = 1;
err = PTR_ERR(handle);
goto exit_err;
}
- while ((group = ext4_list_backups(sb, &three, &five, &seven)) < last) {
+ if (meta_bg == 0) {
+ group = ext4_list_backups(sb, &three, &five, &seven);
+ last = sbi->s_groups_count;
+ } else {
+ group = ext4_meta_bg_first_group(sb, group) + 1;
+ last = (ext4_group_t)(group + EXT4_DESC_PER_BLOCK(sb) - 2);
+ }
+
+ while (group < sbi->s_groups_count) {
struct buffer_head *bh;
+ ext4_fsblk_t backup_block;
/* Out of journal space, and can't get more - abort - so sad */
- if (handle->h_buffer_credits == 0 &&
+ if (ext4_handle_valid(handle) &&
+ handle->h_buffer_credits == 0 &&
ext4_journal_extend(handle, EXT4_MAX_TRANS_DATA) &&
(err = ext4_journal_restart(handle, EXT4_MAX_TRANS_DATA)))
break;
- bh = sb_getblk(sb, group * bpg + blk_off);
- if (!bh) {
- err = -EIO;
+ if (meta_bg == 0)
+ backup_block = group * bpg + blk_off;
+ else
+ backup_block = (ext4_group_first_block_no(sb, group) +
+ ext4_bg_has_super(sb, group));
+
+ bh = sb_getblk(sb, backup_block);
+ if (unlikely(!bh)) {
+ err = -ENOMEM;
break;
}
- ext4_debug("update metadata backup %#04lx\n",
- (unsigned long)bh->b_blocknr);
+ ext4_debug("update metadata backup %llu(+%llu)\n",
+ backup_block, backup_block -
+ ext4_group_first_block_no(sb, group));
+ BUFFER_TRACE(bh, "get_write_access");
if ((err = ext4_journal_get_write_access(handle, bh)))
break;
lock_buffer(bh);
@@ -697,8 +1101,17 @@ static void update_backups(struct super_block *sb,
memset(bh->b_data + size, 0, rest);
set_buffer_uptodate(bh);
unlock_buffer(bh);
- ext4_journal_dirty_metadata(handle, bh);
+ err = ext4_handle_dirty_metadata(handle, NULL, bh);
+ if (unlikely(err))
+ ext4_std_error(sb, err);
brelse(bh);
+
+ if (meta_bg == 0)
+ group = ext4_list_backups(sb, &three, &five, &seven);
+ else if (group == last)
+ break;
+ else
+ group = last;
}
if ((err2 = ext4_journal_stop(handle)) && !err)
err = err2;
@@ -715,8 +1128,7 @@ static void update_backups(struct super_block *sb,
*/
exit_err:
if (err) {
- ext4_warning(sb, __FUNCTION__,
- "can't update backup for group %lu (err %d), "
+ ext4_warning(sb, "can't update backup for group %u (err %d), "
"forcing fsck on next reboot", group, err);
sbi->s_mount_state &= ~EXT4_VALID_FS;
sbi->s_es->s_state &= cpu_to_le16(~EXT4_VALID_FS);
@@ -724,147 +1136,184 @@ exit_err:
}
}
-/* Add group descriptor data to an existing or new group descriptor block.
- * Ensure we handle all possible error conditions _before_ we start modifying
- * the filesystem, because we cannot abort the transaction and not have it
- * write the data to disk.
- *
- * If we are on a GDT block boundary, we need to get the reserved GDT block.
- * Otherwise, we may need to add backup GDT blocks for a sparse group.
+/*
+ * ext4_add_new_descs() adds @count group descriptor of groups
+ * starting at @group
*
- * We only need to hold the superblock lock while we are actually adding
- * in the new group's counts to the superblock. Prior to that we have
- * not really "added" the group at all. We re-check that we are still
- * adding in the last group in case things have changed since verifying.
+ * @handle: journal handle
+ * @sb: super block
+ * @group: the group no. of the first group desc to be added
+ * @resize_inode: the resize inode
+ * @count: number of group descriptors to be added
*/
-int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
+static int ext4_add_new_descs(handle_t *handle, struct super_block *sb,
+ ext4_group_t group, struct inode *resize_inode,
+ ext4_group_t count)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
struct ext4_super_block *es = sbi->s_es;
- int reserved_gdb = ext4_bg_has_super(sb, input->group) ?
- le16_to_cpu(es->s_reserved_gdt_blocks) : 0;
- struct buffer_head *primary = NULL;
- struct ext4_group_desc *gdp;
- struct inode *inode = NULL;
- handle_t *handle;
- int gdb_off, gdb_num;
- int err, err2;
-
- gdb_num = input->group / EXT4_DESC_PER_BLOCK(sb);
- gdb_off = input->group % EXT4_DESC_PER_BLOCK(sb);
-
- if (gdb_off == 0 && !EXT4_HAS_RO_COMPAT_FEATURE(sb,
- EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER)) {
- ext4_warning(sb, __FUNCTION__,
- "Can't resize non-sparse filesystem further");
- return -EPERM;
- }
-
- if (ext4_blocks_count(es) + input->blocks_count <
- ext4_blocks_count(es)) {
- ext4_warning(sb, __FUNCTION__, "blocks_count overflow\n");
- return -EINVAL;
- }
-
- if (le32_to_cpu(es->s_inodes_count) + EXT4_INODES_PER_GROUP(sb) <
- le32_to_cpu(es->s_inodes_count)) {
- ext4_warning(sb, __FUNCTION__, "inodes_count overflow\n");
- return -EINVAL;
+ struct buffer_head *gdb_bh;
+ int i, gdb_off, gdb_num, err = 0;
+ int meta_bg;
+
+ meta_bg = EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_META_BG);
+ for (i = 0; i < count; i++, group++) {
+ int reserved_gdb = ext4_bg_has_super(sb, group) ?
+ le16_to_cpu(es->s_reserved_gdt_blocks) : 0;
+
+ gdb_off = group % EXT4_DESC_PER_BLOCK(sb);
+ gdb_num = group / EXT4_DESC_PER_BLOCK(sb);
+
+ /*
+ * We will only either add reserved group blocks to a backup group
+ * or remove reserved blocks for the first group in a new group block.
+ * Doing both would be mean more complex code, and sane people don't
+ * use non-sparse filesystems anymore. This is already checked above.
+ */
+ if (gdb_off) {
+ gdb_bh = sbi->s_group_desc[gdb_num];
+ BUFFER_TRACE(gdb_bh, "get_write_access");
+ err = ext4_journal_get_write_access(handle, gdb_bh);
+
+ if (!err && reserved_gdb && ext4_bg_num_gdb(sb, group))
+ err = reserve_backup_gdb(handle, resize_inode, group);
+ } else if (meta_bg != 0) {
+ err = add_new_gdb_meta_bg(sb, handle, group);
+ } else {
+ err = add_new_gdb(handle, resize_inode, group);
+ }
+ if (err)
+ break;
}
+ return err;
+}
- if (reserved_gdb || gdb_off == 0) {
- if (!EXT4_HAS_COMPAT_FEATURE(sb,
- EXT4_FEATURE_COMPAT_RESIZE_INODE)){
- ext4_warning(sb, __FUNCTION__,
- "No reserved GDT blocks, can't resize");
- return -EPERM;
- }
- inode = ext4_iget(sb, EXT4_RESIZE_INO);
- if (IS_ERR(inode)) {
- ext4_warning(sb, __FUNCTION__,
- "Error opening resize inode");
- return PTR_ERR(inode);
+static struct buffer_head *ext4_get_bitmap(struct super_block *sb, __u64 block)
+{
+ struct buffer_head *bh = sb_getblk(sb, block);
+ if (unlikely(!bh))
+ return NULL;
+ if (!bh_uptodate_or_lock(bh)) {
+ if (bh_submit_read(bh) < 0) {
+ brelse(bh);
+ return NULL;
}
}
- if ((err = verify_group_input(sb, input)))
- goto exit_put;
+ return bh;
+}
- if ((err = setup_new_group_blocks(sb, input)))
- goto exit_put;
+static int ext4_set_bitmap_checksums(struct super_block *sb,
+ ext4_group_t group,
+ struct ext4_group_desc *gdp,
+ struct ext4_new_group_data *group_data)
+{
+ struct buffer_head *bh;
- /*
- * We will always be modifying at least the superblock and a GDT
- * block. If we are adding a group past the last current GDT block,
- * we will also modify the inode and the dindirect block. If we
- * are adding a group with superblock/GDT backups we will also
- * modify each of the reserved GDT dindirect blocks.
- */
- handle = ext4_journal_start_sb(sb,
- ext4_bg_has_super(sb, input->group) ?
- 3 + reserved_gdb : 4);
- if (IS_ERR(handle)) {
- err = PTR_ERR(handle);
- goto exit_put;
- }
+ if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
+ EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+ return 0;
- lock_super(sb);
- if (input->group != sbi->s_groups_count) {
- ext4_warning(sb, __FUNCTION__,
- "multiple resizers run on filesystem!");
- err = -EBUSY;
- goto exit_journal;
- }
+ bh = ext4_get_bitmap(sb, group_data->inode_bitmap);
+ if (!bh)
+ return -EIO;
+ ext4_inode_bitmap_csum_set(sb, group, gdp, bh,
+ EXT4_INODES_PER_GROUP(sb) / 8);
+ brelse(bh);
- if ((err = ext4_journal_get_write_access(handle, sbi->s_sbh)))
- goto exit_journal;
+ bh = ext4_get_bitmap(sb, group_data->block_bitmap);
+ if (!bh)
+ return -EIO;
+ ext4_block_bitmap_csum_set(sb, group, gdp, bh);
+ brelse(bh);
- /*
- * We will only either add reserved group blocks to a backup group
- * or remove reserved blocks for the first group in a new group block.
- * Doing both would be mean more complex code, and sane people don't
- * use non-sparse filesystems anymore. This is already checked above.
- */
- if (gdb_off) {
- primary = sbi->s_group_desc[gdb_num];
- if ((err = ext4_journal_get_write_access(handle, primary)))
- goto exit_journal;
-
- if (reserved_gdb && ext4_bg_num_gdb(sb, input->group) &&
- (err = reserve_backup_gdb(handle, inode, input)))
- goto exit_journal;
- } else if ((err = add_new_gdb(handle, inode, input, &primary)))
- goto exit_journal;
+ return 0;
+}
- /*
- * OK, now we've set up the new group. Time to make it active.
- *
- * Current kernels don't lock all allocations via lock_super(),
- * so we have to be safe wrt. concurrent accesses the group
- * data. So we need to be careful to set all of the relevant
- * group descriptor data etc. *before* we enable the group.
- *
- * The key field here is sbi->s_groups_count: as long as
- * that retains its old value, nobody is going to access the new
- * group.
- *
- * So first we update all the descriptor metadata for the new
- * group; then we update the total disk blocks count; then we
- * update the groups count to enable the group; then finally we
- * update the free space counts so that the system can start
- * using the new disk blocks.
- */
+/*
+ * ext4_setup_new_descs() will set up the group descriptor descriptors of a flex bg
+ */
+static int ext4_setup_new_descs(handle_t *handle, struct super_block *sb,
+ struct ext4_new_flex_group_data *flex_gd)
+{
+ struct ext4_new_group_data *group_data = flex_gd->groups;
+ struct ext4_group_desc *gdp;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct buffer_head *gdb_bh;
+ ext4_group_t group;
+ __u16 *bg_flags = flex_gd->bg_flags;
+ int i, gdb_off, gdb_num, err = 0;
+
+
+ for (i = 0; i < flex_gd->count; i++, group_data++, bg_flags++) {
+ group = group_data->group;
+
+ gdb_off = group % EXT4_DESC_PER_BLOCK(sb);
+ gdb_num = group / EXT4_DESC_PER_BLOCK(sb);
+
+ /*
+ * get_write_access() has been called on gdb_bh by ext4_add_new_desc().
+ */
+ gdb_bh = sbi->s_group_desc[gdb_num];
+ /* Update group descriptor block for new group */
+ gdp = (struct ext4_group_desc *)(gdb_bh->b_data +
+ gdb_off * EXT4_DESC_SIZE(sb));
+
+ memset(gdp, 0, EXT4_DESC_SIZE(sb));
+ ext4_block_bitmap_set(sb, gdp, group_data->block_bitmap);
+ ext4_inode_bitmap_set(sb, gdp, group_data->inode_bitmap);
+ err = ext4_set_bitmap_checksums(sb, group, gdp, group_data);
+ if (err) {
+ ext4_std_error(sb, err);
+ break;
+ }
- /* Update group descriptor block for new group */
- gdp = (struct ext4_group_desc *)primary->b_data + gdb_off;
+ ext4_inode_table_set(sb, gdp, group_data->inode_table);
+ ext4_free_group_clusters_set(sb, gdp,
+ EXT4_NUM_B2C(sbi, group_data->free_blocks_count));
+ ext4_free_inodes_set(sb, gdp, EXT4_INODES_PER_GROUP(sb));
+ if (ext4_has_group_desc_csum(sb))
+ ext4_itable_unused_set(sb, gdp,
+ EXT4_INODES_PER_GROUP(sb));
+ gdp->bg_flags = cpu_to_le16(*bg_flags);
+ ext4_group_desc_csum_set(sb, group, gdp);
+
+ err = ext4_handle_dirty_metadata(handle, NULL, gdb_bh);
+ if (unlikely(err)) {
+ ext4_std_error(sb, err);
+ break;
+ }
- ext4_block_bitmap_set(sb, gdp, input->block_bitmap); /* LV FIXME */
- ext4_inode_bitmap_set(sb, gdp, input->inode_bitmap); /* LV FIXME */
- ext4_inode_table_set(sb, gdp, input->inode_table); /* LV FIXME */
- gdp->bg_free_blocks_count = cpu_to_le16(input->free_blocks_count);
- gdp->bg_free_inodes_count = cpu_to_le16(EXT4_INODES_PER_GROUP(sb));
- gdp->bg_checksum = ext4_group_desc_csum(sbi, input->group, gdp);
+ /*
+ * We can allocate memory for mb_alloc based on the new group
+ * descriptor
+ */
+ err = ext4_mb_add_groupinfo(sb, group, gdp);
+ if (err)
+ break;
+ }
+ return err;
+}
+/*
+ * ext4_update_super() updates the super block so that the newly added
+ * groups can be seen by the filesystem.
+ *
+ * @sb: super block
+ * @flex_gd: new added groups
+ */
+static void ext4_update_super(struct super_block *sb,
+ struct ext4_new_flex_group_data *flex_gd)
+{
+ ext4_fsblk_t blocks_count = 0;
+ ext4_fsblk_t free_blocks = 0;
+ ext4_fsblk_t reserved_blocks = 0;
+ struct ext4_new_group_data *group_data = flex_gd->groups;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_super_block *es = sbi->s_es;
+ int i;
+
+ BUG_ON(flex_gd->count == 0 || group_data == NULL);
/*
* Make the new blocks and inodes valid next. We do this before
* increasing the group count so that once the group is enabled,
@@ -875,26 +1324,35 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
* blocks/inodes before the group is live won't actually let us
* allocate the new space yet.
*/
- ext4_blocks_count_set(es, ext4_blocks_count(es) +
- input->blocks_count);
- es->s_inodes_count = cpu_to_le32(le32_to_cpu(es->s_inodes_count) +
- EXT4_INODES_PER_GROUP(sb));
+ for (i = 0; i < flex_gd->count; i++) {
+ blocks_count += group_data[i].blocks_count;
+ free_blocks += group_data[i].free_blocks_count;
+ }
+ reserved_blocks = ext4_r_blocks_count(es) * 100;
+ reserved_blocks = div64_u64(reserved_blocks, ext4_blocks_count(es));
+ reserved_blocks *= blocks_count;
+ do_div(reserved_blocks, 100);
+
+ ext4_blocks_count_set(es, ext4_blocks_count(es) + blocks_count);
+ ext4_free_blocks_count_set(es, ext4_free_blocks_count(es) + free_blocks);
+ le32_add_cpu(&es->s_inodes_count, EXT4_INODES_PER_GROUP(sb) *
+ flex_gd->count);
+ le32_add_cpu(&es->s_free_inodes_count, EXT4_INODES_PER_GROUP(sb) *
+ flex_gd->count);
+
+ ext4_debug("free blocks count %llu", ext4_free_blocks_count(es));
/*
* We need to protect s_groups_count against other CPUs seeing
* inconsistent state in the superblock.
*
* The precise rules we use are:
*
- * * Writers of s_groups_count *must* hold lock_super
- * AND
- * * Writers must perform a smp_wmb() after updating all dependent
- * data and before modifying the groups count
+ * * Writers must perform a smp_wmb() after updating all
+ * dependent data and before modifying the groups count
*
- * * Readers must hold lock_super() over the access
- * OR
- * * Readers must perform an smp_rmb() after reading the groups count
- * and before reading any dependent data.
+ * * Readers must perform an smp_rmb() after reading the groups
+ * count and before reading any dependent data.
*
* NB. These rules can be relaxed when checking the group count
* while freeing data, as we can only allocate from a block
@@ -905,40 +1363,334 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
smp_wmb();
/* Update the global fs size fields */
- sbi->s_groups_count++;
-
- ext4_journal_dirty_metadata(handle, primary);
+ sbi->s_groups_count += flex_gd->count;
+ sbi->s_blockfile_groups = min_t(ext4_group_t, sbi->s_groups_count,
+ (EXT4_MAX_BLOCK_FILE_PHYS / EXT4_BLOCKS_PER_GROUP(sb)));
/* Update the reserved block counts only once the new group is
* active. */
ext4_r_blocks_count_set(es, ext4_r_blocks_count(es) +
- input->reserved_blocks);
+ reserved_blocks);
/* Update the free space counts */
- percpu_counter_add(&sbi->s_freeblocks_counter,
- input->free_blocks_count);
+ percpu_counter_add(&sbi->s_freeclusters_counter,
+ EXT4_NUM_B2C(sbi, free_blocks));
percpu_counter_add(&sbi->s_freeinodes_counter,
- EXT4_INODES_PER_GROUP(sb));
+ EXT4_INODES_PER_GROUP(sb) * flex_gd->count);
+
+ ext4_debug("free blocks count %llu",
+ percpu_counter_read(&sbi->s_freeclusters_counter));
+ if (EXT4_HAS_INCOMPAT_FEATURE(sb,
+ EXT4_FEATURE_INCOMPAT_FLEX_BG) &&
+ sbi->s_log_groups_per_flex) {
+ ext4_group_t flex_group;
+ flex_group = ext4_flex_group(sbi, group_data[0].group);
+ atomic64_add(EXT4_NUM_B2C(sbi, free_blocks),
+ &sbi->s_flex_groups[flex_group].free_clusters);
+ atomic_add(EXT4_INODES_PER_GROUP(sb) * flex_gd->count,
+ &sbi->s_flex_groups[flex_group].free_inodes);
+ }
- ext4_journal_dirty_metadata(handle, sbi->s_sbh);
- sb->s_dirt = 1;
+ /*
+ * Update the fs overhead information
+ */
+ ext4_calculate_overhead(sb);
+
+ if (test_opt(sb, DEBUG))
+ printk(KERN_DEBUG "EXT4-fs: added group %u:"
+ "%llu blocks(%llu free %llu reserved)\n", flex_gd->count,
+ blocks_count, free_blocks, reserved_blocks);
+}
+
+/* Add a flex group to an fs. Ensure we handle all possible error conditions
+ * _before_ we start modifying the filesystem, because we cannot abort the
+ * transaction and not have it write the data to disk.
+ */
+static int ext4_flex_group_add(struct super_block *sb,
+ struct inode *resize_inode,
+ struct ext4_new_flex_group_data *flex_gd)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_super_block *es = sbi->s_es;
+ ext4_fsblk_t o_blocks_count;
+ ext4_grpblk_t last;
+ ext4_group_t group;
+ handle_t *handle;
+ unsigned reserved_gdb;
+ int err = 0, err2 = 0, credit;
+
+ BUG_ON(!flex_gd->count || !flex_gd->groups || !flex_gd->bg_flags);
+
+ reserved_gdb = le16_to_cpu(es->s_reserved_gdt_blocks);
+ o_blocks_count = ext4_blocks_count(es);
+ ext4_get_group_no_and_offset(sb, o_blocks_count, &group, &last);
+ BUG_ON(last);
+
+ err = setup_new_flex_group_blocks(sb, flex_gd);
+ if (err)
+ goto exit;
+ /*
+ * We will always be modifying at least the superblock and GDT
+ * block. If we are adding a group past the last current GDT block,
+ * we will also modify the inode and the dindirect block. If we
+ * are adding a group with superblock/GDT backups we will also
+ * modify each of the reserved GDT dindirect blocks.
+ */
+ credit = flex_gd->count * 4 + reserved_gdb;
+ handle = ext4_journal_start_sb(sb, EXT4_HT_RESIZE, credit);
+ if (IS_ERR(handle)) {
+ err = PTR_ERR(handle);
+ goto exit;
+ }
+
+ BUFFER_TRACE(sbi->s_sbh, "get_write_access");
+ err = ext4_journal_get_write_access(handle, sbi->s_sbh);
+ if (err)
+ goto exit_journal;
+
+ group = flex_gd->groups[0].group;
+ BUG_ON(group != EXT4_SB(sb)->s_groups_count);
+ err = ext4_add_new_descs(handle, sb, group,
+ resize_inode, flex_gd->count);
+ if (err)
+ goto exit_journal;
+
+ err = ext4_setup_new_descs(handle, sb, flex_gd);
+ if (err)
+ goto exit_journal;
+
+ ext4_update_super(sb, flex_gd);
+
+ err = ext4_handle_dirty_super(handle, sb);
exit_journal:
- unlock_super(sb);
- if ((err2 = ext4_journal_stop(handle)) && !err)
+ err2 = ext4_journal_stop(handle);
+ if (!err)
err = err2;
+
if (!err) {
+ int gdb_num = group / EXT4_DESC_PER_BLOCK(sb);
+ int gdb_num_end = ((group + flex_gd->count - 1) /
+ EXT4_DESC_PER_BLOCK(sb));
+ int meta_bg = EXT4_HAS_INCOMPAT_FEATURE(sb,
+ EXT4_FEATURE_INCOMPAT_META_BG);
+ sector_t old_gdb = 0;
+
update_backups(sb, sbi->s_sbh->b_blocknr, (char *)es,
- sizeof(struct ext4_super_block));
- update_backups(sb, primary->b_blocknr, primary->b_data,
- primary->b_size);
+ sizeof(struct ext4_super_block), 0);
+ for (; gdb_num <= gdb_num_end; gdb_num++) {
+ struct buffer_head *gdb_bh;
+
+ gdb_bh = sbi->s_group_desc[gdb_num];
+ if (old_gdb == gdb_bh->b_blocknr)
+ continue;
+ update_backups(sb, gdb_bh->b_blocknr, gdb_bh->b_data,
+ gdb_bh->b_size, meta_bg);
+ old_gdb = gdb_bh->b_blocknr;
+ }
}
-exit_put:
+exit:
+ return err;
+}
+
+static int ext4_setup_next_flex_gd(struct super_block *sb,
+ struct ext4_new_flex_group_data *flex_gd,
+ ext4_fsblk_t n_blocks_count,
+ unsigned long flexbg_size)
+{
+ struct ext4_super_block *es = EXT4_SB(sb)->s_es;
+ struct ext4_new_group_data *group_data = flex_gd->groups;
+ ext4_fsblk_t o_blocks_count;
+ ext4_group_t n_group;
+ ext4_group_t group;
+ ext4_group_t last_group;
+ ext4_grpblk_t last;
+ ext4_grpblk_t blocks_per_group;
+ unsigned long i;
+
+ blocks_per_group = EXT4_BLOCKS_PER_GROUP(sb);
+
+ o_blocks_count = ext4_blocks_count(es);
+
+ if (o_blocks_count == n_blocks_count)
+ return 0;
+
+ ext4_get_group_no_and_offset(sb, o_blocks_count, &group, &last);
+ BUG_ON(last);
+ ext4_get_group_no_and_offset(sb, n_blocks_count - 1, &n_group, &last);
+
+ last_group = group | (flexbg_size - 1);
+ if (last_group > n_group)
+ last_group = n_group;
+
+ flex_gd->count = last_group - group + 1;
+
+ for (i = 0; i < flex_gd->count; i++) {
+ int overhead;
+
+ group_data[i].group = group + i;
+ group_data[i].blocks_count = blocks_per_group;
+ overhead = ext4_group_overhead_blocks(sb, group + i);
+ group_data[i].free_blocks_count = blocks_per_group - overhead;
+ if (ext4_has_group_desc_csum(sb)) {
+ flex_gd->bg_flags[i] = EXT4_BG_BLOCK_UNINIT |
+ EXT4_BG_INODE_UNINIT;
+ if (!test_opt(sb, INIT_INODE_TABLE))
+ flex_gd->bg_flags[i] |= EXT4_BG_INODE_ZEROED;
+ } else
+ flex_gd->bg_flags[i] = EXT4_BG_INODE_ZEROED;
+ }
+
+ if (last_group == n_group && ext4_has_group_desc_csum(sb))
+ /* We need to initialize block bitmap of last group. */
+ flex_gd->bg_flags[i - 1] &= ~EXT4_BG_BLOCK_UNINIT;
+
+ if ((last_group == n_group) && (last != blocks_per_group - 1)) {
+ group_data[i - 1].blocks_count = last + 1;
+ group_data[i - 1].free_blocks_count -= blocks_per_group-
+ last - 1;
+ }
+
+ return 1;
+}
+
+/* Add group descriptor data to an existing or new group descriptor block.
+ * Ensure we handle all possible error conditions _before_ we start modifying
+ * the filesystem, because we cannot abort the transaction and not have it
+ * write the data to disk.
+ *
+ * If we are on a GDT block boundary, we need to get the reserved GDT block.
+ * Otherwise, we may need to add backup GDT blocks for a sparse group.
+ *
+ * We only need to hold the superblock lock while we are actually adding
+ * in the new group's counts to the superblock. Prior to that we have
+ * not really "added" the group at all. We re-check that we are still
+ * adding in the last group in case things have changed since verifying.
+ */
+int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
+{
+ struct ext4_new_flex_group_data flex_gd;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_super_block *es = sbi->s_es;
+ int reserved_gdb = ext4_bg_has_super(sb, input->group) ?
+ le16_to_cpu(es->s_reserved_gdt_blocks) : 0;
+ struct inode *inode = NULL;
+ int gdb_off;
+ int err;
+ __u16 bg_flags = 0;
+
+ gdb_off = input->group % EXT4_DESC_PER_BLOCK(sb);
+
+ if (gdb_off == 0 && !EXT4_HAS_RO_COMPAT_FEATURE(sb,
+ EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER)) {
+ ext4_warning(sb, "Can't resize non-sparse filesystem further");
+ return -EPERM;
+ }
+
+ if (ext4_blocks_count(es) + input->blocks_count <
+ ext4_blocks_count(es)) {
+ ext4_warning(sb, "blocks_count overflow");
+ return -EINVAL;
+ }
+
+ if (le32_to_cpu(es->s_inodes_count) + EXT4_INODES_PER_GROUP(sb) <
+ le32_to_cpu(es->s_inodes_count)) {
+ ext4_warning(sb, "inodes_count overflow");
+ return -EINVAL;
+ }
+
+ if (reserved_gdb || gdb_off == 0) {
+ if (!EXT4_HAS_COMPAT_FEATURE(sb,
+ EXT4_FEATURE_COMPAT_RESIZE_INODE)
+ || !le16_to_cpu(es->s_reserved_gdt_blocks)) {
+ ext4_warning(sb,
+ "No reserved GDT blocks, can't resize");
+ return -EPERM;
+ }
+ inode = ext4_iget(sb, EXT4_RESIZE_INO);
+ if (IS_ERR(inode)) {
+ ext4_warning(sb, "Error opening resize inode");
+ return PTR_ERR(inode);
+ }
+ }
+
+
+ err = verify_group_input(sb, input);
+ if (err)
+ goto out;
+
+ err = ext4_alloc_flex_bg_array(sb, input->group + 1);
+ if (err)
+ goto out;
+
+ err = ext4_mb_alloc_groupinfo(sb, input->group + 1);
+ if (err)
+ goto out;
+
+ flex_gd.count = 1;
+ flex_gd.groups = input;
+ flex_gd.bg_flags = &bg_flags;
+ err = ext4_flex_group_add(sb, inode, &flex_gd);
+out:
iput(inode);
return err;
} /* ext4_group_add */
-/* Extend the filesystem to the new number of blocks specified. This entry
+/*
+ * extend a group without checking assuming that checking has been done.
+ */
+static int ext4_group_extend_no_check(struct super_block *sb,
+ ext4_fsblk_t o_blocks_count, ext4_grpblk_t add)
+{
+ struct ext4_super_block *es = EXT4_SB(sb)->s_es;
+ handle_t *handle;
+ int err = 0, err2;
+
+ /* We will update the superblock, one block bitmap, and
+ * one group descriptor via ext4_group_add_blocks().
+ */
+ handle = ext4_journal_start_sb(sb, EXT4_HT_RESIZE, 3);
+ if (IS_ERR(handle)) {
+ err = PTR_ERR(handle);
+ ext4_warning(sb, "error %d on journal start", err);
+ return err;
+ }
+
+ BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get_write_access");
+ err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh);
+ if (err) {
+ ext4_warning(sb, "error %d on journal write access", err);
+ goto errout;
+ }
+
+ ext4_blocks_count_set(es, o_blocks_count + add);
+ ext4_free_blocks_count_set(es, ext4_free_blocks_count(es) + add);
+ ext4_debug("freeing blocks %llu through %llu\n", o_blocks_count,
+ o_blocks_count + add);
+ /* We add the blocks to the bitmap and set the group need init bit */
+ err = ext4_group_add_blocks(handle, sb, o_blocks_count, add);
+ if (err)
+ goto errout;
+ ext4_handle_dirty_super(handle, sb);
+ ext4_debug("freed blocks %llu through %llu\n", o_blocks_count,
+ o_blocks_count + add);
+errout:
+ err2 = ext4_journal_stop(handle);
+ if (err2 && !err)
+ err = err2;
+
+ if (!err) {
+ if (test_opt(sb, DEBUG))
+ printk(KERN_DEBUG "EXT4-fs: extended group to %llu "
+ "blocks\n", ext4_blocks_count(es));
+ update_backups(sb, EXT4_SB(sb)->s_sbh->b_blocknr,
+ (char *)es, sizeof(struct ext4_super_block), 0);
+ }
+ return err;
+}
+
+/*
+ * Extend the filesystem to the new number of blocks specified. This entry
* point is only used to extend the current filesystem to the end of the last
* existing group. It can be accessed via ioctl, or by "remount,resize=<size>"
* for emergencies (because it has no dependencies on reserved blocks).
@@ -951,56 +1703,48 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
ext4_fsblk_t n_blocks_count)
{
ext4_fsblk_t o_blocks_count;
- ext4_group_t o_groups_count;
ext4_grpblk_t last;
ext4_grpblk_t add;
- struct buffer_head * bh;
- handle_t *handle;
+ struct buffer_head *bh;
int err;
- unsigned long freed_blocks;
+ ext4_group_t group;
- /* We don't need to worry about locking wrt other resizers just
- * yet: we're going to revalidate es->s_blocks_count after
- * taking lock_super() below. */
o_blocks_count = ext4_blocks_count(es);
- o_groups_count = EXT4_SB(sb)->s_groups_count;
if (test_opt(sb, DEBUG))
- printk(KERN_DEBUG "EXT4-fs: extending last group from %llu uto %llu blocks\n",
- o_blocks_count, n_blocks_count);
+ ext4_msg(sb, KERN_DEBUG,
+ "extending last group from %llu to %llu blocks",
+ o_blocks_count, n_blocks_count);
if (n_blocks_count == 0 || n_blocks_count == o_blocks_count)
return 0;
if (n_blocks_count > (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) {
- printk(KERN_ERR "EXT4-fs: filesystem on %s:"
- " too large to resize to %llu blocks safely\n",
- sb->s_id, n_blocks_count);
+ ext4_msg(sb, KERN_ERR,
+ "filesystem too large to resize to %llu blocks safely",
+ n_blocks_count);
if (sizeof(sector_t) < 8)
- ext4_warning(sb, __FUNCTION__,
- "CONFIG_LBD not enabled\n");
+ ext4_warning(sb, "CONFIG_LBDAF not enabled");
return -EINVAL;
}
if (n_blocks_count < o_blocks_count) {
- ext4_warning(sb, __FUNCTION__,
- "can't shrink FS - resize aborted");
- return -EBUSY;
+ ext4_warning(sb, "can't shrink FS - resize aborted");
+ return -EINVAL;
}
/* Handle the remaining blocks in the last group only. */
- ext4_get_group_no_and_offset(sb, o_blocks_count, NULL, &last);
+ ext4_get_group_no_and_offset(sb, o_blocks_count, &group, &last);
if (last == 0) {
- ext4_warning(sb, __FUNCTION__,
- "need to use ext2online to resize further");
+ ext4_warning(sb, "need to use ext2online to resize further");
return -EPERM;
}
add = EXT4_BLOCKS_PER_GROUP(sb) - last;
if (o_blocks_count + add < o_blocks_count) {
- ext4_warning(sb, __FUNCTION__, "blocks_count overflow");
+ ext4_warning(sb, "blocks_count overflow");
return -EINVAL;
}
@@ -1008,64 +1752,269 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
add = n_blocks_count - o_blocks_count;
if (o_blocks_count + add < n_blocks_count)
- ext4_warning(sb, __FUNCTION__,
- "will only finish group (%llu"
- " blocks, %u new)",
+ ext4_warning(sb, "will only finish group (%llu blocks, %u new)",
o_blocks_count + add, add);
/* See if the device is actually as big as what was requested */
- bh = sb_bread(sb, o_blocks_count + add -1);
+ bh = sb_bread(sb, o_blocks_count + add - 1);
if (!bh) {
- ext4_warning(sb, __FUNCTION__,
- "can't read last block, resize aborted");
+ ext4_warning(sb, "can't read last block, resize aborted");
return -ENOSPC;
}
brelse(bh);
- /* We will update the superblock, one block bitmap, and
- * one group descriptor via ext4_free_blocks().
- */
- handle = ext4_journal_start_sb(sb, 3);
- if (IS_ERR(handle)) {
- err = PTR_ERR(handle);
- ext4_warning(sb, __FUNCTION__, "error %d on journal start",err);
- goto exit_put;
+ err = ext4_group_extend_no_check(sb, o_blocks_count, add);
+ return err;
+} /* ext4_group_extend */
+
+
+static int num_desc_blocks(struct super_block *sb, ext4_group_t groups)
+{
+ return (groups + EXT4_DESC_PER_BLOCK(sb) - 1) / EXT4_DESC_PER_BLOCK(sb);
+}
+
+/*
+ * Release the resize inode and drop the resize_inode feature if there
+ * are no more reserved gdt blocks, and then convert the file system
+ * to enable meta_bg
+ */
+static int ext4_convert_meta_bg(struct super_block *sb, struct inode *inode)
+{
+ handle_t *handle;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_super_block *es = sbi->s_es;
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ ext4_fsblk_t nr;
+ int i, ret, err = 0;
+ int credits = 1;
+
+ ext4_msg(sb, KERN_INFO, "Converting file system to meta_bg");
+ if (inode) {
+ if (es->s_reserved_gdt_blocks) {
+ ext4_error(sb, "Unexpected non-zero "
+ "s_reserved_gdt_blocks");
+ return -EPERM;
+ }
+
+ /* Do a quick sanity check of the resize inode */
+ if (inode->i_blocks != 1 << (inode->i_blkbits - 9))
+ goto invalid_resize_inode;
+ for (i = 0; i < EXT4_N_BLOCKS; i++) {
+ if (i == EXT4_DIND_BLOCK) {
+ if (ei->i_data[i])
+ continue;
+ else
+ goto invalid_resize_inode;
+ }
+ if (ei->i_data[i])
+ goto invalid_resize_inode;
+ }
+ credits += 3; /* block bitmap, bg descriptor, resize inode */
}
- lock_super(sb);
- if (o_blocks_count != ext4_blocks_count(es)) {
- ext4_warning(sb, __FUNCTION__,
- "multiple resizers run on filesystem!");
- unlock_super(sb);
- ext4_journal_stop(handle);
- err = -EBUSY;
- goto exit_put;
+ handle = ext4_journal_start_sb(sb, EXT4_HT_RESIZE, credits);
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
+
+ BUFFER_TRACE(sbi->s_sbh, "get_write_access");
+ err = ext4_journal_get_write_access(handle, sbi->s_sbh);
+ if (err)
+ goto errout;
+
+ EXT4_CLEAR_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_RESIZE_INODE);
+ EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_META_BG);
+ sbi->s_es->s_first_meta_bg =
+ cpu_to_le32(num_desc_blocks(sb, sbi->s_groups_count));
+
+ err = ext4_handle_dirty_super(handle, sb);
+ if (err) {
+ ext4_std_error(sb, err);
+ goto errout;
}
- if ((err = ext4_journal_get_write_access(handle,
- EXT4_SB(sb)->s_sbh))) {
- ext4_warning(sb, __FUNCTION__,
- "error %d on journal write access", err);
- unlock_super(sb);
- ext4_journal_stop(handle);
- goto exit_put;
+ if (inode) {
+ nr = le32_to_cpu(ei->i_data[EXT4_DIND_BLOCK]);
+ ext4_free_blocks(handle, inode, NULL, nr, 1,
+ EXT4_FREE_BLOCKS_METADATA |
+ EXT4_FREE_BLOCKS_FORGET);
+ ei->i_data[EXT4_DIND_BLOCK] = 0;
+ inode->i_blocks = 0;
+
+ err = ext4_mark_inode_dirty(handle, inode);
+ if (err)
+ ext4_std_error(sb, err);
}
- ext4_blocks_count_set(es, o_blocks_count + add);
- ext4_journal_dirty_metadata(handle, EXT4_SB(sb)->s_sbh);
- sb->s_dirt = 1;
- unlock_super(sb);
- ext4_debug("freeing blocks %llu through %llu\n", o_blocks_count,
- o_blocks_count + add);
- ext4_free_blocks_sb(handle, sb, o_blocks_count, add, &freed_blocks);
- ext4_debug("freed blocks %llu through %llu\n", o_blocks_count,
- o_blocks_count + add);
- if ((err = ext4_journal_stop(handle)))
- goto exit_put;
- if (test_opt(sb, DEBUG))
- printk(KERN_DEBUG "EXT4-fs: extended group to %llu blocks\n",
- ext4_blocks_count(es));
- update_backups(sb, EXT4_SB(sb)->s_sbh->b_blocknr, (char *)es,
- sizeof(struct ext4_super_block));
-exit_put:
+
+errout:
+ ret = ext4_journal_stop(handle);
+ if (!err)
+ err = ret;
+ return ret;
+
+invalid_resize_inode:
+ ext4_error(sb, "corrupted/inconsistent resize inode");
+ return -EINVAL;
+}
+
+/*
+ * ext4_resize_fs() resizes a fs to new size specified by @n_blocks_count
+ *
+ * @sb: super block of the fs to be resized
+ * @n_blocks_count: the number of blocks resides in the resized fs
+ */
+int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count)
+{
+ struct ext4_new_flex_group_data *flex_gd = NULL;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_super_block *es = sbi->s_es;
+ struct buffer_head *bh;
+ struct inode *resize_inode = NULL;
+ ext4_grpblk_t add, offset;
+ unsigned long n_desc_blocks;
+ unsigned long o_desc_blocks;
+ ext4_group_t o_group;
+ ext4_group_t n_group;
+ ext4_fsblk_t o_blocks_count;
+ ext4_fsblk_t n_blocks_count_retry = 0;
+ unsigned long last_update_time = 0;
+ int err = 0, flexbg_size = 1 << sbi->s_log_groups_per_flex;
+ int meta_bg;
+
+ /* See if the device is actually as big as what was requested */
+ bh = sb_bread(sb, n_blocks_count - 1);
+ if (!bh) {
+ ext4_warning(sb, "can't read last block, resize aborted");
+ return -ENOSPC;
+ }
+ brelse(bh);
+
+retry:
+ o_blocks_count = ext4_blocks_count(es);
+
+ ext4_msg(sb, KERN_INFO, "resizing filesystem from %llu "
+ "to %llu blocks", o_blocks_count, n_blocks_count);
+
+ if (n_blocks_count < o_blocks_count) {
+ /* On-line shrinking not supported */
+ ext4_warning(sb, "can't shrink FS - resize aborted");
+ return -EINVAL;
+ }
+
+ if (n_blocks_count == o_blocks_count)
+ /* Nothing need to do */
+ return 0;
+
+ n_group = ext4_get_group_number(sb, n_blocks_count - 1);
+ if (n_group > (0xFFFFFFFFUL / EXT4_INODES_PER_GROUP(sb))) {
+ ext4_warning(sb, "resize would cause inodes_count overflow");
+ return -EINVAL;
+ }
+ ext4_get_group_no_and_offset(sb, o_blocks_count - 1, &o_group, &offset);
+
+ n_desc_blocks = num_desc_blocks(sb, n_group + 1);
+ o_desc_blocks = num_desc_blocks(sb, sbi->s_groups_count);
+
+ meta_bg = EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_META_BG);
+
+ if (EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_RESIZE_INODE)) {
+ if (meta_bg) {
+ ext4_error(sb, "resize_inode and meta_bg enabled "
+ "simultaneously");
+ return -EINVAL;
+ }
+ if (n_desc_blocks > o_desc_blocks +
+ le16_to_cpu(es->s_reserved_gdt_blocks)) {
+ n_blocks_count_retry = n_blocks_count;
+ n_desc_blocks = o_desc_blocks +
+ le16_to_cpu(es->s_reserved_gdt_blocks);
+ n_group = n_desc_blocks * EXT4_DESC_PER_BLOCK(sb);
+ n_blocks_count = n_group * EXT4_BLOCKS_PER_GROUP(sb);
+ n_group--; /* set to last group number */
+ }
+
+ if (!resize_inode)
+ resize_inode = ext4_iget(sb, EXT4_RESIZE_INO);
+ if (IS_ERR(resize_inode)) {
+ ext4_warning(sb, "Error opening resize inode");
+ return PTR_ERR(resize_inode);
+ }
+ }
+
+ if ((!resize_inode && !meta_bg) || n_blocks_count == o_blocks_count) {
+ err = ext4_convert_meta_bg(sb, resize_inode);
+ if (err)
+ goto out;
+ if (resize_inode) {
+ iput(resize_inode);
+ resize_inode = NULL;
+ }
+ if (n_blocks_count_retry) {
+ n_blocks_count = n_blocks_count_retry;
+ n_blocks_count_retry = 0;
+ goto retry;
+ }
+ }
+
+ /* extend the last group */
+ if (n_group == o_group)
+ add = n_blocks_count - o_blocks_count;
+ else
+ add = EXT4_BLOCKS_PER_GROUP(sb) - (offset + 1);
+ if (add > 0) {
+ err = ext4_group_extend_no_check(sb, o_blocks_count, add);
+ if (err)
+ goto out;
+ }
+
+ if (ext4_blocks_count(es) == n_blocks_count)
+ goto out;
+
+ err = ext4_alloc_flex_bg_array(sb, n_group + 1);
+ if (err)
+ return err;
+
+ err = ext4_mb_alloc_groupinfo(sb, n_group + 1);
+ if (err)
+ goto out;
+
+ flex_gd = alloc_flex_gd(flexbg_size);
+ if (flex_gd == NULL) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ /* Add flex groups. Note that a regular group is a
+ * flex group with 1 group.
+ */
+ while (ext4_setup_next_flex_gd(sb, flex_gd, n_blocks_count,
+ flexbg_size)) {
+ if (jiffies - last_update_time > HZ * 10) {
+ if (last_update_time)
+ ext4_msg(sb, KERN_INFO,
+ "resized to %llu blocks",
+ ext4_blocks_count(es));
+ last_update_time = jiffies;
+ }
+ if (ext4_alloc_group_tables(sb, flex_gd, flexbg_size) != 0)
+ break;
+ err = ext4_flex_group_add(sb, resize_inode, flex_gd);
+ if (unlikely(err))
+ break;
+ }
+
+ if (!err && n_blocks_count_retry) {
+ n_blocks_count = n_blocks_count_retry;
+ n_blocks_count_retry = 0;
+ free_flex_gd(flex_gd);
+ flex_gd = NULL;
+ goto retry;
+ }
+
+out:
+ if (flex_gd)
+ free_flex_gd(flex_gd);
+ if (resize_inode != NULL)
+ iput(resize_inode);
+ ext4_msg(sb, KERN_INFO, "resized filesystem to %llu", n_blocks_count);
return err;
-} /* ext4_group_extend */
+}
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 13383ba18f1..6df7bc611db 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -20,14 +20,12 @@
#include <linux/string.h>
#include <linux/fs.h>
#include <linux/time.h>
+#include <linux/vmalloc.h>
#include <linux/jbd2.h>
-#include <linux/ext4_fs.h>
-#include <linux/ext4_jbd2.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/blkdev.h>
#include <linux/parser.h>
-#include <linux/smp_lock.h>
#include <linux/buffer_head.h>
#include <linux/exportfs.h>
#include <linux/vfs.h>
@@ -36,43 +34,166 @@
#include <linux/namei.h>
#include <linux/quotaops.h>
#include <linux/seq_file.h>
+#include <linux/proc_fs.h>
+#include <linux/ctype.h>
#include <linux/log2.h>
#include <linux/crc16.h>
-
+#include <linux/cleancache.h>
#include <asm/uaccess.h>
+#include <linux/kthread.h>
+#include <linux/freezer.h>
+
+#include "ext4.h"
+#include "ext4_extents.h" /* Needed for trace points definition */
+#include "ext4_jbd2.h"
#include "xattr.h"
#include "acl.h"
-#include "namei.h"
-#include "group.h"
+#include "mballoc.h"
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/ext4.h>
+
+static struct proc_dir_entry *ext4_proc_root;
+static struct kset *ext4_kset;
+static struct ext4_lazy_init *ext4_li_info;
+static struct mutex ext4_li_mtx;
+static struct ext4_features *ext4_feat;
+static int ext4_mballoc_ready;
static int ext4_load_journal(struct super_block *, struct ext4_super_block *,
unsigned long journal_devnum);
-static int ext4_create_journal(struct super_block *, struct ext4_super_block *,
- unsigned int);
-static void ext4_commit_super (struct super_block * sb,
- struct ext4_super_block * es,
- int sync);
-static void ext4_mark_recovery_complete(struct super_block * sb,
- struct ext4_super_block * es);
-static void ext4_clear_journal_err(struct super_block * sb,
- struct ext4_super_block * es);
+static int ext4_show_options(struct seq_file *seq, struct dentry *root);
+static int ext4_commit_super(struct super_block *sb, int sync);
+static void ext4_mark_recovery_complete(struct super_block *sb,
+ struct ext4_super_block *es);
+static void ext4_clear_journal_err(struct super_block *sb,
+ struct ext4_super_block *es);
static int ext4_sync_fs(struct super_block *sb, int wait);
-static const char *ext4_decode_error(struct super_block * sb, int errno,
- char nbuf[16]);
-static int ext4_remount (struct super_block * sb, int * flags, char * data);
-static int ext4_statfs (struct dentry * dentry, struct kstatfs * buf);
-static void ext4_unlockfs(struct super_block *sb);
-static void ext4_write_super (struct super_block * sb);
-static void ext4_write_super_lockfs(struct super_block *sb);
+static int ext4_sync_fs_nojournal(struct super_block *sb, int wait);
+static int ext4_remount(struct super_block *sb, int *flags, char *data);
+static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf);
+static int ext4_unfreeze(struct super_block *sb);
+static int ext4_freeze(struct super_block *sb);
+static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags,
+ const char *dev_name, void *data);
+static inline int ext2_feature_set_ok(struct super_block *sb);
+static inline int ext3_feature_set_ok(struct super_block *sb);
+static int ext4_feature_set_ok(struct super_block *sb, int readonly);
+static void ext4_destroy_lazyinit_thread(void);
+static void ext4_unregister_li_request(struct super_block *sb);
+static void ext4_clear_request_list(void);
+static int ext4_reserve_clusters(struct ext4_sb_info *, ext4_fsblk_t);
+
+#if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
+static struct file_system_type ext2_fs_type = {
+ .owner = THIS_MODULE,
+ .name = "ext2",
+ .mount = ext4_mount,
+ .kill_sb = kill_block_super,
+ .fs_flags = FS_REQUIRES_DEV,
+};
+MODULE_ALIAS_FS("ext2");
+MODULE_ALIAS("ext2");
+#define IS_EXT2_SB(sb) ((sb)->s_bdev->bd_holder == &ext2_fs_type)
+#else
+#define IS_EXT2_SB(sb) (0)
+#endif
+
+
+#if !defined(CONFIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
+static struct file_system_type ext3_fs_type = {
+ .owner = THIS_MODULE,
+ .name = "ext3",
+ .mount = ext4_mount,
+ .kill_sb = kill_block_super,
+ .fs_flags = FS_REQUIRES_DEV,
+};
+MODULE_ALIAS_FS("ext3");
+MODULE_ALIAS("ext3");
+#define IS_EXT3_SB(sb) ((sb)->s_bdev->bd_holder == &ext3_fs_type)
+#else
+#define IS_EXT3_SB(sb) (0)
+#endif
+
+static int ext4_verify_csum_type(struct super_block *sb,
+ struct ext4_super_block *es)
+{
+ if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
+ EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+ return 1;
+
+ return es->s_checksum_type == EXT4_CRC32C_CHKSUM;
+}
+
+static __le32 ext4_superblock_csum(struct super_block *sb,
+ struct ext4_super_block *es)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ int offset = offsetof(struct ext4_super_block, s_checksum);
+ __u32 csum;
+
+ csum = ext4_chksum(sbi, ~0, (char *)es, offset);
+
+ return cpu_to_le32(csum);
+}
+
+static int ext4_superblock_csum_verify(struct super_block *sb,
+ struct ext4_super_block *es)
+{
+ if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
+ EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+ return 1;
+
+ return es->s_checksum == ext4_superblock_csum(sb, es);
+}
+
+void ext4_superblock_csum_set(struct super_block *sb)
+{
+ struct ext4_super_block *es = EXT4_SB(sb)->s_es;
+
+ if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
+ EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+ return;
+
+ es->s_checksum = ext4_superblock_csum(sb, es);
+}
+
+void *ext4_kvmalloc(size_t size, gfp_t flags)
+{
+ void *ret;
+
+ ret = kmalloc(size, flags | __GFP_NOWARN);
+ if (!ret)
+ ret = __vmalloc(size, flags, PAGE_KERNEL);
+ return ret;
+}
+
+void *ext4_kvzalloc(size_t size, gfp_t flags)
+{
+ void *ret;
+
+ ret = kzalloc(size, flags | __GFP_NOWARN);
+ if (!ret)
+ ret = __vmalloc(size, flags | __GFP_ZERO, PAGE_KERNEL);
+ return ret;
+}
+
+void ext4_kvfree(void *ptr)
+{
+ if (is_vmalloc_addr(ptr))
+ vfree(ptr);
+ else
+ kfree(ptr);
+}
ext4_fsblk_t ext4_block_bitmap(struct super_block *sb,
struct ext4_group_desc *bg)
{
return le32_to_cpu(bg->bg_block_bitmap_lo) |
(EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
- (ext4_fsblk_t)le32_to_cpu(bg->bg_block_bitmap_hi) << 32 : 0);
+ (ext4_fsblk_t)le32_to_cpu(bg->bg_block_bitmap_hi) << 32 : 0);
}
ext4_fsblk_t ext4_inode_bitmap(struct super_block *sb,
@@ -80,7 +201,7 @@ ext4_fsblk_t ext4_inode_bitmap(struct super_block *sb,
{
return le32_to_cpu(bg->bg_inode_bitmap_lo) |
(EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
- (ext4_fsblk_t)le32_to_cpu(bg->bg_inode_bitmap_hi) << 32 : 0);
+ (ext4_fsblk_t)le32_to_cpu(bg->bg_inode_bitmap_hi) << 32 : 0);
}
ext4_fsblk_t ext4_inode_table(struct super_block *sb,
@@ -88,7 +209,39 @@ ext4_fsblk_t ext4_inode_table(struct super_block *sb,
{
return le32_to_cpu(bg->bg_inode_table_lo) |
(EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
- (ext4_fsblk_t)le32_to_cpu(bg->bg_inode_table_hi) << 32 : 0);
+ (ext4_fsblk_t)le32_to_cpu(bg->bg_inode_table_hi) << 32 : 0);
+}
+
+__u32 ext4_free_group_clusters(struct super_block *sb,
+ struct ext4_group_desc *bg)
+{
+ return le16_to_cpu(bg->bg_free_blocks_count_lo) |
+ (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
+ (__u32)le16_to_cpu(bg->bg_free_blocks_count_hi) << 16 : 0);
+}
+
+__u32 ext4_free_inodes_count(struct super_block *sb,
+ struct ext4_group_desc *bg)
+{
+ return le16_to_cpu(bg->bg_free_inodes_count_lo) |
+ (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
+ (__u32)le16_to_cpu(bg->bg_free_inodes_count_hi) << 16 : 0);
+}
+
+__u32 ext4_used_dirs_count(struct super_block *sb,
+ struct ext4_group_desc *bg)
+{
+ return le16_to_cpu(bg->bg_used_dirs_count_lo) |
+ (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
+ (__u32)le16_to_cpu(bg->bg_used_dirs_count_hi) << 16 : 0);
+}
+
+__u32 ext4_itable_unused_count(struct super_block *sb,
+ struct ext4_group_desc *bg)
+{
+ return le16_to_cpu(bg->bg_itable_unused_lo) |
+ (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
+ (__u32)le16_to_cpu(bg->bg_itable_unused_hi) << 16 : 0);
}
void ext4_block_bitmap_set(struct super_block *sb,
@@ -115,76 +268,107 @@ void ext4_inode_table_set(struct super_block *sb,
bg->bg_inode_table_hi = cpu_to_le32(blk >> 32);
}
-/*
- * Wrappers for jbd2_journal_start/end.
- *
- * The only special thing we need to do here is to make sure that all
- * journal_end calls result in the superblock being marked dirty, so
- * that sync() will call the filesystem's write_super callback if
- * appropriate.
- */
-handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks)
+void ext4_free_group_clusters_set(struct super_block *sb,
+ struct ext4_group_desc *bg, __u32 count)
{
- journal_t *journal;
-
- if (sb->s_flags & MS_RDONLY)
- return ERR_PTR(-EROFS);
-
- /* Special case here: if the journal has aborted behind our
- * backs (eg. EIO in the commit thread), then we still need to
- * take the FS itself readonly cleanly. */
- journal = EXT4_SB(sb)->s_journal;
- if (is_journal_aborted(journal)) {
- ext4_abort(sb, __FUNCTION__,
- "Detected aborted journal");
- return ERR_PTR(-EROFS);
- }
-
- return jbd2_journal_start(journal, nblocks);
+ bg->bg_free_blocks_count_lo = cpu_to_le16((__u16)count);
+ if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
+ bg->bg_free_blocks_count_hi = cpu_to_le16(count >> 16);
}
-/*
- * The only special thing we need to do here is to make sure that all
- * jbd2_journal_stop calls result in the superblock being marked dirty, so
- * that sync() will call the filesystem's write_super callback if
- * appropriate.
- */
-int __ext4_journal_stop(const char *where, handle_t *handle)
+void ext4_free_inodes_set(struct super_block *sb,
+ struct ext4_group_desc *bg, __u32 count)
{
- struct super_block *sb;
- int err;
- int rc;
+ bg->bg_free_inodes_count_lo = cpu_to_le16((__u16)count);
+ if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
+ bg->bg_free_inodes_count_hi = cpu_to_le16(count >> 16);
+}
- sb = handle->h_transaction->t_journal->j_private;
- err = handle->h_err;
- rc = jbd2_journal_stop(handle);
+void ext4_used_dirs_set(struct super_block *sb,
+ struct ext4_group_desc *bg, __u32 count)
+{
+ bg->bg_used_dirs_count_lo = cpu_to_le16((__u16)count);
+ if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
+ bg->bg_used_dirs_count_hi = cpu_to_le16(count >> 16);
+}
- if (!err)
- err = rc;
- if (err)
- __ext4_std_error(sb, where, err);
- return err;
+void ext4_itable_unused_set(struct super_block *sb,
+ struct ext4_group_desc *bg, __u32 count)
+{
+ bg->bg_itable_unused_lo = cpu_to_le16((__u16)count);
+ if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
+ bg->bg_itable_unused_hi = cpu_to_le16(count >> 16);
}
-void ext4_journal_abort_handle(const char *caller, const char *err_fn,
- struct buffer_head *bh, handle_t *handle, int err)
+
+static void __save_error_info(struct super_block *sb, const char *func,
+ unsigned int line)
{
- char nbuf[16];
- const char *errstr = ext4_decode_error(NULL, err, nbuf);
+ struct ext4_super_block *es = EXT4_SB(sb)->s_es;
- if (bh)
- BUFFER_TRACE(bh, "abort");
+ EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
+ es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
+ es->s_last_error_time = cpu_to_le32(get_seconds());
+ strncpy(es->s_last_error_func, func, sizeof(es->s_last_error_func));
+ es->s_last_error_line = cpu_to_le32(line);
+ if (!es->s_first_error_time) {
+ es->s_first_error_time = es->s_last_error_time;
+ strncpy(es->s_first_error_func, func,
+ sizeof(es->s_first_error_func));
+ es->s_first_error_line = cpu_to_le32(line);
+ es->s_first_error_ino = es->s_last_error_ino;
+ es->s_first_error_block = es->s_last_error_block;
+ }
+ /*
+ * Start the daily error reporting function if it hasn't been
+ * started already
+ */
+ if (!es->s_error_count)
+ mod_timer(&EXT4_SB(sb)->s_err_report, jiffies + 24*60*60*HZ);
+ le32_add_cpu(&es->s_error_count, 1);
+}
- if (!handle->h_err)
- handle->h_err = err;
+static void save_error_info(struct super_block *sb, const char *func,
+ unsigned int line)
+{
+ __save_error_info(sb, func, line);
+ ext4_commit_super(sb, 1);
+}
- if (is_handle_aborted(handle))
- return;
+/*
+ * The del_gendisk() function uninitializes the disk-specific data
+ * structures, including the bdi structure, without telling anyone
+ * else. Once this happens, any attempt to call mark_buffer_dirty()
+ * (for example, by ext4_commit_super), will cause a kernel OOPS.
+ * This is a kludge to prevent these oops until we can put in a proper
+ * hook in del_gendisk() to inform the VFS and file system layers.
+ */
+static int block_device_ejected(struct super_block *sb)
+{
+ struct inode *bd_inode = sb->s_bdev->bd_inode;
+ struct backing_dev_info *bdi = bd_inode->i_mapping->backing_dev_info;
- printk(KERN_ERR "%s: aborting transaction: %s in %s\n",
- caller, errstr, err_fn);
+ return bdi->dev == NULL;
+}
- jbd2_journal_abort_handle(handle);
+static void ext4_journal_commit_callback(journal_t *journal, transaction_t *txn)
+{
+ struct super_block *sb = journal->j_private;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ int error = is_journal_aborted(journal);
+ struct ext4_journal_cb_entry *jce;
+
+ BUG_ON(txn->t_state == T_FINISHED);
+ spin_lock(&sbi->s_md_lock);
+ while (!list_empty(&txn->t_private_list)) {
+ jce = list_entry(txn->t_private_list.next,
+ struct ext4_journal_cb_entry, jce_list);
+ list_del_init(&jce->jce_list);
+ spin_unlock(&sbi->s_md_lock);
+ jce->jce_func(sb, jce, error);
+ spin_lock(&sbi->s_md_lock);
+ }
+ spin_unlock(&sbi->s_md_lock);
}
/* Deal with the reporting of failure conditions on a filesystem such as
@@ -198,53 +382,128 @@ void ext4_journal_abort_handle(const char *caller, const char *err_fn,
* write out the superblock safely.
*
* We'll just use the jbd2_journal_abort() error code to record an error in
- * the journal instead. On recovery, the journal will compain about
+ * the journal instead. On recovery, the journal will complain about
* that error until we've noted it down and cleared it.
*/
static void ext4_handle_error(struct super_block *sb)
{
- struct ext4_super_block *es = EXT4_SB(sb)->s_es;
-
- EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
- es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
-
if (sb->s_flags & MS_RDONLY)
return;
- if (!test_opt (sb, ERRORS_CONT)) {
+ if (!test_opt(sb, ERRORS_CONT)) {
journal_t *journal = EXT4_SB(sb)->s_journal;
- EXT4_SB(sb)->s_mount_opt |= EXT4_MOUNT_ABORT;
+ EXT4_SB(sb)->s_mount_flags |= EXT4_MF_FS_ABORTED;
if (journal)
jbd2_journal_abort(journal, -EIO);
}
- if (test_opt (sb, ERRORS_RO)) {
- printk (KERN_CRIT "Remounting filesystem read-only\n");
+ if (test_opt(sb, ERRORS_RO)) {
+ ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only");
+ /*
+ * Make sure updated value of ->s_mount_flags will be visible
+ * before ->s_flags update
+ */
+ smp_wmb();
sb->s_flags |= MS_RDONLY;
}
- ext4_commit_super(sb, es, 1);
if (test_opt(sb, ERRORS_PANIC))
panic("EXT4-fs (device %s): panic forced after error\n",
sb->s_id);
}
-void ext4_error (struct super_block * sb, const char * function,
- const char * fmt, ...)
+#define ext4_error_ratelimit(sb) \
+ ___ratelimit(&(EXT4_SB(sb)->s_err_ratelimit_state), \
+ "EXT4-fs error")
+
+void __ext4_error(struct super_block *sb, const char *function,
+ unsigned int line, const char *fmt, ...)
{
+ struct va_format vaf;
va_list args;
- va_start(args, fmt);
- printk(KERN_CRIT "EXT4-fs error (device %s): %s: ",sb->s_id, function);
- vprintk(fmt, args);
- printk("\n");
- va_end(args);
-
+ if (ext4_error_ratelimit(sb)) {
+ va_start(args, fmt);
+ vaf.fmt = fmt;
+ vaf.va = &args;
+ printk(KERN_CRIT
+ "EXT4-fs error (device %s): %s:%d: comm %s: %pV\n",
+ sb->s_id, function, line, current->comm, &vaf);
+ va_end(args);
+ }
+ save_error_info(sb, function, line);
ext4_handle_error(sb);
}
-static const char *ext4_decode_error(struct super_block * sb, int errno,
- char nbuf[16])
+void __ext4_error_inode(struct inode *inode, const char *function,
+ unsigned int line, ext4_fsblk_t block,
+ const char *fmt, ...)
+{
+ va_list args;
+ struct va_format vaf;
+ struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
+
+ es->s_last_error_ino = cpu_to_le32(inode->i_ino);
+ es->s_last_error_block = cpu_to_le64(block);
+ if (ext4_error_ratelimit(inode->i_sb)) {
+ va_start(args, fmt);
+ vaf.fmt = fmt;
+ vaf.va = &args;
+ if (block)
+ printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: "
+ "inode #%lu: block %llu: comm %s: %pV\n",
+ inode->i_sb->s_id, function, line, inode->i_ino,
+ block, current->comm, &vaf);
+ else
+ printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: "
+ "inode #%lu: comm %s: %pV\n",
+ inode->i_sb->s_id, function, line, inode->i_ino,
+ current->comm, &vaf);
+ va_end(args);
+ }
+ save_error_info(inode->i_sb, function, line);
+ ext4_handle_error(inode->i_sb);
+}
+
+void __ext4_error_file(struct file *file, const char *function,
+ unsigned int line, ext4_fsblk_t block,
+ const char *fmt, ...)
+{
+ va_list args;
+ struct va_format vaf;
+ struct ext4_super_block *es;
+ struct inode *inode = file_inode(file);
+ char pathname[80], *path;
+
+ es = EXT4_SB(inode->i_sb)->s_es;
+ es->s_last_error_ino = cpu_to_le32(inode->i_ino);
+ if (ext4_error_ratelimit(inode->i_sb)) {
+ path = d_path(&(file->f_path), pathname, sizeof(pathname));
+ if (IS_ERR(path))
+ path = "(unknown)";
+ va_start(args, fmt);
+ vaf.fmt = fmt;
+ vaf.va = &args;
+ if (block)
+ printk(KERN_CRIT
+ "EXT4-fs error (device %s): %s:%d: inode #%lu: "
+ "block %llu: comm %s: path %s: %pV\n",
+ inode->i_sb->s_id, function, line, inode->i_ino,
+ block, current->comm, path, &vaf);
+ else
+ printk(KERN_CRIT
+ "EXT4-fs error (device %s): %s:%d: inode #%lu: "
+ "comm %s: path %s: %pV\n",
+ inode->i_sb->s_id, function, line, inode->i_ino,
+ current->comm, path, &vaf);
+ va_end(args);
+ }
+ save_error_info(inode->i_sb, function, line);
+ ext4_handle_error(inode->i_sb);
+}
+
+const char *ext4_decode_error(struct super_block *sb, int errno,
+ char nbuf[16])
{
char *errstr = NULL;
@@ -256,7 +515,8 @@ static const char *ext4_decode_error(struct super_block * sb, int errno,
errstr = "Out of memory";
break;
case -EROFS:
- if (!sb || EXT4_SB(sb)->s_journal->j_flags & JBD2_ABORT)
+ if (!sb || (EXT4_SB(sb)->s_journal &&
+ EXT4_SB(sb)->s_journal->j_flags & JBD2_ABORT))
errstr = "Journal has aborted";
else
errstr = "Readonly filesystem";
@@ -279,8 +539,8 @@ static const char *ext4_decode_error(struct super_block * sb, int errno,
/* __ext4_std_error decodes expected errors from journaling functions
* automatically and invokes the appropriate error response. */
-void __ext4_std_error (struct super_block * sb, const char * function,
- int errno)
+void __ext4_std_error(struct super_block *sb, const char *function,
+ unsigned int line, int errno)
{
char nbuf[16];
const char *errstr;
@@ -292,10 +552,13 @@ void __ext4_std_error (struct super_block * sb, const char * function,
(sb->s_flags & MS_RDONLY))
return;
- errstr = ext4_decode_error(sb, errno, nbuf);
- printk (KERN_CRIT "EXT4-fs error (device %s) in %s: %s\n",
- sb->s_id, function, errstr);
+ if (ext4_error_ratelimit(sb)) {
+ errstr = ext4_decode_error(sb, errno, nbuf);
+ printk(KERN_CRIT "EXT4-fs error (device %s) in %s:%d: %s\n",
+ sb->s_id, function, line, errstr);
+ }
+ save_error_info(sb, function, line);
ext4_handle_error(sb);
}
@@ -309,45 +572,122 @@ void __ext4_std_error (struct super_block * sb, const char * function,
* case we take the easy way out and panic immediately.
*/
-void ext4_abort (struct super_block * sb, const char * function,
- const char * fmt, ...)
+void __ext4_abort(struct super_block *sb, const char *function,
+ unsigned int line, const char *fmt, ...)
{
va_list args;
- printk (KERN_CRIT "ext4_abort called.\n");
-
+ save_error_info(sb, function, line);
va_start(args, fmt);
- printk(KERN_CRIT "EXT4-fs error (device %s): %s: ",sb->s_id, function);
+ printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: ", sb->s_id,
+ function, line);
vprintk(fmt, args);
printk("\n");
va_end(args);
+ if ((sb->s_flags & MS_RDONLY) == 0) {
+ ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only");
+ EXT4_SB(sb)->s_mount_flags |= EXT4_MF_FS_ABORTED;
+ /*
+ * Make sure updated value of ->s_mount_flags will be visible
+ * before ->s_flags update
+ */
+ smp_wmb();
+ sb->s_flags |= MS_RDONLY;
+ if (EXT4_SB(sb)->s_journal)
+ jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO);
+ save_error_info(sb, function, line);
+ }
if (test_opt(sb, ERRORS_PANIC))
panic("EXT4-fs panic from previous error\n");
+}
- if (sb->s_flags & MS_RDONLY)
+void __ext4_msg(struct super_block *sb,
+ const char *prefix, const char *fmt, ...)
+{
+ struct va_format vaf;
+ va_list args;
+
+ if (!___ratelimit(&(EXT4_SB(sb)->s_msg_ratelimit_state), "EXT4-fs"))
return;
- printk(KERN_CRIT "Remounting filesystem read-only\n");
- EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
- sb->s_flags |= MS_RDONLY;
- EXT4_SB(sb)->s_mount_opt |= EXT4_MOUNT_ABORT;
- jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO);
+ va_start(args, fmt);
+ vaf.fmt = fmt;
+ vaf.va = &args;
+ printk("%sEXT4-fs (%s): %pV\n", prefix, sb->s_id, &vaf);
+ va_end(args);
}
-void ext4_warning (struct super_block * sb, const char * function,
- const char * fmt, ...)
+void __ext4_warning(struct super_block *sb, const char *function,
+ unsigned int line, const char *fmt, ...)
{
+ struct va_format vaf;
va_list args;
+ if (!___ratelimit(&(EXT4_SB(sb)->s_warning_ratelimit_state),
+ "EXT4-fs warning"))
+ return;
+
va_start(args, fmt);
- printk(KERN_WARNING "EXT4-fs warning (device %s): %s: ",
- sb->s_id, function);
- vprintk(fmt, args);
- printk("\n");
+ vaf.fmt = fmt;
+ vaf.va = &args;
+ printk(KERN_WARNING "EXT4-fs warning (device %s): %s:%d: %pV\n",
+ sb->s_id, function, line, &vaf);
va_end(args);
}
+void __ext4_grp_locked_error(const char *function, unsigned int line,
+ struct super_block *sb, ext4_group_t grp,
+ unsigned long ino, ext4_fsblk_t block,
+ const char *fmt, ...)
+__releases(bitlock)
+__acquires(bitlock)
+{
+ struct va_format vaf;
+ va_list args;
+ struct ext4_super_block *es = EXT4_SB(sb)->s_es;
+
+ es->s_last_error_ino = cpu_to_le32(ino);
+ es->s_last_error_block = cpu_to_le64(block);
+ __save_error_info(sb, function, line);
+
+ if (ext4_error_ratelimit(sb)) {
+ va_start(args, fmt);
+ vaf.fmt = fmt;
+ vaf.va = &args;
+ printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: group %u, ",
+ sb->s_id, function, line, grp);
+ if (ino)
+ printk(KERN_CONT "inode %lu: ", ino);
+ if (block)
+ printk(KERN_CONT "block %llu:",
+ (unsigned long long) block);
+ printk(KERN_CONT "%pV\n", &vaf);
+ va_end(args);
+ }
+
+ if (test_opt(sb, ERRORS_CONT)) {
+ ext4_commit_super(sb, 0);
+ return;
+ }
+
+ ext4_unlock_group(sb, grp);
+ ext4_handle_error(sb);
+ /*
+ * We only get here in the ERRORS_RO case; relocking the group
+ * may be dangerous, but nothing bad will happen since the
+ * filesystem will have already been marked read/only and the
+ * journal has been aborted. We return 1 as a hint to callers
+ * who might what to use the return value from
+ * ext4_grp_locked_error() to distinguish between the
+ * ERRORS_CONT and ERRORS_RO case, and perhaps return more
+ * aggressively from the ext4 function in question, with a
+ * more appropriate error code.
+ */
+ ext4_lock_group(sb, grp);
+ return;
+}
+
void ext4_update_dynamic_rev(struct super_block *sb)
{
struct ext4_super_block *es = EXT4_SB(sb)->s_es;
@@ -355,7 +695,7 @@ void ext4_update_dynamic_rev(struct super_block *sb)
if (le32_to_cpu(es->s_rev_level) > EXT4_GOOD_OLD_REV)
return;
- ext4_warning(sb, __FUNCTION__,
+ ext4_warning(sb,
"updating to rev %d because of new feature flag, "
"running e2fsck is recommended",
EXT4_DYNAMIC_REV);
@@ -373,81 +713,21 @@ void ext4_update_dynamic_rev(struct super_block *sb)
*/
}
-int ext4_update_compat_feature(handle_t *handle,
- struct super_block *sb, __u32 compat)
-{
- int err = 0;
- if (!EXT4_HAS_COMPAT_FEATURE(sb, compat)) {
- err = ext4_journal_get_write_access(handle,
- EXT4_SB(sb)->s_sbh);
- if (err)
- return err;
- EXT4_SET_COMPAT_FEATURE(sb, compat);
- sb->s_dirt = 1;
- handle->h_sync = 1;
- BUFFER_TRACE(EXT4_SB(sb)->s_sbh,
- "call ext4_journal_dirty_met adata");
- err = ext4_journal_dirty_metadata(handle,
- EXT4_SB(sb)->s_sbh);
- }
- return err;
-}
-
-int ext4_update_rocompat_feature(handle_t *handle,
- struct super_block *sb, __u32 rocompat)
-{
- int err = 0;
- if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, rocompat)) {
- err = ext4_journal_get_write_access(handle,
- EXT4_SB(sb)->s_sbh);
- if (err)
- return err;
- EXT4_SET_RO_COMPAT_FEATURE(sb, rocompat);
- sb->s_dirt = 1;
- handle->h_sync = 1;
- BUFFER_TRACE(EXT4_SB(sb)->s_sbh,
- "call ext4_journal_dirty_met adata");
- err = ext4_journal_dirty_metadata(handle,
- EXT4_SB(sb)->s_sbh);
- }
- return err;
-}
-
-int ext4_update_incompat_feature(handle_t *handle,
- struct super_block *sb, __u32 incompat)
-{
- int err = 0;
- if (!EXT4_HAS_INCOMPAT_FEATURE(sb, incompat)) {
- err = ext4_journal_get_write_access(handle,
- EXT4_SB(sb)->s_sbh);
- if (err)
- return err;
- EXT4_SET_INCOMPAT_FEATURE(sb, incompat);
- sb->s_dirt = 1;
- handle->h_sync = 1;
- BUFFER_TRACE(EXT4_SB(sb)->s_sbh,
- "call ext4_journal_dirty_met adata");
- err = ext4_journal_dirty_metadata(handle,
- EXT4_SB(sb)->s_sbh);
- }
- return err;
-}
-
/*
* Open the external journal device
*/
-static struct block_device *ext4_blkdev_get(dev_t dev)
+static struct block_device *ext4_blkdev_get(dev_t dev, struct super_block *sb)
{
struct block_device *bdev;
char b[BDEVNAME_SIZE];
- bdev = open_by_devnum(dev, FMODE_READ|FMODE_WRITE);
+ bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL, sb);
if (IS_ERR(bdev))
goto fail;
return bdev;
fail:
- printk(KERN_ERR "EXT4: failed to open journal device %s: %ld\n",
+ ext4_msg(sb, KERN_ERR, "failed to open journal device %s: %ld",
__bdevname(dev, b), PTR_ERR(bdev));
return NULL;
}
@@ -455,23 +735,19 @@ fail:
/*
* Release the journal device
*/
-static int ext4_blkdev_put(struct block_device *bdev)
+static void ext4_blkdev_put(struct block_device *bdev)
{
- bd_release(bdev);
- return blkdev_put(bdev);
+ blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
}
-static int ext4_blkdev_remove(struct ext4_sb_info *sbi)
+static void ext4_blkdev_remove(struct ext4_sb_info *sbi)
{
struct block_device *bdev;
- int ret = -ENODEV;
-
bdev = sbi->journal_bdev;
if (bdev) {
- ret = ext4_blkdev_put(bdev);
+ ext4_blkdev_put(bdev);
sbi->journal_bdev = NULL;
}
- return ret;
}
static inline struct inode *orphan_list_entry(struct list_head *l)
@@ -483,8 +759,8 @@ static void dump_orphan_list(struct super_block *sb, struct ext4_sb_info *sbi)
{
struct list_head *l;
- printk(KERN_ERR "sb orphan head is %d\n",
- le32_to_cpu(sbi->s_es->s_last_orphan));
+ ext4_msg(sb, KERN_ERR, "sb orphan head is %d",
+ le32_to_cpu(sbi->s_es->s_last_orphan));
printk(KERN_ERR "sb_info orphan list:\n");
list_for_each(l, &sbi->s_orphan) {
@@ -497,30 +773,54 @@ static void dump_orphan_list(struct super_block *sb, struct ext4_sb_info *sbi)
}
}
-static void ext4_put_super (struct super_block * sb)
+static void ext4_put_super(struct super_block *sb)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
struct ext4_super_block *es = sbi->s_es;
- int i;
+ int i, err;
+
+ ext4_unregister_li_request(sb);
+ dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
+
+ flush_workqueue(sbi->rsv_conversion_wq);
+ destroy_workqueue(sbi->rsv_conversion_wq);
+
+ if (sbi->s_journal) {
+ err = jbd2_journal_destroy(sbi->s_journal);
+ sbi->s_journal = NULL;
+ if (err < 0)
+ ext4_abort(sb, "Couldn't clean up the journal");
+ }
+ ext4_es_unregister_shrinker(sbi);
+ del_timer_sync(&sbi->s_err_report);
+ ext4_release_system_zone(sb);
ext4_mb_release(sb);
ext4_ext_release(sb);
ext4_xattr_put_super(sb);
- jbd2_journal_destroy(sbi->s_journal);
+
if (!(sb->s_flags & MS_RDONLY)) {
EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
es->s_state = cpu_to_le16(sbi->s_mount_state);
- BUFFER_TRACE(sbi->s_sbh, "marking dirty");
- mark_buffer_dirty(sbi->s_sbh);
- ext4_commit_super(sb, es, 1);
}
+ if (!(sb->s_flags & MS_RDONLY))
+ ext4_commit_super(sb, 1);
+
+ if (sbi->s_proc) {
+ remove_proc_entry("options", sbi->s_proc);
+ remove_proc_entry(sb->s_id, ext4_proc_root);
+ }
+ kobject_del(&sbi->s_kobj);
for (i = 0; i < sbi->s_gdb_count; i++)
brelse(sbi->s_group_desc[i]);
- kfree(sbi->s_group_desc);
- percpu_counter_destroy(&sbi->s_freeblocks_counter);
+ ext4_kvfree(sbi->s_group_desc);
+ ext4_kvfree(sbi->s_flex_groups);
+ percpu_counter_destroy(&sbi->s_freeclusters_counter);
percpu_counter_destroy(&sbi->s_freeinodes_counter);
percpu_counter_destroy(&sbi->s_dirs_counter);
+ percpu_counter_destroy(&sbi->s_dirtyclusters_counter);
+ percpu_counter_destroy(&sbi->s_extent_cache_cnt);
brelse(sbi->s_sbh);
#ifdef CONFIG_QUOTA
for (i = 0; i < MAXQUOTAS; i++)
@@ -546,9 +846,23 @@ static void ext4_put_super (struct super_block * sb)
invalidate_bdev(sbi->journal_bdev);
ext4_blkdev_remove(sbi);
}
+ if (sbi->s_mb_cache) {
+ ext4_xattr_destroy_cache(sbi->s_mb_cache);
+ sbi->s_mb_cache = NULL;
+ }
+ if (sbi->s_mmp_tsk)
+ kthread_stop(sbi->s_mmp_tsk);
sb->s_fs_info = NULL;
+ /*
+ * Now that we are completely done shutting down the
+ * superblock, we need to actually destroy the kobject.
+ */
+ kobject_put(&sbi->s_kobj);
+ wait_for_completion(&sbi->s_kobj_unregister);
+ if (sbi->s_chksum_driver)
+ crypto_free_shash(sbi->s_chksum_driver);
+ kfree(sbi->s_blockgroup_lock);
kfree(sbi);
- return;
}
static struct kmem_cache *ext4_inode_cachep;
@@ -563,44 +877,76 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
ei = kmem_cache_alloc(ext4_inode_cachep, GFP_NOFS);
if (!ei)
return NULL;
-#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL
- ei->i_acl = EXT4_ACL_NOT_CACHED;
- ei->i_default_acl = EXT4_ACL_NOT_CACHED;
-#endif
- ei->i_block_alloc_info = NULL;
+
ei->vfs_inode.i_version = 1;
- memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache));
+ spin_lock_init(&ei->i_raw_lock);
INIT_LIST_HEAD(&ei->i_prealloc_list);
spin_lock_init(&ei->i_prealloc_lock);
+ ext4_es_init_tree(&ei->i_es_tree);
+ rwlock_init(&ei->i_es_lock);
+ INIT_LIST_HEAD(&ei->i_es_lru);
+ ei->i_es_lru_nr = 0;
+ ei->i_touch_when = 0;
+ ei->i_reserved_data_blocks = 0;
+ ei->i_reserved_meta_blocks = 0;
+ ei->i_allocated_meta_blocks = 0;
+ ei->i_da_metadata_calc_len = 0;
+ ei->i_da_metadata_calc_last_lblock = 0;
+ spin_lock_init(&(ei->i_block_reservation_lock));
+#ifdef CONFIG_QUOTA
+ ei->i_reserved_quota = 0;
+#endif
+ ei->jinode = NULL;
+ INIT_LIST_HEAD(&ei->i_rsv_conversion_list);
+ spin_lock_init(&ei->i_completed_io_lock);
+ ei->i_sync_tid = 0;
+ ei->i_datasync_tid = 0;
+ atomic_set(&ei->i_ioend_count, 0);
+ atomic_set(&ei->i_unwritten, 0);
+ INIT_WORK(&ei->i_rsv_conversion_work, ext4_end_io_rsv_work);
+
return &ei->vfs_inode;
}
+static int ext4_drop_inode(struct inode *inode)
+{
+ int drop = generic_drop_inode(inode);
+
+ trace_ext4_drop_inode(inode, drop);
+ return drop;
+}
+
+static void ext4_i_callback(struct rcu_head *head)
+{
+ struct inode *inode = container_of(head, struct inode, i_rcu);
+ kmem_cache_free(ext4_inode_cachep, EXT4_I(inode));
+}
+
static void ext4_destroy_inode(struct inode *inode)
{
if (!list_empty(&(EXT4_I(inode)->i_orphan))) {
- printk("EXT4 Inode %p: orphan list check failed!\n",
- EXT4_I(inode));
+ ext4_msg(inode->i_sb, KERN_ERR,
+ "Inode %lu (%p): orphan list check failed!",
+ inode->i_ino, EXT4_I(inode));
print_hex_dump(KERN_INFO, "", DUMP_PREFIX_ADDRESS, 16, 4,
EXT4_I(inode), sizeof(struct ext4_inode_info),
true);
dump_stack();
}
- kmem_cache_free(ext4_inode_cachep, EXT4_I(inode));
+ call_rcu(&inode->i_rcu, ext4_i_callback);
}
-static void init_once(struct kmem_cache *cachep, void *foo)
+static void init_once(void *foo)
{
struct ext4_inode_info *ei = (struct ext4_inode_info *) foo;
INIT_LIST_HEAD(&ei->i_orphan);
-#ifdef CONFIG_EXT4DEV_FS_XATTR
init_rwsem(&ei->xattr_sem);
-#endif
init_rwsem(&ei->i_data_sem);
inode_init_once(&ei->vfs_inode);
}
-static int init_inodecache(void)
+static int __init init_inodecache(void)
{
ext4_inode_cachep = kmem_cache_create("ext4_inode_cache",
sizeof(struct ext4_inode_info),
@@ -614,153 +960,32 @@ static int init_inodecache(void)
static void destroy_inodecache(void)
{
+ /*
+ * Make sure all delayed rcu free inodes are flushed before we
+ * destroy cache.
+ */
+ rcu_barrier();
kmem_cache_destroy(ext4_inode_cachep);
}
-static void ext4_clear_inode(struct inode *inode)
+void ext4_clear_inode(struct inode *inode)
{
- struct ext4_block_alloc_info *rsv = EXT4_I(inode)->i_block_alloc_info;
-#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL
- if (EXT4_I(inode)->i_acl &&
- EXT4_I(inode)->i_acl != EXT4_ACL_NOT_CACHED) {
- posix_acl_release(EXT4_I(inode)->i_acl);
- EXT4_I(inode)->i_acl = EXT4_ACL_NOT_CACHED;
- }
- if (EXT4_I(inode)->i_default_acl &&
- EXT4_I(inode)->i_default_acl != EXT4_ACL_NOT_CACHED) {
- posix_acl_release(EXT4_I(inode)->i_default_acl);
- EXT4_I(inode)->i_default_acl = EXT4_ACL_NOT_CACHED;
+ invalidate_inode_buffers(inode);
+ clear_inode(inode);
+ dquot_drop(inode);
+ ext4_discard_preallocations(inode);
+ ext4_es_remove_extent(inode, 0, EXT_MAX_BLOCKS);
+ ext4_es_lru_del(inode);
+ if (EXT4_I(inode)->jinode) {
+ jbd2_journal_release_jbd_inode(EXT4_JOURNAL(inode),
+ EXT4_I(inode)->jinode);
+ jbd2_free_inode(EXT4_I(inode)->jinode);
+ EXT4_I(inode)->jinode = NULL;
}
-#endif
- ext4_discard_reservation(inode);
- EXT4_I(inode)->i_block_alloc_info = NULL;
- if (unlikely(rsv))
- kfree(rsv);
-}
-
-static inline void ext4_show_quota_options(struct seq_file *seq, struct super_block *sb)
-{
-#if defined(CONFIG_QUOTA)
- struct ext4_sb_info *sbi = EXT4_SB(sb);
-
- if (sbi->s_jquota_fmt)
- seq_printf(seq, ",jqfmt=%s",
- (sbi->s_jquota_fmt == QFMT_VFS_OLD) ? "vfsold": "vfsv0");
-
- if (sbi->s_qf_names[USRQUOTA])
- seq_printf(seq, ",usrjquota=%s", sbi->s_qf_names[USRQUOTA]);
-
- if (sbi->s_qf_names[GRPQUOTA])
- seq_printf(seq, ",grpjquota=%s", sbi->s_qf_names[GRPQUOTA]);
-
- if (sbi->s_mount_opt & EXT4_MOUNT_USRQUOTA)
- seq_puts(seq, ",usrquota");
-
- if (sbi->s_mount_opt & EXT4_MOUNT_GRPQUOTA)
- seq_puts(seq, ",grpquota");
-#endif
}
-/*
- * Show an option if
- * - it's set to a non-default value OR
- * - if the per-sb default is different from the global default
- */
-static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
-{
- int def_errors;
- unsigned long def_mount_opts;
- struct super_block *sb = vfs->mnt_sb;
- struct ext4_sb_info *sbi = EXT4_SB(sb);
- struct ext4_super_block *es = sbi->s_es;
-
- def_mount_opts = le32_to_cpu(es->s_default_mount_opts);
- def_errors = le16_to_cpu(es->s_errors);
-
- if (sbi->s_sb_block != 1)
- seq_printf(seq, ",sb=%llu", sbi->s_sb_block);
- if (test_opt(sb, MINIX_DF))
- seq_puts(seq, ",minixdf");
- if (test_opt(sb, GRPID) && !(def_mount_opts & EXT4_DEFM_BSDGROUPS))
- seq_puts(seq, ",grpid");
- if (!test_opt(sb, GRPID) && (def_mount_opts & EXT4_DEFM_BSDGROUPS))
- seq_puts(seq, ",nogrpid");
- if (sbi->s_resuid != EXT4_DEF_RESUID ||
- le16_to_cpu(es->s_def_resuid) != EXT4_DEF_RESUID) {
- seq_printf(seq, ",resuid=%u", sbi->s_resuid);
- }
- if (sbi->s_resgid != EXT4_DEF_RESGID ||
- le16_to_cpu(es->s_def_resgid) != EXT4_DEF_RESGID) {
- seq_printf(seq, ",resgid=%u", sbi->s_resgid);
- }
- if (test_opt(sb, ERRORS_RO)) {
- if (def_errors == EXT4_ERRORS_PANIC ||
- def_errors == EXT4_ERRORS_CONTINUE) {
- seq_puts(seq, ",errors=remount-ro");
- }
- }
- if (test_opt(sb, ERRORS_CONT) && def_errors != EXT4_ERRORS_CONTINUE)
- seq_puts(seq, ",errors=continue");
- if (test_opt(sb, ERRORS_PANIC) && def_errors != EXT4_ERRORS_PANIC)
- seq_puts(seq, ",errors=panic");
- if (test_opt(sb, NO_UID32) && !(def_mount_opts & EXT4_DEFM_UID16))
- seq_puts(seq, ",nouid32");
- if (test_opt(sb, DEBUG) && !(def_mount_opts & EXT4_DEFM_DEBUG))
- seq_puts(seq, ",debug");
- if (test_opt(sb, OLDALLOC))
- seq_puts(seq, ",oldalloc");
-#ifdef CONFIG_EXT4DEV_FS_XATTR
- if (test_opt(sb, XATTR_USER) &&
- !(def_mount_opts & EXT4_DEFM_XATTR_USER))
- seq_puts(seq, ",user_xattr");
- if (!test_opt(sb, XATTR_USER) &&
- (def_mount_opts & EXT4_DEFM_XATTR_USER)) {
- seq_puts(seq, ",nouser_xattr");
- }
-#endif
-#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL
- if (test_opt(sb, POSIX_ACL) && !(def_mount_opts & EXT4_DEFM_ACL))
- seq_puts(seq, ",acl");
- if (!test_opt(sb, POSIX_ACL) && (def_mount_opts & EXT4_DEFM_ACL))
- seq_puts(seq, ",noacl");
-#endif
- if (!test_opt(sb, RESERVATION))
- seq_puts(seq, ",noreservation");
- if (sbi->s_commit_interval) {
- seq_printf(seq, ",commit=%u",
- (unsigned) (sbi->s_commit_interval / HZ));
- }
- if (test_opt(sb, BARRIER))
- seq_puts(seq, ",barrier=1");
- if (test_opt(sb, NOBH))
- seq_puts(seq, ",nobh");
- if (!test_opt(sb, EXTENTS))
- seq_puts(seq, ",noextents");
- if (!test_opt(sb, MBALLOC))
- seq_puts(seq, ",nomballoc");
- if (test_opt(sb, I_VERSION))
- seq_puts(seq, ",i_version");
-
- if (sbi->s_stripe)
- seq_printf(seq, ",stripe=%lu", sbi->s_stripe);
- /*
- * journal mode get enabled in different ways
- * So just print the value even if we didn't specify it
- */
- if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)
- seq_puts(seq, ",data=journal");
- else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA)
- seq_puts(seq, ",data=ordered");
- else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)
- seq_puts(seq, ",data=writeback");
-
- ext4_show_quota_options(seq, sb);
- return 0;
-}
-
-
static struct inode *ext4_nfs_get_inode(struct super_block *sb,
- u64 ino, u32 generation)
+ u64 ino, u32 generation)
{
struct inode *inode;
@@ -789,60 +1014,92 @@ static struct inode *ext4_nfs_get_inode(struct super_block *sb,
}
static struct dentry *ext4_fh_to_dentry(struct super_block *sb, struct fid *fid,
- int fh_len, int fh_type)
+ int fh_len, int fh_type)
{
return generic_fh_to_dentry(sb, fid, fh_len, fh_type,
ext4_nfs_get_inode);
}
static struct dentry *ext4_fh_to_parent(struct super_block *sb, struct fid *fid,
- int fh_len, int fh_type)
+ int fh_len, int fh_type)
{
return generic_fh_to_parent(sb, fid, fh_len, fh_type,
ext4_nfs_get_inode);
}
+/*
+ * Try to release metadata pages (indirect blocks, directories) which are
+ * mapped via the block device. Since these pages could have journal heads
+ * which would prevent try_to_free_buffers() from freeing them, we must use
+ * jbd2 layer's try_to_free_buffers() function to release them.
+ */
+static int bdev_try_to_free_page(struct super_block *sb, struct page *page,
+ gfp_t wait)
+{
+ journal_t *journal = EXT4_SB(sb)->s_journal;
+
+ WARN_ON(PageChecked(page));
+ if (!page_has_buffers(page))
+ return 0;
+ if (journal)
+ return jbd2_journal_try_to_free_buffers(journal, page,
+ wait & ~__GFP_WAIT);
+ return try_to_free_buffers(page);
+}
+
#ifdef CONFIG_QUOTA
-#define QTYPE2NAME(t) ((t)==USRQUOTA?"user":"group")
-#define QTYPE2MOPT(on, t) ((t)==USRQUOTA?((on)##USRJQUOTA):((on)##GRPJQUOTA))
+#define QTYPE2NAME(t) ((t) == USRQUOTA ? "user" : "group")
+#define QTYPE2MOPT(on, t) ((t) == USRQUOTA?((on)##USRJQUOTA):((on)##GRPJQUOTA))
-static int ext4_dquot_initialize(struct inode *inode, int type);
-static int ext4_dquot_drop(struct inode *inode);
static int ext4_write_dquot(struct dquot *dquot);
static int ext4_acquire_dquot(struct dquot *dquot);
static int ext4_release_dquot(struct dquot *dquot);
static int ext4_mark_dquot_dirty(struct dquot *dquot);
static int ext4_write_info(struct super_block *sb, int type);
-static int ext4_quota_on(struct super_block *sb, int type, int format_id, char *path);
+static int ext4_quota_on(struct super_block *sb, int type, int format_id,
+ struct path *path);
+static int ext4_quota_on_sysfile(struct super_block *sb, int type,
+ int format_id);
+static int ext4_quota_off(struct super_block *sb, int type);
+static int ext4_quota_off_sysfile(struct super_block *sb, int type);
static int ext4_quota_on_mount(struct super_block *sb, int type);
static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data,
size_t len, loff_t off);
static ssize_t ext4_quota_write(struct super_block *sb, int type,
const char *data, size_t len, loff_t off);
+static int ext4_quota_enable(struct super_block *sb, int type, int format_id,
+ unsigned int flags);
+static int ext4_enable_quotas(struct super_block *sb);
-static struct dquot_operations ext4_quota_operations = {
- .initialize = ext4_dquot_initialize,
- .drop = ext4_dquot_drop,
- .alloc_space = dquot_alloc_space,
- .alloc_inode = dquot_alloc_inode,
- .free_space = dquot_free_space,
- .free_inode = dquot_free_inode,
- .transfer = dquot_transfer,
+static const struct dquot_operations ext4_quota_operations = {
+ .get_reserved_space = ext4_get_reserved_space,
.write_dquot = ext4_write_dquot,
.acquire_dquot = ext4_acquire_dquot,
.release_dquot = ext4_release_dquot,
.mark_dirty = ext4_mark_dquot_dirty,
- .write_info = ext4_write_info
+ .write_info = ext4_write_info,
+ .alloc_dquot = dquot_alloc,
+ .destroy_dquot = dquot_destroy,
};
-static struct quotactl_ops ext4_qctl_operations = {
+static const struct quotactl_ops ext4_qctl_operations = {
.quota_on = ext4_quota_on,
- .quota_off = vfs_quota_off,
- .quota_sync = vfs_quota_sync,
- .get_info = vfs_get_dqinfo,
- .set_info = vfs_set_dqinfo,
- .get_dqblk = vfs_get_dqblk,
- .set_dqblk = vfs_set_dqblk
+ .quota_off = ext4_quota_off,
+ .quota_sync = dquot_quota_sync,
+ .get_info = dquot_get_dqinfo,
+ .set_info = dquot_set_dqinfo,
+ .get_dqblk = dquot_get_dqblk,
+ .set_dqblk = dquot_set_dqblk
+};
+
+static const struct quotactl_ops ext4_qctl_sysfile_operations = {
+ .quota_on_meta = ext4_quota_on_sysfile,
+ .quota_off = ext4_quota_off_sysfile,
+ .quota_sync = dquot_quota_sync,
+ .get_info = dquot_get_dqinfo,
+ .set_info = dquot_set_dqinfo,
+ .get_dqblk = dquot_get_dqblk,
+ .set_dqblk = dquot_set_dqblk
};
#endif
@@ -851,20 +1108,39 @@ static const struct super_operations ext4_sops = {
.destroy_inode = ext4_destroy_inode,
.write_inode = ext4_write_inode,
.dirty_inode = ext4_dirty_inode,
- .delete_inode = ext4_delete_inode,
+ .drop_inode = ext4_drop_inode,
+ .evict_inode = ext4_evict_inode,
.put_super = ext4_put_super,
- .write_super = ext4_write_super,
.sync_fs = ext4_sync_fs,
- .write_super_lockfs = ext4_write_super_lockfs,
- .unlockfs = ext4_unlockfs,
+ .freeze_fs = ext4_freeze,
+ .unfreeze_fs = ext4_unfreeze,
+ .statfs = ext4_statfs,
+ .remount_fs = ext4_remount,
+ .show_options = ext4_show_options,
+#ifdef CONFIG_QUOTA
+ .quota_read = ext4_quota_read,
+ .quota_write = ext4_quota_write,
+#endif
+ .bdev_try_to_free_page = bdev_try_to_free_page,
+};
+
+static const struct super_operations ext4_nojournal_sops = {
+ .alloc_inode = ext4_alloc_inode,
+ .destroy_inode = ext4_destroy_inode,
+ .write_inode = ext4_write_inode,
+ .dirty_inode = ext4_dirty_inode,
+ .drop_inode = ext4_drop_inode,
+ .evict_inode = ext4_evict_inode,
+ .sync_fs = ext4_sync_fs_nojournal,
+ .put_super = ext4_put_super,
.statfs = ext4_statfs,
.remount_fs = ext4_remount,
- .clear_inode = ext4_clear_inode,
.show_options = ext4_show_options,
#ifdef CONFIG_QUOTA
.quota_read = ext4_quota_read,
.quota_write = ext4_quota_write,
#endif
+ .bdev_try_to_free_page = bdev_try_to_free_page,
};
static const struct export_operations ext4_export_ops = {
@@ -876,20 +1152,26 @@ static const struct export_operations ext4_export_ops = {
enum {
Opt_bsd_df, Opt_minix_df, Opt_grpid, Opt_nogrpid,
Opt_resgid, Opt_resuid, Opt_sb, Opt_err_cont, Opt_err_panic, Opt_err_ro,
- Opt_nouid32, Opt_nocheck, Opt_debug, Opt_oldalloc, Opt_orlov,
+ Opt_nouid32, Opt_debug, Opt_removed,
Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl,
- Opt_reservation, Opt_noreservation, Opt_noload, Opt_nobh, Opt_bh,
- Opt_commit, Opt_journal_update, Opt_journal_inum, Opt_journal_dev,
- Opt_journal_checksum, Opt_journal_async_commit,
+ Opt_auto_da_alloc, Opt_noauto_da_alloc, Opt_noload,
+ Opt_commit, Opt_min_batch_time, Opt_max_batch_time, Opt_journal_dev,
+ Opt_journal_path, Opt_journal_checksum, Opt_journal_async_commit,
Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
+ Opt_data_err_abort, Opt_data_err_ignore,
Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
- Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota,
- Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota,
- Opt_grpquota, Opt_extents, Opt_noextents, Opt_i_version,
- Opt_mballoc, Opt_nomballoc, Opt_stripe,
+ Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota,
+ Opt_noquota, Opt_barrier, Opt_nobarrier, Opt_err,
+ Opt_usrquota, Opt_grpquota, Opt_i_version,
+ Opt_stripe, Opt_delalloc, Opt_nodelalloc, Opt_mblk_io_submit,
+ Opt_nomblk_io_submit, Opt_block_validity, Opt_noblock_validity,
+ Opt_inode_readahead_blks, Opt_journal_ioprio,
+ Opt_dioread_nolock, Opt_dioread_lock,
+ Opt_discard, Opt_nodiscard, Opt_init_itable, Opt_noinit_itable,
+ Opt_max_dir_size_kb,
};
-static match_table_t tokens = {
+static const match_table_t tokens = {
{Opt_bsd_df, "bsddf"},
{Opt_minix_df, "minixdf"},
{Opt_grpid, "grpid"},
@@ -903,49 +1185,71 @@ static match_table_t tokens = {
{Opt_err_panic, "errors=panic"},
{Opt_err_ro, "errors=remount-ro"},
{Opt_nouid32, "nouid32"},
- {Opt_nocheck, "nocheck"},
- {Opt_nocheck, "check=none"},
{Opt_debug, "debug"},
- {Opt_oldalloc, "oldalloc"},
- {Opt_orlov, "orlov"},
+ {Opt_removed, "oldalloc"},
+ {Opt_removed, "orlov"},
{Opt_user_xattr, "user_xattr"},
{Opt_nouser_xattr, "nouser_xattr"},
{Opt_acl, "acl"},
{Opt_noacl, "noacl"},
- {Opt_reservation, "reservation"},
- {Opt_noreservation, "noreservation"},
+ {Opt_noload, "norecovery"},
{Opt_noload, "noload"},
- {Opt_nobh, "nobh"},
- {Opt_bh, "bh"},
+ {Opt_removed, "nobh"},
+ {Opt_removed, "bh"},
{Opt_commit, "commit=%u"},
- {Opt_journal_update, "journal=update"},
- {Opt_journal_inum, "journal=%u"},
+ {Opt_min_batch_time, "min_batch_time=%u"},
+ {Opt_max_batch_time, "max_batch_time=%u"},
{Opt_journal_dev, "journal_dev=%u"},
+ {Opt_journal_path, "journal_path=%s"},
{Opt_journal_checksum, "journal_checksum"},
{Opt_journal_async_commit, "journal_async_commit"},
{Opt_abort, "abort"},
{Opt_data_journal, "data=journal"},
{Opt_data_ordered, "data=ordered"},
{Opt_data_writeback, "data=writeback"},
+ {Opt_data_err_abort, "data_err=abort"},
+ {Opt_data_err_ignore, "data_err=ignore"},
{Opt_offusrjquota, "usrjquota="},
{Opt_usrjquota, "usrjquota=%s"},
{Opt_offgrpjquota, "grpjquota="},
{Opt_grpjquota, "grpjquota=%s"},
{Opt_jqfmt_vfsold, "jqfmt=vfsold"},
{Opt_jqfmt_vfsv0, "jqfmt=vfsv0"},
+ {Opt_jqfmt_vfsv1, "jqfmt=vfsv1"},
{Opt_grpquota, "grpquota"},
{Opt_noquota, "noquota"},
{Opt_quota, "quota"},
{Opt_usrquota, "usrquota"},
{Opt_barrier, "barrier=%u"},
- {Opt_extents, "extents"},
- {Opt_noextents, "noextents"},
+ {Opt_barrier, "barrier"},
+ {Opt_nobarrier, "nobarrier"},
{Opt_i_version, "i_version"},
- {Opt_mballoc, "mballoc"},
- {Opt_nomballoc, "nomballoc"},
{Opt_stripe, "stripe=%u"},
+ {Opt_delalloc, "delalloc"},
+ {Opt_nodelalloc, "nodelalloc"},
+ {Opt_removed, "mblk_io_submit"},
+ {Opt_removed, "nomblk_io_submit"},
+ {Opt_block_validity, "block_validity"},
+ {Opt_noblock_validity, "noblock_validity"},
+ {Opt_inode_readahead_blks, "inode_readahead_blks=%u"},
+ {Opt_journal_ioprio, "journal_ioprio=%u"},
+ {Opt_auto_da_alloc, "auto_da_alloc=%u"},
+ {Opt_auto_da_alloc, "auto_da_alloc"},
+ {Opt_noauto_da_alloc, "noauto_da_alloc"},
+ {Opt_dioread_nolock, "dioread_nolock"},
+ {Opt_dioread_lock, "dioread_lock"},
+ {Opt_discard, "discard"},
+ {Opt_nodiscard, "nodiscard"},
+ {Opt_init_itable, "init_itable=%u"},
+ {Opt_init_itable, "init_itable"},
+ {Opt_noinit_itable, "noinit_itable"},
+ {Opt_max_dir_size_kb, "max_dir_size_kb=%u"},
+ {Opt_removed, "check=none"}, /* mount option from ext2/3 */
+ {Opt_removed, "nocheck"}, /* mount option from ext2/3 */
+ {Opt_removed, "reservation"}, /* mount option from ext2/3 */
+ {Opt_removed, "noreservation"}, /* mount option from ext2/3 */
+ {Opt_removed, "journal=%u"}, /* mount option from ext2/3 */
{Opt_err, NULL},
- {Opt_resize, "resize"},
};
static ext4_fsblk_t get_sb_block(void **data)
@@ -955,400 +1259,633 @@ static ext4_fsblk_t get_sb_block(void **data)
if (!options || strncmp(options, "sb=", 3) != 0)
return 1; /* Default location */
+
options += 3;
- /*todo: use simple_strtoll with >32bit ext4 */
+ /* TODO: use simple_strtoll with >32bit ext4 */
sb_block = simple_strtoul(options, &options, 0);
if (*options && *options != ',') {
- printk("EXT4-fs: Invalid sb specification: %s\n",
+ printk(KERN_ERR "EXT4-fs: Invalid sb specification: %s\n",
(char *) *data);
return 1;
}
if (*options == ',')
options++;
*data = (void *) options;
+
return sb_block;
}
-static int parse_options (char *options, struct super_block *sb,
- unsigned int *inum, unsigned long *journal_devnum,
- ext4_fsblk_t *n_blocks_count, int is_remount)
+#define DEFAULT_JOURNAL_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 3))
+static char deprecated_msg[] = "Mount option \"%s\" will be removed by %s\n"
+ "Contact linux-ext4@vger.kernel.org if you think we should keep it.\n";
+
+#ifdef CONFIG_QUOTA
+static int set_qf_name(struct super_block *sb, int qtype, substring_t *args)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
- char * p;
- substring_t args[MAX_OPT_ARGS];
- int data_opt = 0;
- int option;
-#ifdef CONFIG_QUOTA
- int qtype;
char *qname;
-#endif
+ int ret = -1;
+
+ if (sb_any_quota_loaded(sb) &&
+ !sbi->s_qf_names[qtype]) {
+ ext4_msg(sb, KERN_ERR,
+ "Cannot change journaled "
+ "quota options when quota turned on");
+ return -1;
+ }
+ if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA)) {
+ ext4_msg(sb, KERN_ERR, "Cannot set journaled quota options "
+ "when QUOTA feature is enabled");
+ return -1;
+ }
+ qname = match_strdup(args);
+ if (!qname) {
+ ext4_msg(sb, KERN_ERR,
+ "Not enough memory for storing quotafile name");
+ return -1;
+ }
+ if (sbi->s_qf_names[qtype]) {
+ if (strcmp(sbi->s_qf_names[qtype], qname) == 0)
+ ret = 1;
+ else
+ ext4_msg(sb, KERN_ERR,
+ "%s quota file already specified",
+ QTYPE2NAME(qtype));
+ goto errout;
+ }
+ if (strchr(qname, '/')) {
+ ext4_msg(sb, KERN_ERR,
+ "quotafile must be on filesystem root");
+ goto errout;
+ }
+ sbi->s_qf_names[qtype] = qname;
+ set_opt(sb, QUOTA);
+ return 1;
+errout:
+ kfree(qname);
+ return ret;
+}
- if (!options)
- return 1;
+static int clear_qf_name(struct super_block *sb, int qtype)
+{
- while ((p = strsep (&options, ",")) != NULL) {
- int token;
- if (!*p)
- continue;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
- token = match_token(p, tokens, args);
- switch (token) {
- case Opt_bsd_df:
- clear_opt (sbi->s_mount_opt, MINIX_DF);
- break;
- case Opt_minix_df:
- set_opt (sbi->s_mount_opt, MINIX_DF);
- break;
- case Opt_grpid:
- set_opt (sbi->s_mount_opt, GRPID);
- break;
- case Opt_nogrpid:
- clear_opt (sbi->s_mount_opt, GRPID);
- break;
- case Opt_resuid:
- if (match_int(&args[0], &option))
- return 0;
- sbi->s_resuid = option;
- break;
- case Opt_resgid:
- if (match_int(&args[0], &option))
- return 0;
- sbi->s_resgid = option;
- break;
- case Opt_sb:
- /* handled by get_sb_block() instead of here */
- /* *sb_block = match_int(&args[0]); */
- break;
- case Opt_err_panic:
- clear_opt (sbi->s_mount_opt, ERRORS_CONT);
- clear_opt (sbi->s_mount_opt, ERRORS_RO);
- set_opt (sbi->s_mount_opt, ERRORS_PANIC);
- break;
- case Opt_err_ro:
- clear_opt (sbi->s_mount_opt, ERRORS_CONT);
- clear_opt (sbi->s_mount_opt, ERRORS_PANIC);
- set_opt (sbi->s_mount_opt, ERRORS_RO);
- break;
- case Opt_err_cont:
- clear_opt (sbi->s_mount_opt, ERRORS_RO);
- clear_opt (sbi->s_mount_opt, ERRORS_PANIC);
- set_opt (sbi->s_mount_opt, ERRORS_CONT);
- break;
- case Opt_nouid32:
- set_opt (sbi->s_mount_opt, NO_UID32);
- break;
- case Opt_nocheck:
- clear_opt (sbi->s_mount_opt, CHECK);
- break;
- case Opt_debug:
- set_opt (sbi->s_mount_opt, DEBUG);
- break;
- case Opt_oldalloc:
- set_opt (sbi->s_mount_opt, OLDALLOC);
- break;
- case Opt_orlov:
- clear_opt (sbi->s_mount_opt, OLDALLOC);
- break;
-#ifdef CONFIG_EXT4DEV_FS_XATTR
- case Opt_user_xattr:
- set_opt (sbi->s_mount_opt, XATTR_USER);
- break;
- case Opt_nouser_xattr:
- clear_opt (sbi->s_mount_opt, XATTR_USER);
- break;
+ if (sb_any_quota_loaded(sb) &&
+ sbi->s_qf_names[qtype]) {
+ ext4_msg(sb, KERN_ERR, "Cannot change journaled quota options"
+ " when quota turned on");
+ return -1;
+ }
+ kfree(sbi->s_qf_names[qtype]);
+ sbi->s_qf_names[qtype] = NULL;
+ return 1;
+}
+#endif
+
+#define MOPT_SET 0x0001
+#define MOPT_CLEAR 0x0002
+#define MOPT_NOSUPPORT 0x0004
+#define MOPT_EXPLICIT 0x0008
+#define MOPT_CLEAR_ERR 0x0010
+#define MOPT_GTE0 0x0020
+#ifdef CONFIG_QUOTA
+#define MOPT_Q 0
+#define MOPT_QFMT 0x0040
#else
- case Opt_user_xattr:
- case Opt_nouser_xattr:
- printk("EXT4 (no)user_xattr options not supported\n");
- break;
+#define MOPT_Q MOPT_NOSUPPORT
+#define MOPT_QFMT MOPT_NOSUPPORT
#endif
-#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL
- case Opt_acl:
- set_opt(sbi->s_mount_opt, POSIX_ACL);
- break;
- case Opt_noacl:
- clear_opt(sbi->s_mount_opt, POSIX_ACL);
- break;
+#define MOPT_DATAJ 0x0080
+#define MOPT_NO_EXT2 0x0100
+#define MOPT_NO_EXT3 0x0200
+#define MOPT_EXT4_ONLY (MOPT_NO_EXT2 | MOPT_NO_EXT3)
+#define MOPT_STRING 0x0400
+
+static const struct mount_opts {
+ int token;
+ int mount_opt;
+ int flags;
+} ext4_mount_opts[] = {
+ {Opt_minix_df, EXT4_MOUNT_MINIX_DF, MOPT_SET},
+ {Opt_bsd_df, EXT4_MOUNT_MINIX_DF, MOPT_CLEAR},
+ {Opt_grpid, EXT4_MOUNT_GRPID, MOPT_SET},
+ {Opt_nogrpid, EXT4_MOUNT_GRPID, MOPT_CLEAR},
+ {Opt_block_validity, EXT4_MOUNT_BLOCK_VALIDITY, MOPT_SET},
+ {Opt_noblock_validity, EXT4_MOUNT_BLOCK_VALIDITY, MOPT_CLEAR},
+ {Opt_dioread_nolock, EXT4_MOUNT_DIOREAD_NOLOCK,
+ MOPT_EXT4_ONLY | MOPT_SET},
+ {Opt_dioread_lock, EXT4_MOUNT_DIOREAD_NOLOCK,
+ MOPT_EXT4_ONLY | MOPT_CLEAR},
+ {Opt_discard, EXT4_MOUNT_DISCARD, MOPT_SET},
+ {Opt_nodiscard, EXT4_MOUNT_DISCARD, MOPT_CLEAR},
+ {Opt_delalloc, EXT4_MOUNT_DELALLOC,
+ MOPT_EXT4_ONLY | MOPT_SET | MOPT_EXPLICIT},
+ {Opt_nodelalloc, EXT4_MOUNT_DELALLOC,
+ MOPT_EXT4_ONLY | MOPT_CLEAR},
+ {Opt_journal_checksum, EXT4_MOUNT_JOURNAL_CHECKSUM,
+ MOPT_EXT4_ONLY | MOPT_SET},
+ {Opt_journal_async_commit, (EXT4_MOUNT_JOURNAL_ASYNC_COMMIT |
+ EXT4_MOUNT_JOURNAL_CHECKSUM),
+ MOPT_EXT4_ONLY | MOPT_SET},
+ {Opt_noload, EXT4_MOUNT_NOLOAD, MOPT_NO_EXT2 | MOPT_SET},
+ {Opt_err_panic, EXT4_MOUNT_ERRORS_PANIC, MOPT_SET | MOPT_CLEAR_ERR},
+ {Opt_err_ro, EXT4_MOUNT_ERRORS_RO, MOPT_SET | MOPT_CLEAR_ERR},
+ {Opt_err_cont, EXT4_MOUNT_ERRORS_CONT, MOPT_SET | MOPT_CLEAR_ERR},
+ {Opt_data_err_abort, EXT4_MOUNT_DATA_ERR_ABORT,
+ MOPT_NO_EXT2 | MOPT_SET},
+ {Opt_data_err_ignore, EXT4_MOUNT_DATA_ERR_ABORT,
+ MOPT_NO_EXT2 | MOPT_CLEAR},
+ {Opt_barrier, EXT4_MOUNT_BARRIER, MOPT_SET},
+ {Opt_nobarrier, EXT4_MOUNT_BARRIER, MOPT_CLEAR},
+ {Opt_noauto_da_alloc, EXT4_MOUNT_NO_AUTO_DA_ALLOC, MOPT_SET},
+ {Opt_auto_da_alloc, EXT4_MOUNT_NO_AUTO_DA_ALLOC, MOPT_CLEAR},
+ {Opt_noinit_itable, EXT4_MOUNT_INIT_INODE_TABLE, MOPT_CLEAR},
+ {Opt_commit, 0, MOPT_GTE0},
+ {Opt_max_batch_time, 0, MOPT_GTE0},
+ {Opt_min_batch_time, 0, MOPT_GTE0},
+ {Opt_inode_readahead_blks, 0, MOPT_GTE0},
+ {Opt_init_itable, 0, MOPT_GTE0},
+ {Opt_stripe, 0, MOPT_GTE0},
+ {Opt_resuid, 0, MOPT_GTE0},
+ {Opt_resgid, 0, MOPT_GTE0},
+ {Opt_journal_dev, 0, MOPT_GTE0},
+ {Opt_journal_path, 0, MOPT_STRING},
+ {Opt_journal_ioprio, 0, MOPT_GTE0},
+ {Opt_data_journal, EXT4_MOUNT_JOURNAL_DATA, MOPT_NO_EXT2 | MOPT_DATAJ},
+ {Opt_data_ordered, EXT4_MOUNT_ORDERED_DATA, MOPT_NO_EXT2 | MOPT_DATAJ},
+ {Opt_data_writeback, EXT4_MOUNT_WRITEBACK_DATA,
+ MOPT_NO_EXT2 | MOPT_DATAJ},
+ {Opt_user_xattr, EXT4_MOUNT_XATTR_USER, MOPT_SET},
+ {Opt_nouser_xattr, EXT4_MOUNT_XATTR_USER, MOPT_CLEAR},
+#ifdef CONFIG_EXT4_FS_POSIX_ACL
+ {Opt_acl, EXT4_MOUNT_POSIX_ACL, MOPT_SET},
+ {Opt_noacl, EXT4_MOUNT_POSIX_ACL, MOPT_CLEAR},
#else
- case Opt_acl:
- case Opt_noacl:
- printk("EXT4 (no)acl options not supported\n");
- break;
+ {Opt_acl, 0, MOPT_NOSUPPORT},
+ {Opt_noacl, 0, MOPT_NOSUPPORT},
#endif
- case Opt_reservation:
- set_opt(sbi->s_mount_opt, RESERVATION);
- break;
- case Opt_noreservation:
- clear_opt(sbi->s_mount_opt, RESERVATION);
- break;
- case Opt_journal_update:
- /* @@@ FIXME */
- /* Eventually we will want to be able to create
- a journal file here. For now, only allow the
- user to specify an existing inode to be the
- journal file. */
- if (is_remount) {
- printk(KERN_ERR "EXT4-fs: cannot specify "
- "journal on remount\n");
- return 0;
- }
- set_opt (sbi->s_mount_opt, UPDATE_JOURNAL);
- break;
- case Opt_journal_inum:
- if (is_remount) {
- printk(KERN_ERR "EXT4-fs: cannot specify "
- "journal on remount\n");
- return 0;
- }
- if (match_int(&args[0], &option))
- return 0;
- *inum = option;
- break;
- case Opt_journal_dev:
- if (is_remount) {
- printk(KERN_ERR "EXT4-fs: cannot specify "
- "journal on remount\n");
- return 0;
- }
- if (match_int(&args[0], &option))
- return 0;
- *journal_devnum = option;
- break;
- case Opt_journal_checksum:
- set_opt(sbi->s_mount_opt, JOURNAL_CHECKSUM);
- break;
- case Opt_journal_async_commit:
- set_opt(sbi->s_mount_opt, JOURNAL_ASYNC_COMMIT);
- set_opt(sbi->s_mount_opt, JOURNAL_CHECKSUM);
- break;
- case Opt_noload:
- set_opt (sbi->s_mount_opt, NOLOAD);
- break;
- case Opt_commit:
- if (match_int(&args[0], &option))
- return 0;
- if (option < 0)
- return 0;
- if (option == 0)
- option = JBD2_DEFAULT_MAX_COMMIT_AGE;
- sbi->s_commit_interval = HZ * option;
- break;
- case Opt_data_journal:
- data_opt = EXT4_MOUNT_JOURNAL_DATA;
- goto datacheck;
- case Opt_data_ordered:
- data_opt = EXT4_MOUNT_ORDERED_DATA;
- goto datacheck;
- case Opt_data_writeback:
- data_opt = EXT4_MOUNT_WRITEBACK_DATA;
- datacheck:
- if (is_remount) {
- if ((sbi->s_mount_opt & EXT4_MOUNT_DATA_FLAGS)
- != data_opt) {
- printk(KERN_ERR
- "EXT4-fs: cannot change data "
- "mode on remount\n");
- return 0;
- }
- } else {
- sbi->s_mount_opt &= ~EXT4_MOUNT_DATA_FLAGS;
- sbi->s_mount_opt |= data_opt;
- }
- break;
+ {Opt_nouid32, EXT4_MOUNT_NO_UID32, MOPT_SET},
+ {Opt_debug, EXT4_MOUNT_DEBUG, MOPT_SET},
+ {Opt_quota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA, MOPT_SET | MOPT_Q},
+ {Opt_usrquota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA,
+ MOPT_SET | MOPT_Q},
+ {Opt_grpquota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_GRPQUOTA,
+ MOPT_SET | MOPT_Q},
+ {Opt_noquota, (EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA |
+ EXT4_MOUNT_GRPQUOTA), MOPT_CLEAR | MOPT_Q},
+ {Opt_usrjquota, 0, MOPT_Q},
+ {Opt_grpjquota, 0, MOPT_Q},
+ {Opt_offusrjquota, 0, MOPT_Q},
+ {Opt_offgrpjquota, 0, MOPT_Q},
+ {Opt_jqfmt_vfsold, QFMT_VFS_OLD, MOPT_QFMT},
+ {Opt_jqfmt_vfsv0, QFMT_VFS_V0, MOPT_QFMT},
+ {Opt_jqfmt_vfsv1, QFMT_VFS_V1, MOPT_QFMT},
+ {Opt_max_dir_size_kb, 0, MOPT_GTE0},
+ {Opt_err, 0, 0}
+};
+
+static int handle_mount_opt(struct super_block *sb, char *opt, int token,
+ substring_t *args, unsigned long *journal_devnum,
+ unsigned int *journal_ioprio, int is_remount)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ const struct mount_opts *m;
+ kuid_t uid;
+ kgid_t gid;
+ int arg = 0;
+
#ifdef CONFIG_QUOTA
- case Opt_usrjquota:
- qtype = USRQUOTA;
- goto set_qf_name;
- case Opt_grpjquota:
- qtype = GRPQUOTA;
-set_qf_name:
- if (sb_any_quota_enabled(sb)) {
- printk(KERN_ERR
- "EXT4-fs: Cannot change journalled "
- "quota options when quota turned on.\n");
- return 0;
- }
- qname = match_strdup(&args[0]);
- if (!qname) {
- printk(KERN_ERR
- "EXT4-fs: not enough memory for "
- "storing quotafile name.\n");
- return 0;
- }
- if (sbi->s_qf_names[qtype] &&
- strcmp(sbi->s_qf_names[qtype], qname)) {
- printk(KERN_ERR
- "EXT4-fs: %s quota file already "
- "specified.\n", QTYPE2NAME(qtype));
- kfree(qname);
- return 0;
- }
- sbi->s_qf_names[qtype] = qname;
- if (strchr(sbi->s_qf_names[qtype], '/')) {
- printk(KERN_ERR
- "EXT4-fs: quotafile must be on "
- "filesystem root.\n");
- kfree(sbi->s_qf_names[qtype]);
- sbi->s_qf_names[qtype] = NULL;
- return 0;
- }
- set_opt(sbi->s_mount_opt, QUOTA);
- break;
- case Opt_offusrjquota:
- qtype = USRQUOTA;
- goto clear_qf_name;
- case Opt_offgrpjquota:
- qtype = GRPQUOTA;
-clear_qf_name:
- if (sb_any_quota_enabled(sb)) {
- printk(KERN_ERR "EXT4-fs: Cannot change "
- "journalled quota options when "
- "quota turned on.\n");
- return 0;
- }
- /*
- * The space will be released later when all options
- * are confirmed to be correct
- */
- sbi->s_qf_names[qtype] = NULL;
- break;
- case Opt_jqfmt_vfsold:
- sbi->s_jquota_fmt = QFMT_VFS_OLD;
- break;
- case Opt_jqfmt_vfsv0:
- sbi->s_jquota_fmt = QFMT_VFS_V0;
- break;
- case Opt_quota:
- case Opt_usrquota:
- set_opt(sbi->s_mount_opt, QUOTA);
- set_opt(sbi->s_mount_opt, USRQUOTA);
- break;
- case Opt_grpquota:
- set_opt(sbi->s_mount_opt, QUOTA);
- set_opt(sbi->s_mount_opt, GRPQUOTA);
- break;
- case Opt_noquota:
- if (sb_any_quota_enabled(sb)) {
- printk(KERN_ERR "EXT4-fs: Cannot change quota "
- "options when quota turned on.\n");
- return 0;
- }
- clear_opt(sbi->s_mount_opt, QUOTA);
- clear_opt(sbi->s_mount_opt, USRQUOTA);
- clear_opt(sbi->s_mount_opt, GRPQUOTA);
- break;
-#else
- case Opt_quota:
- case Opt_usrquota:
- case Opt_grpquota:
- case Opt_usrjquota:
- case Opt_grpjquota:
- case Opt_offusrjquota:
- case Opt_offgrpjquota:
- case Opt_jqfmt_vfsold:
- case Opt_jqfmt_vfsv0:
- printk(KERN_ERR
- "EXT4-fs: journalled quota options not "
- "supported.\n");
- break;
- case Opt_noquota:
- break;
+ if (token == Opt_usrjquota)
+ return set_qf_name(sb, USRQUOTA, &args[0]);
+ else if (token == Opt_grpjquota)
+ return set_qf_name(sb, GRPQUOTA, &args[0]);
+ else if (token == Opt_offusrjquota)
+ return clear_qf_name(sb, USRQUOTA);
+ else if (token == Opt_offgrpjquota)
+ return clear_qf_name(sb, GRPQUOTA);
#endif
- case Opt_abort:
- set_opt(sbi->s_mount_opt, ABORT);
- break;
- case Opt_barrier:
- if (match_int(&args[0], &option))
- return 0;
- if (option)
- set_opt(sbi->s_mount_opt, BARRIER);
- else
- clear_opt(sbi->s_mount_opt, BARRIER);
- break;
- case Opt_ignore:
+ switch (token) {
+ case Opt_noacl:
+ case Opt_nouser_xattr:
+ ext4_msg(sb, KERN_WARNING, deprecated_msg, opt, "3.5");
+ break;
+ case Opt_sb:
+ return 1; /* handled by get_sb_block() */
+ case Opt_removed:
+ ext4_msg(sb, KERN_WARNING, "Ignoring removed %s option", opt);
+ return 1;
+ case Opt_abort:
+ sbi->s_mount_flags |= EXT4_MF_FS_ABORTED;
+ return 1;
+ case Opt_i_version:
+ sb->s_flags |= MS_I_VERSION;
+ return 1;
+ }
+
+ for (m = ext4_mount_opts; m->token != Opt_err; m++)
+ if (token == m->token)
break;
- case Opt_resize:
- if (!is_remount) {
- printk("EXT4-fs: resize option only available "
- "for remount\n");
- return 0;
+
+ if (m->token == Opt_err) {
+ ext4_msg(sb, KERN_ERR, "Unrecognized mount option \"%s\" "
+ "or missing value", opt);
+ return -1;
+ }
+
+ if ((m->flags & MOPT_NO_EXT2) && IS_EXT2_SB(sb)) {
+ ext4_msg(sb, KERN_ERR,
+ "Mount option \"%s\" incompatible with ext2", opt);
+ return -1;
+ }
+ if ((m->flags & MOPT_NO_EXT3) && IS_EXT3_SB(sb)) {
+ ext4_msg(sb, KERN_ERR,
+ "Mount option \"%s\" incompatible with ext3", opt);
+ return -1;
+ }
+
+ if (args->from && !(m->flags & MOPT_STRING) && match_int(args, &arg))
+ return -1;
+ if (args->from && (m->flags & MOPT_GTE0) && (arg < 0))
+ return -1;
+ if (m->flags & MOPT_EXPLICIT)
+ set_opt2(sb, EXPLICIT_DELALLOC);
+ if (m->flags & MOPT_CLEAR_ERR)
+ clear_opt(sb, ERRORS_MASK);
+ if (token == Opt_noquota && sb_any_quota_loaded(sb)) {
+ ext4_msg(sb, KERN_ERR, "Cannot change quota "
+ "options when quota turned on");
+ return -1;
+ }
+
+ if (m->flags & MOPT_NOSUPPORT) {
+ ext4_msg(sb, KERN_ERR, "%s option not supported", opt);
+ } else if (token == Opt_commit) {
+ if (arg == 0)
+ arg = JBD2_DEFAULT_MAX_COMMIT_AGE;
+ sbi->s_commit_interval = HZ * arg;
+ } else if (token == Opt_max_batch_time) {
+ sbi->s_max_batch_time = arg;
+ } else if (token == Opt_min_batch_time) {
+ sbi->s_min_batch_time = arg;
+ } else if (token == Opt_inode_readahead_blks) {
+ if (arg && (arg > (1 << 30) || !is_power_of_2(arg))) {
+ ext4_msg(sb, KERN_ERR,
+ "EXT4-fs: inode_readahead_blks must be "
+ "0 or a power of 2 smaller than 2^31");
+ return -1;
+ }
+ sbi->s_inode_readahead_blks = arg;
+ } else if (token == Opt_init_itable) {
+ set_opt(sb, INIT_INODE_TABLE);
+ if (!args->from)
+ arg = EXT4_DEF_LI_WAIT_MULT;
+ sbi->s_li_wait_mult = arg;
+ } else if (token == Opt_max_dir_size_kb) {
+ sbi->s_max_dir_size_kb = arg;
+ } else if (token == Opt_stripe) {
+ sbi->s_stripe = arg;
+ } else if (token == Opt_resuid) {
+ uid = make_kuid(current_user_ns(), arg);
+ if (!uid_valid(uid)) {
+ ext4_msg(sb, KERN_ERR, "Invalid uid value %d", arg);
+ return -1;
+ }
+ sbi->s_resuid = uid;
+ } else if (token == Opt_resgid) {
+ gid = make_kgid(current_user_ns(), arg);
+ if (!gid_valid(gid)) {
+ ext4_msg(sb, KERN_ERR, "Invalid gid value %d", arg);
+ return -1;
+ }
+ sbi->s_resgid = gid;
+ } else if (token == Opt_journal_dev) {
+ if (is_remount) {
+ ext4_msg(sb, KERN_ERR,
+ "Cannot specify journal on remount");
+ return -1;
+ }
+ *journal_devnum = arg;
+ } else if (token == Opt_journal_path) {
+ char *journal_path;
+ struct inode *journal_inode;
+ struct path path;
+ int error;
+
+ if (is_remount) {
+ ext4_msg(sb, KERN_ERR,
+ "Cannot specify journal on remount");
+ return -1;
+ }
+ journal_path = match_strdup(&args[0]);
+ if (!journal_path) {
+ ext4_msg(sb, KERN_ERR, "error: could not dup "
+ "journal device string");
+ return -1;
+ }
+
+ error = kern_path(journal_path, LOOKUP_FOLLOW, &path);
+ if (error) {
+ ext4_msg(sb, KERN_ERR, "error: could not find "
+ "journal device path: error %d", error);
+ kfree(journal_path);
+ return -1;
+ }
+
+ journal_inode = path.dentry->d_inode;
+ if (!S_ISBLK(journal_inode->i_mode)) {
+ ext4_msg(sb, KERN_ERR, "error: journal path %s "
+ "is not a block device", journal_path);
+ path_put(&path);
+ kfree(journal_path);
+ return -1;
+ }
+
+ *journal_devnum = new_encode_dev(journal_inode->i_rdev);
+ path_put(&path);
+ kfree(journal_path);
+ } else if (token == Opt_journal_ioprio) {
+ if (arg > 7) {
+ ext4_msg(sb, KERN_ERR, "Invalid journal IO priority"
+ " (must be 0-7)");
+ return -1;
+ }
+ *journal_ioprio =
+ IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, arg);
+ } else if (m->flags & MOPT_DATAJ) {
+ if (is_remount) {
+ if (!sbi->s_journal)
+ ext4_msg(sb, KERN_WARNING, "Remounting file system with no journal so ignoring journalled data option");
+ else if (test_opt(sb, DATA_FLAGS) != m->mount_opt) {
+ ext4_msg(sb, KERN_ERR,
+ "Cannot change data mode on remount");
+ return -1;
}
- if (match_int(&args[0], &option) != 0)
- return 0;
- *n_blocks_count = option;
- break;
- case Opt_nobh:
- set_opt(sbi->s_mount_opt, NOBH);
- break;
- case Opt_bh:
- clear_opt(sbi->s_mount_opt, NOBH);
- break;
- case Opt_extents:
- set_opt (sbi->s_mount_opt, EXTENTS);
- break;
- case Opt_noextents:
- clear_opt (sbi->s_mount_opt, EXTENTS);
- break;
- case Opt_i_version:
- set_opt(sbi->s_mount_opt, I_VERSION);
- sb->s_flags |= MS_I_VERSION;
- break;
- case Opt_mballoc:
- set_opt(sbi->s_mount_opt, MBALLOC);
- break;
- case Opt_nomballoc:
- clear_opt(sbi->s_mount_opt, MBALLOC);
- break;
- case Opt_stripe:
- if (match_int(&args[0], &option))
- return 0;
- if (option < 0)
- return 0;
- sbi->s_stripe = option;
- break;
- default:
- printk (KERN_ERR
- "EXT4-fs: Unrecognized mount option \"%s\" "
- "or missing value\n", p);
- return 0;
+ } else {
+ clear_opt(sb, DATA_FLAGS);
+ sbi->s_mount_opt |= m->mount_opt;
+ }
+#ifdef CONFIG_QUOTA
+ } else if (m->flags & MOPT_QFMT) {
+ if (sb_any_quota_loaded(sb) &&
+ sbi->s_jquota_fmt != m->mount_opt) {
+ ext4_msg(sb, KERN_ERR, "Cannot change journaled "
+ "quota options when quota turned on");
+ return -1;
+ }
+ if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
+ EXT4_FEATURE_RO_COMPAT_QUOTA)) {
+ ext4_msg(sb, KERN_ERR,
+ "Cannot set journaled quota options "
+ "when QUOTA feature is enabled");
+ return -1;
+ }
+ sbi->s_jquota_fmt = m->mount_opt;
+#endif
+ } else {
+ if (!args->from)
+ arg = 1;
+ if (m->flags & MOPT_CLEAR)
+ arg = !arg;
+ else if (unlikely(!(m->flags & MOPT_SET))) {
+ ext4_msg(sb, KERN_WARNING,
+ "buggy handling of option %s", opt);
+ WARN_ON(1);
+ return -1;
}
+ if (arg != 0)
+ sbi->s_mount_opt |= m->mount_opt;
+ else
+ sbi->s_mount_opt &= ~m->mount_opt;
+ }
+ return 1;
+}
+
+static int parse_options(char *options, struct super_block *sb,
+ unsigned long *journal_devnum,
+ unsigned int *journal_ioprio,
+ int is_remount)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ char *p;
+ substring_t args[MAX_OPT_ARGS];
+ int token;
+
+ if (!options)
+ return 1;
+
+ while ((p = strsep(&options, ",")) != NULL) {
+ if (!*p)
+ continue;
+ /*
+ * Initialize args struct so we know whether arg was
+ * found; some options take optional arguments.
+ */
+ args[0].to = args[0].from = NULL;
+ token = match_token(p, tokens, args);
+ if (handle_mount_opt(sb, p, token, args, journal_devnum,
+ journal_ioprio, is_remount) < 0)
+ return 0;
}
#ifdef CONFIG_QUOTA
+ if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA) &&
+ (test_opt(sb, USRQUOTA) || test_opt(sb, GRPQUOTA))) {
+ ext4_msg(sb, KERN_ERR, "Cannot set quota options when QUOTA "
+ "feature is enabled");
+ return 0;
+ }
if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) {
- if ((sbi->s_mount_opt & EXT4_MOUNT_USRQUOTA) &&
- sbi->s_qf_names[USRQUOTA])
- clear_opt(sbi->s_mount_opt, USRQUOTA);
-
- if ((sbi->s_mount_opt & EXT4_MOUNT_GRPQUOTA) &&
- sbi->s_qf_names[GRPQUOTA])
- clear_opt(sbi->s_mount_opt, GRPQUOTA);
-
- if ((sbi->s_qf_names[USRQUOTA] &&
- (sbi->s_mount_opt & EXT4_MOUNT_GRPQUOTA)) ||
- (sbi->s_qf_names[GRPQUOTA] &&
- (sbi->s_mount_opt & EXT4_MOUNT_USRQUOTA))) {
- printk(KERN_ERR "EXT4-fs: old and new quota "
- "format mixing.\n");
+ if (test_opt(sb, USRQUOTA) && sbi->s_qf_names[USRQUOTA])
+ clear_opt(sb, USRQUOTA);
+
+ if (test_opt(sb, GRPQUOTA) && sbi->s_qf_names[GRPQUOTA])
+ clear_opt(sb, GRPQUOTA);
+
+ if (test_opt(sb, GRPQUOTA) || test_opt(sb, USRQUOTA)) {
+ ext4_msg(sb, KERN_ERR, "old and new quota "
+ "format mixing");
return 0;
}
if (!sbi->s_jquota_fmt) {
- printk(KERN_ERR "EXT4-fs: journalled quota format "
- "not specified.\n");
+ ext4_msg(sb, KERN_ERR, "journaled quota format "
+ "not specified");
return 0;
}
} else {
if (sbi->s_jquota_fmt) {
- printk(KERN_ERR "EXT4-fs: journalled quota format "
- "specified with no journalling "
- "enabled.\n");
+ ext4_msg(sb, KERN_ERR, "journaled quota format "
+ "specified with no journaling "
+ "enabled");
return 0;
}
}
#endif
+ if (test_opt(sb, DIOREAD_NOLOCK)) {
+ int blocksize =
+ BLOCK_SIZE << le32_to_cpu(sbi->s_es->s_log_block_size);
+
+ if (blocksize < PAGE_CACHE_SIZE) {
+ ext4_msg(sb, KERN_ERR, "can't mount with "
+ "dioread_nolock if block size != PAGE_SIZE");
+ return 0;
+ }
+ }
return 1;
}
+static inline void ext4_show_quota_options(struct seq_file *seq,
+ struct super_block *sb)
+{
+#if defined(CONFIG_QUOTA)
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+ if (sbi->s_jquota_fmt) {
+ char *fmtname = "";
+
+ switch (sbi->s_jquota_fmt) {
+ case QFMT_VFS_OLD:
+ fmtname = "vfsold";
+ break;
+ case QFMT_VFS_V0:
+ fmtname = "vfsv0";
+ break;
+ case QFMT_VFS_V1:
+ fmtname = "vfsv1";
+ break;
+ }
+ seq_printf(seq, ",jqfmt=%s", fmtname);
+ }
+
+ if (sbi->s_qf_names[USRQUOTA])
+ seq_printf(seq, ",usrjquota=%s", sbi->s_qf_names[USRQUOTA]);
+
+ if (sbi->s_qf_names[GRPQUOTA])
+ seq_printf(seq, ",grpjquota=%s", sbi->s_qf_names[GRPQUOTA]);
+#endif
+}
+
+static const char *token2str(int token)
+{
+ const struct match_token *t;
+
+ for (t = tokens; t->token != Opt_err; t++)
+ if (t->token == token && !strchr(t->pattern, '='))
+ break;
+ return t->pattern;
+}
+
+/*
+ * Show an option if
+ * - it's set to a non-default value OR
+ * - if the per-sb default is different from the global default
+ */
+static int _ext4_show_options(struct seq_file *seq, struct super_block *sb,
+ int nodefs)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_super_block *es = sbi->s_es;
+ int def_errors, def_mount_opt = nodefs ? 0 : sbi->s_def_mount_opt;
+ const struct mount_opts *m;
+ char sep = nodefs ? '\n' : ',';
+
+#define SEQ_OPTS_PUTS(str) seq_printf(seq, "%c" str, sep)
+#define SEQ_OPTS_PRINT(str, arg) seq_printf(seq, "%c" str, sep, arg)
+
+ if (sbi->s_sb_block != 1)
+ SEQ_OPTS_PRINT("sb=%llu", sbi->s_sb_block);
+
+ for (m = ext4_mount_opts; m->token != Opt_err; m++) {
+ int want_set = m->flags & MOPT_SET;
+ if (((m->flags & (MOPT_SET|MOPT_CLEAR)) == 0) ||
+ (m->flags & MOPT_CLEAR_ERR))
+ continue;
+ if (!(m->mount_opt & (sbi->s_mount_opt ^ def_mount_opt)))
+ continue; /* skip if same as the default */
+ if ((want_set &&
+ (sbi->s_mount_opt & m->mount_opt) != m->mount_opt) ||
+ (!want_set && (sbi->s_mount_opt & m->mount_opt)))
+ continue; /* select Opt_noFoo vs Opt_Foo */
+ SEQ_OPTS_PRINT("%s", token2str(m->token));
+ }
+
+ if (nodefs || !uid_eq(sbi->s_resuid, make_kuid(&init_user_ns, EXT4_DEF_RESUID)) ||
+ le16_to_cpu(es->s_def_resuid) != EXT4_DEF_RESUID)
+ SEQ_OPTS_PRINT("resuid=%u",
+ from_kuid_munged(&init_user_ns, sbi->s_resuid));
+ if (nodefs || !gid_eq(sbi->s_resgid, make_kgid(&init_user_ns, EXT4_DEF_RESGID)) ||
+ le16_to_cpu(es->s_def_resgid) != EXT4_DEF_RESGID)
+ SEQ_OPTS_PRINT("resgid=%u",
+ from_kgid_munged(&init_user_ns, sbi->s_resgid));
+ def_errors = nodefs ? -1 : le16_to_cpu(es->s_errors);
+ if (test_opt(sb, ERRORS_RO) && def_errors != EXT4_ERRORS_RO)
+ SEQ_OPTS_PUTS("errors=remount-ro");
+ if (test_opt(sb, ERRORS_CONT) && def_errors != EXT4_ERRORS_CONTINUE)
+ SEQ_OPTS_PUTS("errors=continue");
+ if (test_opt(sb, ERRORS_PANIC) && def_errors != EXT4_ERRORS_PANIC)
+ SEQ_OPTS_PUTS("errors=panic");
+ if (nodefs || sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ)
+ SEQ_OPTS_PRINT("commit=%lu", sbi->s_commit_interval / HZ);
+ if (nodefs || sbi->s_min_batch_time != EXT4_DEF_MIN_BATCH_TIME)
+ SEQ_OPTS_PRINT("min_batch_time=%u", sbi->s_min_batch_time);
+ if (nodefs || sbi->s_max_batch_time != EXT4_DEF_MAX_BATCH_TIME)
+ SEQ_OPTS_PRINT("max_batch_time=%u", sbi->s_max_batch_time);
+ if (sb->s_flags & MS_I_VERSION)
+ SEQ_OPTS_PUTS("i_version");
+ if (nodefs || sbi->s_stripe)
+ SEQ_OPTS_PRINT("stripe=%lu", sbi->s_stripe);
+ if (EXT4_MOUNT_DATA_FLAGS & (sbi->s_mount_opt ^ def_mount_opt)) {
+ if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)
+ SEQ_OPTS_PUTS("data=journal");
+ else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA)
+ SEQ_OPTS_PUTS("data=ordered");
+ else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)
+ SEQ_OPTS_PUTS("data=writeback");
+ }
+ if (nodefs ||
+ sbi->s_inode_readahead_blks != EXT4_DEF_INODE_READAHEAD_BLKS)
+ SEQ_OPTS_PRINT("inode_readahead_blks=%u",
+ sbi->s_inode_readahead_blks);
+
+ if (nodefs || (test_opt(sb, INIT_INODE_TABLE) &&
+ (sbi->s_li_wait_mult != EXT4_DEF_LI_WAIT_MULT)))
+ SEQ_OPTS_PRINT("init_itable=%u", sbi->s_li_wait_mult);
+ if (nodefs || sbi->s_max_dir_size_kb)
+ SEQ_OPTS_PRINT("max_dir_size_kb=%u", sbi->s_max_dir_size_kb);
+
+ ext4_show_quota_options(seq, sb);
+ return 0;
+}
+
+static int ext4_show_options(struct seq_file *seq, struct dentry *root)
+{
+ return _ext4_show_options(seq, root->d_sb, 0);
+}
+
+static int options_seq_show(struct seq_file *seq, void *offset)
+{
+ struct super_block *sb = seq->private;
+ int rc;
+
+ seq_puts(seq, (sb->s_flags & MS_RDONLY) ? "ro" : "rw");
+ rc = _ext4_show_options(seq, sb, 1);
+ seq_puts(seq, "\n");
+ return rc;
+}
+
+static int options_open_fs(struct inode *inode, struct file *file)
+{
+ return single_open(file, options_seq_show, PDE_DATA(inode));
+}
+
+static const struct file_operations ext4_seq_options_fops = {
+ .owner = THIS_MODULE,
+ .open = options_open_fs,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
int read_only)
{
@@ -1356,107 +1893,188 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
int res = 0;
if (le32_to_cpu(es->s_rev_level) > EXT4_MAX_SUPP_REV) {
- printk (KERN_ERR "EXT4-fs warning: revision level too high, "
- "forcing read-only mode\n");
+ ext4_msg(sb, KERN_ERR, "revision level too high, "
+ "forcing read-only mode");
res = MS_RDONLY;
}
if (read_only)
- return res;
+ goto done;
if (!(sbi->s_mount_state & EXT4_VALID_FS))
- printk (KERN_WARNING "EXT4-fs warning: mounting unchecked fs, "
- "running e2fsck is recommended\n");
- else if ((sbi->s_mount_state & EXT4_ERROR_FS))
- printk (KERN_WARNING
- "EXT4-fs warning: mounting fs with errors, "
- "running e2fsck is recommended\n");
- else if ((__s16) le16_to_cpu(es->s_max_mnt_count) >= 0 &&
+ ext4_msg(sb, KERN_WARNING, "warning: mounting unchecked fs, "
+ "running e2fsck is recommended");
+ else if (sbi->s_mount_state & EXT4_ERROR_FS)
+ ext4_msg(sb, KERN_WARNING,
+ "warning: mounting fs with errors, "
+ "running e2fsck is recommended");
+ else if ((__s16) le16_to_cpu(es->s_max_mnt_count) > 0 &&
le16_to_cpu(es->s_mnt_count) >=
(unsigned short) (__s16) le16_to_cpu(es->s_max_mnt_count))
- printk (KERN_WARNING
- "EXT4-fs warning: maximal mount count reached, "
- "running e2fsck is recommended\n");
+ ext4_msg(sb, KERN_WARNING,
+ "warning: maximal mount count reached, "
+ "running e2fsck is recommended");
else if (le32_to_cpu(es->s_checkinterval) &&
(le32_to_cpu(es->s_lastcheck) +
le32_to_cpu(es->s_checkinterval) <= get_seconds()))
- printk (KERN_WARNING
- "EXT4-fs warning: checktime reached, "
- "running e2fsck is recommended\n");
-#if 0
- /* @@@ We _will_ want to clear the valid bit if we find
- * inconsistencies, to force a fsck at reboot. But for
- * a plain journaled filesystem we can keep it set as
- * valid forever! :)
- */
- es->s_state = cpu_to_le16(le16_to_cpu(es->s_state) & ~EXT4_VALID_FS);
-#endif
+ ext4_msg(sb, KERN_WARNING,
+ "warning: checktime reached, "
+ "running e2fsck is recommended");
+ if (!sbi->s_journal)
+ es->s_state &= cpu_to_le16(~EXT4_VALID_FS);
if (!(__s16) le16_to_cpu(es->s_max_mnt_count))
es->s_max_mnt_count = cpu_to_le16(EXT4_DFL_MAX_MNT_COUNT);
- es->s_mnt_count=cpu_to_le16(le16_to_cpu(es->s_mnt_count) + 1);
+ le16_add_cpu(&es->s_mnt_count, 1);
es->s_mtime = cpu_to_le32(get_seconds());
ext4_update_dynamic_rev(sb);
- EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
+ if (sbi->s_journal)
+ EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
- ext4_commit_super(sb, es, 1);
+ ext4_commit_super(sb, 1);
+done:
if (test_opt(sb, DEBUG))
- printk(KERN_INFO "[EXT4 FS bs=%lu, gc=%lu, "
- "bpg=%lu, ipg=%lu, mo=%04lx]\n",
+ printk(KERN_INFO "[EXT4 FS bs=%lu, gc=%u, "
+ "bpg=%lu, ipg=%lu, mo=%04x, mo2=%04x]\n",
sb->s_blocksize,
sbi->s_groups_count,
EXT4_BLOCKS_PER_GROUP(sb),
EXT4_INODES_PER_GROUP(sb),
- sbi->s_mount_opt);
+ sbi->s_mount_opt, sbi->s_mount_opt2);
+
+ cleancache_init_fs(sb);
+ return res;
+}
- printk(KERN_INFO "EXT4 FS on %s, ", sb->s_id);
- if (EXT4_SB(sb)->s_journal->j_inode == NULL) {
- char b[BDEVNAME_SIZE];
+int ext4_alloc_flex_bg_array(struct super_block *sb, ext4_group_t ngroup)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct flex_groups *new_groups;
+ int size;
- printk("external journal on %s\n",
- bdevname(EXT4_SB(sb)->s_journal->j_dev, b));
- } else {
- printk("internal journal\n");
+ if (!sbi->s_log_groups_per_flex)
+ return 0;
+
+ size = ext4_flex_group(sbi, ngroup - 1) + 1;
+ if (size <= sbi->s_flex_groups_allocated)
+ return 0;
+
+ size = roundup_pow_of_two(size * sizeof(struct flex_groups));
+ new_groups = ext4_kvzalloc(size, GFP_KERNEL);
+ if (!new_groups) {
+ ext4_msg(sb, KERN_ERR, "not enough memory for %d flex groups",
+ size / (int) sizeof(struct flex_groups));
+ return -ENOMEM;
}
- return res;
+
+ if (sbi->s_flex_groups) {
+ memcpy(new_groups, sbi->s_flex_groups,
+ (sbi->s_flex_groups_allocated *
+ sizeof(struct flex_groups)));
+ ext4_kvfree(sbi->s_flex_groups);
+ }
+ sbi->s_flex_groups = new_groups;
+ sbi->s_flex_groups_allocated = size / sizeof(struct flex_groups);
+ return 0;
}
-__le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 block_group,
- struct ext4_group_desc *gdp)
+static int ext4_fill_flex_info(struct super_block *sb)
{
- __u16 crc = 0;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_group_desc *gdp = NULL;
+ ext4_group_t flex_group;
+ int i, err;
+
+ sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex;
+ if (sbi->s_log_groups_per_flex < 1 || sbi->s_log_groups_per_flex > 31) {
+ sbi->s_log_groups_per_flex = 0;
+ return 1;
+ }
- if (sbi->s_es->s_feature_ro_compat &
- cpu_to_le32(EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) {
- int offset = offsetof(struct ext4_group_desc, bg_checksum);
- __le32 le_group = cpu_to_le32(block_group);
+ err = ext4_alloc_flex_bg_array(sb, sbi->s_groups_count);
+ if (err)
+ goto failed;
- crc = crc16(~0, sbi->s_es->s_uuid, sizeof(sbi->s_es->s_uuid));
- crc = crc16(crc, (__u8 *)&le_group, sizeof(le_group));
- crc = crc16(crc, (__u8 *)gdp, offset);
- offset += sizeof(gdp->bg_checksum); /* skip checksum */
- /* for checksum of struct ext4_group_desc do the rest...*/
- if ((sbi->s_es->s_feature_incompat &
- cpu_to_le32(EXT4_FEATURE_INCOMPAT_64BIT)) &&
- offset < le16_to_cpu(sbi->s_es->s_desc_size))
- crc = crc16(crc, (__u8 *)gdp + offset,
- le16_to_cpu(sbi->s_es->s_desc_size) -
- offset);
+ for (i = 0; i < sbi->s_groups_count; i++) {
+ gdp = ext4_get_group_desc(sb, i, NULL);
+
+ flex_group = ext4_flex_group(sbi, i);
+ atomic_add(ext4_free_inodes_count(sb, gdp),
+ &sbi->s_flex_groups[flex_group].free_inodes);
+ atomic64_add(ext4_free_group_clusters(sb, gdp),
+ &sbi->s_flex_groups[flex_group].free_clusters);
+ atomic_add(ext4_used_dirs_count(sb, gdp),
+ &sbi->s_flex_groups[flex_group].used_dirs);
}
+ return 1;
+failed:
+ return 0;
+}
+
+static __le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 block_group,
+ struct ext4_group_desc *gdp)
+{
+ int offset;
+ __u16 crc = 0;
+ __le32 le_group = cpu_to_le32(block_group);
+
+ if ((sbi->s_es->s_feature_ro_compat &
+ cpu_to_le32(EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))) {
+ /* Use new metadata_csum algorithm */
+ __le16 save_csum;
+ __u32 csum32;
+
+ save_csum = gdp->bg_checksum;
+ gdp->bg_checksum = 0;
+ csum32 = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&le_group,
+ sizeof(le_group));
+ csum32 = ext4_chksum(sbi, csum32, (__u8 *)gdp,
+ sbi->s_desc_size);
+ gdp->bg_checksum = save_csum;
+
+ crc = csum32 & 0xFFFF;
+ goto out;
+ }
+
+ /* old crc16 code */
+ offset = offsetof(struct ext4_group_desc, bg_checksum);
+
+ crc = crc16(~0, sbi->s_es->s_uuid, sizeof(sbi->s_es->s_uuid));
+ crc = crc16(crc, (__u8 *)&le_group, sizeof(le_group));
+ crc = crc16(crc, (__u8 *)gdp, offset);
+ offset += sizeof(gdp->bg_checksum); /* skip checksum */
+ /* for checksum of struct ext4_group_desc do the rest...*/
+ if ((sbi->s_es->s_feature_incompat &
+ cpu_to_le32(EXT4_FEATURE_INCOMPAT_64BIT)) &&
+ offset < le16_to_cpu(sbi->s_es->s_desc_size))
+ crc = crc16(crc, (__u8 *)gdp + offset,
+ le16_to_cpu(sbi->s_es->s_desc_size) -
+ offset);
+
+out:
return cpu_to_le16(crc);
}
-int ext4_group_desc_csum_verify(struct ext4_sb_info *sbi, __u32 block_group,
+int ext4_group_desc_csum_verify(struct super_block *sb, __u32 block_group,
struct ext4_group_desc *gdp)
{
- if ((sbi->s_es->s_feature_ro_compat &
- cpu_to_le32(EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) &&
- (gdp->bg_checksum != ext4_group_desc_csum(sbi, block_group, gdp)))
+ if (ext4_has_group_desc_csum(sb) &&
+ (gdp->bg_checksum != ext4_group_desc_csum(EXT4_SB(sb),
+ block_group, gdp)))
return 0;
return 1;
}
+void ext4_group_desc_csum_set(struct super_block *sb, __u32 block_group,
+ struct ext4_group_desc *gdp)
+{
+ if (!ext4_has_group_desc_csum(sb))
+ return;
+ gdp->bg_checksum = ext4_group_desc_csum(EXT4_SB(sb), block_group, gdp);
+}
+
/* Called at mount-time, super-block is locked */
-static int ext4_check_descriptors(struct super_block *sb)
+static int ext4_check_descriptors(struct super_block *sb,
+ ext4_group_t *first_not_zeroed)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
ext4_fsblk_t first_block = le32_to_cpu(sbi->s_es->s_first_data_block);
@@ -1465,12 +2083,12 @@ static int ext4_check_descriptors(struct super_block *sb)
ext4_fsblk_t inode_bitmap;
ext4_fsblk_t inode_table;
int flexbg_flag = 0;
- ext4_group_t i;
+ ext4_group_t i, grp = sbi->s_groups_count;
if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG))
flexbg_flag = 1;
- ext4_debug ("Checking group descriptors");
+ ext4_debug("Checking group descriptors");
for (i = 0; i < sbi->s_groups_count; i++) {
struct ext4_group_desc *gdp = ext4_get_group_desc(sb, i, NULL);
@@ -1481,47 +2099,53 @@ static int ext4_check_descriptors(struct super_block *sb)
last_block = first_block +
(EXT4_BLOCKS_PER_GROUP(sb) - 1);
+ if ((grp == sbi->s_groups_count) &&
+ !(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED)))
+ grp = i;
+
block_bitmap = ext4_block_bitmap(sb, gdp);
- if (block_bitmap < first_block || block_bitmap > last_block)
- {
- ext4_error (sb, "ext4_check_descriptors",
- "Block bitmap for group %lu"
- " not in group (block %llu)!",
- i, block_bitmap);
+ if (block_bitmap < first_block || block_bitmap > last_block) {
+ ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
+ "Block bitmap for group %u not in group "
+ "(block %llu)!", i, block_bitmap);
return 0;
}
inode_bitmap = ext4_inode_bitmap(sb, gdp);
- if (inode_bitmap < first_block || inode_bitmap > last_block)
- {
- ext4_error (sb, "ext4_check_descriptors",
- "Inode bitmap for group %lu"
- " not in group (block %llu)!",
- i, inode_bitmap);
+ if (inode_bitmap < first_block || inode_bitmap > last_block) {
+ ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
+ "Inode bitmap for group %u not in group "
+ "(block %llu)!", i, inode_bitmap);
return 0;
}
inode_table = ext4_inode_table(sb, gdp);
if (inode_table < first_block ||
- inode_table + sbi->s_itb_per_group - 1 > last_block)
- {
- ext4_error (sb, "ext4_check_descriptors",
- "Inode table for group %lu"
- " not in group (block %llu)!",
- i, inode_table);
+ inode_table + sbi->s_itb_per_group - 1 > last_block) {
+ ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
+ "Inode table for group %u not in group "
+ "(block %llu)!", i, inode_table);
return 0;
}
- if (!ext4_group_desc_csum_verify(sbi, i, gdp)) {
- ext4_error(sb, __FUNCTION__,
- "Checksum for group %lu failed (%u!=%u)\n",
- i, le16_to_cpu(ext4_group_desc_csum(sbi, i,
- gdp)), le16_to_cpu(gdp->bg_checksum));
- return 0;
+ ext4_lock_group(sb, i);
+ if (!ext4_group_desc_csum_verify(sb, i, gdp)) {
+ ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
+ "Checksum for group %u failed (%u!=%u)",
+ i, le16_to_cpu(ext4_group_desc_csum(sbi, i,
+ gdp)), le16_to_cpu(gdp->bg_checksum));
+ if (!(sb->s_flags & MS_RDONLY)) {
+ ext4_unlock_group(sb, i);
+ return 0;
+ }
}
+ ext4_unlock_group(sb, i);
if (!flexbg_flag)
first_block += EXT4_BLOCKS_PER_GROUP(sb);
}
+ if (NULL != first_not_zeroed)
+ *first_not_zeroed = grp;
- ext4_free_blocks_count_set(sbi->s_es, ext4_count_free_blocks(sb));
- sbi->s_es->s_free_inodes_count=cpu_to_le32(ext4_count_free_inodes(sb));
+ ext4_free_blocks_count_set(sbi->s_es,
+ EXT4_C2B(sbi, ext4_count_free_clusters(sb)));
+ sbi->s_es->s_free_inodes_count =cpu_to_le32(ext4_count_free_inodes(sb));
return 1;
}
@@ -1542,8 +2166,8 @@ static int ext4_check_descriptors(struct super_block *sb)
* e2fsck was run on this filesystem, and it must have already done the orphan
* inode cleanup for us, so we can safely abort without any further action.
*/
-static void ext4_orphan_cleanup (struct super_block * sb,
- struct ext4_super_block * es)
+static void ext4_orphan_cleanup(struct super_block *sb,
+ struct ext4_super_block *es)
{
unsigned int s_flags = sb->s_flags;
int nr_orphans = 0, nr_truncates = 0;
@@ -1556,23 +2180,31 @@ static void ext4_orphan_cleanup (struct super_block * sb,
}
if (bdev_read_only(sb->s_bdev)) {
- printk(KERN_ERR "EXT4-fs: write access "
- "unavailable, skipping orphan cleanup.\n");
+ ext4_msg(sb, KERN_ERR, "write access "
+ "unavailable, skipping orphan cleanup");
+ return;
+ }
+
+ /* Check if feature set would not allow a r/w mount */
+ if (!ext4_feature_set_ok(sb, 0)) {
+ ext4_msg(sb, KERN_INFO, "Skipping orphan cleanup due to "
+ "unknown ROCOMPAT features");
return;
}
if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) {
- if (es->s_last_orphan)
+ /* don't clear list on RO mount w/ errors */
+ if (es->s_last_orphan && !(s_flags & MS_RDONLY)) {
jbd_debug(1, "Errors on filesystem, "
"clearing orphan list.\n");
- es->s_last_orphan = 0;
+ es->s_last_orphan = 0;
+ }
jbd_debug(1, "Skipping orphan recovery on fs with errors.\n");
return;
}
if (s_flags & MS_RDONLY) {
- printk(KERN_INFO "EXT4-fs: %s: orphan cleanup on readonly fs\n",
- sb->s_id);
+ ext4_msg(sb, KERN_INFO, "orphan cleanup on readonly fs");
sb->s_flags &= ~MS_RDONLY;
}
#ifdef CONFIG_QUOTA
@@ -1583,9 +2215,9 @@ static void ext4_orphan_cleanup (struct super_block * sb,
if (EXT4_SB(sb)->s_qf_names[i]) {
int ret = ext4_quota_on_mount(sb, i);
if (ret < 0)
- printk(KERN_ERR
- "EXT4-fs: Cannot turn on journalled "
- "quota: error %d\n", ret);
+ ext4_msg(sb, KERN_ERR,
+ "Cannot turn on journaled "
+ "quota: error %d", ret);
}
}
#endif
@@ -1593,26 +2225,31 @@ static void ext4_orphan_cleanup (struct super_block * sb,
while (es->s_last_orphan) {
struct inode *inode;
- if (!(inode =
- ext4_orphan_get(sb, le32_to_cpu(es->s_last_orphan)))) {
+ inode = ext4_orphan_get(sb, le32_to_cpu(es->s_last_orphan));
+ if (IS_ERR(inode)) {
es->s_last_orphan = 0;
break;
}
list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan);
- DQUOT_INIT(inode);
+ dquot_initialize(inode);
if (inode->i_nlink) {
- printk(KERN_DEBUG
- "%s: truncating inode %lu to %Ld bytes\n",
- __FUNCTION__, inode->i_ino, inode->i_size);
- jbd_debug(2, "truncating inode %lu to %Ld bytes\n",
+ if (test_opt(sb, DEBUG))
+ ext4_msg(sb, KERN_DEBUG,
+ "%s: truncating inode %lu to %lld bytes",
+ __func__, inode->i_ino, inode->i_size);
+ jbd_debug(2, "truncating inode %lu to %lld bytes\n",
inode->i_ino, inode->i_size);
+ mutex_lock(&inode->i_mutex);
+ truncate_inode_pages(inode->i_mapping, inode->i_size);
ext4_truncate(inode);
+ mutex_unlock(&inode->i_mutex);
nr_truncates++;
} else {
- printk(KERN_DEBUG
- "%s: deleting unreferenced inode %lu\n",
- __FUNCTION__, inode->i_ino);
+ if (test_opt(sb, DEBUG))
+ ext4_msg(sb, KERN_DEBUG,
+ "%s: deleting unreferenced inode %lu",
+ __func__, inode->i_ino);
jbd_debug(2, "deleting unreferenced inode %lu\n",
inode->i_ino);
nr_orphans++;
@@ -1620,23 +2257,24 @@ static void ext4_orphan_cleanup (struct super_block * sb,
iput(inode); /* The delete magic happens here! */
}
-#define PLURAL(x) (x), ((x)==1) ? "" : "s"
+#define PLURAL(x) (x), ((x) == 1) ? "" : "s"
if (nr_orphans)
- printk(KERN_INFO "EXT4-fs: %s: %d orphan inode%s deleted\n",
- sb->s_id, PLURAL(nr_orphans));
+ ext4_msg(sb, KERN_INFO, "%d orphan inode%s deleted",
+ PLURAL(nr_orphans));
if (nr_truncates)
- printk(KERN_INFO "EXT4-fs: %s: %d truncate%s cleaned up\n",
- sb->s_id, PLURAL(nr_truncates));
+ ext4_msg(sb, KERN_INFO, "%d truncate%s cleaned up",
+ PLURAL(nr_truncates));
#ifdef CONFIG_QUOTA
/* Turn quotas off */
for (i = 0; i < MAXQUOTAS; i++) {
if (sb_dqopt(sb)->files[i])
- vfs_quota_off(sb, i);
+ dquot_quota_off(sb, i);
}
#endif
sb->s_flags = s_flags; /* Restore MS_RDONLY status */
}
+
/*
* Maximal extent format file size.
* Resulting logical blkno at s_maxbytes must fit in our on-disk
@@ -1644,17 +2282,23 @@ static void ext4_orphan_cleanup (struct super_block * sb,
* in the vfs. ext4 inode has 48 bits of i_block in fsblock units,
* so that won't be a limiting factor.
*
+ * However there is other limiting factor. We do store extents in the form
+ * of starting block and length, hence the resulting length of the extent
+ * covering maximum file size must fit into on-disk format containers as
+ * well. Given that length is always by 1 unit bigger than max unit (because
+ * we count 0 as well) we have to lower the s_maxbytes by one fs block.
+ *
* Note, this does *not* consider any metadata overhead for vfs i_blocks.
*/
-static loff_t ext4_max_size(int blkbits)
+static loff_t ext4_max_size(int blkbits, int has_huge_files)
{
loff_t res;
loff_t upper_limit = MAX_LFS_FILESIZE;
/* small i_blocks in vfs inode? */
- if (sizeof(blkcnt_t) < sizeof(u64)) {
+ if (!has_huge_files || sizeof(blkcnt_t) < sizeof(u64)) {
/*
- * CONFIG_LSF is not enabled implies the inode
+ * CONFIG_LBDAF is not enabled implies the inode
* i_block represent total blocks in 512 bytes
* 32 == size of vfs inode i_blocks * 8
*/
@@ -1665,10 +2309,13 @@ static loff_t ext4_max_size(int blkbits)
upper_limit <<= blkbits;
}
- /* 32-bit extent-start container, ee_block */
- res = 1LL << 32;
+ /*
+ * 32-bit extent-start container, ee_block. We lower the maxbytes
+ * by one fs block, so ee_len can cover the extent of maximum file
+ * size
+ */
+ res = (1LL << 32) - 1;
res <<= blkbits;
- res -= 1;
/* Sanity check against vm- & vfs- imposed limits */
if (res > upper_limit)
@@ -1682,24 +2329,24 @@ static loff_t ext4_max_size(int blkbits)
* block limit, and also a limit of (2^48 - 1) 512-byte sectors in i_blocks.
* We need to be 1 filesystem block less than the 2^48 sector limit.
*/
-static loff_t ext4_max_bitmap_size(int bits)
+static loff_t ext4_max_bitmap_size(int bits, int has_huge_files)
{
loff_t res = EXT4_NDIR_BLOCKS;
int meta_blocks;
loff_t upper_limit;
- /* This is calculated to be the largest file size for a
- * dense, bitmapped file such that the total number of
- * sectors in the file, including data and all indirect blocks,
- * does not exceed 2^48 -1
- * __u32 i_blocks_lo and _u16 i_blocks_high representing the
- * total number of 512 bytes blocks of the file
+ /* This is calculated to be the largest file size for a dense, block
+ * mapped file such that the file's total number of 512-byte sectors,
+ * including data and all indirect blocks, does not exceed (2^48 - 1).
+ *
+ * __u32 i_blocks_lo and _u16 i_blocks_high represent the total
+ * number of 512-byte sectors of the file.
*/
- if (sizeof(blkcnt_t) < sizeof(u64)) {
+ if (!has_huge_files || sizeof(blkcnt_t) < sizeof(u64)) {
/*
- * CONFIG_LSF is not enabled implies the inode
- * i_block represent total blocks in 512 bytes
- * 32 == size of vfs inode i_blocks * 8
+ * !has_huge_files or CONFIG_LBDAF not enabled implies that
+ * the inode i_block field represents total file blocks in
+ * 2^32 512-byte sectors == size of vfs inode i_blocks * 8
*/
upper_limit = (1LL << 32) - 1;
@@ -1741,7 +2388,7 @@ static loff_t ext4_max_bitmap_size(int bits)
}
static ext4_fsblk_t descriptor_loc(struct super_block *sb,
- ext4_fsblk_t logical_sb_block, int nr)
+ ext4_fsblk_t logical_sb_block, int nr)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
ext4_group_t bg, first_meta_bg;
@@ -1755,6 +2402,17 @@ static ext4_fsblk_t descriptor_loc(struct super_block *sb,
bg = sbi->s_desc_per_block * nr;
if (ext4_bg_has_super(sb, bg))
has_super = 1;
+
+ /*
+ * If we have a meta_bg fs with 1k blocks, group 0's GDT is at
+ * block 2, not 1. If s_first_data_block == 0 (bigalloc is enabled
+ * on modern mke2fs or blksize > 1k on older mke2fs) then we must
+ * compensate.
+ */
+ if (sb->s_blocksize == 1024 && nr == 0 &&
+ le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block) == 0)
+ has_super++;
+
return (has_super + ext4_group_first_block_no(sb, bg));
}
@@ -1774,63 +2432,999 @@ static unsigned long ext4_get_stripe_size(struct ext4_sb_info *sbi)
unsigned long stride = le16_to_cpu(sbi->s_es->s_raid_stride);
unsigned long stripe_width =
le32_to_cpu(sbi->s_es->s_raid_stripe_width);
+ int ret;
if (sbi->s_stripe && sbi->s_stripe <= sbi->s_blocks_per_group)
- return sbi->s_stripe;
+ ret = sbi->s_stripe;
+ else if (stripe_width <= sbi->s_blocks_per_group)
+ ret = stripe_width;
+ else if (stride <= sbi->s_blocks_per_group)
+ ret = stride;
+ else
+ ret = 0;
+
+ /*
+ * If the stripe width is 1, this makes no sense and
+ * we set it to 0 to turn off stripe handling code.
+ */
+ if (ret <= 1)
+ ret = 0;
+
+ return ret;
+}
+
+/* sysfs supprt */
+
+struct ext4_attr {
+ struct attribute attr;
+ ssize_t (*show)(struct ext4_attr *, struct ext4_sb_info *, char *);
+ ssize_t (*store)(struct ext4_attr *, struct ext4_sb_info *,
+ const char *, size_t);
+ union {
+ int offset;
+ int deprecated_val;
+ } u;
+};
+
+static int parse_strtoull(const char *buf,
+ unsigned long long max, unsigned long long *value)
+{
+ int ret;
+
+ ret = kstrtoull(skip_spaces(buf), 0, value);
+ if (!ret && *value > max)
+ ret = -EINVAL;
+ return ret;
+}
+
+static ssize_t delayed_allocation_blocks_show(struct ext4_attr *a,
+ struct ext4_sb_info *sbi,
+ char *buf)
+{
+ return snprintf(buf, PAGE_SIZE, "%llu\n",
+ (s64) EXT4_C2B(sbi,
+ percpu_counter_sum(&sbi->s_dirtyclusters_counter)));
+}
+
+static ssize_t session_write_kbytes_show(struct ext4_attr *a,
+ struct ext4_sb_info *sbi, char *buf)
+{
+ struct super_block *sb = sbi->s_buddy_cache->i_sb;
+
+ if (!sb->s_bdev->bd_part)
+ return snprintf(buf, PAGE_SIZE, "0\n");
+ return snprintf(buf, PAGE_SIZE, "%lu\n",
+ (part_stat_read(sb->s_bdev->bd_part, sectors[1]) -
+ sbi->s_sectors_written_start) >> 1);
+}
+
+static ssize_t lifetime_write_kbytes_show(struct ext4_attr *a,
+ struct ext4_sb_info *sbi, char *buf)
+{
+ struct super_block *sb = sbi->s_buddy_cache->i_sb;
+
+ if (!sb->s_bdev->bd_part)
+ return snprintf(buf, PAGE_SIZE, "0\n");
+ return snprintf(buf, PAGE_SIZE, "%llu\n",
+ (unsigned long long)(sbi->s_kbytes_written +
+ ((part_stat_read(sb->s_bdev->bd_part, sectors[1]) -
+ EXT4_SB(sb)->s_sectors_written_start) >> 1)));
+}
+
+static ssize_t inode_readahead_blks_store(struct ext4_attr *a,
+ struct ext4_sb_info *sbi,
+ const char *buf, size_t count)
+{
+ unsigned long t;
+ int ret;
+
+ ret = kstrtoul(skip_spaces(buf), 0, &t);
+ if (ret)
+ return ret;
+
+ if (t && (!is_power_of_2(t) || t > 0x40000000))
+ return -EINVAL;
+
+ sbi->s_inode_readahead_blks = t;
+ return count;
+}
+
+static ssize_t sbi_ui_show(struct ext4_attr *a,
+ struct ext4_sb_info *sbi, char *buf)
+{
+ unsigned int *ui = (unsigned int *) (((char *) sbi) + a->u.offset);
+
+ return snprintf(buf, PAGE_SIZE, "%u\n", *ui);
+}
+
+static ssize_t sbi_ui_store(struct ext4_attr *a,
+ struct ext4_sb_info *sbi,
+ const char *buf, size_t count)
+{
+ unsigned int *ui = (unsigned int *) (((char *) sbi) + a->u.offset);
+ unsigned long t;
+ int ret;
+
+ ret = kstrtoul(skip_spaces(buf), 0, &t);
+ if (ret)
+ return ret;
+ *ui = t;
+ return count;
+}
+
+static ssize_t reserved_clusters_show(struct ext4_attr *a,
+ struct ext4_sb_info *sbi, char *buf)
+{
+ return snprintf(buf, PAGE_SIZE, "%llu\n",
+ (unsigned long long) atomic64_read(&sbi->s_resv_clusters));
+}
+
+static ssize_t reserved_clusters_store(struct ext4_attr *a,
+ struct ext4_sb_info *sbi,
+ const char *buf, size_t count)
+{
+ unsigned long long val;
+ int ret;
+
+ if (parse_strtoull(buf, -1ULL, &val))
+ return -EINVAL;
+ ret = ext4_reserve_clusters(sbi, val);
+
+ return ret ? ret : count;
+}
+
+static ssize_t trigger_test_error(struct ext4_attr *a,
+ struct ext4_sb_info *sbi,
+ const char *buf, size_t count)
+{
+ int len = count;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (len && buf[len-1] == '\n')
+ len--;
+
+ if (len)
+ ext4_error(sbi->s_sb, "%.*s", len, buf);
+ return count;
+}
+
+static ssize_t sbi_deprecated_show(struct ext4_attr *a,
+ struct ext4_sb_info *sbi, char *buf)
+{
+ return snprintf(buf, PAGE_SIZE, "%d\n", a->u.deprecated_val);
+}
+
+#define EXT4_ATTR_OFFSET(_name,_mode,_show,_store,_elname) \
+static struct ext4_attr ext4_attr_##_name = { \
+ .attr = {.name = __stringify(_name), .mode = _mode }, \
+ .show = _show, \
+ .store = _store, \
+ .u = { \
+ .offset = offsetof(struct ext4_sb_info, _elname),\
+ }, \
+}
+#define EXT4_ATTR(name, mode, show, store) \
+static struct ext4_attr ext4_attr_##name = __ATTR(name, mode, show, store)
+
+#define EXT4_INFO_ATTR(name) EXT4_ATTR(name, 0444, NULL, NULL)
+#define EXT4_RO_ATTR(name) EXT4_ATTR(name, 0444, name##_show, NULL)
+#define EXT4_RW_ATTR(name) EXT4_ATTR(name, 0644, name##_show, name##_store)
+#define EXT4_RW_ATTR_SBI_UI(name, elname) \
+ EXT4_ATTR_OFFSET(name, 0644, sbi_ui_show, sbi_ui_store, elname)
+#define ATTR_LIST(name) &ext4_attr_##name.attr
+#define EXT4_DEPRECATED_ATTR(_name, _val) \
+static struct ext4_attr ext4_attr_##_name = { \
+ .attr = {.name = __stringify(_name), .mode = 0444 }, \
+ .show = sbi_deprecated_show, \
+ .u = { \
+ .deprecated_val = _val, \
+ }, \
+}
+
+EXT4_RO_ATTR(delayed_allocation_blocks);
+EXT4_RO_ATTR(session_write_kbytes);
+EXT4_RO_ATTR(lifetime_write_kbytes);
+EXT4_RW_ATTR(reserved_clusters);
+EXT4_ATTR_OFFSET(inode_readahead_blks, 0644, sbi_ui_show,
+ inode_readahead_blks_store, s_inode_readahead_blks);
+EXT4_RW_ATTR_SBI_UI(inode_goal, s_inode_goal);
+EXT4_RW_ATTR_SBI_UI(mb_stats, s_mb_stats);
+EXT4_RW_ATTR_SBI_UI(mb_max_to_scan, s_mb_max_to_scan);
+EXT4_RW_ATTR_SBI_UI(mb_min_to_scan, s_mb_min_to_scan);
+EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs);
+EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request);
+EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc);
+EXT4_DEPRECATED_ATTR(max_writeback_mb_bump, 128);
+EXT4_RW_ATTR_SBI_UI(extent_max_zeroout_kb, s_extent_max_zeroout_kb);
+EXT4_ATTR(trigger_fs_error, 0200, NULL, trigger_test_error);
+EXT4_RW_ATTR_SBI_UI(err_ratelimit_interval_ms, s_err_ratelimit_state.interval);
+EXT4_RW_ATTR_SBI_UI(err_ratelimit_burst, s_err_ratelimit_state.burst);
+EXT4_RW_ATTR_SBI_UI(warning_ratelimit_interval_ms, s_warning_ratelimit_state.interval);
+EXT4_RW_ATTR_SBI_UI(warning_ratelimit_burst, s_warning_ratelimit_state.burst);
+EXT4_RW_ATTR_SBI_UI(msg_ratelimit_interval_ms, s_msg_ratelimit_state.interval);
+EXT4_RW_ATTR_SBI_UI(msg_ratelimit_burst, s_msg_ratelimit_state.burst);
+
+static struct attribute *ext4_attrs[] = {
+ ATTR_LIST(delayed_allocation_blocks),
+ ATTR_LIST(session_write_kbytes),
+ ATTR_LIST(lifetime_write_kbytes),
+ ATTR_LIST(reserved_clusters),
+ ATTR_LIST(inode_readahead_blks),
+ ATTR_LIST(inode_goal),
+ ATTR_LIST(mb_stats),
+ ATTR_LIST(mb_max_to_scan),
+ ATTR_LIST(mb_min_to_scan),
+ ATTR_LIST(mb_order2_req),
+ ATTR_LIST(mb_stream_req),
+ ATTR_LIST(mb_group_prealloc),
+ ATTR_LIST(max_writeback_mb_bump),
+ ATTR_LIST(extent_max_zeroout_kb),
+ ATTR_LIST(trigger_fs_error),
+ ATTR_LIST(err_ratelimit_interval_ms),
+ ATTR_LIST(err_ratelimit_burst),
+ ATTR_LIST(warning_ratelimit_interval_ms),
+ ATTR_LIST(warning_ratelimit_burst),
+ ATTR_LIST(msg_ratelimit_interval_ms),
+ ATTR_LIST(msg_ratelimit_burst),
+ NULL,
+};
+
+/* Features this copy of ext4 supports */
+EXT4_INFO_ATTR(lazy_itable_init);
+EXT4_INFO_ATTR(batched_discard);
+EXT4_INFO_ATTR(meta_bg_resize);
+
+static struct attribute *ext4_feat_attrs[] = {
+ ATTR_LIST(lazy_itable_init),
+ ATTR_LIST(batched_discard),
+ ATTR_LIST(meta_bg_resize),
+ NULL,
+};
+
+static ssize_t ext4_attr_show(struct kobject *kobj,
+ struct attribute *attr, char *buf)
+{
+ struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info,
+ s_kobj);
+ struct ext4_attr *a = container_of(attr, struct ext4_attr, attr);
+
+ return a->show ? a->show(a, sbi, buf) : 0;
+}
+
+static ssize_t ext4_attr_store(struct kobject *kobj,
+ struct attribute *attr,
+ const char *buf, size_t len)
+{
+ struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info,
+ s_kobj);
+ struct ext4_attr *a = container_of(attr, struct ext4_attr, attr);
+
+ return a->store ? a->store(a, sbi, buf, len) : 0;
+}
+
+static void ext4_sb_release(struct kobject *kobj)
+{
+ struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info,
+ s_kobj);
+ complete(&sbi->s_kobj_unregister);
+}
+
+static const struct sysfs_ops ext4_attr_ops = {
+ .show = ext4_attr_show,
+ .store = ext4_attr_store,
+};
+
+static struct kobj_type ext4_ktype = {
+ .default_attrs = ext4_attrs,
+ .sysfs_ops = &ext4_attr_ops,
+ .release = ext4_sb_release,
+};
+
+static void ext4_feat_release(struct kobject *kobj)
+{
+ complete(&ext4_feat->f_kobj_unregister);
+}
+
+static struct kobj_type ext4_feat_ktype = {
+ .default_attrs = ext4_feat_attrs,
+ .sysfs_ops = &ext4_attr_ops,
+ .release = ext4_feat_release,
+};
+
+/*
+ * Check whether this filesystem can be mounted based on
+ * the features present and the RDONLY/RDWR mount requested.
+ * Returns 1 if this filesystem can be mounted as requested,
+ * 0 if it cannot be.
+ */
+static int ext4_feature_set_ok(struct super_block *sb, int readonly)
+{
+ if (EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT4_FEATURE_INCOMPAT_SUPP)) {
+ ext4_msg(sb, KERN_ERR,
+ "Couldn't mount because of "
+ "unsupported optional features (%x)",
+ (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_incompat) &
+ ~EXT4_FEATURE_INCOMPAT_SUPP));
+ return 0;
+ }
+
+ if (readonly)
+ return 1;
+
+ /* Check that feature set is OK for a read-write mount */
+ if (EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT4_FEATURE_RO_COMPAT_SUPP)) {
+ ext4_msg(sb, KERN_ERR, "couldn't mount RDWR because of "
+ "unsupported optional features (%x)",
+ (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_ro_compat) &
+ ~EXT4_FEATURE_RO_COMPAT_SUPP));
+ return 0;
+ }
+ /*
+ * Large file size enabled file system can only be mounted
+ * read-write on 32-bit systems if kernel is built with CONFIG_LBDAF
+ */
+ if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_HUGE_FILE)) {
+ if (sizeof(blkcnt_t) < sizeof(u64)) {
+ ext4_msg(sb, KERN_ERR, "Filesystem with huge files "
+ "cannot be mounted RDWR without "
+ "CONFIG_LBDAF");
+ return 0;
+ }
+ }
+ if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_BIGALLOC) &&
+ !EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) {
+ ext4_msg(sb, KERN_ERR,
+ "Can't support bigalloc feature without "
+ "extents feature\n");
+ return 0;
+ }
+
+#ifndef CONFIG_QUOTA
+ if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA) &&
+ !readonly) {
+ ext4_msg(sb, KERN_ERR,
+ "Filesystem with quota feature cannot be mounted RDWR "
+ "without CONFIG_QUOTA");
+ return 0;
+ }
+#endif /* CONFIG_QUOTA */
+ return 1;
+}
+
+/*
+ * This function is called once a day if we have errors logged
+ * on the file system
+ */
+static void print_daily_error_info(unsigned long arg)
+{
+ struct super_block *sb = (struct super_block *) arg;
+ struct ext4_sb_info *sbi;
+ struct ext4_super_block *es;
+
+ sbi = EXT4_SB(sb);
+ es = sbi->s_es;
+
+ if (es->s_error_count)
+ /* fsck newer than v1.41.13 is needed to clean this condition. */
+ ext4_msg(sb, KERN_NOTICE, "error count since last fsck: %u",
+ le32_to_cpu(es->s_error_count));
+ if (es->s_first_error_time) {
+ printk(KERN_NOTICE "EXT4-fs (%s): initial error at time %u: %.*s:%d",
+ sb->s_id, le32_to_cpu(es->s_first_error_time),
+ (int) sizeof(es->s_first_error_func),
+ es->s_first_error_func,
+ le32_to_cpu(es->s_first_error_line));
+ if (es->s_first_error_ino)
+ printk(": inode %u",
+ le32_to_cpu(es->s_first_error_ino));
+ if (es->s_first_error_block)
+ printk(": block %llu", (unsigned long long)
+ le64_to_cpu(es->s_first_error_block));
+ printk("\n");
+ }
+ if (es->s_last_error_time) {
+ printk(KERN_NOTICE "EXT4-fs (%s): last error at time %u: %.*s:%d",
+ sb->s_id, le32_to_cpu(es->s_last_error_time),
+ (int) sizeof(es->s_last_error_func),
+ es->s_last_error_func,
+ le32_to_cpu(es->s_last_error_line));
+ if (es->s_last_error_ino)
+ printk(": inode %u",
+ le32_to_cpu(es->s_last_error_ino));
+ if (es->s_last_error_block)
+ printk(": block %llu", (unsigned long long)
+ le64_to_cpu(es->s_last_error_block));
+ printk("\n");
+ }
+ mod_timer(&sbi->s_err_report, jiffies + 24*60*60*HZ); /* Once a day */
+}
+
+/* Find next suitable group and run ext4_init_inode_table */
+static int ext4_run_li_request(struct ext4_li_request *elr)
+{
+ struct ext4_group_desc *gdp = NULL;
+ ext4_group_t group, ngroups;
+ struct super_block *sb;
+ unsigned long timeout = 0;
+ int ret = 0;
+
+ sb = elr->lr_super;
+ ngroups = EXT4_SB(sb)->s_groups_count;
+
+ sb_start_write(sb);
+ for (group = elr->lr_next_group; group < ngroups; group++) {
+ gdp = ext4_get_group_desc(sb, group, NULL);
+ if (!gdp) {
+ ret = 1;
+ break;
+ }
+
+ if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED)))
+ break;
+ }
+
+ if (group >= ngroups)
+ ret = 1;
+
+ if (!ret) {
+ timeout = jiffies;
+ ret = ext4_init_inode_table(sb, group,
+ elr->lr_timeout ? 0 : 1);
+ if (elr->lr_timeout == 0) {
+ timeout = (jiffies - timeout) *
+ elr->lr_sbi->s_li_wait_mult;
+ elr->lr_timeout = timeout;
+ }
+ elr->lr_next_sched = jiffies + elr->lr_timeout;
+ elr->lr_next_group = group + 1;
+ }
+ sb_end_write(sb);
+
+ return ret;
+}
+
+/*
+ * Remove lr_request from the list_request and free the
+ * request structure. Should be called with li_list_mtx held
+ */
+static void ext4_remove_li_request(struct ext4_li_request *elr)
+{
+ struct ext4_sb_info *sbi;
+
+ if (!elr)
+ return;
+
+ sbi = elr->lr_sbi;
+
+ list_del(&elr->lr_request);
+ sbi->s_li_request = NULL;
+ kfree(elr);
+}
+
+static void ext4_unregister_li_request(struct super_block *sb)
+{
+ mutex_lock(&ext4_li_mtx);
+ if (!ext4_li_info) {
+ mutex_unlock(&ext4_li_mtx);
+ return;
+ }
+
+ mutex_lock(&ext4_li_info->li_list_mtx);
+ ext4_remove_li_request(EXT4_SB(sb)->s_li_request);
+ mutex_unlock(&ext4_li_info->li_list_mtx);
+ mutex_unlock(&ext4_li_mtx);
+}
+
+static struct task_struct *ext4_lazyinit_task;
+
+/*
+ * This is the function where ext4lazyinit thread lives. It walks
+ * through the request list searching for next scheduled filesystem.
+ * When such a fs is found, run the lazy initialization request
+ * (ext4_rn_li_request) and keep track of the time spend in this
+ * function. Based on that time we compute next schedule time of
+ * the request. When walking through the list is complete, compute
+ * next waking time and put itself into sleep.
+ */
+static int ext4_lazyinit_thread(void *arg)
+{
+ struct ext4_lazy_init *eli = (struct ext4_lazy_init *)arg;
+ struct list_head *pos, *n;
+ struct ext4_li_request *elr;
+ unsigned long next_wakeup, cur;
+
+ BUG_ON(NULL == eli);
+
+cont_thread:
+ while (true) {
+ next_wakeup = MAX_JIFFY_OFFSET;
+
+ mutex_lock(&eli->li_list_mtx);
+ if (list_empty(&eli->li_request_list)) {
+ mutex_unlock(&eli->li_list_mtx);
+ goto exit_thread;
+ }
+
+ list_for_each_safe(pos, n, &eli->li_request_list) {
+ elr = list_entry(pos, struct ext4_li_request,
+ lr_request);
+
+ if (time_after_eq(jiffies, elr->lr_next_sched)) {
+ if (ext4_run_li_request(elr) != 0) {
+ /* error, remove the lazy_init job */
+ ext4_remove_li_request(elr);
+ continue;
+ }
+ }
+
+ if (time_before(elr->lr_next_sched, next_wakeup))
+ next_wakeup = elr->lr_next_sched;
+ }
+ mutex_unlock(&eli->li_list_mtx);
+
+ try_to_freeze();
+
+ cur = jiffies;
+ if ((time_after_eq(cur, next_wakeup)) ||
+ (MAX_JIFFY_OFFSET == next_wakeup)) {
+ cond_resched();
+ continue;
+ }
+
+ schedule_timeout_interruptible(next_wakeup - cur);
+
+ if (kthread_should_stop()) {
+ ext4_clear_request_list();
+ goto exit_thread;
+ }
+ }
+
+exit_thread:
+ /*
+ * It looks like the request list is empty, but we need
+ * to check it under the li_list_mtx lock, to prevent any
+ * additions into it, and of course we should lock ext4_li_mtx
+ * to atomically free the list and ext4_li_info, because at
+ * this point another ext4 filesystem could be registering
+ * new one.
+ */
+ mutex_lock(&ext4_li_mtx);
+ mutex_lock(&eli->li_list_mtx);
+ if (!list_empty(&eli->li_request_list)) {
+ mutex_unlock(&eli->li_list_mtx);
+ mutex_unlock(&ext4_li_mtx);
+ goto cont_thread;
+ }
+ mutex_unlock(&eli->li_list_mtx);
+ kfree(ext4_li_info);
+ ext4_li_info = NULL;
+ mutex_unlock(&ext4_li_mtx);
+
+ return 0;
+}
+
+static void ext4_clear_request_list(void)
+{
+ struct list_head *pos, *n;
+ struct ext4_li_request *elr;
+
+ mutex_lock(&ext4_li_info->li_list_mtx);
+ list_for_each_safe(pos, n, &ext4_li_info->li_request_list) {
+ elr = list_entry(pos, struct ext4_li_request,
+ lr_request);
+ ext4_remove_li_request(elr);
+ }
+ mutex_unlock(&ext4_li_info->li_list_mtx);
+}
+
+static int ext4_run_lazyinit_thread(void)
+{
+ ext4_lazyinit_task = kthread_run(ext4_lazyinit_thread,
+ ext4_li_info, "ext4lazyinit");
+ if (IS_ERR(ext4_lazyinit_task)) {
+ int err = PTR_ERR(ext4_lazyinit_task);
+ ext4_clear_request_list();
+ kfree(ext4_li_info);
+ ext4_li_info = NULL;
+ printk(KERN_CRIT "EXT4-fs: error %d creating inode table "
+ "initialization thread\n",
+ err);
+ return err;
+ }
+ ext4_li_info->li_state |= EXT4_LAZYINIT_RUNNING;
+ return 0;
+}
+
+/*
+ * Check whether it make sense to run itable init. thread or not.
+ * If there is at least one uninitialized inode table, return
+ * corresponding group number, else the loop goes through all
+ * groups and return total number of groups.
+ */
+static ext4_group_t ext4_has_uninit_itable(struct super_block *sb)
+{
+ ext4_group_t group, ngroups = EXT4_SB(sb)->s_groups_count;
+ struct ext4_group_desc *gdp = NULL;
+
+ for (group = 0; group < ngroups; group++) {
+ gdp = ext4_get_group_desc(sb, group, NULL);
+ if (!gdp)
+ continue;
+
+ if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED)))
+ break;
+ }
+
+ return group;
+}
+
+static int ext4_li_info_new(void)
+{
+ struct ext4_lazy_init *eli = NULL;
+
+ eli = kzalloc(sizeof(*eli), GFP_KERNEL);
+ if (!eli)
+ return -ENOMEM;
- if (stripe_width <= sbi->s_blocks_per_group)
- return stripe_width;
+ INIT_LIST_HEAD(&eli->li_request_list);
+ mutex_init(&eli->li_list_mtx);
- if (stride <= sbi->s_blocks_per_group)
- return stride;
+ eli->li_state |= EXT4_LAZYINIT_QUIT;
+
+ ext4_li_info = eli;
return 0;
}
-static int ext4_fill_super (struct super_block *sb, void *data, int silent)
- __releases(kernel_sem)
- __acquires(kernel_sem)
+static struct ext4_li_request *ext4_li_request_new(struct super_block *sb,
+ ext4_group_t start)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_li_request *elr;
+
+ elr = kzalloc(sizeof(*elr), GFP_KERNEL);
+ if (!elr)
+ return NULL;
+
+ elr->lr_super = sb;
+ elr->lr_sbi = sbi;
+ elr->lr_next_group = start;
+
+ /*
+ * Randomize first schedule time of the request to
+ * spread the inode table initialization requests
+ * better.
+ */
+ elr->lr_next_sched = jiffies + (prandom_u32() %
+ (EXT4_DEF_LI_MAX_START_DELAY * HZ));
+ return elr;
+}
+
+int ext4_register_li_request(struct super_block *sb,
+ ext4_group_t first_not_zeroed)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_li_request *elr = NULL;
+ ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
+ int ret = 0;
+
+ mutex_lock(&ext4_li_mtx);
+ if (sbi->s_li_request != NULL) {
+ /*
+ * Reset timeout so it can be computed again, because
+ * s_li_wait_mult might have changed.
+ */
+ sbi->s_li_request->lr_timeout = 0;
+ goto out;
+ }
+
+ if (first_not_zeroed == ngroups ||
+ (sb->s_flags & MS_RDONLY) ||
+ !test_opt(sb, INIT_INODE_TABLE))
+ goto out;
+
+ elr = ext4_li_request_new(sb, first_not_zeroed);
+ if (!elr) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ if (NULL == ext4_li_info) {
+ ret = ext4_li_info_new();
+ if (ret)
+ goto out;
+ }
+
+ mutex_lock(&ext4_li_info->li_list_mtx);
+ list_add(&elr->lr_request, &ext4_li_info->li_request_list);
+ mutex_unlock(&ext4_li_info->li_list_mtx);
+
+ sbi->s_li_request = elr;
+ /*
+ * set elr to NULL here since it has been inserted to
+ * the request_list and the removal and free of it is
+ * handled by ext4_clear_request_list from now on.
+ */
+ elr = NULL;
+
+ if (!(ext4_li_info->li_state & EXT4_LAZYINIT_RUNNING)) {
+ ret = ext4_run_lazyinit_thread();
+ if (ret)
+ goto out;
+ }
+out:
+ mutex_unlock(&ext4_li_mtx);
+ if (ret)
+ kfree(elr);
+ return ret;
+}
+
+/*
+ * We do not need to lock anything since this is called on
+ * module unload.
+ */
+static void ext4_destroy_lazyinit_thread(void)
+{
+ /*
+ * If thread exited earlier
+ * there's nothing to be done.
+ */
+ if (!ext4_li_info || !ext4_lazyinit_task)
+ return;
+
+ kthread_stop(ext4_lazyinit_task);
+}
+
+static int set_journal_csum_feature_set(struct super_block *sb)
+{
+ int ret = 1;
+ int compat, incompat;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+ if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
+ EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) {
+ /* journal checksum v2 */
+ compat = 0;
+ incompat = JBD2_FEATURE_INCOMPAT_CSUM_V2;
+ } else {
+ /* journal checksum v1 */
+ compat = JBD2_FEATURE_COMPAT_CHECKSUM;
+ incompat = 0;
+ }
+
+ if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) {
+ ret = jbd2_journal_set_features(sbi->s_journal,
+ compat, 0,
+ JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT |
+ incompat);
+ } else if (test_opt(sb, JOURNAL_CHECKSUM)) {
+ ret = jbd2_journal_set_features(sbi->s_journal,
+ compat, 0,
+ incompat);
+ jbd2_journal_clear_features(sbi->s_journal, 0, 0,
+ JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT);
+ } else {
+ jbd2_journal_clear_features(sbi->s_journal,
+ JBD2_FEATURE_COMPAT_CHECKSUM, 0,
+ JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT |
+ JBD2_FEATURE_INCOMPAT_CSUM_V2);
+ }
+
+ return ret;
+}
+
+/*
+ * Note: calculating the overhead so we can be compatible with
+ * historical BSD practice is quite difficult in the face of
+ * clusters/bigalloc. This is because multiple metadata blocks from
+ * different block group can end up in the same allocation cluster.
+ * Calculating the exact overhead in the face of clustered allocation
+ * requires either O(all block bitmaps) in memory or O(number of block
+ * groups**2) in time. We will still calculate the superblock for
+ * older file systems --- and if we come across with a bigalloc file
+ * system with zero in s_overhead_clusters the estimate will be close to
+ * correct especially for very large cluster sizes --- but for newer
+ * file systems, it's better to calculate this figure once at mkfs
+ * time, and store it in the superblock. If the superblock value is
+ * present (even for non-bigalloc file systems), we will use it.
+ */
+static int count_overhead(struct super_block *sb, ext4_group_t grp,
+ char *buf)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_group_desc *gdp;
+ ext4_fsblk_t first_block, last_block, b;
+ ext4_group_t i, ngroups = ext4_get_groups_count(sb);
+ int s, j, count = 0;
+
+ if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_BIGALLOC))
+ return (ext4_bg_has_super(sb, grp) + ext4_bg_num_gdb(sb, grp) +
+ sbi->s_itb_per_group + 2);
+
+ first_block = le32_to_cpu(sbi->s_es->s_first_data_block) +
+ (grp * EXT4_BLOCKS_PER_GROUP(sb));
+ last_block = first_block + EXT4_BLOCKS_PER_GROUP(sb) - 1;
+ for (i = 0; i < ngroups; i++) {
+ gdp = ext4_get_group_desc(sb, i, NULL);
+ b = ext4_block_bitmap(sb, gdp);
+ if (b >= first_block && b <= last_block) {
+ ext4_set_bit(EXT4_B2C(sbi, b - first_block), buf);
+ count++;
+ }
+ b = ext4_inode_bitmap(sb, gdp);
+ if (b >= first_block && b <= last_block) {
+ ext4_set_bit(EXT4_B2C(sbi, b - first_block), buf);
+ count++;
+ }
+ b = ext4_inode_table(sb, gdp);
+ if (b >= first_block && b + sbi->s_itb_per_group <= last_block)
+ for (j = 0; j < sbi->s_itb_per_group; j++, b++) {
+ int c = EXT4_B2C(sbi, b - first_block);
+ ext4_set_bit(c, buf);
+ count++;
+ }
+ if (i != grp)
+ continue;
+ s = 0;
+ if (ext4_bg_has_super(sb, grp)) {
+ ext4_set_bit(s++, buf);
+ count++;
+ }
+ for (j = ext4_bg_num_gdb(sb, grp); j > 0; j--) {
+ ext4_set_bit(EXT4_B2C(sbi, s++), buf);
+ count++;
+ }
+ }
+ if (!count)
+ return 0;
+ return EXT4_CLUSTERS_PER_GROUP(sb) -
+ ext4_count_free(buf, EXT4_CLUSTERS_PER_GROUP(sb) / 8);
+}
+/*
+ * Compute the overhead and stash it in sbi->s_overhead
+ */
+int ext4_calculate_overhead(struct super_block *sb)
{
- struct buffer_head * bh;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_super_block *es = sbi->s_es;
+ ext4_group_t i, ngroups = ext4_get_groups_count(sb);
+ ext4_fsblk_t overhead = 0;
+ char *buf = (char *) get_zeroed_page(GFP_KERNEL);
+
+ if (!buf)
+ return -ENOMEM;
+
+ /*
+ * Compute the overhead (FS structures). This is constant
+ * for a given filesystem unless the number of block groups
+ * changes so we cache the previous value until it does.
+ */
+
+ /*
+ * All of the blocks before first_data_block are overhead
+ */
+ overhead = EXT4_B2C(sbi, le32_to_cpu(es->s_first_data_block));
+
+ /*
+ * Add the overhead found in each block group
+ */
+ for (i = 0; i < ngroups; i++) {
+ int blks;
+
+ blks = count_overhead(sb, i, buf);
+ overhead += blks;
+ if (blks)
+ memset(buf, 0, PAGE_SIZE);
+ cond_resched();
+ }
+ /* Add the journal blocks as well */
+ if (sbi->s_journal)
+ overhead += EXT4_NUM_B2C(sbi, sbi->s_journal->j_maxlen);
+
+ sbi->s_overhead = overhead;
+ smp_wmb();
+ free_page((unsigned long) buf);
+ return 0;
+}
+
+
+static ext4_fsblk_t ext4_calculate_resv_clusters(struct super_block *sb)
+{
+ ext4_fsblk_t resv_clusters;
+
+ /*
+ * There's no need to reserve anything when we aren't using extents.
+ * The space estimates are exact, there are no unwritten extents,
+ * hole punching doesn't need new metadata... This is needed especially
+ * to keep ext2/3 backward compatibility.
+ */
+ if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS))
+ return 0;
+ /*
+ * By default we reserve 2% or 4096 clusters, whichever is smaller.
+ * This should cover the situations where we can not afford to run
+ * out of space like for example punch hole, or converting
+ * unwritten extents in delalloc path. In most cases such
+ * allocation would require 1, or 2 blocks, higher numbers are
+ * very rare.
+ */
+ resv_clusters = ext4_blocks_count(EXT4_SB(sb)->s_es) >>
+ EXT4_SB(sb)->s_cluster_bits;
+
+ do_div(resv_clusters, 50);
+ resv_clusters = min_t(ext4_fsblk_t, resv_clusters, 4096);
+
+ return resv_clusters;
+}
+
+
+static int ext4_reserve_clusters(struct ext4_sb_info *sbi, ext4_fsblk_t count)
+{
+ ext4_fsblk_t clusters = ext4_blocks_count(sbi->s_es) >>
+ sbi->s_cluster_bits;
+
+ if (count >= clusters)
+ return -EINVAL;
+
+ atomic64_set(&sbi->s_resv_clusters, count);
+ return 0;
+}
+
+static int ext4_fill_super(struct super_block *sb, void *data, int silent)
+{
+ char *orig_data = kstrdup(data, GFP_KERNEL);
+ struct buffer_head *bh;
struct ext4_super_block *es = NULL;
struct ext4_sb_info *sbi;
ext4_fsblk_t block;
ext4_fsblk_t sb_block = get_sb_block(&data);
ext4_fsblk_t logical_sb_block;
unsigned long offset = 0;
- unsigned int journal_inum = 0;
unsigned long journal_devnum = 0;
unsigned long def_mount_opts;
struct inode *root;
- int ret = -EINVAL;
- int blocksize;
- int db_count;
- int i;
- int needs_recovery;
- __le32 features;
+ char *cp;
+ const char *descr;
+ int ret = -ENOMEM;
+ int blocksize, clustersize;
+ unsigned int db_count;
+ unsigned int i;
+ int needs_recovery, has_huge_files, has_bigalloc;
__u64 blocks_count;
- int err;
+ int err = 0;
+ unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
+ ext4_group_t first_not_zeroed;
sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
if (!sbi)
- return -ENOMEM;
+ goto out_free_orig;
+
+ sbi->s_blockgroup_lock =
+ kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL);
+ if (!sbi->s_blockgroup_lock) {
+ kfree(sbi);
+ goto out_free_orig;
+ }
sb->s_fs_info = sbi;
- sbi->s_mount_opt = 0;
- sbi->s_resuid = EXT4_DEF_RESUID;
- sbi->s_resgid = EXT4_DEF_RESGID;
+ sbi->s_sb = sb;
+ sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS;
sbi->s_sb_block = sb_block;
+ if (sb->s_bdev->bd_part)
+ sbi->s_sectors_written_start =
+ part_stat_read(sb->s_bdev->bd_part, sectors[1]);
- unlock_kernel();
+ /* Cleanup superblock name */
+ for (cp = sb->s_id; (cp = strchr(cp, '/'));)
+ *cp = '!';
+ /* -EINVAL is default */
+ ret = -EINVAL;
blocksize = sb_min_blocksize(sb, EXT4_MIN_BLOCK_SIZE);
if (!blocksize) {
- printk(KERN_ERR "EXT4-fs: unable to set blocksize\n");
- goto out_fail;
- }
-
- if (!sb_set_blocksize(sb, blocksize)) {
- printk(KERN_ERR "EXT4-fs: bad blocksize %d.\n", blocksize);
+ ext4_msg(sb, KERN_ERR, "unable to set blocksize");
goto out_fail;
}
@@ -1846,88 +3440,186 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
}
if (!(bh = sb_bread(sb, logical_sb_block))) {
- printk (KERN_ERR "EXT4-fs: unable to read superblock\n");
+ ext4_msg(sb, KERN_ERR, "unable to read superblock");
goto out_fail;
}
/*
* Note: s_es must be initialized as soon as possible because
* some ext4 macro-instructions depend on its value
*/
- es = (struct ext4_super_block *) (((char *)bh->b_data) + offset);
+ es = (struct ext4_super_block *) (bh->b_data + offset);
sbi->s_es = es;
sb->s_magic = le16_to_cpu(es->s_magic);
if (sb->s_magic != EXT4_SUPER_MAGIC)
goto cantfind_ext4;
+ sbi->s_kbytes_written = le64_to_cpu(es->s_kbytes_written);
+
+ /* Warn if metadata_csum and gdt_csum are both set. */
+ if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
+ EXT4_FEATURE_RO_COMPAT_METADATA_CSUM) &&
+ EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM))
+ ext4_warning(sb, KERN_INFO "metadata_csum and uninit_bg are "
+ "redundant flags; please run fsck.");
+
+ /* Check for a known checksum algorithm */
+ if (!ext4_verify_csum_type(sb, es)) {
+ ext4_msg(sb, KERN_ERR, "VFS: Found ext4 filesystem with "
+ "unknown checksum algorithm.");
+ silent = 1;
+ goto cantfind_ext4;
+ }
+
+ /* Load the checksum driver */
+ if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
+ EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) {
+ sbi->s_chksum_driver = crypto_alloc_shash("crc32c", 0, 0);
+ if (IS_ERR(sbi->s_chksum_driver)) {
+ ext4_msg(sb, KERN_ERR, "Cannot load crc32c driver.");
+ ret = PTR_ERR(sbi->s_chksum_driver);
+ sbi->s_chksum_driver = NULL;
+ goto failed_mount;
+ }
+ }
+
+ /* Check superblock checksum */
+ if (!ext4_superblock_csum_verify(sb, es)) {
+ ext4_msg(sb, KERN_ERR, "VFS: Found ext4 filesystem with "
+ "invalid superblock checksum. Run e2fsck?");
+ silent = 1;
+ goto cantfind_ext4;
+ }
+
+ /* Precompute checksum seed for all metadata */
+ if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
+ EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+ sbi->s_csum_seed = ext4_chksum(sbi, ~0, es->s_uuid,
+ sizeof(es->s_uuid));
/* Set defaults before we parse the mount options */
def_mount_opts = le32_to_cpu(es->s_default_mount_opts);
+ set_opt(sb, INIT_INODE_TABLE);
if (def_mount_opts & EXT4_DEFM_DEBUG)
- set_opt(sbi->s_mount_opt, DEBUG);
+ set_opt(sb, DEBUG);
if (def_mount_opts & EXT4_DEFM_BSDGROUPS)
- set_opt(sbi->s_mount_opt, GRPID);
+ set_opt(sb, GRPID);
if (def_mount_opts & EXT4_DEFM_UID16)
- set_opt(sbi->s_mount_opt, NO_UID32);
-#ifdef CONFIG_EXT4DEV_FS_XATTR
- if (def_mount_opts & EXT4_DEFM_XATTR_USER)
- set_opt(sbi->s_mount_opt, XATTR_USER);
-#endif
-#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL
- if (def_mount_opts & EXT4_DEFM_ACL)
- set_opt(sbi->s_mount_opt, POSIX_ACL);
+ set_opt(sb, NO_UID32);
+ /* xattr user namespace & acls are now defaulted on */
+ set_opt(sb, XATTR_USER);
+#ifdef CONFIG_EXT4_FS_POSIX_ACL
+ set_opt(sb, POSIX_ACL);
#endif
if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_DATA)
- sbi->s_mount_opt |= EXT4_MOUNT_JOURNAL_DATA;
+ set_opt(sb, JOURNAL_DATA);
else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_ORDERED)
- sbi->s_mount_opt |= EXT4_MOUNT_ORDERED_DATA;
+ set_opt(sb, ORDERED_DATA);
else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_WBACK)
- sbi->s_mount_opt |= EXT4_MOUNT_WRITEBACK_DATA;
+ set_opt(sb, WRITEBACK_DATA);
if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_PANIC)
- set_opt(sbi->s_mount_opt, ERRORS_PANIC);
+ set_opt(sb, ERRORS_PANIC);
else if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_CONTINUE)
- set_opt(sbi->s_mount_opt, ERRORS_CONT);
+ set_opt(sb, ERRORS_CONT);
else
- set_opt(sbi->s_mount_opt, ERRORS_RO);
+ set_opt(sb, ERRORS_RO);
+ if (def_mount_opts & EXT4_DEFM_BLOCK_VALIDITY)
+ set_opt(sb, BLOCK_VALIDITY);
+ if (def_mount_opts & EXT4_DEFM_DISCARD)
+ set_opt(sb, DISCARD);
- sbi->s_resuid = le16_to_cpu(es->s_def_resuid);
- sbi->s_resgid = le16_to_cpu(es->s_def_resgid);
+ sbi->s_resuid = make_kuid(&init_user_ns, le16_to_cpu(es->s_def_resuid));
+ sbi->s_resgid = make_kgid(&init_user_ns, le16_to_cpu(es->s_def_resgid));
+ sbi->s_commit_interval = JBD2_DEFAULT_MAX_COMMIT_AGE * HZ;
+ sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME;
+ sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME;
- set_opt(sbi->s_mount_opt, RESERVATION);
+ if ((def_mount_opts & EXT4_DEFM_NOBARRIER) == 0)
+ set_opt(sb, BARRIER);
/*
- * turn on extents feature by default in ext4 filesystem
- * User -o noextents to turn it off
+ * enable delayed allocation by default
+ * Use -o nodelalloc to turn it off
*/
- set_opt(sbi->s_mount_opt, EXTENTS);
+ if (!IS_EXT3_SB(sb) && !IS_EXT2_SB(sb) &&
+ ((def_mount_opts & EXT4_DEFM_NODELALLOC) == 0))
+ set_opt(sb, DELALLOC);
+
/*
- * turn on mballoc feature by default in ext4 filesystem
- * User -o nomballoc to turn it off
+ * set default s_li_wait_mult for lazyinit, for the case there is
+ * no mount option specified.
*/
- set_opt(sbi->s_mount_opt, MBALLOC);
+ sbi->s_li_wait_mult = EXT4_DEF_LI_WAIT_MULT;
- if (!parse_options ((char *) data, sb, &journal_inum, &journal_devnum,
- NULL, 0))
+ if (!parse_options((char *) sbi->s_es->s_mount_opts, sb,
+ &journal_devnum, &journal_ioprio, 0)) {
+ ext4_msg(sb, KERN_WARNING,
+ "failed to parse options in superblock: %s",
+ sbi->s_es->s_mount_opts);
+ }
+ sbi->s_def_mount_opt = sbi->s_mount_opt;
+ if (!parse_options((char *) data, sb, &journal_devnum,
+ &journal_ioprio, 0))
goto failed_mount;
+ if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
+ printk_once(KERN_WARNING "EXT4-fs: Warning: mounting "
+ "with data=journal disables delayed "
+ "allocation and O_DIRECT support!\n");
+ if (test_opt2(sb, EXPLICIT_DELALLOC)) {
+ ext4_msg(sb, KERN_ERR, "can't mount with "
+ "both data=journal and delalloc");
+ goto failed_mount;
+ }
+ if (test_opt(sb, DIOREAD_NOLOCK)) {
+ ext4_msg(sb, KERN_ERR, "can't mount with "
+ "both data=journal and dioread_nolock");
+ goto failed_mount;
+ }
+ if (test_opt(sb, DELALLOC))
+ clear_opt(sb, DELALLOC);
+ }
+
sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
- ((sbi->s_mount_opt & EXT4_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0);
+ (test_opt(sb, POSIX_ACL) ? MS_POSIXACL : 0);
if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV &&
(EXT4_HAS_COMPAT_FEATURE(sb, ~0U) ||
EXT4_HAS_RO_COMPAT_FEATURE(sb, ~0U) ||
EXT4_HAS_INCOMPAT_FEATURE(sb, ~0U)))
- printk(KERN_WARNING
- "EXT4-fs warning: feature flags set on rev 0 fs, "
- "running e2fsck is recommended\n");
+ ext4_msg(sb, KERN_WARNING,
+ "feature flags set on rev 0 fs, "
+ "running e2fsck is recommended");
+
+ if (es->s_creator_os == cpu_to_le32(EXT4_OS_HURD)) {
+ set_opt2(sb, HURD_COMPAT);
+ if (EXT4_HAS_INCOMPAT_FEATURE(sb,
+ EXT4_FEATURE_INCOMPAT_64BIT)) {
+ ext4_msg(sb, KERN_ERR,
+ "The Hurd can't support 64-bit file systems");
+ goto failed_mount;
+ }
+ }
- /*
- * Since ext4 is still considered development code, we require
- * that the TEST_FILESYS flag in s->flags be set.
- */
- if (!(le32_to_cpu(es->s_flags) & EXT2_FLAGS_TEST_FILESYS)) {
- printk(KERN_WARNING "EXT4-fs: %s: not marked "
- "OK to use with test code.\n", sb->s_id);
- goto failed_mount;
+ if (IS_EXT2_SB(sb)) {
+ if (ext2_feature_set_ok(sb))
+ ext4_msg(sb, KERN_INFO, "mounting ext2 file system "
+ "using the ext4 subsystem");
+ else {
+ ext4_msg(sb, KERN_ERR, "couldn't mount as ext2 due "
+ "to feature incompatibilities");
+ goto failed_mount;
+ }
+ }
+
+ if (IS_EXT3_SB(sb)) {
+ if (ext3_feature_set_ok(sb))
+ ext4_msg(sb, KERN_INFO, "mounting ext3 file system "
+ "using the ext4 subsystem");
+ else {
+ ext4_msg(sb, KERN_ERR, "couldn't mount as ext3 due "
+ "to feature incompatibilities");
+ goto failed_mount;
+ }
}
/*
@@ -1935,72 +3627,48 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
* previously didn't change the revision level when setting the flags,
* so there is a chance incompat flags are set on a rev 0 filesystem.
*/
- features = EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT4_FEATURE_INCOMPAT_SUPP);
- if (features) {
- printk(KERN_ERR "EXT4-fs: %s: couldn't mount because of "
- "unsupported optional features (%x).\n",
- sb->s_id, le32_to_cpu(features));
- goto failed_mount;
- }
- features = EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT4_FEATURE_RO_COMPAT_SUPP);
- if (!(sb->s_flags & MS_RDONLY) && features) {
- printk(KERN_ERR "EXT4-fs: %s: couldn't mount RDWR because of "
- "unsupported optional features (%x).\n",
- sb->s_id, le32_to_cpu(features));
+ if (!ext4_feature_set_ok(sb, (sb->s_flags & MS_RDONLY)))
goto failed_mount;
- }
- if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_HUGE_FILE)) {
- /*
- * Large file size enabled file system can only be
- * mount if kernel is build with CONFIG_LSF
- */
- if (sizeof(root->i_blocks) < sizeof(u64) &&
- !(sb->s_flags & MS_RDONLY)) {
- printk(KERN_ERR "EXT4-fs: %s: Filesystem with huge "
- "files cannot be mounted read-write "
- "without CONFIG_LSF.\n", sb->s_id);
- goto failed_mount;
- }
- }
- blocksize = BLOCK_SIZE << le32_to_cpu(es->s_log_block_size);
+ blocksize = BLOCK_SIZE << le32_to_cpu(es->s_log_block_size);
if (blocksize < EXT4_MIN_BLOCK_SIZE ||
blocksize > EXT4_MAX_BLOCK_SIZE) {
- printk(KERN_ERR
- "EXT4-fs: Unsupported filesystem blocksize %d on %s.\n",
- blocksize, sb->s_id);
+ ext4_msg(sb, KERN_ERR,
+ "Unsupported filesystem blocksize %d", blocksize);
goto failed_mount;
}
if (sb->s_blocksize != blocksize) {
-
/* Validate the filesystem blocksize */
if (!sb_set_blocksize(sb, blocksize)) {
- printk(KERN_ERR "EXT4-fs: bad block size %d.\n",
+ ext4_msg(sb, KERN_ERR, "bad block size %d",
blocksize);
goto failed_mount;
}
- brelse (bh);
+ brelse(bh);
logical_sb_block = sb_block * EXT4_MIN_BLOCK_SIZE;
offset = do_div(logical_sb_block, blocksize);
bh = sb_bread(sb, logical_sb_block);
if (!bh) {
- printk(KERN_ERR
- "EXT4-fs: Can't read superblock on 2nd try.\n");
+ ext4_msg(sb, KERN_ERR,
+ "Can't read superblock on 2nd try");
goto failed_mount;
}
- es = (struct ext4_super_block *)(((char *)bh->b_data) + offset);
+ es = (struct ext4_super_block *)(bh->b_data + offset);
sbi->s_es = es;
if (es->s_magic != cpu_to_le16(EXT4_SUPER_MAGIC)) {
- printk (KERN_ERR
- "EXT4-fs: Magic mismatch, very weird !\n");
+ ext4_msg(sb, KERN_ERR,
+ "Magic mismatch, very weird!");
goto failed_mount;
}
}
- sbi->s_bitmap_maxbytes = ext4_max_bitmap_size(sb->s_blocksize_bits);
- sb->s_maxbytes = ext4_max_size(sb->s_blocksize_bits);
+ has_huge_files = EXT4_HAS_RO_COMPAT_FEATURE(sb,
+ EXT4_FEATURE_RO_COMPAT_HUGE_FILE);
+ sbi->s_bitmap_maxbytes = ext4_max_bitmap_size(sb->s_blocksize_bits,
+ has_huge_files);
+ sb->s_maxbytes = ext4_max_size(sb->s_blocksize_bits, has_huge_files);
if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV) {
sbi->s_inode_size = EXT4_GOOD_OLD_INODE_SIZE;
@@ -2011,30 +3679,33 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
if ((sbi->s_inode_size < EXT4_GOOD_OLD_INODE_SIZE) ||
(!is_power_of_2(sbi->s_inode_size)) ||
(sbi->s_inode_size > blocksize)) {
- printk (KERN_ERR
- "EXT4-fs: unsupported inode size: %d\n",
- sbi->s_inode_size);
+ ext4_msg(sb, KERN_ERR,
+ "unsupported inode size: %d",
+ sbi->s_inode_size);
goto failed_mount;
}
if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE)
sb->s_time_gran = 1 << (EXT4_EPOCH_BITS - 2);
}
+
sbi->s_desc_size = le16_to_cpu(es->s_desc_size);
if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_64BIT)) {
if (sbi->s_desc_size < EXT4_MIN_DESC_SIZE_64BIT ||
sbi->s_desc_size > EXT4_MAX_DESC_SIZE ||
!is_power_of_2(sbi->s_desc_size)) {
- printk(KERN_ERR
- "EXT4-fs: unsupported descriptor size %lu\n",
+ ext4_msg(sb, KERN_ERR,
+ "unsupported descriptor size %lu",
sbi->s_desc_size);
goto failed_mount;
}
} else
sbi->s_desc_size = EXT4_MIN_DESC_SIZE;
+
sbi->s_blocks_per_group = le32_to_cpu(es->s_blocks_per_group);
sbi->s_inodes_per_group = le32_to_cpu(es->s_inodes_per_group);
if (EXT4_INODE_SIZE(sb) == 0 || EXT4_INODES_PER_GROUP(sb) == 0)
goto cantfind_ext4;
+
sbi->s_inodes_per_block = blocksize / EXT4_INODE_SIZE(sb);
if (sbi->s_inodes_per_block == 0)
goto cantfind_ext4;
@@ -2045,82 +3716,194 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
sbi->s_mount_state = le16_to_cpu(es->s_state);
sbi->s_addr_per_block_bits = ilog2(EXT4_ADDR_PER_BLOCK(sb));
sbi->s_desc_per_block_bits = ilog2(EXT4_DESC_PER_BLOCK(sb));
- for (i=0; i < 4; i++)
+
+ for (i = 0; i < 4; i++)
sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]);
sbi->s_def_hash_version = es->s_def_hash_version;
+ if (EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_DIR_INDEX)) {
+ i = le32_to_cpu(es->s_flags);
+ if (i & EXT2_FLAGS_UNSIGNED_HASH)
+ sbi->s_hash_unsigned = 3;
+ else if ((i & EXT2_FLAGS_SIGNED_HASH) == 0) {
+#ifdef __CHAR_UNSIGNED__
+ if (!(sb->s_flags & MS_RDONLY))
+ es->s_flags |=
+ cpu_to_le32(EXT2_FLAGS_UNSIGNED_HASH);
+ sbi->s_hash_unsigned = 3;
+#else
+ if (!(sb->s_flags & MS_RDONLY))
+ es->s_flags |=
+ cpu_to_le32(EXT2_FLAGS_SIGNED_HASH);
+#endif
+ }
+ }
- if (sbi->s_blocks_per_group > blocksize * 8) {
- printk (KERN_ERR
- "EXT4-fs: #blocks per group too big: %lu\n",
- sbi->s_blocks_per_group);
- goto failed_mount;
+ /* Handle clustersize */
+ clustersize = BLOCK_SIZE << le32_to_cpu(es->s_log_cluster_size);
+ has_bigalloc = EXT4_HAS_RO_COMPAT_FEATURE(sb,
+ EXT4_FEATURE_RO_COMPAT_BIGALLOC);
+ if (has_bigalloc) {
+ if (clustersize < blocksize) {
+ ext4_msg(sb, KERN_ERR,
+ "cluster size (%d) smaller than "
+ "block size (%d)", clustersize, blocksize);
+ goto failed_mount;
+ }
+ sbi->s_cluster_bits = le32_to_cpu(es->s_log_cluster_size) -
+ le32_to_cpu(es->s_log_block_size);
+ sbi->s_clusters_per_group =
+ le32_to_cpu(es->s_clusters_per_group);
+ if (sbi->s_clusters_per_group > blocksize * 8) {
+ ext4_msg(sb, KERN_ERR,
+ "#clusters per group too big: %lu",
+ sbi->s_clusters_per_group);
+ goto failed_mount;
+ }
+ if (sbi->s_blocks_per_group !=
+ (sbi->s_clusters_per_group * (clustersize / blocksize))) {
+ ext4_msg(sb, KERN_ERR, "blocks per group (%lu) and "
+ "clusters per group (%lu) inconsistent",
+ sbi->s_blocks_per_group,
+ sbi->s_clusters_per_group);
+ goto failed_mount;
+ }
+ } else {
+ if (clustersize != blocksize) {
+ ext4_warning(sb, "fragment/cluster size (%d) != "
+ "block size (%d)", clustersize,
+ blocksize);
+ clustersize = blocksize;
+ }
+ if (sbi->s_blocks_per_group > blocksize * 8) {
+ ext4_msg(sb, KERN_ERR,
+ "#blocks per group too big: %lu",
+ sbi->s_blocks_per_group);
+ goto failed_mount;
+ }
+ sbi->s_clusters_per_group = sbi->s_blocks_per_group;
+ sbi->s_cluster_bits = 0;
}
+ sbi->s_cluster_ratio = clustersize / blocksize;
+
if (sbi->s_inodes_per_group > blocksize * 8) {
- printk (KERN_ERR
- "EXT4-fs: #inodes per group too big: %lu\n",
- sbi->s_inodes_per_group);
+ ext4_msg(sb, KERN_ERR,
+ "#inodes per group too big: %lu",
+ sbi->s_inodes_per_group);
goto failed_mount;
}
- if (ext4_blocks_count(es) >
- (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) {
- printk(KERN_ERR "EXT4-fs: filesystem on %s:"
- " too large to mount safely\n", sb->s_id);
+ /* Do we have standard group size of clustersize * 8 blocks ? */
+ if (sbi->s_blocks_per_group == clustersize << 3)
+ set_opt2(sb, STD_GROUP_SIZE);
+
+ /*
+ * Test whether we have more sectors than will fit in sector_t,
+ * and whether the max offset is addressable by the page cache.
+ */
+ err = generic_check_addressable(sb->s_blocksize_bits,
+ ext4_blocks_count(es));
+ if (err) {
+ ext4_msg(sb, KERN_ERR, "filesystem"
+ " too large to mount safely on this system");
if (sizeof(sector_t) < 8)
- printk(KERN_WARNING "EXT4-fs: CONFIG_LBD not "
- "enabled\n");
+ ext4_msg(sb, KERN_WARNING, "CONFIG_LBDAF not enabled");
goto failed_mount;
}
if (EXT4_BLOCKS_PER_GROUP(sb) == 0)
goto cantfind_ext4;
- /* ensure blocks_count calculation below doesn't sign-extend */
- if (ext4_blocks_count(es) + EXT4_BLOCKS_PER_GROUP(sb) <
- le32_to_cpu(es->s_first_data_block) + 1) {
- printk(KERN_WARNING "EXT4-fs: bad geometry: block count %llu, "
- "first data block %u, blocks per group %lu\n",
- ext4_blocks_count(es),
- le32_to_cpu(es->s_first_data_block),
- EXT4_BLOCKS_PER_GROUP(sb));
+ /* check blocks count against device size */
+ blocks_count = sb->s_bdev->bd_inode->i_size >> sb->s_blocksize_bits;
+ if (blocks_count && ext4_blocks_count(es) > blocks_count) {
+ ext4_msg(sb, KERN_WARNING, "bad geometry: block count %llu "
+ "exceeds size of device (%llu blocks)",
+ ext4_blocks_count(es), blocks_count);
+ goto failed_mount;
+ }
+
+ /*
+ * It makes no sense for the first data block to be beyond the end
+ * of the filesystem.
+ */
+ if (le32_to_cpu(es->s_first_data_block) >= ext4_blocks_count(es)) {
+ ext4_msg(sb, KERN_WARNING, "bad geometry: first data "
+ "block %u is beyond end of filesystem (%llu)",
+ le32_to_cpu(es->s_first_data_block),
+ ext4_blocks_count(es));
goto failed_mount;
}
blocks_count = (ext4_blocks_count(es) -
le32_to_cpu(es->s_first_data_block) +
EXT4_BLOCKS_PER_GROUP(sb) - 1);
do_div(blocks_count, EXT4_BLOCKS_PER_GROUP(sb));
+ if (blocks_count > ((uint64_t)1<<32) - EXT4_DESC_PER_BLOCK(sb)) {
+ ext4_msg(sb, KERN_WARNING, "groups count too large: %u "
+ "(block count %llu, first data block %u, "
+ "blocks per group %lu)", sbi->s_groups_count,
+ ext4_blocks_count(es),
+ le32_to_cpu(es->s_first_data_block),
+ EXT4_BLOCKS_PER_GROUP(sb));
+ goto failed_mount;
+ }
sbi->s_groups_count = blocks_count;
+ sbi->s_blockfile_groups = min_t(ext4_group_t, sbi->s_groups_count,
+ (EXT4_MAX_BLOCK_FILE_PHYS / EXT4_BLOCKS_PER_GROUP(sb)));
db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) /
EXT4_DESC_PER_BLOCK(sb);
- sbi->s_group_desc = kmalloc(db_count * sizeof (struct buffer_head *),
- GFP_KERNEL);
+ sbi->s_group_desc = ext4_kvmalloc(db_count *
+ sizeof(struct buffer_head *),
+ GFP_KERNEL);
if (sbi->s_group_desc == NULL) {
- printk (KERN_ERR "EXT4-fs: not enough memory\n");
+ ext4_msg(sb, KERN_ERR, "not enough memory");
+ ret = -ENOMEM;
goto failed_mount;
}
- bgl_lock_init(&sbi->s_blockgroup_lock);
+ if (ext4_proc_root)
+ sbi->s_proc = proc_mkdir(sb->s_id, ext4_proc_root);
+
+ if (sbi->s_proc)
+ proc_create_data("options", S_IRUGO, sbi->s_proc,
+ &ext4_seq_options_fops, sb);
+
+ bgl_lock_init(sbi->s_blockgroup_lock);
for (i = 0; i < db_count; i++) {
block = descriptor_loc(sb, logical_sb_block, i);
sbi->s_group_desc[i] = sb_bread(sb, block);
if (!sbi->s_group_desc[i]) {
- printk (KERN_ERR "EXT4-fs: "
- "can't read group descriptor %d\n", i);
+ ext4_msg(sb, KERN_ERR,
+ "can't read group descriptor %d", i);
db_count = i;
goto failed_mount2;
}
}
- if (!ext4_check_descriptors (sb)) {
- printk(KERN_ERR "EXT4-fs: group descriptors corrupted!\n");
+ if (!ext4_check_descriptors(sb, &first_not_zeroed)) {
+ ext4_msg(sb, KERN_ERR, "group descriptors corrupted!");
goto failed_mount2;
}
+ if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG))
+ if (!ext4_fill_flex_info(sb)) {
+ ext4_msg(sb, KERN_ERR,
+ "unable to initialize "
+ "flex_bg meta info!");
+ goto failed_mount2;
+ }
+
sbi->s_gdb_count = db_count;
get_random_bytes(&sbi->s_next_generation, sizeof(u32));
spin_lock_init(&sbi->s_next_gen_lock);
- err = percpu_counter_init(&sbi->s_freeblocks_counter,
- ext4_count_free_blocks(sb));
+ init_timer(&sbi->s_err_report);
+ sbi->s_err_report.function = print_daily_error_info;
+ sbi->s_err_report.data = (unsigned long) sb;
+
+ /* Register extent status tree shrinker */
+ ext4_es_register_shrinker(sbi);
+
+ err = percpu_counter_init(&sbi->s_freeclusters_counter,
+ ext4_count_free_clusters(sb));
if (!err) {
err = percpu_counter_init(&sbi->s_freeinodes_counter,
ext4_count_free_inodes(sb));
@@ -2129,37 +3912,41 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
err = percpu_counter_init(&sbi->s_dirs_counter,
ext4_count_dirs(sb));
}
+ if (!err) {
+ err = percpu_counter_init(&sbi->s_dirtyclusters_counter, 0);
+ }
+ if (!err) {
+ err = percpu_counter_init(&sbi->s_extent_cache_cnt, 0);
+ }
if (err) {
- printk(KERN_ERR "EXT4-fs: insufficient memory\n");
+ ext4_msg(sb, KERN_ERR, "insufficient memory");
goto failed_mount3;
}
- /* per fileystem reservation list head & lock */
- spin_lock_init(&sbi->s_rsv_window_lock);
- sbi->s_rsv_window_root = RB_ROOT;
- /* Add a single, static dummy reservation to the start of the
- * reservation window list --- it gives us a placeholder for
- * append-at-start-of-list which makes the allocation logic
- * _much_ simpler. */
- sbi->s_rsv_window_head.rsv_start = EXT4_RESERVE_WINDOW_NOT_ALLOCATED;
- sbi->s_rsv_window_head.rsv_end = EXT4_RESERVE_WINDOW_NOT_ALLOCATED;
- sbi->s_rsv_window_head.rsv_alloc_hit = 0;
- sbi->s_rsv_window_head.rsv_goal_size = 0;
- ext4_rsv_window_add(sb, &sbi->s_rsv_window_head);
-
sbi->s_stripe = ext4_get_stripe_size(sbi);
+ sbi->s_extent_max_zeroout_kb = 32;
/*
* set up enough so that it can read an inode
*/
- sb->s_op = &ext4_sops;
+ if (!test_opt(sb, NOLOAD) &&
+ EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL))
+ sb->s_op = &ext4_sops;
+ else
+ sb->s_op = &ext4_nojournal_sops;
sb->s_export_op = &ext4_export_ops;
sb->s_xattr = ext4_xattr_handlers;
#ifdef CONFIG_QUOTA
- sb->s_qcop = &ext4_qctl_operations;
sb->dq_op = &ext4_quota_operations;
+ if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA))
+ sb->s_qcop = &ext4_qctl_sysfile_operations;
+ else
+ sb->s_qcop = &ext4_qctl_operations;
#endif
+ memcpy(sb->s_uuid, es->s_uuid, sizeof(es->s_uuid));
+
INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */
+ mutex_init(&sbi->s_orphan_lock);
sb->s_root = NULL;
@@ -2167,6 +3954,11 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
EXT4_HAS_INCOMPAT_FEATURE(sb,
EXT4_FEATURE_INCOMPAT_RECOVER));
+ if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_MMP) &&
+ !(sb->s_flags & MS_RDONLY))
+ if (ext4_multi_mount_protect(sb, le64_to_cpu(es->s_mmp_block)))
+ goto failed_mount3;
+
/*
* The first inode we look at is the journal inode. Don't try
* root first: it may be modified in the journal!
@@ -2175,37 +3967,29 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)) {
if (ext4_load_journal(sb, es, journal_devnum))
goto failed_mount3;
- } else if (journal_inum) {
- if (ext4_create_journal(sb, es, journal_inum))
- goto failed_mount3;
+ } else if (test_opt(sb, NOLOAD) && !(sb->s_flags & MS_RDONLY) &&
+ EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) {
+ ext4_msg(sb, KERN_ERR, "required journal recovery "
+ "suppressed and not mounted read-only");
+ goto failed_mount_wq;
} else {
- if (!silent)
- printk (KERN_ERR
- "ext4: No journal on filesystem on %s\n",
- sb->s_id);
- goto failed_mount3;
+ clear_opt(sb, DATA_FLAGS);
+ sbi->s_journal = NULL;
+ needs_recovery = 0;
+ goto no_journal;
}
- if (ext4_blocks_count(es) > 0xffffffffULL &&
+ if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_64BIT) &&
!jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0,
JBD2_FEATURE_INCOMPAT_64BIT)) {
- printk(KERN_ERR "ext4: Failed to set 64-bit journal feature\n");
- goto failed_mount4;
+ ext4_msg(sb, KERN_ERR, "Failed to set 64-bit journal feature");
+ goto failed_mount_wq;
}
- if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) {
- jbd2_journal_set_features(sbi->s_journal,
- JBD2_FEATURE_COMPAT_CHECKSUM, 0,
- JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT);
- } else if (test_opt(sb, JOURNAL_CHECKSUM)) {
- jbd2_journal_set_features(sbi->s_journal,
- JBD2_FEATURE_COMPAT_CHECKSUM, 0, 0);
- jbd2_journal_clear_features(sbi->s_journal, 0, 0,
- JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT);
- } else {
- jbd2_journal_clear_features(sbi->s_journal,
- JBD2_FEATURE_COMPAT_CHECKSUM, 0,
- JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT);
+ if (!set_journal_csum_feature_set(sb)) {
+ ext4_msg(sb, KERN_ERR, "Failed to set journal checksum "
+ "feature set");
+ goto failed_mount_wq;
}
/* We have now updated the journal if required, so we can
@@ -2218,30 +4002,71 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
*/
if (jbd2_journal_check_available_features
(sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE))
- set_opt(sbi->s_mount_opt, ORDERED_DATA);
+ set_opt(sb, ORDERED_DATA);
else
- set_opt(sbi->s_mount_opt, JOURNAL_DATA);
+ set_opt(sb, JOURNAL_DATA);
break;
case EXT4_MOUNT_ORDERED_DATA:
case EXT4_MOUNT_WRITEBACK_DATA:
if (!jbd2_journal_check_available_features
(sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)) {
- printk(KERN_ERR "EXT4-fs: Journal does not support "
- "requested data journaling mode\n");
- goto failed_mount4;
+ ext4_msg(sb, KERN_ERR, "Journal does not support "
+ "requested data journaling mode");
+ goto failed_mount_wq;
}
default:
break;
}
+ set_task_ioprio(sbi->s_journal->j_task, journal_ioprio);
- if (test_opt(sb, NOBH)) {
- if (!(test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)) {
- printk(KERN_WARNING "EXT4-fs: Ignoring nobh option - "
- "its supported only with writeback mode\n");
- clear_opt(sbi->s_mount_opt, NOBH);
+ sbi->s_journal->j_commit_callback = ext4_journal_commit_callback;
+
+ /*
+ * The journal may have updated the bg summary counts, so we
+ * need to update the global counters.
+ */
+ percpu_counter_set(&sbi->s_freeclusters_counter,
+ ext4_count_free_clusters(sb));
+ percpu_counter_set(&sbi->s_freeinodes_counter,
+ ext4_count_free_inodes(sb));
+ percpu_counter_set(&sbi->s_dirs_counter,
+ ext4_count_dirs(sb));
+ percpu_counter_set(&sbi->s_dirtyclusters_counter, 0);
+
+no_journal:
+ if (ext4_mballoc_ready) {
+ sbi->s_mb_cache = ext4_xattr_create_cache(sb->s_id);
+ if (!sbi->s_mb_cache) {
+ ext4_msg(sb, KERN_ERR, "Failed to create an mb_cache");
+ goto failed_mount_wq;
}
}
+
+ /*
+ * Get the # of file system overhead blocks from the
+ * superblock if present.
+ */
+ if (es->s_overhead_clusters)
+ sbi->s_overhead = le32_to_cpu(es->s_overhead_clusters);
+ else {
+ err = ext4_calculate_overhead(sb);
+ if (err)
+ goto failed_mount_wq;
+ }
+
+ /*
+ * The maximum number of concurrent works can be high and
+ * concurrency isn't really necessary. Limit it to 1.
+ */
+ EXT4_SB(sb)->rsv_conversion_wq =
+ alloc_workqueue("ext4-rsv-conversion", WQ_MEM_RECLAIM | WQ_UNBOUND, 1);
+ if (!EXT4_SB(sb)->rsv_conversion_wq) {
+ printk(KERN_ERR "EXT4-fs: failed to create workqueue\n");
+ ret = -ENOMEM;
+ goto failed_mount4;
+ }
+
/*
* The jbd2_journal_load will have done any necessary log recovery,
* so we can safely mount the rest of the filesystem now.
@@ -2249,24 +4074,25 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
root = ext4_iget(sb, EXT4_ROOT_INO);
if (IS_ERR(root)) {
- printk(KERN_ERR "EXT4-fs: get root inode failed\n");
+ ext4_msg(sb, KERN_ERR, "get root inode failed");
ret = PTR_ERR(root);
+ root = NULL;
goto failed_mount4;
}
if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) {
+ ext4_msg(sb, KERN_ERR, "corrupt root inode, run e2fsck");
iput(root);
- printk(KERN_ERR "EXT4-fs: corrupt root inode, run e2fsck\n");
goto failed_mount4;
}
- sb->s_root = d_alloc_root(root);
+ sb->s_root = d_make_root(root);
if (!sb->s_root) {
- printk(KERN_ERR "EXT4-fs: get root dentry failed\n");
- iput(root);
+ ext4_msg(sb, KERN_ERR, "get root dentry failed");
ret = -ENOMEM;
goto failed_mount4;
}
- ext4_setup_super (sb, es, sb->s_flags & MS_RDONLY);
+ if (ext4_setup_super(sb, es, sb->s_flags & MS_RDONLY))
+ sb->s_flags |= MS_RDONLY;
/* determine the minimum size of new large inodes, if present */
if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE) {
@@ -2289,52 +4115,144 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
sbi->s_inode_size) {
sbi->s_want_extra_isize = sizeof(struct ext4_inode) -
EXT4_GOOD_OLD_INODE_SIZE;
- printk(KERN_INFO "EXT4-fs: required extra inode space not"
- "available.\n");
+ ext4_msg(sb, KERN_INFO, "required extra inode space not"
+ "available");
}
- /*
- * akpm: core read_super() calls in here with the superblock locked.
- * That deadlocks, because orphan cleanup needs to lock the superblock
- * in numerous places. Here we just pop the lock - it's relatively
- * harmless, because we are now ready to accept write_super() requests,
- * and aviro says that's the only reason for hanging onto the
- * superblock lock.
- */
+ err = ext4_reserve_clusters(sbi, ext4_calculate_resv_clusters(sb));
+ if (err) {
+ ext4_msg(sb, KERN_ERR, "failed to reserve %llu clusters for "
+ "reserved pool", ext4_calculate_resv_clusters(sb));
+ goto failed_mount4a;
+ }
+
+ err = ext4_setup_system_zone(sb);
+ if (err) {
+ ext4_msg(sb, KERN_ERR, "failed to initialize system "
+ "zone (%d)", err);
+ goto failed_mount4a;
+ }
+
+ ext4_ext_init(sb);
+ err = ext4_mb_init(sb);
+ if (err) {
+ ext4_msg(sb, KERN_ERR, "failed to initialize mballoc (%d)",
+ err);
+ goto failed_mount5;
+ }
+
+ err = ext4_register_li_request(sb, first_not_zeroed);
+ if (err)
+ goto failed_mount6;
+
+ sbi->s_kobj.kset = ext4_kset;
+ init_completion(&sbi->s_kobj_unregister);
+ err = kobject_init_and_add(&sbi->s_kobj, &ext4_ktype, NULL,
+ "%s", sb->s_id);
+ if (err)
+ goto failed_mount7;
+
+#ifdef CONFIG_QUOTA
+ /* Enable quota usage during mount. */
+ if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA) &&
+ !(sb->s_flags & MS_RDONLY)) {
+ err = ext4_enable_quotas(sb);
+ if (err)
+ goto failed_mount8;
+ }
+#endif /* CONFIG_QUOTA */
+
EXT4_SB(sb)->s_mount_state |= EXT4_ORPHAN_FS;
ext4_orphan_cleanup(sb, es);
EXT4_SB(sb)->s_mount_state &= ~EXT4_ORPHAN_FS;
- if (needs_recovery)
- printk (KERN_INFO "EXT4-fs: recovery complete.\n");
- ext4_mark_recovery_complete(sb, es);
- printk (KERN_INFO "EXT4-fs: mounted filesystem with %s data mode.\n",
- test_opt(sb,DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA ? "journal":
- test_opt(sb,DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA ? "ordered":
- "writeback");
+ if (needs_recovery) {
+ ext4_msg(sb, KERN_INFO, "recovery complete");
+ ext4_mark_recovery_complete(sb, es);
+ }
+ if (EXT4_SB(sb)->s_journal) {
+ if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)
+ descr = " journalled data mode";
+ else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA)
+ descr = " ordered data mode";
+ else
+ descr = " writeback data mode";
+ } else
+ descr = "out journal";
+
+ if (test_opt(sb, DISCARD)) {
+ struct request_queue *q = bdev_get_queue(sb->s_bdev);
+ if (!blk_queue_discard(q))
+ ext4_msg(sb, KERN_WARNING,
+ "mounting with \"discard\" option, but "
+ "the device does not support discard");
+ }
- ext4_ext_init(sb);
- ext4_mb_init(sb, needs_recovery);
+ ext4_msg(sb, KERN_INFO, "mounted filesystem with%s. "
+ "Opts: %s%s%s", descr, sbi->s_es->s_mount_opts,
+ *sbi->s_es->s_mount_opts ? "; " : "", orig_data);
- lock_kernel();
+ if (es->s_error_count)
+ mod_timer(&sbi->s_err_report, jiffies + 300*HZ); /* 5 minutes */
+
+ /* Enable message ratelimiting. Default is 10 messages per 5 secs. */
+ ratelimit_state_init(&sbi->s_err_ratelimit_state, 5 * HZ, 10);
+ ratelimit_state_init(&sbi->s_warning_ratelimit_state, 5 * HZ, 10);
+ ratelimit_state_init(&sbi->s_msg_ratelimit_state, 5 * HZ, 10);
+
+ kfree(orig_data);
return 0;
cantfind_ext4:
if (!silent)
- printk(KERN_ERR "VFS: Can't find ext4 filesystem on dev %s.\n",
- sb->s_id);
+ ext4_msg(sb, KERN_ERR, "VFS: Can't find ext4 filesystem");
goto failed_mount;
+#ifdef CONFIG_QUOTA
+failed_mount8:
+ kobject_del(&sbi->s_kobj);
+#endif
+failed_mount7:
+ ext4_unregister_li_request(sb);
+failed_mount6:
+ ext4_mb_release(sb);
+failed_mount5:
+ ext4_ext_release(sb);
+ ext4_release_system_zone(sb);
+failed_mount4a:
+ dput(sb->s_root);
+ sb->s_root = NULL;
failed_mount4:
- jbd2_journal_destroy(sbi->s_journal);
+ ext4_msg(sb, KERN_ERR, "mount failed");
+ if (EXT4_SB(sb)->rsv_conversion_wq)
+ destroy_workqueue(EXT4_SB(sb)->rsv_conversion_wq);
+failed_mount_wq:
+ if (sbi->s_journal) {
+ jbd2_journal_destroy(sbi->s_journal);
+ sbi->s_journal = NULL;
+ }
failed_mount3:
- percpu_counter_destroy(&sbi->s_freeblocks_counter);
+ ext4_es_unregister_shrinker(sbi);
+ del_timer_sync(&sbi->s_err_report);
+ if (sbi->s_flex_groups)
+ ext4_kvfree(sbi->s_flex_groups);
+ percpu_counter_destroy(&sbi->s_freeclusters_counter);
percpu_counter_destroy(&sbi->s_freeinodes_counter);
percpu_counter_destroy(&sbi->s_dirs_counter);
+ percpu_counter_destroy(&sbi->s_dirtyclusters_counter);
+ percpu_counter_destroy(&sbi->s_extent_cache_cnt);
+ if (sbi->s_mmp_tsk)
+ kthread_stop(sbi->s_mmp_tsk);
failed_mount2:
for (i = 0; i < db_count; i++)
brelse(sbi->s_group_desc[i]);
- kfree(sbi->s_group_desc);
+ ext4_kvfree(sbi->s_group_desc);
failed_mount:
+ if (sbi->s_chksum_driver)
+ crypto_free_shash(sbi->s_chksum_driver);
+ if (sbi->s_proc) {
+ remove_proc_entry("options", sbi->s_proc);
+ remove_proc_entry(sb->s_id, ext4_proc_root);
+ }
#ifdef CONFIG_QUOTA
for (i = 0; i < MAXQUOTAS; i++)
kfree(sbi->s_qf_names[i]);
@@ -2343,9 +4261,11 @@ failed_mount:
brelse(bh);
out_fail:
sb->s_fs_info = NULL;
+ kfree(sbi->s_blockgroup_lock);
kfree(sbi);
- lock_kernel();
- return ret;
+out_free_orig:
+ kfree(orig_data);
+ return err ? err : ret;
}
/*
@@ -2357,18 +4277,20 @@ static void ext4_init_journal_params(struct super_block *sb, journal_t *journal)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
- if (sbi->s_commit_interval)
- journal->j_commit_interval = sbi->s_commit_interval;
- /* We could also set up an ext4-specific default for the commit
- * interval here, but for now we'll just fall back to the jbd
- * default. */
+ journal->j_commit_interval = sbi->s_commit_interval;
+ journal->j_min_batch_time = sbi->s_min_batch_time;
+ journal->j_max_batch_time = sbi->s_max_batch_time;
- spin_lock(&journal->j_state_lock);
+ write_lock(&journal->j_state_lock);
if (test_opt(sb, BARRIER))
journal->j_flags |= JBD2_BARRIER;
else
journal->j_flags &= ~JBD2_BARRIER;
- spin_unlock(&journal->j_state_lock);
+ if (test_opt(sb, DATA_ERR_ABORT))
+ journal->j_flags |= JBD2_ABORT_ON_SYNCDATA_ERR;
+ else
+ journal->j_flags &= ~JBD2_ABORT_ON_SYNCDATA_ERR;
+ write_unlock(&journal->j_state_lock);
}
static journal_t *ext4_get_journal(struct super_block *sb,
@@ -2377,33 +4299,35 @@ static journal_t *ext4_get_journal(struct super_block *sb,
struct inode *journal_inode;
journal_t *journal;
+ BUG_ON(!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL));
+
/* First, test for the existence of a valid inode on disk. Bad
* things happen if we iget() an unused inode, as the subsequent
* iput() will try to delete it. */
journal_inode = ext4_iget(sb, journal_inum);
if (IS_ERR(journal_inode)) {
- printk(KERN_ERR "EXT4-fs: no journal found.\n");
+ ext4_msg(sb, KERN_ERR, "no journal found");
return NULL;
}
if (!journal_inode->i_nlink) {
make_bad_inode(journal_inode);
iput(journal_inode);
- printk(KERN_ERR "EXT4-fs: journal inode is deleted.\n");
+ ext4_msg(sb, KERN_ERR, "journal inode is deleted");
return NULL;
}
- jbd_debug(2, "Journal inode found at %p: %Ld bytes\n",
+ jbd_debug(2, "Journal inode found at %p: %lld bytes\n",
journal_inode, journal_inode->i_size);
if (!S_ISREG(journal_inode->i_mode)) {
- printk(KERN_ERR "EXT4-fs: invalid journal inode.\n");
+ ext4_msg(sb, KERN_ERR, "invalid journal inode");
iput(journal_inode);
return NULL;
}
journal = jbd2_journal_init_inode(journal_inode);
if (!journal) {
- printk(KERN_ERR "EXT4-fs: Could not load journal inode\n");
+ ext4_msg(sb, KERN_ERR, "Could not load journal inode");
iput(journal_inode);
return NULL;
}
@@ -2415,32 +4339,27 @@ static journal_t *ext4_get_journal(struct super_block *sb,
static journal_t *ext4_get_dev_journal(struct super_block *sb,
dev_t j_dev)
{
- struct buffer_head * bh;
+ struct buffer_head *bh;
journal_t *journal;
ext4_fsblk_t start;
ext4_fsblk_t len;
int hblock, blocksize;
ext4_fsblk_t sb_block;
unsigned long offset;
- struct ext4_super_block * es;
+ struct ext4_super_block *es;
struct block_device *bdev;
- bdev = ext4_blkdev_get(j_dev);
- if (bdev == NULL)
- return NULL;
+ BUG_ON(!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL));
- if (bd_claim(bdev, sb)) {
- printk(KERN_ERR
- "EXT4: failed to claim external journal device.\n");
- blkdev_put(bdev);
+ bdev = ext4_blkdev_get(j_dev, sb);
+ if (bdev == NULL)
return NULL;
- }
blocksize = sb->s_blocksize;
- hblock = bdev_hardsect_size(bdev);
+ hblock = bdev_logical_block_size(bdev);
if (blocksize < hblock) {
- printk(KERN_ERR
- "EXT4-fs: blocksize too small for journal device.\n");
+ ext4_msg(sb, KERN_ERR,
+ "blocksize too small for journal device");
goto out_bdev;
}
@@ -2448,23 +4367,23 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb,
offset = EXT4_MIN_BLOCK_SIZE % blocksize;
set_blocksize(bdev, blocksize);
if (!(bh = __bread(bdev, sb_block, blocksize))) {
- printk(KERN_ERR "EXT4-fs: couldn't read superblock of "
- "external journal\n");
+ ext4_msg(sb, KERN_ERR, "couldn't read superblock of "
+ "external journal");
goto out_bdev;
}
- es = (struct ext4_super_block *) (((char *)bh->b_data) + offset);
+ es = (struct ext4_super_block *) (bh->b_data + offset);
if ((le16_to_cpu(es->s_magic) != EXT4_SUPER_MAGIC) ||
!(le32_to_cpu(es->s_feature_incompat) &
EXT4_FEATURE_INCOMPAT_JOURNAL_DEV)) {
- printk(KERN_ERR "EXT4-fs: external journal has "
- "bad superblock\n");
+ ext4_msg(sb, KERN_ERR, "external journal has "
+ "bad superblock");
brelse(bh);
goto out_bdev;
}
if (memcmp(EXT4_SB(sb)->s_es->s_journal_uuid, es->s_uuid, 16)) {
- printk(KERN_ERR "EXT4-fs: journal UUID does not match\n");
+ ext4_msg(sb, KERN_ERR, "journal UUID does not match");
brelse(bh);
goto out_bdev;
}
@@ -2476,25 +4395,26 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb,
journal = jbd2_journal_init_dev(bdev, sb->s_bdev,
start, len, blocksize);
if (!journal) {
- printk(KERN_ERR "EXT4-fs: failed to create device journal\n");
+ ext4_msg(sb, KERN_ERR, "failed to create device journal");
goto out_bdev;
}
journal->j_private = sb;
- ll_rw_block(READ, 1, &journal->j_sb_buffer);
+ ll_rw_block(READ | REQ_META | REQ_PRIO, 1, &journal->j_sb_buffer);
wait_on_buffer(journal->j_sb_buffer);
if (!buffer_uptodate(journal->j_sb_buffer)) {
- printk(KERN_ERR "EXT4-fs: I/O error on journal device\n");
+ ext4_msg(sb, KERN_ERR, "I/O error on journal device");
goto out_journal;
}
if (be32_to_cpu(journal->j_superblock->s_nr_users) != 1) {
- printk(KERN_ERR "EXT4-fs: External journal has more than one "
- "user (unsupported) - %d\n",
+ ext4_msg(sb, KERN_ERR, "External journal has more than one "
+ "user (unsupported) - %d",
be32_to_cpu(journal->j_superblock->s_nr_users));
goto out_journal;
}
EXT4_SB(sb)->journal_bdev = bdev;
ext4_init_journal_params(sb, journal);
return journal;
+
out_journal:
jbd2_journal_destroy(journal);
out_bdev:
@@ -2512,10 +4432,12 @@ static int ext4_load_journal(struct super_block *sb,
int err = 0;
int really_read_only;
+ BUG_ON(!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL));
+
if (journal_devnum &&
journal_devnum != le32_to_cpu(es->s_journal_dev)) {
- printk(KERN_INFO "EXT4-fs: external journal device major/minor "
- "numbers have changed\n");
+ ext4_msg(sb, KERN_INFO, "external journal device major/minor "
+ "numbers have changed");
journal_dev = new_decode_dev(journal_devnum);
} else
journal_dev = new_decode_dev(le32_to_cpu(es->s_journal_dev));
@@ -2527,24 +4449,23 @@ static int ext4_load_journal(struct super_block *sb,
* crash? For recovery, we need to check in advance whether we
* can get read-write access to the device.
*/
-
if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) {
if (sb->s_flags & MS_RDONLY) {
- printk(KERN_INFO "EXT4-fs: INFO: recovery "
- "required on readonly filesystem.\n");
+ ext4_msg(sb, KERN_INFO, "INFO: recovery "
+ "required on readonly filesystem");
if (really_read_only) {
- printk(KERN_ERR "EXT4-fs: write access "
- "unavailable, cannot proceed.\n");
+ ext4_msg(sb, KERN_ERR, "write access "
+ "unavailable, cannot proceed");
return -EROFS;
}
- printk (KERN_INFO "EXT4-fs: write access will "
- "be enabled during recovery.\n");
+ ext4_msg(sb, KERN_INFO, "write access will "
+ "be enabled during recovery");
}
}
if (journal_inum && journal_dev) {
- printk(KERN_ERR "EXT4-fs: filesystem has both journal "
- "and inode journals!\n");
+ ext4_msg(sb, KERN_ERR, "filesystem has both journal "
+ "and inode journals!");
return -EINVAL;
}
@@ -2556,22 +4477,25 @@ static int ext4_load_journal(struct super_block *sb,
return -EINVAL;
}
- if (!really_read_only && test_opt(sb, UPDATE_JOURNAL)) {
- err = jbd2_journal_update_format(journal);
- if (err) {
- printk(KERN_ERR "EXT4-fs: error updating journal.\n");
- jbd2_journal_destroy(journal);
- return err;
- }
- }
+ if (!(journal->j_flags & JBD2_BARRIER))
+ ext4_msg(sb, KERN_INFO, "barriers disabled");
if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER))
err = jbd2_journal_wipe(journal, !really_read_only);
- if (!err)
+ if (!err) {
+ char *save = kmalloc(EXT4_S_ERR_LEN, GFP_KERNEL);
+ if (save)
+ memcpy(save, ((char *) es) +
+ EXT4_S_ERR_START, EXT4_S_ERR_LEN);
err = jbd2_journal_load(journal);
+ if (save)
+ memcpy(((char *) es) + EXT4_S_ERR_START,
+ save, EXT4_S_ERR_LEN);
+ kfree(save);
+ }
if (err) {
- printk(KERN_ERR "EXT4-fs: error loading journal.\n");
+ ext4_msg(sb, KERN_ERR, "error loading journal");
jbd2_journal_destroy(journal);
return err;
}
@@ -2579,98 +4503,109 @@ static int ext4_load_journal(struct super_block *sb,
EXT4_SB(sb)->s_journal = journal;
ext4_clear_journal_err(sb, es);
- if (journal_devnum &&
+ if (!really_read_only && journal_devnum &&
journal_devnum != le32_to_cpu(es->s_journal_dev)) {
es->s_journal_dev = cpu_to_le32(journal_devnum);
- sb->s_dirt = 1;
/* Make sure we flush the recovery flag to disk. */
- ext4_commit_super(sb, es, 1);
- }
-
- return 0;
-}
-
-static int ext4_create_journal(struct super_block * sb,
- struct ext4_super_block * es,
- unsigned int journal_inum)
-{
- journal_t *journal;
- int err;
-
- if (sb->s_flags & MS_RDONLY) {
- printk(KERN_ERR "EXT4-fs: readonly filesystem when trying to "
- "create journal.\n");
- return -EROFS;
- }
-
- journal = ext4_get_journal(sb, journal_inum);
- if (!journal)
- return -EINVAL;
-
- printk(KERN_INFO "EXT4-fs: creating new journal on inode %u\n",
- journal_inum);
-
- err = jbd2_journal_create(journal);
- if (err) {
- printk(KERN_ERR "EXT4-fs: error creating journal.\n");
- jbd2_journal_destroy(journal);
- return -EIO;
+ ext4_commit_super(sb, 1);
}
- EXT4_SB(sb)->s_journal = journal;
-
- ext4_update_dynamic_rev(sb);
- EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
- EXT4_SET_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL);
-
- es->s_journal_inum = cpu_to_le32(journal_inum);
- sb->s_dirt = 1;
-
- /* Make sure we flush the recovery flag to disk. */
- ext4_commit_super(sb, es, 1);
-
return 0;
}
-static void ext4_commit_super (struct super_block * sb,
- struct ext4_super_block * es,
- int sync)
+static int ext4_commit_super(struct super_block *sb, int sync)
{
+ struct ext4_super_block *es = EXT4_SB(sb)->s_es;
struct buffer_head *sbh = EXT4_SB(sb)->s_sbh;
+ int error = 0;
- if (!sbh)
- return;
- es->s_wtime = cpu_to_le32(get_seconds());
- ext4_free_blocks_count_set(es, ext4_count_free_blocks(sb));
- es->s_free_inodes_count = cpu_to_le32(ext4_count_free_inodes(sb));
+ if (!sbh || block_device_ejected(sb))
+ return error;
+ if (buffer_write_io_error(sbh)) {
+ /*
+ * Oh, dear. A previous attempt to write the
+ * superblock failed. This could happen because the
+ * USB device was yanked out. Or it could happen to
+ * be a transient write error and maybe the block will
+ * be remapped. Nothing we can do but to retry the
+ * write and hope for the best.
+ */
+ ext4_msg(sb, KERN_ERR, "previous I/O error to "
+ "superblock detected");
+ clear_buffer_write_io_error(sbh);
+ set_buffer_uptodate(sbh);
+ }
+ /*
+ * If the file system is mounted read-only, don't update the
+ * superblock write time. This avoids updating the superblock
+ * write time when we are mounting the root file system
+ * read/only but we need to replay the journal; at that point,
+ * for people who are east of GMT and who make their clock
+ * tick in localtime for Windows bug-for-bug compatibility,
+ * the clock is set in the future, and this will cause e2fsck
+ * to complain and force a full file system check.
+ */
+ if (!(sb->s_flags & MS_RDONLY))
+ es->s_wtime = cpu_to_le32(get_seconds());
+ if (sb->s_bdev->bd_part)
+ es->s_kbytes_written =
+ cpu_to_le64(EXT4_SB(sb)->s_kbytes_written +
+ ((part_stat_read(sb->s_bdev->bd_part, sectors[1]) -
+ EXT4_SB(sb)->s_sectors_written_start) >> 1));
+ else
+ es->s_kbytes_written =
+ cpu_to_le64(EXT4_SB(sb)->s_kbytes_written);
+ ext4_free_blocks_count_set(es,
+ EXT4_C2B(EXT4_SB(sb), percpu_counter_sum_positive(
+ &EXT4_SB(sb)->s_freeclusters_counter)));
+ es->s_free_inodes_count =
+ cpu_to_le32(percpu_counter_sum_positive(
+ &EXT4_SB(sb)->s_freeinodes_counter));
BUFFER_TRACE(sbh, "marking dirty");
+ ext4_superblock_csum_set(sb);
mark_buffer_dirty(sbh);
- if (sync)
- sync_dirty_buffer(sbh);
+ if (sync) {
+ error = sync_dirty_buffer(sbh);
+ if (error)
+ return error;
+
+ error = buffer_write_io_error(sbh);
+ if (error) {
+ ext4_msg(sb, KERN_ERR, "I/O error while writing "
+ "superblock");
+ clear_buffer_write_io_error(sbh);
+ set_buffer_uptodate(sbh);
+ }
+ }
+ return error;
}
-
/*
* Have we just finished recovery? If so, and if we are mounting (or
* remounting) the filesystem readonly, then we will end up with a
* consistent fs on disk. Record that fact.
*/
-static void ext4_mark_recovery_complete(struct super_block * sb,
- struct ext4_super_block * es)
+static void ext4_mark_recovery_complete(struct super_block *sb,
+ struct ext4_super_block *es)
{
journal_t *journal = EXT4_SB(sb)->s_journal;
+ if (!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)) {
+ BUG_ON(journal != NULL);
+ return;
+ }
jbd2_journal_lock_updates(journal);
- jbd2_journal_flush(journal);
- lock_super(sb);
+ if (jbd2_journal_flush(journal) < 0)
+ goto out;
+
if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER) &&
sb->s_flags & MS_RDONLY) {
EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
- sb->s_dirt = 0;
- ext4_commit_super(sb, es, 1);
+ ext4_commit_super(sb, 1);
}
- unlock_super(sb);
+
+out:
jbd2_journal_unlock_updates(journal);
}
@@ -2679,13 +4614,15 @@ static void ext4_mark_recovery_complete(struct super_block * sb,
* has recorded an error from a previous lifetime, move that error to the
* main filesystem now.
*/
-static void ext4_clear_journal_err(struct super_block * sb,
- struct ext4_super_block * es)
+static void ext4_clear_journal_err(struct super_block *sb,
+ struct ext4_super_block *es)
{
journal_t *journal;
int j_errno;
const char *errstr;
+ BUG_ON(!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL));
+
journal = EXT4_SB(sb)->s_journal;
/*
@@ -2698,16 +4635,16 @@ static void ext4_clear_journal_err(struct super_block * sb,
char nbuf[16];
errstr = ext4_decode_error(sb, j_errno, nbuf);
- ext4_warning(sb, __FUNCTION__, "Filesystem error recorded "
+ ext4_warning(sb, "Filesystem error recorded "
"from previous mount: %s", errstr);
- ext4_warning(sb, __FUNCTION__, "Marking fs in need of "
- "filesystem check.");
+ ext4_warning(sb, "Marking fs in need of filesystem check.");
EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
- ext4_commit_super (sb, es, 1);
+ ext4_commit_super(sb, 1);
jbd2_journal_clear_err(journal);
+ jbd2_journal_update_sb_errno(journal);
}
}
@@ -2718,132 +4655,226 @@ static void ext4_clear_journal_err(struct super_block * sb,
int ext4_force_commit(struct super_block *sb)
{
journal_t *journal;
- int ret;
if (sb->s_flags & MS_RDONLY)
return 0;
journal = EXT4_SB(sb)->s_journal;
- sb->s_dirt = 0;
- ret = ext4_journal_force_commit(journal);
- return ret;
-}
-
-/*
- * Ext4 always journals updates to the superblock itself, so we don't
- * have to propagate any other updates to the superblock on disk at this
- * point. Just start an async writeback to get the buffers on their way
- * to the disk.
- *
- * This implicitly triggers the writebehind on sync().
- */
-
-static void ext4_write_super (struct super_block * sb)
-{
- if (mutex_trylock(&sb->s_lock) != 0)
- BUG();
- sb->s_dirt = 0;
+ return ext4_journal_force_commit(journal);
}
static int ext4_sync_fs(struct super_block *sb, int wait)
{
+ int ret = 0;
tid_t target;
+ bool needs_barrier = false;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+ trace_ext4_sync_fs(sb, wait);
+ flush_workqueue(sbi->rsv_conversion_wq);
+ /*
+ * Writeback quota in non-journalled quota case - journalled quota has
+ * no dirty dquots
+ */
+ dquot_writeback_dquots(sb, -1);
+ /*
+ * Data writeback is possible w/o journal transaction, so barrier must
+ * being sent at the end of the function. But we can skip it if
+ * transaction_commit will do it for us.
+ */
+ target = jbd2_get_latest_transaction(sbi->s_journal);
+ if (wait && sbi->s_journal->j_flags & JBD2_BARRIER &&
+ !jbd2_trans_will_send_data_barrier(sbi->s_journal, target))
+ needs_barrier = true;
- sb->s_dirt = 0;
- if (jbd2_journal_start_commit(EXT4_SB(sb)->s_journal, &target)) {
+ if (jbd2_journal_start_commit(sbi->s_journal, &target)) {
if (wait)
- jbd2_log_wait_commit(EXT4_SB(sb)->s_journal, target);
+ ret = jbd2_log_wait_commit(sbi->s_journal, target);
}
- return 0;
+ if (needs_barrier) {
+ int err;
+ err = blkdev_issue_flush(sb->s_bdev, GFP_KERNEL, NULL);
+ if (!ret)
+ ret = err;
+ }
+
+ return ret;
+}
+
+static int ext4_sync_fs_nojournal(struct super_block *sb, int wait)
+{
+ int ret = 0;
+
+ trace_ext4_sync_fs(sb, wait);
+ flush_workqueue(EXT4_SB(sb)->rsv_conversion_wq);
+ dquot_writeback_dquots(sb, -1);
+ if (wait && test_opt(sb, BARRIER))
+ ret = blkdev_issue_flush(sb->s_bdev, GFP_KERNEL, NULL);
+
+ return ret;
}
/*
* LVM calls this function before a (read-only) snapshot is created. This
* gives us a chance to flush the journal completely and mark the fs clean.
+ *
+ * Note that only this function cannot bring a filesystem to be in a clean
+ * state independently. It relies on upper layer to stop all data & metadata
+ * modifications.
*/
-static void ext4_write_super_lockfs(struct super_block *sb)
+static int ext4_freeze(struct super_block *sb)
{
- sb->s_dirt = 0;
+ int error = 0;
+ journal_t *journal;
- if (!(sb->s_flags & MS_RDONLY)) {
- journal_t *journal = EXT4_SB(sb)->s_journal;
+ if (sb->s_flags & MS_RDONLY)
+ return 0;
- /* Now we set up the journal barrier. */
- jbd2_journal_lock_updates(journal);
- jbd2_journal_flush(journal);
+ journal = EXT4_SB(sb)->s_journal;
- /* Journal blocked and flushed, clear needs_recovery flag. */
- EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
- ext4_commit_super(sb, EXT4_SB(sb)->s_es, 1);
- }
+ /* Now we set up the journal barrier. */
+ jbd2_journal_lock_updates(journal);
+
+ /*
+ * Don't clear the needs_recovery flag if we failed to flush
+ * the journal.
+ */
+ error = jbd2_journal_flush(journal);
+ if (error < 0)
+ goto out;
+
+ /* Journal blocked and flushed, clear needs_recovery flag. */
+ EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
+ error = ext4_commit_super(sb, 1);
+out:
+ /* we rely on upper layer to stop further updates */
+ jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
+ return error;
}
/*
* Called by LVM after the snapshot is done. We need to reset the RECOVER
* flag here, even though the filesystem is not technically dirty yet.
*/
-static void ext4_unlockfs(struct super_block *sb)
+static int ext4_unfreeze(struct super_block *sb)
{
- if (!(sb->s_flags & MS_RDONLY)) {
- lock_super(sb);
- /* Reser the needs_recovery flag before the fs is unlocked. */
- EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
- ext4_commit_super(sb, EXT4_SB(sb)->s_es, 1);
- unlock_super(sb);
- jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
- }
+ if (sb->s_flags & MS_RDONLY)
+ return 0;
+
+ /* Reset the needs_recovery flag before the fs is unlocked. */
+ EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
+ ext4_commit_super(sb, 1);
+ return 0;
}
-static int ext4_remount (struct super_block * sb, int * flags, char * data)
+/*
+ * Structure to save mount options for ext4_remount's benefit
+ */
+struct ext4_mount_options {
+ unsigned long s_mount_opt;
+ unsigned long s_mount_opt2;
+ kuid_t s_resuid;
+ kgid_t s_resgid;
+ unsigned long s_commit_interval;
+ u32 s_min_batch_time, s_max_batch_time;
+#ifdef CONFIG_QUOTA
+ int s_jquota_fmt;
+ char *s_qf_names[MAXQUOTAS];
+#endif
+};
+
+static int ext4_remount(struct super_block *sb, int *flags, char *data)
{
- struct ext4_super_block * es;
+ struct ext4_super_block *es;
struct ext4_sb_info *sbi = EXT4_SB(sb);
- ext4_fsblk_t n_blocks_count = 0;
unsigned long old_sb_flags;
struct ext4_mount_options old_opts;
- int err;
+ int enable_quota = 0;
+ ext4_group_t g;
+ unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
+ int err = 0;
#ifdef CONFIG_QUOTA
- int i;
+ int i, j;
#endif
+ char *orig_data = kstrdup(data, GFP_KERNEL);
/* Store the original options */
old_sb_flags = sb->s_flags;
old_opts.s_mount_opt = sbi->s_mount_opt;
+ old_opts.s_mount_opt2 = sbi->s_mount_opt2;
old_opts.s_resuid = sbi->s_resuid;
old_opts.s_resgid = sbi->s_resgid;
old_opts.s_commit_interval = sbi->s_commit_interval;
+ old_opts.s_min_batch_time = sbi->s_min_batch_time;
+ old_opts.s_max_batch_time = sbi->s_max_batch_time;
#ifdef CONFIG_QUOTA
old_opts.s_jquota_fmt = sbi->s_jquota_fmt;
for (i = 0; i < MAXQUOTAS; i++)
- old_opts.s_qf_names[i] = sbi->s_qf_names[i];
+ if (sbi->s_qf_names[i]) {
+ old_opts.s_qf_names[i] = kstrdup(sbi->s_qf_names[i],
+ GFP_KERNEL);
+ if (!old_opts.s_qf_names[i]) {
+ for (j = 0; j < i; j++)
+ kfree(old_opts.s_qf_names[j]);
+ kfree(orig_data);
+ return -ENOMEM;
+ }
+ } else
+ old_opts.s_qf_names[i] = NULL;
#endif
+ if (sbi->s_journal && sbi->s_journal->j_task->io_context)
+ journal_ioprio = sbi->s_journal->j_task->io_context->ioprio;
/*
* Allow the "check" option to be passed as a remount option.
*/
- if (!parse_options(data, sb, NULL, NULL, &n_blocks_count, 1)) {
+ if (!parse_options(data, sb, NULL, &journal_ioprio, 1)) {
err = -EINVAL;
goto restore_opts;
}
- if (sbi->s_mount_opt & EXT4_MOUNT_ABORT)
- ext4_abort(sb, __FUNCTION__, "Abort forced by user");
+ if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
+ if (test_opt2(sb, EXPLICIT_DELALLOC)) {
+ ext4_msg(sb, KERN_ERR, "can't mount with "
+ "both data=journal and delalloc");
+ err = -EINVAL;
+ goto restore_opts;
+ }
+ if (test_opt(sb, DIOREAD_NOLOCK)) {
+ ext4_msg(sb, KERN_ERR, "can't mount with "
+ "both data=journal and dioread_nolock");
+ err = -EINVAL;
+ goto restore_opts;
+ }
+ }
+
+ if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED)
+ ext4_abort(sb, "Abort forced by user");
sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
- ((sbi->s_mount_opt & EXT4_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0);
+ (test_opt(sb, POSIX_ACL) ? MS_POSIXACL : 0);
es = sbi->s_es;
- ext4_init_journal_params(sb, sbi->s_journal);
+ if (sbi->s_journal) {
+ ext4_init_journal_params(sb, sbi->s_journal);
+ set_task_ioprio(sbi->s_journal->j_task, journal_ioprio);
+ }
- if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY) ||
- n_blocks_count > ext4_blocks_count(es)) {
- if (sbi->s_mount_opt & EXT4_MOUNT_ABORT) {
+ if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY)) {
+ if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED) {
err = -EROFS;
goto restore_opts;
}
if (*flags & MS_RDONLY) {
+ err = sync_filesystem(sb);
+ if (err < 0)
+ goto restore_opts;
+ err = dquot_suspend(sb, -1);
+ if (err < 0)
+ goto restore_opts;
+
/*
* First of all, the unconditional stuff we have to do
* to disable replay of the journal when we next remount
@@ -2859,24 +4890,31 @@ static int ext4_remount (struct super_block * sb, int * flags, char * data)
(sbi->s_mount_state & EXT4_VALID_FS))
es->s_state = cpu_to_le16(sbi->s_mount_state);
- /*
- * We have to unlock super so that we can wait for
- * transactions.
- */
- unlock_super(sb);
- ext4_mark_recovery_complete(sb, es);
- lock_super(sb);
+ if (sbi->s_journal)
+ ext4_mark_recovery_complete(sb, es);
} else {
- __le32 ret;
- if ((ret = EXT4_HAS_RO_COMPAT_FEATURE(sb,
- ~EXT4_FEATURE_RO_COMPAT_SUPP))) {
- printk(KERN_WARNING "EXT4-fs: %s: couldn't "
- "remount RDWR because of unsupported "
- "optional features (%x).\n",
- sb->s_id, le32_to_cpu(ret));
+ /* Make sure we can mount this feature set readwrite */
+ if (!ext4_feature_set_ok(sb, 0)) {
err = -EROFS;
goto restore_opts;
}
+ /*
+ * Make sure the group descriptor checksums
+ * are sane. If they aren't, refuse to remount r/w.
+ */
+ for (g = 0; g < sbi->s_groups_count; g++) {
+ struct ext4_group_desc *gdp =
+ ext4_get_group_desc(sb, g, NULL);
+
+ if (!ext4_group_desc_csum_verify(sb, g, gdp)) {
+ ext4_msg(sb, KERN_ERR,
+ "ext4_remount: Checksum for group %u failed (%u!=%u)",
+ g, le16_to_cpu(ext4_group_desc_csum(sbi, g, gdp)),
+ le16_to_cpu(gdp->bg_checksum));
+ err = -EINVAL;
+ goto restore_opts;
+ }
+ }
/*
* If we have an unprocessed orphan list hanging
@@ -2884,11 +4922,10 @@ static int ext4_remount (struct super_block * sb, int * flags, char * data)
* require a full umount/remount for now.
*/
if (es->s_last_orphan) {
- printk(KERN_WARNING "EXT4-fs: %s: couldn't "
+ ext4_msg(sb, KERN_WARNING, "Couldn't "
"remount RDWR because of unprocessed "
"orphan inode list. Please "
- "umount/remount instead.\n",
- sb->s_id);
+ "umount/remount instead");
err = -EINVAL;
goto restore_opts;
}
@@ -2899,112 +4936,119 @@ static int ext4_remount (struct super_block * sb, int * flags, char * data)
* been changed by e2fsck since we originally mounted
* the partition.)
*/
- ext4_clear_journal_err(sb, es);
+ if (sbi->s_journal)
+ ext4_clear_journal_err(sb, es);
sbi->s_mount_state = le16_to_cpu(es->s_state);
- if ((err = ext4_group_extend(sb, es, n_blocks_count)))
- goto restore_opts;
- if (!ext4_setup_super (sb, es, 0))
+ if (!ext4_setup_super(sb, es, 0))
sb->s_flags &= ~MS_RDONLY;
+ if (EXT4_HAS_INCOMPAT_FEATURE(sb,
+ EXT4_FEATURE_INCOMPAT_MMP))
+ if (ext4_multi_mount_protect(sb,
+ le64_to_cpu(es->s_mmp_block))) {
+ err = -EROFS;
+ goto restore_opts;
+ }
+ enable_quota = 1;
}
}
+
+ /*
+ * Reinitialize lazy itable initialization thread based on
+ * current settings
+ */
+ if ((sb->s_flags & MS_RDONLY) || !test_opt(sb, INIT_INODE_TABLE))
+ ext4_unregister_li_request(sb);
+ else {
+ ext4_group_t first_not_zeroed;
+ first_not_zeroed = ext4_has_uninit_itable(sb);
+ ext4_register_li_request(sb, first_not_zeroed);
+ }
+
+ ext4_setup_system_zone(sb);
+ if (sbi->s_journal == NULL && !(old_sb_flags & MS_RDONLY))
+ ext4_commit_super(sb, 1);
+
#ifdef CONFIG_QUOTA
/* Release old quota file names */
for (i = 0; i < MAXQUOTAS; i++)
- if (old_opts.s_qf_names[i] &&
- old_opts.s_qf_names[i] != sbi->s_qf_names[i])
- kfree(old_opts.s_qf_names[i]);
+ kfree(old_opts.s_qf_names[i]);
+ if (enable_quota) {
+ if (sb_any_quota_suspended(sb))
+ dquot_resume(sb, -1);
+ else if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
+ EXT4_FEATURE_RO_COMPAT_QUOTA)) {
+ err = ext4_enable_quotas(sb);
+ if (err)
+ goto restore_opts;
+ }
+ }
#endif
+
+ ext4_msg(sb, KERN_INFO, "re-mounted. Opts: %s", orig_data);
+ kfree(orig_data);
return 0;
+
restore_opts:
sb->s_flags = old_sb_flags;
sbi->s_mount_opt = old_opts.s_mount_opt;
+ sbi->s_mount_opt2 = old_opts.s_mount_opt2;
sbi->s_resuid = old_opts.s_resuid;
sbi->s_resgid = old_opts.s_resgid;
sbi->s_commit_interval = old_opts.s_commit_interval;
+ sbi->s_min_batch_time = old_opts.s_min_batch_time;
+ sbi->s_max_batch_time = old_opts.s_max_batch_time;
#ifdef CONFIG_QUOTA
sbi->s_jquota_fmt = old_opts.s_jquota_fmt;
for (i = 0; i < MAXQUOTAS; i++) {
- if (sbi->s_qf_names[i] &&
- old_opts.s_qf_names[i] != sbi->s_qf_names[i])
- kfree(sbi->s_qf_names[i]);
+ kfree(sbi->s_qf_names[i]);
sbi->s_qf_names[i] = old_opts.s_qf_names[i];
}
#endif
+ kfree(orig_data);
return err;
}
-static int ext4_statfs (struct dentry * dentry, struct kstatfs * buf)
+static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
{
struct super_block *sb = dentry->d_sb;
struct ext4_sb_info *sbi = EXT4_SB(sb);
struct ext4_super_block *es = sbi->s_es;
+ ext4_fsblk_t overhead = 0, resv_blocks;
u64 fsid;
+ s64 bfree;
+ resv_blocks = EXT4_C2B(sbi, atomic64_read(&sbi->s_resv_clusters));
- if (test_opt(sb, MINIX_DF)) {
- sbi->s_overhead_last = 0;
- } else if (sbi->s_blocks_last != ext4_blocks_count(es)) {
- ext4_group_t ngroups = sbi->s_groups_count, i;
- ext4_fsblk_t overhead = 0;
- smp_rmb();
-
- /*
- * Compute the overhead (FS structures). This is constant
- * for a given filesystem unless the number of block groups
- * changes so we cache the previous value until it does.
- */
-
- /*
- * All of the blocks before first_data_block are
- * overhead
- */
- overhead = le32_to_cpu(es->s_first_data_block);
-
- /*
- * Add the overhead attributed to the superblock and
- * block group descriptors. If the sparse superblocks
- * feature is turned on, then not all groups have this.
- */
- for (i = 0; i < ngroups; i++) {
- overhead += ext4_bg_has_super(sb, i) +
- ext4_bg_num_gdb(sb, i);
- cond_resched();
- }
-
- /*
- * Every block group has an inode bitmap, a block
- * bitmap, and an inode table.
- */
- overhead += ngroups * (2 + sbi->s_itb_per_group);
- sbi->s_overhead_last = overhead;
- smp_wmb();
- sbi->s_blocks_last = ext4_blocks_count(es);
- }
+ if (!test_opt(sb, MINIX_DF))
+ overhead = sbi->s_overhead;
buf->f_type = EXT4_SUPER_MAGIC;
buf->f_bsize = sb->s_blocksize;
- buf->f_blocks = ext4_blocks_count(es) - sbi->s_overhead_last;
- buf->f_bfree = percpu_counter_sum_positive(&sbi->s_freeblocks_counter);
- ext4_free_blocks_count_set(es, buf->f_bfree);
- buf->f_bavail = buf->f_bfree - ext4_r_blocks_count(es);
- if (buf->f_bfree < ext4_r_blocks_count(es))
+ buf->f_blocks = ext4_blocks_count(es) - EXT4_C2B(sbi, overhead);
+ bfree = percpu_counter_sum_positive(&sbi->s_freeclusters_counter) -
+ percpu_counter_sum_positive(&sbi->s_dirtyclusters_counter);
+ /* prevent underflow in case that few free space is available */
+ buf->f_bfree = EXT4_C2B(sbi, max_t(s64, bfree, 0));
+ buf->f_bavail = buf->f_bfree -
+ (ext4_r_blocks_count(es) + resv_blocks);
+ if (buf->f_bfree < (ext4_r_blocks_count(es) + resv_blocks))
buf->f_bavail = 0;
buf->f_files = le32_to_cpu(es->s_inodes_count);
buf->f_ffree = percpu_counter_sum_positive(&sbi->s_freeinodes_counter);
- es->s_free_inodes_count = cpu_to_le32(buf->f_ffree);
buf->f_namelen = EXT4_NAME_LEN;
fsid = le64_to_cpup((void *)es->s_uuid) ^
le64_to_cpup((void *)es->s_uuid + sizeof(u64));
buf->f_fsid.val[0] = fsid & 0xFFFFFFFFUL;
buf->f_fsid.val[1] = (fsid >> 32) & 0xFFFFFFFFUL;
+
return 0;
}
-/* Helper function for writing quotas on sync - we need to start transaction before quota file
- * is locked for write. Otherwise the are possible deadlocks:
+/* Helper function for writing quotas on sync - we need to start transaction
+ * before quota file is locked for write. Otherwise the are possible deadlocks:
* Process 1 Process 2
* ext4_create() quota_sync()
- * jbd2_journal_start() write_dquot()
- * DQUOT_INIT() down(dqio_mutex)
+ * jbd2_journal_start() write_dquot()
+ * dquot_initialize() down(dqio_mutex)
* down(dqio_mutex) jbd2_journal_start()
*
*/
@@ -3013,39 +5057,7 @@ static int ext4_statfs (struct dentry * dentry, struct kstatfs * buf)
static inline struct inode *dquot_to_inode(struct dquot *dquot)
{
- return sb_dqopt(dquot->dq_sb)->files[dquot->dq_type];
-}
-
-static int ext4_dquot_initialize(struct inode *inode, int type)
-{
- handle_t *handle;
- int ret, err;
-
- /* We may create quota structure so we need to reserve enough blocks */
- handle = ext4_journal_start(inode, 2*EXT4_QUOTA_INIT_BLOCKS(inode->i_sb));
- if (IS_ERR(handle))
- return PTR_ERR(handle);
- ret = dquot_initialize(inode, type);
- err = ext4_journal_stop(handle);
- if (!ret)
- ret = err;
- return ret;
-}
-
-static int ext4_dquot_drop(struct inode *inode)
-{
- handle_t *handle;
- int ret, err;
-
- /* We may delete quota structure so we need to reserve enough blocks */
- handle = ext4_journal_start(inode, 2*EXT4_QUOTA_DEL_BLOCKS(inode->i_sb));
- if (IS_ERR(handle))
- return PTR_ERR(handle);
- ret = dquot_drop(inode);
- err = ext4_journal_stop(handle);
- if (!ret)
- ret = err;
- return ret;
+ return sb_dqopt(dquot->dq_sb)->files[dquot->dq_id.type];
}
static int ext4_write_dquot(struct dquot *dquot)
@@ -3055,8 +5067,8 @@ static int ext4_write_dquot(struct dquot *dquot)
struct inode *inode;
inode = dquot_to_inode(dquot);
- handle = ext4_journal_start(inode,
- EXT4_QUOTA_TRANS_BLOCKS(dquot->dq_sb));
+ handle = ext4_journal_start(inode, EXT4_HT_QUOTA,
+ EXT4_QUOTA_TRANS_BLOCKS(dquot->dq_sb));
if (IS_ERR(handle))
return PTR_ERR(handle);
ret = dquot_commit(dquot);
@@ -3071,8 +5083,8 @@ static int ext4_acquire_dquot(struct dquot *dquot)
int ret, err;
handle_t *handle;
- handle = ext4_journal_start(dquot_to_inode(dquot),
- EXT4_QUOTA_INIT_BLOCKS(dquot->dq_sb));
+ handle = ext4_journal_start(dquot_to_inode(dquot), EXT4_HT_QUOTA,
+ EXT4_QUOTA_INIT_BLOCKS(dquot->dq_sb));
if (IS_ERR(handle))
return PTR_ERR(handle);
ret = dquot_acquire(dquot);
@@ -3087,8 +5099,8 @@ static int ext4_release_dquot(struct dquot *dquot)
int ret, err;
handle_t *handle;
- handle = ext4_journal_start(dquot_to_inode(dquot),
- EXT4_QUOTA_DEL_BLOCKS(dquot->dq_sb));
+ handle = ext4_journal_start(dquot_to_inode(dquot), EXT4_HT_QUOTA,
+ EXT4_QUOTA_DEL_BLOCKS(dquot->dq_sb));
if (IS_ERR(handle)) {
/* Release dquot anyway to avoid endless cycle in dqput() */
dquot_release(dquot);
@@ -3103,9 +5115,12 @@ static int ext4_release_dquot(struct dquot *dquot)
static int ext4_mark_dquot_dirty(struct dquot *dquot)
{
- /* Are we journalling quotas? */
- if (EXT4_SB(dquot->dq_sb)->s_qf_names[USRQUOTA] ||
- EXT4_SB(dquot->dq_sb)->s_qf_names[GRPQUOTA]) {
+ struct super_block *sb = dquot->dq_sb;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+ /* Are we journaling quotas? */
+ if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA) ||
+ sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) {
dquot_mark_dquot_dirty(dquot);
return ext4_write_dquot(dquot);
} else {
@@ -3119,7 +5134,7 @@ static int ext4_write_info(struct super_block *sb, int type)
handle_t *handle;
/* Data block + inode block */
- handle = ext4_journal_start(sb->s_root->d_inode, 2);
+ handle = ext4_journal_start(sb->s_root->d_inode, EXT4_HT_QUOTA, 2);
if (IS_ERR(handle))
return PTR_ERR(handle);
ret = dquot_commit_info(sb, type);
@@ -3135,45 +5150,164 @@ static int ext4_write_info(struct super_block *sb, int type)
*/
static int ext4_quota_on_mount(struct super_block *sb, int type)
{
- return vfs_quota_on_mount(sb, EXT4_SB(sb)->s_qf_names[type],
- EXT4_SB(sb)->s_jquota_fmt, type);
+ return dquot_quota_on_mount(sb, EXT4_SB(sb)->s_qf_names[type],
+ EXT4_SB(sb)->s_jquota_fmt, type);
}
/*
* Standard function to be called on quota_on
*/
static int ext4_quota_on(struct super_block *sb, int type, int format_id,
- char *path)
+ struct path *path)
{
int err;
- struct nameidata nd;
if (!test_opt(sb, QUOTA))
return -EINVAL;
- /* Not journalling quota? */
- if (!EXT4_SB(sb)->s_qf_names[USRQUOTA] &&
- !EXT4_SB(sb)->s_qf_names[GRPQUOTA])
- return vfs_quota_on(sb, type, format_id, path);
- err = path_lookup(path, LOOKUP_FOLLOW, &nd);
- if (err)
- return err;
+
/* Quotafile not on the same filesystem? */
- if (nd.path.mnt->mnt_sb != sb) {
- path_put(&nd.path);
+ if (path->dentry->d_sb != sb)
return -EXDEV;
+ /* Journaling quota? */
+ if (EXT4_SB(sb)->s_qf_names[type]) {
+ /* Quotafile not in fs root? */
+ if (path->dentry->d_parent != sb->s_root)
+ ext4_msg(sb, KERN_WARNING,
+ "Quota file not on filesystem root. "
+ "Journaled quota will not work");
}
- /* Quotafile not of fs root? */
- if (nd.path.dentry->d_parent->d_inode != sb->s_root->d_inode)
- printk(KERN_WARNING
- "EXT4-fs: Quota file not on filesystem root. "
- "Journalled quota will not work.\n");
- path_put(&nd.path);
- return vfs_quota_on(sb, type, format_id, path);
+
+ /*
+ * When we journal data on quota file, we have to flush journal to see
+ * all updates to the file when we bypass pagecache...
+ */
+ if (EXT4_SB(sb)->s_journal &&
+ ext4_should_journal_data(path->dentry->d_inode)) {
+ /*
+ * We don't need to lock updates but journal_flush() could
+ * otherwise be livelocked...
+ */
+ jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
+ err = jbd2_journal_flush(EXT4_SB(sb)->s_journal);
+ jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
+ if (err)
+ return err;
+ }
+
+ return dquot_quota_on(sb, type, format_id, path);
+}
+
+static int ext4_quota_enable(struct super_block *sb, int type, int format_id,
+ unsigned int flags)
+{
+ int err;
+ struct inode *qf_inode;
+ unsigned long qf_inums[MAXQUOTAS] = {
+ le32_to_cpu(EXT4_SB(sb)->s_es->s_usr_quota_inum),
+ le32_to_cpu(EXT4_SB(sb)->s_es->s_grp_quota_inum)
+ };
+
+ BUG_ON(!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA));
+
+ if (!qf_inums[type])
+ return -EPERM;
+
+ qf_inode = ext4_iget(sb, qf_inums[type]);
+ if (IS_ERR(qf_inode)) {
+ ext4_error(sb, "Bad quota inode # %lu", qf_inums[type]);
+ return PTR_ERR(qf_inode);
+ }
+
+ /* Don't account quota for quota files to avoid recursion */
+ qf_inode->i_flags |= S_NOQUOTA;
+ err = dquot_enable(qf_inode, type, format_id, flags);
+ iput(qf_inode);
+
+ return err;
+}
+
+/* Enable usage tracking for all quota types. */
+static int ext4_enable_quotas(struct super_block *sb)
+{
+ int type, err = 0;
+ unsigned long qf_inums[MAXQUOTAS] = {
+ le32_to_cpu(EXT4_SB(sb)->s_es->s_usr_quota_inum),
+ le32_to_cpu(EXT4_SB(sb)->s_es->s_grp_quota_inum)
+ };
+
+ sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE;
+ for (type = 0; type < MAXQUOTAS; type++) {
+ if (qf_inums[type]) {
+ err = ext4_quota_enable(sb, type, QFMT_VFS_V1,
+ DQUOT_USAGE_ENABLED);
+ if (err) {
+ ext4_warning(sb,
+ "Failed to enable quota tracking "
+ "(type=%d, err=%d). Please run "
+ "e2fsck to fix.", type, err);
+ return err;
+ }
+ }
+ }
+ return 0;
+}
+
+/*
+ * quota_on function that is used when QUOTA feature is set.
+ */
+static int ext4_quota_on_sysfile(struct super_block *sb, int type,
+ int format_id)
+{
+ if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA))
+ return -EINVAL;
+
+ /*
+ * USAGE was enabled at mount time. Only need to enable LIMITS now.
+ */
+ return ext4_quota_enable(sb, type, format_id, DQUOT_LIMITS_ENABLED);
+}
+
+static int ext4_quota_off(struct super_block *sb, int type)
+{
+ struct inode *inode = sb_dqopt(sb)->files[type];
+ handle_t *handle;
+
+ /* Force all delayed allocation blocks to be allocated.
+ * Caller already holds s_umount sem */
+ if (test_opt(sb, DELALLOC))
+ sync_filesystem(sb);
+
+ if (!inode)
+ goto out;
+
+ /* Update modification times of quota files when userspace can
+ * start looking at them */
+ handle = ext4_journal_start(inode, EXT4_HT_QUOTA, 1);
+ if (IS_ERR(handle))
+ goto out;
+ inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+ ext4_mark_inode_dirty(handle, inode);
+ ext4_journal_stop(handle);
+
+out:
+ return dquot_quota_off(sb, type);
+}
+
+/*
+ * quota_off function that is used when QUOTA feature is set.
+ */
+static int ext4_quota_off_sysfile(struct super_block *sb, int type)
+{
+ if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA))
+ return -EINVAL;
+
+ /* Disable only the limits. */
+ return dquot_disable(sb, type, DQUOT_LIMITS_ENABLED);
}
/* Read data from quotafile - avoid pagecache and such because we cannot afford
* acquiring the locks... As quota files are never truncated and quota code
- * itself serializes the operations (and noone else should touch the files)
+ * itself serializes the operations (and no one else should touch the files)
* we don't have to be afraid of races */
static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data,
size_t len, loff_t off)
@@ -3220,117 +5354,259 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
ext4_lblk_t blk = off >> EXT4_BLOCK_SIZE_BITS(sb);
int err = 0;
int offset = off & (sb->s_blocksize - 1);
- int tocopy;
- int journal_quota = EXT4_SB(sb)->s_qf_names[type] != NULL;
- size_t towrite = len;
struct buffer_head *bh;
handle_t *handle = journal_current_handle();
- if (!handle) {
- printk(KERN_WARNING "EXT4-fs: Quota write (off=%Lu, len=%Lu)"
- " cancelled because transaction is not started.\n",
+ if (EXT4_SB(sb)->s_journal && !handle) {
+ ext4_msg(sb, KERN_WARNING, "Quota write (off=%llu, len=%llu)"
+ " cancelled because transaction is not started",
(unsigned long long)off, (unsigned long long)len);
return -EIO;
}
- mutex_lock_nested(&inode->i_mutex, I_MUTEX_QUOTA);
- while (towrite > 0) {
- tocopy = sb->s_blocksize - offset < towrite ?
- sb->s_blocksize - offset : towrite;
- bh = ext4_bread(handle, inode, blk, 1, &err);
- if (!bh)
- goto out;
- if (journal_quota) {
- err = ext4_journal_get_write_access(handle, bh);
- if (err) {
- brelse(bh);
- goto out;
- }
- }
- lock_buffer(bh);
- memcpy(bh->b_data+offset, data, tocopy);
- flush_dcache_page(bh->b_page);
- unlock_buffer(bh);
- if (journal_quota)
- err = ext4_journal_dirty_metadata(handle, bh);
- else {
- /* Always do at least ordered writes for quotas */
- err = ext4_journal_dirty_data(handle, bh);
- mark_buffer_dirty(bh);
- }
+ /*
+ * Since we account only one data block in transaction credits,
+ * then it is impossible to cross a block boundary.
+ */
+ if (sb->s_blocksize - offset < len) {
+ ext4_msg(sb, KERN_WARNING, "Quota write (off=%llu, len=%llu)"
+ " cancelled because not block aligned",
+ (unsigned long long)off, (unsigned long long)len);
+ return -EIO;
+ }
+
+ bh = ext4_bread(handle, inode, blk, 1, &err);
+ if (!bh)
+ goto out;
+ BUFFER_TRACE(bh, "get write access");
+ err = ext4_journal_get_write_access(handle, bh);
+ if (err) {
brelse(bh);
- if (err)
- goto out;
- offset = 0;
- towrite -= tocopy;
- data += tocopy;
- blk++;
+ goto out;
}
+ lock_buffer(bh);
+ memcpy(bh->b_data+offset, data, len);
+ flush_dcache_page(bh->b_page);
+ unlock_buffer(bh);
+ err = ext4_handle_dirty_metadata(handle, NULL, bh);
+ brelse(bh);
out:
- if (len == towrite)
+ if (err)
return err;
- if (inode->i_size < off+len-towrite) {
- i_size_write(inode, off+len-towrite);
+ if (inode->i_size < off + len) {
+ i_size_write(inode, off + len);
EXT4_I(inode)->i_disksize = inode->i_size;
+ ext4_mark_inode_dirty(handle, inode);
}
- inode->i_mtime = inode->i_ctime = CURRENT_TIME;
- ext4_mark_inode_dirty(handle, inode);
- mutex_unlock(&inode->i_mutex);
- return len - towrite;
+ return len;
}
#endif
-static int ext4_get_sb(struct file_system_type *fs_type,
- int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags,
+ const char *dev_name, void *data)
+{
+ return mount_bdev(fs_type, flags, dev_name, data, ext4_fill_super);
+}
+
+#if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
+static inline void register_as_ext2(void)
+{
+ int err = register_filesystem(&ext2_fs_type);
+ if (err)
+ printk(KERN_WARNING
+ "EXT4-fs: Unable to register as ext2 (%d)\n", err);
+}
+
+static inline void unregister_as_ext2(void)
{
- return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super, mnt);
+ unregister_filesystem(&ext2_fs_type);
}
-static struct file_system_type ext4dev_fs_type = {
+static inline int ext2_feature_set_ok(struct super_block *sb)
+{
+ if (EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT2_FEATURE_INCOMPAT_SUPP))
+ return 0;
+ if (sb->s_flags & MS_RDONLY)
+ return 1;
+ if (EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT2_FEATURE_RO_COMPAT_SUPP))
+ return 0;
+ return 1;
+}
+#else
+static inline void register_as_ext2(void) { }
+static inline void unregister_as_ext2(void) { }
+static inline int ext2_feature_set_ok(struct super_block *sb) { return 0; }
+#endif
+
+#if !defined(CONFIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
+static inline void register_as_ext3(void)
+{
+ int err = register_filesystem(&ext3_fs_type);
+ if (err)
+ printk(KERN_WARNING
+ "EXT4-fs: Unable to register as ext3 (%d)\n", err);
+}
+
+static inline void unregister_as_ext3(void)
+{
+ unregister_filesystem(&ext3_fs_type);
+}
+
+static inline int ext3_feature_set_ok(struct super_block *sb)
+{
+ if (EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT3_FEATURE_INCOMPAT_SUPP))
+ return 0;
+ if (!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL))
+ return 0;
+ if (sb->s_flags & MS_RDONLY)
+ return 1;
+ if (EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT3_FEATURE_RO_COMPAT_SUPP))
+ return 0;
+ return 1;
+}
+#else
+static inline void register_as_ext3(void) { }
+static inline void unregister_as_ext3(void) { }
+static inline int ext3_feature_set_ok(struct super_block *sb) { return 0; }
+#endif
+
+static struct file_system_type ext4_fs_type = {
.owner = THIS_MODULE,
- .name = "ext4dev",
- .get_sb = ext4_get_sb,
+ .name = "ext4",
+ .mount = ext4_mount,
.kill_sb = kill_block_super,
.fs_flags = FS_REQUIRES_DEV,
};
+MODULE_ALIAS_FS("ext4");
-static int __init init_ext4_fs(void)
+static int __init ext4_init_feat_adverts(void)
{
- int err;
+ struct ext4_features *ef;
+ int ret = -ENOMEM;
+
+ ef = kzalloc(sizeof(struct ext4_features), GFP_KERNEL);
+ if (!ef)
+ goto out;
- err = init_ext4_mballoc();
+ ef->f_kobj.kset = ext4_kset;
+ init_completion(&ef->f_kobj_unregister);
+ ret = kobject_init_and_add(&ef->f_kobj, &ext4_feat_ktype, NULL,
+ "features");
+ if (ret) {
+ kfree(ef);
+ goto out;
+ }
+
+ ext4_feat = ef;
+ ret = 0;
+out:
+ return ret;
+}
+
+static void ext4_exit_feat_adverts(void)
+{
+ kobject_put(&ext4_feat->f_kobj);
+ wait_for_completion(&ext4_feat->f_kobj_unregister);
+ kfree(ext4_feat);
+}
+
+/* Shared across all ext4 file systems */
+wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ];
+struct mutex ext4__aio_mutex[EXT4_WQ_HASH_SZ];
+
+static int __init ext4_init_fs(void)
+{
+ int i, err;
+
+ ext4_li_info = NULL;
+ mutex_init(&ext4_li_mtx);
+
+ /* Build-time check for flags consistency */
+ ext4_check_flag_values();
+
+ for (i = 0; i < EXT4_WQ_HASH_SZ; i++) {
+ mutex_init(&ext4__aio_mutex[i]);
+ init_waitqueue_head(&ext4__ioend_wq[i]);
+ }
+
+ err = ext4_init_es();
if (err)
return err;
- err = init_ext4_xattr();
+ err = ext4_init_pageio();
+ if (err)
+ goto out7;
+
+ err = ext4_init_system_zone();
+ if (err)
+ goto out6;
+ ext4_kset = kset_create_and_add("ext4", NULL, fs_kobj);
+ if (!ext4_kset) {
+ err = -ENOMEM;
+ goto out5;
+ }
+ ext4_proc_root = proc_mkdir("fs/ext4", NULL);
+
+ err = ext4_init_feat_adverts();
+ if (err)
+ goto out4;
+
+ err = ext4_init_mballoc();
if (err)
goto out2;
+ else
+ ext4_mballoc_ready = 1;
err = init_inodecache();
if (err)
goto out1;
- err = register_filesystem(&ext4dev_fs_type);
+ register_as_ext3();
+ register_as_ext2();
+ err = register_filesystem(&ext4_fs_type);
if (err)
goto out;
+
return 0;
out:
+ unregister_as_ext2();
+ unregister_as_ext3();
destroy_inodecache();
out1:
- exit_ext4_xattr();
+ ext4_mballoc_ready = 0;
+ ext4_exit_mballoc();
out2:
- exit_ext4_mballoc();
+ ext4_exit_feat_adverts();
+out4:
+ if (ext4_proc_root)
+ remove_proc_entry("fs/ext4", NULL);
+ kset_unregister(ext4_kset);
+out5:
+ ext4_exit_system_zone();
+out6:
+ ext4_exit_pageio();
+out7:
+ ext4_exit_es();
+
return err;
}
-static void __exit exit_ext4_fs(void)
+static void __exit ext4_exit_fs(void)
{
- unregister_filesystem(&ext4dev_fs_type);
+ ext4_destroy_lazyinit_thread();
+ unregister_as_ext2();
+ unregister_as_ext3();
+ unregister_filesystem(&ext4_fs_type);
destroy_inodecache();
- exit_ext4_xattr();
- exit_ext4_mballoc();
+ ext4_exit_mballoc();
+ ext4_exit_feat_adverts();
+ remove_proc_entry("fs/ext4", NULL);
+ kset_unregister(ext4_kset);
+ ext4_exit_system_zone();
+ ext4_exit_pageio();
+ ext4_exit_es();
}
MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others");
-MODULE_DESCRIPTION("Fourth Extended Filesystem with extents");
+MODULE_DESCRIPTION("Fourth Extended Filesystem");
MODULE_LICENSE("GPL");
-module_init(init_ext4_fs)
-module_exit(exit_ext4_fs)
+module_init(ext4_init_fs)
+module_exit(ext4_exit_fs)
diff --git a/fs/ext4/symlink.c b/fs/ext4/symlink.c
index e6f9da4287c..ff371193201 100644
--- a/fs/ext4/symlink.c
+++ b/fs/ext4/symlink.c
@@ -19,14 +19,14 @@
#include <linux/fs.h>
#include <linux/jbd2.h>
-#include <linux/ext4_fs.h>
#include <linux/namei.h>
+#include "ext4.h"
#include "xattr.h"
-static void * ext4_follow_link(struct dentry *dentry, struct nameidata *nd)
+static void *ext4_follow_link(struct dentry *dentry, struct nameidata *nd)
{
struct ext4_inode_info *ei = EXT4_I(dentry->d_inode);
- nd_set_link(nd, (char*)ei->i_data);
+ nd_set_link(nd, (char *) ei->i_data);
return NULL;
}
@@ -34,21 +34,19 @@ const struct inode_operations ext4_symlink_inode_operations = {
.readlink = generic_readlink,
.follow_link = page_follow_link_light,
.put_link = page_put_link,
-#ifdef CONFIG_EXT4DEV_FS_XATTR
+ .setattr = ext4_setattr,
.setxattr = generic_setxattr,
.getxattr = generic_getxattr,
.listxattr = ext4_listxattr,
.removexattr = generic_removexattr,
-#endif
};
const struct inode_operations ext4_fast_symlink_inode_operations = {
.readlink = generic_readlink,
.follow_link = ext4_follow_link,
-#ifdef CONFIG_EXT4DEV_FS_XATTR
+ .setattr = ext4_setattr,
.setxattr = generic_setxattr,
.getxattr = generic_getxattr,
.listxattr = ext4_listxattr,
.removexattr = generic_removexattr,
-#endif
};
diff --git a/fs/ext4/truncate.h b/fs/ext4/truncate.h
new file mode 100644
index 00000000000..011ba6670d9
--- /dev/null
+++ b/fs/ext4/truncate.h
@@ -0,0 +1,43 @@
+/*
+ * linux/fs/ext4/truncate.h
+ *
+ * Common inline functions needed for truncate support
+ */
+
+/*
+ * Truncate blocks that were not used by write. We have to truncate the
+ * pagecache as well so that corresponding buffers get properly unmapped.
+ */
+static inline void ext4_truncate_failed_write(struct inode *inode)
+{
+ truncate_inode_pages(inode->i_mapping, inode->i_size);
+ ext4_truncate(inode);
+}
+
+/*
+ * Work out how many blocks we need to proceed with the next chunk of a
+ * truncate transaction.
+ */
+static inline unsigned long ext4_blocks_for_truncate(struct inode *inode)
+{
+ ext4_lblk_t needed;
+
+ needed = inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9);
+
+ /* Give ourselves just enough room to cope with inodes in which
+ * i_blocks is corrupt: we've seen disk corruptions in the past
+ * which resulted in random data in an inode which looked enough
+ * like a regular file for ext4 to try to delete it. Things
+ * will go a bit crazy if that happens, but at least we should
+ * try not to panic the whole kernel. */
+ if (needed < 2)
+ needed = 2;
+
+ /* But we need to bound the transaction so we don't overflow the
+ * journal. */
+ if (needed > EXT4_MAX_TRANS_DATA)
+ needed = EXT4_MAX_TRANS_DATA;
+
+ return EXT4_DATA_TRANS_BLOCKS(inode->i_sb) + needed;
+}
+
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index d7962139c01..e7387337060 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -53,19 +53,14 @@
#include <linux/init.h>
#include <linux/fs.h>
#include <linux/slab.h>
-#include <linux/ext4_jbd2.h>
-#include <linux/ext4_fs.h>
#include <linux/mbcache.h>
#include <linux/quotaops.h>
#include <linux/rwsem.h>
+#include "ext4_jbd2.h"
+#include "ext4.h"
#include "xattr.h"
#include "acl.h"
-#define BHDR(bh) ((struct ext4_xattr_header *)((bh)->b_data))
-#define ENTRY(ptr) ((struct ext4_xattr_entry *)(ptr))
-#define BFIRST(bh) ENTRY(BHDR(bh)+1)
-#define IS_LAST_ENTRY(entry) (*(__u32 *)(entry) == 0)
-
#ifdef EXT4_XATTR_DEBUG
# define ea_idebug(inode, f...) do { \
printk(KERN_DEBUG "inode %s:%lu: ", \
@@ -82,48 +77,101 @@
printk("\n"); \
} while (0)
#else
-# define ea_idebug(f...)
-# define ea_bdebug(f...)
+# define ea_idebug(inode, fmt, ...) no_printk(fmt, ##__VA_ARGS__)
+# define ea_bdebug(bh, fmt, ...) no_printk(fmt, ##__VA_ARGS__)
#endif
-static void ext4_xattr_cache_insert(struct buffer_head *);
+static void ext4_xattr_cache_insert(struct mb_cache *, struct buffer_head *);
static struct buffer_head *ext4_xattr_cache_find(struct inode *,
struct ext4_xattr_header *,
struct mb_cache_entry **);
static void ext4_xattr_rehash(struct ext4_xattr_header *,
struct ext4_xattr_entry *);
+static int ext4_xattr_list(struct dentry *dentry, char *buffer,
+ size_t buffer_size);
-static struct mb_cache *ext4_xattr_cache;
-
-static struct xattr_handler *ext4_xattr_handler_map[] = {
+static const struct xattr_handler *ext4_xattr_handler_map[] = {
[EXT4_XATTR_INDEX_USER] = &ext4_xattr_user_handler,
-#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL
- [EXT4_XATTR_INDEX_POSIX_ACL_ACCESS] = &ext4_xattr_acl_access_handler,
- [EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT] = &ext4_xattr_acl_default_handler,
+#ifdef CONFIG_EXT4_FS_POSIX_ACL
+ [EXT4_XATTR_INDEX_POSIX_ACL_ACCESS] = &posix_acl_access_xattr_handler,
+ [EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT] = &posix_acl_default_xattr_handler,
#endif
[EXT4_XATTR_INDEX_TRUSTED] = &ext4_xattr_trusted_handler,
-#ifdef CONFIG_EXT4DEV_FS_SECURITY
+#ifdef CONFIG_EXT4_FS_SECURITY
[EXT4_XATTR_INDEX_SECURITY] = &ext4_xattr_security_handler,
#endif
};
-struct xattr_handler *ext4_xattr_handlers[] = {
+const struct xattr_handler *ext4_xattr_handlers[] = {
&ext4_xattr_user_handler,
&ext4_xattr_trusted_handler,
-#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL
- &ext4_xattr_acl_access_handler,
- &ext4_xattr_acl_default_handler,
+#ifdef CONFIG_EXT4_FS_POSIX_ACL
+ &posix_acl_access_xattr_handler,
+ &posix_acl_default_xattr_handler,
#endif
-#ifdef CONFIG_EXT4DEV_FS_SECURITY
+#ifdef CONFIG_EXT4_FS_SECURITY
&ext4_xattr_security_handler,
#endif
NULL
};
-static inline struct xattr_handler *
+#define EXT4_GET_MB_CACHE(inode) (((struct ext4_sb_info *) \
+ inode->i_sb->s_fs_info)->s_mb_cache)
+
+static __le32 ext4_xattr_block_csum(struct inode *inode,
+ sector_t block_nr,
+ struct ext4_xattr_header *hdr)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ __u32 csum;
+ __le32 save_csum;
+ __le64 dsk_block_nr = cpu_to_le64(block_nr);
+
+ save_csum = hdr->h_checksum;
+ hdr->h_checksum = 0;
+ csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&dsk_block_nr,
+ sizeof(dsk_block_nr));
+ csum = ext4_chksum(sbi, csum, (__u8 *)hdr,
+ EXT4_BLOCK_SIZE(inode->i_sb));
+
+ hdr->h_checksum = save_csum;
+ return cpu_to_le32(csum);
+}
+
+static int ext4_xattr_block_csum_verify(struct inode *inode,
+ sector_t block_nr,
+ struct ext4_xattr_header *hdr)
+{
+ if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
+ EXT4_FEATURE_RO_COMPAT_METADATA_CSUM) &&
+ (hdr->h_checksum != ext4_xattr_block_csum(inode, block_nr, hdr)))
+ return 0;
+ return 1;
+}
+
+static void ext4_xattr_block_csum_set(struct inode *inode,
+ sector_t block_nr,
+ struct ext4_xattr_header *hdr)
+{
+ if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
+ EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+ return;
+
+ hdr->h_checksum = ext4_xattr_block_csum(inode, block_nr, hdr);
+}
+
+static inline int ext4_handle_dirty_xattr_block(handle_t *handle,
+ struct inode *inode,
+ struct buffer_head *bh)
+{
+ ext4_xattr_block_csum_set(inode, bh->b_blocknr, BHDR(bh));
+ return ext4_handle_dirty_metadata(handle, inode, bh);
+}
+
+static inline const struct xattr_handler *
ext4_xattr_handler(int name_index)
{
- struct xattr_handler *handler = NULL;
+ const struct xattr_handler *handler = NULL;
if (name_index > 0 && name_index < ARRAY_SIZE(ext4_xattr_handler_map))
handler = ext4_xattr_handler_map[name_index];
@@ -138,7 +186,7 @@ ext4_xattr_handler(int name_index)
ssize_t
ext4_listxattr(struct dentry *dentry, char *buffer, size_t size)
{
- return ext4_xattr_list(dentry->d_inode, buffer, size);
+ return ext4_xattr_list(dentry, buffer, size);
}
static int
@@ -154,14 +202,21 @@ ext4_xattr_check_names(struct ext4_xattr_entry *entry, void *end)
}
static inline int
-ext4_xattr_check_block(struct buffer_head *bh)
+ext4_xattr_check_block(struct inode *inode, struct buffer_head *bh)
{
int error;
+ if (buffer_verified(bh))
+ return 0;
+
if (BHDR(bh)->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC) ||
BHDR(bh)->h_blocks != cpu_to_le32(1))
return -EIO;
+ if (!ext4_xattr_block_csum_verify(inode, bh->b_blocknr, BHDR(bh)))
+ return -EIO;
error = ext4_xattr_check_names(BFIRST(bh), bh->b_data + bh->b_size);
+ if (!error)
+ set_buffer_verified(bh);
return error;
}
@@ -211,6 +266,7 @@ ext4_xattr_block_get(struct inode *inode, int name_index, const char *name,
struct ext4_xattr_entry *entry;
size_t size;
int error;
+ struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode);
ea_idebug(inode, "name=%d.%s, buffer=%p, buffer_size=%ld",
name_index, name, buffer, (long)buffer_size);
@@ -218,20 +274,21 @@ ext4_xattr_block_get(struct inode *inode, int name_index, const char *name,
error = -ENODATA;
if (!EXT4_I(inode)->i_file_acl)
goto cleanup;
- ea_idebug(inode, "reading block %u", EXT4_I(inode)->i_file_acl);
+ ea_idebug(inode, "reading block %llu",
+ (unsigned long long)EXT4_I(inode)->i_file_acl);
bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
if (!bh)
goto cleanup;
ea_bdebug(bh, "b_count=%d, refcount=%d",
atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount));
- if (ext4_xattr_check_block(bh)) {
-bad_block: ext4_error(inode->i_sb, __FUNCTION__,
- "inode %lu: bad block %llu", inode->i_ino,
- EXT4_I(inode)->i_file_acl);
+ if (ext4_xattr_check_block(inode, bh)) {
+bad_block:
+ EXT4_ERROR_INODE(inode, "bad block %llu",
+ EXT4_I(inode)->i_file_acl);
error = -EIO;
goto cleanup;
}
- ext4_xattr_cache_insert(bh);
+ ext4_xattr_cache_insert(ext4_mb_cache, bh);
entry = BFIRST(bh);
error = ext4_xattr_find_entry(&entry, name_index, name, bh->b_size, 1);
if (error == -EIO)
@@ -253,7 +310,7 @@ cleanup:
return error;
}
-static int
+int
ext4_xattr_ibody_get(struct inode *inode, int name_index, const char *name,
void *buffer, size_t buffer_size)
{
@@ -265,7 +322,7 @@ ext4_xattr_ibody_get(struct inode *inode, int name_index, const char *name,
void *end;
int error;
- if (!(EXT4_I(inode)->i_state & EXT4_STATE_XATTR))
+ if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR))
return -ENODATA;
error = ext4_get_inode_loc(inode, &iloc);
if (error)
@@ -312,6 +369,9 @@ ext4_xattr_get(struct inode *inode, int name_index, const char *name,
{
int error;
+ if (strlen(name) > 255)
+ return -ERANGE;
+
down_read(&EXT4_I(inode)->xattr_sem);
error = ext4_xattr_ibody_get(inode, name_index, name, buffer,
buffer_size);
@@ -323,19 +383,20 @@ ext4_xattr_get(struct inode *inode, int name_index, const char *name,
}
static int
-ext4_xattr_list_entries(struct inode *inode, struct ext4_xattr_entry *entry,
+ext4_xattr_list_entries(struct dentry *dentry, struct ext4_xattr_entry *entry,
char *buffer, size_t buffer_size)
{
size_t rest = buffer_size;
for (; !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry)) {
- struct xattr_handler *handler =
+ const struct xattr_handler *handler =
ext4_xattr_handler(entry->e_name_index);
if (handler) {
- size_t size = handler->list(inode, buffer, rest,
+ size_t size = handler->list(dentry, buffer, rest,
entry->e_name,
- entry->e_name_len);
+ entry->e_name_len,
+ handler->flags);
if (buffer) {
if (size > rest)
return -ERANGE;
@@ -348,10 +409,12 @@ ext4_xattr_list_entries(struct inode *inode, struct ext4_xattr_entry *entry,
}
static int
-ext4_xattr_block_list(struct inode *inode, char *buffer, size_t buffer_size)
+ext4_xattr_block_list(struct dentry *dentry, char *buffer, size_t buffer_size)
{
+ struct inode *inode = dentry->d_inode;
struct buffer_head *bh = NULL;
int error;
+ struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode);
ea_idebug(inode, "buffer=%p, buffer_size=%ld",
buffer, (long)buffer_size);
@@ -359,22 +422,22 @@ ext4_xattr_block_list(struct inode *inode, char *buffer, size_t buffer_size)
error = 0;
if (!EXT4_I(inode)->i_file_acl)
goto cleanup;
- ea_idebug(inode, "reading block %u", EXT4_I(inode)->i_file_acl);
+ ea_idebug(inode, "reading block %llu",
+ (unsigned long long)EXT4_I(inode)->i_file_acl);
bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
error = -EIO;
if (!bh)
goto cleanup;
ea_bdebug(bh, "b_count=%d, refcount=%d",
atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount));
- if (ext4_xattr_check_block(bh)) {
- ext4_error(inode->i_sb, __FUNCTION__,
- "inode %lu: bad block %llu", inode->i_ino,
- EXT4_I(inode)->i_file_acl);
+ if (ext4_xattr_check_block(inode, bh)) {
+ EXT4_ERROR_INODE(inode, "bad block %llu",
+ EXT4_I(inode)->i_file_acl);
error = -EIO;
goto cleanup;
}
- ext4_xattr_cache_insert(bh);
- error = ext4_xattr_list_entries(inode, BFIRST(bh), buffer, buffer_size);
+ ext4_xattr_cache_insert(ext4_mb_cache, bh);
+ error = ext4_xattr_list_entries(dentry, BFIRST(bh), buffer, buffer_size);
cleanup:
brelse(bh);
@@ -383,15 +446,16 @@ cleanup:
}
static int
-ext4_xattr_ibody_list(struct inode *inode, char *buffer, size_t buffer_size)
+ext4_xattr_ibody_list(struct dentry *dentry, char *buffer, size_t buffer_size)
{
+ struct inode *inode = dentry->d_inode;
struct ext4_xattr_ibody_header *header;
struct ext4_inode *raw_inode;
struct ext4_iloc iloc;
void *end;
int error;
- if (!(EXT4_I(inode)->i_state & EXT4_STATE_XATTR))
+ if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR))
return 0;
error = ext4_get_inode_loc(inode, &iloc);
if (error)
@@ -402,7 +466,7 @@ ext4_xattr_ibody_list(struct inode *inode, char *buffer, size_t buffer_size)
error = ext4_xattr_check_names(IFIRST(header), end);
if (error)
goto cleanup;
- error = ext4_xattr_list_entries(inode, IFIRST(header),
+ error = ext4_xattr_list_entries(dentry, IFIRST(header),
buffer, buffer_size);
cleanup:
@@ -420,26 +484,26 @@ cleanup:
* Returns a negative error number on failure, or the number of bytes
* used / required on success.
*/
-int
-ext4_xattr_list(struct inode *inode, char *buffer, size_t buffer_size)
+static int
+ext4_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size)
{
- int i_error, b_error;
+ int ret, ret2;
- down_read(&EXT4_I(inode)->xattr_sem);
- i_error = ext4_xattr_ibody_list(inode, buffer, buffer_size);
- if (i_error < 0) {
- b_error = 0;
- } else {
- if (buffer) {
- buffer += i_error;
- buffer_size -= i_error;
- }
- b_error = ext4_xattr_block_list(inode, buffer, buffer_size);
- if (b_error < 0)
- i_error = 0;
+ down_read(&EXT4_I(dentry->d_inode)->xattr_sem);
+ ret = ret2 = ext4_xattr_ibody_list(dentry, buffer, buffer_size);
+ if (ret < 0)
+ goto errout;
+ if (buffer) {
+ buffer += ret;
+ buffer_size -= ret;
}
- up_read(&EXT4_I(inode)->xattr_sem);
- return i_error + b_error;
+ ret = ext4_xattr_block_list(dentry, buffer, buffer_size);
+ if (ret < 0)
+ goto errout;
+ ret += ret2;
+errout:
+ up_read(&EXT4_I(dentry->d_inode)->xattr_sem);
+ return ret;
}
/*
@@ -452,16 +516,16 @@ static void ext4_xattr_update_super_block(handle_t *handle,
if (EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_EXT_ATTR))
return;
+ BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get_write_access");
if (ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh) == 0) {
EXT4_SET_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_EXT_ATTR);
- sb->s_dirt = 1;
- ext4_journal_dirty_metadata(handle, EXT4_SB(sb)->s_sbh);
+ ext4_handle_dirty_super(handle, sb);
}
}
/*
- * Release the xattr block BH: If the reference count is > 1, decrement
- * it; otherwise free the block.
+ * Release the xattr block BH: If the reference count is > 1, decrement it;
+ * otherwise free the block.
*/
static void
ext4_xattr_release_block(handle_t *handle, struct inode *inode,
@@ -469,8 +533,10 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode,
{
struct mb_cache_entry *ce = NULL;
int error = 0;
+ struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode);
- ce = mb_cache_entry_get(ext4_xattr_cache, bh->b_bdev, bh->b_blocknr);
+ ce = mb_cache_entry_get(ext4_mb_cache, bh->b_bdev, bh->b_blocknr);
+ BUFFER_TRACE(bh, "get_write_access");
error = ext4_journal_get_write_access(handle, bh);
if (error)
goto out;
@@ -480,22 +546,38 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode,
ea_bdebug(bh, "refcount now=0; freeing");
if (ce)
mb_cache_entry_free(ce);
- ext4_free_blocks(handle, inode, bh->b_blocknr, 1, 1);
get_bh(bh);
- ext4_forget(handle, 1, inode, bh, bh->b_blocknr);
+ unlock_buffer(bh);
+ ext4_free_blocks(handle, inode, bh, 0, 1,
+ EXT4_FREE_BLOCKS_METADATA |
+ EXT4_FREE_BLOCKS_FORGET);
} else {
- BHDR(bh)->h_refcount = cpu_to_le32(
- le32_to_cpu(BHDR(bh)->h_refcount) - 1);
- error = ext4_journal_dirty_metadata(handle, bh);
+ le32_add_cpu(&BHDR(bh)->h_refcount, -1);
+ if (ce)
+ mb_cache_entry_release(ce);
+ /*
+ * Beware of this ugliness: Releasing of xattr block references
+ * from different inodes can race and so we have to protect
+ * from a race where someone else frees the block (and releases
+ * its journal_head) before we are done dirtying the buffer. In
+ * nojournal mode this race is harmless and we actually cannot
+ * call ext4_handle_dirty_xattr_block() with locked buffer as
+ * that function can call sync_dirty_buffer() so for that case
+ * we handle the dirtying after unlocking the buffer.
+ */
+ if (ext4_handle_valid(handle))
+ error = ext4_handle_dirty_xattr_block(handle, inode,
+ bh);
+ unlock_buffer(bh);
+ if (!ext4_handle_valid(handle))
+ error = ext4_handle_dirty_xattr_block(handle, inode,
+ bh);
if (IS_SYNC(inode))
- handle->h_sync = 1;
- DQUOT_FREE_BLOCK(inode, 1);
+ ext4_handle_sync(handle);
+ dquot_free_block(inode, EXT4_C2B(EXT4_SB(inode->i_sb), 1));
ea_bdebug(bh, "refcount now=%d; releasing",
le32_to_cpu(BHDR(bh)->h_refcount));
- if (ce)
- mb_cache_entry_release(ce);
}
- unlock_buffer(bh);
out:
ext4_std_error(inode->i_sb, error);
return;
@@ -509,31 +591,17 @@ static size_t ext4_xattr_free_space(struct ext4_xattr_entry *last,
size_t *min_offs, void *base, int *total)
{
for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) {
- *total += EXT4_XATTR_LEN(last->e_name_len);
if (!last->e_value_block && last->e_value_size) {
size_t offs = le16_to_cpu(last->e_value_offs);
if (offs < *min_offs)
*min_offs = offs;
}
+ if (total)
+ *total += EXT4_XATTR_LEN(last->e_name_len);
}
return (*min_offs - ((void *)last - base) - sizeof(__u32));
}
-struct ext4_xattr_info {
- int name_index;
- const char *name;
- const void *value;
- size_t value_len;
-};
-
-struct ext4_xattr_search {
- struct ext4_xattr_entry *first;
- void *base;
- void *end;
- struct ext4_xattr_entry *here;
- int not_found;
-};
-
static int
ext4_xattr_set_entry(struct ext4_xattr_info *i, struct ext4_xattr_search *s)
{
@@ -586,9 +654,14 @@ ext4_xattr_set_entry(struct ext4_xattr_info *i, struct ext4_xattr_search *s)
size. Just replace. */
s->here->e_value_size =
cpu_to_le32(i->value_len);
- memset(val + size - EXT4_XATTR_PAD, 0,
- EXT4_XATTR_PAD); /* Clear pad bytes. */
- memcpy(val, i->value, i->value_len);
+ if (i->value == EXT4_ZERO_XATTR_VALUE) {
+ memset(val, 0, size);
+ } else {
+ /* Clear pad bytes first. */
+ memset(val + size - EXT4_XATTR_PAD, 0,
+ EXT4_XATTR_PAD);
+ memcpy(val, i->value, i->value_len);
+ }
return 0;
}
@@ -627,9 +700,14 @@ ext4_xattr_set_entry(struct ext4_xattr_info *i, struct ext4_xattr_search *s)
size_t size = EXT4_XATTR_SIZE(i->value_len);
void *val = s->base + min_offs - size;
s->here->e_value_offs = cpu_to_le16(min_offs - size);
- memset(val + size - EXT4_XATTR_PAD, 0,
- EXT4_XATTR_PAD); /* Clear the pad bytes. */
- memcpy(val, i->value, i->value_len);
+ if (i->value == EXT4_ZERO_XATTR_VALUE) {
+ memset(val, 0, size);
+ } else {
+ /* Clear the pad bytes first. */
+ memset(val + size - EXT4_XATTR_PAD, 0,
+ EXT4_XATTR_PAD);
+ memcpy(val, i->value, i->value_len);
+ }
}
}
return 0;
@@ -659,10 +737,9 @@ ext4_xattr_block_find(struct inode *inode, struct ext4_xattr_info *i,
ea_bdebug(bs->bh, "b_count=%d, refcount=%d",
atomic_read(&(bs->bh->b_count)),
le32_to_cpu(BHDR(bs->bh)->h_refcount));
- if (ext4_xattr_check_block(bs->bh)) {
- ext4_error(sb, __FUNCTION__,
- "inode %lu: bad block %llu", inode->i_ino,
- EXT4_I(inode)->i_file_acl);
+ if (ext4_xattr_check_block(inode, bs->bh)) {
+ EXT4_ERROR_INODE(inode, "bad block %llu",
+ EXT4_I(inode)->i_file_acl);
error = -EIO;
goto cleanup;
}
@@ -693,14 +770,16 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
struct ext4_xattr_search *s = &bs->s;
struct mb_cache_entry *ce = NULL;
int error = 0;
+ struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode);
#define header(x) ((struct ext4_xattr_header *)(x))
if (i->value && i->value_len > sb->s_blocksize)
return -ENOSPC;
if (s->base) {
- ce = mb_cache_entry_get(ext4_xattr_cache, bs->bh->b_bdev,
+ ce = mb_cache_entry_get(ext4_mb_cache, bs->bh->b_bdev,
bs->bh->b_blocknr);
+ BUFFER_TRACE(bs->bh, "get_write_access");
error = ext4_journal_get_write_access(handle, bs->bh);
if (error)
goto cleanup;
@@ -717,14 +796,16 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
if (!IS_LAST_ENTRY(s->first))
ext4_xattr_rehash(header(s->base),
s->here);
- ext4_xattr_cache_insert(bs->bh);
+ ext4_xattr_cache_insert(ext4_mb_cache,
+ bs->bh);
}
unlock_buffer(bs->bh);
if (error == -EIO)
goto bad_block;
if (!error)
- error = ext4_journal_dirty_metadata(handle,
- bs->bh);
+ error = ext4_handle_dirty_xattr_block(handle,
+ inode,
+ bs->bh);
if (error)
goto cleanup;
goto inserted;
@@ -732,13 +813,12 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
int offset = (char *)s->here - bs->bh->b_data;
unlock_buffer(bs->bh);
- jbd2_journal_release_buffer(handle, bs->bh);
if (ce) {
mb_cache_entry_release(ce);
ce = NULL;
}
ea_bdebug(bs->bh, "cloning");
- s->base = kmalloc(bs->bh->b_size, GFP_KERNEL);
+ s->base = kmalloc(bs->bh->b_size, GFP_NOFS);
error = -ENOMEM;
if (s->base == NULL)
goto cleanup;
@@ -750,7 +830,7 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
}
} else {
/* Allocate a buffer where we construct the new block. */
- s->base = kzalloc(sb->s_blocksize, GFP_KERNEL);
+ s->base = kzalloc(sb->s_blocksize, GFP_NOFS);
/* assert(header == s->base) */
error = -ENOMEM;
if (s->base == NULL)
@@ -781,21 +861,23 @@ inserted:
else {
/* The old block is released after updating
the inode. */
- error = -EDQUOT;
- if (DQUOT_ALLOC_BLOCK(inode, 1))
+ error = dquot_alloc_block(inode,
+ EXT4_C2B(EXT4_SB(sb), 1));
+ if (error)
goto cleanup;
+ BUFFER_TRACE(new_bh, "get_write_access");
error = ext4_journal_get_write_access(handle,
new_bh);
if (error)
goto cleanup_dquot;
lock_buffer(new_bh);
- BHDR(new_bh)->h_refcount = cpu_to_le32(1 +
- le32_to_cpu(BHDR(new_bh)->h_refcount));
+ le32_add_cpu(&BHDR(new_bh)->h_refcount, 1);
ea_bdebug(new_bh, "reusing; refcount now=%d",
le32_to_cpu(BHDR(new_bh)->h_refcount));
unlock_buffer(new_bh);
- error = ext4_journal_dirty_metadata(handle,
- new_bh);
+ error = ext4_handle_dirty_xattr_block(handle,
+ inode,
+ new_bh);
if (error)
goto cleanup_dquot;
}
@@ -808,34 +890,53 @@ inserted:
get_bh(new_bh);
} else {
/* We need to allocate a new block */
- ext4_fsblk_t goal = le32_to_cpu(
- EXT4_SB(sb)->s_es->s_first_data_block) +
- (ext4_fsblk_t)EXT4_I(inode)->i_block_group *
- EXT4_BLOCKS_PER_GROUP(sb);
- ext4_fsblk_t block = ext4_new_block(handle, inode,
- goal, &error);
+ ext4_fsblk_t goal, block;
+
+ goal = ext4_group_first_block_no(sb,
+ EXT4_I(inode)->i_block_group);
+
+ /* non-extent files can't have physical blocks past 2^32 */
+ if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
+ goal = goal & EXT4_MAX_BLOCK_FILE_PHYS;
+
+ /*
+ * take i_data_sem because we will test
+ * i_delalloc_reserved_flag in ext4_mb_new_blocks
+ */
+ down_read(&EXT4_I(inode)->i_data_sem);
+ block = ext4_new_meta_blocks(handle, inode, goal, 0,
+ NULL, &error);
+ up_read((&EXT4_I(inode)->i_data_sem));
if (error)
goto cleanup;
- ea_idebug(inode, "creating block %d", block);
+
+ if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
+ BUG_ON(block > EXT4_MAX_BLOCK_FILE_PHYS);
+
+ ea_idebug(inode, "creating block %llu",
+ (unsigned long long)block);
new_bh = sb_getblk(sb, block);
- if (!new_bh) {
+ if (unlikely(!new_bh)) {
+ error = -ENOMEM;
getblk_failed:
- ext4_free_blocks(handle, inode, block, 1, 1);
- error = -EIO;
+ ext4_free_blocks(handle, inode, NULL, block, 1,
+ EXT4_FREE_BLOCKS_METADATA);
goto cleanup;
}
lock_buffer(new_bh);
error = ext4_journal_get_create_access(handle, new_bh);
if (error) {
unlock_buffer(new_bh);
+ error = -EIO;
goto getblk_failed;
}
memcpy(new_bh->b_data, s->base, new_bh->b_size);
set_buffer_uptodate(new_bh);
unlock_buffer(new_bh);
- ext4_xattr_cache_insert(new_bh);
- error = ext4_journal_dirty_metadata(handle, new_bh);
+ ext4_xattr_cache_insert(ext4_mb_cache, new_bh);
+ error = ext4_handle_dirty_xattr_block(handle,
+ inode, new_bh);
if (error)
goto cleanup;
}
@@ -859,26 +960,19 @@ cleanup:
return error;
cleanup_dquot:
- DQUOT_FREE_BLOCK(inode, 1);
+ dquot_free_block(inode, EXT4_C2B(EXT4_SB(sb), 1));
goto cleanup;
bad_block:
- ext4_error(inode->i_sb, __FUNCTION__,
- "inode %lu: bad block %llu", inode->i_ino,
- EXT4_I(inode)->i_file_acl);
+ EXT4_ERROR_INODE(inode, "bad block %llu",
+ EXT4_I(inode)->i_file_acl);
goto cleanup;
#undef header
}
-struct ext4_xattr_ibody_find {
- struct ext4_xattr_search s;
- struct ext4_iloc iloc;
-};
-
-static int
-ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i,
- struct ext4_xattr_ibody_find *is)
+int ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i,
+ struct ext4_xattr_ibody_find *is)
{
struct ext4_xattr_ibody_header *header;
struct ext4_inode *raw_inode;
@@ -891,7 +985,7 @@ ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i,
is->s.base = is->s.first = IFIRST(header);
is->s.here = is->s.first;
is->s.end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size;
- if (EXT4_I(inode)->i_state & EXT4_STATE_XATTR) {
+ if (ext4_test_inode_state(inode, EXT4_STATE_XATTR)) {
error = ext4_xattr_check_names(IFIRST(header), is->s.end);
if (error)
return error;
@@ -906,10 +1000,47 @@ ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i,
return 0;
}
-static int
-ext4_xattr_ibody_set(handle_t *handle, struct inode *inode,
- struct ext4_xattr_info *i,
- struct ext4_xattr_ibody_find *is)
+int ext4_xattr_ibody_inline_set(handle_t *handle, struct inode *inode,
+ struct ext4_xattr_info *i,
+ struct ext4_xattr_ibody_find *is)
+{
+ struct ext4_xattr_ibody_header *header;
+ struct ext4_xattr_search *s = &is->s;
+ int error;
+
+ if (EXT4_I(inode)->i_extra_isize == 0)
+ return -ENOSPC;
+ error = ext4_xattr_set_entry(i, s);
+ if (error) {
+ if (error == -ENOSPC &&
+ ext4_has_inline_data(inode)) {
+ error = ext4_try_to_evict_inline_data(handle, inode,
+ EXT4_XATTR_LEN(strlen(i->name) +
+ EXT4_XATTR_SIZE(i->value_len)));
+ if (error)
+ return error;
+ error = ext4_xattr_ibody_find(inode, i, is);
+ if (error)
+ return error;
+ error = ext4_xattr_set_entry(i, s);
+ }
+ if (error)
+ return error;
+ }
+ header = IHDR(inode, ext4_raw_inode(&is->iloc));
+ if (!IS_LAST_ENTRY(s->first)) {
+ header->h_magic = cpu_to_le32(EXT4_XATTR_MAGIC);
+ ext4_set_inode_state(inode, EXT4_STATE_XATTR);
+ } else {
+ header->h_magic = cpu_to_le32(0);
+ ext4_clear_inode_state(inode, EXT4_STATE_XATTR);
+ }
+ return 0;
+}
+
+static int ext4_xattr_ibody_set(handle_t *handle, struct inode *inode,
+ struct ext4_xattr_info *i,
+ struct ext4_xattr_ibody_find *is)
{
struct ext4_xattr_ibody_header *header;
struct ext4_xattr_search *s = &is->s;
@@ -923,10 +1054,10 @@ ext4_xattr_ibody_set(handle_t *handle, struct inode *inode,
header = IHDR(inode, ext4_raw_inode(&is->iloc));
if (!IS_LAST_ENTRY(s->first)) {
header->h_magic = cpu_to_le32(EXT4_XATTR_MAGIC);
- EXT4_I(inode)->i_state |= EXT4_STATE_XATTR;
+ ext4_set_inode_state(inode, EXT4_STATE_XATTR);
} else {
header->h_magic = cpu_to_le32(0);
- EXT4_I(inode)->i_state &= ~EXT4_STATE_XATTR;
+ ext4_clear_inode_state(inode, EXT4_STATE_XATTR);
}
return 0;
}
@@ -934,7 +1065,7 @@ ext4_xattr_ibody_set(handle_t *handle, struct inode *inode,
/*
* ext4_xattr_set_handle()
*
- * Create, replace or remove an extended attribute for this inode. Buffer
+ * Create, replace or remove an extended attribute for this inode. Value
* is NULL to remove an existing extended attribute, and non-NULL to
* either replace an existing extended attribute, or create a new extended
* attribute. The flags XATTR_REPLACE and XATTR_CREATE
@@ -961,6 +1092,7 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
struct ext4_xattr_block_find bs = {
.s = { .not_found = -ENODATA, },
};
+ unsigned long no_expand;
int error;
if (!name)
@@ -968,14 +1100,17 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
if (strlen(name) > 255)
return -ERANGE;
down_write(&EXT4_I(inode)->xattr_sem);
- error = ext4_get_inode_loc(inode, &is.iloc);
+ no_expand = ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND);
+ ext4_set_inode_state(inode, EXT4_STATE_NO_EXPAND);
+
+ error = ext4_reserve_inode_write(handle, inode, &is.iloc);
if (error)
goto cleanup;
- if (EXT4_I(inode)->i_state & EXT4_STATE_NEW) {
+ if (ext4_test_inode_state(inode, EXT4_STATE_NEW)) {
struct ext4_inode *raw_inode = ext4_raw_inode(&is.iloc);
memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size);
- EXT4_I(inode)->i_state &= ~EXT4_STATE_NEW;
+ ext4_clear_inode_state(inode, EXT4_STATE_NEW);
}
error = ext4_xattr_ibody_find(inode, &i, &is);
@@ -997,9 +1132,6 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
if (flags & XATTR_CREATE)
goto cleanup;
}
- error = ext4_journal_get_write_access(handle, is.iloc.bh);
- if (error)
- goto cleanup;
if (!value) {
if (!is.s.not_found)
error = ext4_xattr_ibody_set(handle, inode, &i, &is);
@@ -1011,6 +1143,11 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
i.value = NULL;
error = ext4_xattr_block_set(handle, inode, &i, &bs);
} else if (error == -ENOSPC) {
+ if (EXT4_I(inode)->i_file_acl && !bs.s.base) {
+ error = ext4_xattr_block_find(inode, &i, &bs);
+ if (error)
+ goto cleanup;
+ }
error = ext4_xattr_block_set(handle, inode, &i, &bs);
if (error)
goto cleanup;
@@ -1025,7 +1162,7 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
ext4_xattr_update_super_block(handle, inode->i_sb);
inode->i_ctime = ext4_current_time(inode);
if (!value)
- EXT4_I(inode)->i_state &= ~EXT4_STATE_NO_EXPAND;
+ ext4_clear_inode_state(inode, EXT4_STATE_NO_EXPAND);
error = ext4_mark_iloc_dirty(handle, inode, &is.iloc);
/*
* The bh is consumed by ext4_mark_iloc_dirty, even with
@@ -1033,12 +1170,14 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
*/
is.iloc.bh = NULL;
if (IS_SYNC(inode))
- handle->h_sync = 1;
+ ext4_handle_sync(handle);
}
cleanup:
brelse(is.iloc.bh);
brelse(bs.bh);
+ if (no_expand == 0)
+ ext4_clear_inode_state(inode, EXT4_STATE_NO_EXPAND);
up_write(&EXT4_I(inode)->xattr_sem);
return error;
}
@@ -1057,9 +1196,10 @@ ext4_xattr_set(struct inode *inode, int name_index, const char *name,
{
handle_t *handle;
int error, retries = 0;
+ int credits = ext4_jbd2_credits_xattr(inode);
retry:
- handle = ext4_journal_start(inode, EXT4_DATA_TRANS_BLOCKS(inode->i_sb));
+ handle = ext4_journal_start(inode, EXT4_HT_XATTR, credits);
if (IS_ERR(handle)) {
error = PTR_ERR(handle);
} else {
@@ -1117,7 +1257,7 @@ int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
struct ext4_xattr_block_find *bs = NULL;
char *buffer = NULL, *b_entry_name = NULL;
size_t min_offs, free;
- int total_ino, total_blk;
+ int total_ino;
void *base, *start, *end;
int extra_isize = 0, error = 0, tried_min_extra_isize = 0;
int s_min_extra_isize = le16_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_min_extra_isize);
@@ -1165,10 +1305,9 @@ retry:
error = -EIO;
if (!bh)
goto cleanup;
- if (ext4_xattr_check_block(bh)) {
- ext4_error(inode->i_sb, __FUNCTION__,
- "inode %lu: bad block %llu", inode->i_ino,
- EXT4_I(inode)->i_file_acl);
+ if (ext4_xattr_check_block(inode, bh)) {
+ EXT4_ERROR_INODE(inode, "bad block %llu",
+ EXT4_I(inode)->i_file_acl);
error = -EIO;
goto cleanup;
}
@@ -1176,8 +1315,7 @@ retry:
first = BFIRST(bh);
end = bh->b_data + bh->b_size;
min_offs = end - base;
- free = ext4_xattr_free_space(first, &min_offs, base,
- &total_blk);
+ free = ext4_xattr_free_space(first, &min_offs, base, NULL);
if (free < new_extra_isize) {
if (!tried_min_extra_isize && s_min_extra_isize) {
tried_min_extra_isize++;
@@ -1240,6 +1378,9 @@ retry:
s_min_extra_isize) {
tried_min_extra_isize++;
new_extra_isize = s_min_extra_isize;
+ kfree(is); is = NULL;
+ kfree(bs); bs = NULL;
+ brelse(bh);
goto retry;
}
error = -1;
@@ -1273,6 +1414,8 @@ retry:
/* Remove the chosen entry from the inode */
error = ext4_xattr_ibody_set(handle, inode, &i, is);
+ if (error)
+ goto cleanup;
entry = IFIRST(header);
if (entry_size + EXT4_XATTR_SIZE(size) >= new_extra_isize)
@@ -1303,6 +1446,8 @@ retry:
goto cleanup;
kfree(b_entry_name);
kfree(buffer);
+ b_entry_name = NULL;
+ buffer = NULL;
brelse(is->iloc.bh);
kfree(is);
kfree(bs);
@@ -1341,16 +1486,14 @@ ext4_xattr_delete_inode(handle_t *handle, struct inode *inode)
goto cleanup;
bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
if (!bh) {
- ext4_error(inode->i_sb, __FUNCTION__,
- "inode %lu: block %llu read error", inode->i_ino,
- EXT4_I(inode)->i_file_acl);
+ EXT4_ERROR_INODE(inode, "block %llu read error",
+ EXT4_I(inode)->i_file_acl);
goto cleanup;
}
if (BHDR(bh)->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC) ||
BHDR(bh)->h_blocks != cpu_to_le32(1)) {
- ext4_error(inode->i_sb, __FUNCTION__,
- "inode %lu: bad block %llu", inode->i_ino,
- EXT4_I(inode)->i_file_acl);
+ EXT4_ERROR_INODE(inode, "bad block %llu",
+ EXT4_I(inode)->i_file_acl);
goto cleanup;
}
ext4_xattr_release_block(handle, inode, bh);
@@ -1380,18 +1523,18 @@ ext4_xattr_put_super(struct super_block *sb)
* Returns 0, or a negative error number on failure.
*/
static void
-ext4_xattr_cache_insert(struct buffer_head *bh)
+ext4_xattr_cache_insert(struct mb_cache *ext4_mb_cache, struct buffer_head *bh)
{
__u32 hash = le32_to_cpu(BHDR(bh)->h_hash);
struct mb_cache_entry *ce;
int error;
- ce = mb_cache_entry_alloc(ext4_xattr_cache);
+ ce = mb_cache_entry_alloc(ext4_mb_cache, GFP_NOFS);
if (!ce) {
ea_bdebug(bh, "out of memory");
return;
}
- error = mb_cache_entry_insert(ce, bh->b_bdev, bh->b_blocknr, &hash);
+ error = mb_cache_entry_insert(ce, bh->b_bdev, bh->b_blocknr, hash);
if (error) {
mb_cache_entry_free(ce);
if (error == -EBUSY) {
@@ -1458,13 +1601,14 @@ ext4_xattr_cache_find(struct inode *inode, struct ext4_xattr_header *header,
{
__u32 hash = le32_to_cpu(header->h_hash);
struct mb_cache_entry *ce;
+ struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode);
if (!header->h_hash)
return NULL; /* never share */
ea_idebug(inode, "looking for cached blocks [%x]", (int)hash);
again:
- ce = mb_cache_entry_find_first(ext4_xattr_cache, 0,
- inode->i_sb->s_bdev, hash);
+ ce = mb_cache_entry_find_first(ext4_mb_cache, inode->i_sb->s_bdev,
+ hash);
while (ce) {
struct buffer_head *bh;
@@ -1475,9 +1619,8 @@ again:
}
bh = sb_bread(inode->i_sb, ce->e_block);
if (!bh) {
- ext4_error(inode->i_sb, __FUNCTION__,
- "inode %lu: block %lu read error",
- inode->i_ino, (unsigned long) ce->e_block);
+ EXT4_ERROR_INODE(inode, "block %lu read error",
+ (unsigned long) ce->e_block);
} else if (le32_to_cpu(BHDR(bh)->h_refcount) >=
EXT4_XATTR_REFCOUNT_MAX) {
ea_idebug(inode, "block %lu refcount %d>=%d",
@@ -1489,7 +1632,7 @@ again:
return bh;
}
brelse(bh);
- ce = mb_cache_entry_find_next(ce, 0, inode->i_sb->s_bdev, hash);
+ ce = mb_cache_entry_find_next(ce, inode->i_sb->s_bdev, hash);
}
return NULL;
}
@@ -1509,7 +1652,7 @@ static inline void ext4_xattr_hash_entry(struct ext4_xattr_header *header,
char *name = entry->e_name;
int n;
- for (n=0; n < entry->e_name_len; n++) {
+ for (n = 0; n < entry->e_name_len; n++) {
hash = (hash << NAME_HASH_SHIFT) ^
(hash >> (8*sizeof(hash) - NAME_HASH_SHIFT)) ^
*name++;
@@ -1562,21 +1705,17 @@ static void ext4_xattr_rehash(struct ext4_xattr_header *header,
#undef BLOCK_HASH_SHIFT
-int __init
-init_ext4_xattr(void)
+#define HASH_BUCKET_BITS 10
+
+struct mb_cache *
+ext4_xattr_create_cache(char *name)
{
- ext4_xattr_cache = mb_cache_create("ext4_xattr", NULL,
- sizeof(struct mb_cache_entry) +
- sizeof(((struct mb_cache_entry *) 0)->e_indexes[0]), 1, 6);
- if (!ext4_xattr_cache)
- return -ENOMEM;
- return 0;
+ return mb_cache_create(name, HASH_BUCKET_BITS);
}
-void
-exit_ext4_xattr(void)
+void ext4_xattr_destroy_cache(struct mb_cache *cache)
{
- if (ext4_xattr_cache)
- mb_cache_destroy(ext4_xattr_cache);
- ext4_xattr_cache = NULL;
+ if (cache)
+ mb_cache_destroy(cache);
}
+
diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
index d7f5d6a1265..29bedf5589f 100644
--- a/fs/ext4/xattr.h
+++ b/fs/ext4/xattr.h
@@ -21,13 +21,17 @@
#define EXT4_XATTR_INDEX_TRUSTED 4
#define EXT4_XATTR_INDEX_LUSTRE 5
#define EXT4_XATTR_INDEX_SECURITY 6
+#define EXT4_XATTR_INDEX_SYSTEM 7
+#define EXT4_XATTR_INDEX_RICHACL 8
struct ext4_xattr_header {
__le32 h_magic; /* magic number for identification */
__le32 h_refcount; /* reference count */
__le32 h_blocks; /* number of disk blocks used */
__le32 h_hash; /* hash value of all attributes */
- __u32 h_reserved[4]; /* zero right now */
+ __le32 h_checksum; /* crc32c(uuid+id+xattrblock) */
+ /* id = inum if refcount=1, blknum otherwise */
+ __u32 h_reserved[3]; /* zero right now */
};
struct ext4_xattr_ibody_header {
@@ -51,8 +55,8 @@ struct ext4_xattr_entry {
(((name_len) + EXT4_XATTR_ROUND + \
sizeof(struct ext4_xattr_entry)) & ~EXT4_XATTR_ROUND)
#define EXT4_XATTR_NEXT(entry) \
- ( (struct ext4_xattr_entry *)( \
- (char *)(entry) + EXT4_XATTR_LEN((entry)->e_name_len)) )
+ ((struct ext4_xattr_entry *)( \
+ (char *)(entry) + EXT4_XATTR_LEN((entry)->e_name_len)))
#define EXT4_XATTR_SIZE(size) \
(((size) + EXT4_XATTR_ROUND) & ~EXT4_XATTR_ROUND)
@@ -63,18 +67,40 @@ struct ext4_xattr_entry {
EXT4_I(inode)->i_extra_isize))
#define IFIRST(hdr) ((struct ext4_xattr_entry *)((hdr)+1))
-# ifdef CONFIG_EXT4DEV_FS_XATTR
+#define BHDR(bh) ((struct ext4_xattr_header *)((bh)->b_data))
+#define ENTRY(ptr) ((struct ext4_xattr_entry *)(ptr))
+#define BFIRST(bh) ENTRY(BHDR(bh)+1)
+#define IS_LAST_ENTRY(entry) (*(__u32 *)(entry) == 0)
-extern struct xattr_handler ext4_xattr_user_handler;
-extern struct xattr_handler ext4_xattr_trusted_handler;
-extern struct xattr_handler ext4_xattr_acl_access_handler;
-extern struct xattr_handler ext4_xattr_acl_default_handler;
-extern struct xattr_handler ext4_xattr_security_handler;
+#define EXT4_ZERO_XATTR_VALUE ((void *)-1)
+
+struct ext4_xattr_info {
+ int name_index;
+ const char *name;
+ const void *value;
+ size_t value_len;
+};
+
+struct ext4_xattr_search {
+ struct ext4_xattr_entry *first;
+ void *base;
+ void *end;
+ struct ext4_xattr_entry *here;
+ int not_found;
+};
+
+struct ext4_xattr_ibody_find {
+ struct ext4_xattr_search s;
+ struct ext4_iloc iloc;
+};
+
+extern const struct xattr_handler ext4_xattr_user_handler;
+extern const struct xattr_handler ext4_xattr_trusted_handler;
+extern const struct xattr_handler ext4_xattr_security_handler;
extern ssize_t ext4_listxattr(struct dentry *, char *, size_t);
extern int ext4_xattr_get(struct inode *, int, const char *, void *, size_t);
-extern int ext4_xattr_list(struct inode *, char *, size_t);
extern int ext4_xattr_set(struct inode *, int, const char *, const void *, size_t, int);
extern int ext4_xattr_set_handle(handle_t *, struct inode *, int, const char *, const void *, size_t, int);
@@ -84,78 +110,26 @@ extern void ext4_xattr_put_super(struct super_block *);
extern int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
struct ext4_inode *raw_inode, handle_t *handle);
-extern int init_ext4_xattr(void);
-extern void exit_ext4_xattr(void);
-
-extern struct xattr_handler *ext4_xattr_handlers[];
-
-# else /* CONFIG_EXT4DEV_FS_XATTR */
-
-static inline int
-ext4_xattr_get(struct inode *inode, int name_index, const char *name,
- void *buffer, size_t size, int flags)
-{
- return -EOPNOTSUPP;
-}
-
-static inline int
-ext4_xattr_list(struct inode *inode, void *buffer, size_t size)
-{
- return -EOPNOTSUPP;
-}
-
-static inline int
-ext4_xattr_set(struct inode *inode, int name_index, const char *name,
- const void *value, size_t size, int flags)
-{
- return -EOPNOTSUPP;
-}
-
-static inline int
-ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
- const char *name, const void *value, size_t size, int flags)
-{
- return -EOPNOTSUPP;
-}
-
-static inline void
-ext4_xattr_delete_inode(handle_t *handle, struct inode *inode)
-{
-}
-
-static inline void
-ext4_xattr_put_super(struct super_block *sb)
-{
-}
-
-static inline int
-init_ext4_xattr(void)
-{
- return 0;
-}
-
-static inline void
-exit_ext4_xattr(void)
-{
-}
-
-static inline int
-ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
- struct ext4_inode *raw_inode, handle_t *handle)
-{
- return -EOPNOTSUPP;
-}
+extern const struct xattr_handler *ext4_xattr_handlers[];
-#define ext4_xattr_handlers NULL
+extern int ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i,
+ struct ext4_xattr_ibody_find *is);
+extern int ext4_xattr_ibody_get(struct inode *inode, int name_index,
+ const char *name,
+ void *buffer, size_t buffer_size);
+extern int ext4_xattr_ibody_inline_set(handle_t *handle, struct inode *inode,
+ struct ext4_xattr_info *i,
+ struct ext4_xattr_ibody_find *is);
-# endif /* CONFIG_EXT4DEV_FS_XATTR */
+extern struct mb_cache *ext4_xattr_create_cache(char *name);
+extern void ext4_xattr_destroy_cache(struct mb_cache *);
-#ifdef CONFIG_EXT4DEV_FS_SECURITY
+#ifdef CONFIG_EXT4_FS_SECURITY
extern int ext4_init_security(handle_t *handle, struct inode *inode,
- struct inode *dir);
+ struct inode *dir, const struct qstr *qstr);
#else
static inline int ext4_init_security(handle_t *handle, struct inode *inode,
- struct inode *dir)
+ struct inode *dir, const struct qstr *qstr)
{
return 0;
}
diff --git a/fs/ext4/xattr_security.c b/fs/ext4/xattr_security.c
index f17eaf2321b..d2a200624af 100644
--- a/fs/ext4/xattr_security.c
+++ b/fs/ext4/xattr_security.c
@@ -3,17 +3,17 @@
* Handler for storing security labels as extended attributes.
*/
-#include <linux/module.h>
#include <linux/string.h>
#include <linux/fs.h>
-#include <linux/ext4_jbd2.h>
-#include <linux/ext4_fs.h>
#include <linux/security.h>
+#include <linux/slab.h>
+#include "ext4_jbd2.h"
+#include "ext4.h"
#include "xattr.h"
static size_t
-ext4_xattr_security_list(struct inode *inode, char *list, size_t list_size,
- const char *name, size_t name_len)
+ext4_xattr_security_list(struct dentry *dentry, char *list, size_t list_size,
+ const char *name, size_t name_len, int type)
{
const size_t prefix_len = sizeof(XATTR_SECURITY_PREFIX)-1;
const size_t total_len = prefix_len + name_len + 1;
@@ -28,47 +28,53 @@ ext4_xattr_security_list(struct inode *inode, char *list, size_t list_size,
}
static int
-ext4_xattr_security_get(struct inode *inode, const char *name,
- void *buffer, size_t size)
+ext4_xattr_security_get(struct dentry *dentry, const char *name,
+ void *buffer, size_t size, int type)
{
if (strcmp(name, "") == 0)
return -EINVAL;
- return ext4_xattr_get(inode, EXT4_XATTR_INDEX_SECURITY, name,
- buffer, size);
+ return ext4_xattr_get(dentry->d_inode, EXT4_XATTR_INDEX_SECURITY,
+ name, buffer, size);
}
static int
-ext4_xattr_security_set(struct inode *inode, const char *name,
- const void *value, size_t size, int flags)
+ext4_xattr_security_set(struct dentry *dentry, const char *name,
+ const void *value, size_t size, int flags, int type)
{
if (strcmp(name, "") == 0)
return -EINVAL;
- return ext4_xattr_set(inode, EXT4_XATTR_INDEX_SECURITY, name,
- value, size, flags);
+ return ext4_xattr_set(dentry->d_inode, EXT4_XATTR_INDEX_SECURITY,
+ name, value, size, flags);
}
-int
-ext4_init_security(handle_t *handle, struct inode *inode, struct inode *dir)
+static int
+ext4_initxattrs(struct inode *inode, const struct xattr *xattr_array,
+ void *fs_info)
{
- int err;
- size_t len;
- void *value;
- char *name;
+ const struct xattr *xattr;
+ handle_t *handle = fs_info;
+ int err = 0;
- err = security_inode_init_security(inode, dir, &name, &value, &len);
- if (err) {
- if (err == -EOPNOTSUPP)
- return 0;
- return err;
+ for (xattr = xattr_array; xattr->name != NULL; xattr++) {
+ err = ext4_xattr_set_handle(handle, inode,
+ EXT4_XATTR_INDEX_SECURITY,
+ xattr->name, xattr->value,
+ xattr->value_len, 0);
+ if (err < 0)
+ break;
}
- err = ext4_xattr_set_handle(handle, inode, EXT4_XATTR_INDEX_SECURITY,
- name, value, len, 0);
- kfree(name);
- kfree(value);
return err;
}
-struct xattr_handler ext4_xattr_security_handler = {
+int
+ext4_init_security(handle_t *handle, struct inode *inode, struct inode *dir,
+ const struct qstr *qstr)
+{
+ return security_inode_init_security(inode, dir, qstr,
+ &ext4_initxattrs, handle);
+}
+
+const struct xattr_handler ext4_xattr_security_handler = {
.prefix = XATTR_SECURITY_PREFIX,
.list = ext4_xattr_security_list,
.get = ext4_xattr_security_get,
diff --git a/fs/ext4/xattr_trusted.c b/fs/ext4/xattr_trusted.c
index e0f05acdafe..95f1f4ab59a 100644
--- a/fs/ext4/xattr_trusted.c
+++ b/fs/ext4/xattr_trusted.c
@@ -5,21 +5,18 @@
* Copyright (C) 2003 by Andreas Gruenbacher, <a.gruenbacher@computer.org>
*/
-#include <linux/module.h>
#include <linux/string.h>
#include <linux/capability.h>
#include <linux/fs.h>
-#include <linux/ext4_jbd2.h>
-#include <linux/ext4_fs.h>
+#include "ext4_jbd2.h"
+#include "ext4.h"
#include "xattr.h"
-#define XATTR_TRUSTED_PREFIX "trusted."
-
static size_t
-ext4_xattr_trusted_list(struct inode *inode, char *list, size_t list_size,
- const char *name, size_t name_len)
+ext4_xattr_trusted_list(struct dentry *dentry, char *list, size_t list_size,
+ const char *name, size_t name_len, int type)
{
- const size_t prefix_len = sizeof(XATTR_TRUSTED_PREFIX)-1;
+ const size_t prefix_len = XATTR_TRUSTED_PREFIX_LEN;
const size_t total_len = prefix_len + name_len + 1;
if (!capable(CAP_SYS_ADMIN))
@@ -34,26 +31,26 @@ ext4_xattr_trusted_list(struct inode *inode, char *list, size_t list_size,
}
static int
-ext4_xattr_trusted_get(struct inode *inode, const char *name,
- void *buffer, size_t size)
+ext4_xattr_trusted_get(struct dentry *dentry, const char *name, void *buffer,
+ size_t size, int type)
{
if (strcmp(name, "") == 0)
return -EINVAL;
- return ext4_xattr_get(inode, EXT4_XATTR_INDEX_TRUSTED, name,
- buffer, size);
+ return ext4_xattr_get(dentry->d_inode, EXT4_XATTR_INDEX_TRUSTED,
+ name, buffer, size);
}
static int
-ext4_xattr_trusted_set(struct inode *inode, const char *name,
- const void *value, size_t size, int flags)
+ext4_xattr_trusted_set(struct dentry *dentry, const char *name,
+ const void *value, size_t size, int flags, int type)
{
if (strcmp(name, "") == 0)
return -EINVAL;
- return ext4_xattr_set(inode, EXT4_XATTR_INDEX_TRUSTED, name,
- value, size, flags);
+ return ext4_xattr_set(dentry->d_inode, EXT4_XATTR_INDEX_TRUSTED,
+ name, value, size, flags);
}
-struct xattr_handler ext4_xattr_trusted_handler = {
+const struct xattr_handler ext4_xattr_trusted_handler = {
.prefix = XATTR_TRUSTED_PREFIX,
.list = ext4_xattr_trusted_list,
.get = ext4_xattr_trusted_get,
diff --git a/fs/ext4/xattr_user.c b/fs/ext4/xattr_user.c
index 7ed3d8ebf09..0edb7611ffb 100644
--- a/fs/ext4/xattr_user.c
+++ b/fs/ext4/xattr_user.c
@@ -5,23 +5,20 @@
* Copyright (C) 2001 by Andreas Gruenbacher, <a.gruenbacher@computer.org>
*/
-#include <linux/module.h>
#include <linux/string.h>
#include <linux/fs.h>
-#include <linux/ext4_jbd2.h>
-#include <linux/ext4_fs.h>
+#include "ext4_jbd2.h"
+#include "ext4.h"
#include "xattr.h"
-#define XATTR_USER_PREFIX "user."
-
static size_t
-ext4_xattr_user_list(struct inode *inode, char *list, size_t list_size,
- const char *name, size_t name_len)
+ext4_xattr_user_list(struct dentry *dentry, char *list, size_t list_size,
+ const char *name, size_t name_len, int type)
{
- const size_t prefix_len = sizeof(XATTR_USER_PREFIX)-1;
+ const size_t prefix_len = XATTR_USER_PREFIX_LEN;
const size_t total_len = prefix_len + name_len + 1;
- if (!test_opt(inode->i_sb, XATTR_USER))
+ if (!test_opt(dentry->d_sb, XATTR_USER))
return 0;
if (list && total_len <= list_size) {
@@ -33,29 +30,30 @@ ext4_xattr_user_list(struct inode *inode, char *list, size_t list_size,
}
static int
-ext4_xattr_user_get(struct inode *inode, const char *name,
- void *buffer, size_t size)
+ext4_xattr_user_get(struct dentry *dentry, const char *name,
+ void *buffer, size_t size, int type)
{
if (strcmp(name, "") == 0)
return -EINVAL;
- if (!test_opt(inode->i_sb, XATTR_USER))
+ if (!test_opt(dentry->d_sb, XATTR_USER))
return -EOPNOTSUPP;
- return ext4_xattr_get(inode, EXT4_XATTR_INDEX_USER, name, buffer, size);
+ return ext4_xattr_get(dentry->d_inode, EXT4_XATTR_INDEX_USER,
+ name, buffer, size);
}
static int
-ext4_xattr_user_set(struct inode *inode, const char *name,
- const void *value, size_t size, int flags)
+ext4_xattr_user_set(struct dentry *dentry, const char *name,
+ const void *value, size_t size, int flags, int type)
{
if (strcmp(name, "") == 0)
return -EINVAL;
- if (!test_opt(inode->i_sb, XATTR_USER))
+ if (!test_opt(dentry->d_sb, XATTR_USER))
return -EOPNOTSUPP;
- return ext4_xattr_set(inode, EXT4_XATTR_INDEX_USER, name,
- value, size, flags);
+ return ext4_xattr_set(dentry->d_inode, EXT4_XATTR_INDEX_USER,
+ name, value, size, flags);
}
-struct xattr_handler ext4_xattr_user_handler = {
+const struct xattr_handler ext4_xattr_user_handler = {
.prefix = XATTR_USER_PREFIX,
.list = ext4_xattr_user_list,
.get = ext4_xattr_user_get,