aboutsummaryrefslogtreecommitdiff
path: root/fs/ext3
diff options
context:
space:
mode:
Diffstat (limited to 'fs/ext3')
-rw-r--r--fs/ext3/Kconfig89
-rw-r--r--fs/ext3/Makefile2
-rw-r--r--fs/ext3/acl.c416
-rw-r--r--fs/ext3/acl.h20
-rw-r--r--fs/ext3/balloc.c1178
-rw-r--r--fs/ext3/bitmap.c18
-rw-r--r--fs/ext3/dir.c474
-rw-r--r--fs/ext3/ext3.h1326
-rw-r--r--fs/ext3/ext3_jbd.c59
-rw-r--r--fs/ext3/file.c102
-rw-r--r--fs/ext3/fsync.c77
-rw-r--r--fs/ext3/hash.c94
-rw-r--r--fs/ext3/ialloc.c301
-rw-r--r--fs/ext3/inode.c2031
-rw-r--r--fs/ext3/ioctl.c237
-rw-r--r--fs/ext3/namei.c915
-rw-r--r--fs/ext3/namei.h19
-rw-r--r--fs/ext3/resize.c400
-rw-r--r--fs/ext3/super.c1900
-rw-r--r--fs/ext3/symlink.c10
-rw-r--r--fs/ext3/xattr.c221
-rw-r--r--fs/ext3/xattr.h22
-rw-r--r--fs/ext3/xattr_security.c67
-rw-r--r--fs/ext3/xattr_trusted.c32
-rw-r--r--fs/ext3/xattr_user.c38
25 files changed, 6609 insertions, 3439 deletions
diff --git a/fs/ext3/Kconfig b/fs/ext3/Kconfig
new file mode 100644
index 00000000000..e8c6ba0e4a3
--- /dev/null
+++ b/fs/ext3/Kconfig
@@ -0,0 +1,89 @@
+config EXT3_FS
+ tristate "Ext3 journalling file system support"
+ select JBD
+ help
+ This is the journalling version of the Second extended file system
+ (often called ext3), the de facto standard Linux file system
+ (method to organize files on a storage device) for hard disks.
+
+ The journalling code included in this driver means you do not have
+ to run e2fsck (file system checker) on your file systems after a
+ crash. The journal keeps track of any changes that were being made
+ at the time the system crashed, and can ensure that your file system
+ is consistent without the need for a lengthy check.
+
+ Other than adding the journal to the file system, the on-disk format
+ of ext3 is identical to ext2. It is possible to freely switch
+ between using the ext3 driver and the ext2 driver, as long as the
+ file system has been cleanly unmounted, or e2fsck is run on the file
+ system.
+
+ To add a journal on an existing ext2 file system or change the
+ behavior of ext3 file systems, you can use the tune2fs utility ("man
+ tune2fs"). To modify attributes of files and directories on ext3
+ file systems, use chattr ("man chattr"). You need to be using
+ e2fsprogs version 1.20 or later in order to create ext3 journals
+ (available at <http://sourceforge.net/projects/e2fsprogs/>).
+
+ To compile this file system support as a module, choose M here: the
+ module will be called ext3.
+
+config EXT3_DEFAULTS_TO_ORDERED
+ bool "Default to 'data=ordered' in ext3"
+ depends on EXT3_FS
+ default y
+ help
+ The journal mode options for ext3 have different tradeoffs
+ between when data is guaranteed to be on disk and
+ performance. The use of "data=writeback" can cause
+ unwritten data to appear in files after a system crash or
+ power failure, which can be a security issue. However,
+ "data=ordered" mode can also result in major performance
+ problems, including seconds-long delays before an fsync()
+ call returns. For details, see:
+
+ http://ext4.wiki.kernel.org/index.php/Ext3_data_mode_tradeoffs
+
+ If you have been historically happy with ext3's performance,
+ data=ordered mode will be a safe choice and you should
+ answer 'y' here. If you understand the reliability and data
+ privacy issues of data=writeback and are willing to make
+ that trade off, answer 'n'.
+
+config EXT3_FS_XATTR
+ bool "Ext3 extended attributes"
+ depends on EXT3_FS
+ default y
+ help
+ Extended attributes are name:value pairs associated with inodes by
+ the kernel or by users (see the attr(5) manual page, or visit
+ <http://acl.bestbits.at/> for details).
+
+ If unsure, say N.
+
+ You need this for POSIX ACL support on ext3.
+
+config EXT3_FS_POSIX_ACL
+ bool "Ext3 POSIX Access Control Lists"
+ depends on EXT3_FS_XATTR
+ select FS_POSIX_ACL
+ help
+ Posix Access Control Lists (ACLs) support permissions for users and
+ groups beyond the owner/group/world scheme.
+
+ To learn more about Access Control Lists, visit the Posix ACLs for
+ Linux website <http://acl.bestbits.at/>.
+
+ If you don't know what Access Control Lists are, say N.
+
+config EXT3_FS_SECURITY
+ bool "Ext3 Security Labels"
+ depends on EXT3_FS_XATTR
+ help
+ Security labels support alternative access control models
+ implemented by security modules like SELinux. This option
+ enables an extended attribute handler for file security
+ labels in the ext3 filesystem.
+
+ If you are not using a security module that requires using
+ extended attributes for file security labels, say N.
diff --git a/fs/ext3/Makefile b/fs/ext3/Makefile
index 704cd44a40c..e77766a8b3f 100644
--- a/fs/ext3/Makefile
+++ b/fs/ext3/Makefile
@@ -5,7 +5,7 @@
obj-$(CONFIG_EXT3_FS) += ext3.o
ext3-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \
- ioctl.o namei.o super.o symlink.o hash.o resize.o
+ ioctl.o namei.o super.o symlink.o hash.o resize.o ext3_jbd.o
ext3-$(CONFIG_EXT3_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o
ext3-$(CONFIG_EXT3_FS_POSIX_ACL) += acl.o
diff --git a/fs/ext3/acl.c b/fs/ext3/acl.c
index 47a9da2dfb4..8bbaf5bcf98 100644
--- a/fs/ext3/acl.c
+++ b/fs/ext3/acl.c
@@ -4,13 +4,7 @@
* Copyright (C) 2001-2003 Andreas Gruenbacher, <agruen@suse.de>
*/
-#include <linux/init.h>
-#include <linux/sched.h>
-#include <linux/slab.h>
-#include <linux/capability.h>
-#include <linux/fs.h>
-#include <linux/ext3_jbd.h>
-#include <linux/ext3_fs.h>
+#include "ext3.h"
#include "xattr.h"
#include "acl.h"
@@ -37,7 +31,7 @@ ext3_acl_from_disk(const void *value, size_t size)
return ERR_PTR(-EINVAL);
if (count == 0)
return NULL;
- acl = posix_acl_alloc(count, GFP_KERNEL);
+ acl = posix_acl_alloc(count, GFP_NOFS);
if (!acl)
return ERR_PTR(-ENOMEM);
for (n=0; n < count; n++) {
@@ -54,16 +48,23 @@ ext3_acl_from_disk(const void *value, size_t size)
case ACL_OTHER:
value = (char *)value +
sizeof(ext3_acl_entry_short);
- acl->a_entries[n].e_id = ACL_UNDEFINED_ID;
break;
case ACL_USER:
+ value = (char *)value + sizeof(ext3_acl_entry);
+ if ((char *)value > end)
+ goto fail;
+ acl->a_entries[n].e_uid =
+ make_kuid(&init_user_ns,
+ le32_to_cpu(entry->e_id));
+ break;
case ACL_GROUP:
value = (char *)value + sizeof(ext3_acl_entry);
if ((char *)value > end)
goto fail;
- acl->a_entries[n].e_id =
- le32_to_cpu(entry->e_id);
+ acl->a_entries[n].e_gid =
+ make_kgid(&init_user_ns,
+ le32_to_cpu(entry->e_id));
break;
default:
@@ -90,21 +91,26 @@ ext3_acl_to_disk(const struct posix_acl *acl, size_t *size)
size_t n;
*size = ext3_acl_size(acl->a_count);
- ext_acl = (ext3_acl_header *)kmalloc(sizeof(ext3_acl_header) +
- acl->a_count * sizeof(ext3_acl_entry), GFP_KERNEL);
+ ext_acl = kmalloc(sizeof(ext3_acl_header) + acl->a_count *
+ sizeof(ext3_acl_entry), GFP_NOFS);
if (!ext_acl)
return ERR_PTR(-ENOMEM);
ext_acl->a_version = cpu_to_le32(EXT3_ACL_VERSION);
e = (char *)ext_acl + sizeof(ext3_acl_header);
for (n=0; n < acl->a_count; n++) {
+ const struct posix_acl_entry *acl_e = &acl->a_entries[n];
ext3_acl_entry *entry = (ext3_acl_entry *)e;
- entry->e_tag = cpu_to_le16(acl->a_entries[n].e_tag);
- entry->e_perm = cpu_to_le16(acl->a_entries[n].e_perm);
- switch(acl->a_entries[n].e_tag) {
+ entry->e_tag = cpu_to_le16(acl_e->e_tag);
+ entry->e_perm = cpu_to_le16(acl_e->e_perm);
+ switch(acl_e->e_tag) {
case ACL_USER:
+ entry->e_id = cpu_to_le32(
+ from_kuid(&init_user_ns, acl_e->e_uid));
+ e += sizeof(ext3_acl_entry);
+ break;
case ACL_GROUP:
- entry->e_id =
- cpu_to_le32(acl->a_entries[n].e_id);
+ entry->e_id = cpu_to_le32(
+ from_kgid(&init_user_ns, acl_e->e_gid));
e += sizeof(ext3_acl_entry);
break;
@@ -126,68 +132,33 @@ fail:
return ERR_PTR(-EINVAL);
}
-static inline struct posix_acl *
-ext3_iget_acl(struct inode *inode, struct posix_acl **i_acl)
-{
- struct posix_acl *acl = EXT3_ACL_NOT_CACHED;
-
- spin_lock(&inode->i_lock);
- if (*i_acl != EXT3_ACL_NOT_CACHED)
- acl = posix_acl_dup(*i_acl);
- spin_unlock(&inode->i_lock);
-
- return acl;
-}
-
-static inline void
-ext3_iset_acl(struct inode *inode, struct posix_acl **i_acl,
- struct posix_acl *acl)
-{
- spin_lock(&inode->i_lock);
- if (*i_acl != EXT3_ACL_NOT_CACHED)
- posix_acl_release(*i_acl);
- *i_acl = posix_acl_dup(acl);
- spin_unlock(&inode->i_lock);
-}
-
/*
* Inode operation get_posix_acl().
*
* inode->i_mutex: don't care
*/
-static struct posix_acl *
+struct posix_acl *
ext3_get_acl(struct inode *inode, int type)
{
- struct ext3_inode_info *ei = EXT3_I(inode);
int name_index;
char *value = NULL;
struct posix_acl *acl;
int retval;
- if (!test_opt(inode->i_sb, POSIX_ACL))
- return NULL;
-
- switch(type) {
- case ACL_TYPE_ACCESS:
- acl = ext3_iget_acl(inode, &ei->i_acl);
- if (acl != EXT3_ACL_NOT_CACHED)
- return acl;
- name_index = EXT3_XATTR_INDEX_POSIX_ACL_ACCESS;
- break;
-
- case ACL_TYPE_DEFAULT:
- acl = ext3_iget_acl(inode, &ei->i_default_acl);
- if (acl != EXT3_ACL_NOT_CACHED)
- return acl;
- name_index = EXT3_XATTR_INDEX_POSIX_ACL_DEFAULT;
- break;
-
- default:
- return ERR_PTR(-EINVAL);
+ switch (type) {
+ case ACL_TYPE_ACCESS:
+ name_index = EXT3_XATTR_INDEX_POSIX_ACL_ACCESS;
+ break;
+ case ACL_TYPE_DEFAULT:
+ name_index = EXT3_XATTR_INDEX_POSIX_ACL_DEFAULT;
+ break;
+ default:
+ BUG();
}
+
retval = ext3_xattr_get(inode, name_index, "", NULL, 0);
if (retval > 0) {
- value = kmalloc(retval, GFP_KERNEL);
+ value = kmalloc(retval, GFP_NOFS);
if (!value)
return ERR_PTR(-ENOMEM);
retval = ext3_xattr_get(inode, name_index, "", value, retval);
@@ -200,17 +171,9 @@ ext3_get_acl(struct inode *inode, int type)
acl = ERR_PTR(retval);
kfree(value);
- if (!IS_ERR(acl)) {
- switch(type) {
- case ACL_TYPE_ACCESS:
- ext3_iset_acl(inode, &ei->i_acl, acl);
- break;
+ if (!IS_ERR(acl))
+ set_cached_acl(inode, type, acl);
- case ACL_TYPE_DEFAULT:
- ext3_iset_acl(inode, &ei->i_default_acl, acl);
- break;
- }
- }
return acl;
}
@@ -220,28 +183,23 @@ ext3_get_acl(struct inode *inode, int type)
* inode->i_mutex: down unless called from ext3_new_inode
*/
static int
-ext3_set_acl(handle_t *handle, struct inode *inode, int type,
+__ext3_set_acl(handle_t *handle, struct inode *inode, int type,
struct posix_acl *acl)
{
- struct ext3_inode_info *ei = EXT3_I(inode);
int name_index;
void *value = NULL;
- size_t size;
+ size_t size = 0;
int error;
- if (S_ISLNK(inode->i_mode))
- return -EOPNOTSUPP;
-
switch(type) {
case ACL_TYPE_ACCESS:
name_index = EXT3_XATTR_INDEX_POSIX_ACL_ACCESS;
if (acl) {
- mode_t mode = inode->i_mode;
- error = posix_acl_equiv_mode(acl, &mode);
+ error = posix_acl_equiv_mode(acl, &inode->i_mode);
if (error < 0)
return error;
else {
- inode->i_mode = mode;
+ inode->i_ctime = CURRENT_TIME_SEC;
ext3_mark_inode_dirty(handle, inode);
if (error == 0)
acl = NULL;
@@ -258,7 +216,7 @@ ext3_set_acl(handle_t *handle, struct inode *inode, int type,
default:
return -EINVAL;
}
- if (acl) {
+ if (acl) {
value = ext3_acl_to_disk(acl, &size);
if (IS_ERR(value))
return (int)PTR_ERR(value);
@@ -268,284 +226,56 @@ ext3_set_acl(handle_t *handle, struct inode *inode, int type,
value, size, 0);
kfree(value);
- if (!error) {
- switch(type) {
- case ACL_TYPE_ACCESS:
- ext3_iset_acl(inode, &ei->i_acl, acl);
- break;
- case ACL_TYPE_DEFAULT:
- ext3_iset_acl(inode, &ei->i_default_acl, acl);
- break;
- }
- }
- return error;
-}
-
-static int
-ext3_check_acl(struct inode *inode, int mask)
-{
- struct posix_acl *acl = ext3_get_acl(inode, ACL_TYPE_ACCESS);
-
- if (IS_ERR(acl))
- return PTR_ERR(acl);
- if (acl) {
- int error = posix_acl_permission(inode, acl, mask);
- posix_acl_release(acl);
- return error;
- }
-
- return -EAGAIN;
-}
-
-int
-ext3_permission(struct inode *inode, int mask, struct nameidata *nd)
-{
- return generic_permission(inode, mask, ext3_check_acl);
-}
+ if (!error)
+ set_cached_acl(inode, type, acl);
-/*
- * Initialize the ACLs of a new inode. Called from ext3_new_inode.
- *
- * dir->i_mutex: down
- * inode->i_mutex: up (access to inode is still exclusive)
- */
-int
-ext3_init_acl(handle_t *handle, struct inode *inode, struct inode *dir)
-{
- struct posix_acl *acl = NULL;
- int error = 0;
-
- if (!S_ISLNK(inode->i_mode)) {
- if (test_opt(dir->i_sb, POSIX_ACL)) {
- acl = ext3_get_acl(dir, ACL_TYPE_DEFAULT);
- if (IS_ERR(acl))
- return PTR_ERR(acl);
- }
- if (!acl)
- inode->i_mode &= ~current->fs->umask;
- }
- if (test_opt(inode->i_sb, POSIX_ACL) && acl) {
- struct posix_acl *clone;
- mode_t mode;
-
- if (S_ISDIR(inode->i_mode)) {
- error = ext3_set_acl(handle, inode,
- ACL_TYPE_DEFAULT, acl);
- if (error)
- goto cleanup;
- }
- clone = posix_acl_clone(acl, GFP_KERNEL);
- error = -ENOMEM;
- if (!clone)
- goto cleanup;
-
- mode = inode->i_mode;
- error = posix_acl_create_masq(clone, &mode);
- if (error >= 0) {
- inode->i_mode = mode;
- if (error > 0) {
- /* This is an extended ACL */
- error = ext3_set_acl(handle, inode,
- ACL_TYPE_ACCESS, clone);
- }
- }
- posix_acl_release(clone);
- }
-cleanup:
- posix_acl_release(acl);
return error;
}
-/*
- * Does chmod for an inode that may have an Access Control List. The
- * inode->i_mode field must be updated to the desired value by the caller
- * before calling this function.
- * Returns 0 on success, or a negative error number.
- *
- * We change the ACL rather than storing some ACL entries in the file
- * mode permission bits (which would be more efficient), because that
- * would break once additional permissions (like ACL_APPEND, ACL_DELETE
- * for directories) are added. There are no more bits available in the
- * file mode.
- *
- * inode->i_mutex: down
- */
int
-ext3_acl_chmod(struct inode *inode)
-{
- struct posix_acl *acl, *clone;
- int error;
-
- if (S_ISLNK(inode->i_mode))
- return -EOPNOTSUPP;
- if (!test_opt(inode->i_sb, POSIX_ACL))
- return 0;
- acl = ext3_get_acl(inode, ACL_TYPE_ACCESS);
- if (IS_ERR(acl) || !acl)
- return PTR_ERR(acl);
- clone = posix_acl_clone(acl, GFP_KERNEL);
- posix_acl_release(acl);
- if (!clone)
- return -ENOMEM;
- error = posix_acl_chmod_masq(clone, inode->i_mode);
- if (!error) {
- handle_t *handle;
- int retries = 0;
-
- retry:
- handle = ext3_journal_start(inode,
- EXT3_DATA_TRANS_BLOCKS(inode->i_sb));
- if (IS_ERR(handle)) {
- error = PTR_ERR(handle);
- ext3_std_error(inode->i_sb, error);
- goto out;
- }
- error = ext3_set_acl(handle, inode, ACL_TYPE_ACCESS, clone);
- ext3_journal_stop(handle);
- if (error == -ENOSPC &&
- ext3_should_retry_alloc(inode->i_sb, &retries))
- goto retry;
- }
-out:
- posix_acl_release(clone);
- return error;
-}
-
-/*
- * Extended attribute handlers
- */
-static size_t
-ext3_xattr_list_acl_access(struct inode *inode, char *list, size_t list_len,
- const char *name, size_t name_len)
-{
- const size_t size = sizeof(POSIX_ACL_XATTR_ACCESS);
-
- if (!test_opt(inode->i_sb, POSIX_ACL))
- return 0;
- if (list && size <= list_len)
- memcpy(list, POSIX_ACL_XATTR_ACCESS, size);
- return size;
-}
-
-static size_t
-ext3_xattr_list_acl_default(struct inode *inode, char *list, size_t list_len,
- const char *name, size_t name_len)
-{
- const size_t size = sizeof(POSIX_ACL_XATTR_DEFAULT);
-
- if (!test_opt(inode->i_sb, POSIX_ACL))
- return 0;
- if (list && size <= list_len)
- memcpy(list, POSIX_ACL_XATTR_DEFAULT, size);
- return size;
-}
-
-static int
-ext3_xattr_get_acl(struct inode *inode, int type, void *buffer, size_t size)
-{
- struct posix_acl *acl;
- int error;
-
- if (!test_opt(inode->i_sb, POSIX_ACL))
- return -EOPNOTSUPP;
-
- acl = ext3_get_acl(inode, type);
- if (IS_ERR(acl))
- return PTR_ERR(acl);
- if (acl == NULL)
- return -ENODATA;
- error = posix_acl_to_xattr(acl, buffer, size);
- posix_acl_release(acl);
-
- return error;
-}
-
-static int
-ext3_xattr_get_acl_access(struct inode *inode, const char *name,
- void *buffer, size_t size)
-{
- if (strcmp(name, "") != 0)
- return -EINVAL;
- return ext3_xattr_get_acl(inode, ACL_TYPE_ACCESS, buffer, size);
-}
-
-static int
-ext3_xattr_get_acl_default(struct inode *inode, const char *name,
- void *buffer, size_t size)
-{
- if (strcmp(name, "") != 0)
- return -EINVAL;
- return ext3_xattr_get_acl(inode, ACL_TYPE_DEFAULT, buffer, size);
-}
-
-static int
-ext3_xattr_set_acl(struct inode *inode, int type, const void *value,
- size_t size)
+ext3_set_acl(struct inode *inode, struct posix_acl *acl, int type)
{
handle_t *handle;
- struct posix_acl *acl;
int error, retries = 0;
- if (!test_opt(inode->i_sb, POSIX_ACL))
- return -EOPNOTSUPP;
- if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER))
- return -EPERM;
-
- if (value) {
- acl = posix_acl_from_xattr(value, size);
- if (IS_ERR(acl))
- return PTR_ERR(acl);
- else if (acl) {
- error = posix_acl_valid(acl);
- if (error)
- goto release_and_out;
- }
- } else
- acl = NULL;
-
retry:
handle = ext3_journal_start(inode, EXT3_DATA_TRANS_BLOCKS(inode->i_sb));
if (IS_ERR(handle))
return PTR_ERR(handle);
- error = ext3_set_acl(handle, inode, type, acl);
+ error = __ext3_set_acl(handle, inode, type, acl);
ext3_journal_stop(handle);
if (error == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries))
goto retry;
-
-release_and_out:
- posix_acl_release(acl);
return error;
}
-static int
-ext3_xattr_set_acl_access(struct inode *inode, const char *name,
- const void *value, size_t size, int flags)
-{
- if (strcmp(name, "") != 0)
- return -EINVAL;
- return ext3_xattr_set_acl(inode, ACL_TYPE_ACCESS, value, size);
-}
-
-static int
-ext3_xattr_set_acl_default(struct inode *inode, const char *name,
- const void *value, size_t size, int flags)
+/*
+ * Initialize the ACLs of a new inode. Called from ext3_new_inode.
+ *
+ * dir->i_mutex: down
+ * inode->i_mutex: up (access to inode is still exclusive)
+ */
+int
+ext3_init_acl(handle_t *handle, struct inode *inode, struct inode *dir)
{
- if (strcmp(name, "") != 0)
- return -EINVAL;
- return ext3_xattr_set_acl(inode, ACL_TYPE_DEFAULT, value, size);
-}
+ struct posix_acl *default_acl, *acl;
+ int error;
-struct xattr_handler ext3_xattr_acl_access_handler = {
- .prefix = POSIX_ACL_XATTR_ACCESS,
- .list = ext3_xattr_list_acl_access,
- .get = ext3_xattr_get_acl_access,
- .set = ext3_xattr_set_acl_access,
-};
+ error = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl);
+ if (error)
+ return error;
-struct xattr_handler ext3_xattr_acl_default_handler = {
- .prefix = POSIX_ACL_XATTR_DEFAULT,
- .list = ext3_xattr_list_acl_default,
- .get = ext3_xattr_get_acl_default,
- .set = ext3_xattr_set_acl_default,
-};
+ if (default_acl) {
+ error = __ext3_set_acl(handle, inode, ACL_TYPE_DEFAULT,
+ default_acl);
+ posix_acl_release(default_acl);
+ }
+ if (acl) {
+ if (!error)
+ error = __ext3_set_acl(handle, inode, ACL_TYPE_ACCESS,
+ acl);
+ posix_acl_release(acl);
+ }
+ return error;
+}
diff --git a/fs/ext3/acl.h b/fs/ext3/acl.h
index 92d50b53a93..ea1c69edab9 100644
--- a/fs/ext3/acl.h
+++ b/fs/ext3/acl.h
@@ -53,27 +53,15 @@ static inline int ext3_acl_count(size_t size)
#ifdef CONFIG_EXT3_FS_POSIX_ACL
-/* Value for inode->u.ext3_i.i_acl and inode->u.ext3_i.i_default_acl
- if the ACL has not been cached */
-#define EXT3_ACL_NOT_CACHED ((void *)-1)
-
/* acl.c */
-extern int ext3_permission (struct inode *, int, struct nameidata *);
-extern int ext3_acl_chmod (struct inode *);
+extern struct posix_acl *ext3_get_acl(struct inode *inode, int type);
+extern int ext3_set_acl(struct inode *inode, struct posix_acl *acl, int type);
extern int ext3_init_acl (handle_t *, struct inode *, struct inode *);
-extern int init_ext3_acl(void);
-extern void exit_ext3_acl(void);
-
#else /* CONFIG_EXT3_FS_POSIX_ACL */
#include <linux/sched.h>
-#define ext3_permission NULL
-
-static inline int
-ext3_acl_chmod(struct inode *inode)
-{
- return 0;
-}
+#define ext3_get_acl NULL
+#define ext3_set_acl NULL
static inline int
ext3_init_acl(handle_t *handle, struct inode *inode, struct inode *dir)
diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c
index 6250fcdf14a..158b5d4ce06 100644
--- a/fs/ext3/balloc.c
+++ b/fs/ext3/balloc.c
@@ -11,15 +11,9 @@
* David S. Miller (davem@caip.rutgers.edu), 1995
*/
-#include <linux/config.h>
-#include <linux/time.h>
-#include <linux/capability.h>
-#include <linux/fs.h>
-#include <linux/jbd.h>
-#include <linux/ext3_fs.h>
-#include <linux/ext3_jbd.h>
#include <linux/quotaops.h>
-#include <linux/buffer_head.h>
+#include <linux/blkdev.h>
+#include "ext3.h"
/*
* balloc.c contains the blocks allocation and deallocation routines
@@ -33,12 +27,34 @@
* The file system contains group descriptors which are located after the
* super block. Each descriptor contains the number of the bitmap block and
* the free blocks count in the block. The descriptors are loaded in memory
- * when a file system is mounted (see ext3_read_super).
+ * when a file system is mounted (see ext3_fill_super).
*/
#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1)
+/*
+ * Calculate the block group number and offset, given a block number
+ */
+static void ext3_get_group_no_and_offset(struct super_block *sb,
+ ext3_fsblk_t blocknr, unsigned long *blockgrpp, ext3_grpblk_t *offsetp)
+{
+ struct ext3_super_block *es = EXT3_SB(sb)->s_es;
+
+ blocknr = blocknr - le32_to_cpu(es->s_first_data_block);
+ if (offsetp)
+ *offsetp = blocknr % EXT3_BLOCKS_PER_GROUP(sb);
+ if (blockgrpp)
+ *blockgrpp = blocknr / EXT3_BLOCKS_PER_GROUP(sb);
+}
+
+/**
+ * ext3_get_group_desc() -- load group descriptor from disk
+ * @sb: super block
+ * @block_group: given block group
+ * @bh: pointer to the buffer head to store the block
+ * group descriptor
+ */
struct ext3_group_desc * ext3_get_group_desc(struct super_block * sb,
unsigned int block_group,
struct buffer_head ** bh)
@@ -74,9 +90,57 @@ struct ext3_group_desc * ext3_get_group_desc(struct super_block * sb,
return desc + offset;
}
-/*
- * Read the bitmap for a given block_group, reading into the specified
- * slot in the superblock's bitmap cache.
+static int ext3_valid_block_bitmap(struct super_block *sb,
+ struct ext3_group_desc *desc,
+ unsigned int block_group,
+ struct buffer_head *bh)
+{
+ ext3_grpblk_t offset;
+ ext3_grpblk_t next_zero_bit;
+ ext3_fsblk_t bitmap_blk;
+ ext3_fsblk_t group_first_block;
+
+ group_first_block = ext3_group_first_block_no(sb, block_group);
+
+ /* check whether block bitmap block number is set */
+ bitmap_blk = le32_to_cpu(desc->bg_block_bitmap);
+ offset = bitmap_blk - group_first_block;
+ if (!ext3_test_bit(offset, bh->b_data))
+ /* bad block bitmap */
+ goto err_out;
+
+ /* check whether the inode bitmap block number is set */
+ bitmap_blk = le32_to_cpu(desc->bg_inode_bitmap);
+ offset = bitmap_blk - group_first_block;
+ if (!ext3_test_bit(offset, bh->b_data))
+ /* bad block bitmap */
+ goto err_out;
+
+ /* check whether the inode table block number is set */
+ bitmap_blk = le32_to_cpu(desc->bg_inode_table);
+ offset = bitmap_blk - group_first_block;
+ next_zero_bit = ext3_find_next_zero_bit(bh->b_data,
+ offset + EXT3_SB(sb)->s_itb_per_group,
+ offset);
+ if (next_zero_bit >= offset + EXT3_SB(sb)->s_itb_per_group)
+ /* good bitmap for inode tables */
+ return 1;
+
+err_out:
+ ext3_error(sb, __func__,
+ "Invalid block bitmap - "
+ "block_group = %d, block = %lu",
+ block_group, bitmap_blk);
+ return 0;
+}
+
+/**
+ * read_block_bitmap()
+ * @sb: super block
+ * @block_group: given block group
+ *
+ * Read the bitmap for a given block_group, and validate that the
+ * bits for block/inode/inode tables are set in the bitmaps
*
* Return buffer_head on success or NULL in case of failure.
*/
@@ -85,17 +149,37 @@ read_block_bitmap(struct super_block *sb, unsigned int block_group)
{
struct ext3_group_desc * desc;
struct buffer_head * bh = NULL;
+ ext3_fsblk_t bitmap_blk;
- desc = ext3_get_group_desc (sb, block_group, NULL);
+ desc = ext3_get_group_desc(sb, block_group, NULL);
if (!desc)
- goto error_out;
- bh = sb_bread(sb, le32_to_cpu(desc->bg_block_bitmap));
- if (!bh)
- ext3_error (sb, "read_block_bitmap",
+ return NULL;
+ trace_ext3_read_block_bitmap(sb, block_group);
+ bitmap_blk = le32_to_cpu(desc->bg_block_bitmap);
+ bh = sb_getblk(sb, bitmap_blk);
+ if (unlikely(!bh)) {
+ ext3_error(sb, __func__,
"Cannot read block bitmap - "
"block_group = %d, block_bitmap = %u",
block_group, le32_to_cpu(desc->bg_block_bitmap));
-error_out:
+ return NULL;
+ }
+ if (likely(bh_uptodate_or_lock(bh)))
+ return bh;
+
+ if (bh_submit_read(bh) < 0) {
+ brelse(bh);
+ ext3_error(sb, __func__,
+ "Cannot read block bitmap - "
+ "block_group = %d, block_bitmap = %u",
+ block_group, le32_to_cpu(desc->bg_block_bitmap));
+ return NULL;
+ }
+ ext3_valid_block_bitmap(sb, desc, block_group, bh);
+ /*
+ * file system mounted not to panic on error, continue with corrupt
+ * bitmap
+ */
return bh;
}
/*
@@ -104,15 +188,22 @@ error_out:
* Operations include:
* dump, find, add, remove, is_empty, find_next_reservable_window, etc.
*
- * We use sorted double linked list for the per-filesystem reservation
- * window list. (like in vm_region).
+ * We use a red-black tree to represent per-filesystem reservation
+ * windows.
*
- * Initially, we keep those small operations in the abstract functions,
- * so later if we need a better searching tree than double linked-list,
- * we could easily switch to that without changing too much
- * code.
*/
-#if 0
+
+/**
+ * __rsv_window_dump() -- Dump the filesystem block allocation reservation map
+ * @root: root of per-filesystem reservation rb tree
+ * @verbose: verbose mode
+ * @fn: function which wishes to dump the reservation map
+ *
+ * If verbose is turned on, it will print the whole block reservation
+ * windows(start, end). Otherwise, it will only print out the "bad" windows,
+ * those windows that overlap with their immediate neighbors.
+ */
+#if 1
static void __rsv_window_dump(struct rb_root *root, int verbose,
const char *fn)
{
@@ -127,10 +218,10 @@ restart:
printk("Block Allocation Reservation Windows Map (%s):\n", fn);
while (n) {
- rsv = list_entry(n, struct ext3_reserve_window_node, rsv_node);
+ rsv = rb_entry(n, struct ext3_reserve_window_node, rsv_node);
if (verbose)
printk("reservation window 0x%p "
- "start: %d, end: %d\n",
+ "start: %lu, end: %lu\n",
rsv, rsv->rsv_start, rsv->rsv_end);
if (rsv->rsv_start && rsv->rsv_start >= rsv->rsv_end) {
printk("Bad reservation %p (start >= end)\n",
@@ -153,41 +244,59 @@ restart:
prev = rsv;
}
printk("Window map complete.\n");
- if (bad)
- BUG();
+ BUG_ON(bad);
}
#define rsv_window_dump(root, verbose) \
- __rsv_window_dump((root), (verbose), __FUNCTION__)
+ __rsv_window_dump((root), (verbose), __func__)
#else
#define rsv_window_dump(root, verbose) do {} while (0)
#endif
+/**
+ * goal_in_my_reservation()
+ * @rsv: inode's reservation window
+ * @grp_goal: given goal block relative to the allocation block group
+ * @group: the current allocation block group
+ * @sb: filesystem super block
+ *
+ * Test if the given goal block (group relative) is within the file's
+ * own block reservation window range.
+ *
+ * If the reservation window is outside the goal allocation group, return 0;
+ * grp_goal (given goal block) could be -1, which means no specific
+ * goal block. In this case, always return 1.
+ * If the goal block is within the reservation window, return 1;
+ * otherwise, return 0;
+ */
static int
-goal_in_my_reservation(struct ext3_reserve_window *rsv, int goal,
+goal_in_my_reservation(struct ext3_reserve_window *rsv, ext3_grpblk_t grp_goal,
unsigned int group, struct super_block * sb)
{
- unsigned long group_first_block, group_last_block;
+ ext3_fsblk_t group_first_block, group_last_block;
- group_first_block = le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block) +
- group * EXT3_BLOCKS_PER_GROUP(sb);
- group_last_block = group_first_block + EXT3_BLOCKS_PER_GROUP(sb) - 1;
+ group_first_block = ext3_group_first_block_no(sb, group);
+ group_last_block = group_first_block + (EXT3_BLOCKS_PER_GROUP(sb) - 1);
if ((rsv->_rsv_start > group_last_block) ||
(rsv->_rsv_end < group_first_block))
return 0;
- if ((goal >= 0) && ((goal + group_first_block < rsv->_rsv_start)
- || (goal + group_first_block > rsv->_rsv_end)))
+ if ((grp_goal >= 0) && ((grp_goal + group_first_block < rsv->_rsv_start)
+ || (grp_goal + group_first_block > rsv->_rsv_end)))
return 0;
return 1;
}
-/*
+/**
+ * search_reserve_window()
+ * @root: root of reservation tree
+ * @goal: target allocation block
+ *
* Find the reserved window which includes the goal, or the previous one
* if the goal is not in any window.
* Returns NULL if there are no windows or if all windows start after the goal.
*/
static struct ext3_reserve_window_node *
-search_reserve_window(struct rb_root *root, unsigned long goal)
+search_reserve_window(struct rb_root *root, ext3_fsblk_t goal)
{
struct rb_node *n = root->rb_node;
struct ext3_reserve_window_node *rsv;
@@ -218,17 +327,25 @@ search_reserve_window(struct rb_root *root, unsigned long goal)
return rsv;
}
+/**
+ * ext3_rsv_window_add() -- Insert a window to the block reservation rb tree.
+ * @sb: super block
+ * @rsv: reservation window to add
+ *
+ * Must be called with rsv_lock held.
+ */
void ext3_rsv_window_add(struct super_block *sb,
struct ext3_reserve_window_node *rsv)
{
struct rb_root *root = &EXT3_SB(sb)->s_rsv_window_root;
struct rb_node *node = &rsv->rsv_node;
- unsigned int start = rsv->rsv_start;
+ ext3_fsblk_t start = rsv->rsv_start;
struct rb_node ** p = &root->rb_node;
struct rb_node * parent = NULL;
struct ext3_reserve_window_node *this;
+ trace_ext3_rsv_window_add(sb, rsv);
while (*p)
{
parent = *p;
@@ -238,14 +355,25 @@ void ext3_rsv_window_add(struct super_block *sb,
p = &(*p)->rb_left;
else if (start > this->rsv_end)
p = &(*p)->rb_right;
- else
+ else {
+ rsv_window_dump(root, 1);
BUG();
+ }
}
rb_link_node(node, parent, p);
rb_insert_color(node, root);
}
+/**
+ * rsv_window_remove() -- unlink a window from the reservation rb tree
+ * @sb: super block
+ * @rsv: reservation window to remove
+ *
+ * Mark the block reservation window as not allocated, and unlink it
+ * from the filesystem reservation window rb tree. Must be called with
+ * rsv_lock held.
+ */
static void rsv_window_remove(struct super_block *sb,
struct ext3_reserve_window_node *rsv)
{
@@ -255,15 +383,43 @@ static void rsv_window_remove(struct super_block *sb,
rb_erase(&rsv->rsv_node, &EXT3_SB(sb)->s_rsv_window_root);
}
+/*
+ * rsv_is_empty() -- Check if the reservation window is allocated.
+ * @rsv: given reservation window to check
+ *
+ * returns 1 if the end block is EXT3_RESERVE_WINDOW_NOT_ALLOCATED.
+ */
static inline int rsv_is_empty(struct ext3_reserve_window *rsv)
{
/* a valid reservation end block could not be 0 */
- return (rsv->_rsv_end == EXT3_RESERVE_WINDOW_NOT_ALLOCATED);
+ return rsv->_rsv_end == EXT3_RESERVE_WINDOW_NOT_ALLOCATED;
}
+
+/**
+ * ext3_init_block_alloc_info()
+ * @inode: file inode structure
+ *
+ * Allocate and initialize the reservation window structure, and
+ * link the window to the ext3 inode structure at last
+ *
+ * The reservation window structure is only dynamically allocated
+ * and linked to ext3 inode the first time the open file
+ * needs a new block. So, before every ext3_new_block(s) call, for
+ * regular files, we should check whether the reservation window
+ * structure exists or not. In the latter case, this function is called.
+ * Failing to do so will result in block reservation being turned off for that
+ * open file.
+ *
+ * This function is called from ext3_get_blocks_handle(), also called
+ * when setting the reservation window size through ioctl before the file
+ * is open for write (needs block allocation).
+ *
+ * Needs truncate_mutex protection prior to calling this function.
+ */
void ext3_init_block_alloc_info(struct inode *inode)
{
struct ext3_inode_info *ei = EXT3_I(inode);
- struct ext3_block_alloc_info *block_i = ei->i_block_alloc_info;
+ struct ext3_block_alloc_info *block_i;
struct super_block *sb = inode->i_sb;
block_i = kmalloc(sizeof(*block_i), GFP_NOFS);
@@ -273,7 +429,7 @@ void ext3_init_block_alloc_info(struct inode *inode)
rsv->rsv_start = EXT3_RESERVE_WINDOW_NOT_ALLOCATED;
rsv->rsv_end = EXT3_RESERVE_WINDOW_NOT_ALLOCATED;
- /*
+ /*
* if filesystem is mounted with NORESERVATION, the goal
* reservation window size is set to zero to indicate
* block reservation is off
@@ -289,6 +445,19 @@ void ext3_init_block_alloc_info(struct inode *inode)
ei->i_block_alloc_info = block_i;
}
+/**
+ * ext3_discard_reservation()
+ * @inode: inode
+ *
+ * Discard(free) block reservation window on last file close, or truncate
+ * or at last iput().
+ *
+ * It is being called in three cases:
+ * ext3_release_file(): last writer close the file
+ * ext3_clear_inode(): last iput(), when nobody link to this file.
+ * ext3_truncate(): when the block indirect map is about to change.
+ *
+ */
void ext3_discard_reservation(struct inode *inode)
{
struct ext3_inode_info *ei = EXT3_I(inode);
@@ -302,28 +471,37 @@ void ext3_discard_reservation(struct inode *inode)
rsv = &block_i->rsv_window_node;
if (!rsv_is_empty(&rsv->rsv_window)) {
spin_lock(rsv_lock);
- if (!rsv_is_empty(&rsv->rsv_window))
+ if (!rsv_is_empty(&rsv->rsv_window)) {
+ trace_ext3_discard_reservation(inode, rsv);
rsv_window_remove(inode->i_sb, rsv);
+ }
spin_unlock(rsv_lock);
}
}
-/* Free given blocks, update quota and i_blocks field */
+/**
+ * ext3_free_blocks_sb() -- Free given blocks and update quota
+ * @handle: handle to this transaction
+ * @sb: super block
+ * @block: start physical block to free
+ * @count: number of blocks to free
+ * @pdquot_freed_blocks: out parameter, number of blocks to release from quota
+ */
void ext3_free_blocks_sb(handle_t *handle, struct super_block *sb,
- unsigned long block, unsigned long count,
- int *pdquot_freed_blocks)
+ ext3_fsblk_t block, unsigned long count,
+ unsigned long *pdquot_freed_blocks)
{
struct buffer_head *bitmap_bh = NULL;
struct buffer_head *gd_bh;
unsigned long block_group;
- unsigned long bit;
+ ext3_grpblk_t bit;
unsigned long i;
unsigned long overflow;
struct ext3_group_desc * desc;
struct ext3_super_block * es;
struct ext3_sb_info *sbi;
int err = 0, ret;
- unsigned group_freed;
+ ext3_grpblk_t group_freed;
*pdquot_freed_blocks = 0;
sbi = EXT3_SB(sb);
@@ -333,7 +511,7 @@ void ext3_free_blocks_sb(handle_t *handle, struct super_block *sb,
block + count > le32_to_cpu(es->s_blocks_count)) {
ext3_error (sb, "ext3_free_blocks",
"Freeing blocks not in datazone - "
- "block = %lu, count = %lu", block, count);
+ "block = "E3FSBLK", count = %lu", block, count);
goto error_return;
}
@@ -366,11 +544,13 @@ do_more:
in_range (block, le32_to_cpu(desc->bg_inode_table),
sbi->s_itb_per_group) ||
in_range (block + count - 1, le32_to_cpu(desc->bg_inode_table),
- sbi->s_itb_per_group))
+ sbi->s_itb_per_group)) {
ext3_error (sb, "ext3_free_blocks",
"Freeing blocks in system zones - "
- "Block = %lu, count = %lu",
+ "Block = "E3FSBLK", count = %lu",
block, count);
+ goto error_return;
+ }
/*
* We are about to start releasing blocks in the bitmap,
@@ -407,7 +587,7 @@ do_more:
BUFFER_TRACE(debug_bh, "Deleted!");
if (!bh2jh(bitmap_bh)->b_committed_data)
BUFFER_TRACE(debug_bh,
- "No commited data in bitmap");
+ "No committed data in bitmap");
BUFFER_TRACE2(debug_bh, bitmap_bh, "bitmap");
__brelse(debug_bh);
}
@@ -421,8 +601,8 @@ do_more:
}
/* @@@ This prevents newly-allocated data from being
* freed and then reallocated within the same
- * transaction.
- *
+ * transaction.
+ *
* Ideally we would want to allow that to happen, but to
* do so requires making journal_forget() capable of
* revoking the queued write of a data block, which
@@ -435,7 +615,7 @@ do_more:
* safe not to set the allocation bit in the committed
* bitmap, because we know that there is no outstanding
* activity on the buffer any more and so it is safe to
- * reallocate it.
+ * reallocate it.
*/
BUFFER_TRACE(bitmap_bh, "set in b_committed_data");
J_ASSERT_BH(bitmap_bh,
@@ -452,8 +632,9 @@ do_more:
if (!ext3_clear_bit_atomic(sb_bgl_lock(sbi, block_group),
bit + i, bitmap_bh->b_data)) {
jbd_unlock_bh_state(bitmap_bh);
- ext3_error(sb, __FUNCTION__,
- "bit already cleared for block %lu", block + i);
+ ext3_error(sb, __func__,
+ "bit already cleared for block "E3FSBLK,
+ block + i);
jbd_lock_bh_state(bitmap_bh);
BUFFER_TRACE(bitmap_bh, "bit already cleared");
} else {
@@ -463,11 +644,9 @@ do_more:
jbd_unlock_bh_state(bitmap_bh);
spin_lock(sb_bgl_lock(sbi, block_group));
- desc->bg_free_blocks_count =
- cpu_to_le16(le16_to_cpu(desc->bg_free_blocks_count) +
- group_freed);
+ le16_add_cpu(&desc->bg_free_blocks_count, group_freed);
spin_unlock(sb_bgl_lock(sbi, block_group));
- percpu_counter_mod(&sbi->s_freeblocks_counter, count);
+ percpu_counter_add(&sbi->s_freeblocks_counter, count);
/* We dirtied the bitmap block */
BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
@@ -484,32 +663,38 @@ do_more:
count = overflow;
goto do_more;
}
- sb->s_dirt = 1;
+
error_return:
brelse(bitmap_bh);
ext3_std_error(sb, err);
return;
}
-/* Free given blocks, update quota and i_blocks field */
+/**
+ * ext3_free_blocks() -- Free given blocks and update quota
+ * @handle: handle for this transaction
+ * @inode: inode
+ * @block: start physical block to free
+ * @count: number of blocks to free
+ */
void ext3_free_blocks(handle_t *handle, struct inode *inode,
- unsigned long block, unsigned long count)
+ ext3_fsblk_t block, unsigned long count)
{
- struct super_block * sb;
- int dquot_freed_blocks;
+ struct super_block *sb = inode->i_sb;
+ unsigned long dquot_freed_blocks;
- sb = inode->i_sb;
- if (!sb) {
- printk ("ext3_free_blocks: nonexistent device");
- return;
- }
+ trace_ext3_free_blocks(inode, block, count);
ext3_free_blocks_sb(handle, sb, block, count, &dquot_freed_blocks);
if (dquot_freed_blocks)
- DQUOT_FREE_BLOCK(inode, dquot_freed_blocks);
+ dquot_free_block(inode, dquot_freed_blocks);
return;
}
-/*
+/**
+ * ext3_test_allocatable()
+ * @nr: block number (group relative) to test
+ * @bh: bufferhead contains the bitmap of the given block group
+ *
* For ext3 allocations, we must not reuse any blocks which are
* allocated in the bitmap buffer's "last committed data" copy. This
* prevents deletes from freeing up the page for reuse until we have
@@ -519,13 +704,13 @@ void ext3_free_blocks(handle_t *handle, struct inode *inode,
* data would allow the old block to be overwritten before the
* transaction committed (because we force data to disk before commit).
* This would lead to corruption if we crashed between overwriting the
- * data and committing the delete.
+ * data and committing the delete.
*
* @@@ We may want to make this allocation behaviour conditional on
* data-writes at some point, and disable it for metadata allocations or
* sync-data inodes.
*/
-static int ext3_test_allocatable(int nr, struct buffer_head *bh)
+static int ext3_test_allocatable(ext3_grpblk_t nr, struct buffer_head *bh)
{
int ret;
struct journal_head *jh = bh2jh(bh);
@@ -542,18 +727,23 @@ static int ext3_test_allocatable(int nr, struct buffer_head *bh)
return ret;
}
-static int
-bitmap_search_next_usable_block(int start, struct buffer_head *bh,
- int maxblocks)
+/**
+ * bitmap_search_next_usable_block()
+ * @start: the starting block (group relative) of the search
+ * @bh: bufferhead contains the block group bitmap
+ * @maxblocks: the ending block (group relative) of the reservation
+ *
+ * The bitmap search --- search forward alternately through the actual
+ * bitmap on disk and the last-committed copy in journal, until we find a
+ * bit free in both bitmaps.
+ */
+static ext3_grpblk_t
+bitmap_search_next_usable_block(ext3_grpblk_t start, struct buffer_head *bh,
+ ext3_grpblk_t maxblocks)
{
- int next;
+ ext3_grpblk_t next;
struct journal_head *jh = bh2jh(bh);
- /*
- * The bitmap search --- search forward alternately through the actual
- * bitmap and the last-committed copy until we find a bit free in
- * both
- */
while (start < maxblocks) {
next = ext3_find_next_zero_bit(bh->b_data, maxblocks, start);
if (next >= maxblocks)
@@ -563,35 +753,42 @@ bitmap_search_next_usable_block(int start, struct buffer_head *bh,
jbd_lock_bh_state(bh);
if (jh->b_committed_data)
start = ext3_find_next_zero_bit(jh->b_committed_data,
- maxblocks, next);
+ maxblocks, next);
jbd_unlock_bh_state(bh);
}
return -1;
}
-/*
- * Find an allocatable block in a bitmap. We honour both the bitmap and
+/**
+ * find_next_usable_block()
+ * @start: the starting block (group relative) to find next
+ * allocatable block in bitmap.
+ * @bh: bufferhead contains the block group bitmap
+ * @maxblocks: the ending block (group relative) for the search
+ *
+ * Find an allocatable block in a bitmap. We honor both the bitmap and
* its last-committed copy (if that exists), and perform the "most
* appropriate allocation" algorithm of looking for a free block near
* the initial goal; then for a free byte somewhere in the bitmap; then
* for any free bit in the bitmap.
*/
-static int
-find_next_usable_block(int start, struct buffer_head *bh, int maxblocks)
+static ext3_grpblk_t
+find_next_usable_block(ext3_grpblk_t start, struct buffer_head *bh,
+ ext3_grpblk_t maxblocks)
{
- int here, next;
+ ext3_grpblk_t here, next;
char *p, *r;
if (start > 0) {
/*
- * The goal was occupied; search forward for a free
+ * The goal was occupied; search forward for a free
* block within the next XX blocks.
*
* end_goal is more or less random, but it has to be
* less than EXT3_BLOCKS_PER_GROUP. Aligning up to the
* next 64-bit boundary is simple..
*/
- int end_goal = (start + 63) & ~63;
+ ext3_grpblk_t end_goal = (start + 63) & ~63;
if (end_goal > maxblocks)
end_goal = maxblocks;
here = ext3_find_next_zero_bit(bh->b_data, end_goal, start);
@@ -604,9 +801,9 @@ find_next_usable_block(int start, struct buffer_head *bh, int maxblocks)
if (here < 0)
here = 0;
- p = ((char *)bh->b_data) + (here >> 3);
- r = memscan(p, 0, (maxblocks - here + 7) >> 3);
- next = (r - ((char *)bh->b_data)) << 3;
+ p = bh->b_data + (here >> 3);
+ r = memscan(p, 0, ((maxblocks + 7) >> 3) - (here >> 3));
+ next = (r - bh->b_data) << 3;
if (next < maxblocks && next >= start && ext3_test_allocatable(next, bh))
return next;
@@ -620,7 +817,12 @@ find_next_usable_block(int start, struct buffer_head *bh, int maxblocks)
return here;
}
-/*
+/**
+ * claim_block()
+ * @lock: the spin lock for this block group
+ * @block: the free block (group relative) to allocate
+ * @bh: the buffer_head contains the block group bitmap
+ *
* We think we can allocate this block in this bitmap. Try to set the bit.
* If that succeeds then check that nobody has allocated and then freed the
* block since we saw that is was not marked in b_committed_data. If it _was_
@@ -628,7 +830,7 @@ find_next_usable_block(int start, struct buffer_head *bh, int maxblocks)
* zero (failure).
*/
static inline int
-claim_block(spinlock_t *lock, int block, struct buffer_head *bh)
+claim_block(spinlock_t *lock, ext3_grpblk_t block, struct buffer_head *bh)
{
struct journal_head *jh = bh2jh(bh);
int ret;
@@ -646,22 +848,42 @@ claim_block(spinlock_t *lock, int block, struct buffer_head *bh)
return ret;
}
-/*
+/**
+ * ext3_try_to_allocate()
+ * @sb: superblock
+ * @handle: handle to this transaction
+ * @group: given allocation block group
+ * @bitmap_bh: bufferhead holds the block bitmap
+ * @grp_goal: given target block within the group
+ * @count: target number of blocks to allocate
+ * @my_rsv: reservation window
+ *
+ * Attempt to allocate blocks within a given range. Set the range of allocation
+ * first, then find the first free bit(s) from the bitmap (within the range),
+ * and at last, allocate the blocks by claiming the found free bit as allocated.
+ *
+ * To set the range of this allocation:
+ * if there is a reservation window, only try to allocate block(s) from the
+ * file's own reservation window;
+ * Otherwise, the allocation range starts from the given goal block, ends at
+ * the block group's last block.
+ *
* If we failed to allocate the desired block then we may end up crossing to a
* new bitmap. In that case we must release write access to the old one via
* ext3_journal_release_buffer(), else we'll run out of credits.
*/
-static int
+static ext3_grpblk_t
ext3_try_to_allocate(struct super_block *sb, handle_t *handle, int group,
- struct buffer_head *bitmap_bh, int goal, struct ext3_reserve_window *my_rsv)
+ struct buffer_head *bitmap_bh, ext3_grpblk_t grp_goal,
+ unsigned long *count, struct ext3_reserve_window *my_rsv)
{
- int group_first_block, start, end;
+ ext3_fsblk_t group_first_block;
+ ext3_grpblk_t start, end;
+ unsigned long num = 0;
/* we do allocation within the reservation window if we have a window */
if (my_rsv) {
- group_first_block =
- le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block) +
- group * EXT3_BLOCKS_PER_GROUP(sb);
+ group_first_block = ext3_group_first_block_no(sb, group);
if (my_rsv->_rsv_start >= group_first_block)
start = my_rsv->_rsv_start - group_first_block;
else
@@ -671,13 +893,13 @@ ext3_try_to_allocate(struct super_block *sb, handle_t *handle, int group,
if (end > EXT3_BLOCKS_PER_GROUP(sb))
/* reservation window crosses group boundary */
end = EXT3_BLOCKS_PER_GROUP(sb);
- if ((start <= goal) && (goal < end))
- start = goal;
+ if ((start <= grp_goal) && (grp_goal < end))
+ start = grp_goal;
else
- goal = -1;
+ grp_goal = -1;
} else {
- if (goal > 0)
- start = goal;
+ if (grp_goal > 0)
+ start = grp_goal;
else
start = 0;
end = EXT3_BLOCKS_PER_GROUP(sb);
@@ -686,45 +908,57 @@ ext3_try_to_allocate(struct super_block *sb, handle_t *handle, int group,
BUG_ON(start > EXT3_BLOCKS_PER_GROUP(sb));
repeat:
- if (goal < 0 || !ext3_test_allocatable(goal, bitmap_bh)) {
- goal = find_next_usable_block(start, bitmap_bh, end);
- if (goal < 0)
+ if (grp_goal < 0 || !ext3_test_allocatable(grp_goal, bitmap_bh)) {
+ grp_goal = find_next_usable_block(start, bitmap_bh, end);
+ if (grp_goal < 0)
goto fail_access;
if (!my_rsv) {
int i;
- for (i = 0; i < 7 && goal > start &&
- ext3_test_allocatable(goal - 1,
+ for (i = 0; i < 7 && grp_goal > start &&
+ ext3_test_allocatable(grp_goal - 1,
bitmap_bh);
- i++, goal--)
+ i++, grp_goal--)
;
}
}
- start = goal;
+ start = grp_goal;
- if (!claim_block(sb_bgl_lock(EXT3_SB(sb), group), goal, bitmap_bh)) {
+ if (!claim_block(sb_bgl_lock(EXT3_SB(sb), group),
+ grp_goal, bitmap_bh)) {
/*
* The block was allocated by another thread, or it was
* allocated and then freed by another thread
*/
start++;
- goal++;
+ grp_goal++;
if (start >= end)
goto fail_access;
goto repeat;
}
- return goal;
+ num++;
+ grp_goal++;
+ while (num < *count && grp_goal < end
+ && ext3_test_allocatable(grp_goal, bitmap_bh)
+ && claim_block(sb_bgl_lock(EXT3_SB(sb), group),
+ grp_goal, bitmap_bh)) {
+ num++;
+ grp_goal++;
+ }
+ *count = num;
+ return grp_goal - num;
fail_access:
+ *count = num;
return -1;
}
/**
- * find_next_reservable_window():
+ * find_next_reservable_window():
* find a reservable space within the given range.
* It does not allocate the reservation window for now:
* alloc_new_reservation() will do the work later.
*
- * @search_head: the head of the searching list;
+ * @search_head: the head of the searching list;
* This is not necessarily the list head of the whole filesystem
*
* We have both head and start_block to assist the search
@@ -732,12 +966,14 @@ fail_access:
* but we will shift to the place where start_block is,
* then start from there, when looking for a reservable space.
*
- * @size: the target new reservation window size
+ * @my_rsv: the reservation window
*
- * @group_first_block: the first block we consider to start
+ * @sb: the super block
+ *
+ * @start_block: the first block we consider to start
* the real search from
*
- * @last_block:
+ * @last_block:
* the maximum block number that our goal reservable space
* could start from. This is normally the last block in this
* group. The search will end when we found the start of next
@@ -745,21 +981,22 @@ fail_access:
* This could handle the cross boundary reservation window
* request.
*
- * basically we search from the given range, rather than the whole
- * reservation double linked list, (start_block, last_block)
- * to find a free region that is of my size and has not
- * been reserved.
+ * basically we search from the given range, rather than the whole
+ * reservation double linked list, (start_block, last_block)
+ * to find a free region that is of my size and has not
+ * been reserved.
*
*/
static int find_next_reservable_window(
struct ext3_reserve_window_node *search_head,
struct ext3_reserve_window_node *my_rsv,
- struct super_block * sb, int start_block,
- int last_block)
+ struct super_block * sb,
+ ext3_fsblk_t start_block,
+ ext3_fsblk_t last_block)
{
struct rb_node *next;
struct ext3_reserve_window_node *rsv, *prev;
- int cur;
+ ext3_fsblk_t cur;
int size = my_rsv->rsv_goal_size;
/* TODO: make the start of the reservation window byte-aligned */
@@ -787,7 +1024,7 @@ static int find_next_reservable_window(
prev = rsv;
next = rb_next(&rsv->rsv_node);
- rsv = list_entry(next,struct ext3_reserve_window_node,rsv_node);
+ rsv = rb_entry(next,struct ext3_reserve_window_node,rsv_node);
/*
* Reached the last reservation, we can just append to the
@@ -800,7 +1037,7 @@ static int find_next_reservable_window(
/*
* Found a reserveable space big enough. We could
* have a reservation across the group boundary here
- */
+ */
break;
}
}
@@ -819,7 +1056,7 @@ static int find_next_reservable_window(
rsv_window_remove(sb, my_rsv);
/*
- * Let's book the whole avaliable window for now. We will check the
+ * Let's book the whole available window for now. We will check the
* disk bitmap later and then, if there are free blocks then we adjust
* the window size if it's larger than requested.
* Otherwise, we will remove this node from the tree next time
@@ -836,7 +1073,7 @@ static int find_next_reservable_window(
}
/**
- * alloc_new_reservation()--allocate a new reservation window
+ * alloc_new_reservation()--allocate a new reservation window
*
* To make a new reservation, we search part of the filesystem
* reservation list (the list that inside the group). We try to
@@ -859,12 +1096,12 @@ static int find_next_reservable_window(
*
* failed: we failed to find a reservation window in this group
*
- * @rsv: the reservation
+ * @my_rsv: the reservation window
*
- * @goal: The goal (group-relative). It is where the search for a
+ * @grp_goal: The goal (group-relative). It is where the search for a
* free reservable space should start from.
- * if we have a goal(goal >0 ), then start from there,
- * no goal(goal = -1), we start from the first block
+ * if we have a grp_goal(grp_goal >0 ), then start from there,
+ * no grp_goal(grp_goal = -1), we start from the first block
* of the group.
*
* @sb: the super block
@@ -873,26 +1110,26 @@ static int find_next_reservable_window(
*
*/
static int alloc_new_reservation(struct ext3_reserve_window_node *my_rsv,
- int goal, struct super_block *sb,
+ ext3_grpblk_t grp_goal, struct super_block *sb,
unsigned int group, struct buffer_head *bitmap_bh)
{
struct ext3_reserve_window_node *search_head;
- int group_first_block, group_end_block, start_block;
- int first_free_block;
+ ext3_fsblk_t group_first_block, group_end_block, start_block;
+ ext3_grpblk_t first_free_block;
struct rb_root *fs_rsv_root = &EXT3_SB(sb)->s_rsv_window_root;
unsigned long size;
int ret;
spinlock_t *rsv_lock = &EXT3_SB(sb)->s_rsv_window_lock;
- group_first_block = le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block) +
- group * EXT3_BLOCKS_PER_GROUP(sb);
- group_end_block = group_first_block + EXT3_BLOCKS_PER_GROUP(sb) - 1;
+ group_first_block = ext3_group_first_block_no(sb, group);
+ group_end_block = group_first_block + (EXT3_BLOCKS_PER_GROUP(sb) - 1);
- if (goal < 0)
+ if (grp_goal < 0)
start_block = group_first_block;
else
- start_block = goal + group_first_block;
+ start_block = grp_goal + group_first_block;
+ trace_ext3_alloc_new_reservation(sb, start_block);
size = my_rsv->rsv_goal_size;
if (!rsv_is_empty(&my_rsv->rsv_window)) {
@@ -918,9 +1155,10 @@ static int alloc_new_reservation(struct ext3_reserve_window_node *my_rsv,
if ((my_rsv->rsv_alloc_hit >
(my_rsv->rsv_end - my_rsv->rsv_start + 1) / 2)) {
/*
- * if we previously allocation hit ration is greater than half
- * we double the size of reservation window next time
- * otherwise keep the same
+ * if the previous allocation hit ratio is
+ * greater than 1/2, then we double the size of
+ * the reservation window the next time,
+ * otherwise we keep the same size window
*/
size = size * 2;
if (size > EXT3_MAX_RESERVE_BLOCKS)
@@ -986,8 +1224,11 @@ retry:
* check if the first free block is within the
* free space we just reserved
*/
- if (start_block >= my_rsv->rsv_start && start_block < my_rsv->rsv_end)
+ if (start_block >= my_rsv->rsv_start &&
+ start_block <= my_rsv->rsv_end) {
+ trace_ext3_reserved(sb, start_block, my_rsv);
return 0; /* success */
+ }
/*
* if the first free bit we found is out of the reservable space
* continue search for next reservable space,
@@ -999,7 +1240,59 @@ retry:
goto retry;
}
-/*
+/**
+ * try_to_extend_reservation()
+ * @my_rsv: given reservation window
+ * @sb: super block
+ * @size: the delta to extend
+ *
+ * Attempt to expand the reservation window large enough to have
+ * required number of free blocks
+ *
+ * Since ext3_try_to_allocate() will always allocate blocks within
+ * the reservation window range, if the window size is too small,
+ * multiple blocks allocation has to stop at the end of the reservation
+ * window. To make this more efficient, given the total number of
+ * blocks needed and the current size of the window, we try to
+ * expand the reservation window size if necessary on a best-effort
+ * basis before ext3_new_blocks() tries to allocate blocks,
+ */
+static void try_to_extend_reservation(struct ext3_reserve_window_node *my_rsv,
+ struct super_block *sb, int size)
+{
+ struct ext3_reserve_window_node *next_rsv;
+ struct rb_node *next;
+ spinlock_t *rsv_lock = &EXT3_SB(sb)->s_rsv_window_lock;
+
+ if (!spin_trylock(rsv_lock))
+ return;
+
+ next = rb_next(&my_rsv->rsv_node);
+
+ if (!next)
+ my_rsv->rsv_end += size;
+ else {
+ next_rsv = rb_entry(next, struct ext3_reserve_window_node, rsv_node);
+
+ if ((next_rsv->rsv_start - my_rsv->rsv_end - 1) >= size)
+ my_rsv->rsv_end += size;
+ else
+ my_rsv->rsv_end = next_rsv->rsv_start - 1;
+ }
+ spin_unlock(rsv_lock);
+}
+
+/**
+ * ext3_try_to_allocate_with_rsv()
+ * @sb: superblock
+ * @handle: handle to this transaction
+ * @group: given allocation block group
+ * @bitmap_bh: bufferhead holds the block bitmap
+ * @grp_goal: given target block within the group
+ * @my_rsv: reservation window
+ * @count: target number of blocks to allocate
+ * @errp: pointer to store the error code
+ *
* This is the main function used to allocate a new block and its reservation
* window.
*
@@ -1015,20 +1308,20 @@ retry:
* reservation), and there are lots of free blocks, but they are all
* being reserved.
*
- * We use a sorted double linked list for the per-filesystem reservation list.
- * The insert, remove and find a free space(non-reserved) operations for the
- * sorted double linked list should be fast.
+ * We use a red-black tree for the per-filesystem reservation list.
*
*/
-static int
+static ext3_grpblk_t
ext3_try_to_allocate_with_rsv(struct super_block *sb, handle_t *handle,
unsigned int group, struct buffer_head *bitmap_bh,
- int goal, struct ext3_reserve_window_node * my_rsv,
- int *errp)
+ ext3_grpblk_t grp_goal,
+ struct ext3_reserve_window_node * my_rsv,
+ unsigned long *count, int *errp)
{
- unsigned long group_first_block;
- int ret = 0;
+ ext3_fsblk_t group_first_block, group_last_block;
+ ext3_grpblk_t ret = 0;
int fatal;
+ unsigned long num = *count;
*errp = 0;
@@ -1051,17 +1344,18 @@ ext3_try_to_allocate_with_rsv(struct super_block *sb, handle_t *handle,
* or last attempt to allocate a block with reservation turned on failed
*/
if (my_rsv == NULL ) {
- ret = ext3_try_to_allocate(sb, handle, group, bitmap_bh, goal, NULL);
+ ret = ext3_try_to_allocate(sb, handle, group, bitmap_bh,
+ grp_goal, count, NULL);
goto out;
}
/*
- * goal is a group relative block number (if there is a goal)
- * 0 < goal < EXT3_BLOCKS_PER_GROUP(sb)
+ * grp_goal is a group relative block number (if there is a goal)
+ * 0 <= grp_goal < EXT3_BLOCKS_PER_GROUP(sb)
* first block is a filesystem wide block number
* first block is the block number of the first block in this group
*/
- group_first_block = le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block) +
- group * EXT3_BLOCKS_PER_GROUP(sb);
+ group_first_block = ext3_group_first_block_no(sb, group);
+ group_last_block = group_first_block + (EXT3_BLOCKS_PER_GROUP(sb) - 1);
/*
* Basically we will allocate a new block from inode's reservation
@@ -1080,24 +1374,40 @@ ext3_try_to_allocate_with_rsv(struct super_block *sb, handle_t *handle,
*/
while (1) {
if (rsv_is_empty(&my_rsv->rsv_window) || (ret < 0) ||
- !goal_in_my_reservation(&my_rsv->rsv_window, goal, group, sb)) {
- ret = alloc_new_reservation(my_rsv, goal, sb,
+ !goal_in_my_reservation(&my_rsv->rsv_window,
+ grp_goal, group, sb)) {
+ if (my_rsv->rsv_goal_size < *count)
+ my_rsv->rsv_goal_size = *count;
+ ret = alloc_new_reservation(my_rsv, grp_goal, sb,
group, bitmap_bh);
if (ret < 0)
break; /* failed */
- if (!goal_in_my_reservation(&my_rsv->rsv_window, goal, group, sb))
- goal = -1;
+ if (!goal_in_my_reservation(&my_rsv->rsv_window,
+ grp_goal, group, sb))
+ grp_goal = -1;
+ } else if (grp_goal >= 0) {
+ int curr = my_rsv->rsv_end -
+ (grp_goal + group_first_block) + 1;
+
+ if (curr < *count)
+ try_to_extend_reservation(my_rsv, sb,
+ *count - curr);
}
- if ((my_rsv->rsv_start >= group_first_block + EXT3_BLOCKS_PER_GROUP(sb))
- || (my_rsv->rsv_end < group_first_block))
+
+ if ((my_rsv->rsv_start > group_last_block) ||
+ (my_rsv->rsv_end < group_first_block)) {
+ rsv_window_dump(&EXT3_SB(sb)->s_rsv_window_root, 1);
BUG();
- ret = ext3_try_to_allocate(sb, handle, group, bitmap_bh, goal,
- &my_rsv->rsv_window);
+ }
+ ret = ext3_try_to_allocate(sb, handle, group, bitmap_bh,
+ grp_goal, &num, &my_rsv->rsv_window);
if (ret >= 0) {
- my_rsv->rsv_alloc_hit++;
+ my_rsv->rsv_alloc_hit += num;
+ *count = num;
break; /* succeed */
}
+ num = *count;
}
out:
if (ret >= 0) {
@@ -1116,29 +1426,42 @@ out:
return ret;
}
-static int ext3_has_free_blocks(struct ext3_sb_info *sbi)
+/**
+ * ext3_has_free_blocks()
+ * @sbi: in-core super block structure.
+ *
+ * Check if filesystem has at least 1 free block available for allocation.
+ */
+static int ext3_has_free_blocks(struct ext3_sb_info *sbi, int use_reservation)
{
- int free_blocks, root_blocks;
+ ext3_fsblk_t free_blocks, root_blocks;
free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter);
root_blocks = le32_to_cpu(sbi->s_es->s_r_blocks_count);
if (free_blocks < root_blocks + 1 && !capable(CAP_SYS_RESOURCE) &&
- sbi->s_resuid != current->fsuid &&
- (sbi->s_resgid == 0 || !in_group_p (sbi->s_resgid))) {
+ !use_reservation && !uid_eq(sbi->s_resuid, current_fsuid()) &&
+ (gid_eq(sbi->s_resgid, GLOBAL_ROOT_GID) ||
+ !in_group_p (sbi->s_resgid))) {
return 0;
}
return 1;
}
-/*
+/**
+ * ext3_should_retry_alloc()
+ * @sb: super block
+ * @retries: number of attempts that have been made
+ *
* ext3_should_retry_alloc() is called when ENOSPC is returned, and if
* it is profitable to retry the operation, this function will wait
- * for the current or commiting transaction to complete, and then
+ * for the current or committing transaction to complete, and then
* return TRUE.
+ *
+ * If the total number of retries exceeds three, return FALSE.
*/
int ext3_should_retry_alloc(struct super_block *sb, int *retries)
{
- if (!ext3_has_free_blocks(EXT3_SB(sb)) || (*retries)++ > 3)
+ if (!ext3_has_free_blocks(EXT3_SB(sb), 0) || (*retries)++ > 3)
return 0;
jbd_debug(1, "%s: retrying operation after ENOSPC\n", sb->s_id);
@@ -1146,27 +1469,34 @@ int ext3_should_retry_alloc(struct super_block *sb, int *retries)
return journal_force_commit_nested(EXT3_SB(sb)->s_journal);
}
-/*
- * ext3_new_block uses a goal block to assist allocation. If the goal is
- * free, or there is a free block within 32 blocks of the goal, that block
- * is allocated. Otherwise a forward search is made for a free block; within
- * each block group the search first looks for an entire free byte in the block
- * bitmap, and then for any free bit if that fails.
- * This function also updates quota and i_blocks field.
+/**
+ * ext3_new_blocks() -- core block(s) allocation function
+ * @handle: handle to this transaction
+ * @inode: file inode
+ * @goal: given target block(filesystem wide)
+ * @count: target number of blocks to allocate
+ * @errp: error code
+ *
+ * ext3_new_blocks uses a goal block to assist allocation. It tries to
+ * allocate block(s) from the block group that contains the goal block first. If that
+ * fails, it will try to allocate block(s) from other block groups without
+ * any specific goal block.
+ *
*/
-int ext3_new_block(handle_t *handle, struct inode *inode,
- unsigned long goal, int *errp)
+ext3_fsblk_t ext3_new_blocks(handle_t *handle, struct inode *inode,
+ ext3_fsblk_t goal, unsigned long *count, int *errp)
{
struct buffer_head *bitmap_bh = NULL;
struct buffer_head *gdp_bh;
int group_no;
int goal_group;
- int ret_block;
+ ext3_grpblk_t grp_target_blk; /* blockgroup relative goal block */
+ ext3_grpblk_t grp_alloc_blk; /* blockgroup-relative allocated block*/
+ ext3_fsblk_t ret_block; /* filesystem-wide allocated block */
int bgi; /* blockgroup iteration index */
- int target_block;
int fatal = 0, err;
int performed_allocation = 0;
- int free_blocks;
+ ext3_grpblk_t free_blocks; /* number of free blocks in a group */
struct super_block *sb;
struct ext3_group_desc *gdp;
struct ext3_super_block *es;
@@ -1178,24 +1508,24 @@ int ext3_new_block(handle_t *handle, struct inode *inode,
static int goal_hits, goal_attempts;
#endif
unsigned long ngroups;
+ unsigned long num = *count;
*errp = -ENOSPC;
sb = inode->i_sb;
- if (!sb) {
- printk("ext3_new_block: nonexistent device");
- return 0;
- }
/*
* Check quota for allocation of this block.
*/
- if (DQUOT_ALLOC_BLOCK(inode, 1)) {
- *errp = -EDQUOT;
+ err = dquot_alloc_block(inode, num);
+ if (err) {
+ *errp = err;
return 0;
}
+ trace_ext3_request_blocks(inode, goal, num);
+
sbi = EXT3_SB(sb);
- es = EXT3_SB(sb)->s_es;
+ es = sbi->s_es;
ext3_debug("goal=%lu.\n", goal);
/*
* Allocate a block from reservation only when
@@ -1209,7 +1539,7 @@ int ext3_new_block(handle_t *handle, struct inode *inode,
if (block_i && ((windowsz = block_i->rsv_window_node.rsv_goal_size) > 0))
my_rsv = &block_i->rsv_window_node;
- if (!ext3_has_free_blocks(sbi)) {
+ if (!ext3_has_free_blocks(sbi, IS_NOQUOTA(inode))) {
*errp = -ENOSPC;
goto out;
}
@@ -1222,32 +1552,34 @@ int ext3_new_block(handle_t *handle, struct inode *inode,
goal = le32_to_cpu(es->s_first_data_block);
group_no = (goal - le32_to_cpu(es->s_first_data_block)) /
EXT3_BLOCKS_PER_GROUP(sb);
+ goal_group = group_no;
+retry_alloc:
gdp = ext3_get_group_desc(sb, group_no, &gdp_bh);
if (!gdp)
goto io_error;
- goal_group = group_no;
-retry:
free_blocks = le16_to_cpu(gdp->bg_free_blocks_count);
/*
* if there is not enough free blocks to make a new resevation
* turn off reservation for this allocation
*/
if (my_rsv && (free_blocks < windowsz)
+ && (free_blocks > 0)
&& (rsv_is_empty(&my_rsv->rsv_window)))
my_rsv = NULL;
if (free_blocks > 0) {
- ret_block = ((goal - le32_to_cpu(es->s_first_data_block)) %
+ grp_target_blk = ((goal - le32_to_cpu(es->s_first_data_block)) %
EXT3_BLOCKS_PER_GROUP(sb));
bitmap_bh = read_block_bitmap(sb, group_no);
if (!bitmap_bh)
goto io_error;
- ret_block = ext3_try_to_allocate_with_rsv(sb, handle, group_no,
- bitmap_bh, ret_block, my_rsv, &fatal);
+ grp_alloc_blk = ext3_try_to_allocate_with_rsv(sb, handle,
+ group_no, bitmap_bh, grp_target_blk,
+ my_rsv, &num, &fatal);
if (fatal)
goto out;
- if (ret_block >= 0)
+ if (grp_alloc_blk >= 0)
goto allocated;
}
@@ -1255,49 +1587,58 @@ retry:
smp_rmb();
/*
- * Now search the rest of the groups. We assume that
- * i and gdp correctly point to the last group visited.
+ * Now search the rest of the groups. We assume that
+ * group_no and gdp correctly point to the last group visited.
*/
for (bgi = 0; bgi < ngroups; bgi++) {
group_no++;
if (group_no >= ngroups)
group_no = 0;
gdp = ext3_get_group_desc(sb, group_no, &gdp_bh);
- if (!gdp) {
- *errp = -EIO;
- goto out;
- }
+ if (!gdp)
+ goto io_error;
free_blocks = le16_to_cpu(gdp->bg_free_blocks_count);
/*
+ * skip this group (and avoid loading bitmap) if there
+ * are no free blocks
+ */
+ if (!free_blocks)
+ continue;
+ /*
* skip this group if the number of
* free blocks is less than half of the reservation
* window size.
*/
- if (free_blocks <= (windowsz/2))
+ if (my_rsv && (free_blocks <= (windowsz/2)))
continue;
brelse(bitmap_bh);
bitmap_bh = read_block_bitmap(sb, group_no);
if (!bitmap_bh)
goto io_error;
- ret_block = ext3_try_to_allocate_with_rsv(sb, handle, group_no,
- bitmap_bh, -1, my_rsv, &fatal);
+ /*
+ * try to allocate block(s) from this group, without a goal(-1).
+ */
+ grp_alloc_blk = ext3_try_to_allocate_with_rsv(sb, handle,
+ group_no, bitmap_bh, -1, my_rsv,
+ &num, &fatal);
if (fatal)
goto out;
- if (ret_block >= 0)
+ if (grp_alloc_blk >= 0)
goto allocated;
}
/*
- * We may end up a bogus ealier ENOSPC error due to
+ * We may end up a bogus earlier ENOSPC error due to
* filesystem is "full" of reservations, but
- * there maybe indeed free blocks avaliable on disk
+ * there maybe indeed free blocks available on disk
* In this case, we just forget about the reservations
* just do block allocation as without reservations.
*/
if (my_rsv) {
my_rsv = NULL;
+ windowsz = 0;
group_no = goal_group;
- goto retry;
+ goto retry_alloc;
}
/* No space left on the device */
*errp = -ENOSPC;
@@ -1313,16 +1654,24 @@ allocated:
if (fatal)
goto out;
- target_block = ret_block + group_no * EXT3_BLOCKS_PER_GROUP(sb)
- + le32_to_cpu(es->s_first_data_block);
+ ret_block = grp_alloc_blk + ext3_group_first_block_no(sb, group_no);
- if (target_block == le32_to_cpu(gdp->bg_block_bitmap) ||
- target_block == le32_to_cpu(gdp->bg_inode_bitmap) ||
- in_range(target_block, le32_to_cpu(gdp->bg_inode_table),
- EXT3_SB(sb)->s_itb_per_group))
+ if (in_range(le32_to_cpu(gdp->bg_block_bitmap), ret_block, num) ||
+ in_range(le32_to_cpu(gdp->bg_inode_bitmap), ret_block, num) ||
+ in_range(ret_block, le32_to_cpu(gdp->bg_inode_table),
+ EXT3_SB(sb)->s_itb_per_group) ||
+ in_range(ret_block + num - 1, le32_to_cpu(gdp->bg_inode_table),
+ EXT3_SB(sb)->s_itb_per_group)) {
ext3_error(sb, "ext3_new_block",
"Allocating block in system zone - "
- "block = %u", target_block);
+ "blocks from "E3FSBLK", length %lu",
+ ret_block, num);
+ /*
+ * claim_block() marked the blocks we allocated as in use. So we
+ * may want to selectively mark some of the blocks as free.
+ */
+ goto retry_alloc;
+ }
performed_allocation = 1;
@@ -1331,7 +1680,7 @@ allocated:
struct buffer_head *debug_bh;
/* Record bitmap buffer state in the newly allocated block */
- debug_bh = sb_find_get_block(sb, target_block);
+ debug_bh = sb_find_get_block(sb, ret_block);
if (debug_bh) {
BUFFER_TRACE(debug_bh, "state when allocated");
BUFFER_TRACE2(debug_bh, bitmap_bh, "bitmap state");
@@ -1341,23 +1690,24 @@ allocated:
jbd_lock_bh_state(bitmap_bh);
spin_lock(sb_bgl_lock(sbi, group_no));
if (buffer_jbd(bitmap_bh) && bh2jh(bitmap_bh)->b_committed_data) {
- if (ext3_test_bit(ret_block,
- bh2jh(bitmap_bh)->b_committed_data)) {
- printk("%s: block was unexpectedly set in "
- "b_committed_data\n", __FUNCTION__);
+ int i;
+
+ for (i = 0; i < num; i++) {
+ if (ext3_test_bit(grp_alloc_blk+i,
+ bh2jh(bitmap_bh)->b_committed_data)) {
+ printk("%s: block was unexpectedly set in "
+ "b_committed_data\n", __func__);
+ }
}
}
- ext3_debug("found bit %d\n", ret_block);
+ ext3_debug("found bit %d\n", grp_alloc_blk);
spin_unlock(sb_bgl_lock(sbi, group_no));
jbd_unlock_bh_state(bitmap_bh);
#endif
- /* ret_block was blockgroup-relative. Now it becomes fs-relative */
- ret_block = target_block;
-
- if (ret_block >= le32_to_cpu(es->s_blocks_count)) {
+ if (ret_block + num - 1 >= le32_to_cpu(es->s_blocks_count)) {
ext3_error(sb, "ext3_new_block",
- "block(%d) >= blocks count(%d) - "
+ "block("E3FSBLK") >= blocks count(%d) - "
"block_group = %d, es == %p ", ret_block,
le32_to_cpu(es->s_blocks_count), group_no, es);
goto out;
@@ -1368,26 +1718,30 @@ allocated:
* list of some description. We don't know in advance whether
* the caller wants to use it as metadata or data.
*/
- ext3_debug("allocating block %d. Goal hits %d of %d.\n",
+ ext3_debug("allocating block %lu. Goal hits %d of %d.\n",
ret_block, goal_hits, goal_attempts);
spin_lock(sb_bgl_lock(sbi, group_no));
- gdp->bg_free_blocks_count =
- cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) - 1);
+ le16_add_cpu(&gdp->bg_free_blocks_count, -num);
spin_unlock(sb_bgl_lock(sbi, group_no));
- percpu_counter_mod(&sbi->s_freeblocks_counter, -1);
+ percpu_counter_sub(&sbi->s_freeblocks_counter, num);
BUFFER_TRACE(gdp_bh, "journal_dirty_metadata for group descriptor");
- err = ext3_journal_dirty_metadata(handle, gdp_bh);
- if (!fatal)
- fatal = err;
-
- sb->s_dirt = 1;
+ fatal = ext3_journal_dirty_metadata(handle, gdp_bh);
if (fatal)
goto out;
*errp = 0;
brelse(bitmap_bh);
+
+ if (num < *count) {
+ dquot_free_block(inode, *count-num);
+ *count = num;
+ }
+
+ trace_ext3_allocate_blocks(inode, goal, num,
+ (unsigned long long)ret_block);
+
return ret_block;
io_error:
@@ -1401,20 +1755,35 @@ out:
* Undo the block allocation
*/
if (!performed_allocation)
- DQUOT_FREE_BLOCK(inode, 1);
+ dquot_free_block(inode, *count);
brelse(bitmap_bh);
return 0;
}
-unsigned long ext3_count_free_blocks(struct super_block *sb)
+ext3_fsblk_t ext3_new_block(handle_t *handle, struct inode *inode,
+ ext3_fsblk_t goal, int *errp)
{
- unsigned long desc_count;
+ unsigned long count = 1;
+
+ return ext3_new_blocks(handle, inode, goal, &count, errp);
+}
+
+/**
+ * ext3_count_free_blocks() -- count filesystem free blocks
+ * @sb: superblock
+ *
+ * Adds up the number of free blocks from each block group.
+ */
+ext3_fsblk_t ext3_count_free_blocks(struct super_block *sb)
+{
+ ext3_fsblk_t desc_count;
struct ext3_group_desc *gdp;
int i;
unsigned long ngroups = EXT3_SB(sb)->s_groups_count;
#ifdef EXT3FS_DEBUG
struct ext3_super_block *es;
- unsigned long bitmap_count, x;
+ ext3_fsblk_t bitmap_count;
+ unsigned long x;
struct buffer_head *bitmap_bh = NULL;
es = EXT3_SB(sb)->s_es;
@@ -1439,8 +1808,10 @@ unsigned long ext3_count_free_blocks(struct super_block *sb)
bitmap_count += x;
}
brelse(bitmap_bh);
- printk("ext3_count_free_blocks: stored = %u, computed = %lu, %lu\n",
- le32_to_cpu(es->s_free_blocks_count), desc_count, bitmap_count);
+ printk("ext3_count_free_blocks: stored = "E3FSBLK
+ ", computed = "E3FSBLK", "E3FSBLK"\n",
+ (ext3_fsblk_t)le32_to_cpu(es->s_free_blocks_count),
+ desc_count, bitmap_count);
return bitmap_count;
#else
desc_count = 0;
@@ -1456,14 +1827,6 @@ unsigned long ext3_count_free_blocks(struct super_block *sb)
#endif
}
-static inline int
-block_in_use(unsigned long block, struct super_block *sb, unsigned char *map)
-{
- return ext3_test_bit ((block -
- le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block)) %
- EXT3_BLOCKS_PER_GROUP(sb), map);
-}
-
static inline int test_root(int a, int b)
{
int num = b;
@@ -1493,12 +1856,29 @@ static int ext3_group_sparse(int group)
*/
int ext3_bg_has_super(struct super_block *sb, int group)
{
- if (EXT3_HAS_RO_COMPAT_FEATURE(sb,EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER)&&
- !ext3_group_sparse(group))
+ if (EXT3_HAS_RO_COMPAT_FEATURE(sb,
+ EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER) &&
+ !ext3_group_sparse(group))
return 0;
return 1;
}
+static unsigned long ext3_bg_num_gdb_meta(struct super_block *sb, int group)
+{
+ unsigned long metagroup = group / EXT3_DESC_PER_BLOCK(sb); /* index of the meta group containing 'group' */
+ unsigned long first = metagroup * EXT3_DESC_PER_BLOCK(sb); /* first group of that meta group */
+ unsigned long last = first + EXT3_DESC_PER_BLOCK(sb) - 1; /* last group of that meta group */
+
+ if (group == first || group == first + 1 || group == last) /* first, second or last group of the meta group */
+ return 1;
+ return 0; /* every other group reports no descriptor blocks */
+}
+
+static unsigned long ext3_bg_num_gdb_nometa(struct super_block *sb, int group)
+{
+ return ext3_bg_has_super(sb, group) ? EXT3_SB(sb)->s_gdb_count : 0; /* s_gdb_count when ext3_bg_has_super() is true for the group, otherwise 0 */
+}
+
/**
* ext3_bg_num_gdb - number of blocks used by the group table in group
* @sb: superblock for filesystem
@@ -1510,9 +1890,269 @@ int ext3_bg_has_super(struct super_block *sb, int group)
*/
unsigned long ext3_bg_num_gdb(struct super_block *sb, int group)
{
- if (EXT3_HAS_RO_COMPAT_FEATURE(sb,EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER)&&
- !ext3_group_sparse(group))
- return 0;
- return EXT3_SB(sb)->s_gdb_count;
+ unsigned long first_meta_bg =
+ le32_to_cpu(EXT3_SB(sb)->s_es->s_first_meta_bg);
+ unsigned long metagroup = group / EXT3_DESC_PER_BLOCK(sb);
+
+ if (!EXT3_HAS_INCOMPAT_FEATURE(sb,EXT3_FEATURE_INCOMPAT_META_BG) ||
+ metagroup < first_meta_bg)
+ return ext3_bg_num_gdb_nometa(sb,group);
+
+ return ext3_bg_num_gdb_meta(sb,group);
+
+}
+
+/**
+ * ext3_trim_all_free -- function to trim all free space in alloc. group
+ * @sb: super block for file system
+ * @group: allocation group to trim
+ * @start: first group block to examine
+ * @max: last group block to examine
+ * @minblocks: minimum extent block count
+ *
+ * ext3_trim_all_free walks through the group's block bitmap searching for
+ * free blocks. When a free block is found, it claims this block and the
+ * consecutive free blocks that follow it, to get the biggest free extent
+ * possible, until it reaches a block that cannot be claimed or passes @max.
+ * A TRIM command is then issued on the extent, unless the extent is shorter
+ * than @minblocks, and the extent is freed again in the block bitmap. This
+ * is repeated until the range is scanned, the running free-block count of
+ * the group drops below @minblocks, a discard fails, or a fatal signal is
+ * pending.
+ *
+ * The whole walk runs inside one journal handle started here; the block
+ * bitmap and the group descriptor are both marked dirty before returning.
+ *
+ * Returns the number of blocks handed to sb_issue_discard(), or the error
+ * code of the step that failed (for example -EIO when the bitmap or the
+ * group descriptor cannot be read, or -ERESTARTSYS on a fatal signal).
+ */
+static ext3_grpblk_t ext3_trim_all_free(struct super_block *sb,
+ unsigned int group,
+ ext3_grpblk_t start, ext3_grpblk_t max,
+ ext3_grpblk_t minblocks)
+{
+ handle_t *handle;
+ ext3_grpblk_t next, free_blocks, bit, freed, count = 0;
+ ext3_fsblk_t discard_block;
+ struct ext3_sb_info *sbi;
+ struct buffer_head *gdp_bh, *bitmap_bh = NULL;
+ struct ext3_group_desc *gdp;
+ int err = 0, ret = 0;
+
+ /*
+ * We will update one block bitmap, and one group descriptor,
+ * hence the two buffers requested for the handle.
+ */
+ handle = ext3_journal_start_sb(sb, 2);
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
+
+ bitmap_bh = read_block_bitmap(sb, group);
+ if (!bitmap_bh) {
+ err = -EIO;
+ goto err_out;
+ }
+
+ BUFFER_TRACE(bitmap_bh, "getting undo access");
+ err = ext3_journal_get_undo_access(handle, bitmap_bh);
+ if (err)
+ goto err_out;
+
+ gdp = ext3_get_group_desc(sb, group, &gdp_bh);
+ if (!gdp) {
+ err = -EIO;
+ goto err_out;
+ }
+
+ BUFFER_TRACE(gdp_bh, "get_write_access");
+ err = ext3_journal_get_write_access(handle, gdp_bh);
+ if (err)
+ goto err_out;
+
+ free_blocks = le16_to_cpu(gdp->bg_free_blocks_count);
+ sbi = EXT3_SB(sb);
+
+ /* Walk through the requested range of the group */
+ while (start <= max) {
+ start = bitmap_search_next_usable_block(start, bitmap_bh, max);
+ if (start < 0)
+ break;
+ next = start;
+
+ /*
+ * Allocate contiguous free extents by setting bits in the
+ * block bitmap; stop at the first block that cannot be
+ * claimed or once max has been passed.
+ */
+ while (next <= max
+ && claim_block(sb_bgl_lock(sbi, group),
+ next, bitmap_bh)) {
+ next++;
+ }
+
+ /* We did not claim any blocks */
+ if (next == start)
+ continue;
+
+ discard_block = (ext3_fsblk_t)start +
+ ext3_group_first_block_no(sb, group);
+
+ /* Update counters: the claimed extent is accounted as in use */
+ spin_lock(sb_bgl_lock(sbi, group));
+ le16_add_cpu(&gdp->bg_free_blocks_count, start - next);
+ spin_unlock(sb_bgl_lock(sbi, group));
+ percpu_counter_sub(&sbi->s_freeblocks_counter, next - start);
+
+ free_blocks -= next - start;
+ /* Do not issue a TRIM on extents smaller than minblocks */
+ if ((next - start) < minblocks)
+ goto free_extent;
+
+ trace_ext3_discard_blocks(sb, discard_block, next - start);
+ /*
+ * Send the TRIM command down to the device. The result in err
+ * is only examined after the extent has been released below.
+ */
+ err = sb_issue_discard(sb, discard_block, next - start,
+ GFP_NOFS, 0);
+ count += (next - start);
+free_extent:
+ freed = 0;
+
+ /*
+ * Clear bits in the bitmap, counting only the bits that were
+ * actually set so the counters below stay consistent.
+ */
+ for (bit = start; bit < next; bit++) {
+ BUFFER_TRACE(bitmap_bh, "clear bit");
+ if (!ext3_clear_bit_atomic(sb_bgl_lock(sbi, group),
+ bit, bitmap_bh->b_data)) {
+ ext3_error(sb, __func__,
+ "bit already cleared for block "E3FSBLK,
+ (unsigned long)bit);
+ BUFFER_TRACE(bitmap_bh, "bit already cleared");
+ } else {
+ freed++;
+ }
+ }
+
+ /* Update counters: the released blocks are free again */
+ spin_lock(sb_bgl_lock(sbi, group));
+ le16_add_cpu(&gdp->bg_free_blocks_count, freed);
+ spin_unlock(sb_bgl_lock(sbi, group));
+ percpu_counter_add(&sbi->s_freeblocks_counter, freed);
+
+ start = next;
+ if (err < 0) { /* discard failed: stop; -EOPNOTSUPP is returned without a warning */
+ if (err != -EOPNOTSUPP)
+ ext3_warning(sb, __func__, "Discard command "
+ "returned error %d\n", err);
+ break;
+ }
+
+ if (fatal_signal_pending(current)) {
+ err = -ERESTARTSYS;
+ break;
+ }
+
+ cond_resched();
+
+ /* No more suitable extents */
+ if (free_blocks < minblocks)
+ break;
+ }
+
+ /* We dirtied the bitmap block */
+ BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
+ ret = ext3_journal_dirty_metadata(handle, bitmap_bh);
+ if (!err) /* keep the first error seen */
+ err = ret;
+
+ /* And the group descriptor block */
+ BUFFER_TRACE(gdp_bh, "dirtied group descriptor block");
+ ret = ext3_journal_dirty_metadata(handle, gdp_bh);
+ if (!err)
+ err = ret;
+
+ ext3_debug("trimmed %d blocks in the group %d\n",
+ count, group);
+
+err_out:
+ if (err) /* an error replaces the block count as the return value */
+ count = err;
+ ext3_journal_stop(handle);
+ brelse(bitmap_bh);
+
+ return count;
}
+/**
+ * ext3_trim_fs() -- trim ioctl handle function
+ * @sb: superblock for filesystem
+ * @range: trim request; range->start is the first byte to trim, range->len
+ * the number of bytes to trim from start and range->minlen the
+ * minimum extent length in bytes. All three are converted to
+ * filesystem blocks before use.
+ *
+ * ext3_trim_fs goes through all allocation groups containing bytes from
+ * start to start+len. For each such group whose free block count is at
+ * least minlen, ext3_trim_all_free is invoked to trim all free space.
+ *
+ * Returns -EINVAL, leaving @range untouched, when minlen exceeds the number
+ * of blocks per group, when start lies at or beyond the end of the
+ * filesystem, or when len is smaller than one block. Otherwise range->len
+ * is overwritten with the number of bytes trimmed, and the return value is
+ * 0 or the negative error reported by ext3_trim_all_free.
+ */
+int ext3_trim_fs(struct super_block *sb, struct fstrim_range *range)
+{
+ ext3_grpblk_t last_block, first_block;
+ unsigned long group, first_group, last_group;
+ struct ext3_group_desc *gdp;
+ struct ext3_super_block *es = EXT3_SB(sb)->s_es;
+ uint64_t start, minlen, end, trimmed = 0;
+ ext3_fsblk_t first_data_blk =
+ le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block);
+ ext3_fsblk_t max_blks = le32_to_cpu(es->s_blocks_count);
+ int ret = 0;
+
+ start = range->start >> sb->s_blocksize_bits; /* bytes -> filesystem blocks */
+ end = start + (range->len >> sb->s_blocksize_bits) - 1; /* last block of the request, inclusive */
+ minlen = range->minlen >> sb->s_blocksize_bits;
+
+ if (minlen > EXT3_BLOCKS_PER_GROUP(sb) ||
+ start >= max_blks ||
+ range->len < sb->s_blocksize)
+ return -EINVAL;
+ if (end >= max_blks) /* clamp the request to the end of the filesystem */
+ end = max_blks - 1;
+ if (end <= first_data_blk) /* nothing to trim: report 0 bytes and succeed */
+ goto out;
+ if (start < first_data_blk)
+ start = first_data_blk;
+
+ smp_rmb();
+
+ /* Determine first and last group to examine based on start and len */
+ ext3_get_group_no_and_offset(sb, (ext3_fsblk_t) start,
+ &first_group, &first_block);
+ ext3_get_group_no_and_offset(sb, (ext3_fsblk_t) end,
+ &last_group, &last_block);
+
+ /* end now represents the last block to discard in this group */
+ end = EXT3_BLOCKS_PER_GROUP(sb) - 1;
+
+ for (group = first_group; group <= last_group; group++) {
+ gdp = ext3_get_group_desc(sb, group, NULL);
+ if (!gdp)
+ break; /* NOTE(review): ends the walk without setting an error code */
+
+ /*
+ * For all the groups except the last one, last block will
+ * always be EXT3_BLOCKS_PER_GROUP(sb)-1, so we only need to
+ * change it for the last group, note that last_block is
+ * already computed earlier by ext3_get_group_no_and_offset()
+ */
+ if (group == last_group)
+ end = last_block;
+
+ if (le16_to_cpu(gdp->bg_free_blocks_count) >= minlen) {
+ ret = ext3_trim_all_free(sb, group, first_block,
+ end, minlen);
+ if (ret < 0)
+ break;
+ trimmed += ret; /* ret is the number of blocks trimmed in this group */
+ }
+
+ /*
+ * For every group except the first one, we are sure
+ * that the first block to discard will be block #0.
+ */
+ first_block = 0;
+ }
+
+ if (ret > 0) /* a positive ret is a block count, not a status */
+ ret = 0;
+
+out:
+ range->len = trimmed * sb->s_blocksize; /* report the trimmed amount back in bytes */
+ return ret;
+}
diff --git a/fs/ext3/bitmap.c b/fs/ext3/bitmap.c
index cb16b4c5d5d..ef9c643e8e9 100644
--- a/fs/ext3/bitmap.c
+++ b/fs/ext3/bitmap.c
@@ -7,25 +7,13 @@
* Universite Pierre et Marie Curie (Paris VI)
*/
-#ifdef EXT3FS_DEBUG
-
-#include <linux/buffer_head.h>
-
-#include "ext3_fs.h"
+#include "ext3.h"
-static int nibblemap[] = {4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0};
+#ifdef EXT3FS_DEBUG
unsigned long ext3_count_free (struct buffer_head * map, unsigned int numchars)
{
- unsigned int i;
- unsigned long sum = 0;
-
- if (!map)
- return (0);
- for (i = 0; i < numchars; i++)
- sum += nibblemap[map->b_data[i] & 0xf] +
- nibblemap[(map->b_data[i] >> 4) & 0xf];
- return (sum);
+ return numchars * BITS_PER_BYTE - memweight(map->b_data, numchars);
}
#endif /* EXT3FS_DEBUG */
diff --git a/fs/ext3/dir.c b/fs/ext3/dir.c
index 832867aef3d..17742eed2c1 100644
--- a/fs/ext3/dir.c
+++ b/fs/ext3/dir.c
@@ -21,35 +21,14 @@
*
*/
-#include <linux/fs.h>
-#include <linux/jbd.h>
-#include <linux/ext3_fs.h>
-#include <linux/buffer_head.h>
-#include <linux/smp_lock.h>
-#include <linux/slab.h>
-#include <linux/rbtree.h>
+#include <linux/compat.h>
+#include "ext3.h"
static unsigned char ext3_filetype_table[] = {
DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
};
-static int ext3_readdir(struct file *, void *, filldir_t);
-static int ext3_dx_readdir(struct file * filp,
- void * dirent, filldir_t filldir);
-static int ext3_release_dir (struct inode * inode,
- struct file * filp);
-
-struct file_operations ext3_dir_operations = {
- .llseek = generic_file_llseek,
- .read = generic_read_dir,
- .readdir = ext3_readdir, /* we take BKL. needed?*/
- .ioctl = ext3_ioctl, /* BKL held */
- .fsync = ext3_sync_file, /* BKL held */
-#ifdef CONFIG_EXT3_INDEX
- .release = ext3_release_dir,
-#endif
-};
-
+static int ext3_dx_readdir(struct file *, struct dir_context *);
static unsigned char get_dtype(struct super_block *sb, int filetype)
{
@@ -59,7 +38,26 @@ static unsigned char get_dtype(struct super_block *sb, int filetype)
return (ext3_filetype_table[filetype]);
}
-
+
+/**
+ * Check if the given dir-inode refers to an htree-indexed directory
+ * (or a directory which could potentially get converted to use htree
+ * indexing).
+ *
+ * The filesystem must have the DIR_INDEX compat feature, and the inode
+ * must either carry EXT3_INDEX_FL or have a size that is exactly one
+ * block when shifted by the block size bits.
+ *
+ * Return 1 if it is a dx dir, 0 if not
+ */
+static int is_dx_dir(struct inode *inode)
+{
+ struct super_block *sb = inode->i_sb;
+
+ if (EXT3_HAS_COMPAT_FEATURE(inode->i_sb,
+ EXT3_FEATURE_COMPAT_DIR_INDEX) &&
+ ((EXT3_I(inode)->i_flags & EXT3_INDEX_FL) ||
+ ((inode->i_size >> sb->s_blocksize_bits) == 1)))
+ return 1;
+
+ return 0;
+}
int ext3_check_dir_entry (const char * function, struct inode * dir,
struct ext3_dir_entry_2 * de,
@@ -67,105 +65,97 @@ int ext3_check_dir_entry (const char * function, struct inode * dir,
unsigned long offset)
{
const char * error_msg = NULL;
- const int rlen = le16_to_cpu(de->rec_len);
+ const int rlen = ext3_rec_len_from_disk(de->rec_len);
- if (rlen < EXT3_DIR_REC_LEN(1))
+ if (unlikely(rlen < EXT3_DIR_REC_LEN(1)))
error_msg = "rec_len is smaller than minimal";
- else if (rlen % 4 != 0)
+ else if (unlikely(rlen % 4 != 0))
error_msg = "rec_len % 4 != 0";
- else if (rlen < EXT3_DIR_REC_LEN(de->name_len))
+ else if (unlikely(rlen < EXT3_DIR_REC_LEN(de->name_len)))
error_msg = "rec_len is too small for name_len";
- else if (((char *) de - bh->b_data) + rlen > dir->i_sb->s_blocksize)
+ else if (unlikely((((char *) de - bh->b_data) + rlen > dir->i_sb->s_blocksize)))
error_msg = "directory entry across blocks";
- else if (le32_to_cpu(de->inode) >
- le32_to_cpu(EXT3_SB(dir->i_sb)->s_es->s_inodes_count))
+ else if (unlikely(le32_to_cpu(de->inode) >
+ le32_to_cpu(EXT3_SB(dir->i_sb)->s_es->s_inodes_count)))
error_msg = "inode out of bounds";
- if (error_msg != NULL)
+ if (unlikely(error_msg != NULL))
ext3_error (dir->i_sb, function,
"bad entry in directory #%lu: %s - "
"offset=%lu, inode=%lu, rec_len=%d, name_len=%d",
dir->i_ino, error_msg, offset,
(unsigned long) le32_to_cpu(de->inode),
rlen, de->name_len);
+
return error_msg == NULL ? 1 : 0;
}
-static int ext3_readdir(struct file * filp,
- void * dirent, filldir_t filldir)
+static int ext3_readdir(struct file *file, struct dir_context *ctx)
{
- int error = 0;
- unsigned long offset, blk;
- int i, num, stored;
- struct buffer_head * bh, * tmp, * bha[16];
- struct ext3_dir_entry_2 * de;
- struct super_block * sb;
+ unsigned long offset;
+ int i;
+ struct ext3_dir_entry_2 *de;
int err;
- struct inode *inode = filp->f_dentry->d_inode;
- int ret = 0;
-
- sb = inode->i_sb;
-
-#ifdef CONFIG_EXT3_INDEX
- if (EXT3_HAS_COMPAT_FEATURE(inode->i_sb,
- EXT3_FEATURE_COMPAT_DIR_INDEX) &&
- ((EXT3_I(inode)->i_flags & EXT3_INDEX_FL) ||
- ((inode->i_size >> sb->s_blocksize_bits) == 1))) {
- err = ext3_dx_readdir(filp, dirent, filldir);
- if (err != ERR_BAD_DX_DIR) {
- ret = err;
- goto out;
- }
+ struct inode *inode = file_inode(file);
+ struct super_block *sb = inode->i_sb;
+ int dir_has_error = 0;
+
+ if (is_dx_dir(inode)) {
+ err = ext3_dx_readdir(file, ctx);
+ if (err != ERR_BAD_DX_DIR)
+ return err;
/*
* We don't set the inode dirty flag since it's not
* critical that it get flushed back to the disk.
*/
- EXT3_I(filp->f_dentry->d_inode)->i_flags &= ~EXT3_INDEX_FL;
+ EXT3_I(inode)->i_flags &= ~EXT3_INDEX_FL;
}
-#endif
- stored = 0;
- bh = NULL;
- offset = filp->f_pos & (sb->s_blocksize - 1);
-
- while (!error && !stored && filp->f_pos < inode->i_size) {
- blk = (filp->f_pos) >> EXT3_BLOCK_SIZE_BITS(sb);
- bh = ext3_bread(NULL, inode, blk, 0, &err);
- if (!bh) {
- ext3_error (sb, "ext3_readdir",
- "directory #%lu contains a hole at offset %lu",
- inode->i_ino, (unsigned long)filp->f_pos);
- filp->f_pos += sb->s_blocksize - offset;
- continue;
+ offset = ctx->pos & (sb->s_blocksize - 1);
+
+ while (ctx->pos < inode->i_size) {
+ unsigned long blk = ctx->pos >> EXT3_BLOCK_SIZE_BITS(sb);
+ struct buffer_head map_bh;
+ struct buffer_head *bh = NULL;
+
+ map_bh.b_state = 0;
+ err = ext3_get_blocks_handle(NULL, inode, blk, 1, &map_bh, 0);
+ if (err > 0) {
+ pgoff_t index = map_bh.b_blocknr >>
+ (PAGE_CACHE_SHIFT - inode->i_blkbits);
+ if (!ra_has_index(&file->f_ra, index))
+ page_cache_sync_readahead(
+ sb->s_bdev->bd_inode->i_mapping,
+ &file->f_ra, file,
+ index, 1);
+ file->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT;
+ bh = ext3_bread(NULL, inode, blk, 0, &err);
}
/*
- * Do the readahead
+ * We ignore I/O errors on directories so users have a chance
+ * of recovering data when there's a bad sector
*/
- if (!offset) {
- for (i = 16 >> (EXT3_BLOCK_SIZE_BITS(sb) - 9), num = 0;
- i > 0; i--) {
- tmp = ext3_getblk (NULL, inode, ++blk, 0, &err);
- if (tmp && !buffer_uptodate(tmp) &&
- !buffer_locked(tmp))
- bha[num++] = tmp;
- else
- brelse (tmp);
- }
- if (num) {
- ll_rw_block (READA, num, bha);
- for (i = 0; i < num; i++)
- brelse (bha[i]);
+ if (!bh) {
+ if (!dir_has_error) {
+ ext3_error(sb, __func__, "directory #%lu "
+ "contains a hole at offset %lld",
+ inode->i_ino, ctx->pos);
+ dir_has_error = 1;
}
+ /* corrupt size? Maybe no more blocks to read */
+ if (ctx->pos > inode->i_blocks << 9)
+ break;
+ ctx->pos += sb->s_blocksize - offset;
+ continue;
}
-revalidate:
/* If the dir block has changed since the last call to
* readdir(2), then we might be pointing to an invalid
* dirent right now. Scan from the start of the block
* to make sure. */
- if (filp->f_version != inode->i_version) {
+ if (offset && file->f_version != inode->i_version) {
for (i = 0; i < sb->s_blocksize && i < offset; ) {
- de = (struct ext3_dir_entry_2 *)
+ de = (struct ext3_dir_entry_2 *)
(bh->b_data + i);
/* It's too expensive to do a full
* dirent test each time round this
@@ -173,78 +163,130 @@ revalidate:
* least that it is non-zero. A
* failure will be detected in the
* dirent test below. */
- if (le16_to_cpu(de->rec_len) <
+ if (ext3_rec_len_from_disk(de->rec_len) <
EXT3_DIR_REC_LEN(1))
break;
- i += le16_to_cpu(de->rec_len);
+ i += ext3_rec_len_from_disk(de->rec_len);
}
offset = i;
- filp->f_pos = (filp->f_pos & ~(sb->s_blocksize - 1))
+ ctx->pos = (ctx->pos & ~(sb->s_blocksize - 1))
| offset;
- filp->f_version = inode->i_version;
+ file->f_version = inode->i_version;
}
- while (!error && filp->f_pos < inode->i_size
+ while (ctx->pos < inode->i_size
&& offset < sb->s_blocksize) {
de = (struct ext3_dir_entry_2 *) (bh->b_data + offset);
if (!ext3_check_dir_entry ("ext3_readdir", inode, de,
bh, offset)) {
- /* On error, skip the f_pos to the
+ /* On error, skip to the
next block. */
- filp->f_pos = (filp->f_pos |
+ ctx->pos = (ctx->pos |
(sb->s_blocksize - 1)) + 1;
- brelse (bh);
- ret = stored;
- goto out;
+ break;
}
- offset += le16_to_cpu(de->rec_len);
+ offset += ext3_rec_len_from_disk(de->rec_len);
if (le32_to_cpu(de->inode)) {
- /* We might block in the next section
- * if the data destination is
- * currently swapped out. So, use a
- * version stamp to detect whether or
- * not the directory has been modified
- * during the copy operation.
- */
- unsigned long version = filp->f_version;
-
- error = filldir(dirent, de->name,
- de->name_len,
- filp->f_pos,
- le32_to_cpu(de->inode),
- get_dtype(sb, de->file_type));
- if (error)
- break;
- if (version != filp->f_version)
- goto revalidate;
- stored ++;
+ if (!dir_emit(ctx, de->name, de->name_len,
+ le32_to_cpu(de->inode),
+ get_dtype(sb, de->file_type))) {
+ brelse(bh);
+ return 0;
+ }
}
- filp->f_pos += le16_to_cpu(de->rec_len);
+ ctx->pos += ext3_rec_len_from_disk(de->rec_len);
}
offset = 0;
brelse (bh);
+ if (ctx->pos < inode->i_size)
+ if (!dir_relax(inode))
+ return 0;
}
-out:
- return ret;
+ return 0;
+}
+
+static inline int is_32bit_api(void)
+{
+#ifdef CONFIG_COMPAT
+ return is_compat_task(); /* decided per calling task; presumably true for a 32-bit task on a 64-bit kernel - confirm */
+#else
+ return (BITS_PER_LONG == 32); /* without CONFIG_COMPAT the answer depends only on the kernel word size */
+#endif
}
-#ifdef CONFIG_EXT3_INDEX
/*
* These functions convert from the major/minor hash to an f_pos
- * value.
- *
- * Currently we only use major hash numer. This is unfortunate, but
- * on 32-bit machines, the same VFS interface is used for lseek and
- * llseek, so if we use the 64 bit offset, then the 32-bit versions of
- * lseek/telldir/seekdir will blow out spectacularly, and from within
- * the ext2 low-level routine, we don't know if we're being called by
- * a 64-bit version of the system call or the 32-bit version of the
- * system call. Worse yet, NFSv2 only allows for a 32-bit readdir
- * cookie. Sigh.
+ * value for dx directories
+ *
+ * Upper layer (for example NFS) should specify FMODE_32BITHASH or
+ * FMODE_64BITHASH explicitly. On the other hand, we allow ext3 to be mounted
+ * directly on both 32-bit and 64-bit nodes, under such case, neither
+ * FMODE_32BITHASH nor FMODE_64BITHASH is specified.
+ */
+static inline loff_t hash2pos(struct file *filp, __u32 major, __u32 minor)
+{
+ if ((filp->f_mode & FMODE_32BITHASH) ||
+ (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api()))
+ return major >> 1; /* 32-bit position: major hash only, lowest bit dropped, minor ignored */
+ else
+ return ((__u64)(major >> 1) << 32) | (__u64)minor; /* 64-bit position: shifted major in the upper half, minor in the lower half */
+}
+
+static inline __u32 pos2maj_hash(struct file *filp, loff_t pos)
+{
+ if ((filp->f_mode & FMODE_32BITHASH) ||
+ (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api()))
+ return (pos << 1) & 0xffffffff; /* 32-bit position: undo the right shift applied by hash2pos() */
+ else
+ return ((pos >> 32) << 1) & 0xffffffff; /* 64-bit position: major hash lives in the upper 32 bits */
+}
+
+static inline __u32 pos2min_hash(struct file *filp, loff_t pos)
+{
+ if ((filp->f_mode & FMODE_32BITHASH) ||
+ (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api()))
+ return 0; /* a 32-bit position carries no minor hash */
+ else
+ return pos & 0xffffffff; /* 64-bit position: minor hash is the lower 32 bits */
+}
+
+/*
+ * Return 32- or 64-bit end-of-file for dx directories
*/
-#define hash2pos(major, minor) (major >> 1)
-#define pos2maj_hash(pos) ((pos << 1) & 0xffffffff)
-#define pos2min_hash(pos) (0)
+static inline loff_t ext3_get_htree_eof(struct file *filp)
+{
+ if ((filp->f_mode & FMODE_32BITHASH) ||
+ (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api())) /* same 32-bit/64-bit selection as hash2pos() */
+ return EXT3_HTREE_EOF_32BIT;
+ else
+ return EXT3_HTREE_EOF_64BIT;
+}
+
+
+/*
+ * ext3_dir_llseek() calls generic_file_llseek[_size]() to handle both
+ * non-htree and htree directories. For htree directories the "offset" is
+ * in terms of the filename hash value instead of the byte offset.
+ *
+ * Because we may return a 64-bit hash that is well beyond s_maxbytes,
+ * we need to pass the max hash as the maximum allowable offset in
+ * the htree directory case. The value from ext3_get_htree_eof() is
+ * passed for both trailing arguments of generic_file_llseek_size().
+ *
+ * @file: open directory being seeked
+ * @offset: requested offset, interpreted according to @whence
+ * @whence: seek origin, passed through unchanged
+ *
+ * Returns whatever the generic llseek helper that was called returns.
+ *
+ * NOTE: offsets obtained *before* ext3_set_inode_flag(dir, EXT3_INODE_INDEX)
+ * will be invalid once the directory was converted into a dx directory
+ */
+static loff_t ext3_dir_llseek(struct file *file, loff_t offset, int whence)
+{
+ struct inode *inode = file->f_mapping->host;
+ int dx_dir = is_dx_dir(inode);
+ loff_t htree_max = ext3_get_htree_eof(file);
+
+ if (likely(dx_dir))
+ return generic_file_llseek_size(file, offset, whence,
+ htree_max, htree_max);
+ else
+ return generic_file_llseek(file, offset, whence);
+}
/*
* This structure holds the nodes of the red-black tree used to store
@@ -253,7 +295,7 @@ out:
struct fname {
__u32 hash;
__u32 minor_hash;
- struct rb_node rb_hash;
+ struct rb_node rb_hash;
struct fname *next;
__u32 inode;
__u8 name_len;
@@ -267,59 +309,28 @@ struct fname {
*/
static void free_rb_tree_fname(struct rb_root *root)
{
- struct rb_node *n = root->rb_node;
- struct rb_node *parent;
- struct fname *fname;
-
- while (n) {
- /* Do the node's children first */
- if ((n)->rb_left) {
- n = n->rb_left;
- continue;
- }
- if (n->rb_right) {
- n = n->rb_right;
- continue;
- }
- /*
- * The node has no children; free it, and then zero
- * out parent's link to it. Finally go to the
- * beginning of the loop and try to free the parent
- * node.
- */
- parent = n->rb_parent;
- fname = rb_entry(n, struct fname, rb_hash);
- while (fname) {
- struct fname * old = fname;
+ struct fname *fname, *next;
+
+ rbtree_postorder_for_each_entry_safe(fname, next, root, rb_hash)
+ do {
+ struct fname *old = fname;
fname = fname->next;
- kfree (old);
- }
- if (!parent)
- root->rb_node = NULL;
- else if (parent->rb_left == n)
- parent->rb_left = NULL;
- else if (parent->rb_right == n)
- parent->rb_right = NULL;
- n = parent;
- }
- root->rb_node = NULL;
-}
+ kfree(old);
+ } while (fname);
+ *root = RB_ROOT;
+}
-static struct dir_private_info *create_dir_info(loff_t pos)
+static struct dir_private_info *ext3_htree_create_dir_info(struct file *filp,
+ loff_t pos)
{
struct dir_private_info *p;
- p = kmalloc(sizeof(struct dir_private_info), GFP_KERNEL);
+ p = kzalloc(sizeof(struct dir_private_info), GFP_KERNEL);
if (!p)
return NULL;
- p->root.rb_node = NULL;
- p->curr_node = NULL;
- p->extra_fname = NULL;
- p->last_pos = 0;
- p->curr_hash = pos2maj_hash(pos);
- p->curr_minor_hash = pos2min_hash(pos);
- p->next_hash = 0;
+ p->curr_hash = pos2maj_hash(filp, pos);
+ p->curr_minor_hash = pos2min_hash(filp, pos);
return p;
}
@@ -346,10 +357,9 @@ int ext3_htree_store_dirent(struct file *dir_file, __u32 hash,
/* Create and allocate the fname structure */
len = sizeof(struct fname) + dirent->name_len + 1;
- new_fn = kmalloc(len, GFP_KERNEL);
+ new_fn = kzalloc(len, GFP_KERNEL);
if (!new_fn)
return -ENOMEM;
- memset(new_fn, 0, len);
new_fn->hash = hash;
new_fn->minor_hash = minor_hash;
new_fn->inode = le32_to_cpu(dirent->inode);
@@ -395,93 +405,86 @@ int ext3_htree_store_dirent(struct file *dir_file, __u32 hash,
* for all entres on the fname linked list. (Normally there is only
* one entry on the linked list, unless there are 62 bit hash collisions.)
*/
-static int call_filldir(struct file * filp, void * dirent,
- filldir_t filldir, struct fname *fname)
+static bool call_filldir(struct file *file, struct dir_context *ctx,
+ struct fname *fname)
{
- struct dir_private_info *info = filp->private_data;
- loff_t curr_pos;
- struct inode *inode = filp->f_dentry->d_inode;
- struct super_block * sb;
- int error;
-
- sb = inode->i_sb;
+ struct dir_private_info *info = file->private_data;
+ struct inode *inode = file_inode(file);
+ struct super_block *sb = inode->i_sb;
if (!fname) {
printk("call_filldir: called with null fname?!?\n");
- return 0;
+ return true;
}
- curr_pos = hash2pos(fname->hash, fname->minor_hash);
+ ctx->pos = hash2pos(file, fname->hash, fname->minor_hash);
while (fname) {
- error = filldir(dirent, fname->name,
- fname->name_len, curr_pos,
+ if (!dir_emit(ctx, fname->name, fname->name_len,
fname->inode,
- get_dtype(sb, fname->file_type));
- if (error) {
- filp->f_pos = curr_pos;
- info->extra_fname = fname->next;
- return error;
+ get_dtype(sb, fname->file_type))) {
+ info->extra_fname = fname;
+ return false;
}
fname = fname->next;
}
- return 0;
+ return true;
}
-static int ext3_dx_readdir(struct file * filp,
- void * dirent, filldir_t filldir)
+static int ext3_dx_readdir(struct file *file, struct dir_context *ctx)
{
- struct dir_private_info *info = filp->private_data;
- struct inode *inode = filp->f_dentry->d_inode;
+ struct dir_private_info *info = file->private_data;
+ struct inode *inode = file_inode(file);
struct fname *fname;
int ret;
if (!info) {
- info = create_dir_info(filp->f_pos);
+ info = ext3_htree_create_dir_info(file, ctx->pos);
if (!info)
return -ENOMEM;
- filp->private_data = info;
+ file->private_data = info;
}
- if (filp->f_pos == EXT3_HTREE_EOF)
+ if (ctx->pos == ext3_get_htree_eof(file))
return 0; /* EOF */
/* Some one has messed with f_pos; reset the world */
- if (info->last_pos != filp->f_pos) {
+ if (info->last_pos != ctx->pos) {
free_rb_tree_fname(&info->root);
info->curr_node = NULL;
info->extra_fname = NULL;
- info->curr_hash = pos2maj_hash(filp->f_pos);
- info->curr_minor_hash = pos2min_hash(filp->f_pos);
+ info->curr_hash = pos2maj_hash(file, ctx->pos);
+ info->curr_minor_hash = pos2min_hash(file, ctx->pos);
}
/*
* If there are any leftover names on the hash collision
* chain, return them first.
*/
- if (info->extra_fname &&
- call_filldir(filp, dirent, filldir, info->extra_fname))
- goto finished;
-
- if (!info->curr_node)
+ if (info->extra_fname) {
+ if (!call_filldir(file, ctx, info->extra_fname))
+ goto finished;
+ info->extra_fname = NULL;
+ goto next_node;
+ } else if (!info->curr_node)
info->curr_node = rb_first(&info->root);
while (1) {
/*
* Fill the rbtree if we have no more entries,
* or the inode has changed since we last read in the
- * cached entries.
+ * cached entries.
*/
if ((!info->curr_node) ||
- (filp->f_version != inode->i_version)) {
+ (file->f_version != inode->i_version)) {
info->curr_node = NULL;
free_rb_tree_fname(&info->root);
- filp->f_version = inode->i_version;
- ret = ext3_htree_fill_tree(filp, info->curr_hash,
+ file->f_version = inode->i_version;
+ ret = ext3_htree_fill_tree(file, info->curr_hash,
info->curr_minor_hash,
&info->next_hash);
if (ret < 0)
return ret;
if (ret == 0) {
- filp->f_pos = EXT3_HTREE_EOF;
+ ctx->pos = ext3_get_htree_eof(file);
break;
}
info->curr_node = rb_first(&info->root);
@@ -490,13 +493,18 @@ static int ext3_dx_readdir(struct file * filp,
fname = rb_entry(info->curr_node, struct fname, rb_hash);
info->curr_hash = fname->hash;
info->curr_minor_hash = fname->minor_hash;
- if (call_filldir(filp, dirent, filldir, fname))
+ if (!call_filldir(file, ctx, fname))
break;
-
+ next_node:
info->curr_node = rb_next(info->curr_node);
- if (!info->curr_node) {
+ if (info->curr_node) {
+ fname = rb_entry(info->curr_node, struct fname,
+ rb_hash);
+ info->curr_hash = fname->hash;
+ info->curr_minor_hash = fname->minor_hash;
+ } else {
if (info->next_hash == ~0) {
- filp->f_pos = EXT3_HTREE_EOF;
+ ctx->pos = ext3_get_htree_eof(file);
break;
}
info->curr_hash = info->next_hash;
@@ -504,7 +512,7 @@ static int ext3_dx_readdir(struct file * filp,
}
}
finished:
- info->last_pos = filp->f_pos;
+ info->last_pos = ctx->pos;
return 0;
}
@@ -516,4 +524,14 @@ static int ext3_release_dir (struct inode * inode, struct file * filp)
return 0;
}
+const struct file_operations ext3_dir_operations = {
+ .llseek = ext3_dir_llseek,
+ .read = generic_read_dir,
+ .iterate = ext3_readdir,
+ .unlocked_ioctl = ext3_ioctl,
+#ifdef CONFIG_COMPAT
+ .compat_ioctl = ext3_compat_ioctl,
#endif
+ .fsync = ext3_sync_file,
+ .release = ext3_release_dir,
+};
diff --git a/fs/ext3/ext3.h b/fs/ext3/ext3.h
new file mode 100644
index 00000000000..e85ff15a060
--- /dev/null
+++ b/fs/ext3/ext3.h
@@ -0,0 +1,1326 @@
+/*
+ * Written by Stephen C. Tweedie <sct@redhat.com>, 1999
+ *
+ * Copyright 1998--1999 Red Hat corp --- All Rights Reserved
+ *
+ * This file is part of the Linux kernel and is made available under
+ * the terms of the GNU General Public License, version 2, or at your
+ * option, any later version, incorporated herein by reference.
+ *
+ * Copyright (C) 1992, 1993, 1994, 1995
+ * Remy Card (card@masi.ibp.fr)
+ * Laboratoire MASI - Institut Blaise Pascal
+ * Universite Pierre et Marie Curie (Paris VI)
+ *
+ * from
+ *
+ * linux/include/linux/minix_fs.h
+ *
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ */
+
+#include <linux/fs.h>
+#include <linux/jbd.h>
+#include <linux/magic.h>
+#include <linux/bug.h>
+#include <linux/blockgroup_lock.h>
+
+/*
+ * The second extended filesystem constants/structures
+ */
+
+/*
+ * Define EXT3FS_DEBUG to produce debug messages
+ */
+#undef EXT3FS_DEBUG
+
+/*
+ * Define EXT3_RESERVATION to reserve data blocks for expanding files
+ */
+#define EXT3_DEFAULT_RESERVE_BLOCKS 8
+/*max window size: 1024(direct blocks) + 3([t,d]indirect blocks) */
+#define EXT3_MAX_RESERVE_BLOCKS 1027
+#define EXT3_RESERVE_WINDOW_NOT_ALLOCATED 0
+
+/*
+ * Debug code
+ */
+#ifdef EXT3FS_DEBUG
+#define ext3_debug(f, a...) \
+ do { \
+ printk (KERN_DEBUG "EXT3-fs DEBUG (%s, %d): %s:", \
+ __FILE__, __LINE__, __func__); \
+ printk (KERN_DEBUG f, ## a); \
+ } while (0)
+#else
+#define ext3_debug(f, a...) do {} while (0)
+#endif
+
+/*
+ * Special inodes numbers
+ */
+#define EXT3_BAD_INO 1 /* Bad blocks inode */
+#define EXT3_ROOT_INO 2 /* Root inode */
+#define EXT3_BOOT_LOADER_INO 5 /* Boot loader inode */
+#define EXT3_UNDEL_DIR_INO 6 /* Undelete directory inode */
+#define EXT3_RESIZE_INO 7 /* Reserved group descriptors inode */
+#define EXT3_JOURNAL_INO 8 /* Journal inode */
+
+/* First non-reserved inode for old ext3 filesystems */
+#define EXT3_GOOD_OLD_FIRST_INO 11
+
+/*
+ * Maximal count of links to a file
+ */
+#define EXT3_LINK_MAX 32000
+
+/*
+ * Macro-instructions used to manage several block sizes
+ */
+#define EXT3_MIN_BLOCK_SIZE 1024
+#define EXT3_MAX_BLOCK_SIZE 65536
+#define EXT3_MIN_BLOCK_LOG_SIZE 10
+#define EXT3_BLOCK_SIZE(s) ((s)->s_blocksize)
+#define EXT3_ADDR_PER_BLOCK(s) (EXT3_BLOCK_SIZE(s) / sizeof (__u32))
+#define EXT3_BLOCK_SIZE_BITS(s) ((s)->s_blocksize_bits)
+#define EXT3_ADDR_PER_BLOCK_BITS(s) (EXT3_SB(s)->s_addr_per_block_bits)
+#define EXT3_INODE_SIZE(s) (EXT3_SB(s)->s_inode_size)
+#define EXT3_FIRST_INO(s) (EXT3_SB(s)->s_first_ino)
+
+/*
+ * Macro-instructions used to manage fragments
+ */
+#define EXT3_MIN_FRAG_SIZE 1024
+#define EXT3_MAX_FRAG_SIZE 4096
+#define EXT3_MIN_FRAG_LOG_SIZE 10
+#define EXT3_FRAG_SIZE(s) (EXT3_SB(s)->s_frag_size)
+#define EXT3_FRAGS_PER_BLOCK(s) (EXT3_SB(s)->s_frags_per_block)
+
+/*
+ * Structure of a blocks group descriptor
+ */
+struct ext3_group_desc
+{
+ __le32 bg_block_bitmap; /* Blocks bitmap block */
+ __le32 bg_inode_bitmap; /* Inodes bitmap block */
+ __le32 bg_inode_table; /* Inodes table block */
+ __le16 bg_free_blocks_count; /* Free blocks count */
+ __le16 bg_free_inodes_count; /* Free inodes count */
+ __le16 bg_used_dirs_count; /* Directories count */
+ __u16 bg_pad;
+ __le32 bg_reserved[3];
+};
+
+/*
+ * Macro-instructions used to manage group descriptors
+ */
+#define EXT3_BLOCKS_PER_GROUP(s) (EXT3_SB(s)->s_blocks_per_group)
+#define EXT3_DESC_PER_BLOCK(s) (EXT3_SB(s)->s_desc_per_block)
+#define EXT3_INODES_PER_GROUP(s) (EXT3_SB(s)->s_inodes_per_group)
+#define EXT3_DESC_PER_BLOCK_BITS(s) (EXT3_SB(s)->s_desc_per_block_bits)
+
+/*
+ * Constants relative to the data blocks
+ */
+#define EXT3_NDIR_BLOCKS 12
+#define EXT3_IND_BLOCK EXT3_NDIR_BLOCKS
+#define EXT3_DIND_BLOCK (EXT3_IND_BLOCK + 1)
+#define EXT3_TIND_BLOCK (EXT3_DIND_BLOCK + 1)
+#define EXT3_N_BLOCKS (EXT3_TIND_BLOCK + 1)
+
+/*
+ * Inode flags
+ */
+#define EXT3_SECRM_FL 0x00000001 /* Secure deletion */
+#define EXT3_UNRM_FL 0x00000002 /* Undelete */
+#define EXT3_COMPR_FL 0x00000004 /* Compress file */
+#define EXT3_SYNC_FL 0x00000008 /* Synchronous updates */
+#define EXT3_IMMUTABLE_FL 0x00000010 /* Immutable file */
+#define EXT3_APPEND_FL 0x00000020 /* writes to file may only append */
+#define EXT3_NODUMP_FL 0x00000040 /* do not dump file */
+#define EXT3_NOATIME_FL 0x00000080 /* do not update atime */
+/* Reserved for compression usage... */
+#define EXT3_DIRTY_FL 0x00000100
+#define EXT3_COMPRBLK_FL 0x00000200 /* One or more compressed clusters */
+#define EXT3_NOCOMPR_FL 0x00000400 /* Don't compress */
+#define EXT3_ECOMPR_FL 0x00000800 /* Compression error */
+/* End compression flags --- maybe not all used */
+#define EXT3_INDEX_FL 0x00001000 /* hash-indexed directory */
+#define EXT3_IMAGIC_FL 0x00002000 /* AFS directory */
+#define EXT3_JOURNAL_DATA_FL 0x00004000 /* file data should be journaled */
+#define EXT3_NOTAIL_FL 0x00008000 /* file tail should not be merged */
+#define EXT3_DIRSYNC_FL 0x00010000 /* dirsync behaviour (directories only) */
+#define EXT3_TOPDIR_FL 0x00020000 /* Top of directory hierarchies*/
+#define EXT3_RESERVED_FL 0x80000000 /* reserved for ext3 lib */
+
+#define EXT3_FL_USER_VISIBLE 0x0003DFFF /* User visible flags */
+#define EXT3_FL_USER_MODIFIABLE 0x000380FF /* User modifiable flags */
+
+/* Flags that should be inherited by new inodes from their parent. */
+#define EXT3_FL_INHERITED (EXT3_SECRM_FL | EXT3_UNRM_FL | EXT3_COMPR_FL |\
+ EXT3_SYNC_FL | EXT3_NODUMP_FL |\
+ EXT3_NOATIME_FL | EXT3_COMPRBLK_FL |\
+ EXT3_NOCOMPR_FL | EXT3_JOURNAL_DATA_FL |\
+ EXT3_NOTAIL_FL | EXT3_DIRSYNC_FL)
+
+/* Flags that are appropriate for regular files (all but dir-specific ones). */
+#define EXT3_REG_FLMASK (~(EXT3_DIRSYNC_FL | EXT3_TOPDIR_FL))
+
+/* Flags that are appropriate for non-directories/regular files. */
+#define EXT3_OTHER_FLMASK (EXT3_NODUMP_FL | EXT3_NOATIME_FL)
+
+/* Mask out flags that are inappropriate for the given type of inode. */
+static inline __u32 ext3_mask_flags(umode_t mode, __u32 flags)
+{
+ if (S_ISDIR(mode))
+ return flags;
+ else if (S_ISREG(mode))
+ return flags & EXT3_REG_FLMASK;
+ else
+ return flags & EXT3_OTHER_FLMASK;
+}
+
+/* Used to pass group descriptor data when online resize is done */
+struct ext3_new_group_input {
+ __u32 group; /* Group number for this data */
+ __u32 block_bitmap; /* Absolute block number of block bitmap */
+ __u32 inode_bitmap; /* Absolute block number of inode bitmap */
+ __u32 inode_table; /* Absolute block number of inode table start */
+ __u32 blocks_count; /* Total number of blocks in this group */
+ __u16 reserved_blocks; /* Number of reserved blocks in this group */
+ __u16 unused;
+};
+
+/* The struct ext3_new_group_input in kernel space, with free_blocks_count */
+struct ext3_new_group_data {
+ __u32 group;
+ __u32 block_bitmap;
+ __u32 inode_bitmap;
+ __u32 inode_table;
+ __u32 blocks_count;
+ __u16 reserved_blocks;
+ __u16 unused;
+ __u32 free_blocks_count;
+};
+
+
+/*
+ * ioctl commands
+ */
+#define EXT3_IOC_GETFLAGS FS_IOC_GETFLAGS
+#define EXT3_IOC_SETFLAGS FS_IOC_SETFLAGS
+#define EXT3_IOC_GETVERSION _IOR('f', 3, long)
+#define EXT3_IOC_SETVERSION _IOW('f', 4, long)
+#define EXT3_IOC_GROUP_EXTEND _IOW('f', 7, unsigned long)
+#define EXT3_IOC_GROUP_ADD _IOW('f', 8,struct ext3_new_group_input)
+#define EXT3_IOC_GETVERSION_OLD FS_IOC_GETVERSION
+#define EXT3_IOC_SETVERSION_OLD FS_IOC_SETVERSION
+#ifdef CONFIG_JBD_DEBUG
+#define EXT3_IOC_WAIT_FOR_READONLY _IOR('f', 99, long)
+#endif
+#define EXT3_IOC_GETRSVSZ _IOR('f', 5, long)
+#define EXT3_IOC_SETRSVSZ _IOW('f', 6, long)
+
+/*
+ * ioctl commands in 32 bit emulation
+ */
+#define EXT3_IOC32_GETFLAGS FS_IOC32_GETFLAGS
+#define EXT3_IOC32_SETFLAGS FS_IOC32_SETFLAGS
+#define EXT3_IOC32_GETVERSION _IOR('f', 3, int)
+#define EXT3_IOC32_SETVERSION _IOW('f', 4, int)
+#define EXT3_IOC32_GETRSVSZ _IOR('f', 5, int)
+#define EXT3_IOC32_SETRSVSZ _IOW('f', 6, int)
+#define EXT3_IOC32_GROUP_EXTEND _IOW('f', 7, unsigned int)
+#ifdef CONFIG_JBD_DEBUG
+#define EXT3_IOC32_WAIT_FOR_READONLY _IOR('f', 99, int)
+#endif
+#define EXT3_IOC32_GETVERSION_OLD FS_IOC32_GETVERSION
+#define EXT3_IOC32_SETVERSION_OLD FS_IOC32_SETVERSION
+
+
+/*
+ * Mount options
+ */
+struct ext3_mount_options {
+ unsigned long s_mount_opt;
+ kuid_t s_resuid;
+ kgid_t s_resgid;
+ unsigned long s_commit_interval;
+#ifdef CONFIG_QUOTA
+ int s_jquota_fmt;
+ char *s_qf_names[MAXQUOTAS];
+#endif
+};
+
+/*
+ * Structure of an inode on the disk
+ */
+struct ext3_inode {
+ __le16 i_mode; /* File mode */
+ __le16 i_uid; /* Low 16 bits of Owner Uid */
+ __le32 i_size; /* Size in bytes */
+ __le32 i_atime; /* Access time */
+ __le32 i_ctime; /* Creation time */
+ __le32 i_mtime; /* Modification time */
+ __le32 i_dtime; /* Deletion Time */
+ __le16 i_gid; /* Low 16 bits of Group Id */
+ __le16 i_links_count; /* Links count */
+ __le32 i_blocks; /* Blocks count */
+ __le32 i_flags; /* File flags */
+ union {
+ struct {
+ __u32 l_i_reserved1;
+ } linux1;
+ struct {
+ __u32 h_i_translator;
+ } hurd1;
+ struct {
+ __u32 m_i_reserved1;
+ } masix1;
+ } osd1; /* OS dependent 1 */
+ __le32 i_block[EXT3_N_BLOCKS];/* Pointers to blocks */
+ __le32 i_generation; /* File version (for NFS) */
+ __le32 i_file_acl; /* File ACL */
+ __le32 i_dir_acl; /* Directory ACL */
+ __le32 i_faddr; /* Fragment address */
+ union {
+ struct {
+ __u8 l_i_frag; /* Fragment number */
+ __u8 l_i_fsize; /* Fragment size */
+ __u16 i_pad1;
+ __le16 l_i_uid_high; /* these 2 fields */
+ __le16 l_i_gid_high; /* were reserved2[0] */
+ __u32 l_i_reserved2;
+ } linux2;
+ struct {
+ __u8 h_i_frag; /* Fragment number */
+ __u8 h_i_fsize; /* Fragment size */
+ __u16 h_i_mode_high;
+ __u16 h_i_uid_high;
+ __u16 h_i_gid_high;
+ __u32 h_i_author;
+ } hurd2;
+ struct {
+ __u8 m_i_frag; /* Fragment number */
+ __u8 m_i_fsize; /* Fragment size */
+ __u16 m_pad1;
+ __u32 m_i_reserved2[2];
+ } masix2;
+ } osd2; /* OS dependent 2 */
+ __le16 i_extra_isize;
+ __le16 i_pad1;
+};
+
+#define i_size_high i_dir_acl
+
+#define i_reserved1 osd1.linux1.l_i_reserved1
+#define i_frag osd2.linux2.l_i_frag
+#define i_fsize osd2.linux2.l_i_fsize
+#define i_uid_low i_uid
+#define i_gid_low i_gid
+#define i_uid_high osd2.linux2.l_i_uid_high
+#define i_gid_high osd2.linux2.l_i_gid_high
+#define i_reserved2 osd2.linux2.l_i_reserved2
+
+/*
+ * File system states
+ */
+#define EXT3_VALID_FS 0x0001 /* Unmounted cleanly */
+#define EXT3_ERROR_FS 0x0002 /* Errors detected */
+#define EXT3_ORPHAN_FS 0x0004 /* Orphans being recovered */
+
+/*
+ * Misc. filesystem flags
+ */
+#define EXT2_FLAGS_SIGNED_HASH 0x0001 /* Signed dirhash in use */
+#define EXT2_FLAGS_UNSIGNED_HASH 0x0002 /* Unsigned dirhash in use */
+#define EXT2_FLAGS_TEST_FILESYS 0x0004 /* to test development code */
+
+/*
+ * Mount flags
+ */
+#define EXT3_MOUNT_CHECK 0x00001 /* Do mount-time checks */
+/* EXT3_MOUNT_OLDALLOC was there */
+#define EXT3_MOUNT_GRPID 0x00004 /* Create files with directory's group */
+#define EXT3_MOUNT_DEBUG 0x00008 /* Some debugging messages */
+#define EXT3_MOUNT_ERRORS_CONT 0x00010 /* Continue on errors */
+#define EXT3_MOUNT_ERRORS_RO 0x00020 /* Remount fs ro on errors */
+#define EXT3_MOUNT_ERRORS_PANIC 0x00040 /* Panic on errors */
+#define EXT3_MOUNT_MINIX_DF 0x00080 /* Mimics the Minix statfs */
+#define EXT3_MOUNT_NOLOAD 0x00100 /* Don't use existing journal*/
+#define EXT3_MOUNT_ABORT 0x00200 /* Fatal error detected */
+#define EXT3_MOUNT_DATA_FLAGS 0x00C00 /* Mode for data writes: */
+#define EXT3_MOUNT_JOURNAL_DATA 0x00400 /* Write data to journal */
+#define EXT3_MOUNT_ORDERED_DATA 0x00800 /* Flush data before commit */
+#define EXT3_MOUNT_WRITEBACK_DATA 0x00C00 /* No data ordering */
+#define EXT3_MOUNT_UPDATE_JOURNAL 0x01000 /* Update the journal format */
+#define EXT3_MOUNT_NO_UID32 0x02000 /* Disable 32-bit UIDs */
+#define EXT3_MOUNT_XATTR_USER 0x04000 /* Extended user attributes */
+#define EXT3_MOUNT_POSIX_ACL 0x08000 /* POSIX Access Control Lists */
+#define EXT3_MOUNT_RESERVATION 0x10000 /* Preallocation */
+#define EXT3_MOUNT_BARRIER 0x20000 /* Use block barriers */
+#define EXT3_MOUNT_QUOTA 0x80000 /* Some quota option set */
+#define EXT3_MOUNT_USRQUOTA 0x100000 /* "old" user quota */
+#define EXT3_MOUNT_GRPQUOTA 0x200000 /* "old" group quota */
+#define EXT3_MOUNT_DATA_ERR_ABORT 0x400000 /* Abort on file data write
+ * error in ordered mode */
+
+/* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */
+#ifndef _LINUX_EXT2_FS_H
+#define clear_opt(o, opt) o &= ~EXT3_MOUNT_##opt
+#define set_opt(o, opt) o |= EXT3_MOUNT_##opt
+#define test_opt(sb, opt) (EXT3_SB(sb)->s_mount_opt & \
+ EXT3_MOUNT_##opt)
+#else
+#define EXT2_MOUNT_NOLOAD EXT3_MOUNT_NOLOAD
+#define EXT2_MOUNT_ABORT EXT3_MOUNT_ABORT
+#define EXT2_MOUNT_DATA_FLAGS EXT3_MOUNT_DATA_FLAGS
+#endif
+
+#define ext3_set_bit __set_bit_le
+#define ext3_set_bit_atomic ext2_set_bit_atomic
+#define ext3_clear_bit __clear_bit_le
+#define ext3_clear_bit_atomic ext2_clear_bit_atomic
+#define ext3_test_bit test_bit_le
+#define ext3_find_next_zero_bit find_next_zero_bit_le
+
+/*
+ * Maximal mount counts between two filesystem checks
+ */
+#define EXT3_DFL_MAX_MNT_COUNT 20 /* Allow 20 mounts */
+#define EXT3_DFL_CHECKINTERVAL 0 /* Don't use interval check */
+
+/*
+ * Behaviour when detecting errors
+ */
+#define EXT3_ERRORS_CONTINUE 1 /* Continue execution */
+#define EXT3_ERRORS_RO 2 /* Remount fs read-only */
+#define EXT3_ERRORS_PANIC 3 /* Panic */
+#define EXT3_ERRORS_DEFAULT EXT3_ERRORS_CONTINUE
+
+/*
+ * Structure of the super block
+ */
+struct ext3_super_block {
+/*00*/ __le32 s_inodes_count; /* Inodes count */
+ __le32 s_blocks_count; /* Blocks count */
+ __le32 s_r_blocks_count; /* Reserved blocks count */
+ __le32 s_free_blocks_count; /* Free blocks count */
+/*10*/ __le32 s_free_inodes_count; /* Free inodes count */
+ __le32 s_first_data_block; /* First Data Block */
+ __le32 s_log_block_size; /* Block size */
+ __le32 s_log_frag_size; /* Fragment size */
+/*20*/ __le32 s_blocks_per_group; /* # Blocks per group */
+ __le32 s_frags_per_group; /* # Fragments per group */
+ __le32 s_inodes_per_group; /* # Inodes per group */
+ __le32 s_mtime; /* Mount time */
+/*30*/ __le32 s_wtime; /* Write time */
+ __le16 s_mnt_count; /* Mount count */
+ __le16 s_max_mnt_count; /* Maximal mount count */
+ __le16 s_magic; /* Magic signature */
+ __le16 s_state; /* File system state */
+ __le16 s_errors; /* Behaviour when detecting errors */
+ __le16 s_minor_rev_level; /* minor revision level */
+/*40*/ __le32 s_lastcheck; /* time of last check */
+ __le32 s_checkinterval; /* max. time between checks */
+ __le32 s_creator_os; /* OS */
+ __le32 s_rev_level; /* Revision level */
+/*50*/ __le16 s_def_resuid; /* Default uid for reserved blocks */
+ __le16 s_def_resgid; /* Default gid for reserved blocks */
+ /*
+ * These fields are for EXT3_DYNAMIC_REV superblocks only.
+ *
+ * Note: the difference between the compatible feature set and
+ * the incompatible feature set is that if there is a bit set
+ * in the incompatible feature set that the kernel doesn't
+ * know about, it should refuse to mount the filesystem.
+ *
+ * e2fsck's requirements are more strict; if it doesn't know
+ * about a feature in either the compatible or incompatible
+ * feature set, it must abort and not try to meddle with
+ * things it doesn't understand...
+ */
+ __le32 s_first_ino; /* First non-reserved inode */
+ __le16 s_inode_size; /* size of inode structure */
+ __le16 s_block_group_nr; /* block group # of this superblock */
+ __le32 s_feature_compat; /* compatible feature set */
+/*60*/ __le32 s_feature_incompat; /* incompatible feature set */
+ __le32 s_feature_ro_compat; /* readonly-compatible feature set */
+/*68*/ __u8 s_uuid[16]; /* 128-bit uuid for volume */
+/*78*/ char s_volume_name[16]; /* volume name */
+/*88*/ char s_last_mounted[64]; /* directory where last mounted */
+/*C8*/ __le32 s_algorithm_usage_bitmap; /* For compression */
+ /*
+ * Performance hints. Directory preallocation should only
+ * happen if the EXT3_FEATURE_COMPAT_DIR_PREALLOC flag is on.
+ */
+ __u8 s_prealloc_blocks; /* Nr of blocks to try to preallocate*/
+ __u8 s_prealloc_dir_blocks; /* Nr to preallocate for dirs */
+ __le16 s_reserved_gdt_blocks; /* Per group desc for online growth */
+ /*
+ * Journaling support valid if EXT3_FEATURE_COMPAT_HAS_JOURNAL set.
+ */
+/*D0*/ __u8 s_journal_uuid[16]; /* uuid of journal superblock */
+/*E0*/ __le32 s_journal_inum; /* inode number of journal file */
+ __le32 s_journal_dev; /* device number of journal file */
+ __le32 s_last_orphan; /* start of list of inodes to delete */
+ __le32 s_hash_seed[4]; /* HTREE hash seed */
+ __u8 s_def_hash_version; /* Default hash version to use */
+ __u8 s_reserved_char_pad;
+ __u16 s_reserved_word_pad;
+ __le32 s_default_mount_opts;
+ __le32 s_first_meta_bg; /* First metablock block group */
+ __le32 s_mkfs_time; /* When the filesystem was created */
+ __le32 s_jnl_blocks[17]; /* Backup of the journal inode */
+ /* 64bit support valid if EXT4_FEATURE_COMPAT_64BIT */
+/*150*/ __le32 s_blocks_count_hi; /* Blocks count */
+ __le32 s_r_blocks_count_hi; /* Reserved blocks count */
+ __le32 s_free_blocks_count_hi; /* Free blocks count */
+ __le16 s_min_extra_isize; /* All inodes have at least # bytes */
+ __le16 s_want_extra_isize; /* New inodes should reserve # bytes */
+ __le32 s_flags; /* Miscellaneous flags */
+ __le16 s_raid_stride; /* RAID stride */
+ __le16 s_mmp_interval; /* # seconds to wait in MMP checking */
+ __le64 s_mmp_block; /* Block for multi-mount protection */
+ __le32 s_raid_stripe_width; /* blocks on all data disks (N*stride)*/
+ __u8 s_log_groups_per_flex; /* FLEX_BG group size */
+ __u8 s_reserved_char_pad2;
+ __le16 s_reserved_pad;
+ __u32 s_reserved[162]; /* Padding to the end of the block */
+};
+
+/* data type for block offset of block group */
+typedef int ext3_grpblk_t;
+
+/* data type for filesystem-wide blocks number */
+typedef unsigned long ext3_fsblk_t;
+
+#define E3FSBLK "%lu"
+
+struct ext3_reserve_window {
+ ext3_fsblk_t _rsv_start; /* First byte reserved */
+ ext3_fsblk_t _rsv_end; /* Last byte reserved or 0 */
+};
+
+struct ext3_reserve_window_node {
+ struct rb_node rsv_node;
+ __u32 rsv_goal_size;
+ __u32 rsv_alloc_hit;
+ struct ext3_reserve_window rsv_window;
+};
+
+struct ext3_block_alloc_info {
+ /* information about reservation window */
+ struct ext3_reserve_window_node rsv_window_node;
+ /*
+ * was i_next_alloc_block in ext3_inode_info
+ * is the logical (file-relative) number of the
+ * most-recently-allocated block in this file.
+ * We use this for detecting linearly ascending allocation requests.
+ */
+ __u32 last_alloc_logical_block;
+ /*
+ * Was i_next_alloc_goal in ext3_inode_info
+ * is the *physical* companion to i_next_alloc_block.
+ * it is the physical block number of the block which was most-recently
+ * allocated to this file. This gives us the goal (target) for the next
+ * allocation when we detect linearly ascending requests.
+ */
+ ext3_fsblk_t last_alloc_physical_block;
+};
+
+#define rsv_start rsv_window._rsv_start
+#define rsv_end rsv_window._rsv_end
+
+/*
+ * third extended file system inode data in memory
+ */
+struct ext3_inode_info {
+ __le32 i_data[15]; /* unconverted */
+ __u32 i_flags;
+#ifdef EXT3_FRAGMENTS
+ __u32 i_faddr;
+ __u8 i_frag_no;
+ __u8 i_frag_size;
+#endif
+ ext3_fsblk_t i_file_acl;
+ __u32 i_dir_acl;
+ __u32 i_dtime;
+
+ /*
+ * i_block_group is the number of the block group which contains
+ * this file's inode. Constant across the lifetime of the inode,
+ * it is used for making block allocation decisions - we try to
+ * place a file's data blocks near its inode block, and new inodes
+ * near to their parent directory's inode.
+ */
+ __u32 i_block_group;
+ unsigned long i_state_flags; /* Dynamic state flags for ext3 */
+
+ /* block reservation info */
+ struct ext3_block_alloc_info *i_block_alloc_info;
+
+ __u32 i_dir_start_lookup;
+#ifdef CONFIG_EXT3_FS_XATTR
+ /*
+ * Extended attributes can be read independently of the main file
+ * data. Taking i_mutex even when reading would cause contention
+ * between readers of EAs and writers of regular file data, so
+ * instead we synchronize on xattr_sem when reading or changing
+ * EAs.
+ */
+ struct rw_semaphore xattr_sem;
+#endif
+
+ struct list_head i_orphan; /* unlinked but open inodes */
+
+ /*
+ * i_disksize keeps track of what the inode size is ON DISK, not
+ * in memory. During truncate, i_size is set to the new size by
+ * the VFS prior to calling ext3_truncate(), but the filesystem won't
+ * set i_disksize to 0 until the truncate is actually under way.
+ *
+ * The intent is that i_disksize always represents the blocks which
+ * are used by this file. This allows recovery to restart truncate
+ * on orphans if we crash during truncate. We actually write i_disksize
+ * into the on-disk inode when writing inodes out, instead of i_size.
+ *
+ * The only time when i_disksize and i_size may be different is when
+ * a truncate is in progress. The only things which change i_disksize
+ * are ext3_get_block (growth) and ext3_truncate (shrinkth).
+ */
+ loff_t i_disksize;
+
+ /* on-disk additional length */
+ __u16 i_extra_isize;
+
+ /*
+ * truncate_mutex is for serialising ext3_truncate() against
+ * ext3_getblock(). In the 2.4 ext2 design, great chunks of inode's
+ * data tree are chopped off during truncate. We can't do that in
+ * ext3 because whenever we perform intermediate commits during
+ * truncate, the inode and all the metadata blocks *must* be in a
+ * consistent state which allows truncation of the orphans to restart
+ * during recovery. Hence we must fix the get_block-vs-truncate race
+ * by other means, so we have truncate_mutex.
+ */
+ struct mutex truncate_mutex;
+
+ /*
+ * Transactions that contain inode's metadata needed to complete
+ * fsync and fdatasync, respectively.
+ */
+ atomic_t i_sync_tid;
+ atomic_t i_datasync_tid;
+
+ struct inode vfs_inode;
+};
+
+/*
+ * third extended-fs super-block data in memory
+ */
+struct ext3_sb_info {
+ unsigned long s_frag_size; /* Size of a fragment in bytes */
+ unsigned long s_frags_per_block;/* Number of fragments per block */
+ unsigned long s_inodes_per_block;/* Number of inodes per block */
+ unsigned long s_frags_per_group;/* Number of fragments in a group */
+ unsigned long s_blocks_per_group;/* Number of blocks in a group */
+ unsigned long s_inodes_per_group;/* Number of inodes in a group */
+ unsigned long s_itb_per_group; /* Number of inode table blocks per group */
+ unsigned long s_gdb_count; /* Number of group descriptor blocks */
+ unsigned long s_desc_per_block; /* Number of group descriptors per block */
+ unsigned long s_groups_count; /* Number of groups in the fs */
+ unsigned long s_overhead_last; /* Last calculated overhead */
+ unsigned long s_blocks_last; /* Last seen block count */
+ struct buffer_head * s_sbh; /* Buffer containing the super block */
+ struct ext3_super_block * s_es; /* Pointer to the super block in the buffer */
+ struct buffer_head ** s_group_desc;
+ unsigned long s_mount_opt;
+ ext3_fsblk_t s_sb_block;
+ kuid_t s_resuid;
+ kgid_t s_resgid;
+ unsigned short s_mount_state;
+ unsigned short s_pad;
+ int s_addr_per_block_bits;
+ int s_desc_per_block_bits;
+ int s_inode_size;
+ int s_first_ino;
+ spinlock_t s_next_gen_lock;
+ u32 s_next_generation;
+ u32 s_hash_seed[4];
+ int s_def_hash_version;
+ int s_hash_unsigned; /* 3 if hash should be unsigned, 0 if not */
+ struct percpu_counter s_freeblocks_counter;
+ struct percpu_counter s_freeinodes_counter;
+ struct percpu_counter s_dirs_counter;
+ struct blockgroup_lock *s_blockgroup_lock;
+
+ /* root of the per fs reservation window tree */
+ spinlock_t s_rsv_window_lock;
+ struct rb_root s_rsv_window_root;
+ struct ext3_reserve_window_node s_rsv_window_head;
+
+ /* Journaling */
+ struct inode * s_journal_inode;
+ struct journal_s * s_journal;
+ struct list_head s_orphan;
+ struct mutex s_orphan_lock;
+ struct mutex s_resize_lock;
+ unsigned long s_commit_interval;
+ struct block_device *journal_bdev;
+#ifdef CONFIG_QUOTA
+ char *s_qf_names[MAXQUOTAS]; /* Names of quota files with journalled quota */
+ int s_jquota_fmt; /* Format of quota to use */
+#endif
+};
+
+static inline spinlock_t *
+sb_bgl_lock(struct ext3_sb_info *sbi, unsigned int block_group)
+{
+ return bgl_lock_ptr(sbi->s_blockgroup_lock, block_group);
+}
+
+static inline struct ext3_sb_info * EXT3_SB(struct super_block *sb)
+{
+ return sb->s_fs_info;
+}
+static inline struct ext3_inode_info *EXT3_I(struct inode *inode)
+{
+ return container_of(inode, struct ext3_inode_info, vfs_inode);
+}
+
+static inline int ext3_valid_inum(struct super_block *sb, unsigned long ino)
+{
+ return ino == EXT3_ROOT_INO ||
+ ino == EXT3_JOURNAL_INO ||
+ ino == EXT3_RESIZE_INO ||
+ (ino >= EXT3_FIRST_INO(sb) &&
+ ino <= le32_to_cpu(EXT3_SB(sb)->s_es->s_inodes_count));
+}
+
+/*
+ * Inode dynamic state flags
+ */
+enum {
+ EXT3_STATE_JDATA, /* journaled data exists */
+ EXT3_STATE_NEW, /* inode is newly created */
+ EXT3_STATE_XATTR, /* has in-inode xattrs */
+ EXT3_STATE_FLUSH_ON_CLOSE, /* flush dirty pages on close */
+};
+
+static inline int ext3_test_inode_state(struct inode *inode, int bit)
+{
+ return test_bit(bit, &EXT3_I(inode)->i_state_flags);
+}
+
+static inline void ext3_set_inode_state(struct inode *inode, int bit)
+{
+ set_bit(bit, &EXT3_I(inode)->i_state_flags);
+}
+
+static inline void ext3_clear_inode_state(struct inode *inode, int bit)
+{
+ clear_bit(bit, &EXT3_I(inode)->i_state_flags);
+}
+
+#define NEXT_ORPHAN(inode) EXT3_I(inode)->i_dtime
+
+/*
+ * Codes for operating systems
+ */
+#define EXT3_OS_LINUX 0
+#define EXT3_OS_HURD 1
+#define EXT3_OS_MASIX 2
+#define EXT3_OS_FREEBSD 3
+#define EXT3_OS_LITES 4
+
+/*
+ * Revision levels
+ */
+#define EXT3_GOOD_OLD_REV 0 /* The good old (original) format */
+#define EXT3_DYNAMIC_REV 1 /* V2 format w/ dynamic inode sizes */
+
+#define EXT3_CURRENT_REV EXT3_GOOD_OLD_REV
+#define EXT3_MAX_SUPP_REV EXT3_DYNAMIC_REV
+
+#define EXT3_GOOD_OLD_INODE_SIZE 128
+
+/*
+ * Feature set definitions
+ */
+
+#define EXT3_HAS_COMPAT_FEATURE(sb,mask) \
+ ( EXT3_SB(sb)->s_es->s_feature_compat & cpu_to_le32(mask) )
+#define EXT3_HAS_RO_COMPAT_FEATURE(sb,mask) \
+ ( EXT3_SB(sb)->s_es->s_feature_ro_compat & cpu_to_le32(mask) )
+#define EXT3_HAS_INCOMPAT_FEATURE(sb,mask) \
+ ( EXT3_SB(sb)->s_es->s_feature_incompat & cpu_to_le32(mask) )
+#define EXT3_SET_COMPAT_FEATURE(sb,mask) \
+ EXT3_SB(sb)->s_es->s_feature_compat |= cpu_to_le32(mask)
+#define EXT3_SET_RO_COMPAT_FEATURE(sb,mask) \
+ EXT3_SB(sb)->s_es->s_feature_ro_compat |= cpu_to_le32(mask)
+#define EXT3_SET_INCOMPAT_FEATURE(sb,mask) \
+ EXT3_SB(sb)->s_es->s_feature_incompat |= cpu_to_le32(mask)
+#define EXT3_CLEAR_COMPAT_FEATURE(sb,mask) \
+ EXT3_SB(sb)->s_es->s_feature_compat &= ~cpu_to_le32(mask)
+#define EXT3_CLEAR_RO_COMPAT_FEATURE(sb,mask) \
+ EXT3_SB(sb)->s_es->s_feature_ro_compat &= ~cpu_to_le32(mask)
+#define EXT3_CLEAR_INCOMPAT_FEATURE(sb,mask) \
+ EXT3_SB(sb)->s_es->s_feature_incompat &= ~cpu_to_le32(mask)
+
+#define EXT3_FEATURE_COMPAT_DIR_PREALLOC 0x0001
+#define EXT3_FEATURE_COMPAT_IMAGIC_INODES 0x0002
+#define EXT3_FEATURE_COMPAT_HAS_JOURNAL 0x0004
+#define EXT3_FEATURE_COMPAT_EXT_ATTR 0x0008
+#define EXT3_FEATURE_COMPAT_RESIZE_INODE 0x0010
+#define EXT3_FEATURE_COMPAT_DIR_INDEX 0x0020
+
+#define EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER 0x0001
+#define EXT3_FEATURE_RO_COMPAT_LARGE_FILE 0x0002
+#define EXT3_FEATURE_RO_COMPAT_BTREE_DIR 0x0004
+
+#define EXT3_FEATURE_INCOMPAT_COMPRESSION 0x0001
+#define EXT3_FEATURE_INCOMPAT_FILETYPE 0x0002
+#define EXT3_FEATURE_INCOMPAT_RECOVER 0x0004 /* Needs recovery */
+#define EXT3_FEATURE_INCOMPAT_JOURNAL_DEV 0x0008 /* Journal device */
+#define EXT3_FEATURE_INCOMPAT_META_BG 0x0010
+
+#define EXT3_FEATURE_COMPAT_SUPP EXT3_FEATURE_COMPAT_EXT_ATTR
+#define EXT3_FEATURE_INCOMPAT_SUPP (EXT3_FEATURE_INCOMPAT_FILETYPE| \
+ EXT3_FEATURE_INCOMPAT_RECOVER| \
+ EXT3_FEATURE_INCOMPAT_META_BG)
+#define EXT3_FEATURE_RO_COMPAT_SUPP (EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER| \
+ EXT3_FEATURE_RO_COMPAT_LARGE_FILE| \
+ EXT3_FEATURE_RO_COMPAT_BTREE_DIR)
+
+/*
+ * Default values for user and/or group using reserved blocks
+ */
+#define EXT3_DEF_RESUID 0
+#define EXT3_DEF_RESGID 0
+
+/*
+ * Default mount options
+ */
+#define EXT3_DEFM_DEBUG 0x0001
+#define EXT3_DEFM_BSDGROUPS 0x0002
+#define EXT3_DEFM_XATTR_USER 0x0004
+#define EXT3_DEFM_ACL 0x0008
+#define EXT3_DEFM_UID16 0x0010
+#define EXT3_DEFM_JMODE 0x0060
+#define EXT3_DEFM_JMODE_DATA 0x0020
+#define EXT3_DEFM_JMODE_ORDERED 0x0040
+#define EXT3_DEFM_JMODE_WBACK 0x0060
+
+/*
+ * Structure of a directory entry
+ */
+#define EXT3_NAME_LEN 255
+
+struct ext3_dir_entry {
+ __le32 inode; /* Inode number */
+ __le16 rec_len; /* Directory entry length */
+ __le16 name_len; /* Name length */
+ char name[EXT3_NAME_LEN]; /* File name */
+};
+
+/*
+ * The new version of the directory entry. Since EXT3 structures are
+ * stored in intel byte order, and the name_len field could never be
+ * bigger than 255 chars, it's safe to reclaim the extra byte for the
+ * file_type field.
+ */
+struct ext3_dir_entry_2 {
+ __le32 inode; /* Inode number */
+ __le16 rec_len; /* Directory entry length */
+ __u8 name_len; /* Name length */
+ __u8 file_type;
+ char name[EXT3_NAME_LEN]; /* File name */
+};
+
+/*
+ * Ext3 directory file types. Only the low 3 bits are used. The
+ * other bits are reserved for now.
+ */
+#define EXT3_FT_UNKNOWN 0
+#define EXT3_FT_REG_FILE 1
+#define EXT3_FT_DIR 2
+#define EXT3_FT_CHRDEV 3
+#define EXT3_FT_BLKDEV 4
+#define EXT3_FT_FIFO 5
+#define EXT3_FT_SOCK 6
+#define EXT3_FT_SYMLINK 7
+
+#define EXT3_FT_MAX 8
+
+/*
+ * EXT3_DIR_PAD defines the directory entries boundaries
+ *
+ * NOTE: It must be a multiple of 4
+ */
+#define EXT3_DIR_PAD 4
+#define EXT3_DIR_ROUND (EXT3_DIR_PAD - 1)
+#define EXT3_DIR_REC_LEN(name_len) (((name_len) + 8 + EXT3_DIR_ROUND) & \
+ ~EXT3_DIR_ROUND)
+#define EXT3_MAX_REC_LEN ((1<<16)-1)
+
+/*
+ * Tests against MAX_REC_LEN etc were put in place for 64k block
+ * sizes; if that is not possible on this arch, we can skip
+ * those tests and speed things up.
+ */
+static inline unsigned ext3_rec_len_from_disk(__le16 dlen)
+{
+ unsigned len = le16_to_cpu(dlen);
+
+#if (PAGE_CACHE_SIZE >= 65536)
+ if (len == EXT3_MAX_REC_LEN)
+ return 1 << 16;
+#endif
+ return len;
+}
+
+static inline __le16 ext3_rec_len_to_disk(unsigned len)
+{
+#if (PAGE_CACHE_SIZE >= 65536)
+ if (len == (1 << 16))
+ return cpu_to_le16(EXT3_MAX_REC_LEN);
+ else if (len > (1 << 16))
+ BUG();
+#endif
+ return cpu_to_le16(len);
+}
+
+/*
+ * Hash Tree Directory indexing
+ * (c) Daniel Phillips, 2001
+ */
+
+#define is_dx(dir) (EXT3_HAS_COMPAT_FEATURE((dir)->i_sb, \
+ EXT3_FEATURE_COMPAT_DIR_INDEX) && \
+ (EXT3_I(dir)->i_flags & EXT3_INDEX_FL))
+#define EXT3_DIR_LINK_MAX(dir) (!is_dx(dir) && (dir)->i_nlink >= EXT3_LINK_MAX)
+#define EXT3_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || (dir)->i_nlink == 1)
+
+/* Legal values for the dx_root hash_version field: */
+
+#define DX_HASH_LEGACY 0
+#define DX_HASH_HALF_MD4 1
+#define DX_HASH_TEA 2
+#define DX_HASH_LEGACY_UNSIGNED 3
+#define DX_HASH_HALF_MD4_UNSIGNED 4
+#define DX_HASH_TEA_UNSIGNED 5
+
+/* hash info structure used by the directory hash */
+struct dx_hash_info
+{
+ u32 hash;
+ u32 minor_hash;
+ int hash_version;
+ u32 *seed;
+};
+
+
+/* 32 and 64 bit signed EOF for dx directories */
+#define EXT3_HTREE_EOF_32BIT ((1UL << (32 - 1)) - 1)
+#define EXT3_HTREE_EOF_64BIT ((1ULL << (64 - 1)) - 1)
+
+
+/*
+ * Control parameters used by ext3_htree_next_block
+ */
+#define HASH_NB_ALWAYS 1
+
+
+/*
+ * Describe an inode's exact location on disk and in memory
+ */
+struct ext3_iloc
+{
+ struct buffer_head *bh;
+ unsigned long offset;
+ unsigned long block_group;
+};
+
+static inline struct ext3_inode *ext3_raw_inode(struct ext3_iloc *iloc)
+{
+ return (struct ext3_inode *) (iloc->bh->b_data + iloc->offset);
+}
+
+/*
+ * This structure is stuffed into the struct file's private_data field
+ * for directories. It is where we put information so that we can do
+ * readdir operations in hash tree order.
+ */
+struct dir_private_info {
+ struct rb_root root;
+ struct rb_node *curr_node;
+ struct fname *extra_fname;
+ loff_t last_pos;
+ __u32 curr_hash;
+ __u32 curr_minor_hash;
+ __u32 next_hash;
+};
+
+/* calculate the first block number of the group */
+static inline ext3_fsblk_t
+ext3_group_first_block_no(struct super_block *sb, unsigned long group_no)
+{
+ return group_no * (ext3_fsblk_t)EXT3_BLOCKS_PER_GROUP(sb) +
+ le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block);
+}
+
+/*
+ * Special error return code only used by dx_probe() and its callers.
+ */
+#define ERR_BAD_DX_DIR (-75000)
+
+/*
+ * Function prototypes
+ */
+
+/*
+ * Ok, these declarations are also in <linux/kernel.h> but none of the
+ * ext3 source programs needs to include it so they are duplicated here.
+ */
+# define NORET_TYPE /**/
+# define ATTRIB_NORET __attribute__((noreturn))
+# define NORET_AND noreturn,
+
+/* balloc.c */
+extern int ext3_bg_has_super(struct super_block *sb, int group);
+extern unsigned long ext3_bg_num_gdb(struct super_block *sb, int group);
+extern ext3_fsblk_t ext3_new_block (handle_t *handle, struct inode *inode,
+ ext3_fsblk_t goal, int *errp);
+extern ext3_fsblk_t ext3_new_blocks (handle_t *handle, struct inode *inode,
+ ext3_fsblk_t goal, unsigned long *count, int *errp);
+extern void ext3_free_blocks (handle_t *handle, struct inode *inode,
+ ext3_fsblk_t block, unsigned long count);
+extern void ext3_free_blocks_sb (handle_t *handle, struct super_block *sb,
+ ext3_fsblk_t block, unsigned long count,
+ unsigned long *pdquot_freed_blocks);
+extern ext3_fsblk_t ext3_count_free_blocks (struct super_block *);
+extern void ext3_check_blocks_bitmap (struct super_block *);
+extern struct ext3_group_desc * ext3_get_group_desc(struct super_block * sb,
+ unsigned int block_group,
+ struct buffer_head ** bh);
+extern int ext3_should_retry_alloc(struct super_block *sb, int *retries);
+extern void ext3_init_block_alloc_info(struct inode *);
+extern void ext3_rsv_window_add(struct super_block *sb, struct ext3_reserve_window_node *rsv);
+extern int ext3_trim_fs(struct super_block *sb, struct fstrim_range *range);
+
+/* dir.c */
+extern int ext3_check_dir_entry(const char *, struct inode *,
+ struct ext3_dir_entry_2 *,
+ struct buffer_head *, unsigned long);
+extern int ext3_htree_store_dirent(struct file *dir_file, __u32 hash,
+ __u32 minor_hash,
+ struct ext3_dir_entry_2 *dirent);
+extern void ext3_htree_free_dir_info(struct dir_private_info *p);
+
+/* fsync.c */
+extern int ext3_sync_file(struct file *, loff_t, loff_t, int);
+
+/* hash.c */
+extern int ext3fs_dirhash(const char *name, int len, struct
+ dx_hash_info *hinfo);
+
+/* ialloc.c */
+extern struct inode * ext3_new_inode (handle_t *, struct inode *,
+ const struct qstr *, umode_t);
+extern void ext3_free_inode (handle_t *, struct inode *);
+extern struct inode * ext3_orphan_get (struct super_block *, unsigned long);
+extern unsigned long ext3_count_free_inodes (struct super_block *);
+extern unsigned long ext3_count_dirs (struct super_block *);
+extern void ext3_check_inodes_bitmap (struct super_block *);
+extern unsigned long ext3_count_free (struct buffer_head *, unsigned);
+
+
+/* inode.c */
+int ext3_forget(handle_t *handle, int is_metadata, struct inode *inode,
+ struct buffer_head *bh, ext3_fsblk_t blocknr);
+struct buffer_head * ext3_getblk (handle_t *, struct inode *, long, int, int *);
+struct buffer_head * ext3_bread (handle_t *, struct inode *, int, int, int *);
+int ext3_get_blocks_handle(handle_t *handle, struct inode *inode,
+ sector_t iblock, unsigned long maxblocks, struct buffer_head *bh_result,
+ int create);
+
+extern struct inode *ext3_iget(struct super_block *, unsigned long);
+extern int ext3_write_inode (struct inode *, struct writeback_control *);
+extern int ext3_setattr (struct dentry *, struct iattr *);
+extern void ext3_evict_inode (struct inode *);
+extern int ext3_sync_inode (handle_t *, struct inode *);
+extern void ext3_discard_reservation (struct inode *);
+extern void ext3_dirty_inode(struct inode *, int);
+extern int ext3_change_inode_journal_flag(struct inode *, int);
+extern int ext3_get_inode_loc(struct inode *, struct ext3_iloc *);
+extern int ext3_can_truncate(struct inode *inode);
+extern void ext3_truncate(struct inode *inode);
+extern void ext3_set_inode_flags(struct inode *);
+extern void ext3_get_inode_flags(struct ext3_inode_info *);
+extern void ext3_set_aops(struct inode *inode);
+extern int ext3_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
+ u64 start, u64 len);
+
+/* ioctl.c */
+extern long ext3_ioctl(struct file *, unsigned int, unsigned long);
+extern long ext3_compat_ioctl(struct file *, unsigned int, unsigned long);
+
+/* namei.c */
+extern int ext3_orphan_add(handle_t *, struct inode *);
+extern int ext3_orphan_del(handle_t *, struct inode *);
+extern int ext3_htree_fill_tree(struct file *dir_file, __u32 start_hash,
+ __u32 start_minor_hash, __u32 *next_hash);
+
+/* resize.c */
+extern int ext3_group_add(struct super_block *sb,
+ struct ext3_new_group_data *input);
+extern int ext3_group_extend(struct super_block *sb,
+ struct ext3_super_block *es,
+ ext3_fsblk_t n_blocks_count);
+
+/* super.c */
+extern __printf(3, 4)
+void ext3_error(struct super_block *, const char *, const char *, ...);
+extern void __ext3_std_error (struct super_block *, const char *, int);
+extern __printf(3, 4)
+void ext3_abort(struct super_block *, const char *, const char *, ...);
+extern __printf(3, 4)
+void ext3_warning(struct super_block *, const char *, const char *, ...);
+extern __printf(3, 4)
+void ext3_msg(struct super_block *, const char *, const char *, ...);
+extern void ext3_update_dynamic_rev (struct super_block *sb);
+
+#define ext3_std_error(sb, errno) \
+do { \
+ if ((errno)) \
+ __ext3_std_error((sb), __func__, (errno)); \
+} while (0)
+
+/*
+ * Inodes and files operations
+ */
+
+/* dir.c */
+extern const struct file_operations ext3_dir_operations;
+
+/* file.c */
+extern const struct inode_operations ext3_file_inode_operations;
+extern const struct file_operations ext3_file_operations;
+
+/* namei.c */
+extern const struct inode_operations ext3_dir_inode_operations;
+extern const struct inode_operations ext3_special_inode_operations;
+
+/* symlink.c */
+extern const struct inode_operations ext3_symlink_inode_operations;
+extern const struct inode_operations ext3_fast_symlink_inode_operations;
+
+#define EXT3_JOURNAL(inode) (EXT3_SB((inode)->i_sb)->s_journal)
+
+/* Define the number of blocks we need to account to a transaction to
+ * modify one block of data.
+ *
+ * We may have to touch one inode, one bitmap buffer, up to three
+ * indirection blocks, the group and superblock summaries, and the data
+ * block to complete the transaction. */
+
+#define EXT3_SINGLEDATA_TRANS_BLOCKS 8U
+
+/* Extended attribute operations touch at most two data buffers,
+ * two bitmap buffers, and two group summaries, in addition to the inode
+ * and the superblock, which are already accounted for. */
+
+#define EXT3_XATTR_TRANS_BLOCKS 6U
+
+/* Define the minimum size for a transaction which modifies data. This
+ * needs to take into account the fact that we may end up modifying two
+ * quota files too (one for the group, one for the user quota). The
+ * superblock only gets updated once, of course, so don't bother
+ * counting that again for the quota updates. */
+
+#define EXT3_DATA_TRANS_BLOCKS(sb) (EXT3_SINGLEDATA_TRANS_BLOCKS + \
+ EXT3_XATTR_TRANS_BLOCKS - 2 + \
+ EXT3_MAXQUOTAS_TRANS_BLOCKS(sb))
+
+/* Delete operations potentially hit one directory's namespace plus an
+ * entire inode, plus arbitrary amounts of bitmap/indirection data. Be
+ * generous. We can grow the delete transaction later if necessary. */
+
+#define EXT3_DELETE_TRANS_BLOCKS(sb) (EXT3_MAXQUOTAS_TRANS_BLOCKS(sb) + 64)
+
+/* Define an arbitrary limit for the amount of data we will anticipate
+ * writing to any given transaction. For unbounded transactions such as
+ * write(2) and truncate(2) we can write more than this, but we always
+ * start off at the maximum transaction size and grow the transaction
+ * optimistically as we go. */
+
+#define EXT3_MAX_TRANS_DATA 64U
+
+/* We break up a large truncate or write transaction once the handle's
+ * buffer credits get this low; we need either to extend the
+ * transaction or to start a new one. Reserve enough space here for
+ * inode, bitmap, superblock, group and indirection updates for at least
+ * one block, plus two quota updates. Quota allocations are not
+ * needed. */
+
+#define EXT3_RESERVE_TRANS_BLOCKS 12U
+
+#define EXT3_INDEX_EXTRA_TRANS_BLOCKS 8
+
+#ifdef CONFIG_QUOTA
+/* Amount of blocks needed for quota update - we know that the structure was
+ * allocated so we need to update only inode+data */
+#define EXT3_QUOTA_TRANS_BLOCKS(sb) (test_opt(sb, QUOTA) ? 2 : 0)
+/* Amount of blocks needed for quota insert/delete - we do some block writes
+ * but inode, sb and group updates are done only once */
+#define EXT3_QUOTA_INIT_BLOCKS(sb) (test_opt(sb, QUOTA) ? (DQUOT_INIT_ALLOC*\
+ (EXT3_SINGLEDATA_TRANS_BLOCKS-3)+3+DQUOT_INIT_REWRITE) : 0)
+#define EXT3_QUOTA_DEL_BLOCKS(sb) (test_opt(sb, QUOTA) ? (DQUOT_DEL_ALLOC*\
+ (EXT3_SINGLEDATA_TRANS_BLOCKS-3)+3+DQUOT_DEL_REWRITE) : 0)
+#else
+#define EXT3_QUOTA_TRANS_BLOCKS(sb) 0
+#define EXT3_QUOTA_INIT_BLOCKS(sb) 0
+#define EXT3_QUOTA_DEL_BLOCKS(sb) 0
+#endif
+#define EXT3_MAXQUOTAS_TRANS_BLOCKS(sb) (MAXQUOTAS*EXT3_QUOTA_TRANS_BLOCKS(sb))
+#define EXT3_MAXQUOTAS_INIT_BLOCKS(sb) (MAXQUOTAS*EXT3_QUOTA_INIT_BLOCKS(sb))
+#define EXT3_MAXQUOTAS_DEL_BLOCKS(sb) (MAXQUOTAS*EXT3_QUOTA_DEL_BLOCKS(sb))
+
+int
+ext3_mark_iloc_dirty(handle_t *handle,
+ struct inode *inode,
+ struct ext3_iloc *iloc);
+
+/*
+ * On success, we end up with an outstanding reference count against
+ * iloc->bh. This _must_ be cleaned up later.
+ */
+
+int ext3_reserve_inode_write(handle_t *handle, struct inode *inode,
+ struct ext3_iloc *iloc);
+
+int ext3_mark_inode_dirty(handle_t *handle, struct inode *inode);
+
+/*
+ * Wrapper functions with which ext3 calls into JBD. The intent here is
+ * to allow these to be turned into appropriate stubs so ext3 can control
+ * ext2 filesystems, so ext2+ext3 systems only need one fs. This work hasn't
+ * been done yet.
+ */
+
+static inline void ext3_journal_release_buffer(handle_t *handle,
+ struct buffer_head *bh)
+{
+ journal_release_buffer(handle, bh);
+}
+
+void ext3_journal_abort_handle(const char *caller, const char *err_fn,
+ struct buffer_head *bh, handle_t *handle, int err);
+
+int __ext3_journal_get_undo_access(const char *where, handle_t *handle,
+ struct buffer_head *bh);
+
+int __ext3_journal_get_write_access(const char *where, handle_t *handle,
+ struct buffer_head *bh);
+
+int __ext3_journal_forget(const char *where, handle_t *handle,
+ struct buffer_head *bh);
+
+int __ext3_journal_revoke(const char *where, handle_t *handle,
+ unsigned long blocknr, struct buffer_head *bh);
+
+int __ext3_journal_get_create_access(const char *where,
+ handle_t *handle, struct buffer_head *bh);
+
+int __ext3_journal_dirty_metadata(const char *where,
+ handle_t *handle, struct buffer_head *bh);
+
+#define ext3_journal_get_undo_access(handle, bh) \
+ __ext3_journal_get_undo_access(__func__, (handle), (bh))
+#define ext3_journal_get_write_access(handle, bh) \
+ __ext3_journal_get_write_access(__func__, (handle), (bh))
+#define ext3_journal_revoke(handle, blocknr, bh) \
+ __ext3_journal_revoke(__func__, (handle), (blocknr), (bh))
+#define ext3_journal_get_create_access(handle, bh) \
+ __ext3_journal_get_create_access(__func__, (handle), (bh))
+#define ext3_journal_dirty_metadata(handle, bh) \
+ __ext3_journal_dirty_metadata(__func__, (handle), (bh))
+#define ext3_journal_forget(handle, bh) \
+ __ext3_journal_forget(__func__, (handle), (bh))
+
+int ext3_journal_dirty_data(handle_t *handle, struct buffer_head *bh);
+
+handle_t *ext3_journal_start_sb(struct super_block *sb, int nblocks);
+int __ext3_journal_stop(const char *where, handle_t *handle);
+
+static inline handle_t *ext3_journal_start(struct inode *inode, int nblocks)
+{
+ return ext3_journal_start_sb(inode->i_sb, nblocks);
+}
+
+#define ext3_journal_stop(handle) \
+ __ext3_journal_stop(__func__, (handle))
+
+static inline handle_t *ext3_journal_current_handle(void)
+{
+ return journal_current_handle();
+}
+
+static inline int ext3_journal_extend(handle_t *handle, int nblocks)
+{
+ return journal_extend(handle, nblocks);
+}
+
+static inline int ext3_journal_restart(handle_t *handle, int nblocks)
+{
+ return journal_restart(handle, nblocks);
+}
+
+static inline int ext3_journal_blocks_per_page(struct inode *inode)
+{
+ return journal_blocks_per_page(inode);
+}
+
+static inline int ext3_journal_force_commit(journal_t *journal)
+{
+ return journal_force_commit(journal);
+}
+
+/* super.c */
+int ext3_force_commit(struct super_block *sb);
+
+static inline int ext3_should_journal_data(struct inode *inode)
+{
+ if (!S_ISREG(inode->i_mode))
+ return 1;
+ if (test_opt(inode->i_sb, DATA_FLAGS) == EXT3_MOUNT_JOURNAL_DATA)
+ return 1;
+ if (EXT3_I(inode)->i_flags & EXT3_JOURNAL_DATA_FL)
+ return 1;
+ return 0;
+}
+
+static inline int ext3_should_order_data(struct inode *inode)
+{
+ if (!S_ISREG(inode->i_mode))
+ return 0;
+ if (EXT3_I(inode)->i_flags & EXT3_JOURNAL_DATA_FL)
+ return 0;
+ if (test_opt(inode->i_sb, DATA_FLAGS) == EXT3_MOUNT_ORDERED_DATA)
+ return 1;
+ return 0;
+}
+
+static inline int ext3_should_writeback_data(struct inode *inode)
+{
+ if (!S_ISREG(inode->i_mode))
+ return 0;
+ if (EXT3_I(inode)->i_flags & EXT3_JOURNAL_DATA_FL)
+ return 0;
+ if (test_opt(inode->i_sb, DATA_FLAGS) == EXT3_MOUNT_WRITEBACK_DATA)
+ return 1;
+ return 0;
+}
+
+#include <trace/events/ext3.h>
diff --git a/fs/ext3/ext3_jbd.c b/fs/ext3/ext3_jbd.c
new file mode 100644
index 00000000000..785a3261a26
--- /dev/null
+++ b/fs/ext3/ext3_jbd.c
@@ -0,0 +1,59 @@
+/*
+ * Interface between ext3 and JBD
+ */
+
+#include "ext3.h"
+
+int __ext3_journal_get_undo_access(const char *where, handle_t *handle,
+ struct buffer_head *bh)
+{
+ int err = journal_get_undo_access(handle, bh);
+ if (err)
+ ext3_journal_abort_handle(where, __func__, bh, handle,err);
+ return err;
+}
+
+int __ext3_journal_get_write_access(const char *where, handle_t *handle,
+ struct buffer_head *bh)
+{
+ int err = journal_get_write_access(handle, bh);
+ if (err)
+ ext3_journal_abort_handle(where, __func__, bh, handle,err);
+ return err;
+}
+
+int __ext3_journal_forget(const char *where, handle_t *handle,
+ struct buffer_head *bh)
+{
+ int err = journal_forget(handle, bh);
+ if (err)
+ ext3_journal_abort_handle(where, __func__, bh, handle,err);
+ return err;
+}
+
+int __ext3_journal_revoke(const char *where, handle_t *handle,
+ unsigned long blocknr, struct buffer_head *bh)
+{
+ int err = journal_revoke(handle, blocknr, bh);
+ if (err)
+ ext3_journal_abort_handle(where, __func__, bh, handle,err);
+ return err;
+}
+
+int __ext3_journal_get_create_access(const char *where,
+ handle_t *handle, struct buffer_head *bh)
+{
+ int err = journal_get_create_access(handle, bh);
+ if (err)
+ ext3_journal_abort_handle(where, __func__, bh, handle,err);
+ return err;
+}
+
+int __ext3_journal_dirty_metadata(const char *where,
+ handle_t *handle, struct buffer_head *bh)
+{
+ int err = journal_dirty_metadata(handle, bh);
+ if (err)
+ ext3_journal_abort_handle(where, __func__, bh, handle,err);
+ return err;
+}
diff --git a/fs/ext3/file.c b/fs/ext3/file.c
index 98e78345ead..a062fa1e1b1 100644
--- a/fs/ext3/file.c
+++ b/fs/ext3/file.c
@@ -18,11 +18,8 @@
* (jj@sunsite.ms.mff.cuni.cz)
*/
-#include <linux/time.h>
-#include <linux/fs.h>
-#include <linux/jbd.h>
-#include <linux/ext3_fs.h>
-#include <linux/ext3_jbd.h>
+#include <linux/quotaops.h>
+#include "ext3.h"
#include "xattr.h"
#include "acl.h"
@@ -33,13 +30,17 @@
*/
static int ext3_release_file (struct inode * inode, struct file * filp)
{
+ if (ext3_test_inode_state(inode, EXT3_STATE_FLUSH_ON_CLOSE)) {
+ filemap_flush(inode->i_mapping);
+ ext3_clear_inode_state(inode, EXT3_STATE_FLUSH_ON_CLOSE);
+ }
/* if we are the last writer on the inode, drop the block reservation */
if ((filp->f_mode & FMODE_WRITE) &&
(atomic_read(&inode->i_writecount) == 1))
{
- down(&EXT3_I(inode)->truncate_sem);
+ mutex_lock(&EXT3_I(inode)->truncate_mutex);
ext3_discard_reservation(inode);
- up(&EXT3_I(inode)->truncate_sem);
+ mutex_unlock(&EXT3_I(inode)->truncate_mutex);
}
if (is_dx(inode) && filp->private_data)
ext3_htree_free_dir_info(filp->private_data);
@@ -47,82 +48,25 @@ static int ext3_release_file (struct inode * inode, struct file * filp)
return 0;
}
-static ssize_t
-ext3_file_write(struct kiocb *iocb, const char __user *buf, size_t count, loff_t pos)
-{
- struct file *file = iocb->ki_filp;
- struct inode *inode = file->f_dentry->d_inode;
- ssize_t ret;
- int err;
-
- ret = generic_file_aio_write(iocb, buf, count, pos);
-
- /*
- * Skip flushing if there was an error, or if nothing was written.
- */
- if (ret <= 0)
- return ret;
-
- /*
- * If the inode is IS_SYNC, or is O_SYNC and we are doing data
- * journalling then we need to make sure that we force the transaction
- * to disk to keep all metadata uptodate synchronously.
- */
- if (file->f_flags & O_SYNC) {
- /*
- * If we are non-data-journaled, then the dirty data has
- * already been flushed to backing store by generic_osync_inode,
- * and the inode has been flushed too if there have been any
- * modifications other than mere timestamp updates.
- *
- * Open question --- do we care about flushing timestamps too
- * if the inode is IS_SYNC?
- */
- if (!ext3_should_journal_data(inode))
- return ret;
-
- goto force_commit;
- }
-
- /*
- * So we know that there has been no forced data flush. If the inode
- * is marked IS_SYNC, we need to force one ourselves.
- */
- if (!IS_SYNC(inode))
- return ret;
-
- /*
- * Open question #2 --- should we force data to disk here too? If we
- * don't, the only impact is that data=writeback filesystems won't
- * flush data to disk automatically on IS_SYNC, only metadata (but
- * historically, that is what ext2 has done.)
- */
-
-force_commit:
- err = ext3_force_commit(inode->i_sb);
- if (err)
- return err;
- return ret;
-}
-
-struct file_operations ext3_file_operations = {
+const struct file_operations ext3_file_operations = {
.llseek = generic_file_llseek,
- .read = do_sync_read,
- .write = do_sync_write,
- .aio_read = generic_file_aio_read,
- .aio_write = ext3_file_write,
- .readv = generic_file_readv,
- .writev = generic_file_writev,
- .ioctl = ext3_ioctl,
+ .read = new_sync_read,
+ .write = new_sync_write,
+ .read_iter = generic_file_read_iter,
+ .write_iter = generic_file_write_iter,
+ .unlocked_ioctl = ext3_ioctl,
+#ifdef CONFIG_COMPAT
+ .compat_ioctl = ext3_compat_ioctl,
+#endif
.mmap = generic_file_mmap,
- .open = generic_file_open,
+ .open = dquot_file_open,
.release = ext3_release_file,
.fsync = ext3_sync_file,
- .sendfile = generic_file_sendfile,
+ .splice_read = generic_file_splice_read,
+ .splice_write = iter_file_splice_write,
};
-struct inode_operations ext3_file_inode_operations = {
- .truncate = ext3_truncate,
+const struct inode_operations ext3_file_inode_operations = {
.setattr = ext3_setattr,
#ifdef CONFIG_EXT3_FS_XATTR
.setxattr = generic_setxattr,
@@ -130,6 +74,8 @@ struct inode_operations ext3_file_inode_operations = {
.listxattr = ext3_listxattr,
.removexattr = generic_removexattr,
#endif
- .permission = ext3_permission,
+ .get_acl = ext3_get_acl,
+ .set_acl = ext3_set_acl,
+ .fiemap = ext3_fiemap,
};
diff --git a/fs/ext3/fsync.c b/fs/ext3/fsync.c
index 49382a208e0..1cb9c7e10c6 100644
--- a/fs/ext3/fsync.c
+++ b/fs/ext3/fsync.c
@@ -8,27 +8,23 @@
* Universite Pierre et Marie Curie (Paris VI)
* from
* linux/fs/minix/truncate.c Copyright (C) 1991, 1992 Linus Torvalds
- *
+ *
* ext3fs fsync primitive
*
* Big-endian to little-endian byte-swapping/bitmaps by
* David S. Miller (davem@caip.rutgers.edu), 1995
- *
+ *
* Removed unnecessary code duplication for little endian machines
- * and excessive __inline__s.
+ * and excessive __inline__s.
* Andi Kleen, 1997
*
* Major simplications and cleanup - we only need to do the metadata, because
* we can depend on generic_block_fdatasync() to sync the data blocks.
*/
-#include <linux/time.h>
-#include <linux/fs.h>
-#include <linux/sched.h>
+#include <linux/blkdev.h>
#include <linux/writeback.h>
-#include <linux/jbd.h>
-#include <linux/ext3_fs.h>
-#include <linux/ext3_jbd.h>
+#include "ext3.h"
/*
* akpm: A new design for ext3_sync_file().
@@ -42,22 +38,34 @@
* inode to disk.
*/
-int ext3_sync_file(struct file * file, struct dentry *dentry, int datasync)
+int ext3_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
{
- struct inode *inode = dentry->d_inode;
- int ret = 0;
+ struct inode *inode = file->f_mapping->host;
+ struct ext3_inode_info *ei = EXT3_I(inode);
+ journal_t *journal = EXT3_SB(inode->i_sb)->s_journal;
+ int ret, needs_barrier = 0;
+ tid_t commit_tid;
+
+ trace_ext3_sync_file_enter(file, datasync);
+
+ if (inode->i_sb->s_flags & MS_RDONLY) {
+ /* Make sure that we read updated state */
+ smp_rmb();
+ if (EXT3_SB(inode->i_sb)->s_mount_state & EXT3_ERROR_FS)
+ return -EROFS;
+ return 0;
+ }
+ ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
+ if (ret)
+ goto out;
- J_ASSERT(ext3_journal_current_handle() == 0);
+ J_ASSERT(ext3_journal_current_handle() == NULL);
/*
- * data=writeback:
+ * data=writeback,ordered:
* The caller's filemap_fdatawrite()/wait will sync the data.
- * sync_inode() will sync the metadata
- *
- * data=ordered:
- * The caller's filemap_fdatawrite() will write the data and
- * sync_inode() will write the inode if it is dirty. Then the caller's
- * filemap_fdatawait() will wait on the pages.
+ * Metadata is in the journal, we wait for a proper transaction
+ * to commit here.
*
* data=journal:
* filemap_fdatawrite won't do anything (the buffers are clean).
@@ -72,17 +80,30 @@ int ext3_sync_file(struct file * file, struct dentry *dentry, int datasync)
goto out;
}
+ if (datasync)
+ commit_tid = atomic_read(&ei->i_datasync_tid);
+ else
+ commit_tid = atomic_read(&ei->i_sync_tid);
+
+ if (test_opt(inode->i_sb, BARRIER) &&
+ !journal_trans_will_send_data_barrier(journal, commit_tid))
+ needs_barrier = 1;
+ log_start_commit(journal, commit_tid);
+ ret = log_wait_commit(journal, commit_tid);
+
/*
- * The VFS has written the file data. If the inode is unaltered
- * then we need not start a commit.
+ * In case we didn't commit a transaction, we have to flush
+ * disk caches manually so that data really is on persistent
+ * storage
*/
- if (inode->i_state & (I_DIRTY_SYNC|I_DIRTY_DATASYNC)) {
- struct writeback_control wbc = {
- .sync_mode = WB_SYNC_ALL,
- .nr_to_write = 0, /* sys_fsync did this */
- };
- ret = sync_inode(inode, &wbc);
+ if (needs_barrier) {
+ int err;
+
+ err = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
+ if (!ret)
+ ret = err;
}
out:
+ trace_ext3_sync_file_exit(inode, ret);
return ret;
}
diff --git a/fs/ext3/hash.c b/fs/ext3/hash.c
index 5a2d1235ead..ede315cdf12 100644
--- a/fs/ext3/hash.c
+++ b/fs/ext3/hash.c
@@ -4,15 +4,12 @@
* Copyright (C) 2002 by Theodore Ts'o
*
* This file is released under the GPL v2.
- *
+ *
* This file may be redistributed under the terms of the GNU Public
* License.
*/
-#include <linux/fs.h>
-#include <linux/jbd.h>
-#include <linux/sched.h>
-#include <linux/ext3_fs.h>
+#include "ext3.h"
#include <linux/cryptohash.h>
#define DELTA 0x9E3779B9
@@ -36,23 +33,71 @@ static void TEA_transform(__u32 buf[4], __u32 const in[])
/* The old legacy hash */
-static __u32 dx_hack_hash (const char *name, int len)
+static __u32 dx_hack_hash_unsigned(const char *name, int len)
{
- __u32 hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9;
+ __u32 hash, hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9;
+ const unsigned char *ucp = (const unsigned char *) name;
+
while (len--) {
- __u32 hash = hash1 + (hash0 ^ (*name++ * 7152373));
+ hash = hash1 + (hash0 ^ (((int) *ucp++) * 7152373));
- if (hash & 0x80000000) hash -= 0x7fffffff;
+ if (hash & 0x80000000)
+ hash -= 0x7fffffff;
hash1 = hash0;
hash0 = hash;
}
- return (hash0 << 1);
+ return hash0 << 1;
+}
+
+static __u32 dx_hack_hash_signed(const char *name, int len)
+{
+ __u32 hash, hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9;
+ const signed char *scp = (const signed char *) name;
+
+ while (len--) {
+ hash = hash1 + (hash0 ^ (((int) *scp++) * 7152373));
+
+ if (hash & 0x80000000)
+ hash -= 0x7fffffff;
+ hash1 = hash0;
+ hash0 = hash;
+ }
+ return hash0 << 1;
+}
+
+static void str2hashbuf_signed(const char *msg, int len, __u32 *buf, int num)
+{
+ __u32 pad, val;
+ int i;
+ const signed char *scp = (const signed char *) msg;
+
+ pad = (__u32)len | ((__u32)len << 8);
+ pad |= pad << 16;
+
+ val = pad;
+ if (len > num*4)
+ len = num * 4;
+ for (i = 0; i < len; i++) {
+ if ((i % 4) == 0)
+ val = pad;
+ val = ((int) scp[i]) + (val << 8);
+ if ((i % 4) == 3) {
+ *buf++ = val;
+ val = pad;
+ num--;
+ }
+ }
+ if (--num >= 0)
+ *buf++ = val;
+ while (--num >= 0)
+ *buf++ = pad;
}
-static void str2hashbuf(const char *msg, int len, __u32 *buf, int num)
+static void str2hashbuf_unsigned(const char *msg, int len, __u32 *buf, int num)
{
__u32 pad, val;
int i;
+ const unsigned char *ucp = (const unsigned char *) msg;
pad = (__u32)len | ((__u32)len << 8);
pad |= pad << 16;
@@ -63,7 +108,7 @@ static void str2hashbuf(const char *msg, int len, __u32 *buf, int num)
for (i=0; i < len; i++) {
if ((i % 4) == 0)
val = pad;
- val = msg[i] + (val << 8);
+ val = ((int) ucp[i]) + (val << 8);
if ((i % 4) == 3) {
*buf++ = val;
val = pad;
@@ -80,11 +125,11 @@ static void str2hashbuf(const char *msg, int len, __u32 *buf, int num)
* Returns the hash of a filename. If len is 0 and name is NULL, then
* this function can be used to test whether or not a hash version is
* supported.
- *
+ *
* The seed is an 4 longword (32 bits) "secret" which can be used to
* uniquify a hash. If the seed is all zero's, then some default seed
* may be used.
- *
+ *
* A particular hash version specifies whether or not the seed is
* represented, and whether or not the returned hash is 32 bits or 64
* bits. 32 bit hashes will return 0 for the minor hash.
@@ -95,7 +140,9 @@ int ext3fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo)
__u32 minor_hash = 0;
const char *p;
int i;
- __u32 in[8], buf[4];
+ __u32 in[8], buf[4];
+ void (*str2hashbuf)(const char *, int, __u32 *, int) =
+ str2hashbuf_signed;
/* Initialize the default seed for the hash checksum functions */
buf[0] = 0x67452301;
@@ -114,13 +161,18 @@ int ext3fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo)
}
switch (hinfo->hash_version) {
+ case DX_HASH_LEGACY_UNSIGNED:
+ hash = dx_hack_hash_unsigned(name, len);
+ break;
case DX_HASH_LEGACY:
- hash = dx_hack_hash(name, len);
+ hash = dx_hack_hash_signed(name, len);
break;
+ case DX_HASH_HALF_MD4_UNSIGNED:
+ str2hashbuf = str2hashbuf_unsigned;
case DX_HASH_HALF_MD4:
p = name;
while (len > 0) {
- str2hashbuf(p, len, in, 8);
+ (*str2hashbuf)(p, len, in, 8);
half_md4_transform(buf, in);
len -= 32;
p += 32;
@@ -128,10 +180,12 @@ int ext3fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo)
minor_hash = buf[2];
hash = buf[1];
break;
+ case DX_HASH_TEA_UNSIGNED:
+ str2hashbuf = str2hashbuf_unsigned;
case DX_HASH_TEA:
p = name;
while (len > 0) {
- str2hashbuf(p, len, in, 4);
+ (*str2hashbuf)(p, len, in, 4);
TEA_transform(buf, in);
len -= 16;
p += 16;
@@ -144,8 +198,8 @@ int ext3fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo)
return -1;
}
hash = hash & ~1;
- if (hash == (EXT3_HTREE_EOF << 1))
- hash = (EXT3_HTREE_EOF-1) << 1;
+ if (hash == (EXT3_HTREE_EOF_32BIT << 1))
+ hash = (EXT3_HTREE_EOF_32BIT - 1) << 1;
hinfo->hash = hash;
hinfo->minor_hash = minor_hash;
return 0;
diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c
index dc826464f31..a1b810230cc 100644
--- a/fs/ext3/ialloc.c
+++ b/fs/ext3/ialloc.c
@@ -12,20 +12,10 @@
* David S. Miller (davem@caip.rutgers.edu), 1995
*/
-#include <linux/time.h>
-#include <linux/fs.h>
-#include <linux/jbd.h>
-#include <linux/ext3_fs.h>
-#include <linux/ext3_jbd.h>
-#include <linux/stat.h>
-#include <linux/string.h>
#include <linux/quotaops.h>
-#include <linux/buffer_head.h>
#include <linux/random.h>
-#include <linux/bitops.h>
-
-#include <asm/byteorder.h>
+#include "ext3.h"
#include "xattr.h"
#include "acl.h"
@@ -118,21 +108,10 @@ void ext3_free_inode (handle_t *handle, struct inode * inode)
ino = inode->i_ino;
ext3_debug ("freeing inode %lu\n", ino);
-
- /*
- * Note: we must free any quota before locking the superblock,
- * as writing the quota to disk may need the lock as well.
- */
- DQUOT_INIT(inode);
- ext3_xattr_delete_inode(handle, inode);
- DQUOT_FREE_INODE(inode);
- DQUOT_DROP(inode);
+ trace_ext3_free_inode(inode);
is_directory = S_ISDIR(inode->i_mode);
- /* Do this BEFORE marking the inode not in use or returning an error */
- clear_inode (inode);
-
es = EXT3_SB(sb)->s_es;
if (ino < EXT3_FIRST_INO(sb) || ino > le32_to_cpu(es->s_inodes_count)) {
ext3_error (sb, "ext3_free_inode",
@@ -164,11 +143,9 @@ void ext3_free_inode (handle_t *handle, struct inode * inode)
if (gdp) {
spin_lock(sb_bgl_lock(sbi, block_group));
- gdp->bg_free_inodes_count = cpu_to_le16(
- le16_to_cpu(gdp->bg_free_inodes_count) + 1);
+ le16_add_cpu(&gdp->bg_free_inodes_count, 1);
if (is_directory)
- gdp->bg_used_dirs_count = cpu_to_le16(
- le16_to_cpu(gdp->bg_used_dirs_count) - 1);
+ le16_add_cpu(&gdp->bg_used_dirs_count, -1);
spin_unlock(sb_bgl_lock(sbi, block_group));
percpu_counter_inc(&sbi->s_freeinodes_counter);
if (is_directory)
@@ -183,91 +160,49 @@ void ext3_free_inode (handle_t *handle, struct inode * inode)
err = ext3_journal_dirty_metadata(handle, bitmap_bh);
if (!fatal)
fatal = err;
- sb->s_dirt = 1;
+
error_return:
brelse(bitmap_bh);
ext3_std_error(sb, fatal);
}
/*
- * There are two policies for allocating an inode. If the new inode is
- * a directory, then a forward search is made for a block group with both
- * free space and a low directory-to-inode ratio; if that fails, then of
- * the groups with above-average free space, that group with the fewest
- * directories already is chosen.
+ * Orlov's allocator for directories.
*
- * For other inodes, search forward from the parent directory\'s block
- * group to find a free inode.
- */
-static int find_group_dir(struct super_block *sb, struct inode *parent)
-{
- int ngroups = EXT3_SB(sb)->s_groups_count;
- int freei, avefreei;
- struct ext3_group_desc *desc, *best_desc = NULL;
- struct buffer_head *bh;
- int group, best_group = -1;
-
- freei = percpu_counter_read_positive(&EXT3_SB(sb)->s_freeinodes_counter);
- avefreei = freei / ngroups;
-
- for (group = 0; group < ngroups; group++) {
- desc = ext3_get_group_desc (sb, group, &bh);
- if (!desc || !desc->bg_free_inodes_count)
- continue;
- if (le16_to_cpu(desc->bg_free_inodes_count) < avefreei)
- continue;
- if (!best_desc ||
- (le16_to_cpu(desc->bg_free_blocks_count) >
- le16_to_cpu(best_desc->bg_free_blocks_count))) {
- best_group = group;
- best_desc = desc;
- }
- }
- return best_group;
-}
-
-/*
- * Orlov's allocator for directories.
- *
* We always try to spread first-level directories.
*
- * If there are blockgroups with both free inodes and free blocks counts
- * not worse than average we return one with smallest directory count.
- * Otherwise we simply return a random group.
- *
- * For the rest rules look so:
- *
- * It's OK to put directory into a group unless
- * it has too many directories already (max_dirs) or
- * it has too few free inodes left (min_inodes) or
- * it has too few free blocks left (min_blocks) or
- * it's already running too large debt (max_debt).
- * Parent's group is prefered, if it doesn't satisfy these
- * conditions we search cyclically through the rest. If none
- * of the groups look good we just look for a group with more
- * free inodes than average (starting at parent's group).
- *
- * Debt is incremented each time we allocate a directory and decremented
- * when we allocate an inode, within 0--255.
- */
-
-#define INODE_COST 64
-#define BLOCK_COST 256
+ * If there are blockgroups with both free inodes and free blocks counts
+ * not worse than average we return one with smallest directory count.
+ * Otherwise we simply return a random group.
+ *
+ * For the rest rules look so:
+ *
+ * It's OK to put directory into a group unless
+ * it has too many directories already (max_dirs) or
+ * it has too few free inodes left (min_inodes) or
+ * it has too few free blocks left (min_blocks).
+ * Parent's group is preferred, if it doesn't satisfy these
+ * conditions we search cyclically through the rest. If none
+ * of the groups look good we just look for a group with more
+ * free inodes than average (starting at parent's group).
+ *
+ * Debt is incremented each time we allocate a directory and decremented
+ * when we allocate an inode, within 0--255.
+ */
static int find_group_orlov(struct super_block *sb, struct inode *parent)
{
int parent_group = EXT3_I(parent)->i_block_group;
struct ext3_sb_info *sbi = EXT3_SB(sb);
- struct ext3_super_block *es = sbi->s_es;
int ngroups = sbi->s_groups_count;
int inodes_per_group = EXT3_INODES_PER_GROUP(sb);
- int freei, avefreei;
- int freeb, avefreeb;
- int blocks_per_dir, ndirs;
- int max_debt, max_dirs, min_blocks, min_inodes;
+ unsigned int freei, avefreei;
+ ext3_fsblk_t freeb, avefreeb;
+ unsigned int ndirs;
+ int max_dirs, min_inodes;
+ ext3_grpblk_t min_blocks;
int group = -1, i;
struct ext3_group_desc *desc;
- struct buffer_head *bh;
freei = percpu_counter_read_positive(&sbi->s_freeinodes_counter);
avefreei = freei / ngroups;
@@ -280,11 +215,11 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent)
int best_ndir = inodes_per_group;
int best_group = -1;
- get_random_bytes(&group, sizeof(group));
+ group = prandom_u32();
parent_group = (unsigned)group % ngroups;
for (i = 0; i < ngroups; i++) {
group = (parent_group + i) % ngroups;
- desc = ext3_get_group_desc (sb, group, &bh);
+ desc = ext3_get_group_desc (sb, group, NULL);
if (!desc || !desc->bg_free_inodes_count)
continue;
if (le16_to_cpu(desc->bg_used_dirs_count) >= best_ndir)
@@ -301,23 +236,13 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent)
goto fallback;
}
- blocks_per_dir = (le32_to_cpu(es->s_blocks_count) - freeb) / ndirs;
-
max_dirs = ndirs / ngroups + inodes_per_group / 16;
min_inodes = avefreei - inodes_per_group / 4;
min_blocks = avefreeb - EXT3_BLOCKS_PER_GROUP(sb) / 4;
- max_debt = EXT3_BLOCKS_PER_GROUP(sb) / max(blocks_per_dir, BLOCK_COST);
- if (max_debt * INODE_COST > inodes_per_group)
- max_debt = inodes_per_group / INODE_COST;
- if (max_debt > 255)
- max_debt = 255;
- if (max_debt == 0)
- max_debt = 1;
-
for (i = 0; i < ngroups; i++) {
group = (parent_group + i) % ngroups;
- desc = ext3_get_group_desc (sb, group, &bh);
+ desc = ext3_get_group_desc (sb, group, NULL);
if (!desc || !desc->bg_free_inodes_count)
continue;
if (le16_to_cpu(desc->bg_used_dirs_count) >= max_dirs)
@@ -332,7 +257,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent)
fallback:
for (i = 0; i < ngroups; i++) {
group = (parent_group + i) % ngroups;
- desc = ext3_get_group_desc (sb, group, &bh);
+ desc = ext3_get_group_desc (sb, group, NULL);
if (!desc || !desc->bg_free_inodes_count)
continue;
if (le16_to_cpu(desc->bg_free_inodes_count) >= avefreei)
@@ -356,14 +281,13 @@ static int find_group_other(struct super_block *sb, struct inode *parent)
int parent_group = EXT3_I(parent)->i_block_group;
int ngroups = EXT3_SB(sb)->s_groups_count;
struct ext3_group_desc *desc;
- struct buffer_head *bh;
int group, i;
/*
* Try to place the inode in its parent directory
*/
group = parent_group;
- desc = ext3_get_group_desc (sb, group, &bh);
+ desc = ext3_get_group_desc (sb, group, NULL);
if (desc && le16_to_cpu(desc->bg_free_inodes_count) &&
le16_to_cpu(desc->bg_free_blocks_count))
return group;
@@ -387,7 +311,7 @@ static int find_group_other(struct super_block *sb, struct inode *parent)
group += i;
if (group >= ngroups)
group -= ngroups;
- desc = ext3_get_group_desc (sb, group, &bh);
+ desc = ext3_get_group_desc (sb, group, NULL);
if (desc && le16_to_cpu(desc->bg_free_inodes_count) &&
le16_to_cpu(desc->bg_free_blocks_count))
return group;
@@ -401,7 +325,7 @@ static int find_group_other(struct super_block *sb, struct inode *parent)
for (i = 0; i < ngroups; i++) {
if (++group >= ngroups)
group = 0;
- desc = ext3_get_group_desc (sb, group, &bh);
+ desc = ext3_get_group_desc (sb, group, NULL);
if (desc && le16_to_cpu(desc->bg_free_inodes_count))
return group;
}
@@ -419,7 +343,8 @@ static int find_group_other(struct super_block *sb, struct inode *parent)
* For other inodes, search forward from the parent directory's block
* group to find a free inode.
*/
-struct inode *ext3_new_inode(handle_t *handle, struct inode * dir, int mode)
+struct inode *ext3_new_inode(handle_t *handle, struct inode * dir,
+ const struct qstr *qstr, umode_t mode)
{
struct super_block *sb;
struct buffer_head *bitmap_bh = NULL;
@@ -440,6 +365,7 @@ struct inode *ext3_new_inode(handle_t *handle, struct inode * dir, int mode)
return ERR_PTR(-EPERM);
sb = dir->i_sb;
+ trace_ext3_request_inode(dir, mode);
inode = new_inode(sb);
if (!inode)
return ERR_PTR(-ENOMEM);
@@ -447,12 +373,9 @@ struct inode *ext3_new_inode(handle_t *handle, struct inode * dir, int mode)
sbi = EXT3_SB(sb);
es = sbi->s_es;
- if (S_ISDIR(mode)) {
- if (test_opt (sb, OLDALLOC))
- group = find_group_dir(sb, dir);
- else
- group = find_group_orlov(sb, dir);
- } else
+ if (S_ISDIR(mode))
+ group = find_group_orlov(sb, dir);
+ else
group = find_group_other(sb, dir);
err = -ENOSPC;
@@ -528,11 +451,9 @@ got:
err = ext3_journal_get_write_access(handle, bh2);
if (err) goto fail;
spin_lock(sb_bgl_lock(sbi, group));
- gdp->bg_free_inodes_count =
- cpu_to_le16(le16_to_cpu(gdp->bg_free_inodes_count) - 1);
+ le16_add_cpu(&gdp->bg_free_inodes_count, -1);
if (S_ISDIR(mode)) {
- gdp->bg_used_dirs_count =
- cpu_to_le16(le16_to_cpu(gdp->bg_used_dirs_count) + 1);
+ le16_add_cpu(&gdp->bg_used_dirs_count, 1);
}
spin_unlock(sb_bgl_lock(sbi, group));
BUFFER_TRACE(bh2, "call ext3_journal_dirty_metadata");
@@ -542,22 +463,17 @@ got:
percpu_counter_dec(&sbi->s_freeinodes_counter);
if (S_ISDIR(mode))
percpu_counter_inc(&sbi->s_dirs_counter);
- sb->s_dirt = 1;
- inode->i_uid = current->fsuid;
- if (test_opt (sb, GRPID))
- inode->i_gid = dir->i_gid;
- else if (dir->i_mode & S_ISGID) {
+
+ if (test_opt(sb, GRPID)) {
+ inode->i_mode = mode;
+ inode->i_uid = current_fsuid();
inode->i_gid = dir->i_gid;
- if (S_ISDIR(mode))
- mode |= S_ISGID;
} else
- inode->i_gid = current->fsgid;
- inode->i_mode = mode;
+ inode_init_owner(inode, dir, mode);
inode->i_ino = ino;
/* This is the optimal IO size (for stat), not the fs block size */
- inode->i_blksize = PAGE_SIZE;
inode->i_blocks = 0;
inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
@@ -565,12 +481,8 @@ got:
ei->i_dir_start_lookup = 0;
ei->i_disksize = 0;
- ei->i_flags = EXT3_I(dir)->i_flags & ~EXT3_INDEX_FL;
- if (S_ISLNK(mode))
- ei->i_flags &= ~(EXT3_IMMUTABLE_FL|EXT3_APPEND_FL);
- /* dirsync only applies to directories */
- if (!S_ISDIR(mode))
- ei->i_flags &= ~EXT3_DIRSYNC_FL;
+ ei->i_flags =
+ ext3_mask_flags(mode, EXT3_I(dir)->i_flags & EXT3_FL_INHERITED);
#ifdef EXT3_FRAGMENTS
ei->i_faddr = 0;
ei->i_frag_no = 0;
@@ -585,27 +497,41 @@ got:
ext3_set_inode_flags(inode);
if (IS_DIRSYNC(inode))
handle->h_sync = 1;
- insert_inode_hash(inode);
+ if (insert_inode_locked(inode) < 0) {
+ /*
+ * Likely a bitmap corruption causing inode to be allocated
+ * twice.
+ */
+ err = -EIO;
+ goto fail;
+ }
spin_lock(&sbi->s_next_gen_lock);
inode->i_generation = sbi->s_next_generation++;
spin_unlock(&sbi->s_next_gen_lock);
- ei->i_state = EXT3_STATE_NEW;
- ei->i_extra_isize =
- (EXT3_INODE_SIZE(inode->i_sb) > EXT3_GOOD_OLD_INODE_SIZE) ?
- sizeof(struct ext3_inode) - EXT3_GOOD_OLD_INODE_SIZE : 0;
+ ei->i_state_flags = 0;
+ ext3_set_inode_state(inode, EXT3_STATE_NEW);
+
+ /* See comment in ext3_iget for explanation */
+ if (ino >= EXT3_FIRST_INO(sb) + 1 &&
+ EXT3_INODE_SIZE(sb) > EXT3_GOOD_OLD_INODE_SIZE) {
+ ei->i_extra_isize =
+ sizeof(struct ext3_inode) - EXT3_GOOD_OLD_INODE_SIZE;
+ } else {
+ ei->i_extra_isize = 0;
+ }
ret = inode;
- if(DQUOT_ALLOC_INODE(inode)) {
- err = -EDQUOT;
+ dquot_initialize(inode);
+ err = dquot_alloc_inode(inode);
+ if (err)
goto fail_drop;
- }
err = ext3_init_acl(handle, inode, dir);
if (err)
goto fail_free_drop;
- err = ext3_init_security(handle,inode, dir);
+ err = ext3_init_security(handle, inode, dir, qstr);
if (err)
goto fail_free_drop;
@@ -616,6 +542,7 @@ got:
}
ext3_debug("allocating inode %lu\n", inode->i_ino);
+ trace_ext3_allocate_inode(inode, dir, mode);
goto really_out;
fail:
ext3_std_error(sb, err);
@@ -627,12 +554,13 @@ really_out:
return ret;
fail_free_drop:
- DQUOT_FREE_INODE(inode);
+ dquot_free_inode(inode);
fail_drop:
- DQUOT_DROP(inode);
+ dquot_drop(inode);
inode->i_flags |= S_NOQUOTA;
- inode->i_nlink = 0;
+ clear_nlink(inode);
+ unlock_new_inode(inode);
iput(inode);
brelse(bitmap_bh);
return ERR_PTR(err);
@@ -644,54 +572,75 @@ struct inode *ext3_orphan_get(struct super_block *sb, unsigned long ino)
unsigned long max_ino = le32_to_cpu(EXT3_SB(sb)->s_es->s_inodes_count);
unsigned long block_group;
int bit;
- struct buffer_head *bitmap_bh = NULL;
+ struct buffer_head *bitmap_bh;
struct inode *inode = NULL;
+ long err = -EIO;
/* Error cases - e2fsck has already cleaned up for us */
if (ino > max_ino) {
- ext3_warning(sb, __FUNCTION__,
+ ext3_warning(sb, __func__,
"bad orphan ino %lu! e2fsck was run?", ino);
- goto out;
+ goto error;
}
block_group = (ino - 1) / EXT3_INODES_PER_GROUP(sb);
bit = (ino - 1) % EXT3_INODES_PER_GROUP(sb);
bitmap_bh = read_inode_bitmap(sb, block_group);
if (!bitmap_bh) {
- ext3_warning(sb, __FUNCTION__,
+ ext3_warning(sb, __func__,
"inode bitmap error for orphan %lu", ino);
- goto out;
+ goto error;
}
/* Having the inode bit set should be a 100% indicator that this
* is a valid orphan (no e2fsck run on fs). Orphans also include
* inodes that were being truncated, so we can't check i_nlink==0.
*/
- if (!ext3_test_bit(bit, bitmap_bh->b_data) ||
- !(inode = iget(sb, ino)) || is_bad_inode(inode) ||
- NEXT_ORPHAN(inode) > max_ino) {
- ext3_warning(sb, __FUNCTION__,
- "bad orphan inode %lu! e2fsck was run?", ino);
- printk(KERN_NOTICE "ext3_test_bit(bit=%d, block=%llu) = %d\n",
- bit, (unsigned long long)bitmap_bh->b_blocknr,
- ext3_test_bit(bit, bitmap_bh->b_data));
- printk(KERN_NOTICE "inode=%p\n", inode);
- if (inode) {
- printk(KERN_NOTICE "is_bad_inode(inode)=%d\n",
- is_bad_inode(inode));
- printk(KERN_NOTICE "NEXT_ORPHAN(inode)=%u\n",
- NEXT_ORPHAN(inode));
- printk(KERN_NOTICE "max_ino=%lu\n", max_ino);
- }
+ if (!ext3_test_bit(bit, bitmap_bh->b_data))
+ goto bad_orphan;
+
+ inode = ext3_iget(sb, ino);
+ if (IS_ERR(inode))
+ goto iget_failed;
+
+ /*
+ * If the orphan has i_nlink > 0 then it should be able to be
+ * truncated, otherwise it won't be removed from the orphan list
+ * during processing and an infinite loop will result.
+ */
+ if (inode->i_nlink && !ext3_can_truncate(inode))
+ goto bad_orphan;
+
+ if (NEXT_ORPHAN(inode) > max_ino)
+ goto bad_orphan;
+ brelse(bitmap_bh);
+ return inode;
+
+iget_failed:
+ err = PTR_ERR(inode);
+ inode = NULL;
+bad_orphan:
+ ext3_warning(sb, __func__,
+ "bad orphan inode %lu! e2fsck was run?", ino);
+ printk(KERN_NOTICE "ext3_test_bit(bit=%d, block=%llu) = %d\n",
+ bit, (unsigned long long)bitmap_bh->b_blocknr,
+ ext3_test_bit(bit, bitmap_bh->b_data));
+ printk(KERN_NOTICE "inode=%p\n", inode);
+ if (inode) {
+ printk(KERN_NOTICE "is_bad_inode(inode)=%d\n",
+ is_bad_inode(inode));
+ printk(KERN_NOTICE "NEXT_ORPHAN(inode)=%u\n",
+ NEXT_ORPHAN(inode));
+ printk(KERN_NOTICE "max_ino=%lu\n", max_ino);
+ printk(KERN_NOTICE "i_nlink=%u\n", inode->i_nlink);
/* Avoid freeing blocks if we got a bad deleted inode */
- if (inode && inode->i_nlink == 0)
+ if (inode->i_nlink == 0)
inode->i_blocks = 0;
iput(inode);
- inode = NULL;
}
-out:
brelse(bitmap_bh);
- return inode;
+error:
+ return ERR_PTR(err);
}
unsigned long ext3_count_free_inodes (struct super_block * sb)
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index 8824e84f8a5..2c6ccc49ba2 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -13,63 +13,56 @@
* Copyright (C) 1991, 1992 Linus Torvalds
*
* Goal-directed block allocation by Stephen Tweedie
- * (sct@redhat.com), 1993, 1998
+ * (sct@redhat.com), 1993, 1998
* Big-endian to little-endian byte-swapping/bitmaps by
* David S. Miller (davem@caip.rutgers.edu), 1995
* 64-bit file support on 64-bit platforms by Jakub Jelinek
- * (jj@sunsite.ms.mff.cuni.cz)
+ * (jj@sunsite.ms.mff.cuni.cz)
*
* Assorted race fixes, rewrite of ext3_get_block() by Al Viro, 2000
*/
-#include <linux/module.h>
-#include <linux/fs.h>
-#include <linux/time.h>
-#include <linux/ext3_jbd.h>
-#include <linux/jbd.h>
-#include <linux/smp_lock.h>
#include <linux/highuid.h>
-#include <linux/pagemap.h>
#include <linux/quotaops.h>
-#include <linux/string.h>
-#include <linux/buffer_head.h>
#include <linux/writeback.h>
#include <linux/mpage.h>
-#include <linux/uio.h>
+#include <linux/namei.h>
+#include <linux/aio.h>
+#include "ext3.h"
#include "xattr.h"
#include "acl.h"
static int ext3_writepage_trans_blocks(struct inode *inode);
+static int ext3_block_truncate_page(struct inode *inode, loff_t from);
/*
* Test whether an inode is a fast symlink.
*/
-static inline int ext3_inode_is_fast_symlink(struct inode *inode)
+static int ext3_inode_is_fast_symlink(struct inode *inode)
{
int ea_blocks = EXT3_I(inode)->i_file_acl ?
(inode->i_sb->s_blocksize >> 9) : 0;
- return (S_ISLNK(inode->i_mode) &&
- inode->i_blocks - ea_blocks == 0);
+ return (S_ISLNK(inode->i_mode) && inode->i_blocks - ea_blocks == 0);
}
-/* The ext3 forget function must perform a revoke if we are freeing data
+/*
+ * The ext3 forget function must perform a revoke if we are freeing data
* which has been journaled. Metadata (eg. indirect blocks) must be
- * revoked in all cases.
+ * revoked in all cases.
*
* "bh" may be NULL: a metadata block may have been freed from memory
* but there may still be a record of it in the journal, and that record
* still needs to be revoked.
*/
-
-int ext3_forget(handle_t *handle, int is_metadata,
- struct inode *inode, struct buffer_head *bh,
- int blocknr)
+int ext3_forget(handle_t *handle, int is_metadata, struct inode *inode,
+ struct buffer_head *bh, ext3_fsblk_t blocknr)
{
int err;
might_sleep();
+ trace_ext3_forget(inode, is_metadata, blocknr);
BUFFER_TRACE(bh, "enter");
jbd_debug(4, "forgetting bh %p: is_metadata = %d, mode %o, "
@@ -97,18 +90,17 @@ int ext3_forget(handle_t *handle, int is_metadata,
BUFFER_TRACE(bh, "call ext3_journal_revoke");
err = ext3_journal_revoke(handle, blocknr, bh);
if (err)
- ext3_abort(inode->i_sb, __FUNCTION__,
+ ext3_abort(inode->i_sb, __func__,
"error %d when attempting revoke", err);
BUFFER_TRACE(bh, "exit");
return err;
}
/*
- * Work out how many blocks we need to progress with the next chunk of a
+ * Work out how many blocks we need to proceed with the next chunk of a
* truncate transaction.
*/
-
-static unsigned long blocks_for_truncate(struct inode *inode)
+static unsigned long blocks_for_truncate(struct inode *inode)
{
unsigned long needed;
@@ -125,13 +117,13 @@ static unsigned long blocks_for_truncate(struct inode *inode)
/* But we need to bound the transaction so we don't overflow the
* journal. */
- if (needed > EXT3_MAX_TRANS_DATA)
+ if (needed > EXT3_MAX_TRANS_DATA)
needed = EXT3_MAX_TRANS_DATA;
return EXT3_DATA_TRANS_BLOCKS(inode->i_sb) + needed;
}
-/*
+/*
* Truncate transactions can be complex and absolutely huge. So we need to
* be able to restart the transaction at a conventient checkpoint to make
* sure we don't overflow the journal.
@@ -139,10 +131,9 @@ static unsigned long blocks_for_truncate(struct inode *inode)
* start_transaction gets us a new handle for a truncate transaction,
* and extend_transaction tries to extend the existing one a bit. If
* extend fails, we need to propagate the failure up and restart the
- * transaction in the top-level truncate loop. --sct
+ * transaction in the top-level truncate loop. --sct
*/
-
-static handle_t *start_transaction(struct inode *inode)
+static handle_t *start_transaction(struct inode *inode)
{
handle_t *result;
@@ -174,29 +165,87 @@ static int try_to_extend_transaction(handle_t *handle, struct inode *inode)
* so before we call here everything must be consistently dirtied against
* this transaction.
*/
-static int ext3_journal_test_restart(handle_t *handle, struct inode *inode)
+static int truncate_restart_transaction(handle_t *handle, struct inode *inode)
{
+ int ret;
+
jbd_debug(2, "restarting handle %p\n", handle);
- return ext3_journal_restart(handle, blocks_for_truncate(inode));
+ /*
+ * Drop truncate_mutex to avoid deadlock with ext3_get_blocks_handle.
+ * At this moment, get_block can be called only for blocks inside
+ * i_size since page cache has been already dropped and writes are
+ * blocked by i_mutex. So we can safely drop the truncate_mutex.
+ */
+ mutex_unlock(&EXT3_I(inode)->truncate_mutex);
+ ret = ext3_journal_restart(handle, blocks_for_truncate(inode));
+ mutex_lock(&EXT3_I(inode)->truncate_mutex);
+ return ret;
}
/*
- * Called at the last iput() if i_nlink is zero.
+ * Called at inode eviction from icache
*/
-void ext3_delete_inode (struct inode * inode)
+void ext3_evict_inode (struct inode *inode)
{
+ struct ext3_inode_info *ei = EXT3_I(inode);
+ struct ext3_block_alloc_info *rsv;
handle_t *handle;
+ int want_delete = 0;
+
+ trace_ext3_evict_inode(inode);
+ if (!inode->i_nlink && !is_bad_inode(inode)) {
+ dquot_initialize(inode);
+ want_delete = 1;
+ }
+
+ /*
+ * When journalling data, dirty buffers are tracked only in the journal.
+ * So although mm thinks everything is clean and ready for reaping the
+ * inode might still have some pages to write in the running
+ * transaction or waiting to be checkpointed. Thus calling
+ * journal_invalidatepage() (via truncate_inode_pages()) to discard
+ * these buffers can cause data loss. Also even if we did not discard
+ * these buffers, we would have no way to find them after the inode
+ * is reaped and thus user could see stale data if he tries to read
+ * them before the transaction is checkpointed. So be careful and
+ * force everything to disk here... We use ei->i_datasync_tid to
+ * store the newest transaction containing inode's data.
+ *
+ * Note that directories do not have this problem because they don't
+ * use page cache.
+ *
+ * The s_journal check handles the case when ext3_get_journal() fails
+ * and puts the journal inode.
+ */
+ if (inode->i_nlink && ext3_should_journal_data(inode) &&
+ EXT3_SB(inode->i_sb)->s_journal &&
+ (S_ISLNK(inode->i_mode) || S_ISREG(inode->i_mode)) &&
+ inode->i_ino != EXT3_JOURNAL_INO) {
+ tid_t commit_tid = atomic_read(&ei->i_datasync_tid);
+ journal_t *journal = EXT3_SB(inode->i_sb)->s_journal;
+
+ log_start_commit(journal, commit_tid);
+ log_wait_commit(journal, commit_tid);
+ filemap_write_and_wait(&inode->i_data);
+ }
+ truncate_inode_pages_final(&inode->i_data);
- truncate_inode_pages(&inode->i_data, 0);
+ ext3_discard_reservation(inode);
+ rsv = ei->i_block_alloc_info;
+ ei->i_block_alloc_info = NULL;
+ if (unlikely(rsv))
+ kfree(rsv);
- if (is_bad_inode(inode))
+ if (!want_delete)
goto no_delete;
handle = start_transaction(inode);
if (IS_ERR(handle)) {
- /* If we're going to skip the normal cleanup, we still
- * need to make sure that the in-core orphan linked list
- * is properly cleaned up. */
+ /*
+ * If we're going to skip the normal cleanup, we still need to
+ * make sure that the in-core orphan linked list is properly
+ * cleaned up.
+ */
ext3_orphan_del(NULL, inode);
goto no_delete;
}
@@ -207,44 +256,39 @@ void ext3_delete_inode (struct inode * inode)
if (inode->i_blocks)
ext3_truncate(inode);
/*
- * Kill off the orphan record which ext3_truncate created.
- * AKPM: I think this can be inside the above `if'.
- * Note that ext3_orphan_del() has to be able to cope with the
- * deletion of a non-existent orphan - this is because we don't
- * know if ext3_truncate() actually created an orphan record.
- * (Well, we could do this if we need to, but heck - it works)
+ * Kill off the orphan record created when the inode lost the last
+ * link. Note that ext3_orphan_del() has to be able to cope with the
+ * deletion of a non-existent orphan - ext3_truncate() could
+ * have removed the record.
*/
ext3_orphan_del(handle, inode);
- EXT3_I(inode)->i_dtime = get_seconds();
+ ei->i_dtime = get_seconds();
- /*
+ /*
* One subtle ordering requirement: if anything has gone wrong
* (transaction abort, IO errors, whatever), then we can still
* do these next steps (the fs will already have been marked as
* having errors), but we can't free the inode if the mark_dirty
- * fails.
+ * fails.
*/
- if (ext3_mark_inode_dirty(handle, inode))
- /* If that failed, just do the required in-core inode clear. */
+ if (ext3_mark_inode_dirty(handle, inode)) {
+ /* If that failed, just dquot_drop() and be done with that */
+ dquot_drop(inode);
+ clear_inode(inode);
+ } else {
+ ext3_xattr_delete_inode(handle, inode);
+ dquot_free_inode(inode);
+ dquot_drop(inode);
clear_inode(inode);
- else
ext3_free_inode(handle, inode);
+ }
ext3_journal_stop(handle);
return;
no_delete:
- clear_inode(inode); /* We must guarantee clearing of inode... */
-}
-
-static int ext3_alloc_block (handle_t *handle,
- struct inode * inode, unsigned long goal, int *err)
-{
- unsigned long result;
-
- result = ext3_new_block(handle, inode, goal, err);
- return result;
+ clear_inode(inode);
+ dquot_drop(inode);
}
-
typedef struct {
__le32 *p;
__le32 key;
@@ -257,7 +301,7 @@ static inline void add_chain(Indirect *p, struct buffer_head *bh, __le32 *v)
p->bh = bh;
}
-static inline int verify_chain(Indirect *from, Indirect *to)
+static int verify_chain(Indirect *from, Indirect *to)
{
while (from <= to && from->key == *from->p)
from++;
@@ -327,10 +371,10 @@ static int ext3_block_to_path(struct inode *inode,
offsets[n++] = i_block & (ptrs - 1);
final = ptrs;
} else {
- ext3_warning (inode->i_sb, "ext3_block_to_path", "block > big");
+ ext3_warning(inode->i_sb, "ext3_block_to_path", "block > big");
}
if (boundary)
- *boundary = (i_block & (ptrs - 1)) == (final - 1);
+ *boundary = final - 1 - (i_block & (ptrs - 1));
return n;
}
@@ -404,13 +448,13 @@ no_block:
* @inode: owner
* @ind: descriptor of indirect block.
*
- * This function returns the prefered place for block allocation.
+ * This function returns the preferred place for block allocation.
* It is used when heuristic for sequential allocation fails.
* Rules are:
* + if there is a block to the left of our position - allocate near it.
* + if pointer will live in indirect block - allocate near that block.
* + if pointer will live in inode - allocate in the same
- * cylinder group.
+ * cylinder group.
*
* In the latter case we colour the starting block by the callers PID to
* prevent it from clashing with concurrent allocations for a different inode
@@ -419,51 +463,50 @@ no_block:
*
* Caller must make sure that @ind is valid and will stay that way.
*/
-
-static unsigned long ext3_find_near(struct inode *inode, Indirect *ind)
+static ext3_fsblk_t ext3_find_near(struct inode *inode, Indirect *ind)
{
struct ext3_inode_info *ei = EXT3_I(inode);
__le32 *start = ind->bh ? (__le32*) ind->bh->b_data : ei->i_data;
__le32 *p;
- unsigned long bg_start;
- unsigned long colour;
+ ext3_fsblk_t bg_start;
+ ext3_grpblk_t colour;
/* Try to find previous block */
- for (p = ind->p - 1; p >= start; p--)
+ for (p = ind->p - 1; p >= start; p--) {
if (*p)
return le32_to_cpu(*p);
+ }
/* No such thing, so let's try location of indirect block */
if (ind->bh)
return ind->bh->b_blocknr;
/*
- * It is going to be refered from inode itself? OK, just put it into
- * the same cylinder group then.
+ * It is going to be referred to from the inode itself? OK, just put it
+ * into the same cylinder group then.
*/
- bg_start = (ei->i_block_group * EXT3_BLOCKS_PER_GROUP(inode->i_sb)) +
- le32_to_cpu(EXT3_SB(inode->i_sb)->s_es->s_first_data_block);
+ bg_start = ext3_group_first_block_no(inode->i_sb, ei->i_block_group);
colour = (current->pid % 16) *
(EXT3_BLOCKS_PER_GROUP(inode->i_sb) / 16);
return bg_start + colour;
}
/**
- * ext3_find_goal - find a prefered place for allocation.
+ * ext3_find_goal - find a preferred place for allocation.
* @inode: owner
* @block: block we want
- * @chain: chain of indirect blocks
* @partial: pointer to the last triple within a chain
- * @goal: place to store the result.
*
- * Normally this function find the prefered place for block allocation,
- * stores it in *@goal and returns zero.
+ * Normally this function finds the preferred place for block allocation
+ * and returns it.
*/
-static unsigned long ext3_find_goal(struct inode *inode, long block,
- Indirect chain[4], Indirect *partial)
+static ext3_fsblk_t ext3_find_goal(struct inode *inode, long block,
+ Indirect *partial)
{
- struct ext3_block_alloc_info *block_i = EXT3_I(inode)->i_block_alloc_info;
+ struct ext3_block_alloc_info *block_i;
+
+ block_i = EXT3_I(inode)->i_block_alloc_info;
/*
* try the heuristic for sequential allocation,
@@ -478,13 +521,119 @@ static unsigned long ext3_find_goal(struct inode *inode, long block,
}
/**
+ * ext3_blks_to_allocate - Look up the block map and count the number
+ * of direct blocks that need to be allocated for the given branch.
+ *
+ * @branch: chain of indirect blocks
+ * @k: number of blocks needed for indirect blocks
+ * @blks: number of data blocks to be mapped.
+ * @blocks_to_boundary: the offset in the indirect block
+ *
+ * Returns the number of direct blocks to be allocated; the @k indirect
+ * blocks are not included in the count.
+ */
+static int ext3_blks_to_allocate(Indirect *branch, int k, unsigned long blks,
+ int blocks_to_boundary)
+{
+ unsigned long count = 0;
+
+ /*
+ * Simple case: [t,d]Indirect block(s) have not been allocated yet,
+ * so it's clear that blocks on that path have not been allocated
+ */
+ if (k > 0) {
+ /* right now we don't handle cross boundary allocation */
+ if (blks < blocks_to_boundary + 1)
+ count += blks;
+ else
+ count += blocks_to_boundary + 1;
+ return count;
+ }
+
+ count++;
+ while (count < blks && count <= blocks_to_boundary &&
+ le32_to_cpu(*(branch[0].p + count)) == 0) {
+ count++;
+ }
+ return count;
+}
+
+/**
+ * ext3_alloc_blocks - allocate the multiple blocks needed for a branch
+ * @handle: handle for this transaction
+ * @inode: owner
+ * @goal: preferred place for allocation
+ * @indirect_blks: the number of blocks that need to be allocated for
+ * indirect blocks
+ * @blks: number of blocks that need to be allocated for direct blocks
+ * @new_blocks: on return it will store the new block numbers for
+ * the indirect blocks (if needed) and the first direct block
+ * @err: here we store the error value
+ *
+ * Returns the number of direct blocks allocated.
+ */
+static int ext3_alloc_blocks(handle_t *handle, struct inode *inode,
+ ext3_fsblk_t goal, int indirect_blks, int blks,
+ ext3_fsblk_t new_blocks[4], int *err)
+{
+ int target, i;
+ unsigned long count = 0;
+ int index = 0;
+ ext3_fsblk_t current_block = 0;
+ int ret = 0;
+
+ /*
+ * Here we try to allocate the requested multiple blocks at once,
+ * on a best-effort basis.
+ * To build a branch, we should allocate blocks for
+ * the indirect blocks(if not allocated yet), and at least
+ * the first direct block of this branch. That's the
+ * minimum number of blocks need to allocate(required)
+ */
+ target = blks + indirect_blks;
+
+ while (1) {
+ count = target;
+ /* allocating blocks for indirect blocks and direct blocks */
+ current_block = ext3_new_blocks(handle,inode,goal,&count,err);
+ if (*err)
+ goto failed_out;
+
+ target -= count;
+ /* allocate blocks for indirect blocks */
+ while (index < indirect_blks && count) {
+ new_blocks[index++] = current_block++;
+ count--;
+ }
+
+ if (count > 0)
+ break;
+ }
+
+ /* save the new block number for the first direct block */
+ new_blocks[index] = current_block;
+
+ /* total number of blocks allocated for direct blocks */
+ ret = count;
+ *err = 0;
+ return ret;
+failed_out:
+ for (i = 0; i <index; i++)
+ ext3_free_blocks(handle, inode, new_blocks[i], 1);
+ return ret;
+}
+
+/**
* ext3_alloc_branch - allocate and set up a chain of blocks.
+ * @handle: handle for this transaction
* @inode: owner
- * @num: depth of the chain (number of blocks to allocate)
+ * @indirect_blks: number of allocated indirect blocks
+ * @blks: number of allocated direct blocks
+ * @goal: preferred place for allocation
* @offsets: offsets (in the blocks) to store the pointers to next.
* @branch: place to store the chain in.
*
- * This function allocates @num blocks, zeroes out all but the last one,
+ * This function allocates blocks, zeroes out all but the last one,
* links them into chain and (if we are synchronous) writes them to disk.
* In other words, it prepares a branch that can be spliced onto the
* inode. It stores the information about that chain in the branch[], in
@@ -501,97 +650,111 @@ static unsigned long ext3_find_goal(struct inode *inode, long block,
* ext3_alloc_block() (normally -ENOSPC). Otherwise we set the chain
* as described above and return 0.
*/
-
static int ext3_alloc_branch(handle_t *handle, struct inode *inode,
- int num,
- unsigned long goal,
- int *offsets,
- Indirect *branch)
+ int indirect_blks, int *blks, ext3_fsblk_t goal,
+ int *offsets, Indirect *branch)
{
int blocksize = inode->i_sb->s_blocksize;
- int n = 0, keys = 0;
+ int i, n = 0;
int err = 0;
- int i;
- int parent = ext3_alloc_block(handle, inode, goal, &err);
-
- branch[0].key = cpu_to_le32(parent);
- if (parent) {
- for (n = 1; n < num; n++) {
- struct buffer_head *bh;
- /* Allocate the next block */
- int nr = ext3_alloc_block(handle, inode, parent, &err);
- if (!nr)
- break;
- branch[n].key = cpu_to_le32(nr);
+ struct buffer_head *bh;
+ int num;
+ ext3_fsblk_t new_blocks[4];
+ ext3_fsblk_t current_block;
- /*
- * Get buffer_head for parent block, zero it out
- * and set the pointer to new one, then send
- * parent to disk.
- */
- bh = sb_getblk(inode->i_sb, parent);
- if (!bh)
- break;
- keys = n+1;
- branch[n].bh = bh;
- lock_buffer(bh);
- BUFFER_TRACE(bh, "call get_create_access");
- err = ext3_journal_get_create_access(handle, bh);
- if (err) {
- unlock_buffer(bh);
- brelse(bh);
- break;
- }
+ num = ext3_alloc_blocks(handle, inode, goal, indirect_blks,
+ *blks, new_blocks, &err);
+ if (err)
+ return err;
- memset(bh->b_data, 0, blocksize);
- branch[n].p = (__le32*) bh->b_data + offsets[n];
- *branch[n].p = branch[n].key;
- BUFFER_TRACE(bh, "marking uptodate");
- set_buffer_uptodate(bh);
+ branch[0].key = cpu_to_le32(new_blocks[0]);
+ /*
+ * metadata blocks and data blocks are allocated.
+ */
+ for (n = 1; n <= indirect_blks; n++) {
+ /*
+ * Get buffer_head for parent block, zero it out
+ * and set the pointer to new one, then send
+ * parent to disk.
+ */
+ bh = sb_getblk(inode->i_sb, new_blocks[n-1]);
+ if (unlikely(!bh)) {
+ err = -ENOMEM;
+ goto failed;
+ }
+ branch[n].bh = bh;
+ lock_buffer(bh);
+ BUFFER_TRACE(bh, "call get_create_access");
+ err = ext3_journal_get_create_access(handle, bh);
+ if (err) {
unlock_buffer(bh);
+ brelse(bh);
+ goto failed;
+ }
- BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
- err = ext3_journal_dirty_metadata(handle, bh);
- if (err)
- break;
-
- parent = nr;
+ memset(bh->b_data, 0, blocksize);
+ branch[n].p = (__le32 *) bh->b_data + offsets[n];
+ branch[n].key = cpu_to_le32(new_blocks[n]);
+ *branch[n].p = branch[n].key;
+ if ( n == indirect_blks) {
+ current_block = new_blocks[n];
+ /*
+ * End of chain, update the last new metablock of
+ * the chain to point to the new allocated
+ * data blocks numbers
+ */
+ for (i=1; i < num; i++)
+ *(branch[n].p + i) = cpu_to_le32(++current_block);
}
- }
- if (n == num)
- return 0;
+ BUFFER_TRACE(bh, "marking uptodate");
+ set_buffer_uptodate(bh);
+ unlock_buffer(bh);
+ BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
+ err = ext3_journal_dirty_metadata(handle, bh);
+ if (err)
+ goto failed;
+ }
+ *blks = num;
+ return err;
+failed:
/* Allocation failed, free what we already allocated */
- for (i = 1; i < keys; i++) {
+ for (i = 1; i <= n ; i++) {
BUFFER_TRACE(branch[i].bh, "call journal_forget");
ext3_journal_forget(handle, branch[i].bh);
}
- for (i = 0; i < keys; i++)
- ext3_free_blocks(handle, inode, le32_to_cpu(branch[i].key), 1);
+ for (i = 0; i < indirect_blks; i++)
+ ext3_free_blocks(handle, inode, new_blocks[i], 1);
+
+ ext3_free_blocks(handle, inode, new_blocks[i], num);
+
return err;
}
/**
- * ext3_splice_branch - splice the allocated branch onto inode.
- * @inode: owner
- * @block: (logical) number of block we are adding
- * @chain: chain of indirect blocks (with a missing link - see
- * ext3_alloc_branch)
- * @where: location of missing link
- * @num: number of blocks we are adding
- *
- * This function fills the missing link and does all housekeeping needed in
- * inode (->i_blocks, etc.). In case of success we end up with the full
- * chain to new block and return 0.
+ * ext3_splice_branch - splice the allocated branch onto inode.
+ * @handle: handle for this transaction
+ * @inode: owner
+ * @block: (logical) number of block we are adding
+ * @where: location of missing link
+ * @num: number of indirect blocks we are adding
+ * @blks: number of direct blocks we are adding
+ *
+ * This function fills the missing link and does all housekeeping needed in
+ * inode (->i_blocks, etc.). In case of success we end up with the full
+ * chain to new block and return 0.
*/
-
-static int ext3_splice_branch(handle_t *handle, struct inode *inode, long block,
- Indirect chain[4], Indirect *where, int num)
+static int ext3_splice_branch(handle_t *handle, struct inode *inode,
+ long block, Indirect *where, int num, int blks)
{
int i;
int err = 0;
- struct ext3_block_alloc_info *block_i = EXT3_I(inode)->i_block_alloc_info;
+ struct ext3_block_alloc_info *block_i;
+ ext3_fsblk_t current_block;
+ struct ext3_inode_info *ei = EXT3_I(inode);
+ struct timespec now;
+ block_i = ei->i_block_alloc_info;
/*
* If we're splicing into a [td]indirect block (as opposed to the
* inode) then we need to get write access to the [td]indirect block
@@ -608,24 +771,39 @@ static int ext3_splice_branch(handle_t *handle, struct inode *inode, long block,
*where->p = where->key;
/*
+ * Update the host buffer_head or inode to point to the additional
+ * just-allocated direct blocks
+ */
+ if (num == 0 && blks > 1) {
+ current_block = le32_to_cpu(where->key) + 1;
+ for (i = 1; i < blks; i++)
+ *(where->p + i ) = cpu_to_le32(current_block++);
+ }
+
+ /*
* update the most recently allocated logical & physical block
* in i_block_alloc_info, to assist find the proper goal block for next
* allocation
*/
if (block_i) {
- block_i->last_alloc_logical_block = block;
- block_i->last_alloc_physical_block = le32_to_cpu(where[num-1].key);
+ block_i->last_alloc_logical_block = block + blks - 1;
+ block_i->last_alloc_physical_block =
+ le32_to_cpu(where[num].key) + blks - 1;
}
/* We are done with atomic stuff, now do the rest of housekeeping */
-
- inode->i_ctime = CURRENT_TIME_SEC;
- ext3_mark_inode_dirty(handle, inode);
+ now = CURRENT_TIME_SEC;
+ if (!timespec_equal(&inode->i_ctime, &now) || !where->bh) {
+ inode->i_ctime = now;
+ ext3_mark_inode_dirty(handle, inode);
+ }
+ /* ext3_mark_inode_dirty already updated i_sync_tid */
+ atomic_set(&ei->i_datasync_tid, handle->h_transaction->t_tid);
/* had we spliced it onto indirect block? */
if (where->bh) {
/*
- * akpm: If we spliced it onto an indirect block, we haven't
+ * If we spliced it onto an indirect block, we haven't
* altered the inode. Note however that if it is being spliced
* onto an indirect block at the very end of the file (the
* file is growing) then we *will* alter the inode to reflect
@@ -635,7 +813,7 @@ static int ext3_splice_branch(handle_t *handle, struct inode *inode, long block,
jbd_debug(5, "splicing indirect only\n");
BUFFER_TRACE(where->bh, "call ext3_journal_dirty_metadata");
err = ext3_journal_dirty_metadata(handle, where->bh);
- if (err)
+ if (err)
goto err_out;
} else {
/*
@@ -647,10 +825,13 @@ static int ext3_splice_branch(handle_t *handle, struct inode *inode, long block,
return err;
err_out:
- for (i = 1; i < num; i++) {
+ for (i = 1; i <= num; i++) {
BUFFER_TRACE(where[i].bh, "call journal_forget");
ext3_journal_forget(handle, where[i].bh);
+ ext3_free_blocks(handle,inode,le32_to_cpu(where[i-1].key),1);
}
+ ext3_free_blocks(handle, inode, le32_to_cpu(where[num].key), blks);
+
return err;
}
@@ -666,26 +847,34 @@ err_out:
* allocations is needed - we simply release blocks and do not touch anything
* reachable from inode.
*
- * akpm: `handle' can be NULL if create == 0.
+ * `handle' can be NULL if create == 0.
*
* The BKL may not be held on entry here. Be sure to take it early.
+ * return > 0, # of blocks mapped or allocated.
+ * return = 0, if plain lookup failed.
+ * return < 0, error case.
*/
-
-static int
-ext3_get_block_handle(handle_t *handle, struct inode *inode, sector_t iblock,
- struct buffer_head *bh_result, int create, int extend_disksize)
+int ext3_get_blocks_handle(handle_t *handle, struct inode *inode,
+ sector_t iblock, unsigned long maxblocks,
+ struct buffer_head *bh_result,
+ int create)
{
int err = -EIO;
int offsets[4];
Indirect chain[4];
Indirect *partial;
- unsigned long goal;
- int left;
- int boundary = 0;
- const int depth = ext3_block_to_path(inode, iblock, offsets, &boundary);
+ ext3_fsblk_t goal;
+ int indirect_blks;
+ int blocks_to_boundary = 0;
+ int depth;
struct ext3_inode_info *ei = EXT3_I(inode);
+ int count = 0;
+ ext3_fsblk_t first_block = 0;
+
+ trace_ext3_get_blocks_enter(inode, iblock, maxblocks, create);
J_ASSERT(handle != NULL || create == 0);
+ depth = ext3_block_to_path(inode,iblock,offsets,&blocks_to_boundary);
if (depth == 0)
goto out;
@@ -694,15 +883,44 @@ ext3_get_block_handle(handle_t *handle, struct inode *inode, sector_t iblock,
/* Simplest case - block found, no allocation needed */
if (!partial) {
+ first_block = le32_to_cpu(chain[depth - 1].key);
clear_buffer_new(bh_result);
- goto got_it;
+ count++;
+ /*map more blocks*/
+ while (count < maxblocks && count <= blocks_to_boundary) {
+ ext3_fsblk_t blk;
+
+ if (!verify_chain(chain, chain + depth - 1)) {
+ /*
+ * Indirect block might be removed by
+ * truncate while we were reading it.
+ * Handling of that case: forget what we've
+ * got now. Flag the err as EAGAIN, so it
+ * will reread.
+ */
+ err = -EAGAIN;
+ count = 0;
+ break;
+ }
+ blk = le32_to_cpu(*(chain[depth-1].p + count));
+
+ if (blk == first_block + count)
+ count++;
+ else
+ break;
+ }
+ if (err != -EAGAIN)
+ goto got_it;
}
/* Next simple case - plain lookup or failed read of indirect block */
if (!create || err == -EIO)
goto cleanup;
- down(&ei->truncate_sem);
+ /*
+ * Block out ext3_truncate while we alter the tree
+ */
+ mutex_lock(&ei->truncate_mutex);
/*
* If the indirect block is missing while we are reading
@@ -723,7 +941,8 @@ ext3_get_block_handle(handle_t *handle, struct inode *inode, sector_t iblock,
}
partial = ext3_get_branch(inode, depth, offsets, chain, &err);
if (!partial) {
- up(&ei->truncate_sem);
+ count++;
+ mutex_unlock(&ei->truncate_mutex);
if (err)
goto cleanup;
clear_buffer_new(bh_result);
@@ -738,14 +957,18 @@ ext3_get_block_handle(handle_t *handle, struct inode *inode, sector_t iblock,
if (S_ISREG(inode->i_mode) && (!ei->i_block_alloc_info))
ext3_init_block_alloc_info(inode);
- goal = ext3_find_goal(inode, iblock, chain, partial);
+ goal = ext3_find_goal(inode, iblock, partial);
- left = (chain + depth) - partial;
+ /* the number of blocks we need to allocate for [d,t]indirect blocks */
+ indirect_blks = (chain + depth) - partial - 1;
/*
- * Block out ext3_truncate while we alter the tree
+ * Next look up the indirect map to count the total number of
+ * direct blocks to allocate for this branch.
*/
- err = ext3_alloc_branch(handle, inode, left, goal,
+ count = ext3_blks_to_allocate(partial, indirect_blks,
+ maxblocks, blocks_to_boundary);
+ err = ext3_alloc_branch(handle, inode, indirect_blks, &count, goal,
offsets + (partial - chain), partial);
/*
@@ -756,24 +979,18 @@ ext3_get_block_handle(handle_t *handle, struct inode *inode, sector_t iblock,
* may need to return -EAGAIN upwards in the worst case. --sct
*/
if (!err)
- err = ext3_splice_branch(handle, inode, iblock, chain,
- partial, left);
- /*
- * i_disksize growing is protected by truncate_sem. Don't forget to
- * protect it if you're about to implement concurrent
- * ext3_get_block() -bzzz
- */
- if (!err && extend_disksize && inode->i_size > ei->i_disksize)
- ei->i_disksize = inode->i_size;
- up(&ei->truncate_sem);
+ err = ext3_splice_branch(handle, inode, iblock,
+ partial, indirect_blks, count);
+ mutex_unlock(&ei->truncate_mutex);
if (err)
goto cleanup;
set_buffer_new(bh_result);
got_it:
map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key));
- if (boundary)
+ if (count > blocks_to_boundary)
set_buffer_boundary(bh_result);
+ err = count;
/* Clean up and exit */
partial = chain + depth - 1; /* the whole chain */
cleanup:
@@ -784,75 +1001,66 @@ cleanup:
}
BUFFER_TRACE(bh_result, "returned");
out:
+ trace_ext3_get_blocks_exit(inode, iblock,
+ depth ? le32_to_cpu(chain[depth-1].key) : 0,
+ count, err);
return err;
}
+/* Maximum number of blocks we map for direct IO at once. */
+#define DIO_MAX_BLOCKS 4096
+/*
+ * Number of credits we need for writing DIO_MAX_BLOCKS:
+ * We need sb + group descriptor + bitmap + inode -> 4
+ * For B blocks with A block pointers per block we need:
+ * 1 (triple ind.) + (B/A/A + 2) (doubly ind.) + (B/A + 2) (indirect).
+ * If we plug in 4096 for B and 256 for A (for 1KB block size), we get 25.
+ */
+#define DIO_CREDITS 25
+
static int ext3_get_block(struct inode *inode, sector_t iblock,
struct buffer_head *bh_result, int create)
{
- handle_t *handle = NULL;
- int ret;
+ handle_t *handle = ext3_journal_current_handle();
+ int ret = 0, started = 0;
+ unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
+
+ if (create && !handle) { /* Direct IO write... */
+ if (max_blocks > DIO_MAX_BLOCKS)
+ max_blocks = DIO_MAX_BLOCKS;
+ handle = ext3_journal_start(inode, DIO_CREDITS +
+ EXT3_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb));
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ goto out;
+ }
+ started = 1;
+ }
- if (create) {
- handle = ext3_journal_current_handle();
- J_ASSERT(handle != 0);
+ ret = ext3_get_blocks_handle(handle, inode, iblock,
+ max_blocks, bh_result, create);
+ if (ret > 0) {
+ bh_result->b_size = (ret << inode->i_blkbits);
+ ret = 0;
}
- ret = ext3_get_block_handle(handle, inode, iblock,
- bh_result, create, 1);
+ if (started)
+ ext3_journal_stop(handle);
+out:
return ret;
}
-#define DIO_CREDITS (EXT3_RESERVE_TRANS_BLOCKS + 32)
-
-static int
-ext3_direct_io_get_blocks(struct inode *inode, sector_t iblock,
- unsigned long max_blocks, struct buffer_head *bh_result,
- int create)
+int ext3_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
+ u64 start, u64 len)
{
- handle_t *handle = journal_current_handle();
- int ret = 0;
-
- if (!handle)
- goto get_block; /* A read */
-
- if (handle->h_transaction->t_state == T_LOCKED) {
- /*
- * Huge direct-io writes can hold off commits for long
- * periods of time. Let this commit run.
- */
- ext3_journal_stop(handle);
- handle = ext3_journal_start(inode, DIO_CREDITS);
- if (IS_ERR(handle))
- ret = PTR_ERR(handle);
- goto get_block;
- }
-
- if (handle->h_buffer_credits <= EXT3_RESERVE_TRANS_BLOCKS) {
- /*
- * Getting low on buffer credits...
- */
- ret = ext3_journal_extend(handle, DIO_CREDITS);
- if (ret > 0) {
- /*
- * Couldn't extend the transaction. Start a new one.
- */
- ret = ext3_journal_restart(handle, DIO_CREDITS);
- }
- }
-
-get_block:
- if (ret == 0)
- ret = ext3_get_block_handle(handle, inode, iblock,
- bh_result, create, 0);
- bh_result->b_size = (1 << inode->i_blkbits);
- return ret;
+ return generic_block_fiemap(inode, fieinfo, start, len,
+ ext3_get_block);
}
/*
* `handle' can be NULL if create is zero
*/
-struct buffer_head *ext3_getblk(handle_t *handle, struct inode * inode,
- long block, int create, int * errp)
+struct buffer_head *ext3_getblk(handle_t *handle, struct inode *inode,
+ long block, int create, int *errp)
{
struct buffer_head dummy;
int fatal = 0, err;
@@ -862,29 +1070,40 @@ struct buffer_head *ext3_getblk(handle_t *handle, struct inode * inode,
dummy.b_state = 0;
dummy.b_blocknr = -1000;
buffer_trace_init(&dummy.b_history);
- *errp = ext3_get_block_handle(handle, inode, block, &dummy, create, 1);
- if (!*errp && buffer_mapped(&dummy)) {
+ err = ext3_get_blocks_handle(handle, inode, block, 1,
+ &dummy, create);
+ /*
+ * ext3_get_blocks_handle() returns number of blocks
+ * mapped. 0 in case of a HOLE.
+ */
+ if (err > 0) {
+ WARN_ON(err > 1);
+ err = 0;
+ }
+ *errp = err;
+ if (!err && buffer_mapped(&dummy)) {
struct buffer_head *bh;
bh = sb_getblk(inode->i_sb, dummy.b_blocknr);
- if (!bh) {
- *errp = -EIO;
+ if (unlikely(!bh)) {
+ *errp = -ENOMEM;
goto err;
}
if (buffer_new(&dummy)) {
J_ASSERT(create != 0);
- J_ASSERT(handle != 0);
-
- /* Now that we do not always journal data, we
- should keep in mind whether this should
- always journal the new buffer as metadata.
- For now, regular file writes use
- ext3_get_block instead, so it's not a
- problem. */
+ J_ASSERT(handle != NULL);
+
+ /*
+ * Now that we do not always journal data, we should
+ * keep in mind whether this should always journal the
+ * new buffer as metadata. For now, regular file
+ * writes use ext3_get_block instead, so it's not a
+ * problem.
+ */
lock_buffer(bh);
BUFFER_TRACE(bh, "call get_create_access");
fatal = ext3_journal_get_create_access(handle, bh);
if (!fatal && !buffer_uptodate(bh)) {
- memset(bh->b_data, 0, inode->i_sb->s_blocksize);
+ memset(bh->b_data,0,inode->i_sb->s_blocksize);
set_buffer_uptodate(bh);
}
unlock_buffer(bh);
@@ -906,7 +1125,7 @@ err:
return NULL;
}
-struct buffer_head *ext3_bread(handle_t *handle, struct inode * inode,
+struct buffer_head *ext3_bread(handle_t *handle, struct inode *inode,
int block, int create, int *err)
{
struct buffer_head * bh;
@@ -914,9 +1133,11 @@ struct buffer_head *ext3_bread(handle_t *handle, struct inode * inode,
bh = ext3_getblk(handle, inode, block, create, err);
if (!bh)
return bh;
- if (buffer_uptodate(bh))
+ if (bh_uptodate_or_lock(bh))
return bh;
- ll_rw_block(READ, 1, &bh);
+ get_bh(bh);
+ bh->b_end_io = end_buffer_read_sync;
+ submit_bh(READ | REQ_META | REQ_PRIO, bh);
wait_on_buffer(bh);
if (buffer_uptodate(bh))
return bh;
@@ -941,7 +1162,7 @@ static int walk_page_buffers( handle_t *handle,
for ( bh = head, block_start = 0;
ret == 0 && (bh != head || !block_start);
- block_start = block_end, bh = next)
+ block_start = block_end, bh = next)
{
next = bh->b_this_page;
block_end = block_start + blocksize;
@@ -980,63 +1201,143 @@ static int walk_page_buffers( handle_t *handle,
* So what we do is to rely on the fact that journal_stop/journal_start
* will _not_ run commit under these circumstances because handle->h_ref
* is elevated. We'll still have enough credits for the tiny quotafile
- * write.
+ * write.
*/
-
-static int do_journal_get_write_access(handle_t *handle,
- struct buffer_head *bh)
+static int do_journal_get_write_access(handle_t *handle,
+ struct buffer_head *bh)
{
+ int dirty = buffer_dirty(bh);
+ int ret;
+
if (!buffer_mapped(bh) || buffer_freed(bh))
return 0;
- return ext3_journal_get_write_access(handle, bh);
+ /*
+ * __block_prepare_write() could have dirtied some buffers. Clean
+ * the dirty bit as ext3_journal_get_write_access() could complain
+ * otherwise about fs integrity issues. Setting of the dirty bit
+ * by __block_prepare_write() isn't a real problem here as we clear
+ * the bit before releasing a page lock and thus writeback cannot
+ * ever write the buffer.
+ */
+ if (dirty)
+ clear_buffer_dirty(bh);
+ ret = ext3_journal_get_write_access(handle, bh);
+ if (!ret && dirty)
+ ret = ext3_journal_dirty_metadata(handle, bh);
+ return ret;
}
-static int ext3_prepare_write(struct file *file, struct page *page,
- unsigned from, unsigned to)
+/*
+ * Truncate blocks that were not used by write. We have to truncate the
+ * pagecache as well so that corresponding buffers get properly unmapped.
+ */
+static void ext3_truncate_failed_write(struct inode *inode)
{
- struct inode *inode = page->mapping->host;
- int ret, needed_blocks = ext3_writepage_trans_blocks(inode);
+ truncate_inode_pages(inode->i_mapping, inode->i_size);
+ ext3_truncate(inode);
+}
+
+/*
+ * Truncate blocks that were not used by direct IO write. We have to zero out
+ * the last file block as well because direct IO might have written to it.
+ */
+static void ext3_truncate_failed_direct_write(struct inode *inode)
+{
+ ext3_block_truncate_page(inode, inode->i_size);
+ ext3_truncate(inode);
+}
+
+static int ext3_write_begin(struct file *file, struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned flags,
+ struct page **pagep, void **fsdata)
+{
+ struct inode *inode = mapping->host;
+ int ret;
handle_t *handle;
int retries = 0;
+ struct page *page;
+ pgoff_t index;
+ unsigned from, to;
+ /* Reserve one block more for addition to orphan list in case
+ * we allocate blocks but write fails for some reason */
+ int needed_blocks = ext3_writepage_trans_blocks(inode) + 1;
+
+ trace_ext3_write_begin(inode, pos, len, flags);
+
+ index = pos >> PAGE_CACHE_SHIFT;
+ from = pos & (PAGE_CACHE_SIZE - 1);
+ to = from + len;
retry:
+ page = grab_cache_page_write_begin(mapping, index, flags);
+ if (!page)
+ return -ENOMEM;
+ *pagep = page;
+
handle = ext3_journal_start(inode, needed_blocks);
if (IS_ERR(handle)) {
+ unlock_page(page);
+ page_cache_release(page);
ret = PTR_ERR(handle);
goto out;
}
- if (test_opt(inode->i_sb, NOBH))
- ret = nobh_prepare_write(page, from, to, ext3_get_block);
- else
- ret = block_prepare_write(page, from, to, ext3_get_block);
+ ret = __block_write_begin(page, pos, len, ext3_get_block);
if (ret)
- goto prepare_write_failed;
+ goto write_begin_failed;
if (ext3_should_journal_data(inode)) {
ret = walk_page_buffers(handle, page_buffers(page),
from, to, NULL, do_journal_get_write_access);
}
-prepare_write_failed:
- if (ret)
+write_begin_failed:
+ if (ret) {
+ /*
+ * block_write_begin may have instantiated a few blocks
+ * outside i_size. Trim these off again. Don't need
+ * i_size_read because we hold i_mutex.
+ *
+ * Add inode to orphan list in case we crash before truncate
+ * finishes. Do this only if ext3_can_truncate() agrees so
+ * that orphan processing code is happy.
+ */
+ if (pos + len > inode->i_size && ext3_can_truncate(inode))
+ ext3_orphan_add(handle, inode);
ext3_journal_stop(handle);
+ unlock_page(page);
+ page_cache_release(page);
+ if (pos + len > inode->i_size)
+ ext3_truncate_failed_write(inode);
+ }
if (ret == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries))
goto retry;
out:
return ret;
}
-int
-ext3_journal_dirty_data(handle_t *handle, struct buffer_head *bh)
+
+int ext3_journal_dirty_data(handle_t *handle, struct buffer_head *bh)
{
int err = journal_dirty_data(handle, bh);
if (err)
- ext3_journal_abort_handle(__FUNCTION__, __FUNCTION__,
- bh, handle,err);
+ ext3_journal_abort_handle(__func__, __func__,
+ bh, handle, err);
return err;
}
-/* For commit_write() in data=journal mode */
-static int commit_write_fn(handle_t *handle, struct buffer_head *bh)
+/* For ordered writepage and write_end functions */
+static int journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh)
+{
+ /*
+ * Write could have mapped the buffer but it didn't copy the data in
+ * yet. So avoid filing such buffer into a transaction.
+ */
+ if (buffer_mapped(bh) && buffer_uptodate(bh))
+ return ext3_journal_dirty_data(handle, bh);
+ return 0;
+}
+
+/* For write_end() in data=journal mode */
+static int write_end_fn(handle_t *handle, struct buffer_head *bh)
{
if (!buffer_mapped(bh) || buffer_freed(bh))
return 0;
@@ -1045,99 +1346,150 @@ static int commit_write_fn(handle_t *handle, struct buffer_head *bh)
}
/*
+ * This is nasty and subtle: ext3_write_begin() could have allocated blocks
+ * for the whole page but later we failed to copy the data in. Update inode
+ * size according to what we managed to copy. The rest is going to be
+ * truncated in write_end function.
+ */
+static void update_file_sizes(struct inode *inode, loff_t pos, unsigned copied)
+{
+ /* What matters to us is i_disksize. We don't write i_size anywhere */
+ if (pos + copied > inode->i_size)
+ i_size_write(inode, pos + copied);
+ if (pos + copied > EXT3_I(inode)->i_disksize) {
+ EXT3_I(inode)->i_disksize = pos + copied;
+ mark_inode_dirty(inode);
+ }
+}
+
+/*
* We need to pick up the new inode size which generic_commit_write gave us
* `file' can be NULL - eg, when called from page_symlink().
*
* ext3 never places buffers on inode->i_mapping->private_list. metadata
* buffers are managed internally.
*/
-
-static int ext3_ordered_commit_write(struct file *file, struct page *page,
- unsigned from, unsigned to)
+static int ext3_ordered_write_end(struct file *file,
+ struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned copied,
+ struct page *page, void *fsdata)
{
handle_t *handle = ext3_journal_current_handle();
- struct inode *inode = page->mapping->host;
+ struct inode *inode = file->f_mapping->host;
+ unsigned from, to;
int ret = 0, ret2;
- ret = walk_page_buffers(handle, page_buffers(page),
- from, to, NULL, ext3_journal_dirty_data);
+ trace_ext3_ordered_write_end(inode, pos, len, copied);
+ copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
- if (ret == 0) {
- /*
- * generic_commit_write() will run mark_inode_dirty() if i_size
- * changes. So let's piggyback the i_disksize mark_inode_dirty
- * into that.
- */
- loff_t new_i_size;
+ from = pos & (PAGE_CACHE_SIZE - 1);
+ to = from + copied;
+ ret = walk_page_buffers(handle, page_buffers(page),
+ from, to, NULL, journal_dirty_data_fn);
- new_i_size = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
- if (new_i_size > EXT3_I(inode)->i_disksize)
- EXT3_I(inode)->i_disksize = new_i_size;
- ret = generic_commit_write(file, page, from, to);
- }
+ if (ret == 0)
+ update_file_sizes(inode, pos, copied);
+ /*
+ * There may be allocated blocks outside of i_size because
+ * we failed to copy some data. Prepare for truncate.
+ */
+ if (pos + len > inode->i_size && ext3_can_truncate(inode))
+ ext3_orphan_add(handle, inode);
ret2 = ext3_journal_stop(handle);
if (!ret)
ret = ret2;
- return ret;
+ unlock_page(page);
+ page_cache_release(page);
+
+ if (pos + len > inode->i_size)
+ ext3_truncate_failed_write(inode);
+ return ret ? ret : copied;
}
-static int ext3_writeback_commit_write(struct file *file, struct page *page,
- unsigned from, unsigned to)
+static int ext3_writeback_write_end(struct file *file,
+ struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned copied,
+ struct page *page, void *fsdata)
{
handle_t *handle = ext3_journal_current_handle();
- struct inode *inode = page->mapping->host;
- int ret = 0, ret2;
- loff_t new_i_size;
-
- new_i_size = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
- if (new_i_size > EXT3_I(inode)->i_disksize)
- EXT3_I(inode)->i_disksize = new_i_size;
+ struct inode *inode = file->f_mapping->host;
+ int ret;
- if (test_opt(inode->i_sb, NOBH))
- ret = nobh_commit_write(file, page, from, to);
- else
- ret = generic_commit_write(file, page, from, to);
+ trace_ext3_writeback_write_end(inode, pos, len, copied);
+ copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
+ update_file_sizes(inode, pos, copied);
+ /*
+ * There may be allocated blocks outside of i_size because
+ * we failed to copy some data. Prepare for truncate.
+ */
+ if (pos + len > inode->i_size && ext3_can_truncate(inode))
+ ext3_orphan_add(handle, inode);
+ ret = ext3_journal_stop(handle);
+ unlock_page(page);
+ page_cache_release(page);
- ret2 = ext3_journal_stop(handle);
- if (!ret)
- ret = ret2;
- return ret;
+ if (pos + len > inode->i_size)
+ ext3_truncate_failed_write(inode);
+ return ret ? ret : copied;
}
-static int ext3_journalled_commit_write(struct file *file,
- struct page *page, unsigned from, unsigned to)
+static int ext3_journalled_write_end(struct file *file,
+ struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned copied,
+ struct page *page, void *fsdata)
{
handle_t *handle = ext3_journal_current_handle();
- struct inode *inode = page->mapping->host;
+ struct inode *inode = mapping->host;
+ struct ext3_inode_info *ei = EXT3_I(inode);
int ret = 0, ret2;
int partial = 0;
- loff_t pos;
+ unsigned from, to;
- /*
- * Here we duplicate the generic_commit_write() functionality
- */
- pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
+ trace_ext3_journalled_write_end(inode, pos, len, copied);
+ from = pos & (PAGE_CACHE_SIZE - 1);
+ to = from + len;
+
+ if (copied < len) {
+ if (!PageUptodate(page))
+ copied = 0;
+ page_zero_new_buffers(page, from + copied, to);
+ to = from + copied;
+ }
ret = walk_page_buffers(handle, page_buffers(page), from,
- to, &partial, commit_write_fn);
+ to, &partial, write_end_fn);
if (!partial)
SetPageUptodate(page);
- if (pos > inode->i_size)
- i_size_write(inode, pos);
- EXT3_I(inode)->i_state |= EXT3_STATE_JDATA;
- if (inode->i_size > EXT3_I(inode)->i_disksize) {
- EXT3_I(inode)->i_disksize = inode->i_size;
+
+ if (pos + copied > inode->i_size)
+ i_size_write(inode, pos + copied);
+ /*
+ * There may be allocated blocks outside of i_size because
+ * we failed to copy some data. Prepare for truncate.
+ */
+ if (pos + len > inode->i_size && ext3_can_truncate(inode))
+ ext3_orphan_add(handle, inode);
+ ext3_set_inode_state(inode, EXT3_STATE_JDATA);
+ atomic_set(&ei->i_datasync_tid, handle->h_transaction->t_tid);
+ if (inode->i_size > ei->i_disksize) {
+ ei->i_disksize = inode->i_size;
ret2 = ext3_mark_inode_dirty(handle, inode);
- if (!ret)
+ if (!ret)
ret = ret2;
}
+
ret2 = ext3_journal_stop(handle);
if (!ret)
ret = ret2;
- return ret;
+ unlock_page(page);
+ page_cache_release(page);
+
+ if (pos + len > inode->i_size)
+ ext3_truncate_failed_write(inode);
+ return ret ? ret : copied;
}
-/*
+/*
* bmap() is special. It gets used by applications such as lilo and by
* the swapper to find the on-disk block of a specific piece of data.
*
@@ -1146,10 +1498,10 @@ static int ext3_journalled_commit_write(struct file *file,
* filesystem and enables swap, then they may get a nasty shock when the
* data getting swapped to that swapfile suddenly gets overwritten by
* the original zero's written out previously to the journal and
- * awaiting writeback in the kernel's buffer cache.
+ * awaiting writeback in the kernel's buffer cache.
*
* So, if we see any bmap calls here on a modified, data-journaled file,
- * take extra steps to flush any blocks which might be in the cache.
+ * take extra steps to flush any blocks which might be in the cache.
*/
static sector_t ext3_bmap(struct address_space *mapping, sector_t block)
{
@@ -1157,17 +1509,17 @@ static sector_t ext3_bmap(struct address_space *mapping, sector_t block)
journal_t *journal;
int err;
- if (EXT3_I(inode)->i_state & EXT3_STATE_JDATA) {
- /*
+ if (ext3_test_inode_state(inode, EXT3_STATE_JDATA)) {
+ /*
* This is a REALLY heavyweight approach, but the use of
* bmap on dirty files is expected to be extremely rare:
* only if we run lilo or swapon on a freshly made file
- * do we expect this to happen.
+ * do we expect this to happen.
*
* (bmap requires CAP_SYS_RAWIO so this does not
* represent an unprivileged user DOS attack --- we'd be
* in trouble if mortal users could trigger this path at
- * will.)
+ * will.)
*
* NB. EXT3_STATE_JDATA is not set on files other than
* regular files. If somebody wants to bmap a directory
@@ -1176,7 +1528,7 @@ static sector_t ext3_bmap(struct address_space *mapping, sector_t block)
* everything they get.
*/
- EXT3_I(inode)->i_state &= ~EXT3_STATE_JDATA;
+ ext3_clear_inode_state(inode, EXT3_STATE_JDATA);
journal = EXT3_JOURNAL(inode);
journal_lock_updates(journal);
err = journal_flush(journal);
@@ -1201,67 +1553,26 @@ static int bput_one(handle_t *handle, struct buffer_head *bh)
return 0;
}
-static int journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh)
+static int buffer_unmapped(handle_t *handle, struct buffer_head *bh)
{
- if (buffer_mapped(bh))
- return ext3_journal_dirty_data(handle, bh);
- return 0;
+ return !buffer_mapped(bh);
}
/*
- * Note that we always start a transaction even if we're not journalling
- * data. This is to preserve ordering: any hole instantiation within
- * __block_write_full_page -> ext3_get_block() should be journalled
- * along with the data so we don't crash and then get metadata which
+ * Note that whenever we need to map blocks we start a transaction even if
+ * we're not journalling data. This is to preserve ordering: any hole
+ * instantiation within __block_write_full_page -> ext3_get_block() should be
+ * journalled along with the data so we don't crash and then get metadata which
* refers to old data.
*
* In all journalling modes block_write_full_page() will start the I/O.
*
- * Problem:
- *
- * ext3_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() ->
- * ext3_writepage()
- *
- * Similar for:
- *
- * ext3_file_write() -> generic_file_write() -> __alloc_pages() -> ...
- *
- * Same applies to ext3_get_block(). We will deadlock on various things like
- * lock_journal and i_truncate_sem.
- *
- * Setting PF_MEMALLOC here doesn't work - too many internal memory
- * allocations fail.
- *
- * 16May01: If we're reentered then journal_current_handle() will be
- * non-zero. We simply *return*.
- *
- * 1 July 2001: @@@ FIXME:
- * In journalled data mode, a data buffer may be metadata against the
- * current transaction. But the same file is part of a shared mapping
- * and someone does a writepage() on it.
- *
- * We will move the buffer onto the async_data list, but *after* it has
- * been dirtied. So there's a small window where we have dirty data on
- * BJ_Metadata.
- *
- * Note that this only applies to the last partial page in the file. The
- * bit which block_write_full_page() uses prepare/commit for. (That's
- * broken code anyway: it's wrong for msync()).
- *
- * It's a rare case: affects the final partial page, for journalled data
- * where the file is subject to bith write() and writepage() in the same
- * transction. To fix it we'll need a custom block_write_full_page().
- * We'll probably need that anyway for journalling writepage() output.
- *
* We don't honour synchronous mounts for writepage(). That would be
* disastrous. Any write() or metadata operation will sync the fs for
* us.
- *
- * AKPM2: if all the page's buffers are mapped to disk and !data=journal,
- * we don't need to open a transaction here.
*/
static int ext3_ordered_writepage(struct page *page,
- struct writeback_control *wbc)
+ struct writeback_control *wbc)
{
struct inode *inode = page->mapping->host;
struct buffer_head *page_bufs;
@@ -1270,6 +1581,13 @@ static int ext3_ordered_writepage(struct page *page,
int err;
J_ASSERT(PageLocked(page));
+ /*
+ * We don't want to warn for emergency remount. The condition is
+ * ordered to avoid dereferencing inode->i_sb in non-error case to
+ * avoid slow-downs.
+ */
+ WARN_ON_ONCE(IS_RDONLY(inode) &&
+ !(EXT3_SB(inode->i_sb)->s_mount_state & EXT3_ERROR_FS));
/*
* We give up here if we're reentered, because it might be for a
@@ -1278,6 +1596,20 @@ static int ext3_ordered_writepage(struct page *page,
if (ext3_journal_current_handle())
goto out_fail;
+ trace_ext3_ordered_writepage(page);
+ if (!page_has_buffers(page)) {
+ create_empty_buffers(page, inode->i_sb->s_blocksize,
+ (1 << BH_Dirty)|(1 << BH_Uptodate));
+ page_bufs = page_buffers(page);
+ } else {
+ page_bufs = page_buffers(page);
+ if (!walk_page_buffers(NULL, page_bufs, 0, PAGE_CACHE_SIZE,
+ NULL, buffer_unmapped)) {
+ /* Provide NULL get_block() to catch bugs if buffers
+ * weren't really mapped */
+ return block_write_full_page(page, NULL, wbc);
+ }
+ }
handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode));
if (IS_ERR(handle)) {
@@ -1285,11 +1617,6 @@ static int ext3_ordered_writepage(struct page *page,
goto out_fail;
}
- if (!page_has_buffers(page)) {
- create_empty_buffers(page, inode->i_sb->s_blocksize,
- (1 << BH_Dirty)|(1 << BH_Uptodate));
- }
- page_bufs = page_buffers(page);
walk_page_buffers(handle, page_bufs, 0,
PAGE_CACHE_SIZE, NULL, bget_one);
@@ -1303,16 +1630,13 @@ static int ext3_ordered_writepage(struct page *page,
*/
/*
- * And attach them to the current transaction. But only if
+ * And attach them to the current transaction. But only if
* block_write_full_page() succeeded. Otherwise they are unmapped,
* and generally junk.
*/
- if (ret == 0) {
- err = walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE,
+ if (ret == 0)
+ ret = walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE,
NULL, journal_dirty_data_fn);
- if (!ret)
- ret = err;
- }
walk_page_buffers(handle, page_bufs, 0,
PAGE_CACHE_SIZE, NULL, bput_one);
err = ext3_journal_stop(handle);
@@ -1334,19 +1658,35 @@ static int ext3_writeback_writepage(struct page *page,
int ret = 0;
int err;
+ J_ASSERT(PageLocked(page));
+ /*
+ * We don't want to warn for emergency remount. The condition is
+ * ordered to avoid dereferencing inode->i_sb in non-error case to
+ * avoid slow-downs.
+ */
+ WARN_ON_ONCE(IS_RDONLY(inode) &&
+ !(EXT3_SB(inode->i_sb)->s_mount_state & EXT3_ERROR_FS));
+
if (ext3_journal_current_handle())
goto out_fail;
+ trace_ext3_writeback_writepage(page);
+ if (page_has_buffers(page)) {
+ if (!walk_page_buffers(NULL, page_buffers(page), 0,
+ PAGE_CACHE_SIZE, NULL, buffer_unmapped)) {
+ /* Provide NULL get_block() to catch bugs if buffers
+ * weren't really mapped */
+ return block_write_full_page(page, NULL, wbc);
+ }
+ }
+
handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode));
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
goto out_fail;
}
- if (test_opt(inode->i_sb, NOBH))
- ret = nobh_writepage(page, ext3_get_block, wbc);
- else
- ret = block_write_full_page(page, ext3_get_block, wbc);
+ ret = block_write_full_page(page, ext3_get_block, wbc);
err = ext3_journal_stop(handle);
if (!ret)
@@ -1367,23 +1707,33 @@ static int ext3_journalled_writepage(struct page *page,
int ret = 0;
int err;
- if (ext3_journal_current_handle())
- goto no_write;
-
- handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode));
- if (IS_ERR(handle)) {
- ret = PTR_ERR(handle);
- goto no_write;
- }
+ J_ASSERT(PageLocked(page));
+ /*
+ * We don't want to warn for emergency remount. The condition is
+ * ordered to avoid dereferencing inode->i_sb in non-error case to
+ * avoid slow-downs.
+ */
+ WARN_ON_ONCE(IS_RDONLY(inode) &&
+ !(EXT3_SB(inode->i_sb)->s_mount_state & EXT3_ERROR_FS));
+ trace_ext3_journalled_writepage(page);
if (!page_has_buffers(page) || PageChecked(page)) {
+ if (ext3_journal_current_handle())
+ goto no_write;
+
+ handle = ext3_journal_start(inode,
+ ext3_writepage_trans_blocks(inode));
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ goto no_write;
+ }
/*
* It's mmapped pagecache. Add buffers and journal it. There
* doesn't seem much point in redirtying the page here.
*/
ClearPageChecked(page);
- ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE,
- ext3_get_block);
+ ret = __block_write_begin(page, 0, PAGE_CACHE_SIZE,
+ ext3_get_block);
if (ret != 0) {
ext3_journal_stop(handle);
goto out_unlock;
@@ -1392,22 +1742,25 @@ static int ext3_journalled_writepage(struct page *page,
PAGE_CACHE_SIZE, NULL, do_journal_get_write_access);
err = walk_page_buffers(handle, page_buffers(page), 0,
- PAGE_CACHE_SIZE, NULL, commit_write_fn);
+ PAGE_CACHE_SIZE, NULL, write_end_fn);
if (ret == 0)
ret = err;
- EXT3_I(inode)->i_state |= EXT3_STATE_JDATA;
+ ext3_set_inode_state(inode, EXT3_STATE_JDATA);
+ atomic_set(&EXT3_I(inode)->i_datasync_tid,
+ handle->h_transaction->t_tid);
unlock_page(page);
+ err = ext3_journal_stop(handle);
+ if (!ret)
+ ret = err;
} else {
/*
- * It may be a page full of checkpoint-mode buffers. We don't
- * really know unless we go poke around in the buffer_heads.
- * But block_write_full_page will do the right thing.
+ * It is a page full of checkpoint-mode buffers. Go and write
+ * them. They should have been already mapped when they went
+ * to the journal so provide NULL get_block function to catch
+ * errors.
*/
- ret = block_write_full_page(page, ext3_get_block, wbc);
+ ret = block_write_full_page(page, NULL, wbc);
}
- err = ext3_journal_stop(handle);
- if (!ret)
- ret = err;
out:
return ret;
@@ -1420,6 +1773,7 @@ out_unlock:
static int ext3_readpage(struct file *file, struct page *page)
{
+ trace_ext3_readpage(page);
return mpage_readpage(page, ext3_get_block);
}
@@ -1430,23 +1784,27 @@ ext3_readpages(struct file *file, struct address_space *mapping,
return mpage_readpages(mapping, pages, nr_pages, ext3_get_block);
}
-static int ext3_invalidatepage(struct page *page, unsigned long offset)
+static void ext3_invalidatepage(struct page *page, unsigned int offset,
+ unsigned int length)
{
journal_t *journal = EXT3_JOURNAL(page->mapping->host);
+ trace_ext3_invalidatepage(page, offset, length);
+
/*
* If it's a full truncate we just forget about the pending dirtying
*/
- if (offset == 0)
+ if (offset == 0 && length == PAGE_CACHE_SIZE)
ClearPageChecked(page);
- return journal_invalidatepage(journal, page, offset);
+ journal_invalidatepage(journal, page, offset, length);
}
static int ext3_releasepage(struct page *page, gfp_t wait)
{
journal_t *journal = EXT3_JOURNAL(page->mapping->host);
+ trace_ext3_releasepage(page);
WARN_ON(PageChecked(page));
if (!page_has_buffers(page))
return 0;
@@ -1459,54 +1817,78 @@ static int ext3_releasepage(struct page *page, gfp_t wait)
* if the machine crashes during the write.
*
* If the O_DIRECT write is instantiating holes inside i_size and the machine
- * crashes then stale disk data _may_ be exposed inside the file.
+ * crashes then stale disk data _may_ be exposed inside the file. But current
+ * VFS code falls back into buffered path in that case so we are safe.
*/
static ssize_t ext3_direct_IO(int rw, struct kiocb *iocb,
- const struct iovec *iov, loff_t offset,
- unsigned long nr_segs)
+ struct iov_iter *iter, loff_t offset)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file->f_mapping->host;
struct ext3_inode_info *ei = EXT3_I(inode);
- handle_t *handle = NULL;
+ handle_t *handle;
ssize_t ret;
int orphan = 0;
- size_t count = iov_length(iov, nr_segs);
+ size_t count = iov_iter_count(iter);
+ int retries = 0;
+
+ trace_ext3_direct_IO_enter(inode, offset, count, rw);
if (rw == WRITE) {
loff_t final_size = offset + count;
- handle = ext3_journal_start(inode, DIO_CREDITS);
- if (IS_ERR(handle)) {
- ret = PTR_ERR(handle);
- goto out;
- }
if (final_size > inode->i_size) {
+ /* Credits for sb + inode write */
+ handle = ext3_journal_start(inode, 2);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ goto out;
+ }
ret = ext3_orphan_add(handle, inode);
- if (ret)
- goto out_stop;
+ if (ret) {
+ ext3_journal_stop(handle);
+ goto out;
+ }
orphan = 1;
ei->i_disksize = inode->i_size;
+ ext3_journal_stop(handle);
}
}
- ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
- offset, nr_segs,
- ext3_direct_io_get_blocks, NULL);
-
+retry:
+ ret = blockdev_direct_IO(rw, iocb, inode, iter, offset, ext3_get_block);
/*
- * Reacquire the handle: ext3_direct_io_get_block() can restart the
- * transaction
+ * In case of error extending write may have instantiated a few
+ * blocks outside i_size. Trim these off again.
*/
- handle = journal_current_handle();
+ if (unlikely((rw & WRITE) && ret < 0)) {
+ loff_t isize = i_size_read(inode);
+ loff_t end = offset + count;
-out_stop:
- if (handle) {
+ if (end > isize)
+ ext3_truncate_failed_direct_write(inode);
+ }
+ if (ret == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries))
+ goto retry;
+
+ if (orphan) {
int err;
- if (orphan && inode->i_nlink)
+ /* Credits for sb + inode write */
+ handle = ext3_journal_start(inode, 2);
+ if (IS_ERR(handle)) {
+ /* This is really bad luck. We've written the data
+ * but cannot extend i_size. Truncate allocated blocks
+ * and pretend the write failed... */
+ ext3_truncate_failed_direct_write(inode);
+ ret = PTR_ERR(handle);
+ if (inode->i_nlink)
+ ext3_orphan_del(NULL, inode);
+ goto out;
+ }
+ if (inode->i_nlink)
ext3_orphan_del(handle, inode);
- if (orphan && ret > 0) {
+ if (ret > 0) {
loff_t end = offset + ret;
if (end > inode->i_size) {
ei->i_disksize = end;
@@ -1526,6 +1908,7 @@ out_stop:
ret = err;
}
out:
+ trace_ext3_direct_IO_exit(inode, offset, count, rw, ret);
return ret;
}
@@ -1548,43 +1931,49 @@ static int ext3_journalled_set_page_dirty(struct page *page)
return __set_page_dirty_nobuffers(page);
}
-static struct address_space_operations ext3_ordered_aops = {
- .readpage = ext3_readpage,
- .readpages = ext3_readpages,
- .writepage = ext3_ordered_writepage,
- .sync_page = block_sync_page,
- .prepare_write = ext3_prepare_write,
- .commit_write = ext3_ordered_commit_write,
- .bmap = ext3_bmap,
- .invalidatepage = ext3_invalidatepage,
- .releasepage = ext3_releasepage,
- .direct_IO = ext3_direct_IO,
+static const struct address_space_operations ext3_ordered_aops = {
+ .readpage = ext3_readpage,
+ .readpages = ext3_readpages,
+ .writepage = ext3_ordered_writepage,
+ .write_begin = ext3_write_begin,
+ .write_end = ext3_ordered_write_end,
+ .bmap = ext3_bmap,
+ .invalidatepage = ext3_invalidatepage,
+ .releasepage = ext3_releasepage,
+ .direct_IO = ext3_direct_IO,
+ .migratepage = buffer_migrate_page,
+ .is_partially_uptodate = block_is_partially_uptodate,
+ .is_dirty_writeback = buffer_check_dirty_writeback,
+ .error_remove_page = generic_error_remove_page,
};
-static struct address_space_operations ext3_writeback_aops = {
- .readpage = ext3_readpage,
- .readpages = ext3_readpages,
- .writepage = ext3_writeback_writepage,
- .sync_page = block_sync_page,
- .prepare_write = ext3_prepare_write,
- .commit_write = ext3_writeback_commit_write,
- .bmap = ext3_bmap,
- .invalidatepage = ext3_invalidatepage,
- .releasepage = ext3_releasepage,
- .direct_IO = ext3_direct_IO,
+static const struct address_space_operations ext3_writeback_aops = {
+ .readpage = ext3_readpage,
+ .readpages = ext3_readpages,
+ .writepage = ext3_writeback_writepage,
+ .write_begin = ext3_write_begin,
+ .write_end = ext3_writeback_write_end,
+ .bmap = ext3_bmap,
+ .invalidatepage = ext3_invalidatepage,
+ .releasepage = ext3_releasepage,
+ .direct_IO = ext3_direct_IO,
+ .migratepage = buffer_migrate_page,
+ .is_partially_uptodate = block_is_partially_uptodate,
+ .error_remove_page = generic_error_remove_page,
};
-static struct address_space_operations ext3_journalled_aops = {
- .readpage = ext3_readpage,
- .readpages = ext3_readpages,
- .writepage = ext3_journalled_writepage,
- .sync_page = block_sync_page,
- .prepare_write = ext3_prepare_write,
- .commit_write = ext3_journalled_commit_write,
- .set_page_dirty = ext3_journalled_set_page_dirty,
- .bmap = ext3_bmap,
- .invalidatepage = ext3_invalidatepage,
- .releasepage = ext3_releasepage,
+static const struct address_space_operations ext3_journalled_aops = {
+ .readpage = ext3_readpage,
+ .readpages = ext3_readpages,
+ .writepage = ext3_journalled_writepage,
+ .write_begin = ext3_write_begin,
+ .write_end = ext3_journalled_write_end,
+ .set_page_dirty = ext3_journalled_set_page_dirty,
+ .bmap = ext3_bmap,
+ .invalidatepage = ext3_invalidatepage,
+ .releasepage = ext3_releasepage,
+ .is_partially_uptodate = block_is_partially_uptodate,
+ .error_remove_page = generic_error_remove_page,
};
void ext3_set_aops(struct inode *inode)
@@ -1603,36 +1992,27 @@ void ext3_set_aops(struct inode *inode)
* This required during truncate. We need to physically zero the tail end
* of that block so it doesn't yield old data if the file is later grown.
*/
-static int ext3_block_truncate_page(handle_t *handle, struct page *page,
- struct address_space *mapping, loff_t from)
+static int ext3_block_truncate_page(struct inode *inode, loff_t from)
{
- unsigned long index = from >> PAGE_CACHE_SHIFT;
- unsigned offset = from & (PAGE_CACHE_SIZE-1);
+ ext3_fsblk_t index = from >> PAGE_CACHE_SHIFT;
+ unsigned offset = from & (PAGE_CACHE_SIZE - 1);
unsigned blocksize, iblock, length, pos;
- struct inode *inode = mapping->host;
+ struct page *page;
+ handle_t *handle = NULL;
struct buffer_head *bh;
int err = 0;
- void *kaddr;
+ /* Truncated on block boundary - nothing to do */
blocksize = inode->i_sb->s_blocksize;
+ if ((from & (blocksize - 1)) == 0)
+ return 0;
+
+ page = grab_cache_page(inode->i_mapping, index);
+ if (!page)
+ return -ENOMEM;
length = blocksize - (offset & (blocksize - 1));
iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
- /*
- * For "nobh" option, we can only work if we don't need to
- * read-in the page - otherwise we create buffers to do the IO.
- */
- if (!page_has_buffers(page) && test_opt(inode->i_sb, NOBH)) {
- if (PageUptodate(page)) {
- kaddr = kmap_atomic(page, KM_USER0);
- memset(kaddr + offset, 0, length);
- flush_dcache_page(page);
- kunmap_atomic(kaddr, KM_USER0);
- set_page_dirty(page);
- goto unlock;
- }
- }
-
if (!page_has_buffers(page))
create_empty_buffers(page, blocksize, 0);
@@ -1665,27 +2045,33 @@ static int ext3_block_truncate_page(handle_t *handle, struct page *page,
if (PageUptodate(page))
set_buffer_uptodate(bh);
- if (!buffer_uptodate(bh)) {
- err = -EIO;
- ll_rw_block(READ, 1, &bh);
- wait_on_buffer(bh);
+ if (!bh_uptodate_or_lock(bh)) {
+ err = bh_submit_read(bh);
/* Uhhuh. Read error. Complain and punt. */
- if (!buffer_uptodate(bh))
+ if (err)
goto unlock;
}
+ /* data=writeback mode doesn't need transaction to zero-out data */
+ if (!ext3_should_writeback_data(inode)) {
+ /* We journal at most one block */
+ handle = ext3_journal_start(inode, 1);
+ if (IS_ERR(handle)) {
+ clear_highpage(page);
+ flush_dcache_page(page);
+ err = PTR_ERR(handle);
+ goto unlock;
+ }
+ }
+
if (ext3_should_journal_data(inode)) {
BUFFER_TRACE(bh, "get write access");
err = ext3_journal_get_write_access(handle, bh);
if (err)
- goto unlock;
+ goto stop;
}
- kaddr = kmap_atomic(page, KM_USER0);
- memset(kaddr + offset, 0, length);
- flush_dcache_page(page);
- kunmap_atomic(kaddr, KM_USER0);
-
+ zero_user(page, offset, length);
BUFFER_TRACE(bh, "zeroed end of block");
err = 0;
@@ -1696,6 +2082,9 @@ static int ext3_block_truncate_page(handle_t *handle, struct page *page,
err = ext3_journal_dirty_data(handle, bh);
mark_buffer_dirty(bh);
}
+stop:
+ if (handle)
+ ext3_journal_stop(handle);
unlock:
unlock_page(page);
@@ -1728,7 +2117,7 @@ static inline int all_zeroes(__le32 *p, __le32 *q)
*
* When we do truncate() we may have to clean the ends of several
* indirect blocks but leave the blocks themselves alive. Block is
- * partially truncated if some data below the new i_size is refered
+ * partially truncated if some data below the new i_size is referred
* from it (and it is on the path to the first completely truncated
* data block, indeed). We have to free the top of that path along
* with everything to the right of the path. Since no allocation
@@ -1751,17 +2140,14 @@ static inline int all_zeroes(__le32 *p, __le32 *q)
* c) free the subtrees growing from the inode past the @chain[0].
* (no partially truncated stuff there). */
-static Indirect *ext3_find_shared(struct inode *inode,
- int depth,
- int offsets[4],
- Indirect chain[4],
- __le32 *top)
+static Indirect *ext3_find_shared(struct inode *inode, int depth,
+ int offsets[4], Indirect chain[4], __le32 *top)
{
Indirect *partial, *p;
int k, err;
*top = 0;
- /* Make k index the deepest non-null offest + 1 */
+ /* Make k index the deepest non-null offset + 1 */
for (k = depth; k > 1 && !offsets[k-1]; k--)
;
partial = ext3_get_branch(inode, k, offsets, chain, &err);
@@ -1794,8 +2180,7 @@ static Indirect *ext3_find_shared(struct inode *inode,
}
/* Writer: end */
- while(partial > p)
- {
+ while(partial > p) {
brelse(partial->bh);
partial--;
}
@@ -1811,22 +2196,23 @@ no_top:
* We release `count' blocks on disk, but (last - first) may be greater
* than `count' because there can be holes in there.
*/
-static void
-ext3_clear_blocks(handle_t *handle, struct inode *inode, struct buffer_head *bh,
- unsigned long block_to_free, unsigned long count,
- __le32 *first, __le32 *last)
+static void ext3_clear_blocks(handle_t *handle, struct inode *inode,
+ struct buffer_head *bh, ext3_fsblk_t block_to_free,
+ unsigned long count, __le32 *first, __le32 *last)
{
__le32 *p;
if (try_to_extend_transaction(handle, inode)) {
if (bh) {
BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
- ext3_journal_dirty_metadata(handle, bh);
+ if (ext3_journal_dirty_metadata(handle, bh))
+ return;
}
ext3_mark_inode_dirty(handle, inode);
- ext3_journal_test_restart(handle, inode);
+ truncate_restart_transaction(handle, inode);
if (bh) {
BUFFER_TRACE(bh, "retaking write access");
- ext3_journal_get_write_access(handle, bh);
+ if (ext3_journal_get_write_access(handle, bh))
+ return;
}
}
@@ -1860,7 +2246,7 @@ ext3_clear_blocks(handle_t *handle, struct inode *inode, struct buffer_head *bh,
* @first: array of block numbers
* @last: points immediately past the end of array
*
- * We are freeing all blocks refered from that array (numbers are stored as
+ * We are freeing all blocks referred from that array (numbers are stored as
* little-endian 32-bit) and updating @inode->i_blocks appropriately.
*
* We accumulate contiguous runs of blocks to free. Conveniently, if these
@@ -1875,12 +2261,12 @@ static void ext3_free_data(handle_t *handle, struct inode *inode,
struct buffer_head *this_bh,
__le32 *first, __le32 *last)
{
- unsigned long block_to_free = 0; /* Starting block # of a run */
- unsigned long count = 0; /* Number of blocks in the run */
+ ext3_fsblk_t block_to_free = 0; /* Starting block # of a run */
+ unsigned long count = 0; /* Number of blocks in the run */
__le32 *block_to_free_p = NULL; /* Pointer into inode/ind
corresponding to
block_to_free */
- unsigned long nr; /* Current block # */
+ ext3_fsblk_t nr; /* Current block # */
__le32 *p; /* Pointer into inode/ind
for current block */
int err;
@@ -1905,7 +2291,7 @@ static void ext3_free_data(handle_t *handle, struct inode *inode,
} else if (nr == block_to_free + count) {
count++;
} else {
- ext3_clear_blocks(handle, inode, this_bh,
+ ext3_clear_blocks(handle, inode, this_bh,
block_to_free,
count, block_to_free_p, p);
block_to_free = nr;
@@ -1921,7 +2307,21 @@ static void ext3_free_data(handle_t *handle, struct inode *inode,
if (this_bh) {
BUFFER_TRACE(this_bh, "call ext3_journal_dirty_metadata");
- ext3_journal_dirty_metadata(handle, this_bh);
+
+ /*
+ * The buffer head should have an attached journal head at this
+ * point. However, if the data is corrupted and an indirect
+ * block pointed to itself, it would have been detached when
+ * the block was cleared. Check for this instead of OOPSing.
+ */
+ if (bh2jh(this_bh))
+ ext3_journal_dirty_metadata(handle, this_bh);
+ else
+ ext3_error(inode->i_sb, "ext3_free_data",
+ "circular indirect block detected, "
+ "inode=%lu, block=%llu",
+ inode->i_ino,
+ (unsigned long long)this_bh->b_blocknr);
}
}
@@ -1934,7 +2334,7 @@ static void ext3_free_data(handle_t *handle, struct inode *inode,
* @last: pointer immediately past the end of array
* @depth: depth of the branches to free
*
- * We are freeing all blocks refered from these branches (numbers are
+ * We are freeing all blocks referred from these branches (numbers are
* stored as little-endian 32-bit) and updating @inode->i_blocks
* appropriately.
*/
@@ -1942,7 +2342,7 @@ static void ext3_free_branches(handle_t *handle, struct inode *inode,
struct buffer_head *parent_bh,
__le32 *first, __le32 *last, int depth)
{
- unsigned long nr;
+ ext3_fsblk_t nr;
__le32 *p;
if (is_handle_aborted(handle))
@@ -1966,7 +2366,7 @@ static void ext3_free_branches(handle_t *handle, struct inode *inode,
*/
if (!bh) {
ext3_error(inode->i_sb, "ext3_free_branches",
- "Read failure, inode=%ld, block=%ld",
+ "Read failure, inode=%lu, block="E3FSBLK,
inode->i_ino, nr);
continue;
}
@@ -1979,27 +2379,6 @@ static void ext3_free_branches(handle_t *handle, struct inode *inode,
depth);
/*
- * We've probably journalled the indirect block several
- * times during the truncate. But it's no longer
- * needed and we now drop it from the transaction via
- * journal_revoke().
- *
- * That's easy if it's exclusively part of this
- * transaction. But if it's part of the committing
- * transaction then journal_forget() will simply
- * brelse() it. That means that if the underlying
- * block is reallocated in ext3_get_block(),
- * unmap_underlying_metadata() will find this block
- * and will try to get rid of it. damn, damn.
- *
- * If this block has already been committed to the
- * journal, a revoke record will be written. And
- * revoke records must be emitted *before* clearing
- * this block's bit in the bitmaps.
- */
- ext3_forget(handle, 1, inode, bh, bh->b_blocknr);
-
- /*
* Everything below this pointer has been
* released. Now let this top-of-subtree go.
*
@@ -2019,9 +2398,34 @@ static void ext3_free_branches(handle_t *handle, struct inode *inode,
return;
if (try_to_extend_transaction(handle, inode)) {
ext3_mark_inode_dirty(handle, inode);
- ext3_journal_test_restart(handle, inode);
+ truncate_restart_transaction(handle, inode);
}
+ /*
+ * We've probably journalled the indirect block several
+ * times during the truncate. But it's no longer
+ * needed and we now drop it from the transaction via
+ * journal_revoke().
+ *
+ * That's easy if it's exclusively part of this
+ * transaction. But if it's part of the committing
+ * transaction then journal_forget() will simply
+ * brelse() it. That means that if the underlying
+ * block is reallocated in ext3_get_block(),
+ * unmap_underlying_metadata() will find this block
+ * and will try to get rid of it. damn, damn. Thus
+ * we don't allow a block to be reallocated until
+ * a transaction freeing it has fully committed.
+ *
+ * We also have to make sure journal replay after a
+ * crash does not overwrite non-journaled data blocks
+ * with old metadata when the block got reallocated for
+ * data. Thus we have to store a revoke record for a
+ * block in the same transaction in which we free the
+ * block.
+ */
+ ext3_forget(handle, 1, inode, bh, bh->b_blocknr);
+
ext3_free_blocks(handle, inode, nr, 1);
if (parent_bh) {
@@ -2035,7 +2439,7 @@ static void ext3_free_branches(handle_t *handle, struct inode *inode,
*p = 0;
BUFFER_TRACE(parent_bh,
"call ext3_journal_dirty_metadata");
- ext3_journal_dirty_metadata(handle,
+ ext3_journal_dirty_metadata(handle,
parent_bh);
}
}
@@ -2047,6 +2451,17 @@ static void ext3_free_branches(handle_t *handle, struct inode *inode,
}
}
+int ext3_can_truncate(struct inode *inode)
+{
+ if (S_ISREG(inode->i_mode))
+ return 1;
+ if (S_ISDIR(inode->i_mode))
+ return 1;
+ if (S_ISLNK(inode->i_mode))
+ return !ext3_inode_is_fast_symlink(inode);
+ return 0;
+}
+
/*
* ext3_truncate()
*
@@ -2054,7 +2469,7 @@ static void ext3_free_branches(handle_t *handle, struct inode *inode,
* transaction, and VFS/VM ensures that ext3_truncate() cannot run
* simultaneously on behalf of the same inode.
*
- * As we work through the truncate and commmit bits of it to the journal there
+ * As we work through the truncate and commit bits of it to the journal there
* is one core, guiding principle: the file's tree must always be consistent on
* disk. We must be able to restart the truncate after a crash.
*
@@ -2075,14 +2490,12 @@ static void ext3_free_branches(handle_t *handle, struct inode *inode,
* that's fine - as long as they are linked from the inode, the post-crash
* ext3_truncate() run will find them and release them.
*/
-
-void ext3_truncate(struct inode * inode)
+void ext3_truncate(struct inode *inode)
{
handle_t *handle;
struct ext3_inode_info *ei = EXT3_I(inode);
__le32 *i_data = ei->i_data;
int addr_per_block = EXT3_ADDR_PER_BLOCK(inode->i_sb);
- struct address_space *mapping = inode->i_mapping;
int offsets[4];
Indirect chain[4];
Indirect *partial;
@@ -2090,47 +2503,21 @@ void ext3_truncate(struct inode * inode)
int n;
long last_block;
unsigned blocksize = inode->i_sb->s_blocksize;
- struct page *page;
- if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
- S_ISLNK(inode->i_mode)))
- return;
- if (ext3_inode_is_fast_symlink(inode))
- return;
- if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
- return;
+ trace_ext3_truncate_enter(inode);
- /*
- * We have to lock the EOF page here, because lock_page() nests
- * outside journal_start().
- */
- if ((inode->i_size & (blocksize - 1)) == 0) {
- /* Block boundary? Nothing to do */
- page = NULL;
- } else {
- page = grab_cache_page(mapping,
- inode->i_size >> PAGE_CACHE_SHIFT);
- if (!page)
- return;
- }
+ if (!ext3_can_truncate(inode))
+ goto out_notrans;
+
+ if (inode->i_size == 0 && ext3_should_writeback_data(inode))
+ ext3_set_inode_state(inode, EXT3_STATE_FLUSH_ON_CLOSE);
handle = start_transaction(inode);
- if (IS_ERR(handle)) {
- if (page) {
- clear_highpage(page);
- flush_dcache_page(page);
- unlock_page(page);
- page_cache_release(page);
- }
- return; /* AKPM: return what? */
- }
+ if (IS_ERR(handle))
+ goto out_notrans;
last_block = (inode->i_size + blocksize-1)
>> EXT3_BLOCK_SIZE_BITS(inode->i_sb);
-
- if (page)
- ext3_block_truncate_page(handle, page, mapping, inode->i_size);
-
n = ext3_block_to_path(inode, last_block, offsets, NULL);
if (n == 0)
goto out_stop; /* error */
@@ -2160,7 +2547,7 @@ void ext3_truncate(struct inode * inode)
* From here we block out all ext3_get_block() callers who want to
* modify the block allocation tree.
*/
- down(&ei->truncate_sem);
+ mutex_lock(&ei->truncate_mutex);
if (n == 1) { /* direct blocks */
ext3_free_data(handle, inode, NULL, i_data+offsets[0],
@@ -2182,7 +2569,6 @@ void ext3_truncate(struct inode * inode)
*/
} else {
/* Shared branch grows from an indirect block */
- BUFFER_TRACE(partial->bh, "get_write_access");
ext3_free_branches(handle, inode, partial->bh,
partial->p,
partial->p+1, (chain+n-1) - partial);
@@ -2200,39 +2586,38 @@ void ext3_truncate(struct inode * inode)
do_indirects:
/* Kill the remaining (whole) subtrees */
switch (offsets[0]) {
- default:
- nr = i_data[EXT3_IND_BLOCK];
- if (nr) {
- ext3_free_branches(handle, inode, NULL,
- &nr, &nr+1, 1);
- i_data[EXT3_IND_BLOCK] = 0;
- }
- case EXT3_IND_BLOCK:
- nr = i_data[EXT3_DIND_BLOCK];
- if (nr) {
- ext3_free_branches(handle, inode, NULL,
- &nr, &nr+1, 2);
- i_data[EXT3_DIND_BLOCK] = 0;
- }
- case EXT3_DIND_BLOCK:
- nr = i_data[EXT3_TIND_BLOCK];
- if (nr) {
- ext3_free_branches(handle, inode, NULL,
- &nr, &nr+1, 3);
- i_data[EXT3_TIND_BLOCK] = 0;
- }
- case EXT3_TIND_BLOCK:
- ;
+ default:
+ nr = i_data[EXT3_IND_BLOCK];
+ if (nr) {
+ ext3_free_branches(handle, inode, NULL, &nr, &nr+1, 1);
+ i_data[EXT3_IND_BLOCK] = 0;
+ }
+ case EXT3_IND_BLOCK:
+ nr = i_data[EXT3_DIND_BLOCK];
+ if (nr) {
+ ext3_free_branches(handle, inode, NULL, &nr, &nr+1, 2);
+ i_data[EXT3_DIND_BLOCK] = 0;
+ }
+ case EXT3_DIND_BLOCK:
+ nr = i_data[EXT3_TIND_BLOCK];
+ if (nr) {
+ ext3_free_branches(handle, inode, NULL, &nr, &nr+1, 3);
+ i_data[EXT3_TIND_BLOCK] = 0;
+ }
+ case EXT3_TIND_BLOCK:
+ ;
}
ext3_discard_reservation(inode);
- up(&ei->truncate_sem);
+ mutex_unlock(&ei->truncate_mutex);
inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC;
ext3_mark_inode_dirty(handle, inode);
- /* In a multi-transaction truncate, we only make the final
- * transaction synchronous */
+ /*
+ * In a multi-transaction truncate, we only make the final transaction
+ * synchronous
+ */
if (IS_SYNC(inode))
handle->h_sync = 1;
out_stop:
@@ -2240,57 +2625,52 @@ out_stop:
* If this was a simple ftruncate(), and the file will remain alive
* then we need to clear up the orphan record which we created above.
* However, if this was a real unlink then we were called by
- * ext3_delete_inode(), and we allow that function to clean up the
+ * ext3_evict_inode(), and we allow that function to clean up the
* orphan info for us.
*/
if (inode->i_nlink)
ext3_orphan_del(handle, inode);
ext3_journal_stop(handle);
+ trace_ext3_truncate_exit(inode);
+ return;
+out_notrans:
+ /*
+ * Delete the inode from orphan list so that it doesn't stay there
+ * forever and trigger assertion on umount.
+ */
+ if (inode->i_nlink)
+ ext3_orphan_del(NULL, inode);
+ trace_ext3_truncate_exit(inode);
}
-static unsigned long ext3_get_inode_block(struct super_block *sb,
+static ext3_fsblk_t ext3_get_inode_block(struct super_block *sb,
unsigned long ino, struct ext3_iloc *iloc)
{
- unsigned long desc, group_desc, block_group;
- unsigned long offset, block;
- struct buffer_head *bh;
- struct ext3_group_desc * gdp;
-
+ unsigned long block_group;
+ unsigned long offset;
+ ext3_fsblk_t block;
+ struct ext3_group_desc *gdp;
- if ((ino != EXT3_ROOT_INO &&
- ino != EXT3_JOURNAL_INO &&
- ino != EXT3_RESIZE_INO &&
- ino < EXT3_FIRST_INO(sb)) ||
- ino > le32_to_cpu(
- EXT3_SB(sb)->s_es->s_inodes_count)) {
- ext3_error (sb, "ext3_get_inode_block",
- "bad inode number: %lu", ino);
+ if (!ext3_valid_inum(sb, ino)) {
+ /*
+ * This error is already checked for in namei.c unless we are
+ * looking at an NFS filehandle, in which case no error
+ * report is needed
+ */
return 0;
}
+
block_group = (ino - 1) / EXT3_INODES_PER_GROUP(sb);
- if (block_group >= EXT3_SB(sb)->s_groups_count) {
- ext3_error (sb, "ext3_get_inode_block",
- "group >= groups count");
- return 0;
- }
- smp_rmb();
- group_desc = block_group >> EXT3_DESC_PER_BLOCK_BITS(sb);
- desc = block_group & (EXT3_DESC_PER_BLOCK(sb) - 1);
- bh = EXT3_SB(sb)->s_group_desc[group_desc];
- if (!bh) {
- ext3_error (sb, "ext3_get_inode_block",
- "Descriptor not loaded");
+ gdp = ext3_get_group_desc(sb, block_group, NULL);
+ if (!gdp)
return 0;
- }
-
- gdp = (struct ext3_group_desc *) bh->b_data;
/*
* Figure out the offset within the block group inode table
*/
offset = ((ino - 1) % EXT3_INODES_PER_GROUP(sb)) *
EXT3_INODE_SIZE(sb);
- block = le32_to_cpu(gdp[desc].bg_inode_table) +
+ block = le32_to_cpu(gdp->bg_inode_table) +
(offset >> EXT3_BLOCK_SIZE_BITS(sb));
iloc->block_group = block_group;
@@ -2307,7 +2687,7 @@ static unsigned long ext3_get_inode_block(struct super_block *sb,
static int __ext3_get_inode_loc(struct inode *inode,
struct ext3_iloc *iloc, int in_mem)
{
- unsigned long block;
+ ext3_fsblk_t block;
struct buffer_head *bh;
block = ext3_get_inode_block(inode->i_sb, inode->i_ino, iloc);
@@ -2315,14 +2695,25 @@ static int __ext3_get_inode_loc(struct inode *inode,
return -EIO;
bh = sb_getblk(inode->i_sb, block);
- if (!bh) {
+ if (unlikely(!bh)) {
ext3_error (inode->i_sb, "ext3_get_inode_loc",
"unable to read inode block - "
- "inode=%lu, block=%lu", inode->i_ino, block);
- return -EIO;
+ "inode=%lu, block="E3FSBLK,
+ inode->i_ino, block);
+ return -ENOMEM;
}
if (!buffer_uptodate(bh)) {
lock_buffer(bh);
+
+ /*
+ * If the buffer has the write error flag, we have failed
+ * to write out another inode in the same block. In this
+ * case, we don't have to read the block because we may
+ * read the old inode data successfully.
+ */
+ if (buffer_write_io_error(bh) && !buffer_uptodate(bh))
+ set_buffer_uptodate(bh);
+
if (buffer_uptodate(bh)) {
/* someone brought it uptodate while we waited */
unlock_buffer(bh);
@@ -2358,7 +2749,7 @@ static int __ext3_get_inode_loc(struct inode *inode,
bitmap_bh = sb_getblk(inode->i_sb,
le32_to_cpu(desc->bg_inode_bitmap));
- if (!bitmap_bh)
+ if (unlikely(!bitmap_bh))
goto make_io;
/*
@@ -2392,14 +2783,15 @@ make_io:
* has in-inode xattrs, or we don't have this inode in memory.
* Read the block from disk.
*/
+ trace_ext3_load_inode(inode);
get_bh(bh);
bh->b_end_io = end_buffer_read_sync;
- submit_bh(READ, bh);
+ submit_bh(READ | REQ_META | REQ_PRIO, bh);
wait_on_buffer(bh);
if (!buffer_uptodate(bh)) {
ext3_error(inode->i_sb, "ext3_get_inode_loc",
"unable to read inode block - "
- "inode=%lu, block=%lu",
+ "inode=%lu, block="E3FSBLK,
inode->i_ino, block);
brelse(bh);
return -EIO;
@@ -2414,7 +2806,7 @@ int ext3_get_inode_loc(struct inode *inode, struct ext3_iloc *iloc)
{
/* We have all inode data except xattrs in memory here. */
return __ext3_get_inode_loc(inode, iloc,
- !(EXT3_I(inode)->i_state & EXT3_STATE_XATTR));
+ !ext3_test_inode_state(inode, EXT3_STATE_XATTR));
}
void ext3_set_inode_flags(struct inode *inode)
@@ -2434,39 +2826,70 @@ void ext3_set_inode_flags(struct inode *inode)
inode->i_flags |= S_DIRSYNC;
}
-void ext3_read_inode(struct inode * inode)
+/* Propagate flags from i_flags to EXT3_I(inode)->i_flags */
+void ext3_get_inode_flags(struct ext3_inode_info *ei)
+{
+ unsigned int flags = ei->vfs_inode.i_flags;
+
+ ei->i_flags &= ~(EXT3_SYNC_FL|EXT3_APPEND_FL|
+ EXT3_IMMUTABLE_FL|EXT3_NOATIME_FL|EXT3_DIRSYNC_FL);
+ if (flags & S_SYNC)
+ ei->i_flags |= EXT3_SYNC_FL;
+ if (flags & S_APPEND)
+ ei->i_flags |= EXT3_APPEND_FL;
+ if (flags & S_IMMUTABLE)
+ ei->i_flags |= EXT3_IMMUTABLE_FL;
+ if (flags & S_NOATIME)
+ ei->i_flags |= EXT3_NOATIME_FL;
+ if (flags & S_DIRSYNC)
+ ei->i_flags |= EXT3_DIRSYNC_FL;
+}
+
+struct inode *ext3_iget(struct super_block *sb, unsigned long ino)
{
struct ext3_iloc iloc;
struct ext3_inode *raw_inode;
- struct ext3_inode_info *ei = EXT3_I(inode);
+ struct ext3_inode_info *ei;
struct buffer_head *bh;
+ struct inode *inode;
+ journal_t *journal = EXT3_SB(sb)->s_journal;
+ transaction_t *transaction;
+ long ret;
int block;
+ uid_t i_uid;
+ gid_t i_gid;
-#ifdef CONFIG_EXT3_FS_POSIX_ACL
- ei->i_acl = EXT3_ACL_NOT_CACHED;
- ei->i_default_acl = EXT3_ACL_NOT_CACHED;
-#endif
+ inode = iget_locked(sb, ino);
+ if (!inode)
+ return ERR_PTR(-ENOMEM);
+ if (!(inode->i_state & I_NEW))
+ return inode;
+
+ ei = EXT3_I(inode);
ei->i_block_alloc_info = NULL;
- if (__ext3_get_inode_loc(inode, &iloc, 0))
+ ret = __ext3_get_inode_loc(inode, &iloc, 0);
+ if (ret < 0)
goto bad_inode;
bh = iloc.bh;
raw_inode = ext3_raw_inode(&iloc);
inode->i_mode = le16_to_cpu(raw_inode->i_mode);
- inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low);
- inode->i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low);
+ i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low);
+ i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low);
if(!(test_opt (inode->i_sb, NO_UID32))) {
- inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16;
- inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16;
+ i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16;
+ i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16;
}
- inode->i_nlink = le16_to_cpu(raw_inode->i_links_count);
+ i_uid_write(inode, i_uid);
+ i_gid_write(inode, i_gid);
+ set_nlink(inode, le16_to_cpu(raw_inode->i_links_count));
inode->i_size = le32_to_cpu(raw_inode->i_size);
- inode->i_atime.tv_sec = le32_to_cpu(raw_inode->i_atime);
- inode->i_ctime.tv_sec = le32_to_cpu(raw_inode->i_ctime);
- inode->i_mtime.tv_sec = le32_to_cpu(raw_inode->i_mtime);
+ inode->i_atime.tv_sec = (signed)le32_to_cpu(raw_inode->i_atime);
+ inode->i_ctime.tv_sec = (signed)le32_to_cpu(raw_inode->i_ctime);
+ inode->i_mtime.tv_sec = (signed)le32_to_cpu(raw_inode->i_mtime);
inode->i_atime.tv_nsec = inode->i_ctime.tv_nsec = inode->i_mtime.tv_nsec = 0;
- ei->i_state = 0;
+ ei->i_state_flags = 0;
ei->i_dir_start_lookup = 0;
ei->i_dtime = le32_to_cpu(raw_inode->i_dtime);
/* We now have enough fields to check if the inode was active or not.
@@ -2479,6 +2902,7 @@ void ext3_read_inode(struct inode * inode)
!(EXT3_SB(inode->i_sb)->s_mount_state & EXT3_ORPHAN_FS)) {
/* this inode is deleted */
brelse (bh);
+ ret = -ESTALE;
goto bad_inode;
}
/* The only unlinked inodes we let through here have
@@ -2486,9 +2910,6 @@ void ext3_read_inode(struct inode * inode)
* recovery code: that's fine, we're about to complete
* the process of deleting those. */
}
- inode->i_blksize = PAGE_SIZE; /* This is the optimal IO size
- * (for stat), not the fs block
- * size */
inode->i_blocks = le32_to_cpu(raw_inode->i_blocks);
ei->i_flags = le32_to_cpu(raw_inode->i_flags);
#ifdef EXT3_FRAGMENTS
@@ -2514,6 +2935,30 @@ void ext3_read_inode(struct inode * inode)
ei->i_data[block] = raw_inode->i_block[block];
INIT_LIST_HEAD(&ei->i_orphan);
+ /*
+ * Set transaction id's of transactions that have to be committed
+ * to finish f[data]sync. We set them to currently running transaction
+ * as we cannot be sure that the inode or some of its metadata isn't
+ * part of the transaction - the inode could have been reclaimed and
+ * now it is reread from disk.
+ */
+ if (journal) {
+ tid_t tid;
+
+ spin_lock(&journal->j_state_lock);
+ if (journal->j_running_transaction)
+ transaction = journal->j_running_transaction;
+ else
+ transaction = journal->j_committing_transaction;
+ if (transaction)
+ tid = transaction->t_tid;
+ else
+ tid = journal->j_commit_sequence;
+ spin_unlock(&journal->j_state_lock);
+ atomic_set(&ei->i_sync_tid, tid);
+ atomic_set(&ei->i_datasync_tid, tid);
+ }
+
if (inode->i_ino >= EXT3_FIRST_INO(inode->i_sb) + 1 &&
EXT3_INODE_SIZE(inode->i_sb) > EXT3_GOOD_OLD_INODE_SIZE) {
/*
@@ -2523,8 +2968,11 @@ void ext3_read_inode(struct inode * inode)
*/
ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize);
if (EXT3_GOOD_OLD_INODE_SIZE + ei->i_extra_isize >
- EXT3_INODE_SIZE(inode->i_sb))
+ EXT3_INODE_SIZE(inode->i_sb)) {
+ brelse (bh);
+ ret = -EIO;
goto bad_inode;
+ }
if (ei->i_extra_isize == 0) {
/* The extra space is currently unused. Use it. */
ei->i_extra_isize = sizeof(struct ext3_inode) -
@@ -2534,7 +2982,7 @@ void ext3_read_inode(struct inode * inode)
EXT3_GOOD_OLD_INODE_SIZE +
ei->i_extra_isize;
if (*magic == cpu_to_le32(EXT3_XATTR_MAGIC))
- ei->i_state |= EXT3_STATE_XATTR;
+ ext3_set_inode_state(inode, EXT3_STATE_XATTR);
}
} else
ei->i_extra_isize = 0;
@@ -2547,9 +2995,11 @@ void ext3_read_inode(struct inode * inode)
inode->i_op = &ext3_dir_inode_operations;
inode->i_fop = &ext3_dir_operations;
} else if (S_ISLNK(inode->i_mode)) {
- if (ext3_inode_is_fast_symlink(inode))
+ if (ext3_inode_is_fast_symlink(inode)) {
inode->i_op = &ext3_fast_symlink_inode_operations;
- else {
+ nd_terminate_link(ei->i_data, inode->i_size,
+ sizeof(ei->i_data) - 1);
+ } else {
inode->i_op = &ext3_symlink_inode_operations;
ext3_set_aops(inode);
}
@@ -2558,17 +3008,18 @@ void ext3_read_inode(struct inode * inode)
if (raw_inode->i_block[0])
init_special_inode(inode, inode->i_mode,
old_decode_dev(le32_to_cpu(raw_inode->i_block[0])));
- else
+ else
init_special_inode(inode, inode->i_mode,
new_decode_dev(le32_to_cpu(raw_inode->i_block[1])));
}
brelse (iloc.bh);
ext3_set_inode_flags(inode);
- return;
+ unlock_new_inode(inode);
+ return inode;
bad_inode:
- make_bad_inode(inode);
- return;
+ iget_failed(inode);
+ return ERR_PTR(ret);
}
/*
@@ -2578,47 +3029,62 @@ bad_inode:
*
* The caller must have write access to iloc->bh.
*/
-static int ext3_do_update_inode(handle_t *handle,
- struct inode *inode,
+static int ext3_do_update_inode(handle_t *handle,
+ struct inode *inode,
struct ext3_iloc *iloc)
{
struct ext3_inode *raw_inode = ext3_raw_inode(iloc);
struct ext3_inode_info *ei = EXT3_I(inode);
struct buffer_head *bh = iloc->bh;
int err = 0, rc, block;
+ int need_datasync = 0;
+ __le32 disksize;
+ uid_t i_uid;
+ gid_t i_gid;
+
+again:
+ /* we can't allow multiple procs in here at once, it's a bit racy */
+ lock_buffer(bh);
/* For fields not not tracking in the in-memory inode,
* initialise them to zero for new inodes. */
- if (ei->i_state & EXT3_STATE_NEW)
+ if (ext3_test_inode_state(inode, EXT3_STATE_NEW))
memset(raw_inode, 0, EXT3_SB(inode->i_sb)->s_inode_size);
+ ext3_get_inode_flags(ei);
raw_inode->i_mode = cpu_to_le16(inode->i_mode);
+ i_uid = i_uid_read(inode);
+ i_gid = i_gid_read(inode);
if(!(test_opt(inode->i_sb, NO_UID32))) {
- raw_inode->i_uid_low = cpu_to_le16(low_16_bits(inode->i_uid));
- raw_inode->i_gid_low = cpu_to_le16(low_16_bits(inode->i_gid));
+ raw_inode->i_uid_low = cpu_to_le16(low_16_bits(i_uid));
+ raw_inode->i_gid_low = cpu_to_le16(low_16_bits(i_gid));
/*
* Fix up interoperability with old kernels. Otherwise, old inodes get
* re-used with the upper 16 bits of the uid/gid intact
*/
if(!ei->i_dtime) {
raw_inode->i_uid_high =
- cpu_to_le16(high_16_bits(inode->i_uid));
+ cpu_to_le16(high_16_bits(i_uid));
raw_inode->i_gid_high =
- cpu_to_le16(high_16_bits(inode->i_gid));
+ cpu_to_le16(high_16_bits(i_gid));
} else {
raw_inode->i_uid_high = 0;
raw_inode->i_gid_high = 0;
}
} else {
raw_inode->i_uid_low =
- cpu_to_le16(fs_high2lowuid(inode->i_uid));
+ cpu_to_le16(fs_high2lowuid(i_uid));
raw_inode->i_gid_low =
- cpu_to_le16(fs_high2lowgid(inode->i_gid));
+ cpu_to_le16(fs_high2lowgid(i_gid));
raw_inode->i_uid_high = 0;
raw_inode->i_gid_high = 0;
}
raw_inode->i_links_count = cpu_to_le16(inode->i_nlink);
- raw_inode->i_size = cpu_to_le32(ei->i_disksize);
+ disksize = cpu_to_le32(ei->i_disksize);
+ if (disksize != raw_inode->i_size) {
+ need_datasync = 1;
+ raw_inode->i_size = disksize;
+ }
raw_inode->i_atime = cpu_to_le32(inode->i_atime.tv_sec);
raw_inode->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec);
raw_inode->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec);
@@ -2634,8 +3100,11 @@ static int ext3_do_update_inode(handle_t *handle,
if (!S_ISREG(inode->i_mode)) {
raw_inode->i_dir_acl = cpu_to_le32(ei->i_dir_acl);
} else {
- raw_inode->i_size_high =
- cpu_to_le32(ei->i_disksize >> 32);
+ disksize = cpu_to_le32(ei->i_disksize >> 32);
+ if (disksize != raw_inode->i_size_high) {
+ raw_inode->i_size_high = disksize;
+ need_datasync = 1;
+ }
if (ei->i_disksize > 0x7fffffffULL) {
struct super_block *sb = inode->i_sb;
if (!EXT3_HAS_RO_COMPAT_FEATURE(sb,
@@ -2645,17 +3114,20 @@ static int ext3_do_update_inode(handle_t *handle,
/* If this is the first large file
* created, add a flag to the superblock.
*/
+ unlock_buffer(bh);
err = ext3_journal_get_write_access(handle,
EXT3_SB(sb)->s_sbh);
if (err)
goto out_brelse;
+
ext3_update_dynamic_rev(sb);
EXT3_SET_RO_COMPAT_FEATURE(sb,
EXT3_FEATURE_RO_COMPAT_LARGE_FILE);
- sb->s_dirt = 1;
handle->h_sync = 1;
err = ext3_journal_dirty_metadata(handle,
EXT3_SB(sb)->s_sbh);
+ /* get our lock and start over */
+ goto again;
}
}
}
@@ -2678,11 +3150,15 @@ static int ext3_do_update_inode(handle_t *handle,
raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize);
BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
+ unlock_buffer(bh);
rc = ext3_journal_dirty_metadata(handle, bh);
if (!err)
err = rc;
- ei->i_state &= ~EXT3_STATE_NEW;
+ ext3_clear_inode_state(inode, EXT3_STATE_NEW);
+ atomic_set(&ei->i_sync_tid, handle->h_transaction->t_tid);
+ if (need_datasync)
+ atomic_set(&ei->i_datasync_tid, handle->h_transaction->t_tid);
out_brelse:
brelse (bh);
ext3_std_error(inode->i_sb, err);
@@ -2694,21 +3170,20 @@ out_brelse:
*
* We are called from a few places:
*
- * - Within generic_file_write() for O_SYNC files.
+ * - Within generic_file_aio_write() -> generic_write_sync() for O_SYNC files.
* Here, there will be no transaction running. We wait for any running
- * trasnaction to commit.
+ * transaction to commit.
*
- * - Within sys_sync(), kupdate and such.
- * We wait on commit, if tol to.
+ * - Within flush work (for sys_sync(), kupdate and such).
+ * We wait on commit, if told to.
*
- * - Within prune_icache() (PF_MEMALLOC == true)
- * Here we simply return. We can't afford to block kswapd on the
- * journal commit.
+ * - Within iput_final() -> write_inode_now()
+ * We wait on commit, if told to.
*
* In all cases it is actually safe for us to return without doing anything,
* because the inode has been copied into a raw inode buffer in
- * ext3_mark_inode_dirty(). This is a correctness thing for O_SYNC and for
- * knfsd.
+ * ext3_mark_inode_dirty(). This is a correctness thing for WB_SYNC_ALL
+ * writeback.
*
* Note that we are absolutely dependent upon all inode dirtiers doing the
* right thing: they *must* call mark_inode_dirty() after dirtying info in
@@ -2720,22 +3195,27 @@ out_brelse:
* stuff();
* inode->i_size = expr;
*
- * is in error because a kswapd-driven write_inode() could occur while
- * `stuff()' is running, and the new i_size will be lost. Plus the inode
- * will no longer be on the superblock's dirty inode list.
+ * is in error because write_inode() could occur while `stuff()' is running,
+ * and the new i_size will be lost. Plus the inode will no longer be on the
+ * superblock's dirty inode list.
*/
-int ext3_write_inode(struct inode *inode, int wait)
+int ext3_write_inode(struct inode *inode, struct writeback_control *wbc)
{
- if (current->flags & PF_MEMALLOC)
+ if (WARN_ON_ONCE(current->flags & PF_MEMALLOC))
return 0;
if (ext3_journal_current_handle()) {
- jbd_debug(0, "called recursively, non-PF_MEMALLOC!\n");
+ jbd_debug(1, "called recursively, non-PF_MEMALLOC!\n");
dump_stack();
return -EIO;
}
- if (!wait)
+ /*
+ * No need to force transaction in WB_SYNC_NONE mode. Also
+ * ext3_sync_fs() will force the commit after everything is
+ * written.
+ */
+ if (wbc->sync_mode != WB_SYNC_ALL || wbc->for_sync)
return 0;
return ext3_force_commit(inode->i_sb);
@@ -2754,7 +3234,7 @@ int ext3_write_inode(struct inode *inode, int wait)
* commit will leave the blocks being flushed in an unused state on
* disk. (On recovery, the inode will get truncated and the blocks will
* be freed, so we have a strong guarantee that no future commit will
- * leave these blocks visible to the user.)
+ * leave these blocks visible to the user.)
*
* Called with inode->sem down.
*/
@@ -2768,19 +3248,21 @@ int ext3_setattr(struct dentry *dentry, struct iattr *attr)
if (error)
return error;
- if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
- (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) {
+ if (is_quota_modification(inode, attr))
+ dquot_initialize(inode);
+ if ((ia_valid & ATTR_UID && !uid_eq(attr->ia_uid, inode->i_uid)) ||
+ (ia_valid & ATTR_GID && !gid_eq(attr->ia_gid, inode->i_gid))) {
handle_t *handle;
/* (user+group)*(old+new) structure, inode write (sb,
* inode block, ? - but truncate inode update has it) */
- handle = ext3_journal_start(inode, 2*(EXT3_QUOTA_INIT_BLOCKS(inode->i_sb)+
- EXT3_QUOTA_DEL_BLOCKS(inode->i_sb))+3);
+ handle = ext3_journal_start(inode, EXT3_MAXQUOTAS_INIT_BLOCKS(inode->i_sb)+
+ EXT3_MAXQUOTAS_DEL_BLOCKS(inode->i_sb)+3);
if (IS_ERR(handle)) {
error = PTR_ERR(handle);
goto err_out;
}
- error = DQUOT_TRANSFER(inode, attr) ? -EDQUOT : 0;
+ error = dquot_transfer(inode, attr);
if (error) {
ext3_journal_stop(handle);
return error;
@@ -2795,6 +3277,9 @@ int ext3_setattr(struct dentry *dentry, struct iattr *attr)
ext3_journal_stop(handle);
}
+ if (attr->ia_valid & ATTR_SIZE)
+ inode_dio_wait(inode);
+
if (S_ISREG(inode->i_mode) &&
attr->ia_valid & ATTR_SIZE && attr->ia_size < inode->i_size) {
handle_t *handle;
@@ -2806,23 +3291,43 @@ int ext3_setattr(struct dentry *dentry, struct iattr *attr)
}
error = ext3_orphan_add(handle, inode);
+ if (error) {
+ ext3_journal_stop(handle);
+ goto err_out;
+ }
EXT3_I(inode)->i_disksize = attr->ia_size;
- rc = ext3_mark_inode_dirty(handle, inode);
- if (!error)
- error = rc;
+ error = ext3_mark_inode_dirty(handle, inode);
ext3_journal_stop(handle);
+ if (error) {
+ /* Some hard fs error must have happened. Bail out. */
+ ext3_orphan_del(NULL, inode);
+ goto err_out;
+ }
+ rc = ext3_block_truncate_page(inode, attr->ia_size);
+ if (rc) {
+ /* Cleanup orphan list and exit */
+ handle = ext3_journal_start(inode, 3);
+ if (IS_ERR(handle)) {
+ ext3_orphan_del(NULL, inode);
+ goto err_out;
+ }
+ ext3_orphan_del(handle, inode);
+ ext3_journal_stop(handle);
+ goto err_out;
+ }
}
- rc = inode_setattr(inode, attr);
+ if ((attr->ia_valid & ATTR_SIZE) &&
+ attr->ia_size != i_size_read(inode)) {
+ truncate_setsize(inode, attr->ia_size);
+ ext3_truncate(inode);
+ }
- /* If inode_setattr's call to ext3_truncate failed to get a
- * transaction handle at all, we need to clean up the in-core
- * orphan list manually. */
- if (inode->i_nlink)
- ext3_orphan_del(NULL, inode);
+ setattr_copy(inode, attr);
+ mark_inode_dirty(inode);
- if (!rc && (ia_valid & ATTR_MODE))
- rc = ext3_acl_chmod(inode);
+ if (ia_valid & ATTR_MODE)
+ rc = posix_acl_chmod(inode, inode->i_mode);
err_out:
ext3_std_error(inode->i_sb, error);
@@ -2833,7 +3338,7 @@ err_out:
/*
- * akpm: how many blocks doth make a writepage()?
+ * How many blocks doth make a writepage()?
*
* With N blocks per page, it may be:
* N data blocks
@@ -2868,12 +3373,12 @@ static int ext3_writepage_trans_blocks(struct inode *inode)
if (ext3_should_journal_data(inode))
ret = 3 * (bpp + indirects) + 2;
else
- ret = 2 * (bpp + indirects) + 2;
+ ret = 2 * (bpp + indirects) + indirects + 2;
#ifdef CONFIG_QUOTA
- /* We know that structure was already allocated during DQUOT_INIT so
+ /* We know that structure was already allocated during dquot_initialize so
* we will be updating only the data blocks + inodes */
- ret += 2*EXT3_QUOTA_TRANS_BLOCKS(inode->i_sb);
+ ret += EXT3_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb);
#endif
return ret;
@@ -2897,13 +3402,13 @@ int ext3_mark_iloc_dirty(handle_t *handle,
return err;
}
-/*
+/*
* On success, We end up with an outstanding reference count against
- * iloc->bh. This _must_ be cleaned up later.
+ * iloc->bh. This _must_ be cleaned up later.
*/
int
-ext3_reserve_inode_write(handle_t *handle, struct inode *inode,
+ext3_reserve_inode_write(handle_t *handle, struct inode *inode,
struct ext3_iloc *iloc)
{
int err = 0;
@@ -2923,8 +3428,8 @@ ext3_reserve_inode_write(handle_t *handle, struct inode *inode,
}
/*
- * akpm: What we do here is to mark the in-core inode as clean
- * with respect to inode dirtiness (it may still be data-dirty).
+ * What we do here is to mark the in-core inode as clean with respect to inode
+ * dirtiness (it may still be data-dirty).
* This means that the in-core inode may be reaped by prune_icache
* without having to perform any I/O. This is a very good thing,
* because *any* task may call prune_icache - even ones which
@@ -2934,14 +3439,6 @@ ext3_reserve_inode_write(handle_t *handle, struct inode *inode,
* inode out, but prune_icache isn't a user-visible syncing function.
* Whenever the user wants stuff synced (sys_sync, sys_msync, sys_fsync)
* we start and wait on commits.
- *
- * Is this efficient/effective? Well, we're being nice to the system
- * by cleaning up our inodes proactively so they can be reaped
- * without I/O. But we are potentially leaving up to five seconds'
- * worth of inodes floating about which prune_icache wants us to
- * write out. One way to fix that would be to get prune_icache()
- * to do a write_super() to free up some memory. It has the desired
- * effect.
*/
int ext3_mark_inode_dirty(handle_t *handle, struct inode *inode)
{
@@ -2949,6 +3446,7 @@ int ext3_mark_inode_dirty(handle_t *handle, struct inode *inode)
int err;
might_sleep();
+ trace_ext3_mark_inode_dirty(inode, _RET_IP_);
err = ext3_reserve_inode_write(handle, inode, &iloc);
if (!err)
err = ext3_mark_iloc_dirty(handle, inode, &iloc);
@@ -2956,20 +3454,20 @@ int ext3_mark_inode_dirty(handle_t *handle, struct inode *inode)
}
/*
- * akpm: ext3_dirty_inode() is called from __mark_inode_dirty()
+ * ext3_dirty_inode() is called from __mark_inode_dirty()
*
* We're really interested in the case where a file is being extended.
* i_size has been changed by generic_commit_write() and we thus need
* to include the updated inode in the current transaction.
*
- * Also, DQUOT_ALLOC_SPACE() will always dirty the inode when blocks
+ * Also, dquot_alloc_space() will always dirty the inode when blocks
* are allocated to the file.
*
* If the inode is marked synchronous, we don't honour that here - doing
* so would cause a commit on atime updates, which we don't bother doing.
* We handle synchronous inodes at the highest possible level.
*/
-void ext3_dirty_inode(struct inode *inode)
+void ext3_dirty_inode(struct inode *inode, int flags)
{
handle_t *current_handle = ext3_journal_current_handle();
handle_t *handle;
@@ -2981,7 +3479,7 @@ void ext3_dirty_inode(struct inode *inode)
current_handle->h_transaction != handle->h_transaction) {
/* This task has a transaction open against a different fs */
printk(KERN_EMERG "%s: transactions do not match!\n",
- __FUNCTION__);
+ __func__);
} else {
jbd_debug(5, "marking dirty. outer handle=%p\n",
current_handle);
@@ -2992,16 +3490,15 @@ out:
return;
}
-#ifdef AKPM
-/*
+#if 0
+/*
* Bind an inode's backing buffer_head into this transaction, to prevent
* it from being flushed to disk early. Unlike
* ext3_reserve_inode_write, this leaves behind no bh reference and
* returns no iloc structure, so the caller needs to repeat the iloc
* lookup to mark the inode dirty later.
*/
-static inline int
-ext3_pin_inode(handle_t *handle, struct inode *inode)
+static int ext3_pin_inode(handle_t *handle, struct inode *inode)
{
struct ext3_iloc iloc;
@@ -3012,7 +3509,7 @@ ext3_pin_inode(handle_t *handle, struct inode *inode)
BUFFER_TRACE(iloc.bh, "get_write_access");
err = journal_get_write_access(handle, iloc.bh);
if (!err)
- err = ext3_journal_dirty_metadata(handle,
+ err = ext3_journal_dirty_metadata(handle,
iloc.bh);
brelse(iloc.bh);
}
@@ -3039,7 +3536,7 @@ int ext3_change_inode_journal_flag(struct inode *inode, int val)
*/
journal = EXT3_JOURNAL(inode);
- if (is_journal_aborted(journal) || IS_RDONLY(inode))
+ if (is_journal_aborted(journal))
return -EROFS;
journal_lock_updates(journal);
diff --git a/fs/ext3/ioctl.c b/fs/ext3/ioctl.c
index 556cd551007..4d96e9a6453 100644
--- a/fs/ext3/ioctl.c
+++ b/fs/ext3/ioctl.c
@@ -7,18 +7,14 @@
* Universite Pierre et Marie Curie (Paris VI)
*/
-#include <linux/fs.h>
-#include <linux/jbd.h>
-#include <linux/capability.h>
-#include <linux/ext3_fs.h>
-#include <linux/ext3_jbd.h>
-#include <linux/time.h>
+#include <linux/mount.h>
+#include <linux/compat.h>
#include <asm/uaccess.h>
+#include "ext3.h"
-
-int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
- unsigned long arg)
+long ext3_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
+ struct inode *inode = file_inode(filp);
struct ext3_inode_info *ei = EXT3_I(inode);
unsigned int flags;
unsigned short rsv_window_size;
@@ -27,6 +23,7 @@ int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
switch (cmd) {
case EXT3_IOC_GETFLAGS:
+ ext3_get_inode_flags(ei);
flags = ei->i_flags & EXT3_FL_USER_VISIBLE;
return put_user(flags, (int __user *) arg);
case EXT3_IOC_SETFLAGS: {
@@ -36,17 +33,24 @@ int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
unsigned int oldflags;
unsigned int jflag;
- if (IS_RDONLY(inode))
- return -EROFS;
-
- if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER))
+ if (!inode_owner_or_capable(inode))
return -EACCES;
if (get_user(flags, (int __user *) arg))
return -EFAULT;
- if (!S_ISDIR(inode->i_mode))
- flags &= ~EXT3_DIRSYNC_FL;
+ err = mnt_want_write_file(filp);
+ if (err)
+ return err;
+
+ flags = ext3_mask_flags(inode->i_mode, flags);
+
+ mutex_lock(&inode->i_mutex);
+
+ /* Is it quota file? Do not allow user to mess with it */
+ err = -EPERM;
+ if (IS_NOQUOTA(inode))
+ goto flags_out;
oldflags = ei->i_flags;
@@ -61,7 +65,7 @@ int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
*/
if ((flags ^ oldflags) & (EXT3_APPEND_FL | EXT3_IMMUTABLE_FL)) {
if (!capable(CAP_LINUX_IMMUTABLE))
- return -EPERM;
+ goto flags_out;
}
/*
@@ -70,13 +74,14 @@ int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
*/
if ((jflag ^ oldflags) & (EXT3_JOURNAL_DATA_FL)) {
if (!capable(CAP_SYS_RESOURCE))
- return -EPERM;
+ goto flags_out;
}
-
handle = ext3_journal_start(inode, 1);
- if (IS_ERR(handle))
- return PTR_ERR(handle);
+ if (IS_ERR(handle)) {
+ err = PTR_ERR(handle);
+ goto flags_out;
+ }
if (IS_SYNC(inode))
handle->h_sync = 1;
err = ext3_reserve_inode_write(handle, inode, &iloc);
@@ -94,10 +99,13 @@ int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
flags_err:
ext3_journal_stop(handle);
if (err)
- return err;
+ goto flags_out;
if ((jflag ^ oldflags) & (EXT3_JOURNAL_DATA_FL))
err = ext3_change_inode_journal_flag(inode, jflag);
+flags_out:
+ mutex_unlock(&inode->i_mutex);
+ mnt_drop_write_file(filp);
return err;
}
case EXT3_IOC_GETVERSION:
@@ -110,16 +118,23 @@ flags_err:
__u32 generation;
int err;
- if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER))
+ if (!inode_owner_or_capable(inode))
return -EPERM;
- if (IS_RDONLY(inode))
- return -EROFS;
- if (get_user(generation, (int __user *) arg))
- return -EFAULT;
+ err = mnt_want_write_file(filp);
+ if (err)
+ return err;
+ if (get_user(generation, (int __user *) arg)) {
+ err = -EFAULT;
+ goto setversion_out;
+ }
+
+ mutex_lock(&inode->i_mutex);
handle = ext3_journal_start(inode, 1);
- if (IS_ERR(handle))
- return PTR_ERR(handle);
+ if (IS_ERR(handle)) {
+ err = PTR_ERR(handle);
+ goto unlock_out;
+ }
err = ext3_reserve_inode_write(handle, inode, &iloc);
if (err == 0) {
inode->i_ctime = CURRENT_TIME_SEC;
@@ -127,32 +142,13 @@ flags_err:
err = ext3_mark_iloc_dirty(handle, inode, &iloc);
}
ext3_journal_stop(handle);
+
+unlock_out:
+ mutex_unlock(&inode->i_mutex);
+setversion_out:
+ mnt_drop_write_file(filp);
return err;
}
-#ifdef CONFIG_JBD_DEBUG
- case EXT3_IOC_WAIT_FOR_READONLY:
- /*
- * This is racy - by the time we're woken up and running,
- * the superblock could be released. And the module could
- * have been unloaded. So sue me.
- *
- * Returns 1 if it slept, else zero.
- */
- {
- struct super_block *sb = inode->i_sb;
- DECLARE_WAITQUEUE(wait, current);
- int ret = 0;
-
- set_current_state(TASK_INTERRUPTIBLE);
- add_wait_queue(&EXT3_SB(sb)->ro_wait_queue, &wait);
- if (timer_pending(&EXT3_SB(sb)->turn_ro_timer)) {
- schedule();
- ret = 1;
- }
- remove_wait_queue(&EXT3_SB(sb)->ro_wait_queue, &wait);
- return ret;
- }
-#endif
case EXT3_IOC_GETRSVSZ:
if (test_opt(inode->i_sb, RESERVATION)
&& S_ISREG(inode->i_mode)
@@ -162,18 +158,24 @@ flags_err:
}
return -ENOTTY;
case EXT3_IOC_SETRSVSZ: {
+ int err;
if (!test_opt(inode->i_sb, RESERVATION) ||!S_ISREG(inode->i_mode))
return -ENOTTY;
- if (IS_RDONLY(inode))
- return -EROFS;
+ err = mnt_want_write_file(filp);
+ if (err)
+ return err;
- if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER))
- return -EACCES;
+ if (!inode_owner_or_capable(inode)) {
+ err = -EACCES;
+ goto setrsvsz_out;
+ }
- if (get_user(rsv_window_size, (int __user *)arg))
- return -EFAULT;
+ if (get_user(rsv_window_size, (int __user *)arg)) {
+ err = -EFAULT;
+ goto setrsvsz_out;
+ }
if (rsv_window_size > EXT3_MAX_RESERVE_BLOCKS)
rsv_window_size = EXT3_MAX_RESERVE_BLOCKS;
@@ -182,7 +184,7 @@ flags_err:
* need to allocate reservation structure for this inode
* before set the window size
*/
- down(&ei->truncate_sem);
+ mutex_lock(&ei->truncate_mutex);
if (!ei->i_block_alloc_info)
ext3_init_block_alloc_info(inode);
@@ -190,55 +192,136 @@ flags_err:
struct ext3_reserve_window_node *rsv = &ei->i_block_alloc_info->rsv_window_node;
rsv->rsv_goal_size = rsv_window_size;
}
- up(&ei->truncate_sem);
- return 0;
+ mutex_unlock(&ei->truncate_mutex);
+setrsvsz_out:
+ mnt_drop_write_file(filp);
+ return err;
}
case EXT3_IOC_GROUP_EXTEND: {
- unsigned long n_blocks_count;
+ ext3_fsblk_t n_blocks_count;
struct super_block *sb = inode->i_sb;
- int err;
+ int err, err2;
if (!capable(CAP_SYS_RESOURCE))
return -EPERM;
- if (IS_RDONLY(inode))
- return -EROFS;
-
- if (get_user(n_blocks_count, (__u32 __user *)arg))
- return -EFAULT;
+ err = mnt_want_write_file(filp);
+ if (err)
+ return err;
+ if (get_user(n_blocks_count, (__u32 __user *)arg)) {
+ err = -EFAULT;
+ goto group_extend_out;
+ }
err = ext3_group_extend(sb, EXT3_SB(sb)->s_es, n_blocks_count);
journal_lock_updates(EXT3_SB(sb)->s_journal);
- journal_flush(EXT3_SB(sb)->s_journal);
+ err2 = journal_flush(EXT3_SB(sb)->s_journal);
journal_unlock_updates(EXT3_SB(sb)->s_journal);
-
+ if (err == 0)
+ err = err2;
+group_extend_out:
+ mnt_drop_write_file(filp);
return err;
}
case EXT3_IOC_GROUP_ADD: {
struct ext3_new_group_data input;
struct super_block *sb = inode->i_sb;
- int err;
+ int err, err2;
if (!capable(CAP_SYS_RESOURCE))
return -EPERM;
- if (IS_RDONLY(inode))
- return -EROFS;
+ err = mnt_want_write_file(filp);
+ if (err)
+ return err;
if (copy_from_user(&input, (struct ext3_new_group_input __user *)arg,
- sizeof(input)))
- return -EFAULT;
+ sizeof(input))) {
+ err = -EFAULT;
+ goto group_add_out;
+ }
err = ext3_group_add(sb, &input);
journal_lock_updates(EXT3_SB(sb)->s_journal);
- journal_flush(EXT3_SB(sb)->s_journal);
+ err2 = journal_flush(EXT3_SB(sb)->s_journal);
journal_unlock_updates(EXT3_SB(sb)->s_journal);
-
+ if (err == 0)
+ err = err2;
+group_add_out:
+ mnt_drop_write_file(filp);
return err;
}
+ case FITRIM: {
+
+ struct super_block *sb = inode->i_sb;
+ struct fstrim_range range;
+ int ret = 0;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+ if (copy_from_user(&range, (struct fstrim_range __user *)arg,
+ sizeof(range)))
+ return -EFAULT;
+
+ ret = ext3_trim_fs(sb, &range);
+ if (ret < 0)
+ return ret;
+
+ if (copy_to_user((struct fstrim_range __user *)arg, &range,
+ sizeof(range)))
+ return -EFAULT;
+
+ return 0;
+ }
default:
return -ENOTTY;
}
}
+
+#ifdef CONFIG_COMPAT
+long ext3_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+ /*
+ * Compat entry point: rewrite each EXT3_IOC32_* command number to its
+ * EXT3_IOC_* counterpart, then hand the request to ext3_ioctl() with
+ * arg passed through compat_ptr(). Commands not listed below are
+ * rejected with -ENOIOCTLCMD.
+ *
+ * These are just misnamed, they actually get/put from/to user an int
+ */
+ switch (cmd) {
+ case EXT3_IOC32_GETFLAGS:
+ cmd = EXT3_IOC_GETFLAGS;
+ break;
+ case EXT3_IOC32_SETFLAGS:
+ cmd = EXT3_IOC_SETFLAGS;
+ break;
+ case EXT3_IOC32_GETVERSION:
+ cmd = EXT3_IOC_GETVERSION;
+ break;
+ case EXT3_IOC32_SETVERSION:
+ cmd = EXT3_IOC_SETVERSION;
+ break;
+ case EXT3_IOC32_GROUP_EXTEND:
+ cmd = EXT3_IOC_GROUP_EXTEND;
+ break;
+ case EXT3_IOC32_GETVERSION_OLD:
+ cmd = EXT3_IOC_GETVERSION_OLD;
+ break;
+ case EXT3_IOC32_SETVERSION_OLD:
+ cmd = EXT3_IOC_SETVERSION_OLD;
+ break;
+#ifdef CONFIG_JBD_DEBUG
+ case EXT3_IOC32_WAIT_FOR_READONLY:
+ cmd = EXT3_IOC_WAIT_FOR_READONLY;
+ break;
+#endif /* CONFIG_JBD_DEBUG */
+ case EXT3_IOC32_GETRSVSZ:
+ cmd = EXT3_IOC_GETRSVSZ;
+ break;
+ case EXT3_IOC32_SETRSVSZ:
+ cmd = EXT3_IOC_SETRSVSZ;
+ break;
+ case EXT3_IOC_GROUP_ADD: /* forwarded with cmd unchanged */
+ break;
+ default: /* not a command this wrapper forwards */
+ return -ENOIOCTLCMD;
+ }
+ return ext3_ioctl(file, cmd, (unsigned long) compat_ptr(arg));
+}
+#endif /* CONFIG_COMPAT */
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index 8bd8ac07770..f197736dccf 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -15,28 +15,17 @@
* Big-endian to little-endian byte-swapping/bitmaps by
* David S. Miller (davem@caip.rutgers.edu), 1995
* Directory entry file type support and forward compatibility hooks
- * for B-tree directories by Theodore Ts'o (tytso@mit.edu), 1998
+ * for B-tree directories by Theodore Ts'o (tytso@mit.edu), 1998
* Hash Tree Directory indexing (c)
- * Daniel Phillips, 2001
+ * Daniel Phillips, 2001
* Hash Tree Directory indexing porting
- * Christopher Li, 2002
+ * Christopher Li, 2002
* Hash Tree Directory indexing cleanup
- * Theodore Ts'o, 2002
+ * Theodore Ts'o, 2002
*/
-#include <linux/fs.h>
-#include <linux/pagemap.h>
-#include <linux/jbd.h>
-#include <linux/time.h>
-#include <linux/ext3_fs.h>
-#include <linux/ext3_jbd.h>
-#include <linux/fcntl.h>
-#include <linux/stat.h>
-#include <linux/string.h>
#include <linux/quotaops.h>
-#include <linux/buffer_head.h>
-#include <linux/smp_lock.h>
-
+#include "ext3.h"
#include "namei.h"
#include "xattr.h"
#include "acl.h"
@@ -47,7 +36,6 @@
#define NAMEI_RA_CHUNKS 2
#define NAMEI_RA_BLOCKS 4
#define NAMEI_RA_SIZE (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS)
-#define NAMEI_RA_INDEX(c,b) (((c) * NAMEI_RA_BLOCKS) + (b))
static struct buffer_head *ext3_append(handle_t *handle,
struct inode *inode,
@@ -57,10 +45,14 @@ static struct buffer_head *ext3_append(handle_t *handle,
*block = inode->i_size >> inode->i_sb->s_blocksize_bits;
- if ((bh = ext3_bread(handle, inode, *block, 1, err))) {
+ if ((bh = ext3_dir_bread(handle, inode, *block, 1, err))) {
inode->i_size += inode->i_sb->s_blocksize;
EXT3_I(inode)->i_disksize = inode->i_size;
- ext3_journal_get_write_access(handle,bh);
+ *err = ext3_journal_get_write_access(handle, bh);
+ if (*err) {
+ brelse(bh);
+ bh = NULL;
+ }
}
return bh;
}
@@ -69,14 +61,10 @@ static struct buffer_head *ext3_append(handle_t *handle,
#define assert(test) J_ASSERT(test)
#endif
-#ifndef swap
-#define swap(x, y) do { typeof(x) z = x; x = y; y = z; } while (0)
-#endif
-
#ifdef DX_DEBUG
#define dxtrace(command) command
#else
-#define dxtrace(command)
+#define dxtrace(command)
#endif
struct fake_dirent
@@ -140,10 +128,10 @@ struct dx_frame
struct dx_map_entry
{
u32 hash;
- u32 offs;
+ u16 offs;
+ u16 size;
};
-#ifdef CONFIG_EXT3_INDEX
static inline unsigned dx_get_block (struct dx_entry *entry);
static void dx_set_block (struct dx_entry *entry, unsigned value);
static inline unsigned dx_get_hash (struct dx_entry *entry);
@@ -154,29 +142,40 @@ static void dx_set_count (struct dx_entry *entries, unsigned value);
static void dx_set_limit (struct dx_entry *entries, unsigned value);
static unsigned dx_root_limit (struct inode *dir, unsigned infosize);
static unsigned dx_node_limit (struct inode *dir);
-static struct dx_frame *dx_probe(struct dentry *dentry,
+static struct dx_frame *dx_probe(struct qstr *entry,
struct inode *dir,
struct dx_hash_info *hinfo,
struct dx_frame *frame,
int *err);
static void dx_release (struct dx_frame *frames);
-static int dx_make_map (struct ext3_dir_entry_2 *de, int size,
+static int dx_make_map(struct ext3_dir_entry_2 *de, unsigned blocksize,
struct dx_hash_info *hinfo, struct dx_map_entry map[]);
static void dx_sort_map(struct dx_map_entry *map, unsigned count);
static struct ext3_dir_entry_2 *dx_move_dirents (char *from, char *to,
struct dx_map_entry *offsets, int count);
-static struct ext3_dir_entry_2* dx_pack_dirents (char *base, int size);
+static struct ext3_dir_entry_2 *dx_pack_dirents(char *base, unsigned blocksize);
static void dx_insert_block (struct dx_frame *frame, u32 hash, u32 block);
static int ext3_htree_next_block(struct inode *dir, __u32 hash,
struct dx_frame *frame,
- struct dx_frame *frames,
+ struct dx_frame *frames,
__u32 *start_hash);
-static struct buffer_head * ext3_dx_find_entry(struct dentry *dentry,
- struct ext3_dir_entry_2 **res_dir, int *err);
+static struct buffer_head * ext3_dx_find_entry(struct inode *dir,
+ struct qstr *entry, struct ext3_dir_entry_2 **res_dir,
+ int *err);
static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry,
struct inode *inode);
/*
+ * Return the directory entry that follows p: p advanced by its own
+ * rec_len, as converted by ext3_rec_len_from_disk().
+ *
+ * p is at least 6 bytes before the end of page
+ */
+static inline struct ext3_dir_entry_2 *
+ext3_next_entry(struct ext3_dir_entry_2 *p)
+{
+ return (struct ext3_dir_entry_2 *)((char *)p +
+ ext3_rec_len_from_disk(p->rec_len));
+}
+
+/*
* Future: use high four bits of block for coalesce-on-delete flags
* Mask them off for now.
*/
@@ -225,13 +224,13 @@ static inline unsigned dx_root_limit (struct inode *dir, unsigned infosize)
{
unsigned entry_space = dir->i_sb->s_blocksize - EXT3_DIR_REC_LEN(1) -
EXT3_DIR_REC_LEN(2) - infosize;
- return 0? 20: entry_space / sizeof(struct dx_entry);
+ return entry_space / sizeof(struct dx_entry);
}
static inline unsigned dx_node_limit (struct inode *dir)
{
unsigned entry_space = dir->i_sb->s_blocksize - EXT3_DIR_REC_LEN(0);
- return 0? 22: entry_space / sizeof(struct dx_entry);
+ return entry_space / sizeof(struct dx_entry);
}
/*
@@ -250,7 +249,7 @@ static void dx_show_index (char * label, struct dx_entry *entries)
}
struct stats
-{
+{
unsigned names;
unsigned space;
unsigned bcount;
@@ -275,12 +274,12 @@ static struct stats dx_show_leaf(struct dx_hash_info *hinfo, struct ext3_dir_ent
while (len--) printk("%c", *name++);
ext3fs_dirhash(de->name, de->name_len, &h);
printk(":%x.%u ", h.hash,
- ((char *) de - base));
+ (unsigned) ((char *) de - base));
}
space += EXT3_DIR_REC_LEN(de->name_len);
- names++;
+ names++;
}
- de = (struct ext3_dir_entry_2 *) ((char *) de + le16_to_cpu(de->rec_len));
+ de = ext3_next_entry(de);
}
printk("(%i)\n", names);
return (struct stats) { names, space, 1 };
@@ -327,7 +326,7 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir,
* back to userspace.
*/
static struct dx_frame *
-dx_probe(struct dentry *dentry, struct inode *dir,
+dx_probe(struct qstr *entry, struct inode *dir,
struct dx_hash_info *hinfo, struct dx_frame *frame_in, int *err)
{
unsigned count, indirect;
@@ -338,15 +337,15 @@ dx_probe(struct dentry *dentry, struct inode *dir,
u32 hash;
frame->bh = NULL;
- if (dentry)
- dir = dentry->d_parent->d_inode;
- if (!(bh = ext3_bread (NULL,dir, 0, 0, err)))
+ if (!(bh = ext3_dir_bread(NULL, dir, 0, 0, err))) {
+ *err = ERR_BAD_DX_DIR;
goto fail;
+ }
root = (struct dx_root *) bh->b_data;
if (root->info.hash_version != DX_HASH_TEA &&
root->info.hash_version != DX_HASH_HALF_MD4 &&
root->info.hash_version != DX_HASH_LEGACY) {
- ext3_warning(dir->i_sb, __FUNCTION__,
+ ext3_warning(dir->i_sb, __func__,
"Unrecognised inode hash code %d",
root->info.hash_version);
brelse(bh);
@@ -354,13 +353,15 @@ dx_probe(struct dentry *dentry, struct inode *dir,
goto fail;
}
hinfo->hash_version = root->info.hash_version;
+ if (hinfo->hash_version <= DX_HASH_TEA)
+ hinfo->hash_version += EXT3_SB(dir->i_sb)->s_hash_unsigned;
hinfo->seed = EXT3_SB(dir->i_sb)->s_hash_seed;
- if (dentry)
- ext3fs_dirhash(dentry->d_name.name, dentry->d_name.len, hinfo);
+ if (entry)
+ ext3fs_dirhash(entry->name, entry->len, hinfo);
hash = hinfo->hash;
if (root->info.unused_flags & 1) {
- ext3_warning(dir->i_sb, __FUNCTION__,
+ ext3_warning(dir->i_sb, __func__,
"Unimplemented inode hash flags: %#06x",
root->info.unused_flags);
brelse(bh);
@@ -369,7 +370,7 @@ dx_probe(struct dentry *dentry, struct inode *dir,
}
if ((indirect = root->info.indirect_levels) > 1) {
- ext3_warning(dir->i_sb, __FUNCTION__,
+ ext3_warning(dir->i_sb, __func__,
"Unimplemented inode hash depth: %#06x",
root->info.indirect_levels);
brelse(bh);
@@ -379,13 +380,28 @@ dx_probe(struct dentry *dentry, struct inode *dir,
entries = (struct dx_entry *) (((char *)&root->info) +
root->info.info_length);
- assert(dx_get_limit(entries) == dx_root_limit(dir,
- root->info.info_length));
+
+ if (dx_get_limit(entries) != dx_root_limit(dir,
+ root->info.info_length)) {
+ ext3_warning(dir->i_sb, __func__,
+ "dx entry: limit != root limit");
+ brelse(bh);
+ *err = ERR_BAD_DX_DIR;
+ goto fail;
+ }
+
dxtrace (printk("Look up %x", hash));
while (1)
{
count = dx_get_count(entries);
- assert (count && count <= dx_get_limit(entries));
+ if (!count || count > dx_get_limit(entries)) {
+ ext3_warning(dir->i_sb, __func__,
+ "dx entry: no count or count > limit");
+ brelse(bh);
+ *err = ERR_BAD_DX_DIR;
+ goto fail2;
+ }
+
p = entries + 1;
q = entries + count - 1;
while (p <= q)
@@ -420,11 +436,20 @@ dx_probe(struct dentry *dentry, struct inode *dir,
frame->entries = entries;
frame->at = at;
if (!indirect--) return frame;
- if (!(bh = ext3_bread (NULL,dir, dx_get_block(at), 0, err)))
+ if (!(bh = ext3_dir_bread(NULL, dir, dx_get_block(at), 0, err))) {
+ *err = ERR_BAD_DX_DIR;
goto fail2;
+ }
at = entries = ((struct dx_node *) bh->b_data)->entries;
- assert (dx_get_limit(entries) == dx_node_limit (dir));
+ if (dx_get_limit(entries) != dx_node_limit (dir)) {
+ ext3_warning(dir->i_sb, __func__,
+ "dx entry: limit != node limit");
+ brelse(bh);
+ *err = ERR_BAD_DX_DIR;
+ goto fail2;
+ }
frame++;
+ frame->bh = NULL;
}
fail2:
while (frame >= frame_in) {
@@ -432,6 +457,10 @@ fail2:
frame--;
}
fail:
+ if (*err == ERR_BAD_DX_DIR)
+ ext3_warning(dir->i_sb, __func__,
+ "Corrupt dir inode %ld, running e2fsck is "
+ "recommended.", dir->i_ino);
return NULL;
}
@@ -464,7 +493,7 @@ static void dx_release (struct dx_frame *frames)
*/
static int ext3_htree_next_block(struct inode *dir, __u32 hash,
struct dx_frame *frame,
- struct dx_frame *frames,
+ struct dx_frame *frames,
__u32 *start_hash)
{
struct dx_frame *p;
@@ -508,8 +537,8 @@ static int ext3_htree_next_block(struct inode *dir, __u32 hash,
* block so no check is necessary
*/
while (num_frames--) {
- if (!(bh = ext3_bread(NULL, dir, dx_get_block(p->at),
- 0, &err)))
+ if (!(bh = ext3_dir_bread(NULL, dir, dx_get_block(p->at),
+ 0, &err)))
return err; /* Failure */
p++;
brelse (p->bh);
@@ -521,14 +550,6 @@ static int ext3_htree_next_block(struct inode *dir, __u32 hash,
/*
- * p is at least 6 bytes before the end of page
- */
-static inline struct ext3_dir_entry_2 *ext3_next_entry(struct ext3_dir_entry_2 *p)
-{
- return (struct ext3_dir_entry_2 *)((char*)p + le16_to_cpu(p->rec_len));
-}
-
-/*
* This function fills a red-black tree with information from a
* directory block. It returns the number directory entries loaded
* into the tree. If there is an error it is returned in err.
@@ -540,10 +561,11 @@ static int htree_dirblock_to_tree(struct file *dir_file,
{
struct buffer_head *bh;
struct ext3_dir_entry_2 *de, *top;
- int err, count = 0;
+ int err = 0, count = 0;
dxtrace(printk("In htree dirblock_to_tree: block %d\n", block));
- if (!(bh = ext3_bread (NULL, dir, block, 0, &err)))
+
+ if (!(bh = ext3_dir_bread(NULL, dir, block, 0, &err)))
return err;
de = (struct ext3_dir_entry_2 *) bh->b_data;
@@ -551,6 +573,12 @@ static int htree_dirblock_to_tree(struct file *dir_file,
dir->i_sb->s_blocksize -
EXT3_DIR_REC_LEN(0));
for (; de < top; de = ext3_next_entry(de)) {
+ if (!ext3_check_dir_entry("htree_dirblock_to_tree", dir, de, bh,
+ (block<<EXT3_BLOCK_SIZE_BITS(dir->i_sb))
+ +((char *)de - bh->b_data))) {
+ /* silently ignore the rest of the block */
+ break;
+ }
ext3fs_dirhash(de->name, de->name_len, hinfo);
if ((hinfo->hash < start_hash) ||
((hinfo->hash == start_hash) &&
@@ -592,9 +620,12 @@ int ext3_htree_fill_tree(struct file *dir_file, __u32 start_hash,
dxtrace(printk("In htree_fill_tree, start hash: %x:%x\n", start_hash,
start_minor_hash));
- dir = dir_file->f_dentry->d_inode;
+ dir = file_inode(dir_file);
if (!(EXT3_I(dir)->i_flags & EXT3_INDEX_FL)) {
hinfo.hash_version = EXT3_SB(dir->i_sb)->s_def_hash_version;
+ if (hinfo.hash_version <= DX_HASH_TEA)
+ hinfo.hash_version +=
+ EXT3_SB(dir->i_sb)->s_hash_unsigned;
hinfo.seed = EXT3_SB(dir->i_sb)->s_hash_seed;
count = htree_dirblock_to_tree(dir_file, dir, 0, &hinfo,
start_hash, start_minor_hash);
@@ -603,7 +634,7 @@ int ext3_htree_fill_tree(struct file *dir_file, __u32 start_hash,
}
hinfo.hash = start_hash;
hinfo.minor_hash = 0;
- frame = dx_probe(NULL, dir_file->f_dentry->d_inode, &hinfo, frames, &err);
+ frame = dx_probe(NULL, file_inode(dir_file), &hinfo, frames, &err);
if (!frame)
return err;
@@ -632,7 +663,7 @@ int ext3_htree_fill_tree(struct file *dir_file, __u32 start_hash,
}
count += ret;
hashval = ~0;
- ret = ext3_htree_next_block(dir, HASH_NB_ALWAYS,
+ ret = ext3_htree_next_block(dir, HASH_NB_ALWAYS,
frame, frames, &hashval);
*next_hash = hashval;
if (ret < 0) {
@@ -649,7 +680,7 @@ int ext3_htree_fill_tree(struct file *dir_file, __u32 start_hash,
break;
}
dx_release(frames);
- dxtrace(printk("Fill tree: returned %d entries, next hash: %x\n",
+ dxtrace(printk("Fill tree: returned %d entries, next hash: %x\n",
count, *next_hash));
return count;
errout:
@@ -662,29 +693,35 @@ errout:
* Directory block splitting, compacting
*/
-static int dx_make_map (struct ext3_dir_entry_2 *de, int size,
- struct dx_hash_info *hinfo, struct dx_map_entry *map_tail)
+/*
+ * Create map of hash values, offsets, and sizes, stored at end of block.
+ * Returns number of entries mapped.
+ */
+static int dx_make_map(struct ext3_dir_entry_2 *de, unsigned blocksize,
+ struct dx_hash_info *hinfo, struct dx_map_entry *map_tail)
{
int count = 0;
char *base = (char *) de;
struct dx_hash_info h = *hinfo;
- while ((char *) de < base + size)
+ while ((char *) de < base + blocksize)
{
if (de->name_len && de->inode) {
ext3fs_dirhash(de->name, de->name_len, &h);
map_tail--;
map_tail->hash = h.hash;
- map_tail->offs = (u32) ((char *) de - base);
+ map_tail->offs = (u16) ((char *) de - base);
+ map_tail->size = le16_to_cpu(de->rec_len);
count++;
cond_resched();
}
/* XXX: do we need to check rec_len == 0 case? -Chris */
- de = (struct ext3_dir_entry_2 *) ((char *) de + le16_to_cpu(de->rec_len));
+ de = ext3_next_entry(de);
}
return count;
}
+/* Sort map by hash value */
static void dx_sort_map (struct dx_map_entry *map, unsigned count)
{
struct dx_map_entry *p, *q, *top = map + count - 1;
@@ -726,8 +763,6 @@ static void dx_insert_block(struct dx_frame *frame, u32 hash, u32 block)
dx_set_block(new, block);
dx_set_count(entries, count + 1);
}
-#endif
-
static void ext3_update_dx_flag(struct inode *inode)
{
@@ -757,15 +792,15 @@ static inline int ext3_match (int len, const char * const name,
*/
static inline int search_dirblock(struct buffer_head * bh,
struct inode *dir,
- struct dentry *dentry,
+ struct qstr *child,
unsigned long offset,
struct ext3_dir_entry_2 ** res_dir)
{
struct ext3_dir_entry_2 * de;
char * dlimit;
int de_len;
- const char *name = dentry->d_name.name;
- int namelen = dentry->d_name.len;
+ const char *name = child->name;
+ int namelen = child->len;
de = (struct ext3_dir_entry_2 *) bh->b_data;
dlimit = bh->b_data + dir->i_sb->s_blocksize;
@@ -783,7 +818,7 @@ static inline int search_dirblock(struct buffer_head * bh,
return 1;
}
/* prevent looping on a bad block */
- de_len = le16_to_cpu(de->rec_len);
+ de_len = ext3_rec_len_from_disk(de->rec_len);
if (de_len <= 0)
return -1;
offset += de_len;
@@ -804,34 +839,40 @@ static inline int search_dirblock(struct buffer_head * bh,
* The returned buffer_head has ->b_count elevated. The caller is expected
* to brelse() it when appropriate.
*/
-static struct buffer_head * ext3_find_entry (struct dentry *dentry,
- struct ext3_dir_entry_2 ** res_dir)
+static struct buffer_head *ext3_find_entry(struct inode *dir,
+ struct qstr *entry,
+ struct ext3_dir_entry_2 **res_dir)
{
struct super_block * sb;
struct buffer_head * bh_use[NAMEI_RA_SIZE];
struct buffer_head * bh, *ret = NULL;
unsigned long start, block, b;
+ const u8 *name = entry->name;
int ra_max = 0; /* Number of bh's in the readahead
buffer, bh_use[] */
int ra_ptr = 0; /* Current index into readahead
buffer */
int num = 0;
int nblocks, i, err;
- struct inode *dir = dentry->d_parent->d_inode;
int namelen;
- const u8 *name;
- unsigned blocksize;
*res_dir = NULL;
sb = dir->i_sb;
- blocksize = sb->s_blocksize;
- namelen = dentry->d_name.len;
- name = dentry->d_name.name;
+ namelen = entry->len;
if (namelen > EXT3_NAME_LEN)
return NULL;
-#ifdef CONFIG_EXT3_INDEX
+ if ((namelen <= 2) && (name[0] == '.') &&
+ (name[1] == '.' || name[1] == 0)) {
+ /*
+ * "." or ".." will only be in the first block
+ * NFS may look up ".."; "." should be handled by the VFS
+ */
+ block = start = 0;
+ nblocks = 1;
+ goto restart;
+ }
if (is_dx(dir)) {
- bh = ext3_dx_find_entry(dentry, res_dir, &err);
+ bh = ext3_dx_find_entry(dir, entry, res_dir, &err);
/*
* On success, or if the error was file not found,
* return. Otherwise, fall back to doing a search the
@@ -841,7 +882,6 @@ static struct buffer_head * ext3_find_entry (struct dentry *dentry,
return bh;
dxtrace(printk("ext3_find_entry: dx failed, falling back\n"));
}
-#endif
nblocks = dir->i_size >> EXT3_BLOCK_SIZE_BITS(sb);
start = EXT3_I(dir)->i_dir_start_lookup;
if (start >= nblocks)
@@ -869,8 +909,12 @@ restart:
num++;
bh = ext3_getblk(NULL, dir, b++, 0, &err);
bh_use[ra_max] = bh;
- if (bh)
- ll_rw_block(READ, 1, &bh);
+ if (bh && !bh_uptodate_or_lock(bh)) {
+ get_bh(bh);
+ bh->b_end_io = end_buffer_read_sync;
+ submit_bh(READ | REQ_META | REQ_PRIO,
+ bh);
+ }
}
}
if ((bh = bh_use[ra_ptr++]) == NULL)
@@ -878,12 +922,12 @@ restart:
wait_on_buffer(bh);
if (!buffer_uptodate(bh)) {
/* read error, skip block & hope for the best */
- ext3_error(sb, __FUNCTION__, "reading directory #%lu "
+ ext3_error(sb, __func__, "reading directory #%lu "
"offset %lu", dir->i_ino, block);
brelse(bh);
goto next;
}
- i = search_dirblock(bh, dir, dentry,
+ i = search_dirblock(bh, dir, entry,
block << EXT3_BLOCK_SIZE_BITS(sb), res_dir);
if (i == 1) {
EXT3_I(dir)->i_dir_start_lookup = block;
@@ -917,60 +961,42 @@ cleanup_and_exit:
return ret;
}
-#ifdef CONFIG_EXT3_INDEX
-static struct buffer_head * ext3_dx_find_entry(struct dentry *dentry,
- struct ext3_dir_entry_2 **res_dir, int *err)
+static struct buffer_head * ext3_dx_find_entry(struct inode *dir,
+ struct qstr *entry, struct ext3_dir_entry_2 **res_dir,
+ int *err)
{
- struct super_block * sb;
+ struct super_block *sb = dir->i_sb;
struct dx_hash_info hinfo;
- u32 hash;
struct dx_frame frames[2], *frame;
- struct ext3_dir_entry_2 *de, *top;
struct buffer_head *bh;
unsigned long block;
int retval;
- int namelen = dentry->d_name.len;
- const u8 *name = dentry->d_name.name;
- struct inode *dir = dentry->d_parent->d_inode;
- sb = dir->i_sb;
- /* NFS may look up ".." - look at dx_root directory block */
- if (namelen > 2 || name[0] != '.'||(name[1] != '.' && name[1] != '\0')){
- if (!(frame = dx_probe(dentry, NULL, &hinfo, frames, err)))
- return NULL;
- } else {
- frame = frames;
- frame->bh = NULL; /* for dx_release() */
- frame->at = (struct dx_entry *)frames; /* hack for zero entry*/
- dx_set_block(frame->at, 0); /* dx_root block is 0 */
- }
- hash = hinfo.hash;
+ if (!(frame = dx_probe(entry, dir, &hinfo, frames, err)))
+ return NULL;
do {
block = dx_get_block(frame->at);
- if (!(bh = ext3_bread (NULL,dir, block, 0, err)))
+ if (!(bh = ext3_dir_bread (NULL, dir, block, 0, err)))
goto errout;
- de = (struct ext3_dir_entry_2 *) bh->b_data;
- top = (struct ext3_dir_entry_2 *) ((char *) de + sb->s_blocksize -
- EXT3_DIR_REC_LEN(0));
- for (; de < top; de = ext3_next_entry(de))
- if (ext3_match (namelen, name, de)) {
- if (!ext3_check_dir_entry("ext3_find_entry",
- dir, de, bh,
- (block<<EXT3_BLOCK_SIZE_BITS(sb))
- +((char *)de - bh->b_data))) {
- brelse (bh);
- goto errout;
- }
- *res_dir = de;
- dx_release (frames);
+
+ retval = search_dirblock(bh, dir, entry,
+ block << EXT3_BLOCK_SIZE_BITS(sb),
+ res_dir);
+ if (retval == 1) {
+ dx_release(frames);
return bh;
}
- brelse (bh);
+ brelse(bh);
+ if (retval == -1) {
+ *err = ERR_BAD_DX_DIR;
+ goto errout;
+ }
+
/* Check to see if we should continue to search */
- retval = ext3_htree_next_block(dir, hash, frame,
+ retval = ext3_htree_next_block(dir, hinfo.hash, frame,
frames, NULL);
if (retval < 0) {
- ext3_warning(sb, __FUNCTION__,
+ ext3_warning(sb, __func__,
"error reading index page in directory #%lu",
dir->i_ino);
*err = retval;
@@ -980,13 +1006,12 @@ static struct buffer_head * ext3_dx_find_entry(struct dentry *dentry,
*err = -ENOENT;
errout:
- dxtrace(printk("%s not found\n", name));
+ dxtrace(printk("%s not found\n", entry->name));
dx_release (frames);
return NULL;
}
-#endif
-static struct dentry *ext3_lookup(struct inode * dir, struct dentry *dentry, struct nameidata *nd)
+static struct dentry *ext3_lookup(struct inode * dir, struct dentry *dentry, unsigned int flags)
{
struct inode * inode;
struct ext3_dir_entry_2 * de;
@@ -995,15 +1020,23 @@ static struct dentry *ext3_lookup(struct inode * dir, struct dentry *dentry, str
if (dentry->d_name.len > EXT3_NAME_LEN)
return ERR_PTR(-ENAMETOOLONG);
- bh = ext3_find_entry(dentry, &de);
+ bh = ext3_find_entry(dir, &dentry->d_name, &de);
inode = NULL;
if (bh) {
unsigned long ino = le32_to_cpu(de->inode);
brelse (bh);
- inode = iget(dir->i_sb, ino);
-
- if (!inode)
- return ERR_PTR(-EACCES);
+ if (!ext3_valid_inum(dir->i_sb, ino)) {
+ ext3_error(dir->i_sb, "ext3_lookup",
+ "bad inode number: %lu", ino);
+ return ERR_PTR(-EIO);
+ }
+ inode = ext3_iget(dir->i_sb, ino);
+ if (inode == ERR_PTR(-ESTALE)) {
+ ext3_error(dir->i_sb, __func__,
+ "deleted inode referenced: %lu",
+ ino);
+ return ERR_PTR(-EIO);
+ }
}
return d_splice_alias(inode, dentry);
}
@@ -1012,34 +1045,24 @@ static struct dentry *ext3_lookup(struct inode * dir, struct dentry *dentry, str
struct dentry *ext3_get_parent(struct dentry *child)
{
unsigned long ino;
- struct dentry *parent;
- struct inode *inode;
- struct dentry dotdot;
+ struct qstr dotdot = QSTR_INIT("..", 2);
struct ext3_dir_entry_2 * de;
struct buffer_head *bh;
- dotdot.d_name.name = "..";
- dotdot.d_name.len = 2;
- dotdot.d_parent = child; /* confusing, isn't it! */
-
- bh = ext3_find_entry(&dotdot, &de);
- inode = NULL;
+ bh = ext3_find_entry(child->d_inode, &dotdot, &de);
if (!bh)
return ERR_PTR(-ENOENT);
ino = le32_to_cpu(de->inode);
brelse(bh);
- inode = iget(child->d_inode->i_sb, ino);
- if (!inode)
- return ERR_PTR(-EACCES);
-
- parent = d_alloc_anon(inode);
- if (!parent) {
- iput(inode);
- parent = ERR_PTR(-ENOMEM);
+ if (!ext3_valid_inum(child->d_inode->i_sb, ino)) {
+ ext3_error(child->d_inode->i_sb, "ext3_get_parent",
+ "bad inode number: %lu", ino);
+ return ERR_PTR(-EIO);
}
- return parent;
-}
+
+ return d_obtain_alias(ext3_iget(child->d_inode->i_sb, ino));
+}
#define S_SHIFT 12
static unsigned char ext3_type_by_mode[S_IFMT >> S_SHIFT] = {
@@ -1059,7 +1082,10 @@ static inline void ext3_set_de_type(struct super_block *sb,
de->file_type = ext3_type_by_mode[(mode & S_IFMT)>>S_SHIFT];
}
-#ifdef CONFIG_EXT3_INDEX
+/*
+ * Move count entries from end of map between two memory locations.
+ * Returns pointer to last entry moved.
+ */
static struct ext3_dir_entry_2 *
dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count)
{
@@ -1070,7 +1096,7 @@ dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count)
rec_len = EXT3_DIR_REC_LEN(de->name_len);
memcpy (to, de, rec_len);
((struct ext3_dir_entry_2 *) to)->rec_len =
- cpu_to_le16(rec_len);
+ ext3_rec_len_to_disk(rec_len);
de->inode = 0;
map++;
to += rec_len;
@@ -1078,20 +1104,24 @@ dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count)
return (struct ext3_dir_entry_2 *) (to - rec_len);
}
-static struct ext3_dir_entry_2* dx_pack_dirents(char *base, int size)
+/*
+ * Compact each dir entry in the range to the minimal rec_len.
+ * Returns pointer to last entry in range.
+ */
+static struct ext3_dir_entry_2 *dx_pack_dirents(char *base, unsigned blocksize)
{
- struct ext3_dir_entry_2 *next, *to, *prev, *de = (struct ext3_dir_entry_2 *) base;
+ struct ext3_dir_entry_2 *next, *to, *prev;
+ struct ext3_dir_entry_2 *de = (struct ext3_dir_entry_2 *)base;
unsigned rec_len = 0;
prev = to = de;
- while ((char*)de < base + size) {
- next = (struct ext3_dir_entry_2 *) ((char *) de +
- le16_to_cpu(de->rec_len));
+ while ((char *)de < base + blocksize) {
+ next = ext3_next_entry(de);
if (de->inode && de->name_len) {
rec_len = EXT3_DIR_REC_LEN(de->name_len);
if (de > to)
memmove(to, de, rec_len);
- to->rec_len = cpu_to_le16(rec_len);
+ to->rec_len = ext3_rec_len_to_disk(rec_len);
prev = to;
to = (struct ext3_dir_entry_2 *) (((char *) to) + rec_len);
}
@@ -1100,6 +1130,11 @@ static struct ext3_dir_entry_2* dx_pack_dirents(char *base, int size)
return prev;
}
+/*
+ * Split a full leaf block to make room for a new dir entry.
+ * Allocate a new block, and move entries so that they are approx. equally full.
+ * Returns pointer to de in block into which the new entry will be inserted.
+ */
static struct ext3_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
struct buffer_head **bh,struct dx_frame *frame,
struct dx_hash_info *hinfo, int *error)
@@ -1111,11 +1146,11 @@ static struct ext3_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
u32 hash2;
struct dx_map_entry *map;
char *data1 = (*bh)->b_data, *data2;
- unsigned split;
+ unsigned split, move, size;
struct ext3_dir_entry_2 *de = NULL, *de2;
- int err;
+ int err = 0, i;
- bh2 = ext3_append (handle, dir, &newblock, error);
+ bh2 = ext3_append (handle, dir, &newblock, &err);
if (!(bh2)) {
brelse(*bh);
*bh = NULL;
@@ -1124,14 +1159,9 @@ static struct ext3_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
BUFFER_TRACE(*bh, "get_write_access");
err = ext3_journal_get_write_access(handle, *bh);
- if (err) {
- journal_error:
- brelse(*bh);
- brelse(bh2);
- *bh = NULL;
- ext3_std_error(dir->i_sb, err);
- goto errout;
- }
+ if (err)
+ goto journal_error;
+
BUFFER_TRACE(frame->bh, "get_write_access");
err = ext3_journal_get_write_access(handle, frame->bh);
if (err)
@@ -1144,8 +1174,19 @@ static struct ext3_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
count = dx_make_map ((struct ext3_dir_entry_2 *) data1,
blocksize, hinfo, map);
map -= count;
- split = count/2; // need to adjust to actual middle
dx_sort_map (map, count);
+ /* Split the existing block in the middle, size-wise */
+ size = 0;
+ move = 0;
+ for (i = count-1; i >= 0; i--) {
+ /* is more than half of this entry in 2nd half of the block? */
+ if (size + map[i].size/2 > blocksize/2)
+ break;
+ size += map[i].size;
+ move++;
+ }
+ /* map index at which we will split */
+ split = count - move;
hash2 = map[split].hash;
continued = hash2 == map[split - 1].hash;
dxtrace(printk("Split block %i at %x, %i/%i\n",
@@ -1154,8 +1195,8 @@ static struct ext3_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
/* Fancy dance to stay within two buffers */
de2 = dx_move_dirents(data1, data2, map + split, count - split);
de = dx_pack_dirents(data1,blocksize);
- de->rec_len = cpu_to_le16(data1 + blocksize - (char *) de);
- de2->rec_len = cpu_to_le16(data2 + blocksize - (char *) de2);
+ de->rec_len = ext3_rec_len_to_disk(data1 + blocksize - (char *) de);
+ de2->rec_len = ext3_rec_len_to_disk(data2 + blocksize - (char *) de2);
dxtrace(dx_show_leaf (hinfo, (struct ext3_dir_entry_2 *) data1, blocksize, 1));
dxtrace(dx_show_leaf (hinfo, (struct ext3_dir_entry_2 *) data2, blocksize, 1));
@@ -1174,10 +1215,17 @@ static struct ext3_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
goto journal_error;
brelse (bh2);
dxtrace(dx_show_index ("frame", frame->entries));
-errout:
return de;
+
+journal_error:
+ brelse(*bh);
+ brelse(bh2);
+ *bh = NULL;
+ ext3_std_error(dir->i_sb, err);
+errout:
+ *error = err;
+ return NULL;
}
-#endif
/*
@@ -1187,7 +1235,7 @@ errout:
* add_dirent_to_buf will attempt search the directory block for
* space. It will return -ENOSPC if no space is available, and -EIO
* and -EEXIST if directory entry already exists.
- *
+ *
* NOTE! bh is NOT released in the case where ENOSPC is returned. In
* all other cases bh is released.
*/
@@ -1218,7 +1266,7 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
return -EEXIST;
}
nlen = EXT3_DIR_REC_LEN(de->name_len);
- rlen = le16_to_cpu(de->rec_len);
+ rlen = ext3_rec_len_from_disk(de->rec_len);
if ((de->inode? rlen - nlen: rlen) >= reclen)
break;
de = (struct ext3_dir_entry_2 *)((char *)de + rlen);
@@ -1237,11 +1285,11 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
/* By now the buffer is marked for journaling */
nlen = EXT3_DIR_REC_LEN(de->name_len);
- rlen = le16_to_cpu(de->rec_len);
+ rlen = ext3_rec_len_from_disk(de->rec_len);
if (de->inode) {
struct ext3_dir_entry_2 *de1 = (struct ext3_dir_entry_2 *)((char *)de + nlen);
- de1->rec_len = cpu_to_le16(rlen - nlen);
- de->rec_len = cpu_to_le16(nlen);
+ de1->rec_len = ext3_rec_len_to_disk(rlen - nlen);
+ de->rec_len = ext3_rec_len_to_disk(nlen);
de = de1;
}
de->file_type = EXT3_FT_UNKNOWN;
@@ -1275,7 +1323,6 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
return 0;
}
-#ifdef CONFIG_EXT3_INDEX
/*
* This converts a one block unindexed directory to a 3 block indexed
* directory, and adds the dentry to the indexed directory.
@@ -1300,7 +1347,7 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
struct fake_dirent *fde;
blocksize = dir->i_sb->s_blocksize;
- dxtrace(printk("Creating index\n"));
+ dxtrace(printk(KERN_DEBUG "Creating index: inode %lu\n", dir->i_ino));
retval = ext3_journal_get_write_access(handle, bh);
if (retval) {
ext3_std_error(dir->i_sb, retval);
@@ -1309,6 +1356,19 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
}
root = (struct dx_root *) bh->b_data;
+ /* The 0th block becomes the root, move the dirents out */
+ fde = &root->dotdot;
+ de = (struct ext3_dir_entry_2 *)((char *)fde +
+ ext3_rec_len_from_disk(fde->rec_len));
+ if ((char *) de >= (((char *) root) + blocksize)) {
+ ext3_error(dir->i_sb, __func__,
+ "invalid rec_len for '..' in inode %lu",
+ dir->i_ino);
+ brelse(bh);
+ return -EIO;
+ }
+ len = ((char *) root) + blocksize - (char *) de;
+
bh2 = ext3_append (handle, dir, &block, &retval);
if (!(bh2)) {
brelse(bh);
@@ -1317,19 +1377,15 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
EXT3_I(dir)->i_flags |= EXT3_INDEX_FL;
data1 = bh2->b_data;
- /* The 0th block becomes the root, move the dirents out */
- fde = &root->dotdot;
- de = (struct ext3_dir_entry_2 *)((char *)fde + le16_to_cpu(fde->rec_len));
- len = ((char *) root) + blocksize - (char *) de;
memcpy (data1, de, len);
de = (struct ext3_dir_entry_2 *) data1;
top = data1 + len;
- while ((char *)(de2=(void*)de+le16_to_cpu(de->rec_len)) < top)
+ while ((char *)(de2 = ext3_next_entry(de)) < top)
de = de2;
- de->rec_len = cpu_to_le16(data1 + blocksize - (char *) de);
+ de->rec_len = ext3_rec_len_to_disk(data1 + blocksize - (char *) de);
/* Initialize the root; the dot dirents already exist */
de = (struct ext3_dir_entry_2 *) (&root->dotdot);
- de->rec_len = cpu_to_le16(blocksize - EXT3_DIR_REC_LEN(2));
+ de->rec_len = ext3_rec_len_to_disk(blocksize - EXT3_DIR_REC_LEN(2));
memset (&root->info, 0, sizeof(root->info));
root->info.info_length = sizeof(root->info);
root->info.hash_version = EXT3_SB(dir->i_sb)->s_def_hash_version;
@@ -1340,6 +1396,8 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
/* Initialize as for dx_probe */
hinfo.hash_version = root->info.hash_version;
+ if (hinfo.hash_version <= DX_HASH_TEA)
+ hinfo.hash_version += EXT3_SB(dir->i_sb)->s_hash_unsigned;
hinfo.seed = EXT3_SB(dir->i_sb)->s_hash_seed;
ext3fs_dirhash(name, namelen, &hinfo);
frame = frames;
@@ -1347,14 +1405,22 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
frame->at = entries;
frame->bh = bh;
bh = bh2;
+ /*
+ * Mark buffers dirty here so that if do_split() fails we write a
+ * consistent set of buffers to disk.
+ */
+ ext3_journal_dirty_metadata(handle, frame->bh);
+ ext3_journal_dirty_metadata(handle, bh);
de = do_split(handle,dir, &bh, frame, &hinfo, &retval);
- dx_release (frames);
- if (!(de))
+ if (!de) {
+ ext3_mark_inode_dirty(handle, dir);
+ dx_release(frames);
return retval;
+ }
+ dx_release(frames);
return add_dirent_to_buf(handle, dentry, inode, de, bh);
}
-#endif
/*
* ext3_add_entry()
@@ -1370,23 +1436,18 @@ static int ext3_add_entry (handle_t *handle, struct dentry *dentry,
struct inode *inode)
{
struct inode *dir = dentry->d_parent->d_inode;
- unsigned long offset;
struct buffer_head * bh;
struct ext3_dir_entry_2 *de;
struct super_block * sb;
int retval;
-#ifdef CONFIG_EXT3_INDEX
int dx_fallback=0;
-#endif
unsigned blocksize;
- unsigned nlen, rlen;
u32 block, blocks;
sb = dir->i_sb;
blocksize = sb->s_blocksize;
if (!dentry->d_name.len)
return -EINVAL;
-#ifdef CONFIG_EXT3_INDEX
if (is_dx(dir)) {
retval = ext3_dx_add_entry(handle, dentry, inode);
if (!retval || (retval != ERR_BAD_DX_DIR))
@@ -1395,21 +1456,18 @@ static int ext3_add_entry (handle_t *handle, struct dentry *dentry,
dx_fallback++;
ext3_mark_inode_dirty(handle, dir);
}
-#endif
blocks = dir->i_size >> sb->s_blocksize_bits;
- for (block = 0, offset = 0; block < blocks; block++) {
- bh = ext3_bread(handle, dir, block, 0, &retval);
- if(!bh)
+ for (block = 0; block < blocks; block++) {
+ if (!(bh = ext3_dir_bread(handle, dir, block, 0, &retval)))
return retval;
+
retval = add_dirent_to_buf(handle, dentry, inode, NULL, bh);
if (retval != -ENOSPC)
return retval;
-#ifdef CONFIG_EXT3_INDEX
if (blocks == 1 && !dx_fallback &&
EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_DIR_INDEX))
return make_indexed_dir(handle, dentry, inode, bh);
-#endif
brelse(bh);
}
bh = ext3_append(handle, dir, &block, &retval);
@@ -1417,12 +1475,10 @@ static int ext3_add_entry (handle_t *handle, struct dentry *dentry,
return retval;
de = (struct ext3_dir_entry_2 *) bh->b_data;
de->inode = 0;
- de->rec_len = cpu_to_le16(rlen = blocksize);
- nlen = 0;
+ de->rec_len = ext3_rec_len_to_disk(blocksize);
return add_dirent_to_buf(handle, dentry, inode, de, bh);
}
-#ifdef CONFIG_EXT3_INDEX
/*
* Returns 0 for success, or a negative error value
*/
@@ -1438,13 +1494,13 @@ static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry,
struct ext3_dir_entry_2 *de;
int err;
- frame = dx_probe(dentry, NULL, &hinfo, frames, &err);
+ frame = dx_probe(&dentry->d_name, dir, &hinfo, frames, &err);
if (!frame)
return err;
entries = frame->entries;
at = frame->at;
- if (!(bh = ext3_bread(handle,dir, dx_get_block(frame->at), 0, &err)))
+ if (!(bh = ext3_dir_bread(handle, dir, dx_get_block(frame->at), 0, &err)))
goto cleanup;
BUFFER_TRACE(bh, "get_write_access");
@@ -1472,7 +1528,7 @@ static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry,
if (levels && (dx_get_count(frames->entries) ==
dx_get_limit(frames->entries))) {
- ext3_warning(sb, __FUNCTION__,
+ ext3_warning(sb, __func__,
"Directory index full!");
err = -ENOSPC;
goto cleanup;
@@ -1482,8 +1538,8 @@ static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry,
goto cleanup;
node2 = (struct dx_node *)(bh2->b_data);
entries2 = node2->entries;
- node2->fake.rec_len = cpu_to_le16(sb->s_blocksize);
- node2->fake.inode = 0;
+ memset(&node2->fake, 0, sizeof(struct fake_dirent));
+ node2->fake.rec_len = ext3_rec_len_to_disk(sb->s_blocksize);
BUFFER_TRACE(frame->bh, "get_write_access");
err = ext3_journal_get_write_access(handle, frame->bh);
if (err)
@@ -1540,7 +1596,9 @@ static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry,
if (err)
goto journal_error;
}
- ext3_journal_dirty_metadata(handle, frames[0].bh);
+ err = ext3_journal_dirty_metadata(handle, frames[0].bh);
+ if (err)
+ goto journal_error;
}
de = do_split(handle, dir, &bh, frame, &hinfo, &err);
if (!de)
@@ -1557,13 +1615,12 @@ cleanup:
dx_release(frames);
return err;
}
-#endif
/*
* ext3_delete_entry deletes a directory entry by merging it with the
* previous entry
*/
-static int ext3_delete_entry (handle_t *handle,
+static int ext3_delete_entry (handle_t *handle,
struct inode * dir,
struct ext3_dir_entry_2 * de_del,
struct buffer_head * bh)
@@ -1578,52 +1635,48 @@ static int ext3_delete_entry (handle_t *handle,
if (!ext3_check_dir_entry("ext3_delete_entry", dir, de, bh, i))
return -EIO;
if (de == de_del) {
+ int err;
+
BUFFER_TRACE(bh, "get_write_access");
- ext3_journal_get_write_access(handle, bh);
+ err = ext3_journal_get_write_access(handle, bh);
+ if (err)
+ goto journal_error;
+
if (pde)
- pde->rec_len =
- cpu_to_le16(le16_to_cpu(pde->rec_len) +
- le16_to_cpu(de->rec_len));
+ pde->rec_len = ext3_rec_len_to_disk(
+ ext3_rec_len_from_disk(pde->rec_len) +
+ ext3_rec_len_from_disk(de->rec_len));
else
de->inode = 0;
dir->i_version++;
BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
- ext3_journal_dirty_metadata(handle, bh);
+ err = ext3_journal_dirty_metadata(handle, bh);
+ if (err) {
+journal_error:
+ ext3_std_error(dir->i_sb, err);
+ return err;
+ }
return 0;
}
- i += le16_to_cpu(de->rec_len);
+ i += ext3_rec_len_from_disk(de->rec_len);
pde = de;
- de = (struct ext3_dir_entry_2 *)
- ((char *) de + le16_to_cpu(de->rec_len));
+ de = ext3_next_entry(de);
}
return -ENOENT;
}
-/*
- * ext3_mark_inode_dirty is somewhat expensive, so unlike ext2 we
- * do not perform it in these functions. We perform it at the call site,
- * if it is needed.
- */
-static inline void ext3_inc_count(handle_t *handle, struct inode *inode)
-{
- inode->i_nlink++;
-}
-
-static inline void ext3_dec_count(handle_t *handle, struct inode *inode)
-{
- inode->i_nlink--;
-}
-
static int ext3_add_nondir(handle_t *handle,
struct dentry *dentry, struct inode *inode)
{
int err = ext3_add_entry(handle, dentry, inode);
if (!err) {
ext3_mark_inode_dirty(handle, inode);
+ unlock_new_inode(inode);
d_instantiate(dentry, inode);
return 0;
}
- ext3_dec_count(handle, inode);
+ drop_nlink(inode);
+ unlock_new_inode(inode);
iput(inode);
return err;
}
@@ -1634,26 +1687,28 @@ static int ext3_add_nondir(handle_t *handle,
* is so far negative - it has no inode.
*
* If the create succeeds, we fill in the inode information
- * with d_instantiate().
+ * with d_instantiate().
*/
-static int ext3_create (struct inode * dir, struct dentry * dentry, int mode,
- struct nameidata *nd)
+static int ext3_create (struct inode * dir, struct dentry * dentry, umode_t mode,
+ bool excl)
{
- handle_t *handle;
+ handle_t *handle;
struct inode * inode;
int err, retries = 0;
+ dquot_initialize(dir);
+
retry:
handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) +
EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3 +
- 2*EXT3_QUOTA_INIT_BLOCKS(dir->i_sb));
+ EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
if (IS_ERR(handle))
return PTR_ERR(handle);
if (IS_DIRSYNC(dir))
handle->h_sync = 1;
- inode = ext3_new_inode (handle, dir, mode);
+ inode = ext3_new_inode (handle, dir, &dentry->d_name, mode);
err = PTR_ERR(inode);
if (!IS_ERR(inode)) {
inode->i_op = &ext3_file_inode_operations;
@@ -1668,7 +1723,7 @@ retry:
}
static int ext3_mknod (struct inode * dir, struct dentry *dentry,
- int mode, dev_t rdev)
+ umode_t mode, dev_t rdev)
{
handle_t *handle;
struct inode *inode;
@@ -1677,17 +1732,19 @@ static int ext3_mknod (struct inode * dir, struct dentry *dentry,
if (!new_valid_dev(rdev))
return -EINVAL;
+ dquot_initialize(dir);
+
retry:
handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) +
- EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3 +
- 2*EXT3_QUOTA_INIT_BLOCKS(dir->i_sb));
+ EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3 +
+ EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
if (IS_ERR(handle))
return PTR_ERR(handle);
if (IS_DIRSYNC(dir))
handle->h_sync = 1;
- inode = ext3_new_inode (handle, dir, mode);
+ inode = ext3_new_inode (handle, dir, &dentry->d_name, mode);
err = PTR_ERR(inode);
if (!IS_ERR(inode)) {
init_special_inode(inode, inode->i_mode, rdev);
@@ -1702,28 +1759,68 @@ retry:
return err;
}
-static int ext3_mkdir(struct inode * dir, struct dentry * dentry, int mode)
+static int ext3_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
+{
+ handle_t *handle;
+ struct inode *inode;
+ int err, retries = 0;
+
+ dquot_initialize(dir);
+
+retry:
+ handle = ext3_journal_start(dir, EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb) +
+ 4 + EXT3_XATTR_TRANS_BLOCKS);
+
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
+
+ inode = ext3_new_inode (handle, dir, NULL, mode);
+ err = PTR_ERR(inode);
+ if (!IS_ERR(inode)) {
+ inode->i_op = &ext3_file_inode_operations;
+ inode->i_fop = &ext3_file_operations;
+ ext3_set_aops(inode);
+ d_tmpfile(dentry, inode);
+ err = ext3_orphan_add(handle, inode);
+ if (err)
+ goto err_unlock_inode;
+ mark_inode_dirty(inode);
+ unlock_new_inode(inode);
+ }
+ ext3_journal_stop(handle);
+ if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries))
+ goto retry;
+ return err;
+err_unlock_inode:
+ ext3_journal_stop(handle);
+ unlock_new_inode(inode);
+ return err;
+}
+
+static int ext3_mkdir(struct inode * dir, struct dentry * dentry, umode_t mode)
{
handle_t *handle;
struct inode * inode;
- struct buffer_head * dir_block;
+ struct buffer_head * dir_block = NULL;
struct ext3_dir_entry_2 * de;
int err, retries = 0;
if (dir->i_nlink >= EXT3_LINK_MAX)
return -EMLINK;
+ dquot_initialize(dir);
+
retry:
handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) +
EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3 +
- 2*EXT3_QUOTA_INIT_BLOCKS(dir->i_sb));
+ EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
if (IS_ERR(handle))
return PTR_ERR(handle);
if (IS_DIRSYNC(dir))
handle->h_sync = 1;
- inode = ext3_new_inode (handle, dir, S_IFDIR | mode);
+ inode = ext3_new_inode (handle, dir, &dentry->d_name, S_IFDIR | mode);
err = PTR_ERR(inode);
if (IS_ERR(inode))
goto out_stop;
@@ -1731,45 +1828,55 @@ retry:
inode->i_op = &ext3_dir_inode_operations;
inode->i_fop = &ext3_dir_operations;
inode->i_size = EXT3_I(inode)->i_disksize = inode->i_sb->s_blocksize;
- dir_block = ext3_bread (handle, inode, 0, 1, &err);
- if (!dir_block) {
- inode->i_nlink--; /* is this nlink == 0? */
- ext3_mark_inode_dirty(handle, inode);
- iput (inode);
- goto out_stop;
- }
+ if (!(dir_block = ext3_dir_bread(handle, inode, 0, 1, &err)))
+ goto out_clear_inode;
+
BUFFER_TRACE(dir_block, "get_write_access");
- ext3_journal_get_write_access(handle, dir_block);
+ err = ext3_journal_get_write_access(handle, dir_block);
+ if (err)
+ goto out_clear_inode;
+
de = (struct ext3_dir_entry_2 *) dir_block->b_data;
de->inode = cpu_to_le32(inode->i_ino);
de->name_len = 1;
- de->rec_len = cpu_to_le16(EXT3_DIR_REC_LEN(de->name_len));
+ de->rec_len = ext3_rec_len_to_disk(EXT3_DIR_REC_LEN(de->name_len));
strcpy (de->name, ".");
ext3_set_de_type(dir->i_sb, de, S_IFDIR);
- de = (struct ext3_dir_entry_2 *)
- ((char *) de + le16_to_cpu(de->rec_len));
+ de = ext3_next_entry(de);
de->inode = cpu_to_le32(dir->i_ino);
- de->rec_len = cpu_to_le16(inode->i_sb->s_blocksize-EXT3_DIR_REC_LEN(1));
+ de->rec_len = ext3_rec_len_to_disk(inode->i_sb->s_blocksize -
+ EXT3_DIR_REC_LEN(1));
de->name_len = 2;
strcpy (de->name, "..");
ext3_set_de_type(dir->i_sb, de, S_IFDIR);
- inode->i_nlink = 2;
+ set_nlink(inode, 2);
BUFFER_TRACE(dir_block, "call ext3_journal_dirty_metadata");
- ext3_journal_dirty_metadata(handle, dir_block);
- brelse (dir_block);
- ext3_mark_inode_dirty(handle, inode);
- err = ext3_add_entry (handle, dentry, inode);
+ err = ext3_journal_dirty_metadata(handle, dir_block);
+ if (err)
+ goto out_clear_inode;
+
+ err = ext3_mark_inode_dirty(handle, inode);
+ if (!err)
+ err = ext3_add_entry (handle, dentry, inode);
+
if (err) {
- inode->i_nlink = 0;
+out_clear_inode:
+ clear_nlink(inode);
+ unlock_new_inode(inode);
ext3_mark_inode_dirty(handle, inode);
iput (inode);
goto out_stop;
}
- dir->i_nlink++;
+ inc_nlink(dir);
ext3_update_dx_flag(dir);
- ext3_mark_inode_dirty(handle, dir);
+ err = ext3_mark_inode_dirty(handle, dir);
+ if (err)
+ goto out_clear_inode;
+
+ unlock_new_inode(inode);
d_instantiate(dentry, inode);
out_stop:
+ brelse(dir_block);
ext3_journal_stop(handle);
if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries))
goto retry;
@@ -1789,43 +1896,41 @@ static int empty_dir (struct inode * inode)
sb = inode->i_sb;
if (inode->i_size < EXT3_DIR_REC_LEN(1) + EXT3_DIR_REC_LEN(2) ||
- !(bh = ext3_bread (NULL, inode, 0, 0, &err))) {
+ !(bh = ext3_dir_bread(NULL, inode, 0, 0, &err))) {
if (err)
- ext3_error(inode->i_sb, __FUNCTION__,
+ ext3_error(inode->i_sb, __func__,
"error %d reading directory #%lu offset 0",
err, inode->i_ino);
else
- ext3_warning(inode->i_sb, __FUNCTION__,
+ ext3_warning(inode->i_sb, __func__,
"bad directory (dir #%lu) - no data block",
inode->i_ino);
return 1;
}
de = (struct ext3_dir_entry_2 *) bh->b_data;
- de1 = (struct ext3_dir_entry_2 *)
- ((char *) de + le16_to_cpu(de->rec_len));
+ de1 = ext3_next_entry(de);
if (le32_to_cpu(de->inode) != inode->i_ino ||
- !le32_to_cpu(de1->inode) ||
+ !le32_to_cpu(de1->inode) ||
strcmp (".", de->name) ||
strcmp ("..", de1->name)) {
- ext3_warning (inode->i_sb, "empty_dir",
+ ext3_warning (inode->i_sb, "empty_dir",
"bad directory (dir #%lu) - no `.' or `..'",
inode->i_ino);
brelse (bh);
return 1;
}
- offset = le16_to_cpu(de->rec_len) + le16_to_cpu(de1->rec_len);
- de = (struct ext3_dir_entry_2 *)
- ((char *) de1 + le16_to_cpu(de1->rec_len));
+ offset = ext3_rec_len_from_disk(de->rec_len) +
+ ext3_rec_len_from_disk(de1->rec_len);
+ de = ext3_next_entry(de1);
while (offset < inode->i_size ) {
if (!bh ||
(void *) de >= (void *) (bh->b_data+sb->s_blocksize)) {
err = 0;
brelse (bh);
- bh = ext3_bread (NULL, inode,
- offset >> EXT3_BLOCK_SIZE_BITS(sb), 0, &err);
- if (!bh) {
+ if (!(bh = ext3_dir_bread (NULL, inode,
+ offset >> EXT3_BLOCK_SIZE_BITS(sb), 0, &err))) {
if (err)
- ext3_error(sb, __FUNCTION__,
+ ext3_error(sb, __func__,
"error %d reading directory"
" #%lu offset %lu",
err, inode->i_ino, offset);
@@ -1844,9 +1949,8 @@ static int empty_dir (struct inode * inode)
brelse (bh);
return 0;
}
- offset += le16_to_cpu(de->rec_len);
- de = (struct ext3_dir_entry_2 *)
- ((char *) de + le16_to_cpu(de->rec_len));
+ offset += ext3_rec_len_from_disk(de->rec_len);
+ de = ext3_next_entry(de);
}
brelse (bh);
return 1;
@@ -1866,7 +1970,7 @@ int ext3_orphan_add(handle_t *handle, struct inode *inode)
struct ext3_iloc iloc;
int err = 0, rc;
- lock_super(sb);
+ mutex_lock(&EXT3_SB(sb)->s_orphan_lock);
if (!list_empty(&EXT3_I(inode)->i_orphan))
goto out_unlock;
@@ -1874,10 +1978,14 @@ int ext3_orphan_add(handle_t *handle, struct inode *inode)
* being truncated, or files being unlinked. */
/* @@@ FIXME: Observation from aviro:
- * I think I can trigger J_ASSERT in ext3_orphan_add(). We block
- * here (on lock_super()), so race with ext3_link() which might bump
+ * I think I can trigger J_ASSERT in ext3_orphan_add(). We block
+ * here (on s_orphan_lock), so race with ext3_link() which might bump
* ->i_nlink. For, say it, character device. Not a regular file,
* not a directory, not a symlink and ->i_nlink > 0.
+ *
+ * tytso, 4/25/2009: I'm not sure how that could happen;
+ * shouldn't the fs core protect us from these sort of
+ * unlink()/link() races?
*/
J_ASSERT ((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
S_ISLNK(inode->i_mode)) || inode->i_nlink == 0);
@@ -1910,11 +2018,11 @@ int ext3_orphan_add(handle_t *handle, struct inode *inode)
if (!err)
list_add(&EXT3_I(inode)->i_orphan, &EXT3_SB(sb)->s_orphan);
- jbd_debug(4, "superblock will point to %ld\n", inode->i_ino);
- jbd_debug(4, "orphan inode %ld will point to %d\n",
+ jbd_debug(4, "superblock will point to %lu\n", inode->i_ino);
+ jbd_debug(4, "orphan inode %lu will point to %d\n",
inode->i_ino, NEXT_ORPHAN(inode));
out_unlock:
- unlock_super(sb);
+ mutex_unlock(&EXT3_SB(sb)->s_orphan_lock);
ext3_std_error(inode->i_sb, err);
return err;
}
@@ -1932,11 +2040,9 @@ int ext3_orphan_del(handle_t *handle, struct inode *inode)
struct ext3_iloc iloc;
int err = 0;
- lock_super(inode->i_sb);
- if (list_empty(&ei->i_orphan)) {
- unlock_super(inode->i_sb);
- return 0;
- }
+ mutex_lock(&EXT3_SB(inode->i_sb)->s_orphan_lock);
+ if (list_empty(&ei->i_orphan))
+ goto out;
ino_next = NEXT_ORPHAN(inode);
prev = ei->i_orphan.prev;
@@ -1986,7 +2092,7 @@ int ext3_orphan_del(handle_t *handle, struct inode *inode)
out_err:
ext3_std_error(inode->i_sb, err);
out:
- unlock_super(inode->i_sb);
+ mutex_unlock(&EXT3_SB(inode->i_sb)->s_orphan_lock);
return err;
out_brelse:
@@ -2004,13 +2110,15 @@ static int ext3_rmdir (struct inode * dir, struct dentry *dentry)
/* Initialize quotas before so that eventual writes go in
* separate transaction */
- DQUOT_INIT(dentry->d_inode);
+ dquot_initialize(dir);
+ dquot_initialize(dentry->d_inode);
+
handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS(dir->i_sb));
if (IS_ERR(handle))
return PTR_ERR(handle);
retval = -ENOENT;
- bh = ext3_find_entry (dentry, &de);
+ bh = ext3_find_entry(dir, &dentry->d_name, &de);
if (!bh)
goto end_rmdir;
@@ -2035,7 +2143,7 @@ static int ext3_rmdir (struct inode * dir, struct dentry *dentry)
"empty directory has nlink!=2 (%d)",
inode->i_nlink);
inode->i_version++;
- inode->i_nlink = 0;
+ clear_nlink(inode);
/* There's no need to set i_disksize: the fact that i_nlink is
* zero will ensure that the right thing happens during any
* recovery. */
@@ -2043,7 +2151,7 @@ static int ext3_rmdir (struct inode * dir, struct dentry *dentry)
ext3_orphan_add(handle, inode);
inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC;
ext3_mark_inode_dirty(handle, inode);
- dir->i_nlink--;
+ drop_nlink(dir);
ext3_update_dx_flag(dir);
ext3_mark_inode_dirty(handle, dir);
@@ -2061,9 +2169,12 @@ static int ext3_unlink(struct inode * dir, struct dentry *dentry)
struct ext3_dir_entry_2 * de;
handle_t *handle;
+ trace_ext3_unlink_enter(dir, dentry);
/* Initialize quotas before so that eventual writes go
* in separate transaction */
- DQUOT_INIT(dentry->d_inode);
+ dquot_initialize(dir);
+ dquot_initialize(dentry->d_inode);
+
handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS(dir->i_sb));
if (IS_ERR(handle))
return PTR_ERR(handle);
@@ -2072,7 +2183,7 @@ static int ext3_unlink(struct inode * dir, struct dentry *dentry)
handle->h_sync = 1;
retval = -ENOENT;
- bh = ext3_find_entry (dentry, &de);
+ bh = ext3_find_entry(dir, &dentry->d_name, &de);
if (!bh)
goto end_unlink;
@@ -2086,7 +2197,7 @@ static int ext3_unlink(struct inode * dir, struct dentry *dentry)
ext3_warning (inode->i_sb, "ext3_unlink",
"Deleting nonexistent file (%lu), %d",
inode->i_ino, inode->i_nlink);
- inode->i_nlink = 1;
+ set_nlink(inode, 1);
}
retval = ext3_delete_entry(handle, dir, de, bh);
if (retval)
@@ -2094,7 +2205,7 @@ static int ext3_unlink(struct inode * dir, struct dentry *dentry)
dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC;
ext3_update_dx_flag(dir);
ext3_mark_inode_dirty(handle, dir);
- inode->i_nlink--;
+ drop_nlink(inode);
if (!inode->i_nlink)
ext3_orphan_add(handle, inode);
inode->i_ctime = dir->i_ctime;
@@ -2104,6 +2215,7 @@ static int ext3_unlink(struct inode * dir, struct dentry *dentry)
end_unlink:
ext3_journal_stop(handle);
brelse (bh);
+ trace_ext3_unlink_exit(dentry, retval);
return retval;
}
@@ -2113,40 +2225,86 @@ static int ext3_symlink (struct inode * dir,
handle_t *handle;
struct inode * inode;
int l, err, retries = 0;
+ int credits;
l = strlen(symname)+1;
if (l > dir->i_sb->s_blocksize)
return -ENAMETOOLONG;
+ dquot_initialize(dir);
+
+ if (l > EXT3_N_BLOCKS * 4) {
+ /*
+ * For non-fast symlinks, we just allocate inode and put it on
+ * orphan list in the first transaction => we need bitmap,
+ * group descriptor, sb, inode block, quota blocks, and
+ * possibly selinux xattr blocks.
+ */
+ credits = 4 + EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb) +
+ EXT3_XATTR_TRANS_BLOCKS;
+ } else {
+ /*
+ * Fast symlink. We have to add entry to directory
+ * (EXT3_DATA_TRANS_BLOCKS + EXT3_INDEX_EXTRA_TRANS_BLOCKS),
+ * allocate new inode (bitmap, group descriptor, inode block,
+ * quota blocks, sb is already counted in previous macros).
+ */
+ credits = EXT3_DATA_TRANS_BLOCKS(dir->i_sb) +
+ EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3 +
+ EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb);
+ }
retry:
- handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) +
- EXT3_INDEX_EXTRA_TRANS_BLOCKS + 5 +
- 2*EXT3_QUOTA_INIT_BLOCKS(dir->i_sb));
+ handle = ext3_journal_start(dir, credits);
if (IS_ERR(handle))
return PTR_ERR(handle);
if (IS_DIRSYNC(dir))
handle->h_sync = 1;
- inode = ext3_new_inode (handle, dir, S_IFLNK|S_IRWXUGO);
+ inode = ext3_new_inode (handle, dir, &dentry->d_name, S_IFLNK|S_IRWXUGO);
err = PTR_ERR(inode);
if (IS_ERR(inode))
goto out_stop;
- if (l > sizeof (EXT3_I(inode)->i_data)) {
+ if (l > EXT3_N_BLOCKS * 4) {
inode->i_op = &ext3_symlink_inode_operations;
ext3_set_aops(inode);
/*
- * page_symlink() calls into ext3_prepare/commit_write.
- * We have a transaction open. All is sweetness. It also sets
- * i_size in generic_commit_write().
+ * We cannot call page_symlink() with transaction started
+ * because it calls into ext3_write_begin() which acquires page
+ * lock which ranks below transaction start (and it can also
+ * wait for journal commit if we are running out of space). So
+ * we have to stop transaction now and restart it when symlink
+ * contents is written.
+ *
+ * To keep fs consistent in case of crash, we have to put inode
+ * to orphan list in the mean time.
+ */
+ drop_nlink(inode);
+ err = ext3_orphan_add(handle, inode);
+ ext3_journal_stop(handle);
+ if (err)
+ goto err_drop_inode;
+ err = __page_symlink(inode, symname, l, 1);
+ if (err)
+ goto err_drop_inode;
+ /*
+ * Now inode is being linked into dir (EXT3_DATA_TRANS_BLOCKS
+ * + EXT3_INDEX_EXTRA_TRANS_BLOCKS), inode is also modified
*/
- err = page_symlink(inode, symname, l);
+ handle = ext3_journal_start(dir,
+ EXT3_DATA_TRANS_BLOCKS(dir->i_sb) +
+ EXT3_INDEX_EXTRA_TRANS_BLOCKS + 1);
+ if (IS_ERR(handle)) {
+ err = PTR_ERR(handle);
+ goto err_drop_inode;
+ }
+ set_nlink(inode, 1);
+ err = ext3_orphan_del(handle, inode);
if (err) {
- ext3_dec_count(handle, inode);
- ext3_mark_inode_dirty(handle, inode);
- iput (inode);
- goto out_stop;
+ ext3_journal_stop(handle);
+ drop_nlink(inode);
+ goto err_drop_inode;
}
} else {
inode->i_op = &ext3_fast_symlink_inode_operations;
@@ -2160,6 +2318,10 @@ out_stop:
if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries))
goto retry;
return err;
+err_drop_inode:
+ unlock_new_inode(inode);
+ iput(inode);
+ return err;
}
static int ext3_link (struct dentry * old_dentry,
@@ -2172,9 +2334,11 @@ static int ext3_link (struct dentry * old_dentry,
if (inode->i_nlink >= EXT3_LINK_MAX)
return -EMLINK;
+ dquot_initialize(dir);
+
retry:
handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) +
- EXT3_INDEX_EXTRA_TRANS_BLOCKS);
+ EXT3_INDEX_EXTRA_TRANS_BLOCKS + 1);
if (IS_ERR(handle))
return PTR_ERR(handle);
@@ -2182,10 +2346,22 @@ retry:
handle->h_sync = 1;
inode->i_ctime = CURRENT_TIME_SEC;
- ext3_inc_count(handle, inode);
- atomic_inc(&inode->i_count);
+ inc_nlink(inode);
+ ihold(inode);
- err = ext3_add_nondir(handle, dentry, inode);
+ err = ext3_add_entry(handle, dentry, inode);
+ if (!err) {
+ ext3_mark_inode_dirty(handle, inode);
+ /* this can happen only for tmpfile being
+ * linked the first time
+ */
+ if (inode->i_nlink == 1)
+ ext3_orphan_del(handle, inode);
+ d_instantiate(dentry, inode);
+ } else {
+ drop_nlink(inode);
+ iput(inode);
+ }
ext3_journal_stop(handle);
if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries))
goto retry;
@@ -2193,8 +2369,7 @@ retry:
}
#define PARENT_INO(buffer) \
- ((struct ext3_dir_entry_2 *) ((char *) buffer + \
- le16_to_cpu(((struct ext3_dir_entry_2 *) buffer)->rec_len)))->inode
+ (ext3_next_entry((struct ext3_dir_entry_2 *)(buffer))->inode)
/*
* Anybody can rename anything with this: the permission checks are left to the
@@ -2207,24 +2382,27 @@ static int ext3_rename (struct inode * old_dir, struct dentry *old_dentry,
struct inode * old_inode, * new_inode;
struct buffer_head * old_bh, * new_bh, * dir_bh;
struct ext3_dir_entry_2 * old_de, * new_de;
- int retval;
+ int retval, flush_file = 0;
+
+ dquot_initialize(old_dir);
+ dquot_initialize(new_dir);
old_bh = new_bh = dir_bh = NULL;
/* Initialize quotas before so that eventual writes go
* in separate transaction */
if (new_dentry->d_inode)
- DQUOT_INIT(new_dentry->d_inode);
+ dquot_initialize(new_dentry->d_inode);
handle = ext3_journal_start(old_dir, 2 *
EXT3_DATA_TRANS_BLOCKS(old_dir->i_sb) +
- EXT3_INDEX_EXTRA_TRANS_BLOCKS + 2);
+ EXT3_INDEX_EXTRA_TRANS_BLOCKS + 2);
if (IS_ERR(handle))
return PTR_ERR(handle);
if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir))
handle->h_sync = 1;
- old_bh = ext3_find_entry (old_dentry, &old_de);
+ old_bh = ext3_find_entry(old_dir, &old_dentry->d_name, &old_de);
/*
* Check for inode number is _not_ due to possible IO errors.
* We might rmdir the source, keep it as pwd of some process
@@ -2237,7 +2415,7 @@ static int ext3_rename (struct inode * old_dir, struct dentry *old_dentry,
goto end_rename;
new_inode = new_dentry->d_inode;
- new_bh = ext3_find_entry (new_dentry, &new_de);
+ new_bh = ext3_find_entry(new_dir, &new_dentry->d_name, &new_de);
if (new_bh) {
if (!new_inode) {
brelse (new_bh);
@@ -2251,7 +2429,7 @@ static int ext3_rename (struct inode * old_dir, struct dentry *old_dentry,
goto end_rename;
}
retval = -EIO;
- dir_bh = ext3_bread (handle, old_inode, 0, 0, &retval);
+ dir_bh = ext3_dir_bread(handle, old_inode, 0, 0, &retval);
if (!dir_bh)
goto end_rename;
if (le32_to_cpu(PARENT_INO(dir_bh->b_data)) != old_dir->i_ino)
@@ -2267,14 +2445,20 @@ static int ext3_rename (struct inode * old_dir, struct dentry *old_dentry,
goto end_rename;
} else {
BUFFER_TRACE(new_bh, "get write access");
- ext3_journal_get_write_access(handle, new_bh);
+ retval = ext3_journal_get_write_access(handle, new_bh);
+ if (retval)
+ goto journal_error;
new_de->inode = cpu_to_le32(old_inode->i_ino);
if (EXT3_HAS_INCOMPAT_FEATURE(new_dir->i_sb,
EXT3_FEATURE_INCOMPAT_FILETYPE))
new_de->file_type = old_de->file_type;
new_dir->i_version++;
+ new_dir->i_ctime = new_dir->i_mtime = CURRENT_TIME_SEC;
+ ext3_mark_inode_dirty(handle, new_dir);
BUFFER_TRACE(new_bh, "call ext3_journal_dirty_metadata");
- ext3_journal_dirty_metadata(handle, new_bh);
+ retval = ext3_journal_dirty_metadata(handle, new_bh);
+ if (retval)
+ goto journal_error;
brelse(new_bh);
new_bh = NULL;
}
@@ -2301,7 +2485,8 @@ static int ext3_rename (struct inode * old_dir, struct dentry *old_dentry,
struct buffer_head *old_bh2;
struct ext3_dir_entry_2 *old_de2;
- old_bh2 = ext3_find_entry(old_dentry, &old_de2);
+ old_bh2 = ext3_find_entry(old_dir, &old_dentry->d_name,
+ &old_de2);
if (old_bh2) {
retval = ext3_delete_entry(handle, old_dir,
old_de2, old_bh2);
@@ -2315,22 +2500,29 @@ static int ext3_rename (struct inode * old_dir, struct dentry *old_dentry,
}
if (new_inode) {
- new_inode->i_nlink--;
+ drop_nlink(new_inode);
new_inode->i_ctime = CURRENT_TIME_SEC;
}
old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME_SEC;
ext3_update_dx_flag(old_dir);
if (dir_bh) {
BUFFER_TRACE(dir_bh, "get_write_access");
- ext3_journal_get_write_access(handle, dir_bh);
+ retval = ext3_journal_get_write_access(handle, dir_bh);
+ if (retval)
+ goto journal_error;
PARENT_INO(dir_bh->b_data) = cpu_to_le32(new_dir->i_ino);
BUFFER_TRACE(dir_bh, "call ext3_journal_dirty_metadata");
- ext3_journal_dirty_metadata(handle, dir_bh);
- old_dir->i_nlink--;
+ retval = ext3_journal_dirty_metadata(handle, dir_bh);
+ if (retval) {
+journal_error:
+ ext3_std_error(new_dir->i_sb, retval);
+ goto end_rename;
+ }
+ drop_nlink(old_dir);
if (new_inode) {
- new_inode->i_nlink--;
+ drop_nlink(new_inode);
} else {
- new_dir->i_nlink++;
+ inc_nlink(new_dir);
ext3_update_dx_flag(new_dir);
ext3_mark_inode_dirty(handle, new_dir);
}
@@ -2340,6 +2532,8 @@ static int ext3_rename (struct inode * old_dir, struct dentry *old_dentry,
ext3_mark_inode_dirty(handle, new_inode);
if (!new_inode->i_nlink)
ext3_orphan_add(handle, new_inode);
+ if (ext3_should_writeback_data(new_inode))
+ flush_file = 1;
}
retval = 0;
@@ -2348,13 +2542,15 @@ end_rename:
brelse (old_bh);
brelse (new_bh);
ext3_journal_stop(handle);
+ if (retval == 0 && flush_file)
+ filemap_flush(old_inode->i_mapping);
return retval;
}
/*
* directories can handle most operations...
*/
-struct inode_operations ext3_dir_inode_operations = {
+const struct inode_operations ext3_dir_inode_operations = {
.create = ext3_create,
.lookup = ext3_lookup,
.link = ext3_link,
@@ -2363,6 +2559,7 @@ struct inode_operations ext3_dir_inode_operations = {
.mkdir = ext3_mkdir,
.rmdir = ext3_rmdir,
.mknod = ext3_mknod,
+ .tmpfile = ext3_tmpfile,
.rename = ext3_rename,
.setattr = ext3_setattr,
#ifdef CONFIG_EXT3_FS_XATTR
@@ -2371,10 +2568,11 @@ struct inode_operations ext3_dir_inode_operations = {
.listxattr = ext3_listxattr,
.removexattr = generic_removexattr,
#endif
- .permission = ext3_permission,
+ .get_acl = ext3_get_acl,
+ .set_acl = ext3_set_acl,
};
-struct inode_operations ext3_special_inode_operations = {
+const struct inode_operations ext3_special_inode_operations = {
.setattr = ext3_setattr,
#ifdef CONFIG_EXT3_FS_XATTR
.setxattr = generic_setxattr,
@@ -2382,5 +2580,6 @@ struct inode_operations ext3_special_inode_operations = {
.listxattr = ext3_listxattr,
.removexattr = generic_removexattr,
#endif
- .permission = ext3_permission,
-};
+ .get_acl = ext3_get_acl,
+ .set_acl = ext3_set_acl,
+};
diff --git a/fs/ext3/namei.h b/fs/ext3/namei.h
index f2ce2b0065c..46304d8c9f0 100644
--- a/fs/ext3/namei.h
+++ b/fs/ext3/namei.h
@@ -6,3 +6,22 @@
*/
extern struct dentry *ext3_get_parent(struct dentry *child);
+
+static inline struct buffer_head *ext3_dir_bread(handle_t *handle,
+ struct inode *inode,
+ int block, int create,
+ int *err)
+{
+ struct buffer_head *bh;
+
+ bh = ext3_bread(handle, inode, block, create, err);
+
+ if (!bh && !(*err)) {
+ *err = -EIO;
+ ext3_error(inode->i_sb, __func__,
+ "Directory hole detected on inode %lu\n",
+ inode->i_ino);
+ return NULL;
+ }
+ return bh;
+}
diff --git a/fs/ext3/resize.c b/fs/ext3/resize.c
index 1041dab6de2..27105655502 100644
--- a/fs/ext3/resize.c
+++ b/fs/ext3/resize.c
@@ -8,16 +8,10 @@
* This could probably be made into a module, because it is not often in use.
*/
-#include <linux/config.h>
#define EXT3FS_DEBUG
-#include <linux/sched.h>
-#include <linux/smp_lock.h>
-#include <linux/ext3_jbd.h>
-
-#include <linux/errno.h>
-#include <linux/slab.h>
+#include "ext3.h"
#define outside(b, first, last) ((b) < (first) || (b) >= (last))
@@ -28,16 +22,16 @@ static int verify_group_input(struct super_block *sb,
{
struct ext3_sb_info *sbi = EXT3_SB(sb);
struct ext3_super_block *es = sbi->s_es;
- unsigned start = le32_to_cpu(es->s_blocks_count);
- unsigned end = start + input->blocks_count;
+ ext3_fsblk_t start = le32_to_cpu(es->s_blocks_count);
+ ext3_fsblk_t end = start + input->blocks_count;
unsigned group = input->group;
- unsigned itend = input->inode_table + sbi->s_itb_per_group;
+ ext3_fsblk_t itend = input->inode_table + sbi->s_itb_per_group;
unsigned overhead = ext3_bg_has_super(sb, group) ?
(1 + ext3_bg_num_gdb(sb, group) +
le16_to_cpu(es->s_reserved_gdt_blocks)) : 0;
- unsigned metaend = start + overhead;
+ ext3_fsblk_t metaend = start + overhead;
struct buffer_head *bh = NULL;
- int free_blocks_count;
+ ext3_grpblk_t free_blocks_count;
int err = -EINVAL;
input->free_blocks_count = free_blocks_count =
@@ -51,58 +45,62 @@ static int verify_group_input(struct super_block *sb,
free_blocks_count, input->reserved_blocks);
if (group != sbi->s_groups_count)
- ext3_warning(sb, __FUNCTION__,
+ ext3_warning(sb, __func__,
"Cannot add at group %u (only %lu groups)",
input->group, sbi->s_groups_count);
else if ((start - le32_to_cpu(es->s_first_data_block)) %
EXT3_BLOCKS_PER_GROUP(sb))
- ext3_warning(sb, __FUNCTION__, "Last group not full");
+ ext3_warning(sb, __func__, "Last group not full");
else if (input->reserved_blocks > input->blocks_count / 5)
- ext3_warning(sb, __FUNCTION__, "Reserved blocks too high (%u)",
+ ext3_warning(sb, __func__, "Reserved blocks too high (%u)",
input->reserved_blocks);
else if (free_blocks_count < 0)
- ext3_warning(sb, __FUNCTION__, "Bad blocks count %u",
+ ext3_warning(sb, __func__, "Bad blocks count %u",
input->blocks_count);
else if (!(bh = sb_bread(sb, end - 1)))
- ext3_warning(sb, __FUNCTION__, "Cannot read last block (%u)",
+ ext3_warning(sb, __func__,
+ "Cannot read last block ("E3FSBLK")",
end - 1);
else if (outside(input->block_bitmap, start, end))
- ext3_warning(sb, __FUNCTION__,
+ ext3_warning(sb, __func__,
"Block bitmap not in group (block %u)",
input->block_bitmap);
else if (outside(input->inode_bitmap, start, end))
- ext3_warning(sb, __FUNCTION__,
+ ext3_warning(sb, __func__,
"Inode bitmap not in group (block %u)",
input->inode_bitmap);
else if (outside(input->inode_table, start, end) ||
outside(itend - 1, start, end))
- ext3_warning(sb, __FUNCTION__,
- "Inode table not in group (blocks %u-%u)",
+ ext3_warning(sb, __func__,
+ "Inode table not in group (blocks %u-"E3FSBLK")",
input->inode_table, itend - 1);
else if (input->inode_bitmap == input->block_bitmap)
- ext3_warning(sb, __FUNCTION__,
+ ext3_warning(sb, __func__,
"Block bitmap same as inode bitmap (%u)",
input->block_bitmap);
else if (inside(input->block_bitmap, input->inode_table, itend))
- ext3_warning(sb, __FUNCTION__,
- "Block bitmap (%u) in inode table (%u-%u)",
+ ext3_warning(sb, __func__,
+ "Block bitmap (%u) in inode table (%u-"E3FSBLK")",
input->block_bitmap, input->inode_table, itend-1);
else if (inside(input->inode_bitmap, input->inode_table, itend))
- ext3_warning(sb, __FUNCTION__,
- "Inode bitmap (%u) in inode table (%u-%u)",
+ ext3_warning(sb, __func__,
+ "Inode bitmap (%u) in inode table (%u-"E3FSBLK")",
input->inode_bitmap, input->inode_table, itend-1);
else if (inside(input->block_bitmap, start, metaend))
- ext3_warning(sb, __FUNCTION__,
- "Block bitmap (%u) in GDT table (%u-%u)",
+ ext3_warning(sb, __func__,
+ "Block bitmap (%u) in GDT table"
+ " ("E3FSBLK"-"E3FSBLK")",
input->block_bitmap, start, metaend - 1);
else if (inside(input->inode_bitmap, start, metaend))
- ext3_warning(sb, __FUNCTION__,
- "Inode bitmap (%u) in GDT table (%u-%u)",
+ ext3_warning(sb, __func__,
+ "Inode bitmap (%u) in GDT table"
+ " ("E3FSBLK"-"E3FSBLK")",
input->inode_bitmap, start, metaend - 1);
else if (inside(input->inode_table, start, metaend) ||
inside(itend - 1, start, metaend))
- ext3_warning(sb, __FUNCTION__,
- "Inode table (%u-%u) overlaps GDT table (%u-%u)",
+ ext3_warning(sb, __func__,
+ "Inode table (%u-"E3FSBLK") overlaps"
+ "GDT table ("E3FSBLK"-"E3FSBLK")",
input->inode_table, itend - 1, start, metaend - 1);
else
err = 0;
@@ -112,14 +110,14 @@ static int verify_group_input(struct super_block *sb,
}
static struct buffer_head *bclean(handle_t *handle, struct super_block *sb,
- unsigned long blk)
+ ext3_fsblk_t blk)
{
struct buffer_head *bh;
int err;
bh = sb_getblk(sb, blk);
- if (!bh)
- return ERR_PTR(-EIO);
+ if (unlikely(!bh))
+ return ERR_PTR(-ENOMEM);
if ((err = ext3_journal_get_write_access(handle, bh))) {
brelse(bh);
bh = ERR_PTR(err);
@@ -153,6 +151,34 @@ static void mark_bitmap_end(int start_bit, int end_bit, char *bitmap)
}
/*
+ * If we have fewer than thresh credits, extend by EXT3_MAX_TRANS_DATA.
+ * If that fails, restart the transaction & regain write access for the
+ * buffer head which is used for block_bitmap modifications.
+ */
+static int extend_or_restart_transaction(handle_t *handle, int thresh,
+ struct buffer_head *bh)
+{
+ int err;
+
+ if (handle->h_buffer_credits >= thresh)
+ return 0;
+
+ err = ext3_journal_extend(handle, EXT3_MAX_TRANS_DATA);
+ if (err < 0)
+ return err;
+ if (err) {
+ err = ext3_journal_restart(handle, EXT3_MAX_TRANS_DATA);
+ if (err)
+ return err;
+ err = ext3_journal_get_write_access(handle, bh);
+ if (err)
+ return err;
+ }
+
+ return 0;
+}
+
+/*
* Set up the block and inode bitmaps, and the inode table for the new group.
* This doesn't need to be part of the main transaction, since we are only
* changing blocks outside the actual filesystem. We still do journaling to
@@ -163,24 +189,24 @@ static int setup_new_group_blocks(struct super_block *sb,
struct ext3_new_group_data *input)
{
struct ext3_sb_info *sbi = EXT3_SB(sb);
- unsigned long start = input->group * sbi->s_blocks_per_group +
- le32_to_cpu(sbi->s_es->s_first_data_block);
+ ext3_fsblk_t start = ext3_group_first_block_no(sb, input->group);
int reserved_gdb = ext3_bg_has_super(sb, input->group) ?
le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) : 0;
unsigned long gdblocks = ext3_bg_num_gdb(sb, input->group);
struct buffer_head *bh;
handle_t *handle;
- unsigned long block;
- int bit;
+ ext3_fsblk_t block;
+ ext3_grpblk_t bit;
int i;
int err = 0, err2;
- handle = ext3_journal_start_sb(sb, reserved_gdb + gdblocks +
- 2 + sbi->s_itb_per_group);
+ /* This transaction may be extended/restarted along the way */
+ handle = ext3_journal_start_sb(sb, EXT3_MAX_TRANS_DATA);
+
if (IS_ERR(handle))
return PTR_ERR(handle);
- lock_super(sb);
+ mutex_lock(&sbi->s_resize_lock);
if (input->group != sbi->s_groups_count) {
err = -EBUSY;
goto exit_journal;
@@ -203,20 +229,28 @@ static int setup_new_group_blocks(struct super_block *sb,
ext3_debug("update backup group %#04lx (+%d)\n", block, bit);
+ err = extend_or_restart_transaction(handle, 1, bh);
+ if (err)
+ goto exit_bh;
+
gdb = sb_getblk(sb, block);
- if (!gdb) {
- err = -EIO;
+ if (unlikely(!gdb)) {
+ err = -ENOMEM;
goto exit_bh;
}
if ((err = ext3_journal_get_write_access(handle, gdb))) {
brelse(gdb);
goto exit_bh;
}
- lock_buffer(bh);
- memcpy(gdb->b_data, sbi->s_group_desc[i], bh->b_size);
+ lock_buffer(gdb);
+ memcpy(gdb->b_data, sbi->s_group_desc[i]->b_data, gdb->b_size);
set_buffer_uptodate(gdb);
- unlock_buffer(bh);
- ext3_journal_dirty_metadata(handle, gdb);
+ unlock_buffer(gdb);
+ err = ext3_journal_dirty_metadata(handle, gdb);
+ if (err) {
+ brelse(gdb);
+ goto exit_bh;
+ }
ext3_set_bit(bit, bh->b_data);
brelse(gdb);
}
@@ -228,11 +262,19 @@ static int setup_new_group_blocks(struct super_block *sb,
ext3_debug("clear reserved block %#04lx (+%d)\n", block, bit);
+ err = extend_or_restart_transaction(handle, 1, bh);
+ if (err)
+ goto exit_bh;
+
if (IS_ERR(gdb = bclean(handle, sb, block))) {
- err = PTR_ERR(bh);
+ err = PTR_ERR(gdb);
+ goto exit_bh;
+ }
+ err = ext3_journal_dirty_metadata(handle, gdb);
+ if (err) {
+ brelse(gdb);
goto exit_bh;
}
- ext3_journal_dirty_metadata(handle, gdb);
ext3_set_bit(bit, bh->b_data);
brelse(gdb);
}
@@ -249,17 +291,33 @@ static int setup_new_group_blocks(struct super_block *sb,
struct buffer_head *it;
ext3_debug("clear inode block %#04lx (+%d)\n", block, bit);
+
+ err = extend_or_restart_transaction(handle, 1, bh);
+ if (err)
+ goto exit_bh;
+
if (IS_ERR(it = bclean(handle, sb, block))) {
err = PTR_ERR(it);
goto exit_bh;
}
- ext3_journal_dirty_metadata(handle, it);
+ err = ext3_journal_dirty_metadata(handle, it);
+ if (err) {
+ brelse(it);
+ goto exit_bh;
+ }
brelse(it);
ext3_set_bit(bit, bh->b_data);
}
+
+ err = extend_or_restart_transaction(handle, 2, bh);
+ if (err)
+ goto exit_bh;
+
mark_bitmap_end(input->blocks_count, EXT3_BLOCKS_PER_GROUP(sb),
bh->b_data);
- ext3_journal_dirty_metadata(handle, bh);
+ err = ext3_journal_dirty_metadata(handle, bh);
+ if (err)
+ goto exit_bh;
brelse(bh);
/* Mark unused entries in inode bitmap used */
@@ -272,12 +330,12 @@ static int setup_new_group_blocks(struct super_block *sb,
mark_bitmap_end(EXT3_INODES_PER_GROUP(sb), EXT3_BLOCKS_PER_GROUP(sb),
bh->b_data);
- ext3_journal_dirty_metadata(handle, bh);
+ err = ext3_journal_dirty_metadata(handle, bh);
exit_bh:
brelse(bh);
exit_journal:
- unlock_super(sb);
+ mutex_unlock(&sbi->s_resize_lock);
if ((err2 = ext3_journal_stop(handle)) && !err)
err = err2;
@@ -328,19 +386,20 @@ static unsigned ext3_list_backups(struct super_block *sb, unsigned *three,
static int verify_reserved_gdb(struct super_block *sb,
struct buffer_head *primary)
{
- const unsigned long blk = primary->b_blocknr;
+ const ext3_fsblk_t blk = primary->b_blocknr;
const unsigned long end = EXT3_SB(sb)->s_groups_count;
unsigned three = 1;
unsigned five = 5;
unsigned seven = 7;
unsigned grp;
- __u32 *p = (__u32 *)primary->b_data;
+ __le32 *p = (__le32 *)primary->b_data;
int gdbackups = 0;
while ((grp = ext3_list_backups(sb, &three, &five, &seven)) < end) {
if (le32_to_cpu(*p++) != grp * EXT3_BLOCKS_PER_GROUP(sb) + blk){
- ext3_warning(sb, __FUNCTION__,
- "reserved GDT %ld missing grp %d (%ld)",
+ ext3_warning(sb, __func__,
+ "reserved GDT "E3FSBLK
+ " missing grp %d ("E3FSBLK")",
blk, grp,
grp * EXT3_BLOCKS_PER_GROUP(sb) + blk);
return -EINVAL;
@@ -372,12 +431,12 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
struct super_block *sb = inode->i_sb;
struct ext3_super_block *es = EXT3_SB(sb)->s_es;
unsigned long gdb_num = input->group / EXT3_DESC_PER_BLOCK(sb);
- unsigned long gdblock = EXT3_SB(sb)->s_sbh->b_blocknr + 1 + gdb_num;
+ ext3_fsblk_t gdblock = EXT3_SB(sb)->s_sbh->b_blocknr + 1 + gdb_num;
struct buffer_head **o_group_desc, **n_group_desc;
struct buffer_head *dind;
int gdbackups;
struct ext3_iloc iloc;
- __u32 *data;
+ __le32 *data;
int err;
if (test_opt(sb, DEBUG))
@@ -392,7 +451,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
*/
if (EXT3_SB(sb)->s_sbh->b_blocknr !=
le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block)) {
- ext3_warning(sb, __FUNCTION__,
+ ext3_warning(sb, __func__,
"won't resize using backup superblock at %llu",
(unsigned long long)EXT3_SB(sb)->s_sbh->b_blocknr);
return -EPERM;
@@ -414,10 +473,10 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
goto exit_bh;
}
- data = (__u32 *)dind->b_data;
+ data = (__le32 *)dind->b_data;
if (le32_to_cpu(data[gdb_num % EXT3_ADDR_PER_BLOCK(sb)]) != gdblock) {
- ext3_warning(sb, __FUNCTION__,
- "new group %u GDT block %lu not reserved",
+ ext3_warning(sb, __func__,
+ "new group %u GDT block "E3FSBLK" not reserved",
input->group, gdblock);
err = -EINVAL;
goto exit_dind;
@@ -436,11 +495,11 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
if ((err = ext3_reserve_inode_write(handle, inode, &iloc)))
goto exit_dindj;
- n_group_desc = (struct buffer_head **)kmalloc((gdb_num + 1) *
- sizeof(struct buffer_head *), GFP_KERNEL);
+ n_group_desc = kmalloc((gdb_num + 1) * sizeof(struct buffer_head *),
+ GFP_NOFS);
if (!n_group_desc) {
err = -ENOMEM;
- ext3_warning (sb, __FUNCTION__,
+ ext3_warning (sb, __func__,
"not enough memory for %lu groups", gdb_num + 1);
goto exit_inode;
}
@@ -455,12 +514,19 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
* reserved inode, and will become GDT blocks (primary and backup).
*/
data[gdb_num % EXT3_ADDR_PER_BLOCK(sb)] = 0;
- ext3_journal_dirty_metadata(handle, dind);
+ err = ext3_journal_dirty_metadata(handle, dind);
+ if (err)
+ goto exit_group_desc;
brelse(dind);
+ dind = NULL;
inode->i_blocks -= (gdbackups + 1) * sb->s_blocksize >> 9;
- ext3_mark_iloc_dirty(handle, inode, &iloc);
+ err = ext3_mark_iloc_dirty(handle, inode, &iloc);
+ if (err)
+ goto exit_group_desc;
memset((*primary)->b_data, 0, sb->s_blocksize);
- ext3_journal_dirty_metadata(handle, *primary);
+ err = ext3_journal_dirty_metadata(handle, *primary);
+ if (err)
+ goto exit_group_desc;
o_group_desc = EXT3_SB(sb)->s_group_desc;
memcpy(n_group_desc, o_group_desc,
@@ -470,12 +536,15 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
EXT3_SB(sb)->s_gdb_count++;
kfree(o_group_desc);
- es->s_reserved_gdt_blocks =
- cpu_to_le16(le16_to_cpu(es->s_reserved_gdt_blocks) - 1);
- ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh);
+ le16_add_cpu(&es->s_reserved_gdt_blocks, -1);
+ err = ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh);
+ if (err)
+ goto exit_inode;
return 0;
+exit_group_desc:
+ kfree(n_group_desc);
exit_inode:
//ext3_journal_release_buffer(handle, iloc.bh);
brelse(iloc.bh);
@@ -515,13 +584,13 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode,
struct buffer_head **primary;
struct buffer_head *dind;
struct ext3_iloc iloc;
- unsigned long blk;
- __u32 *data, *end;
+ ext3_fsblk_t blk;
+ __le32 *data, *end;
int gdbackups = 0;
int res, i;
int err;
- primary = kmalloc(reserved_gdb * sizeof(*primary), GFP_KERNEL);
+ primary = kmalloc(reserved_gdb * sizeof(*primary), GFP_NOFS);
if (!primary)
return -ENOMEM;
@@ -533,15 +602,18 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode,
}
blk = EXT3_SB(sb)->s_sbh->b_blocknr + 1 + EXT3_SB(sb)->s_gdb_count;
- data = (__u32 *)dind->b_data + EXT3_SB(sb)->s_gdb_count;
- end = (__u32 *)dind->b_data + EXT3_ADDR_PER_BLOCK(sb);
+ data = (__le32 *)dind->b_data + (EXT3_SB(sb)->s_gdb_count %
+ EXT3_ADDR_PER_BLOCK(sb));
+ end = (__le32 *)dind->b_data + EXT3_ADDR_PER_BLOCK(sb);
/* Get each reserved primary GDT block and verify it holds backups */
for (res = 0; res < reserved_gdb; res++, blk++) {
if (le32_to_cpu(*data) != blk) {
- ext3_warning(sb, __FUNCTION__,
- "reserved block %lu not at offset %ld",
- blk, (long)(data - (__u32 *)dind->b_data));
+ ext3_warning(sb, __func__,
+ "reserved block "E3FSBLK
+ " not at offset %ld",
+ blk,
+ (long)(data - (__le32 *)dind->b_data));
err = -EINVAL;
goto exit_bh;
}
@@ -556,7 +628,7 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode,
goto exit_bh;
}
if (++data >= end)
- data = (__u32 *)dind->b_data;
+ data = (__le32 *)dind->b_data;
}
for (i = 0; i < reserved_gdb; i++) {
@@ -580,7 +652,7 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode,
blk = input->group * EXT3_BLOCKS_PER_GROUP(sb);
for (i = 0; i < reserved_gdb; i++) {
int err2;
- data = (__u32 *)primary[i]->b_data;
+ data = (__le32 *)primary[i]->b_data;
/* printk("reserving backup %lu[%u] = %lu\n",
primary[i]->b_blocknr, gdbackups,
blk + primary[i]->b_blocknr); */
@@ -612,11 +684,12 @@ exit_free:
* important part is that the new block and inode counts are in the backup
* superblocks, and the location of the new group metadata in the GDT backups.
*
- * We do not need lock_super() for this, because these blocks are not
- * otherwise touched by the filesystem code when it is mounted. We don't
- * need to worry about last changing from sbi->s_groups_count, because the
- * worst that can happen is that we do not copy the full number of backups
- * at this time. The resize which changed s_groups_count will backup again.
+ * We do not need to take the s_resize_lock for this, because these
+ * blocks are not otherwise touched by the filesystem code when it is
+ * mounted. We don't need to worry about last changing from
+ * sbi->s_groups_count, because the worst that can happen is that we
+ * do not copy the full number of backups at this time. The resize
+ * which changed s_groups_count will backup again.
*/
static void update_backups(struct super_block *sb,
int blk_off, char *data, int size)
@@ -649,22 +722,26 @@ static void update_backups(struct super_block *sb,
break;
bh = sb_getblk(sb, group * bpg + blk_off);
- if (!bh) {
- err = -EIO;
+ if (unlikely(!bh)) {
+ err = -ENOMEM;
break;
}
ext3_debug("update metadata backup %#04lx\n",
(unsigned long)bh->b_blocknr);
- if ((err = ext3_journal_get_write_access(handle, bh)))
+ if ((err = ext3_journal_get_write_access(handle, bh))) {
+ brelse(bh);
break;
+ }
lock_buffer(bh);
memcpy(bh->b_data, data, size);
if (rest)
memset(bh->b_data + size, 0, rest);
set_buffer_uptodate(bh);
unlock_buffer(bh);
- ext3_journal_dirty_metadata(handle, bh);
+ err = ext3_journal_dirty_metadata(handle, bh);
brelse(bh);
+ if (err)
+ break;
}
if ((err2 = ext3_journal_stop(handle)) && !err)
err = err2;
@@ -681,11 +758,11 @@ static void update_backups(struct super_block *sb,
*/
exit_err:
if (err) {
- ext3_warning(sb, __FUNCTION__,
+ ext3_warning(sb, __func__,
"can't update backup for group %d (err %d), "
"forcing fsck on next reboot", group, err);
sbi->s_mount_state &= ~EXT3_VALID_FS;
- sbi->s_es->s_state &= ~cpu_to_le16(EXT3_VALID_FS);
+ sbi->s_es->s_state &= cpu_to_le16(~EXT3_VALID_FS);
mark_buffer_dirty(sbi->s_sbh);
}
}
@@ -721,24 +798,36 @@ int ext3_group_add(struct super_block *sb, struct ext3_new_group_data *input)
if (gdb_off == 0 && !EXT3_HAS_RO_COMPAT_FEATURE(sb,
EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER)) {
- ext3_warning(sb, __FUNCTION__,
+ ext3_warning(sb, __func__,
"Can't resize non-sparse filesystem further");
return -EPERM;
}
+ if (le32_to_cpu(es->s_blocks_count) + input->blocks_count <
+ le32_to_cpu(es->s_blocks_count)) {
+ ext3_warning(sb, __func__, "blocks_count overflow\n");
+ return -EINVAL;
+ }
+
+ if (le32_to_cpu(es->s_inodes_count) + EXT3_INODES_PER_GROUP(sb) <
+ le32_to_cpu(es->s_inodes_count)) {
+ ext3_warning(sb, __func__, "inodes_count overflow\n");
+ return -EINVAL;
+ }
+
if (reserved_gdb || gdb_off == 0) {
if (!EXT3_HAS_COMPAT_FEATURE(sb,
- EXT3_FEATURE_COMPAT_RESIZE_INODE)){
- ext3_warning(sb, __FUNCTION__,
+ EXT3_FEATURE_COMPAT_RESIZE_INODE)
+ || !le16_to_cpu(es->s_reserved_gdt_blocks)) {
+ ext3_warning(sb, __func__,
"No reserved GDT blocks, can't resize");
return -EPERM;
}
- inode = iget(sb, EXT3_RESIZE_INO);
- if (!inode || is_bad_inode(inode)) {
- ext3_warning(sb, __FUNCTION__,
+ inode = ext3_iget(sb, EXT3_RESIZE_INO);
+ if (IS_ERR(inode)) {
+ ext3_warning(sb, __func__,
"Error opening resize inode");
- iput(inode);
- return -ENOENT;
+ return PTR_ERR(inode);
}
}
@@ -763,9 +852,9 @@ int ext3_group_add(struct super_block *sb, struct ext3_new_group_data *input)
goto exit_put;
}
- lock_super(sb);
+ mutex_lock(&sbi->s_resize_lock);
if (input->group != sbi->s_groups_count) {
- ext3_warning(sb, __FUNCTION__,
+ ext3_warning(sb, __func__,
"multiple resizers run on filesystem!");
err = -EBUSY;
goto exit_journal;
@@ -794,7 +883,7 @@ int ext3_group_add(struct super_block *sb, struct ext3_new_group_data *input)
/*
* OK, now we've set up the new group. Time to make it active.
*
- * Current kernels don't lock all allocations via lock_super(),
+ * We do not lock all allocations via s_resize_lock
* so we have to be safe wrt. concurrent accesses to the group
* data. So we need to be careful to set all of the relevant
* group descriptor data etc. *before* we enable the group.
@@ -829,10 +918,8 @@ int ext3_group_add(struct super_block *sb, struct ext3_new_group_data *input)
* blocks/inodes before the group is live won't actually let us
* allocate the new space yet.
*/
- es->s_blocks_count = cpu_to_le32(le32_to_cpu(es->s_blocks_count) +
- input->blocks_count);
- es->s_inodes_count = cpu_to_le32(le32_to_cpu(es->s_inodes_count) +
- EXT3_INODES_PER_GROUP(sb));
+ le32_add_cpu(&es->s_blocks_count, input->blocks_count);
+ le32_add_cpu(&es->s_inodes_count, EXT3_INODES_PER_GROUP(sb));
/*
* We need to protect s_groups_count against other CPUs seeing
@@ -840,12 +927,12 @@ int ext3_group_add(struct super_block *sb, struct ext3_new_group_data *input)
*
* The precise rules we use are:
*
- * * Writers of s_groups_count *must* hold lock_super
+ * * Writers of s_groups_count *must* hold s_resize_lock
* AND
* * Writers must perform a smp_wmb() after updating all dependent
* data and before modifying the groups count
*
- * * Readers must hold lock_super() over the access
+ * * Readers must hold s_resize_lock over the access
* OR
* * Readers must perform an smp_rmb() after reading the groups count
* and before reading any dependent data.
@@ -861,24 +948,24 @@ int ext3_group_add(struct super_block *sb, struct ext3_new_group_data *input)
/* Update the global fs size fields */
sbi->s_groups_count++;
- ext3_journal_dirty_metadata(handle, primary);
+ err = ext3_journal_dirty_metadata(handle, primary);
+ if (err)
+ goto exit_journal;
/* Update the reserved block counts only once the new group is
* active. */
- es->s_r_blocks_count = cpu_to_le32(le32_to_cpu(es->s_r_blocks_count) +
- input->reserved_blocks);
+ le32_add_cpu(&es->s_r_blocks_count, input->reserved_blocks);
/* Update the free space counts */
- percpu_counter_mod(&sbi->s_freeblocks_counter,
+ percpu_counter_add(&sbi->s_freeblocks_counter,
input->free_blocks_count);
- percpu_counter_mod(&sbi->s_freeinodes_counter,
+ percpu_counter_add(&sbi->s_freeinodes_counter,
EXT3_INODES_PER_GROUP(sb));
- ext3_journal_dirty_metadata(handle, sbi->s_sbh);
- sb->s_dirt = 1;
+ err = ext3_journal_dirty_metadata(handle, sbi->s_sbh);
exit_journal:
- unlock_super(sb);
+ mutex_unlock(&sbi->s_resize_lock);
if ((err2 = ext3_journal_stop(handle)) && !err)
err = err2;
if (!err) {
@@ -902,31 +989,41 @@ exit_put:
* GDT blocks are reserved to grow to the desired size.
*/
int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es,
- unsigned long n_blocks_count)
+ ext3_fsblk_t n_blocks_count)
{
- unsigned long o_blocks_count;
- unsigned long o_groups_count;
- unsigned long last;
- int add;
+ ext3_fsblk_t o_blocks_count;
+ ext3_grpblk_t last;
+ ext3_grpblk_t add;
struct buffer_head * bh;
handle_t *handle;
- int err, freed_blocks;
+ int err;
+ unsigned long freed_blocks;
/* We don't need to worry about locking wrt other resizers just
* yet: we're going to revalidate es->s_blocks_count after
- * taking lock_super() below. */
+ * taking the s_resize_lock below. */
o_blocks_count = le32_to_cpu(es->s_blocks_count);
- o_groups_count = EXT3_SB(sb)->s_groups_count;
if (test_opt(sb, DEBUG))
- printk(KERN_DEBUG "EXT3-fs: extending last group from %lu to %lu blocks\n",
+ printk(KERN_DEBUG "EXT3-fs: extending last group from "E3FSBLK
+ " up to "E3FSBLK" blocks\n",
o_blocks_count, n_blocks_count);
if (n_blocks_count == 0 || n_blocks_count == o_blocks_count)
return 0;
+ if (n_blocks_count > (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) {
+ printk(KERN_ERR "EXT3-fs: filesystem on %s:"
+ " too large to resize to "E3FSBLK" blocks safely\n",
+ sb->s_id, n_blocks_count);
+ if (sizeof(sector_t) < 8)
+ ext3_warning(sb, __func__,
+ "CONFIG_LBDAF not enabled\n");
+ return -EINVAL;
+ }
+
if (n_blocks_count < o_blocks_count) {
- ext3_warning(sb, __FUNCTION__,
+ ext3_warning(sb, __func__,
"can't shrink FS - resize aborted");
return -EBUSY;
}
@@ -936,25 +1033,31 @@ int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es,
EXT3_BLOCKS_PER_GROUP(sb);
if (last == 0) {
- ext3_warning(sb, __FUNCTION__,
+ ext3_warning(sb, __func__,
"need to use ext2online to resize further");
return -EPERM;
}
add = EXT3_BLOCKS_PER_GROUP(sb) - last;
+ if (o_blocks_count + add < o_blocks_count) {
+ ext3_warning(sb, __func__, "blocks_count overflow");
+ return -EINVAL;
+ }
+
if (o_blocks_count + add > n_blocks_count)
add = n_blocks_count - o_blocks_count;
if (o_blocks_count + add < n_blocks_count)
- ext3_warning(sb, __FUNCTION__,
- "will only finish group (%lu blocks, %u new)",
+ ext3_warning(sb, __func__,
+ "will only finish group ("E3FSBLK
+ " blocks, %u new)",
o_blocks_count + add, add);
/* See if the device is actually as big as what was requested */
bh = sb_bread(sb, o_blocks_count + add -1);
if (!bh) {
- ext3_warning(sb, __FUNCTION__,
+ ext3_warning(sb, __func__,
"can't read last block, resize aborted");
return -ENOSPC;
}
@@ -966,35 +1069,42 @@ int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es,
handle = ext3_journal_start_sb(sb, 3);
if (IS_ERR(handle)) {
err = PTR_ERR(handle);
- ext3_warning(sb, __FUNCTION__, "error %d on journal start",err);
+ ext3_warning(sb, __func__, "error %d on journal start",err);
goto exit_put;
}
- lock_super(sb);
+ mutex_lock(&EXT3_SB(sb)->s_resize_lock);
if (o_blocks_count != le32_to_cpu(es->s_blocks_count)) {
- ext3_warning(sb, __FUNCTION__,
+ ext3_warning(sb, __func__,
"multiple resizers run on filesystem!");
+ mutex_unlock(&EXT3_SB(sb)->s_resize_lock);
+ ext3_journal_stop(handle);
err = -EBUSY;
goto exit_put;
}
if ((err = ext3_journal_get_write_access(handle,
EXT3_SB(sb)->s_sbh))) {
- ext3_warning(sb, __FUNCTION__,
+ ext3_warning(sb, __func__,
"error %d on journal write access", err);
- unlock_super(sb);
+ mutex_unlock(&EXT3_SB(sb)->s_resize_lock);
ext3_journal_stop(handle);
goto exit_put;
}
es->s_blocks_count = cpu_to_le32(o_blocks_count + add);
- ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh);
- sb->s_dirt = 1;
- unlock_super(sb);
- ext3_debug("freeing blocks %ld through %ld\n", o_blocks_count,
- o_blocks_count + add);
+ err = ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh);
+ mutex_unlock(&EXT3_SB(sb)->s_resize_lock);
+ if (err) {
+ ext3_warning(sb, __func__,
+ "error %d on journal dirty metadata", err);
+ ext3_journal_stop(handle);
+ goto exit_put;
+ }
+ ext3_debug("freeing blocks "E3FSBLK" through "E3FSBLK"\n",
+ o_blocks_count, o_blocks_count + add);
ext3_free_blocks_sb(handle, sb, o_blocks_count, add, &freed_blocks);
- ext3_debug("freed blocks %ld through %ld\n", o_blocks_count,
- o_blocks_count + add);
+ ext3_debug("freed blocks "E3FSBLK" through "E3FSBLK"\n",
+ o_blocks_count, o_blocks_count + add);
if ((err = ext3_journal_stop(handle)))
goto exit_put;
if (test_opt(sb, DEBUG))
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 56bf7658601..08cdfe5461e 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -16,39 +16,40 @@
* David S. Miller (davem@caip.rutgers.edu), 1995
*/
-#include <linux/config.h>
#include <linux/module.h>
-#include <linux/string.h>
-#include <linux/fs.h>
-#include <linux/time.h>
-#include <linux/jbd.h>
-#include <linux/ext3_fs.h>
-#include <linux/ext3_jbd.h>
-#include <linux/slab.h>
-#include <linux/init.h>
#include <linux/blkdev.h>
#include <linux/parser.h>
-#include <linux/smp_lock.h>
-#include <linux/buffer_head.h>
-#include <linux/vfs.h>
+#include <linux/exportfs.h>
+#include <linux/statfs.h>
#include <linux/random.h>
#include <linux/mount.h>
-#include <linux/namei.h>
#include <linux/quotaops.h>
#include <linux/seq_file.h>
+#include <linux/log2.h>
+#include <linux/cleancache.h>
+#include <linux/namei.h>
#include <asm/uaccess.h>
+#define CREATE_TRACE_POINTS
+
+#include "ext3.h"
#include "xattr.h"
#include "acl.h"
#include "namei.h"
+#ifdef CONFIG_EXT3_DEFAULTS_TO_ORDERED
+ #define EXT3_MOUNT_DEFAULT_DATA_MODE EXT3_MOUNT_ORDERED_DATA
+#else
+ #define EXT3_MOUNT_DEFAULT_DATA_MODE EXT3_MOUNT_WRITEBACK_DATA
+#endif
+
static int ext3_load_journal(struct super_block *, struct ext3_super_block *,
unsigned long journal_devnum);
static int ext3_create_journal(struct super_block *, struct ext3_super_block *,
- int);
-static void ext3_commit_super (struct super_block * sb,
- struct ext3_super_block * es,
+ unsigned int);
+static int ext3_commit_super(struct super_block *sb,
+ struct ext3_super_block *es,
int sync);
static void ext3_mark_recovery_complete(struct super_block * sb,
struct ext3_super_block * es);
@@ -58,18 +59,12 @@ static int ext3_sync_fs(struct super_block *sb, int wait);
static const char *ext3_decode_error(struct super_block * sb, int errno,
char nbuf[16]);
static int ext3_remount (struct super_block * sb, int * flags, char * data);
-static int ext3_statfs (struct super_block * sb, struct kstatfs * buf);
-static void ext3_unlockfs(struct super_block *sb);
-static void ext3_write_super (struct super_block * sb);
-static void ext3_write_super_lockfs(struct super_block *sb);
+static int ext3_statfs (struct dentry * dentry, struct kstatfs * buf);
+static int ext3_unfreeze(struct super_block *sb);
+static int ext3_freeze(struct super_block *sb);
-/*
+/*
* Wrappers for journal_start/end.
- *
- * The only special thing we need to do here is to make sure that all
- * journal_end calls result in the superblock being marked dirty, so
- * that sync() will call the filesystem's write_super callback if
- * appropriate.
*/
handle_t *ext3_journal_start_sb(struct super_block *sb, int nblocks)
{
@@ -83,7 +78,7 @@ handle_t *ext3_journal_start_sb(struct super_block *sb, int nblocks)
* take the FS itself readonly cleanly. */
journal = EXT3_SB(sb)->s_journal;
if (is_journal_aborted(journal)) {
- ext3_abort(sb, __FUNCTION__,
+ ext3_abort(sb, __func__,
"Detected aborted journal");
return ERR_PTR(-EROFS);
}
@@ -91,12 +86,6 @@ handle_t *ext3_journal_start_sb(struct super_block *sb, int nblocks)
return journal_start(journal, nblocks);
}
-/*
- * The only special thing we need to do here is to make sure that all
- * journal_stop calls result in the superblock being marked dirty, so
- * that sync() will call the filesystem's write_super callback if
- * appropriate.
- */
int __ext3_journal_stop(const char *where, handle_t *handle)
{
struct super_block *sb;
@@ -129,12 +118,28 @@ void ext3_journal_abort_handle(const char *caller, const char *err_fn,
if (is_handle_aborted(handle))
return;
- printk(KERN_ERR "%s: aborting transaction: %s in %s\n",
- caller, errstr, err_fn);
+ printk(KERN_ERR "EXT3-fs: %s: aborting transaction: %s in %s\n",
+ caller, errstr, err_fn);
journal_abort_handle(handle);
}
+void ext3_msg(struct super_block *sb, const char *prefix,
+ const char *fmt, ...)
+{
+ struct va_format vaf;
+ va_list args;
+
+ va_start(args, fmt);
+
+ vaf.fmt = fmt;
+ vaf.va = &args;
+
+ printk("%sEXT3-fs (%s): %pV\n", prefix, sb->s_id, &vaf);
+
+ va_end(args);
+}
+
/* Deal with the reporting of failure conditions on a filesystem such as
* inconsistencies detected or read IO failures.
*
@@ -146,7 +151,7 @@ void ext3_journal_abort_handle(const char *caller, const char *err_fn,
* write out the superblock safely.
*
* We'll just use the journal_abort() error code to record an error in
- * the journal instead. On recovery, the journal will compain about
+ * the journal instead. On recovery, the journal will complain about
* that error until we've noted it down and cleared it.
*/
@@ -160,31 +165,43 @@ static void ext3_handle_error(struct super_block *sb)
if (sb->s_flags & MS_RDONLY)
return;
- if (test_opt (sb, ERRORS_RO)) {
- printk (KERN_CRIT "Remounting filesystem read-only\n");
- sb->s_flags |= MS_RDONLY;
- } else {
+ if (!test_opt (sb, ERRORS_CONT)) {
journal_t *journal = EXT3_SB(sb)->s_journal;
- EXT3_SB(sb)->s_mount_opt |= EXT3_MOUNT_ABORT;
+ set_opt(EXT3_SB(sb)->s_mount_opt, ABORT);
if (journal)
journal_abort(journal, -EIO);
}
+ if (test_opt (sb, ERRORS_RO)) {
+ ext3_msg(sb, KERN_CRIT,
+ "error: remounting filesystem read-only");
+ /*
+ * Make sure updated value of ->s_mount_state will be visible
+ * before ->s_flags update.
+ */
+ smp_wmb();
+ sb->s_flags |= MS_RDONLY;
+ }
+ ext3_commit_super(sb, es, 1);
if (test_opt(sb, ERRORS_PANIC))
- panic("EXT3-fs (device %s): panic forced after error\n",
+ panic("EXT3-fs (%s): panic forced after error\n",
sb->s_id);
- ext3_commit_super(sb, es, 1);
}
-void ext3_error (struct super_block * sb, const char * function,
- const char * fmt, ...)
+void ext3_error(struct super_block *sb, const char *function,
+ const char *fmt, ...)
{
+ struct va_format vaf;
va_list args;
va_start(args, fmt);
- printk(KERN_CRIT "EXT3-fs error (device %s): %s: ",sb->s_id, function);
- vprintk(fmt, args);
- printk("\n");
+
+ vaf.fmt = fmt;
+ vaf.va = &args;
+
+ printk(KERN_CRIT "EXT3-fs error (device %s): %s: %pV\n",
+ sb->s_id, function, &vaf);
+
va_end(args);
ext3_handle_error(sb);
@@ -240,8 +257,7 @@ void __ext3_std_error (struct super_block * sb, const char * function,
return;
errstr = ext3_decode_error(sb, errno, nbuf);
- printk (KERN_CRIT "EXT3-fs error (device %s) in %s: %s\n",
- sb->s_id, function, errstr);
+ ext3_msg(sb, KERN_CRIT, "error in %s: %s", function, errstr);
ext3_handle_error(sb);
}
@@ -256,42 +272,57 @@ void __ext3_std_error (struct super_block * sb, const char * function,
* case we take the easy way out and panic immediately.
*/
-void ext3_abort (struct super_block * sb, const char * function,
- const char * fmt, ...)
+void ext3_abort(struct super_block *sb, const char *function,
+ const char *fmt, ...)
{
+ struct va_format vaf;
va_list args;
- printk (KERN_CRIT "ext3_abort called.\n");
-
va_start(args, fmt);
- printk(KERN_CRIT "EXT3-fs error (device %s): %s: ",sb->s_id, function);
- vprintk(fmt, args);
- printk("\n");
+
+ vaf.fmt = fmt;
+ vaf.va = &args;
+
+ printk(KERN_CRIT "EXT3-fs (%s): error: %s: %pV\n",
+ sb->s_id, function, &vaf);
+
va_end(args);
if (test_opt(sb, ERRORS_PANIC))
- panic("EXT3-fs panic from previous error\n");
+ panic("EXT3-fs: panic from previous error\n");
if (sb->s_flags & MS_RDONLY)
return;
- printk(KERN_CRIT "Remounting filesystem read-only\n");
+ ext3_msg(sb, KERN_CRIT,
+ "error: remounting filesystem read-only");
EXT3_SB(sb)->s_mount_state |= EXT3_ERROR_FS;
+ set_opt(EXT3_SB(sb)->s_mount_opt, ABORT);
+ /*
+ * Make sure updated value of ->s_mount_state will be visible
+ * before ->s_flags update.
+ */
+ smp_wmb();
sb->s_flags |= MS_RDONLY;
- EXT3_SB(sb)->s_mount_opt |= EXT3_MOUNT_ABORT;
- journal_abort(EXT3_SB(sb)->s_journal, -EIO);
+
+ if (EXT3_SB(sb)->s_journal)
+ journal_abort(EXT3_SB(sb)->s_journal, -EIO);
}
-void ext3_warning (struct super_block * sb, const char * function,
- const char * fmt, ...)
+void ext3_warning(struct super_block *sb, const char *function,
+ const char *fmt, ...)
{
+ struct va_format vaf;
va_list args;
va_start(args, fmt);
- printk(KERN_WARNING "EXT3-fs warning (device %s): %s: ",
- sb->s_id, function);
- vprintk(fmt, args);
- printk("\n");
+
+ vaf.fmt = fmt;
+ vaf.va = &args;
+
+ printk(KERN_WARNING "EXT3-fs (%s): warning: %s: %pV\n",
+ sb->s_id, function, &vaf);
+
va_end(args);
}
@@ -302,10 +333,10 @@ void ext3_update_dynamic_rev(struct super_block *sb)
if (le32_to_cpu(es->s_rev_level) > EXT3_GOOD_OLD_REV)
return;
- ext3_warning(sb, __FUNCTION__,
- "updating to rev %d because of new feature flag, "
- "running e2fsck is recommended",
- EXT3_DYNAMIC_REV);
+ ext3_msg(sb, KERN_WARNING,
+ "warning: updating to rev %d because of "
+ "new feature flag, running e2fsck is recommended",
+ EXT3_DYNAMIC_REV);
es->s_first_ino = cpu_to_le32(EXT3_GOOD_OLD_FIRST_INO);
es->s_inode_size = cpu_to_le16(EXT3_GOOD_OLD_INODE_SIZE);
@@ -323,42 +354,39 @@ void ext3_update_dynamic_rev(struct super_block *sb)
/*
* Open the external journal device
*/
-static struct block_device *ext3_blkdev_get(dev_t dev)
+static struct block_device *ext3_blkdev_get(dev_t dev, struct super_block *sb)
{
struct block_device *bdev;
char b[BDEVNAME_SIZE];
- bdev = open_by_devnum(dev, FMODE_READ|FMODE_WRITE);
+ bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL, sb);
if (IS_ERR(bdev))
goto fail;
return bdev;
fail:
- printk(KERN_ERR "EXT3: failed to open journal device %s: %ld\n",
- __bdevname(dev, b), PTR_ERR(bdev));
+ ext3_msg(sb, KERN_ERR, "error: failed to open journal device %s: %ld",
+ __bdevname(dev, b), PTR_ERR(bdev));
+
return NULL;
}
/*
* Release the journal device
*/
-static int ext3_blkdev_put(struct block_device *bdev)
+static void ext3_blkdev_put(struct block_device *bdev)
{
- bd_release(bdev);
- return blkdev_put(bdev);
+ blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
}
-static int ext3_blkdev_remove(struct ext3_sb_info *sbi)
+static void ext3_blkdev_remove(struct ext3_sb_info *sbi)
{
struct block_device *bdev;
- int ret = -ENODEV;
-
bdev = sbi->journal_bdev;
if (bdev) {
- ret = ext3_blkdev_put(bdev);
+ ext3_blkdev_put(bdev);
sbi->journal_bdev = NULL;
}
- return ret;
}
static inline struct inode *orphan_list_entry(struct list_head *l)
@@ -370,16 +398,16 @@ static void dump_orphan_list(struct super_block *sb, struct ext3_sb_info *sbi)
{
struct list_head *l;
- printk(KERN_ERR "sb orphan head is %d\n",
+ ext3_msg(sb, KERN_ERR, "error: sb orphan head is %d",
le32_to_cpu(sbi->s_es->s_last_orphan));
- printk(KERN_ERR "sb_info orphan list:\n");
+ ext3_msg(sb, KERN_ERR, "sb_info orphan list:");
list_for_each(l, &sbi->s_orphan) {
struct inode *inode = orphan_list_entry(l);
- printk(KERN_ERR " "
- "inode %s:%ld at %p: mode %o, nlink %d, next %d\n",
+ /* No trailing "\n" here: ext3_msg() appends the newline itself. */
+ ext3_msg(sb, KERN_ERR, " "
+ "inode %s:%lu at %p: mode %o, nlink %d, next %d",
inode->i_sb->s_id, inode->i_ino, inode,
- inode->i_mode, inode->i_nlink,
+ inode->i_mode, inode->i_nlink,
NEXT_ORPHAN(inode));
}
}
@@ -388,10 +416,15 @@ static void ext3_put_super (struct super_block * sb)
{
struct ext3_sb_info *sbi = EXT3_SB(sb);
struct ext3_super_block *es = sbi->s_es;
- int i;
+ int i, err;
+ dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
ext3_xattr_put_super(sb);
- journal_destroy(sbi->s_journal);
+ err = journal_destroy(sbi->s_journal);
+ sbi->s_journal = NULL;
+ if (err < 0)
+ ext3_abort(sb, __func__, "Couldn't clean up the journal");
+
if (!(sb->s_flags & MS_RDONLY)) {
EXT3_CLEAR_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER);
es->s_state = cpu_to_le16(sbi->s_mount_state);
@@ -420,7 +453,7 @@ static void ext3_put_super (struct super_block * sb)
dump_orphan_list(sb, sbi);
J_ASSERT(list_empty(&sbi->s_orphan));
- invalidate_bdev(sb->s_bdev, 0);
+ invalidate_bdev(sb->s_bdev);
if (sbi->journal_bdev && sbi->journal_bdev != sb->s_bdev) {
/*
* Invalidate the journal device's buffers. We don't want them
@@ -428,15 +461,15 @@ static void ext3_put_super (struct super_block * sb)
* hotswapped, and it breaks the `ro-after' testing code.
*/
sync_blockdev(sbi->journal_bdev);
- invalidate_bdev(sbi->journal_bdev, 0);
+ invalidate_bdev(sbi->journal_bdev);
ext3_blkdev_remove(sbi);
}
sb->s_fs_info = NULL;
+ kfree(sbi->s_blockgroup_lock);
kfree(sbi);
- return;
}
-static kmem_cache_t *ext3_inode_cachep;
+static struct kmem_cache *ext3_inode_cachep;
/*
* Called inside transaction, so use GFP_NOFS
@@ -445,44 +478,62 @@ static struct inode *ext3_alloc_inode(struct super_block *sb)
{
struct ext3_inode_info *ei;
- ei = kmem_cache_alloc(ext3_inode_cachep, SLAB_NOFS);
+ ei = kmem_cache_alloc(ext3_inode_cachep, GFP_NOFS);
if (!ei)
return NULL;
-#ifdef CONFIG_EXT3_FS_POSIX_ACL
- ei->i_acl = EXT3_ACL_NOT_CACHED;
- ei->i_default_acl = EXT3_ACL_NOT_CACHED;
-#endif
ei->i_block_alloc_info = NULL;
ei->vfs_inode.i_version = 1;
+ atomic_set(&ei->i_datasync_tid, 0);
+ atomic_set(&ei->i_sync_tid, 0);
return &ei->vfs_inode;
}
-static void ext3_destroy_inode(struct inode *inode)
+static int ext3_drop_inode(struct inode *inode)
{
+ int drop = generic_drop_inode(inode);
+
+ trace_ext3_drop_inode(inode, drop);
+ return drop;
+}
+
+static void ext3_i_callback(struct rcu_head *head)
+{
+ struct inode *inode = container_of(head, struct inode, i_rcu);
kmem_cache_free(ext3_inode_cachep, EXT3_I(inode));
}
-static void init_once(void * foo, kmem_cache_t * cachep, unsigned long flags)
+/*
+ * Release an ext3 in-core inode.
+ *
+ * If the inode is still linked on an orphan list (reported below as a
+ * failed check), log that at KERN_ERR, hex-dump the ext3_inode_info and
+ * print a stack trace. The inode is then queued with call_rcu(); the
+ * actual kmem_cache_free() happens in ext3_i_callback().
+ */
+static void ext3_destroy_inode(struct inode *inode)
+{
+ if (!list_empty(&(EXT3_I(inode)->i_orphan))) {
+ printk(KERN_ERR "EXT3 Inode %p: orphan list check failed!\n",
+ EXT3_I(inode));
+ print_hex_dump(KERN_INFO, "", DUMP_PREFIX_ADDRESS, 16, 4,
+ EXT3_I(inode), sizeof(struct ext3_inode_info),
+ false);
+ dump_stack();
+ }
+ call_rcu(&inode->i_rcu, ext3_i_callback);
+}
+
+static void init_once(void *foo)
{
struct ext3_inode_info *ei = (struct ext3_inode_info *) foo;
- if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
- SLAB_CTOR_CONSTRUCTOR) {
- INIT_LIST_HEAD(&ei->i_orphan);
+ INIT_LIST_HEAD(&ei->i_orphan);
#ifdef CONFIG_EXT3_FS_XATTR
- init_rwsem(&ei->xattr_sem);
+ init_rwsem(&ei->xattr_sem);
#endif
- init_MUTEX(&ei->truncate_sem);
- inode_init_once(&ei->vfs_inode);
- }
+ mutex_init(&ei->truncate_mutex);
+ inode_init_once(&ei->vfs_inode);
}
-
-static int init_inodecache(void)
+
+static int __init init_inodecache(void)
{
ext3_inode_cachep = kmem_cache_create("ext3_inode_cache",
sizeof(struct ext3_inode_info),
- 0, SLAB_RECLAIM_ACCOUNT,
- init_once, NULL);
+ 0, (SLAB_RECLAIM_ACCOUNT|
+ SLAB_MEM_SPREAD),
+ init_once);
if (ext3_inode_cachep == NULL)
return -ENOMEM;
return 0;
@@ -490,28 +541,12 @@ static int init_inodecache(void)
static void destroy_inodecache(void)
{
- if (kmem_cache_destroy(ext3_inode_cachep))
- printk(KERN_INFO "ext3_inode_cache: not all structures were freed\n");
-}
-
-static void ext3_clear_inode(struct inode *inode)
-{
- struct ext3_block_alloc_info *rsv = EXT3_I(inode)->i_block_alloc_info;
-#ifdef CONFIG_EXT3_FS_POSIX_ACL
- if (EXT3_I(inode)->i_acl &&
- EXT3_I(inode)->i_acl != EXT3_ACL_NOT_CACHED) {
- posix_acl_release(EXT3_I(inode)->i_acl);
- EXT3_I(inode)->i_acl = EXT3_ACL_NOT_CACHED;
- }
- if (EXT3_I(inode)->i_default_acl &&
- EXT3_I(inode)->i_default_acl != EXT3_ACL_NOT_CACHED) {
- posix_acl_release(EXT3_I(inode)->i_default_acl);
- EXT3_I(inode)->i_default_acl = EXT3_ACL_NOT_CACHED;
- }
-#endif
- ext3_discard_reservation(inode);
- EXT3_I(inode)->i_block_alloc_info = NULL;
- kfree(rsv);
+ /*
+ * Make sure all delayed rcu free inodes are flushed before we
+ * destroy cache.
+ */
+ rcu_barrier();
+ kmem_cache_destroy(ext3_inode_cachep);
}
static inline void ext3_show_quota_options(struct seq_file *seq, struct super_block *sb)
@@ -519,9 +554,22 @@ static inline void ext3_show_quota_options(struct seq_file *seq, struct super_bl
#if defined(CONFIG_QUOTA)
struct ext3_sb_info *sbi = EXT3_SB(sb);
- if (sbi->s_jquota_fmt)
- seq_printf(seq, ",jqfmt=%s",
- (sbi->s_jquota_fmt == QFMT_VFS_OLD) ? "vfsold": "vfsv0");
+ if (sbi->s_jquota_fmt) {
+ char *fmtname = "";
+
+ switch (sbi->s_jquota_fmt) {
+ case QFMT_VFS_OLD:
+ fmtname = "vfsold";
+ break;
+ case QFMT_VFS_V0:
+ fmtname = "vfsv0";
+ break;
+ case QFMT_VFS_V1:
+ fmtname = "vfsv1";
+ break;
+ }
+ seq_printf(seq, ",jqfmt=%s", fmtname);
+ }
if (sbi->s_qf_names[USRQUOTA])
seq_printf(seq, ",usrjquota=%s", sbi->s_qf_names[USRQUOTA]);
@@ -529,97 +577,239 @@ static inline void ext3_show_quota_options(struct seq_file *seq, struct super_bl
if (sbi->s_qf_names[GRPQUOTA])
seq_printf(seq, ",grpjquota=%s", sbi->s_qf_names[GRPQUOTA]);
- if (sbi->s_mount_opt & EXT3_MOUNT_USRQUOTA)
+ if (test_opt(sb, USRQUOTA))
seq_puts(seq, ",usrquota");
- if (sbi->s_mount_opt & EXT3_MOUNT_GRPQUOTA)
+ if (test_opt(sb, GRPQUOTA))
seq_puts(seq, ",grpquota");
#endif
}
-static int ext3_show_options(struct seq_file *seq, struct vfsmount *vfs)
+/*
+ * Map an EXT3_MOUNT_*_DATA mode value to the name used after "data=" in
+ * mount options and messages; any other value yields "unknown".
+ */
+static char *data_mode_string(unsigned long mode)
+{
+ if (mode == EXT3_MOUNT_JOURNAL_DATA)
+ return "journal";
+ if (mode == EXT3_MOUNT_ORDERED_DATA)
+ return "ordered";
+ if (mode == EXT3_MOUNT_WRITEBACK_DATA)
+ return "writeback";
+
+ return "unknown";
+}
+
+/*
+ * Show an option if
+ * - it's set to a non-default value OR
+ * - if the per-sb default is different from the global default
+ */
+static int ext3_show_options(struct seq_file *seq, struct dentry *root)
{
- struct super_block *sb = vfs->mnt_sb;
+ struct super_block *sb = root->d_sb;
+ struct ext3_sb_info *sbi = EXT3_SB(sb);
+ struct ext3_super_block *es = sbi->s_es;
+ unsigned long def_mount_opts;
+
+ def_mount_opts = le32_to_cpu(es->s_default_mount_opts);
+
+ if (sbi->s_sb_block != 1)
+ seq_printf(seq, ",sb=%lu", sbi->s_sb_block);
+ if (test_opt(sb, MINIX_DF))
+ seq_puts(seq, ",minixdf");
+ if (test_opt(sb, GRPID))
+ seq_puts(seq, ",grpid");
+ if (!test_opt(sb, GRPID) && (def_mount_opts & EXT3_DEFM_BSDGROUPS))
+ seq_puts(seq, ",nogrpid");
+ if (!uid_eq(sbi->s_resuid, make_kuid(&init_user_ns, EXT3_DEF_RESUID)) ||
+ le16_to_cpu(es->s_def_resuid) != EXT3_DEF_RESUID) {
+ seq_printf(seq, ",resuid=%u",
+ from_kuid_munged(&init_user_ns, sbi->s_resuid));
+ }
+ if (!gid_eq(sbi->s_resgid, make_kgid(&init_user_ns, EXT3_DEF_RESGID)) ||
+ le16_to_cpu(es->s_def_resgid) != EXT3_DEF_RESGID) {
+ seq_printf(seq, ",resgid=%u",
+ from_kgid_munged(&init_user_ns, sbi->s_resgid));
+ }
+ if (test_opt(sb, ERRORS_RO)) {
+ int def_errors = le16_to_cpu(es->s_errors);
+
+ if (def_errors == EXT3_ERRORS_PANIC ||
+ def_errors == EXT3_ERRORS_CONTINUE) {
+ seq_puts(seq, ",errors=remount-ro");
+ }
+ }
+ if (test_opt(sb, ERRORS_CONT))
+ seq_puts(seq, ",errors=continue");
+ if (test_opt(sb, ERRORS_PANIC))
+ seq_puts(seq, ",errors=panic");
+ if (test_opt(sb, NO_UID32))
+ seq_puts(seq, ",nouid32");
+ if (test_opt(sb, DEBUG))
+ seq_puts(seq, ",debug");
+#ifdef CONFIG_EXT3_FS_XATTR
+ if (test_opt(sb, XATTR_USER))
+ seq_puts(seq, ",user_xattr");
+ if (!test_opt(sb, XATTR_USER) &&
+ (def_mount_opts & EXT3_DEFM_XATTR_USER)) {
+ seq_puts(seq, ",nouser_xattr");
+ }
+#endif
+#ifdef CONFIG_EXT3_FS_POSIX_ACL
+ if (test_opt(sb, POSIX_ACL))
+ seq_puts(seq, ",acl");
+ if (!test_opt(sb, POSIX_ACL) && (def_mount_opts & EXT3_DEFM_ACL))
+ seq_puts(seq, ",noacl");
+#endif
+ if (!test_opt(sb, RESERVATION))
+ seq_puts(seq, ",noreservation");
+ if (sbi->s_commit_interval) {
+ seq_printf(seq, ",commit=%u",
+ (unsigned) (sbi->s_commit_interval / HZ));
+ }
+
+ /*
+ * Always display barrier state so it's clear what the status is.
+ */
+ seq_puts(seq, ",barrier=");
+ seq_puts(seq, test_opt(sb, BARRIER) ? "1" : "0");
+ seq_printf(seq, ",data=%s", data_mode_string(test_opt(sb, DATA_FLAGS)));
+ if (test_opt(sb, DATA_ERR_ABORT))
+ seq_puts(seq, ",data_err=abort");
- if (test_opt(sb, DATA_FLAGS) == EXT3_MOUNT_JOURNAL_DATA)
- seq_puts(seq, ",data=journal");
- else if (test_opt(sb, DATA_FLAGS) == EXT3_MOUNT_ORDERED_DATA)
- seq_puts(seq, ",data=ordered");
- else if (test_opt(sb, DATA_FLAGS) == EXT3_MOUNT_WRITEBACK_DATA)
- seq_puts(seq, ",data=writeback");
+ if (test_opt(sb, NOLOAD))
+ seq_puts(seq, ",norecovery");
ext3_show_quota_options(seq, sb);
return 0;
}
+
+/*
+ * Look up inode number @ino on @sb for the file-handle decoding helpers.
+ *
+ * Only the root inode, or numbers from EXT3_FIRST_INO(sb) up to the
+ * superblock's s_inodes_count, are accepted. Returns the inode on success,
+ * or an ERR_PTR: -ESTALE for an out-of-range number or a generation
+ * mismatch, otherwise the error reported by ext3_iget().
+ */
+static struct inode *ext3_nfs_get_inode(struct super_block *sb,
+ u64 ino, u32 generation)
+{
+ struct inode *inode;
+
+ if (ino < EXT3_FIRST_INO(sb) && ino != EXT3_ROOT_INO)
+ return ERR_PTR(-ESTALE);
+ if (ino > le32_to_cpu(EXT3_SB(sb)->s_es->s_inodes_count))
+ return ERR_PTR(-ESTALE);
+
+ /* iget isn't really right if the inode is currently unallocated!!
+ *
+ * NOTE(review): this comment used to name ext3_read_inode, but the
+ * lookup below goes through ext3_iget and relies on it returning an
+ * ERR_PTR for an inode it cannot use (presumably including deleted
+ * ones -- confirm against ext3_iget).
+ *
+ * Currently we don't know the generation for parent directory, so
+ * a generation of 0 means "accept any"
+ */
+ inode = ext3_iget(sb, ino);
+ if (IS_ERR(inode))
+ return ERR_CAST(inode);
+ if (generation && inode->i_generation != generation) {
+ /* Drop the reference taken by the lookup before failing. */
+ iput(inode);
+ return ERR_PTR(-ESTALE);
+ }
+
+ return inode;
+}
+
+/*
+ * ->fh_to_dentry hook of ext3_export_ops: hands the file handle to
+ * generic_fh_to_dentry(), with ext3_nfs_get_inode as the inode lookup.
+ */
+static struct dentry *ext3_fh_to_dentry(struct super_block *sb, struct fid *fid,
+ int fh_len, int fh_type)
+{
+ return generic_fh_to_dentry(sb, fid, fh_len, fh_type,
+ ext3_nfs_get_inode);
+}
+
+/*
+ * ->fh_to_parent hook of ext3_export_ops: hands the file handle to
+ * generic_fh_to_parent(), with ext3_nfs_get_inode as the inode lookup.
+ */
+static struct dentry *ext3_fh_to_parent(struct super_block *sb, struct fid *fid,
+ int fh_len, int fh_type)
+{
+ return generic_fh_to_parent(sb, fid, fh_len, fh_type,
+ ext3_nfs_get_inode);
+}
+
+/*
+ * Try to release metadata pages (indirect blocks, directories) which are
+ * mapped via the block device. Since these pages could have journal heads
+ * which would prevent try_to_free_buffers() from freeing them, we must use
+ * jbd layer's try_to_free_buffers() function to release them.
+ *
+ * Returns 0 straight away when the page has no buffers; otherwise returns
+ * the result of journal_try_to_free_buffers(), or of try_to_free_buffers()
+ * when the superblock has no journal.
+ */
+static int bdev_try_to_free_page(struct super_block *sb, struct page *page,
+ gfp_t wait)
+{
+ journal_t *journal = EXT3_SB(sb)->s_journal;
+
+ WARN_ON(PageChecked(page));
+ if (!page_has_buffers(page))
+ return 0;
+ /* The jbd helper is always called with __GFP_WAIT masked out of @wait. */
+ if (journal)
+ return journal_try_to_free_buffers(journal, page,
+ wait & ~__GFP_WAIT);
+ return try_to_free_buffers(page);
+}
+
#ifdef CONFIG_QUOTA
#define QTYPE2NAME(t) ((t)==USRQUOTA?"user":"group")
#define QTYPE2MOPT(on, t) ((t)==USRQUOTA?((on)##USRJQUOTA):((on)##GRPJQUOTA))
-static int ext3_dquot_initialize(struct inode *inode, int type);
-static int ext3_dquot_drop(struct inode *inode);
static int ext3_write_dquot(struct dquot *dquot);
static int ext3_acquire_dquot(struct dquot *dquot);
static int ext3_release_dquot(struct dquot *dquot);
static int ext3_mark_dquot_dirty(struct dquot *dquot);
static int ext3_write_info(struct super_block *sb, int type);
-static int ext3_quota_on(struct super_block *sb, int type, int format_id, char *path);
+static int ext3_quota_on(struct super_block *sb, int type, int format_id,
+ struct path *path);
static int ext3_quota_on_mount(struct super_block *sb, int type);
static ssize_t ext3_quota_read(struct super_block *sb, int type, char *data,
size_t len, loff_t off);
static ssize_t ext3_quota_write(struct super_block *sb, int type,
const char *data, size_t len, loff_t off);
-static struct dquot_operations ext3_quota_operations = {
- .initialize = ext3_dquot_initialize,
- .drop = ext3_dquot_drop,
- .alloc_space = dquot_alloc_space,
- .alloc_inode = dquot_alloc_inode,
- .free_space = dquot_free_space,
- .free_inode = dquot_free_inode,
- .transfer = dquot_transfer,
+static const struct dquot_operations ext3_quota_operations = {
.write_dquot = ext3_write_dquot,
.acquire_dquot = ext3_acquire_dquot,
.release_dquot = ext3_release_dquot,
.mark_dirty = ext3_mark_dquot_dirty,
- .write_info = ext3_write_info
+ .write_info = ext3_write_info,
+ .alloc_dquot = dquot_alloc,
+ .destroy_dquot = dquot_destroy,
};
-static struct quotactl_ops ext3_qctl_operations = {
+static const struct quotactl_ops ext3_qctl_operations = {
.quota_on = ext3_quota_on,
- .quota_off = vfs_quota_off,
- .quota_sync = vfs_quota_sync,
- .get_info = vfs_get_dqinfo,
- .set_info = vfs_set_dqinfo,
- .get_dqblk = vfs_get_dqblk,
- .set_dqblk = vfs_set_dqblk
+ .quota_off = dquot_quota_off,
+ .quota_sync = dquot_quota_sync,
+ .get_info = dquot_get_dqinfo,
+ .set_info = dquot_set_dqinfo,
+ .get_dqblk = dquot_get_dqblk,
+ .set_dqblk = dquot_set_dqblk
};
#endif
-static struct super_operations ext3_sops = {
+static const struct super_operations ext3_sops = {
.alloc_inode = ext3_alloc_inode,
.destroy_inode = ext3_destroy_inode,
- .read_inode = ext3_read_inode,
.write_inode = ext3_write_inode,
.dirty_inode = ext3_dirty_inode,
- .delete_inode = ext3_delete_inode,
+ .drop_inode = ext3_drop_inode,
+ .evict_inode = ext3_evict_inode,
.put_super = ext3_put_super,
- .write_super = ext3_write_super,
.sync_fs = ext3_sync_fs,
- .write_super_lockfs = ext3_write_super_lockfs,
- .unlockfs = ext3_unlockfs,
+ .freeze_fs = ext3_freeze,
+ .unfreeze_fs = ext3_unfreeze,
.statfs = ext3_statfs,
.remount_fs = ext3_remount,
- .clear_inode = ext3_clear_inode,
.show_options = ext3_show_options,
#ifdef CONFIG_QUOTA
.quota_read = ext3_quota_read,
.quota_write = ext3_quota_write,
#endif
+ .bdev_try_to_free_page = bdev_try_to_free_page,
};
-static struct export_operations ext3_export_ops = {
+static const struct export_operations ext3_export_ops = {
+ .fh_to_dentry = ext3_fh_to_dentry,
+ .fh_to_parent = ext3_fh_to_parent,
.get_parent = ext3_get_parent,
};
@@ -628,16 +818,18 @@ enum {
Opt_resgid, Opt_resuid, Opt_sb, Opt_err_cont, Opt_err_panic, Opt_err_ro,
Opt_nouid32, Opt_nocheck, Opt_debug, Opt_oldalloc, Opt_orlov,
Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl,
- Opt_reservation, Opt_noreservation, Opt_noload, Opt_nobh,
+ Opt_reservation, Opt_noreservation, Opt_noload, Opt_nobh, Opt_bh,
Opt_commit, Opt_journal_update, Opt_journal_inum, Opt_journal_dev,
+ Opt_journal_path,
Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
+ Opt_data_err_abort, Opt_data_err_ignore,
Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
- Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota,
- Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota,
- Opt_grpquota
+ Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota,
+ Opt_noquota, Opt_ignore, Opt_barrier, Opt_nobarrier, Opt_err,
+ Opt_resize, Opt_usrquota, Opt_grpquota
};
-static match_table_t tokens = {
+static const match_table_t tokens = {
{Opt_bsd_df, "bsddf"},
{Opt_minix_df, "minixdf"},
{Opt_grpid, "grpid"},
@@ -663,41 +855,50 @@ static match_table_t tokens = {
{Opt_reservation, "reservation"},
{Opt_noreservation, "noreservation"},
{Opt_noload, "noload"},
+ {Opt_noload, "norecovery"},
{Opt_nobh, "nobh"},
+ {Opt_bh, "bh"},
{Opt_commit, "commit=%u"},
{Opt_journal_update, "journal=update"},
{Opt_journal_inum, "journal=%u"},
{Opt_journal_dev, "journal_dev=%u"},
+ {Opt_journal_path, "journal_path=%s"},
{Opt_abort, "abort"},
{Opt_data_journal, "data=journal"},
{Opt_data_ordered, "data=ordered"},
{Opt_data_writeback, "data=writeback"},
+ {Opt_data_err_abort, "data_err=abort"},
+ {Opt_data_err_ignore, "data_err=ignore"},
{Opt_offusrjquota, "usrjquota="},
{Opt_usrjquota, "usrjquota=%s"},
{Opt_offgrpjquota, "grpjquota="},
{Opt_grpjquota, "grpjquota=%s"},
{Opt_jqfmt_vfsold, "jqfmt=vfsold"},
{Opt_jqfmt_vfsv0, "jqfmt=vfsv0"},
+ {Opt_jqfmt_vfsv1, "jqfmt=vfsv1"},
{Opt_grpquota, "grpquota"},
{Opt_noquota, "noquota"},
{Opt_quota, "quota"},
{Opt_usrquota, "usrquota"},
{Opt_barrier, "barrier=%u"},
- {Opt_err, NULL},
+ {Opt_barrier, "barrier"},
+ {Opt_nobarrier, "nobarrier"},
{Opt_resize, "resize"},
+ {Opt_err, NULL},
};
-static unsigned long get_sb_block(void **data)
+static ext3_fsblk_t get_sb_block(void **data, struct super_block *sb)
{
- unsigned long sb_block;
- char *options = (char *) *data;
+ ext3_fsblk_t sb_block;
+ char *options = (char *) *data;
if (!options || strncmp(options, "sb=", 3) != 0)
return 1; /* Default location */
options += 3;
+ /*todo: use simple_strtoll with >32bit ext3 */
sb_block = simple_strtoul(options, &options, 0);
if (*options && *options != ',') {
- printk("EXT3-fs: Invalid sb specification: %s\n",
+ ext3_msg(sb, KERN_ERR, "error: invalid sb specification: %s",
(char *) *data);
return 1;
}
@@ -707,18 +908,83 @@ static unsigned long get_sb_block(void **data)
return sb_block;
}
+#ifdef CONFIG_QUOTA
+/*
+ * Record the journaled quota file name supplied in @args for quota type
+ * @qtype (used by parse_options() for the usrjquota=/grpjquota= options).
+ *
+ * Returns 1 on success, including when the same name is already recorded,
+ * and 0 on failure after logging the reason with ext3_msg().
+ */
+static int set_qf_name(struct super_block *sb, int qtype, substring_t *args)
+{
+ struct ext3_sb_info *sbi = EXT3_SB(sb);
+ char *qname;
+
+ /* A name may not be introduced while quota is already loaded. */
+ if (sb_any_quota_loaded(sb) &&
+ !sbi->s_qf_names[qtype]) {
+ ext3_msg(sb, KERN_ERR,
+ "Cannot change journaled "
+ "quota options when quota turned on");
+ return 0;
+ }
+ qname = match_strdup(args);
+ if (!qname) {
+ ext3_msg(sb, KERN_ERR,
+ "Not enough memory for storing quotafile name");
+ return 0;
+ }
+ /* A name is already recorded: only an identical one is accepted. */
+ if (sbi->s_qf_names[qtype]) {
+ int same = !strcmp(sbi->s_qf_names[qtype], qname);
+
+ /* The duplicate is not kept on this path, whatever the outcome. */
+ kfree(qname);
+ if (!same) {
+ ext3_msg(sb, KERN_ERR,
+ "%s quota file already specified",
+ QTYPE2NAME(qtype));
+ }
+ return same;
+ }
+ /* The name must not contain a '/'. */
+ if (strchr(qname, '/')) {
+ ext3_msg(sb, KERN_ERR,
+ "quotafile must be on filesystem root");
+ kfree(qname);
+ return 0;
+ }
+ /* Keep the duplicated string; clear_qf_name() is what kfree()s it. */
+ sbi->s_qf_names[qtype] = qname;
+ set_opt(sbi->s_mount_opt, QUOTA);
+ return 1;
+}
+
+/*
+ * Forget the journaled quota file name recorded for quota type @qtype
+ * (used by parse_options() for the empty "usrjquota=" / "grpjquota="
+ * forms). Refused while quota is loaded and a name is set.
+ *
+ * Returns 1 on success and 0 on failure after logging with ext3_msg().
+ */
+static int clear_qf_name(struct super_block *sb, int qtype)
+{
+ struct ext3_sb_info *sbi = EXT3_SB(sb);
+
+ if (sb_any_quota_loaded(sb) &&
+ sbi->s_qf_names[qtype]) {
+ ext3_msg(sb, KERN_ERR, "Cannot change journaled quota options"
+ " when quota turned on");
+ return 0;
+ }
+ /* kfree(NULL) is a no-op, so the pointer needs no test first. */
+ kfree(sbi->s_qf_names[qtype]);
+ sbi->s_qf_names[qtype] = NULL;
+ return 1;
+}
+#endif
+
static int parse_options (char *options, struct super_block *sb,
- unsigned long *inum, unsigned long *journal_devnum,
- unsigned long *n_blocks_count, int is_remount)
+ unsigned int *inum, unsigned long *journal_devnum,
+ ext3_fsblk_t *n_blocks_count, int is_remount)
{
struct ext3_sb_info *sbi = EXT3_SB(sb);
char * p;
substring_t args[MAX_OPT_ARGS];
int data_opt = 0;
int option;
+ kuid_t uid;
+ kgid_t gid;
+ char *journal_path;
+ struct inode *journal_inode;
+ struct path path;
+ int error;
+
#ifdef CONFIG_QUOTA
- int qtype;
- char *qname;
+ int qfmt;
#endif
if (!options)
@@ -728,7 +994,11 @@ static int parse_options (char *options, struct super_block *sb,
int token;
if (!*p)
continue;
-
+ /*
+ * Initialize args struct so we know whether arg was
+ * found; some options take optional arguments.
+ */
+ args[0].to = args[0].from = NULL;
token = match_token(p, tokens, args);
switch (token) {
case Opt_bsd_df:
@@ -746,12 +1016,23 @@ static int parse_options (char *options, struct super_block *sb,
case Opt_resuid:
if (match_int(&args[0], &option))
return 0;
- sbi->s_resuid = option;
+ uid = make_kuid(current_user_ns(), option);
+ if (!uid_valid(uid)) {
+ ext3_msg(sb, KERN_ERR, "Invalid uid value %d", option);
+ return 0;
+
+ }
+ sbi->s_resuid = uid;
break;
case Opt_resgid:
if (match_int(&args[0], &option))
return 0;
- sbi->s_resgid = option;
+ gid = make_kgid(current_user_ns(), option);
+ if (!gid_valid(gid)) {
+ ext3_msg(sb, KERN_ERR, "Invalid gid value %d", option);
+ return 0;
+ }
+ sbi->s_resgid = gid;
break;
case Opt_sb:
/* handled by get_sb_block() instead of here */
@@ -782,10 +1063,12 @@ static int parse_options (char *options, struct super_block *sb,
set_opt (sbi->s_mount_opt, DEBUG);
break;
case Opt_oldalloc:
- set_opt (sbi->s_mount_opt, OLDALLOC);
+ ext3_msg(sb, KERN_WARNING,
+ "Ignoring deprecated oldalloc option");
break;
case Opt_orlov:
- clear_opt (sbi->s_mount_opt, OLDALLOC);
+ ext3_msg(sb, KERN_WARNING,
+ "Ignoring deprecated orlov option");
break;
#ifdef CONFIG_EXT3_FS_XATTR
case Opt_user_xattr:
@@ -797,7 +1080,8 @@ static int parse_options (char *options, struct super_block *sb,
#else
case Opt_user_xattr:
case Opt_nouser_xattr:
- printk("EXT3 (no)user_xattr options not supported\n");
+ ext3_msg(sb, KERN_INFO,
+ "(no)user_xattr options not supported");
break;
#endif
#ifdef CONFIG_EXT3_FS_POSIX_ACL
@@ -810,7 +1094,8 @@ static int parse_options (char *options, struct super_block *sb,
#else
case Opt_acl:
case Opt_noacl:
- printk("EXT3 (no)acl options not supported\n");
+ ext3_msg(sb, KERN_INFO,
+ "(no)acl options not supported");
break;
#endif
case Opt_reservation:
@@ -826,16 +1111,16 @@ static int parse_options (char *options, struct super_block *sb,
user to specify an existing inode to be the
journal file. */
if (is_remount) {
- printk(KERN_ERR "EXT3-fs: cannot specify "
- "journal on remount\n");
+ ext3_msg(sb, KERN_ERR, "error: cannot specify "
+ "journal on remount");
return 0;
}
set_opt (sbi->s_mount_opt, UPDATE_JOURNAL);
break;
case Opt_journal_inum:
if (is_remount) {
- printk(KERN_ERR "EXT3-fs: cannot specify "
- "journal on remount\n");
+ ext3_msg(sb, KERN_ERR, "error: cannot specify "
+ "journal on remount");
return 0;
}
if (match_int(&args[0], &option))
@@ -844,14 +1129,49 @@ static int parse_options (char *options, struct super_block *sb,
break;
case Opt_journal_dev:
if (is_remount) {
- printk(KERN_ERR "EXT3-fs: cannot specify "
- "journal on remount\n");
+ ext3_msg(sb, KERN_ERR, "error: cannot specify "
+ "journal on remount");
return 0;
}
if (match_int(&args[0], &option))
return 0;
*journal_devnum = option;
break;
+ case Opt_journal_path:
+ if (is_remount) {
+ ext3_msg(sb, KERN_ERR, "error: cannot specify "
+ "journal on remount");
+ return 0;
+ }
+
+ journal_path = match_strdup(&args[0]);
+ if (!journal_path) {
+ ext3_msg(sb, KERN_ERR, "error: could not dup "
+ "journal device string");
+ return 0;
+ }
+
+ error = kern_path(journal_path, LOOKUP_FOLLOW, &path);
+ if (error) {
+ ext3_msg(sb, KERN_ERR, "error: could not find "
+ "journal device path: error %d", error);
+ kfree(journal_path);
+ return 0;
+ }
+
+ journal_inode = path.dentry->d_inode;
+ if (!S_ISBLK(journal_inode->i_mode)) {
+ ext3_msg(sb, KERN_ERR, "error: journal path %s "
+ "is not a block device", journal_path);
+ path_put(&path);
+ kfree(journal_path);
+ return 0;
+ }
+
+ *journal_devnum = new_encode_dev(journal_inode->i_rdev);
+ path_put(&path);
+ kfree(journal_path);
+ break;
case Opt_noload:
set_opt (sbi->s_mount_opt, NOLOAD);
break;
@@ -874,80 +1194,62 @@ static int parse_options (char *options, struct super_block *sb,
data_opt = EXT3_MOUNT_WRITEBACK_DATA;
datacheck:
if (is_remount) {
- if ((sbi->s_mount_opt & EXT3_MOUNT_DATA_FLAGS)
- != data_opt) {
- printk(KERN_ERR
- "EXT3-fs: cannot change data "
- "mode on remount\n");
- return 0;
- }
+ if (test_opt(sb, DATA_FLAGS) == data_opt)
+ break;
+ ext3_msg(sb, KERN_ERR,
+ "error: cannot change "
+ "data mode on remount. The filesystem "
+ "is mounted in data=%s mode and you "
+ "try to remount it in data=%s mode.",
+ data_mode_string(test_opt(sb,
+ DATA_FLAGS)),
+ data_mode_string(data_opt));
+ return 0;
} else {
- sbi->s_mount_opt &= ~EXT3_MOUNT_DATA_FLAGS;
+ clear_opt(sbi->s_mount_opt, DATA_FLAGS);
sbi->s_mount_opt |= data_opt;
}
break;
+ case Opt_data_err_abort:
+ set_opt(sbi->s_mount_opt, DATA_ERR_ABORT);
+ break;
+ case Opt_data_err_ignore:
+ clear_opt(sbi->s_mount_opt, DATA_ERR_ABORT);
+ break;
#ifdef CONFIG_QUOTA
case Opt_usrjquota:
- qtype = USRQUOTA;
- goto set_qf_name;
- case Opt_grpjquota:
- qtype = GRPQUOTA;
-set_qf_name:
- if (sb_any_quota_enabled(sb)) {
- printk(KERN_ERR
- "EXT3-fs: Cannot change journalled "
- "quota options when quota turned on.\n");
- return 0;
- }
- qname = match_strdup(&args[0]);
- if (!qname) {
- printk(KERN_ERR
- "EXT3-fs: not enough memory for "
- "storing quotafile name.\n");
+ if (!set_qf_name(sb, USRQUOTA, &args[0]))
return 0;
- }
- if (sbi->s_qf_names[qtype] &&
- strcmp(sbi->s_qf_names[qtype], qname)) {
- printk(KERN_ERR
- "EXT3-fs: %s quota file already "
- "specified.\n", QTYPE2NAME(qtype));
- kfree(qname);
- return 0;
- }
- sbi->s_qf_names[qtype] = qname;
- if (strchr(sbi->s_qf_names[qtype], '/')) {
- printk(KERN_ERR
- "EXT3-fs: quotafile must be on "
- "filesystem root.\n");
- kfree(sbi->s_qf_names[qtype]);
- sbi->s_qf_names[qtype] = NULL;
+ break;
+ case Opt_grpjquota:
+ if (!set_qf_name(sb, GRPQUOTA, &args[0]))
return 0;
- }
- set_opt(sbi->s_mount_opt, QUOTA);
break;
case Opt_offusrjquota:
- qtype = USRQUOTA;
- goto clear_qf_name;
+ if (!clear_qf_name(sb, USRQUOTA))
+ return 0;
+ break;
case Opt_offgrpjquota:
- qtype = GRPQUOTA;
-clear_qf_name:
- if (sb_any_quota_enabled(sb)) {
- printk(KERN_ERR "EXT3-fs: Cannot change "
- "journalled quota options when "
- "quota turned on.\n");
+ if (!clear_qf_name(sb, GRPQUOTA))
return 0;
- }
- /*
- * The space will be released later when all options
- * are confirmed to be correct
- */
- sbi->s_qf_names[qtype] = NULL;
break;
case Opt_jqfmt_vfsold:
- sbi->s_jquota_fmt = QFMT_VFS_OLD;
- break;
+ qfmt = QFMT_VFS_OLD;
+ goto set_qf_format;
case Opt_jqfmt_vfsv0:
- sbi->s_jquota_fmt = QFMT_VFS_V0;
+ qfmt = QFMT_VFS_V0;
+ goto set_qf_format;
+ case Opt_jqfmt_vfsv1:
+ qfmt = QFMT_VFS_V1;
+set_qf_format:
+ if (sb_any_quota_loaded(sb) &&
+ sbi->s_jquota_fmt != qfmt) {
+ ext3_msg(sb, KERN_ERR, "error: cannot change "
+ "journaled quota options when "
+ "quota turned on.");
+ return 0;
+ }
+ sbi->s_jquota_fmt = qfmt;
break;
case Opt_quota:
case Opt_usrquota:
@@ -959,9 +1261,9 @@ clear_qf_name:
set_opt(sbi->s_mount_opt, GRPQUOTA);
break;
case Opt_noquota:
- if (sb_any_quota_enabled(sb)) {
- printk(KERN_ERR "EXT3-fs: Cannot change quota "
- "options when quota turned on.\n");
+ if (sb_any_quota_loaded(sb)) {
+ ext3_msg(sb, KERN_ERR, "error: cannot change "
+ "quota options when quota turned on.");
return 0;
}
clear_opt(sbi->s_mount_opt, QUOTA);
@@ -972,15 +1274,19 @@ clear_qf_name:
case Opt_quota:
case Opt_usrquota:
case Opt_grpquota:
+ ext3_msg(sb, KERN_ERR,
+ "error: quota options not supported.");
+ break;
case Opt_usrjquota:
case Opt_grpjquota:
case Opt_offusrjquota:
case Opt_offgrpjquota:
case Opt_jqfmt_vfsold:
case Opt_jqfmt_vfsv0:
- printk(KERN_ERR
- "EXT3-fs: journalled quota options not "
- "supported.\n");
+ case Opt_jqfmt_vfsv1:
+ ext3_msg(sb, KERN_ERR,
+ "error: journaled quota options not "
+ "supported.");
break;
case Opt_noquota:
break;
@@ -988,9 +1294,15 @@ clear_qf_name:
case Opt_abort:
set_opt(sbi->s_mount_opt, ABORT);
break;
+ case Opt_nobarrier:
+ clear_opt(sbi->s_mount_opt, BARRIER);
+ break;
case Opt_barrier:
- if (match_int(&args[0], &option))
- return 0;
+ if (args[0].from) {
+ if (match_int(&args[0], &option))
+ return 0;
+ } else
+ option = 1; /* No argument, default to 1 */
if (option)
set_opt(sbi->s_mount_opt, BARRIER);
else
@@ -1000,8 +1312,9 @@ clear_qf_name:
break;
case Opt_resize:
if (!is_remount) {
- printk("EXT3-fs: resize option only available "
- "for remount\n");
+ ext3_msg(sb, KERN_ERR,
+ "error: resize option only available "
+ "for remount");
return 0;
}
if (match_int(&args[0], &option) != 0)
@@ -1009,44 +1322,43 @@ clear_qf_name:
*n_blocks_count = option;
break;
case Opt_nobh:
- set_opt(sbi->s_mount_opt, NOBH);
+ ext3_msg(sb, KERN_WARNING,
+ "warning: ignoring deprecated nobh option");
+ break;
+ case Opt_bh:
+ ext3_msg(sb, KERN_WARNING,
+ "warning: ignoring deprecated bh option");
break;
default:
- printk (KERN_ERR
- "EXT3-fs: Unrecognized mount option \"%s\" "
- "or missing value\n", p);
+ ext3_msg(sb, KERN_ERR,
+ "error: unrecognized mount option \"%s\" "
+ "or missing value", p);
return 0;
}
}
#ifdef CONFIG_QUOTA
if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) {
- if ((sbi->s_mount_opt & EXT3_MOUNT_USRQUOTA) &&
- sbi->s_qf_names[USRQUOTA])
+ if (test_opt(sb, USRQUOTA) && sbi->s_qf_names[USRQUOTA])
clear_opt(sbi->s_mount_opt, USRQUOTA);
-
- if ((sbi->s_mount_opt & EXT3_MOUNT_GRPQUOTA) &&
- sbi->s_qf_names[GRPQUOTA])
+ if (test_opt(sb, GRPQUOTA) && sbi->s_qf_names[GRPQUOTA])
clear_opt(sbi->s_mount_opt, GRPQUOTA);
- if ((sbi->s_qf_names[USRQUOTA] &&
- (sbi->s_mount_opt & EXT3_MOUNT_GRPQUOTA)) ||
- (sbi->s_qf_names[GRPQUOTA] &&
- (sbi->s_mount_opt & EXT3_MOUNT_USRQUOTA))) {
- printk(KERN_ERR "EXT3-fs: old and new quota "
- "format mixing.\n");
+ if (test_opt(sb, GRPQUOTA) || test_opt(sb, USRQUOTA)) {
+ ext3_msg(sb, KERN_ERR, "error: old and new quota "
+ "format mixing.");
return 0;
}
if (!sbi->s_jquota_fmt) {
- printk(KERN_ERR "EXT3-fs: journalled quota format "
- "not specified.\n");
+ ext3_msg(sb, KERN_ERR, "error: journaled quota format "
+ "not specified.");
return 0;
}
} else {
if (sbi->s_jquota_fmt) {
- printk(KERN_ERR "EXT3-fs: journalled quota format "
- "specified with no journalling "
- "enabled.\n");
+ ext3_msg(sb, KERN_ERR, "error: journaled quota format "
+ "specified with no journaling "
+ "enabled.");
return 0;
}
}
@@ -1061,86 +1373,89 @@ static int ext3_setup_super(struct super_block *sb, struct ext3_super_block *es,
int res = 0;
if (le32_to_cpu(es->s_rev_level) > EXT3_MAX_SUPP_REV) {
- printk (KERN_ERR "EXT3-fs warning: revision level too high, "
- "forcing read-only mode\n");
+ ext3_msg(sb, KERN_ERR,
+ "error: revision level too high, "
+ "forcing read-only mode");
res = MS_RDONLY;
}
if (read_only)
return res;
if (!(sbi->s_mount_state & EXT3_VALID_FS))
- printk (KERN_WARNING "EXT3-fs warning: mounting unchecked fs, "
- "running e2fsck is recommended\n");
+ ext3_msg(sb, KERN_WARNING,
+ "warning: mounting unchecked fs, "
+ "running e2fsck is recommended");
else if ((sbi->s_mount_state & EXT3_ERROR_FS))
- printk (KERN_WARNING
- "EXT3-fs warning: mounting fs with errors, "
- "running e2fsck is recommended\n");
- else if ((__s16) le16_to_cpu(es->s_max_mnt_count) >= 0 &&
+ ext3_msg(sb, KERN_WARNING,
+ "warning: mounting fs with errors, "
+ "running e2fsck is recommended");
+ else if ((__s16) le16_to_cpu(es->s_max_mnt_count) > 0 &&
le16_to_cpu(es->s_mnt_count) >=
- (unsigned short) (__s16) le16_to_cpu(es->s_max_mnt_count))
- printk (KERN_WARNING
- "EXT3-fs warning: maximal mount count reached, "
- "running e2fsck is recommended\n");
+ le16_to_cpu(es->s_max_mnt_count))
+ ext3_msg(sb, KERN_WARNING,
+ "warning: maximal mount count reached, "
+ "running e2fsck is recommended");
else if (le32_to_cpu(es->s_checkinterval) &&
(le32_to_cpu(es->s_lastcheck) +
le32_to_cpu(es->s_checkinterval) <= get_seconds()))
- printk (KERN_WARNING
- "EXT3-fs warning: checktime reached, "
- "running e2fsck is recommended\n");
+ ext3_msg(sb, KERN_WARNING,
+ "warning: checktime reached, "
+ "running e2fsck is recommended");
#if 0
/* @@@ We _will_ want to clear the valid bit if we find
inconsistencies, to force a fsck at reboot. But for
a plain journaled filesystem we can keep it set as
valid forever! :) */
- es->s_state = cpu_to_le16(le16_to_cpu(es->s_state) & ~EXT3_VALID_FS);
+ es->s_state &= cpu_to_le16(~EXT3_VALID_FS);
#endif
- if (!(__s16) le16_to_cpu(es->s_max_mnt_count))
+ if (!le16_to_cpu(es->s_max_mnt_count))
es->s_max_mnt_count = cpu_to_le16(EXT3_DFL_MAX_MNT_COUNT);
- es->s_mnt_count=cpu_to_le16(le16_to_cpu(es->s_mnt_count) + 1);
+ le16_add_cpu(&es->s_mnt_count, 1);
es->s_mtime = cpu_to_le32(get_seconds());
ext3_update_dynamic_rev(sb);
EXT3_SET_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER);
ext3_commit_super(sb, es, 1);
if (test_opt(sb, DEBUG))
- printk(KERN_INFO "[EXT3 FS bs=%lu, gc=%lu, "
- "bpg=%lu, ipg=%lu, mo=%04lx]\n",
+ ext3_msg(sb, KERN_INFO, "[bs=%lu, gc=%lu, "
+ "bpg=%lu, ipg=%lu, mo=%04lx]",
sb->s_blocksize,
sbi->s_groups_count,
EXT3_BLOCKS_PER_GROUP(sb),
EXT3_INODES_PER_GROUP(sb),
sbi->s_mount_opt);
- printk(KERN_INFO "EXT3 FS on %s, ", sb->s_id);
if (EXT3_SB(sb)->s_journal->j_inode == NULL) {
char b[BDEVNAME_SIZE];
-
- printk("external journal on %s\n",
+ ext3_msg(sb, KERN_INFO, "using external journal on %s",
bdevname(EXT3_SB(sb)->s_journal->j_dev, b));
} else {
- printk("internal journal\n");
+ ext3_msg(sb, KERN_INFO, "using internal journal");
}
+ cleancache_init_fs(sb);
return res;
}
/* Called at mount-time, super-block is locked */
-static int ext3_check_descriptors (struct super_block * sb)
+static int ext3_check_descriptors(struct super_block *sb)
{
struct ext3_sb_info *sbi = EXT3_SB(sb);
- unsigned long block = le32_to_cpu(sbi->s_es->s_first_data_block);
- struct ext3_group_desc * gdp = NULL;
- int desc_block = 0;
int i;
ext3_debug ("Checking group descriptors");
- for (i = 0; i < sbi->s_groups_count; i++)
- {
- if ((i % EXT3_DESC_PER_BLOCK(sb)) == 0)
- gdp = (struct ext3_group_desc *)
- sbi->s_group_desc[desc_block++]->b_data;
- if (le32_to_cpu(gdp->bg_block_bitmap) < block ||
- le32_to_cpu(gdp->bg_block_bitmap) >=
- block + EXT3_BLOCKS_PER_GROUP(sb))
+ for (i = 0; i < sbi->s_groups_count; i++) {
+ struct ext3_group_desc *gdp = ext3_get_group_desc(sb, i, NULL);
+ ext3_fsblk_t first_block = ext3_group_first_block_no(sb, i);
+ ext3_fsblk_t last_block;
+
+ if (i == sbi->s_groups_count - 1)
+ last_block = le32_to_cpu(sbi->s_es->s_blocks_count) - 1;
+ else
+ last_block = first_block +
+ (EXT3_BLOCKS_PER_GROUP(sb) - 1);
+
+ if (le32_to_cpu(gdp->bg_block_bitmap) < first_block ||
+ le32_to_cpu(gdp->bg_block_bitmap) > last_block)
{
ext3_error (sb, "ext3_check_descriptors",
"Block bitmap for group %d"
@@ -1149,9 +1464,8 @@ static int ext3_check_descriptors (struct super_block * sb)
le32_to_cpu(gdp->bg_block_bitmap));
return 0;
}
- if (le32_to_cpu(gdp->bg_inode_bitmap) < block ||
- le32_to_cpu(gdp->bg_inode_bitmap) >=
- block + EXT3_BLOCKS_PER_GROUP(sb))
+ if (le32_to_cpu(gdp->bg_inode_bitmap) < first_block ||
+ le32_to_cpu(gdp->bg_inode_bitmap) > last_block)
{
ext3_error (sb, "ext3_check_descriptors",
"Inode bitmap for group %d"
@@ -1160,9 +1474,9 @@ static int ext3_check_descriptors (struct super_block * sb)
le32_to_cpu(gdp->bg_inode_bitmap));
return 0;
}
- if (le32_to_cpu(gdp->bg_inode_table) < block ||
- le32_to_cpu(gdp->bg_inode_table) + sbi->s_itb_per_group >=
- block + EXT3_BLOCKS_PER_GROUP(sb))
+ if (le32_to_cpu(gdp->bg_inode_table) < first_block ||
+ le32_to_cpu(gdp->bg_inode_table) + sbi->s_itb_per_group - 1 >
+ last_block)
{
ext3_error (sb, "ext3_check_descriptors",
"Inode table for group %d"
@@ -1171,8 +1485,6 @@ static int ext3_check_descriptors (struct super_block * sb)
le32_to_cpu(gdp->bg_inode_table));
return 0;
}
- block += EXT3_BLOCKS_PER_GROUP(sb);
- gdp++;
}
sbi->s_es->s_free_blocks_count=cpu_to_le32(ext3_count_free_blocks(sb));
@@ -1211,18 +1523,32 @@ static void ext3_orphan_cleanup (struct super_block * sb,
return;
}
+ if (bdev_read_only(sb->s_bdev)) {
+ ext3_msg(sb, KERN_ERR, "error: write access "
+ "unavailable, skipping orphan cleanup.");
+ return;
+ }
+
+ /* Check if feature set allows readwrite operations */
+ if (EXT3_HAS_RO_COMPAT_FEATURE(sb, ~EXT3_FEATURE_RO_COMPAT_SUPP)) {
+ ext3_msg(sb, KERN_INFO, "Skipping orphan cleanup due to "
+ "unknown ROCOMPAT features");
+ return;
+ }
+
if (EXT3_SB(sb)->s_mount_state & EXT3_ERROR_FS) {
- if (es->s_last_orphan)
+ /* don't clear list on RO mount w/ errors */
+ if (es->s_last_orphan && !(s_flags & MS_RDONLY)) {
jbd_debug(1, "Errors on filesystem, "
"clearing orphan list.\n");
- es->s_last_orphan = 0;
+ es->s_last_orphan = 0;
+ }
jbd_debug(1, "Skipping orphan recovery on fs with errors.\n");
return;
}
if (s_flags & MS_RDONLY) {
- printk(KERN_INFO "EXT3-fs: %s: orphan cleanup on readonly fs\n",
- sb->s_id);
+ ext3_msg(sb, KERN_INFO, "orphan cleanup on readonly fs");
sb->s_flags &= ~MS_RDONLY;
}
#ifdef CONFIG_QUOTA
@@ -1233,9 +1559,9 @@ static void ext3_orphan_cleanup (struct super_block * sb,
if (EXT3_SB(sb)->s_qf_names[i]) {
int ret = ext3_quota_on_mount(sb, i);
if (ret < 0)
- printk(KERN_ERR
- "EXT3-fs: Cannot turn on journalled "
- "quota: error %d\n", ret);
+ ext3_msg(sb, KERN_ERR,
+ "error: cannot turn on journaled "
+ "quota: %d", ret);
}
}
#endif
@@ -1243,27 +1569,27 @@ static void ext3_orphan_cleanup (struct super_block * sb,
while (es->s_last_orphan) {
struct inode *inode;
- if (!(inode =
- ext3_orphan_get(sb, le32_to_cpu(es->s_last_orphan)))) {
+ inode = ext3_orphan_get(sb, le32_to_cpu(es->s_last_orphan));
+ if (IS_ERR(inode)) {
es->s_last_orphan = 0;
break;
}
list_add(&EXT3_I(inode)->i_orphan, &EXT3_SB(sb)->s_orphan);
- DQUOT_INIT(inode);
+ dquot_initialize(inode);
if (inode->i_nlink) {
printk(KERN_DEBUG
- "%s: truncating inode %ld to %Ld bytes\n",
- __FUNCTION__, inode->i_ino, inode->i_size);
- jbd_debug(2, "truncating inode %ld to %Ld bytes\n",
+ "%s: truncating inode %lu to %Ld bytes\n",
+ __func__, inode->i_ino, inode->i_size);
+ jbd_debug(2, "truncating inode %lu to %Ld bytes\n",
inode->i_ino, inode->i_size);
ext3_truncate(inode);
nr_truncates++;
} else {
printk(KERN_DEBUG
- "%s: deleting unreferenced inode %ld\n",
- __FUNCTION__, inode->i_ino);
- jbd_debug(2, "deleting unreferenced inode %ld\n",
+ "%s: deleting unreferenced inode %lu\n",
+ __func__, inode->i_ino);
+ jbd_debug(2, "deleting unreferenced inode %lu\n",
inode->i_ino);
nr_orphans++;
}
@@ -1273,23 +1599,21 @@ static void ext3_orphan_cleanup (struct super_block * sb,
#define PLURAL(x) (x), ((x)==1) ? "" : "s"
if (nr_orphans)
- printk(KERN_INFO "EXT3-fs: %s: %d orphan inode%s deleted\n",
- sb->s_id, PLURAL(nr_orphans));
+ ext3_msg(sb, KERN_INFO, "%d orphan inode%s deleted",
+ PLURAL(nr_orphans));
if (nr_truncates)
- printk(KERN_INFO "EXT3-fs: %s: %d truncate%s cleaned up\n",
- sb->s_id, PLURAL(nr_truncates));
+ ext3_msg(sb, KERN_INFO, "%d truncate%s cleaned up",
+ PLURAL(nr_truncates));
#ifdef CONFIG_QUOTA
/* Turn quotas off */
for (i = 0; i < MAXQUOTAS; i++) {
if (sb_dqopt(sb)->files[i])
- vfs_quota_off(sb, i);
+ dquot_quota_off(sb, i);
}
#endif
sb->s_flags = s_flags; /* Restore MS_RDONLY status */
}
-#define log2(n) ffz(~(n))
-
/*
* Maximal file size. There is a direct, and {,double-,triple-}indirect
* block limit, and also a limit of (2^32 - 1) 512-byte sectors in i_blocks.
@@ -1298,11 +1622,31 @@ static void ext3_orphan_cleanup (struct super_block * sb,
static loff_t ext3_max_size(int bits)
{
loff_t res = EXT3_NDIR_BLOCKS;
- /* This constant is calculated to be the largest file size for a
- * dense, 4k-blocksize file such that the total number of
+ int meta_blocks;
+ loff_t upper_limit;
+
+ /* This is calculated to be the largest file size for a
+ * dense file such that the total number of
* sectors in the file, including data and all indirect blocks,
- * does not exceed 2^32. */
- const loff_t upper_limit = 0x1ff7fffd000LL;
+ * does not exceed 2^32 - 1:
+ * the __u32 i_blocks field represents the total number of
+ * 512-byte blocks of the file
+ */
+ upper_limit = (1LL << 32) - 1;
+
+ /* total blocks in file system block size */
+ upper_limit >>= (bits - 9);
+
+
+ /* indirect blocks */
+ meta_blocks = 1;
+ /* double indirect blocks */
+ meta_blocks += 1 + (1LL << (bits-2));
+ /* triple indirect blocks */
+ meta_blocks += 1 + (1LL << (bits-2)) + (1LL << (2*(bits-2)));
+
+ upper_limit -= meta_blocks;
+ upper_limit <<= bits;
res += 1LL << (bits-2);
res += 1LL << (2*(bits-2));
@@ -1310,18 +1654,21 @@ static loff_t ext3_max_size(int bits)
res <<= bits;
if (res > upper_limit)
res = upper_limit;
+
+ if (res > MAX_LFS_FILESIZE)
+ res = MAX_LFS_FILESIZE;
+
return res;
}
-static unsigned long descriptor_loc(struct super_block *sb,
- unsigned long logic_sb_block,
+static ext3_fsblk_t descriptor_loc(struct super_block *sb,
+ ext3_fsblk_t logic_sb_block,
int nr)
{
struct ext3_sb_info *sbi = EXT3_SB(sb);
- unsigned long bg, first_data_block, first_meta_bg;
+ unsigned long bg, first_meta_bg;
int has_super = 0;
- first_data_block = le32_to_cpu(sbi->s_es->s_first_data_block);
first_meta_bg = le32_to_cpu(sbi->s_es->s_first_meta_bg);
if (!EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_META_BG) ||
@@ -1330,7 +1677,7 @@ static unsigned long descriptor_loc(struct super_block *sb,
bg = sbi->s_desc_per_block * nr;
if (ext3_bg_has_super(sb, bg))
has_super = 1;
- return (first_data_block + has_super + (bg * sbi->s_blocks_per_group));
+ return (has_super + ext3_group_first_block_no(sb, bg));
}
@@ -1339,11 +1686,11 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
struct buffer_head * bh;
struct ext3_super_block *es = NULL;
struct ext3_sb_info *sbi;
- unsigned long block;
- unsigned long sb_block = get_sb_block(&data);
- unsigned long logic_sb_block;
+ ext3_fsblk_t block;
+ ext3_fsblk_t sb_block = get_sb_block(&data, sb);
+ ext3_fsblk_t logic_sb_block;
unsigned long offset = 0;
- unsigned long journal_inum = 0;
+ unsigned int journal_inum = 0;
unsigned long journal_devnum = 0;
unsigned long def_mount_opts;
struct inode *root;
@@ -1352,22 +1699,26 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
int db_count;
int i;
int needs_recovery;
+ int ret = -EINVAL;
__le32 features;
+ int err;
- sbi = kmalloc(sizeof(*sbi), GFP_KERNEL);
+ sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
if (!sbi)
return -ENOMEM;
- sb->s_fs_info = sbi;
- memset(sbi, 0, sizeof(*sbi));
- sbi->s_mount_opt = 0;
- sbi->s_resuid = EXT3_DEF_RESUID;
- sbi->s_resgid = EXT3_DEF_RESGID;
- unlock_kernel();
+ sbi->s_blockgroup_lock =
+ kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL);
+ if (!sbi->s_blockgroup_lock) {
+ kfree(sbi);
+ return -ENOMEM;
+ }
+ sb->s_fs_info = sbi;
+ sbi->s_sb_block = sb_block;
blocksize = sb_min_blocksize(sb, EXT3_MIN_BLOCK_SIZE);
if (!blocksize) {
- printk(KERN_ERR "EXT3-fs: unable to set blocksize\n");
+ ext3_msg(sb, KERN_ERR, "error: unable to set blocksize");
goto out_fail;
}
@@ -1383,14 +1734,14 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
}
if (!(bh = sb_bread(sb, logic_sb_block))) {
- printk (KERN_ERR "EXT3-fs: unable to read superblock\n");
+ ext3_msg(sb, KERN_ERR, "error: unable to read superblock");
goto out_fail;
}
/*
* Note: s_es must be initialized as soon as possible because
* some ext3 macro-instructions depend on its value
*/
- es = (struct ext3_super_block *) (((char *)bh->b_data) + offset);
+ es = (struct ext3_super_block *) (bh->b_data + offset);
sbi->s_es = es;
sb->s_magic = le16_to_cpu(es->s_magic);
if (sb->s_magic != EXT3_SUPER_MAGIC)
@@ -1404,25 +1755,33 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
set_opt(sbi->s_mount_opt, GRPID);
if (def_mount_opts & EXT3_DEFM_UID16)
set_opt(sbi->s_mount_opt, NO_UID32);
+#ifdef CONFIG_EXT3_FS_XATTR
if (def_mount_opts & EXT3_DEFM_XATTR_USER)
set_opt(sbi->s_mount_opt, XATTR_USER);
+#endif
+#ifdef CONFIG_EXT3_FS_POSIX_ACL
if (def_mount_opts & EXT3_DEFM_ACL)
set_opt(sbi->s_mount_opt, POSIX_ACL);
+#endif
if ((def_mount_opts & EXT3_DEFM_JMODE) == EXT3_DEFM_JMODE_DATA)
- sbi->s_mount_opt |= EXT3_MOUNT_JOURNAL_DATA;
+ set_opt(sbi->s_mount_opt, JOURNAL_DATA);
else if ((def_mount_opts & EXT3_DEFM_JMODE) == EXT3_DEFM_JMODE_ORDERED)
- sbi->s_mount_opt |= EXT3_MOUNT_ORDERED_DATA;
+ set_opt(sbi->s_mount_opt, ORDERED_DATA);
else if ((def_mount_opts & EXT3_DEFM_JMODE) == EXT3_DEFM_JMODE_WBACK)
- sbi->s_mount_opt |= EXT3_MOUNT_WRITEBACK_DATA;
+ set_opt(sbi->s_mount_opt, WRITEBACK_DATA);
if (le16_to_cpu(sbi->s_es->s_errors) == EXT3_ERRORS_PANIC)
set_opt(sbi->s_mount_opt, ERRORS_PANIC);
- else if (le16_to_cpu(sbi->s_es->s_errors) == EXT3_ERRORS_RO)
+ else if (le16_to_cpu(sbi->s_es->s_errors) == EXT3_ERRORS_CONTINUE)
+ set_opt(sbi->s_mount_opt, ERRORS_CONT);
+ else
set_opt(sbi->s_mount_opt, ERRORS_RO);
- sbi->s_resuid = le16_to_cpu(es->s_def_resuid);
- sbi->s_resgid = le16_to_cpu(es->s_def_resgid);
+ sbi->s_resuid = make_kuid(&init_user_ns, le16_to_cpu(es->s_def_resuid));
+ sbi->s_resgid = make_kgid(&init_user_ns, le16_to_cpu(es->s_def_resgid));
+ /* enable barriers by default */
+ set_opt(sbi->s_mount_opt, BARRIER);
set_opt(sbi->s_mount_opt, RESERVATION);
if (!parse_options ((char *) data, sb, &journal_inum, &journal_devnum,
@@ -1430,15 +1789,15 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
goto failed_mount;
sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
- ((sbi->s_mount_opt & EXT3_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0);
+ (test_opt(sb, POSIX_ACL) ? MS_POSIXACL : 0);
if (le32_to_cpu(es->s_rev_level) == EXT3_GOOD_OLD_REV &&
(EXT3_HAS_COMPAT_FEATURE(sb, ~0U) ||
EXT3_HAS_RO_COMPAT_FEATURE(sb, ~0U) ||
EXT3_HAS_INCOMPAT_FEATURE(sb, ~0U)))
- printk(KERN_WARNING
- "EXT3-fs warning: feature flags set on rev 0 fs, "
- "running e2fsck is recommended\n");
+ ext3_msg(sb, KERN_WARNING,
+ "warning: feature flags set on rev 0 fs, "
+ "running e2fsck is recommended");
/*
* Check feature flags regardless of the revision level, since we
* previously didn't change the revision level when setting the flags,
@@ -1446,55 +1805,60 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
*/
features = EXT3_HAS_INCOMPAT_FEATURE(sb, ~EXT3_FEATURE_INCOMPAT_SUPP);
if (features) {
- printk(KERN_ERR "EXT3-fs: %s: couldn't mount because of "
- "unsupported optional features (%x).\n",
- sb->s_id, le32_to_cpu(features));
+ ext3_msg(sb, KERN_ERR,
+ "error: couldn't mount because of unsupported "
+ "optional features (%x)", le32_to_cpu(features));
goto failed_mount;
}
features = EXT3_HAS_RO_COMPAT_FEATURE(sb, ~EXT3_FEATURE_RO_COMPAT_SUPP);
if (!(sb->s_flags & MS_RDONLY) && features) {
- printk(KERN_ERR "EXT3-fs: %s: couldn't mount RDWR because of "
- "unsupported optional features (%x).\n",
- sb->s_id, le32_to_cpu(features));
+ ext3_msg(sb, KERN_ERR,
+ "error: couldn't mount RDWR because of unsupported "
+ "optional features (%x)", le32_to_cpu(features));
goto failed_mount;
}
blocksize = BLOCK_SIZE << le32_to_cpu(es->s_log_block_size);
if (blocksize < EXT3_MIN_BLOCK_SIZE ||
blocksize > EXT3_MAX_BLOCK_SIZE) {
- printk(KERN_ERR
- "EXT3-fs: Unsupported filesystem blocksize %d on %s.\n",
- blocksize, sb->s_id);
+ ext3_msg(sb, KERN_ERR,
+ "error: couldn't mount because of unsupported "
+ "filesystem blocksize %d", blocksize);
goto failed_mount;
}
- hblock = bdev_hardsect_size(sb->s_bdev);
+ hblock = bdev_logical_block_size(sb->s_bdev);
if (sb->s_blocksize != blocksize) {
/*
* Make sure the blocksize for the filesystem is larger
* than the hardware sectorsize for the machine.
*/
if (blocksize < hblock) {
- printk(KERN_ERR "EXT3-fs: blocksize %d too small for "
- "device blocksize %d.\n", blocksize, hblock);
+ ext3_msg(sb, KERN_ERR,
+ "error: fsblocksize %d too small for "
+ "hardware sectorsize %d", blocksize, hblock);
goto failed_mount;
}
brelse (bh);
- sb_set_blocksize(sb, blocksize);
+ if (!sb_set_blocksize(sb, blocksize)) {
+ ext3_msg(sb, KERN_ERR,
+ "error: bad blocksize %d", blocksize);
+ goto out_fail;
+ }
logic_sb_block = (sb_block * EXT3_MIN_BLOCK_SIZE) / blocksize;
offset = (sb_block * EXT3_MIN_BLOCK_SIZE) % blocksize;
bh = sb_bread(sb, logic_sb_block);
if (!bh) {
- printk(KERN_ERR
- "EXT3-fs: Can't read superblock on 2nd try.\n");
+ ext3_msg(sb, KERN_ERR,
+ "error: can't read superblock on 2nd try");
goto failed_mount;
}
- es = (struct ext3_super_block *)(((char *)bh->b_data) + offset);
+ es = (struct ext3_super_block *)(bh->b_data + offset);
sbi->s_es = es;
if (es->s_magic != cpu_to_le16(EXT3_SUPER_MAGIC)) {
- printk (KERN_ERR
- "EXT3-fs: Magic mismatch, very weird !\n");
+ ext3_msg(sb, KERN_ERR,
+ "error: magic mismatch");
goto failed_mount;
}
}
@@ -1508,10 +1872,10 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
sbi->s_inode_size = le16_to_cpu(es->s_inode_size);
sbi->s_first_ino = le32_to_cpu(es->s_first_ino);
if ((sbi->s_inode_size < EXT3_GOOD_OLD_INODE_SIZE) ||
- (sbi->s_inode_size & (sbi->s_inode_size - 1)) ||
+ (!is_power_of_2(sbi->s_inode_size)) ||
(sbi->s_inode_size > blocksize)) {
- printk (KERN_ERR
- "EXT3-fs: unsupported inode size: %d\n",
+ ext3_msg(sb, KERN_ERR,
+ "error: unsupported inode size: %d",
sbi->s_inode_size);
goto failed_mount;
}
@@ -1519,8 +1883,8 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
sbi->s_frag_size = EXT3_MIN_FRAG_SIZE <<
le32_to_cpu(es->s_log_frag_size);
if (blocksize != sbi->s_frag_size) {
- printk(KERN_ERR
- "EXT3-fs: fragsize %lu != blocksize %u (unsupported)\n",
+ ext3_msg(sb, KERN_ERR,
+ "error: fragsize %lu != blocksize %u (unsupported)",
sbi->s_frag_size, blocksize);
goto failed_mount;
}
@@ -1528,7 +1892,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
sbi->s_blocks_per_group = le32_to_cpu(es->s_blocks_per_group);
sbi->s_frags_per_group = le32_to_cpu(es->s_frags_per_group);
sbi->s_inodes_per_group = le32_to_cpu(es->s_inodes_per_group);
- if (EXT3_INODE_SIZE(sb) == 0)
+ if (EXT3_INODE_SIZE(sb) == 0 || EXT3_INODES_PER_GROUP(sb) == 0)
goto cantfind_ext3;
sbi->s_inodes_per_block = blocksize / EXT3_INODE_SIZE(sb);
if (sbi->s_inodes_per_block == 0)
@@ -1538,68 +1902,90 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
sbi->s_desc_per_block = blocksize / sizeof(struct ext3_group_desc);
sbi->s_sbh = bh;
sbi->s_mount_state = le16_to_cpu(es->s_state);
- sbi->s_addr_per_block_bits = log2(EXT3_ADDR_PER_BLOCK(sb));
- sbi->s_desc_per_block_bits = log2(EXT3_DESC_PER_BLOCK(sb));
+ sbi->s_addr_per_block_bits = ilog2(EXT3_ADDR_PER_BLOCK(sb));
+ sbi->s_desc_per_block_bits = ilog2(EXT3_DESC_PER_BLOCK(sb));
for (i=0; i < 4; i++)
sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]);
sbi->s_def_hash_version = es->s_def_hash_version;
+ i = le32_to_cpu(es->s_flags);
+ if (i & EXT2_FLAGS_UNSIGNED_HASH)
+ sbi->s_hash_unsigned = 3;
+ else if ((i & EXT2_FLAGS_SIGNED_HASH) == 0) {
+#ifdef __CHAR_UNSIGNED__
+ es->s_flags |= cpu_to_le32(EXT2_FLAGS_UNSIGNED_HASH);
+ sbi->s_hash_unsigned = 3;
+#else
+ es->s_flags |= cpu_to_le32(EXT2_FLAGS_SIGNED_HASH);
+#endif
+ }
if (sbi->s_blocks_per_group > blocksize * 8) {
- printk (KERN_ERR
- "EXT3-fs: #blocks per group too big: %lu\n",
+ ext3_msg(sb, KERN_ERR,
+ "#blocks per group too big: %lu",
sbi->s_blocks_per_group);
goto failed_mount;
}
if (sbi->s_frags_per_group > blocksize * 8) {
- printk (KERN_ERR
- "EXT3-fs: #fragments per group too big: %lu\n",
+ ext3_msg(sb, KERN_ERR,
+ "error: #fragments per group too big: %lu",
sbi->s_frags_per_group);
goto failed_mount;
}
if (sbi->s_inodes_per_group > blocksize * 8) {
- printk (KERN_ERR
- "EXT3-fs: #inodes per group too big: %lu\n",
+ ext3_msg(sb, KERN_ERR,
+ "error: #inodes per group too big: %lu",
sbi->s_inodes_per_group);
goto failed_mount;
}
+ err = generic_check_addressable(sb->s_blocksize_bits,
+ le32_to_cpu(es->s_blocks_count));
+ if (err) {
+ ext3_msg(sb, KERN_ERR,
+ "error: filesystem is too large to mount safely");
+ if (sizeof(sector_t) < 8)
+ ext3_msg(sb, KERN_ERR,
+ "error: CONFIG_LBDAF not enabled");
+ ret = err;
+ goto failed_mount;
+ }
+
if (EXT3_BLOCKS_PER_GROUP(sb) == 0)
goto cantfind_ext3;
- sbi->s_groups_count = (le32_to_cpu(es->s_blocks_count) -
- le32_to_cpu(es->s_first_data_block) +
- EXT3_BLOCKS_PER_GROUP(sb) - 1) /
- EXT3_BLOCKS_PER_GROUP(sb);
- db_count = (sbi->s_groups_count + EXT3_DESC_PER_BLOCK(sb) - 1) /
- EXT3_DESC_PER_BLOCK(sb);
+ sbi->s_groups_count = ((le32_to_cpu(es->s_blocks_count) -
+ le32_to_cpu(es->s_first_data_block) - 1)
+ / EXT3_BLOCKS_PER_GROUP(sb)) + 1;
+ db_count = DIV_ROUND_UP(sbi->s_groups_count, EXT3_DESC_PER_BLOCK(sb));
sbi->s_group_desc = kmalloc(db_count * sizeof (struct buffer_head *),
GFP_KERNEL);
if (sbi->s_group_desc == NULL) {
- printk (KERN_ERR "EXT3-fs: not enough memory\n");
+ ext3_msg(sb, KERN_ERR,
+ "error: not enough memory");
+ ret = -ENOMEM;
goto failed_mount;
}
- percpu_counter_init(&sbi->s_freeblocks_counter);
- percpu_counter_init(&sbi->s_freeinodes_counter);
- percpu_counter_init(&sbi->s_dirs_counter);
- bgl_lock_init(&sbi->s_blockgroup_lock);
+ bgl_lock_init(sbi->s_blockgroup_lock);
for (i = 0; i < db_count; i++) {
block = descriptor_loc(sb, logic_sb_block, i);
sbi->s_group_desc[i] = sb_bread(sb, block);
if (!sbi->s_group_desc[i]) {
- printk (KERN_ERR "EXT3-fs: "
- "can't read group descriptor %d\n", i);
+ ext3_msg(sb, KERN_ERR,
+ "error: can't read group descriptor %d", i);
db_count = i;
goto failed_mount2;
}
}
if (!ext3_check_descriptors (sb)) {
- printk (KERN_ERR "EXT3-fs: group descriptors corrupted !\n");
+ ext3_msg(sb, KERN_ERR,
+ "error: group descriptors corrupted");
goto failed_mount2;
}
sbi->s_gdb_count = db_count;
get_random_bytes(&sbi->s_next_generation, sizeof(u32));
spin_lock_init(&sbi->s_next_gen_lock);
+
/* per fileystem reservation list head & lock */
spin_lock_init(&sbi->s_rsv_window_lock);
sbi->s_rsv_window_root = RB_ROOT;
@@ -1623,7 +2009,10 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
sb->s_qcop = &ext3_qctl_operations;
sb->dq_op = &ext3_quota_operations;
#endif
+ memcpy(sb->s_uuid, es->s_uuid, sizeof(es->s_uuid));
INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */
+ mutex_init(&sbi->s_orphan_lock);
+ mutex_init(&sbi->s_resize_lock);
sb->s_root = NULL;
@@ -1644,11 +2033,26 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
goto failed_mount2;
} else {
if (!silent)
- printk (KERN_ERR
- "ext3: No journal on filesystem on %s\n",
- sb->s_id);
+ ext3_msg(sb, KERN_ERR,
+ "error: no journal found. "
+ "mounting ext3 over ext2?");
goto failed_mount2;
}
+ err = percpu_counter_init(&sbi->s_freeblocks_counter,
+ ext3_count_free_blocks(sb));
+ if (!err) {
+ err = percpu_counter_init(&sbi->s_freeinodes_counter,
+ ext3_count_free_inodes(sb));
+ }
+ if (!err) {
+ err = percpu_counter_init(&sbi->s_dirs_counter,
+ ext3_count_dirs(sb));
+ }
+ if (err) {
+ ext3_msg(sb, KERN_ERR, "error: insufficient memory");
+ ret = err;
+ goto failed_mount3;
+ }
/* We have now updated the journal if required, so we can
* validate the data journaling mode. */
@@ -1659,7 +2063,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
cope, else JOURNAL_DATA */
if (journal_check_available_features
(sbi->s_journal, 0, 0, JFS_FEATURE_INCOMPAT_REVOKE))
- set_opt(sbi->s_mount_opt, ORDERED_DATA);
+ set_opt(sbi->s_mount_opt, DEFAULT_DATA_MODE);
else
set_opt(sbi->s_mount_opt, JOURNAL_DATA);
break;
@@ -1668,83 +2072,66 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
case EXT3_MOUNT_WRITEBACK_DATA:
if (!journal_check_available_features
(sbi->s_journal, 0, 0, JFS_FEATURE_INCOMPAT_REVOKE)) {
- printk(KERN_ERR "EXT3-fs: Journal does not support "
- "requested data journaling mode\n");
+ ext3_msg(sb, KERN_ERR,
+ "error: journal does not support "
+ "requested data journaling mode");
goto failed_mount3;
}
default:
break;
}
- if (test_opt(sb, NOBH)) {
- if (sb->s_blocksize_bits != PAGE_CACHE_SHIFT) {
- printk(KERN_WARNING "EXT3-fs: Ignoring nobh option "
- "since filesystem blocksize doesn't match "
- "pagesize\n");
- clear_opt(sbi->s_mount_opt, NOBH);
- }
- if (!(test_opt(sb, DATA_FLAGS) == EXT3_MOUNT_WRITEBACK_DATA)) {
- printk(KERN_WARNING "EXT3-fs: Ignoring nobh option - "
- "its supported only with writeback mode\n");
- clear_opt(sbi->s_mount_opt, NOBH);
- }
- }
/*
* The journal_load will have done any necessary log recovery,
* so we can safely mount the rest of the filesystem now.
*/
- root = iget(sb, EXT3_ROOT_INO);
- sb->s_root = d_alloc_root(root);
- if (!sb->s_root) {
- printk(KERN_ERR "EXT3-fs: get root inode failed\n");
- iput(root);
+ root = ext3_iget(sb, EXT3_ROOT_INO);
+ if (IS_ERR(root)) {
+ ext3_msg(sb, KERN_ERR, "error: get root inode failed");
+ ret = PTR_ERR(root);
goto failed_mount3;
}
if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) {
- dput(sb->s_root);
- sb->s_root = NULL;
- printk(KERN_ERR "EXT3-fs: corrupt root inode, run e2fsck\n");
+ iput(root);
+ ext3_msg(sb, KERN_ERR, "error: corrupt root inode, run e2fsck");
+ goto failed_mount3;
+ }
+ sb->s_root = d_make_root(root);
+ if (!sb->s_root) {
+ ext3_msg(sb, KERN_ERR, "error: get root dentry failed");
+ ret = -ENOMEM;
goto failed_mount3;
}
- ext3_setup_super (sb, es, sb->s_flags & MS_RDONLY);
- /*
- * akpm: core read_super() calls in here with the superblock locked.
- * That deadlocks, because orphan cleanup needs to lock the superblock
- * in numerous places. Here we just pop the lock - it's relatively
- * harmless, because we are now ready to accept write_super() requests,
- * and aviro says that's the only reason for hanging onto the
- * superblock lock.
- */
+ if (ext3_setup_super(sb, es, sb->s_flags & MS_RDONLY))
+ sb->s_flags |= MS_RDONLY;
+
EXT3_SB(sb)->s_mount_state |= EXT3_ORPHAN_FS;
ext3_orphan_cleanup(sb, es);
EXT3_SB(sb)->s_mount_state &= ~EXT3_ORPHAN_FS;
- if (needs_recovery)
- printk (KERN_INFO "EXT3-fs: recovery complete.\n");
- ext3_mark_recovery_complete(sb, es);
- printk (KERN_INFO "EXT3-fs: mounted filesystem with %s data mode.\n",
+ if (needs_recovery) {
+ ext3_mark_recovery_complete(sb, es);
+ ext3_msg(sb, KERN_INFO, "recovery complete");
+ }
+ ext3_msg(sb, KERN_INFO, "mounted filesystem with %s data mode",
test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_JOURNAL_DATA ? "journal":
test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_ORDERED_DATA ? "ordered":
"writeback");
- percpu_counter_mod(&sbi->s_freeblocks_counter,
- ext3_count_free_blocks(sb));
- percpu_counter_mod(&sbi->s_freeinodes_counter,
- ext3_count_free_inodes(sb));
- percpu_counter_mod(&sbi->s_dirs_counter,
- ext3_count_dirs(sb));
-
- lock_kernel();
return 0;
cantfind_ext3:
if (!silent)
- printk(KERN_ERR "VFS: Can't find ext3 filesystem on dev %s.\n",
+ ext3_msg(sb, KERN_INFO,
+ "error: can't find ext3 filesystem on dev %s.",
sb->s_id);
goto failed_mount;
failed_mount3:
+ percpu_counter_destroy(&sbi->s_freeblocks_counter);
+ percpu_counter_destroy(&sbi->s_freeinodes_counter);
+ percpu_counter_destroy(&sbi->s_dirs_counter);
journal_destroy(sbi->s_journal);
failed_mount2:
for (i = 0; i < db_count; i++)
@@ -1759,15 +2146,15 @@ failed_mount:
brelse(bh);
out_fail:
sb->s_fs_info = NULL;
+ kfree(sbi->s_blockgroup_lock);
kfree(sbi);
- lock_kernel();
- return -EINVAL;
+ return ret;
}
/*
* Setup any per-fs journal parameters now. We'll do this both on
* initial mount, once the journal has been initialised but before we've
- * done any recovery; and again on any subsequent remount.
+ * done any recovery; and again on any subsequent remount.
*/
static void ext3_init_journal_params(struct super_block *sb, journal_t *journal)
{
@@ -1784,10 +2171,15 @@ static void ext3_init_journal_params(struct super_block *sb, journal_t *journal)
journal->j_flags |= JFS_BARRIER;
else
journal->j_flags &= ~JFS_BARRIER;
+ if (test_opt(sb, DATA_ERR_ABORT))
+ journal->j_flags |= JFS_ABORT_ON_SYNCDATA_ERR;
+ else
+ journal->j_flags &= ~JFS_ABORT_ON_SYNCDATA_ERR;
spin_unlock(&journal->j_state_lock);
}
-static journal_t *ext3_get_journal(struct super_block *sb, int journal_inum)
+static journal_t *ext3_get_journal(struct super_block *sb,
+ unsigned int journal_inum)
{
struct inode *journal_inode;
journal_t *journal;
@@ -1796,29 +2188,29 @@ static journal_t *ext3_get_journal(struct super_block *sb, int journal_inum)
* things happen if we iget() an unused inode, as the subsequent
* iput() will try to delete it. */
- journal_inode = iget(sb, journal_inum);
- if (!journal_inode) {
- printk(KERN_ERR "EXT3-fs: no journal found.\n");
+ journal_inode = ext3_iget(sb, journal_inum);
+ if (IS_ERR(journal_inode)) {
+ ext3_msg(sb, KERN_ERR, "error: no journal found");
return NULL;
}
if (!journal_inode->i_nlink) {
make_bad_inode(journal_inode);
iput(journal_inode);
- printk(KERN_ERR "EXT3-fs: journal inode is deleted.\n");
+ ext3_msg(sb, KERN_ERR, "error: journal inode is deleted");
return NULL;
}
jbd_debug(2, "Journal inode found at %p: %Ld bytes\n",
journal_inode, journal_inode->i_size);
- if (is_bad_inode(journal_inode) || !S_ISREG(journal_inode->i_mode)) {
- printk(KERN_ERR "EXT3-fs: invalid journal inode.\n");
+ if (!S_ISREG(journal_inode->i_mode)) {
+ ext3_msg(sb, KERN_ERR, "error: invalid journal inode");
iput(journal_inode);
return NULL;
}
journal = journal_init_inode(journal_inode);
if (!journal) {
- printk(KERN_ERR "EXT3-fs: Could not load journal inode\n");
+ ext3_msg(sb, KERN_ERR, "error: could not load journal inode");
iput(journal_inode);
return NULL;
}
@@ -1832,30 +2224,23 @@ static journal_t *ext3_get_dev_journal(struct super_block *sb,
{
struct buffer_head * bh;
journal_t *journal;
- int start;
- int len;
+ ext3_fsblk_t start;
+ ext3_fsblk_t len;
int hblock, blocksize;
- unsigned long sb_block;
+ ext3_fsblk_t sb_block;
unsigned long offset;
struct ext3_super_block * es;
struct block_device *bdev;
- bdev = ext3_blkdev_get(j_dev);
+ bdev = ext3_blkdev_get(j_dev, sb);
if (bdev == NULL)
return NULL;
- if (bd_claim(bdev, sb)) {
- printk(KERN_ERR
- "EXT3: failed to claim external journal device.\n");
- blkdev_put(bdev);
- return NULL;
- }
-
blocksize = sb->s_blocksize;
- hblock = bdev_hardsect_size(bdev);
+ hblock = bdev_logical_block_size(bdev);
if (blocksize < hblock) {
- printk(KERN_ERR
- "EXT3-fs: blocksize too small for journal device.\n");
+ ext3_msg(sb, KERN_ERR,
+ "error: blocksize too small for journal device");
goto out_bdev;
}
@@ -1863,23 +2248,23 @@ static journal_t *ext3_get_dev_journal(struct super_block *sb,
offset = EXT3_MIN_BLOCK_SIZE % blocksize;
set_blocksize(bdev, blocksize);
if (!(bh = __bread(bdev, sb_block, blocksize))) {
- printk(KERN_ERR "EXT3-fs: couldn't read superblock of "
- "external journal\n");
+ ext3_msg(sb, KERN_ERR, "error: couldn't read superblock of "
+ "external journal");
goto out_bdev;
}
- es = (struct ext3_super_block *) (((char *)bh->b_data) + offset);
+ es = (struct ext3_super_block *) (bh->b_data + offset);
if ((le16_to_cpu(es->s_magic) != EXT3_SUPER_MAGIC) ||
!(le32_to_cpu(es->s_feature_incompat) &
EXT3_FEATURE_INCOMPAT_JOURNAL_DEV)) {
- printk(KERN_ERR "EXT3-fs: external journal has "
- "bad superblock\n");
+ ext3_msg(sb, KERN_ERR, "error: external journal has "
+ "bad superblock");
brelse(bh);
goto out_bdev;
}
if (memcmp(EXT3_SB(sb)->s_es->s_journal_uuid, es->s_uuid, 16)) {
- printk(KERN_ERR "EXT3-fs: journal UUID does not match\n");
+ ext3_msg(sb, KERN_ERR, "error: journal UUID does not match");
brelse(bh);
goto out_bdev;
}
@@ -1891,19 +2276,21 @@ static journal_t *ext3_get_dev_journal(struct super_block *sb,
journal = journal_init_dev(bdev, sb->s_bdev,
start, len, blocksize);
if (!journal) {
- printk(KERN_ERR "EXT3-fs: failed to create device journal\n");
+ ext3_msg(sb, KERN_ERR,
+ "error: failed to create device journal");
goto out_bdev;
}
journal->j_private = sb;
- ll_rw_block(READ, 1, &journal->j_sb_buffer);
- wait_on_buffer(journal->j_sb_buffer);
- if (!buffer_uptodate(journal->j_sb_buffer)) {
- printk(KERN_ERR "EXT3-fs: I/O error on journal device\n");
- goto out_journal;
+ if (!bh_uptodate_or_lock(journal->j_sb_buffer)) {
+ if (bh_submit_read(journal->j_sb_buffer)) {
+ ext3_msg(sb, KERN_ERR, "I/O error on journal device");
+ goto out_journal;
+ }
}
if (be32_to_cpu(journal->j_superblock->s_nr_users) != 1) {
- printk(KERN_ERR "EXT3-fs: External journal has more than one "
- "user (unsupported) - %d\n",
+ ext3_msg(sb, KERN_ERR,
+ "error: external journal has more than one "
+ "user (unsupported) - %d",
be32_to_cpu(journal->j_superblock->s_nr_users));
goto out_journal;
}
@@ -1922,15 +2309,15 @@ static int ext3_load_journal(struct super_block *sb,
unsigned long journal_devnum)
{
journal_t *journal;
- int journal_inum = le32_to_cpu(es->s_journal_inum);
+ unsigned int journal_inum = le32_to_cpu(es->s_journal_inum);
dev_t journal_dev;
int err = 0;
int really_read_only;
if (journal_devnum &&
journal_devnum != le32_to_cpu(es->s_journal_dev)) {
- printk(KERN_INFO "EXT3-fs: external journal device major/minor "
- "numbers have changed\n");
+ ext3_msg(sb, KERN_INFO, "external journal device major/minor "
+ "numbers have changed");
journal_dev = new_decode_dev(journal_devnum);
} else
journal_dev = new_decode_dev(le32_to_cpu(es->s_journal_dev));
@@ -1945,21 +2332,21 @@ static int ext3_load_journal(struct super_block *sb,
if (EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER)) {
if (sb->s_flags & MS_RDONLY) {
- printk(KERN_INFO "EXT3-fs: INFO: recovery "
- "required on readonly filesystem.\n");
+ ext3_msg(sb, KERN_INFO,
+ "recovery required on readonly filesystem");
if (really_read_only) {
- printk(KERN_ERR "EXT3-fs: write access "
- "unavailable, cannot proceed.\n");
+ ext3_msg(sb, KERN_ERR, "error: write access "
+ "unavailable, cannot proceed");
return -EROFS;
}
- printk (KERN_INFO "EXT3-fs: write access will "
- "be enabled during recovery.\n");
+ ext3_msg(sb, KERN_INFO,
+ "write access will be enabled during recovery");
}
}
if (journal_inum && journal_dev) {
- printk(KERN_ERR "EXT3-fs: filesystem has both journal "
- "and inode journals!\n");
+ ext3_msg(sb, KERN_ERR, "error: filesystem has both journal "
+ "and inode journals");
return -EINVAL;
}
@@ -1971,10 +2358,13 @@ static int ext3_load_journal(struct super_block *sb,
return -EINVAL;
}
+ if (!(journal->j_flags & JFS_BARRIER))
+ printk(KERN_INFO "EXT3-fs: barriers not enabled\n");
+
if (!really_read_only && test_opt(sb, UPDATE_JOURNAL)) {
err = journal_update_format(journal);
if (err) {
- printk(KERN_ERR "EXT3-fs: error updating journal.\n");
+ ext3_msg(sb, KERN_ERR, "error updating journal");
journal_destroy(journal);
return err;
}
@@ -1986,7 +2376,7 @@ static int ext3_load_journal(struct super_block *sb,
err = journal_load(journal);
if (err) {
- printk(KERN_ERR "EXT3-fs: error loading journal.\n");
+ ext3_msg(sb, KERN_ERR, "error loading journal");
journal_destroy(journal);
return err;
}
@@ -1994,10 +2384,9 @@ static int ext3_load_journal(struct super_block *sb,
EXT3_SB(sb)->s_journal = journal;
ext3_clear_journal_err(sb, es);
- if (journal_devnum &&
+ if (!really_read_only && journal_devnum &&
journal_devnum != le32_to_cpu(es->s_journal_dev)) {
es->s_journal_dev = cpu_to_le32(journal_devnum);
- sb->s_dirt = 1;
/* Make sure we flush the recovery flag to disk. */
ext3_commit_super(sb, es, 1);
@@ -2006,26 +2395,30 @@ static int ext3_load_journal(struct super_block *sb,
return 0;
}
-static int ext3_create_journal(struct super_block * sb,
- struct ext3_super_block * es,
- int journal_inum)
+static int ext3_create_journal(struct super_block *sb,
+ struct ext3_super_block *es,
+ unsigned int journal_inum)
{
journal_t *journal;
+ int err;
if (sb->s_flags & MS_RDONLY) {
- printk(KERN_ERR "EXT3-fs: readonly filesystem when trying to "
- "create journal.\n");
+ ext3_msg(sb, KERN_ERR,
+ "error: readonly filesystem when trying to "
+ "create journal");
return -EROFS;
}
- if (!(journal = ext3_get_journal(sb, journal_inum)))
+ journal = ext3_get_journal(sb, journal_inum);
+ if (!journal)
return -EINVAL;
- printk(KERN_INFO "EXT3-fs: creating new journal on inode %d\n",
+ ext3_msg(sb, KERN_INFO, "creating new journal on inode %u",
journal_inum);
- if (journal_create(journal)) {
- printk(KERN_ERR "EXT3-fs: error creating journal.\n");
+ err = journal_create(journal);
+ if (err) {
+ ext3_msg(sb, KERN_ERR, "error creating journal");
journal_destroy(journal);
return -EIO;
}
@@ -2037,7 +2430,6 @@ static int ext3_create_journal(struct super_block * sb,
EXT3_SET_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_HAS_JOURNAL);
es->s_journal_inum = cpu_to_le32(journal_inum);
- sb->s_dirt = 1;
/* Make sure we flush the recovery flag to disk. */
ext3_commit_super(sb, es, 1);
@@ -2045,21 +2437,56 @@ static int ext3_create_journal(struct super_block * sb,
return 0;
}
-static void ext3_commit_super (struct super_block * sb,
- struct ext3_super_block * es,
+static int ext3_commit_super(struct super_block *sb,
+ struct ext3_super_block *es,
int sync)
{
struct buffer_head *sbh = EXT3_SB(sb)->s_sbh;
+ int error = 0;
if (!sbh)
- return;
- es->s_wtime = cpu_to_le32(get_seconds());
+ return error;
+
+ if (buffer_write_io_error(sbh)) {
+ /*
+ * Oh, dear. A previous attempt to write the
+ * superblock failed. This could happen because the
+ * USB device was yanked out. Or it could happen to
+ * be a transient write error and maybe the block will
+ * be remapped. Nothing we can do but to retry the
+ * write and hope for the best.
+ */
+ ext3_msg(sb, KERN_ERR, "previous I/O error to "
+ "superblock detected");
+ clear_buffer_write_io_error(sbh);
+ set_buffer_uptodate(sbh);
+ }
+ /*
+ * If the file system is mounted read-only, don't update the
+ * superblock write time. This avoids updating the superblock
+ * write time when we are mounting the root file system
+ * read/only but we need to replay the journal; at that point,
+ * for people who are east of GMT and who make their clock
+ * tick in localtime for Windows bug-for-bug compatibility,
+ * the clock is set in the future, and this will cause e2fsck
+ * to complain and force a full file system check.
+ */
+ if (!(sb->s_flags & MS_RDONLY))
+ es->s_wtime = cpu_to_le32(get_seconds());
es->s_free_blocks_count = cpu_to_le32(ext3_count_free_blocks(sb));
es->s_free_inodes_count = cpu_to_le32(ext3_count_free_inodes(sb));
BUFFER_TRACE(sbh, "marking dirty");
mark_buffer_dirty(sbh);
- if (sync)
- sync_dirty_buffer(sbh);
+ if (sync) {
+ error = sync_dirty_buffer(sbh);
+ if (buffer_write_io_error(sbh)) {
+ ext3_msg(sb, KERN_ERR, "I/O error while writing "
+ "superblock");
+ clear_buffer_write_io_error(sbh);
+ set_buffer_uptodate(sbh);
+ }
+ }
+ return error;
}
@@ -2074,13 +2501,16 @@ static void ext3_mark_recovery_complete(struct super_block * sb,
journal_t *journal = EXT3_SB(sb)->s_journal;
journal_lock_updates(journal);
- journal_flush(journal);
+ if (journal_flush(journal) < 0)
+ goto out;
+
if (EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER) &&
sb->s_flags & MS_RDONLY) {
EXT3_CLEAR_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER);
- sb->s_dirt = 0;
ext3_commit_super(sb, es, 1);
}
+
+out:
journal_unlock_updates(journal);
}
@@ -2089,8 +2519,8 @@ static void ext3_mark_recovery_complete(struct super_block * sb,
* has recorded an error from a previous lifetime, move that error to the
* main filesystem now.
*/
-static void ext3_clear_journal_err(struct super_block * sb,
- struct ext3_super_block * es)
+static void ext3_clear_journal_err(struct super_block *sb,
+ struct ext3_super_block *es)
{
journal_t *journal;
int j_errno;
@@ -2108,9 +2538,9 @@ static void ext3_clear_journal_err(struct super_block * sb,
char nbuf[16];
errstr = ext3_decode_error(sb, j_errno, nbuf);
- ext3_warning(sb, __FUNCTION__, "Filesystem error recorded "
+ ext3_warning(sb, __func__, "Filesystem error recorded "
"from previous mount: %s", errstr);
- ext3_warning(sb, __FUNCTION__, "Marking fs in need of "
+ ext3_warning(sb, __func__, "Marking fs in need of "
"filesystem check.");
EXT3_SB(sb)->s_mount_state |= EXT3_ERROR_FS;
@@ -2134,32 +2564,20 @@ int ext3_force_commit(struct super_block *sb)
return 0;
journal = EXT3_SB(sb)->s_journal;
- sb->s_dirt = 0;
ret = ext3_journal_force_commit(journal);
return ret;
}
-/*
- * Ext3 always journals updates to the superblock itself, so we don't
- * have to propagate any other updates to the superblock on disk at this
- * point. Just start an async writeback to get the buffers on their way
- * to the disk.
- *
- * This implicitly triggers the writebehind on sync().
- */
-
-static void ext3_write_super (struct super_block * sb)
-{
- if (mutex_trylock(&sb->s_lock) != 0)
- BUG();
- sb->s_dirt = 0;
-}
-
static int ext3_sync_fs(struct super_block *sb, int wait)
{
tid_t target;
- sb->s_dirt = 0;
+ trace_ext3_sync_fs(sb, wait);
+ /*
+ * Writeback quota in non-journalled quota case - journalled quota has
+ * no dirty dquots
+ */
+ dquot_writeback_dquots(sb, -1);
if (journal_start_commit(EXT3_SB(sb)->s_journal, &target)) {
if (wait)
log_wait_commit(EXT3_SB(sb)->s_journal, target);
@@ -2171,51 +2589,68 @@ static int ext3_sync_fs(struct super_block *sb, int wait)
* LVM calls this function before a (read-only) snapshot is created. This
* gives us a chance to flush the journal completely and mark the fs clean.
*/
-static void ext3_write_super_lockfs(struct super_block *sb)
+static int ext3_freeze(struct super_block *sb)
{
- sb->s_dirt = 0;
+ int error = 0;
+ journal_t *journal;
if (!(sb->s_flags & MS_RDONLY)) {
- journal_t *journal = EXT3_SB(sb)->s_journal;
+ journal = EXT3_SB(sb)->s_journal;
/* Now we set up the journal barrier. */
journal_lock_updates(journal);
- journal_flush(journal);
+
+ /*
+ * We don't want to clear needs_recovery flag when we failed
+ * to flush the journal.
+ */
+ error = journal_flush(journal);
+ if (error < 0)
+ goto out;
/* Journal blocked and flushed, clear needs_recovery flag. */
EXT3_CLEAR_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER);
- ext3_commit_super(sb, EXT3_SB(sb)->s_es, 1);
+ error = ext3_commit_super(sb, EXT3_SB(sb)->s_es, 1);
+ if (error)
+ goto out;
}
+ return 0;
+
+out:
+ journal_unlock_updates(journal);
+ return error;
}
/*
* Called by LVM after the snapshot is done. We need to reset the RECOVER
* flag here, even though the filesystem is not technically dirty yet.
*/
-static void ext3_unlockfs(struct super_block *sb)
+static int ext3_unfreeze(struct super_block *sb)
{
if (!(sb->s_flags & MS_RDONLY)) {
- lock_super(sb);
/* Reser the needs_recovery flag before the fs is unlocked. */
EXT3_SET_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER);
ext3_commit_super(sb, EXT3_SB(sb)->s_es, 1);
- unlock_super(sb);
journal_unlock_updates(EXT3_SB(sb)->s_journal);
}
+ return 0;
}
static int ext3_remount (struct super_block * sb, int * flags, char * data)
{
struct ext3_super_block * es;
struct ext3_sb_info *sbi = EXT3_SB(sb);
- unsigned long n_blocks_count = 0;
+ ext3_fsblk_t n_blocks_count = 0;
unsigned long old_sb_flags;
struct ext3_mount_options old_opts;
+ int enable_quota = 0;
int err;
#ifdef CONFIG_QUOTA
int i;
#endif
+ sync_filesystem(sb);
+
/* Store the original options */
old_sb_flags = sb->s_flags;
old_opts.s_mount_opt = sbi->s_mount_opt;
@@ -2225,7 +2660,18 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data)
#ifdef CONFIG_QUOTA
old_opts.s_jquota_fmt = sbi->s_jquota_fmt;
for (i = 0; i < MAXQUOTAS; i++)
- old_opts.s_qf_names[i] = sbi->s_qf_names[i];
+ if (sbi->s_qf_names[i]) {
+ old_opts.s_qf_names[i] = kstrdup(sbi->s_qf_names[i],
+ GFP_KERNEL);
+ if (!old_opts.s_qf_names[i]) {
+ int j;
+
+ for (j = 0; j < i; j++)
+ kfree(old_opts.s_qf_names[j]);
+ return -ENOMEM;
+ }
+ } else
+ old_opts.s_qf_names[i] = NULL;
#endif
/*
@@ -2236,11 +2682,11 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data)
goto restore_opts;
}
- if (sbi->s_mount_opt & EXT3_MOUNT_ABORT)
- ext3_abort(sb, __FUNCTION__, "Abort forced by user");
+ if (test_opt(sb, ABORT))
+ ext3_abort(sb, __func__, "Abort forced by user");
sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
- ((sbi->s_mount_opt & EXT3_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0);
+ (test_opt(sb, POSIX_ACL) ? MS_POSIXACL : 0);
es = sbi->s_es;
@@ -2248,12 +2694,16 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data)
if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY) ||
n_blocks_count > le32_to_cpu(es->s_blocks_count)) {
- if (sbi->s_mount_opt & EXT3_MOUNT_ABORT) {
+ if (test_opt(sb, ABORT)) {
err = -EROFS;
goto restore_opts;
}
if (*flags & MS_RDONLY) {
+ err = dquot_suspend(sb, -1);
+ if (err < 0)
+ goto restore_opts;
+
/*
* First of all, the unconditional stuff we have to do
* to disable replay of the journal when we next remount
@@ -2274,13 +2724,28 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data)
__le32 ret;
if ((ret = EXT3_HAS_RO_COMPAT_FEATURE(sb,
~EXT3_FEATURE_RO_COMPAT_SUPP))) {
- printk(KERN_WARNING "EXT3-fs: %s: couldn't "
- "remount RDWR because of unsupported "
- "optional features (%x).\n",
- sb->s_id, le32_to_cpu(ret));
+ ext3_msg(sb, KERN_WARNING,
+ "warning: couldn't remount RDWR "
+ "because of unsupported optional "
+ "features (%x)", le32_to_cpu(ret));
err = -EROFS;
goto restore_opts;
}
+
+ /*
+ * If we have an unprocessed orphan list hanging
+ * around from a previously readonly bdev mount,
+ * require a full umount & mount for now.
+ */
+ if (es->s_last_orphan) {
+ ext3_msg(sb, KERN_WARNING, "warning: couldn't "
+ "remount RDWR because of unprocessed "
+ "orphan inode list. Please "
+ "umount & mount instead.");
+ err = -EINVAL;
+ goto restore_opts;
+ }
+
/*
* Mounting a RDONLY partition read-write, so reread
* and store the current valid flag. (It may have
@@ -2289,21 +2754,20 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data)
*/
ext3_clear_journal_err(sb, es);
sbi->s_mount_state = le16_to_cpu(es->s_state);
- if ((ret = ext3_group_extend(sb, es, n_blocks_count))) {
- err = ret;
+ if ((err = ext3_group_extend(sb, es, n_blocks_count)))
goto restore_opts;
- }
if (!ext3_setup_super (sb, es, 0))
sb->s_flags &= ~MS_RDONLY;
+ enable_quota = 1;
}
}
#ifdef CONFIG_QUOTA
/* Release old quota file names */
for (i = 0; i < MAXQUOTAS; i++)
- if (old_opts.s_qf_names[i] &&
- old_opts.s_qf_names[i] != sbi->s_qf_names[i])
- kfree(old_opts.s_qf_names[i]);
+ kfree(old_opts.s_qf_names[i]);
#endif
+ if (enable_quota)
+ dquot_resume(sb, -1);
return 0;
restore_opts:
sb->s_flags = old_sb_flags;
@@ -2314,30 +2778,31 @@ restore_opts:
#ifdef CONFIG_QUOTA
sbi->s_jquota_fmt = old_opts.s_jquota_fmt;
for (i = 0; i < MAXQUOTAS; i++) {
- if (sbi->s_qf_names[i] &&
- old_opts.s_qf_names[i] != sbi->s_qf_names[i])
- kfree(sbi->s_qf_names[i]);
+ kfree(sbi->s_qf_names[i]);
sbi->s_qf_names[i] = old_opts.s_qf_names[i];
}
#endif
return err;
}
-static int ext3_statfs (struct super_block * sb, struct kstatfs * buf)
+static int ext3_statfs (struct dentry * dentry, struct kstatfs * buf)
{
- struct ext3_super_block *es = EXT3_SB(sb)->s_es;
- unsigned long overhead;
- int i;
+ struct super_block *sb = dentry->d_sb;
+ struct ext3_sb_info *sbi = EXT3_SB(sb);
+ struct ext3_super_block *es = sbi->s_es;
+ u64 fsid;
- if (test_opt (sb, MINIX_DF))
- overhead = 0;
- else {
- unsigned long ngroups;
- ngroups = EXT3_SB(sb)->s_groups_count;
+ if (test_opt(sb, MINIX_DF)) {
+ sbi->s_overhead_last = 0;
+ } else if (sbi->s_blocks_last != le32_to_cpu(es->s_blocks_count)) {
+ unsigned long ngroups = sbi->s_groups_count, i;
+ ext3_fsblk_t overhead = 0;
smp_rmb();
/*
- * Compute the overhead (FS structures)
+ * Compute the overhead (FS structures). This is constant
+ * for a given filesystem unless the number of block groups
+ * changes so we cache the previous value until it does.
*/
/*
@@ -2361,19 +2826,30 @@ static int ext3_statfs (struct super_block * sb, struct kstatfs * buf)
* Every block group has an inode bitmap, a block
* bitmap, and an inode table.
*/
- overhead += (ngroups * (2 + EXT3_SB(sb)->s_itb_per_group));
+ overhead += ngroups * (2 + sbi->s_itb_per_group);
+
+ /* Add the journal blocks as well */
+ overhead += sbi->s_journal->j_maxlen;
+
+ sbi->s_overhead_last = overhead;
+ smp_wmb();
+ sbi->s_blocks_last = le32_to_cpu(es->s_blocks_count);
}
buf->f_type = EXT3_SUPER_MAGIC;
buf->f_bsize = sb->s_blocksize;
- buf->f_blocks = le32_to_cpu(es->s_blocks_count) - overhead;
- buf->f_bfree = ext3_count_free_blocks (sb);
+ buf->f_blocks = le32_to_cpu(es->s_blocks_count) - sbi->s_overhead_last;
+ buf->f_bfree = percpu_counter_sum_positive(&sbi->s_freeblocks_counter);
buf->f_bavail = buf->f_bfree - le32_to_cpu(es->s_r_blocks_count);
if (buf->f_bfree < le32_to_cpu(es->s_r_blocks_count))
buf->f_bavail = 0;
buf->f_files = le32_to_cpu(es->s_inodes_count);
- buf->f_ffree = ext3_count_free_inodes (sb);
+ buf->f_ffree = percpu_counter_sum_positive(&sbi->s_freeinodes_counter);
buf->f_namelen = EXT3_NAME_LEN;
+ fsid = le64_to_cpup((void *)es->s_uuid) ^
+ le64_to_cpup((void *)es->s_uuid + sizeof(u64));
+ buf->f_fsid.val[0] = fsid & 0xFFFFFFFFUL;
+ buf->f_fsid.val[1] = (fsid >> 32) & 0xFFFFFFFFUL;
return 0;
}
@@ -2382,8 +2858,8 @@ static int ext3_statfs (struct super_block * sb, struct kstatfs * buf)
* Process 1 Process 2
* ext3_create() quota_sync()
* journal_start() write_dquot()
- * DQUOT_INIT() down(dqio_sem)
- * down(dqio_sem) journal_start()
+ * dquot_initialize() down(dqio_mutex)
+ * down(dqio_mutex) journal_start()
*
*/
@@ -2391,39 +2867,7 @@ static int ext3_statfs (struct super_block * sb, struct kstatfs * buf)
static inline struct inode *dquot_to_inode(struct dquot *dquot)
{
- return sb_dqopt(dquot->dq_sb)->files[dquot->dq_type];
-}
-
-static int ext3_dquot_initialize(struct inode *inode, int type)
-{
- handle_t *handle;
- int ret, err;
-
- /* We may create quota structure so we need to reserve enough blocks */
- handle = ext3_journal_start(inode, 2*EXT3_QUOTA_INIT_BLOCKS(inode->i_sb));
- if (IS_ERR(handle))
- return PTR_ERR(handle);
- ret = dquot_initialize(inode, type);
- err = ext3_journal_stop(handle);
- if (!ret)
- ret = err;
- return ret;
-}
-
-static int ext3_dquot_drop(struct inode *inode)
-{
- handle_t *handle;
- int ret, err;
-
- /* We may delete quota structure so we need to reserve enough blocks */
- handle = ext3_journal_start(inode, 2*EXT3_QUOTA_DEL_BLOCKS(inode->i_sb));
- if (IS_ERR(handle))
- return PTR_ERR(handle);
- ret = dquot_drop(inode);
- err = ext3_journal_stop(handle);
- if (!ret)
- ret = err;
- return ret;
+ return sb_dqopt(dquot->dq_sb)->files[dquot->dq_id.type];
}
static int ext3_write_dquot(struct dquot *dquot)
@@ -2467,8 +2911,11 @@ static int ext3_release_dquot(struct dquot *dquot)
handle = ext3_journal_start(dquot_to_inode(dquot),
EXT3_QUOTA_DEL_BLOCKS(dquot->dq_sb));
- if (IS_ERR(handle))
+ if (IS_ERR(handle)) {
+ /* Release dquot anyway to avoid endless cycle in dqput() */
+ dquot_release(dquot);
return PTR_ERR(handle);
+ }
ret = dquot_release(dquot);
err = ext3_journal_stop(handle);
if (!ret)
@@ -2478,7 +2925,7 @@ static int ext3_release_dquot(struct dquot *dquot)
static int ext3_mark_dquot_dirty(struct dquot *dquot)
{
- /* Are we journalling quotas? */
+ /* Are we journaling quotas? */
if (EXT3_SB(dquot->dq_sb)->s_qf_names[USRQUOTA] ||
EXT3_SB(dquot->dq_sb)->s_qf_names[GRPQUOTA]) {
dquot_mark_dquot_dirty(dquot);
@@ -2510,45 +2957,55 @@ static int ext3_write_info(struct super_block *sb, int type)
*/
static int ext3_quota_on_mount(struct super_block *sb, int type)
{
- return vfs_quota_on_mount(sb, EXT3_SB(sb)->s_qf_names[type],
- EXT3_SB(sb)->s_jquota_fmt, type);
+ return dquot_quota_on_mount(sb, EXT3_SB(sb)->s_qf_names[type],
+ EXT3_SB(sb)->s_jquota_fmt, type);
}
/*
* Standard function to be called on quota_on
*/
static int ext3_quota_on(struct super_block *sb, int type, int format_id,
- char *path)
+ struct path *path)
{
int err;
- struct nameidata nd;
if (!test_opt(sb, QUOTA))
return -EINVAL;
- /* Not journalling quota? */
- if (!EXT3_SB(sb)->s_qf_names[USRQUOTA] &&
- !EXT3_SB(sb)->s_qf_names[GRPQUOTA])
- return vfs_quota_on(sb, type, format_id, path);
- err = path_lookup(path, LOOKUP_FOLLOW, &nd);
- if (err)
- return err;
+
/* Quotafile not on the same filesystem? */
- if (nd.mnt->mnt_sb != sb) {
- path_release(&nd);
+ if (path->dentry->d_sb != sb)
return -EXDEV;
+ /* Journaling quota? */
+ if (EXT3_SB(sb)->s_qf_names[type]) {
+ /* Quotafile not of fs root? */
+ if (path->dentry->d_parent != sb->s_root)
+ ext3_msg(sb, KERN_WARNING,
+ "warning: Quota file not on filesystem root. "
+ "Journaled quota will not work.");
+ }
+
+ /*
+ * When we journal data on quota file, we have to flush journal to see
+ * all updates to the file when we bypass pagecache...
+ */
+ if (ext3_should_journal_data(path->dentry->d_inode)) {
+ /*
+ * We don't need to lock updates but journal_flush() could
+ * otherwise be livelocked...
+ */
+ journal_lock_updates(EXT3_SB(sb)->s_journal);
+ err = journal_flush(EXT3_SB(sb)->s_journal);
+ journal_unlock_updates(EXT3_SB(sb)->s_journal);
+ if (err)
+ return err;
}
- /* Quotafile not of fs root? */
- if (nd.dentry->d_parent->d_inode != sb->s_root->d_inode)
- printk(KERN_WARNING
- "EXT3-fs: Quota file not on filesystem root. "
- "Journalled quota will not work.\n");
- path_release(&nd);
- return vfs_quota_on(sb, type, format_id, path);
+
+ return dquot_quota_on(sb, type, format_id, path);
}
/* Read data from quotafile - avoid pagecache and such because we cannot afford
* acquiring the locks... As quota files are never truncated and quota code
- * itself serializes the operations (and noone else should touch the files)
+ * itself serializes the operations (and no one else should touch the files)
* we don't have to be afraid of races */
static ssize_t ext3_quota_read(struct super_block *sb, int type, char *data,
size_t len, loff_t off)
@@ -2595,74 +3052,79 @@ static ssize_t ext3_quota_write(struct super_block *sb, int type,
sector_t blk = off >> EXT3_BLOCK_SIZE_BITS(sb);
int err = 0;
int offset = off & (sb->s_blocksize - 1);
- int tocopy;
int journal_quota = EXT3_SB(sb)->s_qf_names[type] != NULL;
- size_t towrite = len;
struct buffer_head *bh;
handle_t *handle = journal_current_handle();
- mutex_lock(&inode->i_mutex);
- while (towrite > 0) {
- tocopy = sb->s_blocksize - offset < towrite ?
- sb->s_blocksize - offset : towrite;
- bh = ext3_bread(handle, inode, blk, 1, &err);
- if (!bh)
+ if (!handle) {
+ ext3_msg(sb, KERN_WARNING,
+ "warning: quota write (off=%llu, len=%llu)"
+ " cancelled because transaction is not started.",
+ (unsigned long long)off, (unsigned long long)len);
+ return -EIO;
+ }
+
+ /*
+ * Since we account only one data block in transaction credits,
+ * then it is impossible to cross a block boundary.
+ */
+ if (sb->s_blocksize - offset < len) {
+ ext3_msg(sb, KERN_WARNING, "Quota write (off=%llu, len=%llu)"
+ " cancelled because not block aligned",
+ (unsigned long long)off, (unsigned long long)len);
+ return -EIO;
+ }
+ bh = ext3_bread(handle, inode, blk, 1, &err);
+ if (!bh)
+ goto out;
+ if (journal_quota) {
+ err = ext3_journal_get_write_access(handle, bh);
+ if (err) {
+ brelse(bh);
goto out;
- if (journal_quota) {
- err = ext3_journal_get_write_access(handle, bh);
- if (err) {
- brelse(bh);
- goto out;
- }
- }
- lock_buffer(bh);
- memcpy(bh->b_data+offset, data, tocopy);
- flush_dcache_page(bh->b_page);
- unlock_buffer(bh);
- if (journal_quota)
- err = ext3_journal_dirty_metadata(handle, bh);
- else {
- /* Always do at least ordered writes for quotas */
- err = ext3_journal_dirty_data(handle, bh);
- mark_buffer_dirty(bh);
}
- brelse(bh);
- if (err)
- goto out;
- offset = 0;
- towrite -= tocopy;
- data += tocopy;
- blk++;
}
+ lock_buffer(bh);
+ memcpy(bh->b_data+offset, data, len);
+ flush_dcache_page(bh->b_page);
+ unlock_buffer(bh);
+ if (journal_quota)
+ err = ext3_journal_dirty_metadata(handle, bh);
+ else {
+ /* Always do at least ordered writes for quotas */
+ err = ext3_journal_dirty_data(handle, bh);
+ mark_buffer_dirty(bh);
+ }
+ brelse(bh);
out:
- if (len == towrite)
+ if (err)
return err;
- if (inode->i_size < off+len-towrite) {
- i_size_write(inode, off+len-towrite);
+ if (inode->i_size < off + len) {
+ i_size_write(inode, off + len);
EXT3_I(inode)->i_disksize = inode->i_size;
}
inode->i_version++;
inode->i_mtime = inode->i_ctime = CURRENT_TIME;
ext3_mark_inode_dirty(handle, inode);
- mutex_unlock(&inode->i_mutex);
- return len - towrite;
+ return len;
}
#endif
-static struct super_block *ext3_get_sb(struct file_system_type *fs_type,
+static struct dentry *ext3_mount(struct file_system_type *fs_type,
int flags, const char *dev_name, void *data)
{
- return get_sb_bdev(fs_type, flags, dev_name, data, ext3_fill_super);
+ return mount_bdev(fs_type, flags, dev_name, data, ext3_fill_super);
}
static struct file_system_type ext3_fs_type = {
.owner = THIS_MODULE,
.name = "ext3",
- .get_sb = ext3_get_sb,
+ .mount = ext3_mount,
.kill_sb = kill_block_super,
.fs_flags = FS_REQUIRES_DEV,
};
+MODULE_ALIAS_FS("ext3");
static int __init init_ext3_fs(void)
{
@@ -2679,7 +3141,7 @@ static int __init init_ext3_fs(void)
out:
destroy_inodecache();
out1:
- exit_ext3_xattr();
+ exit_ext3_xattr();
return err;
}
diff --git a/fs/ext3/symlink.c b/fs/ext3/symlink.c
index 4f79122cde6..6b01c3eab1f 100644
--- a/fs/ext3/symlink.c
+++ b/fs/ext3/symlink.c
@@ -17,10 +17,8 @@
* ext3 symlink handling code
*/
-#include <linux/fs.h>
-#include <linux/jbd.h>
-#include <linux/ext3_fs.h>
#include <linux/namei.h>
+#include "ext3.h"
#include "xattr.h"
static void * ext3_follow_link(struct dentry *dentry, struct nameidata *nd)
@@ -30,10 +28,11 @@ static void * ext3_follow_link(struct dentry *dentry, struct nameidata *nd)
return NULL;
}
-struct inode_operations ext3_symlink_inode_operations = {
+const struct inode_operations ext3_symlink_inode_operations = {
.readlink = generic_readlink,
.follow_link = page_follow_link_light,
.put_link = page_put_link,
+ .setattr = ext3_setattr,
#ifdef CONFIG_EXT3_FS_XATTR
.setxattr = generic_setxattr,
.getxattr = generic_getxattr,
@@ -42,9 +41,10 @@ struct inode_operations ext3_symlink_inode_operations = {
#endif
};
-struct inode_operations ext3_fast_symlink_inode_operations = {
+const struct inode_operations ext3_fast_symlink_inode_operations = {
.readlink = generic_readlink,
.follow_link = ext3_follow_link,
+ .setattr = ext3_setattr,
#ifdef CONFIG_EXT3_FS_XATTR
.setxattr = generic_setxattr,
.getxattr = generic_getxattr,
diff --git a/fs/ext3/xattr.c b/fs/ext3/xattr.c
index e8d60bf6b7d..c6874be6d58 100644
--- a/fs/ext3/xattr.c
+++ b/fs/ext3/xattr.c
@@ -50,14 +50,9 @@
* by the buffer lock.
*/
-#include <linux/init.h>
-#include <linux/fs.h>
-#include <linux/slab.h>
-#include <linux/ext3_jbd.h>
-#include <linux/ext3_fs.h>
+#include "ext3.h"
#include <linux/mbcache.h>
#include <linux/quotaops.h>
-#include <linux/rwsem.h>
#include "xattr.h"
#include "acl.h"
@@ -75,7 +70,7 @@
#ifdef EXT3_XATTR_DEBUG
# define ea_idebug(inode, f...) do { \
- printk(KERN_DEBUG "inode %s:%ld: ", \
+ printk(KERN_DEBUG "inode %s:%lu: ", \
inode->i_sb->s_id, inode->i_ino); \
printk(f); \
printk("\n"); \
@@ -99,14 +94,16 @@ static struct buffer_head *ext3_xattr_cache_find(struct inode *,
struct mb_cache_entry **);
static void ext3_xattr_rehash(struct ext3_xattr_header *,
struct ext3_xattr_entry *);
+static int ext3_xattr_list(struct dentry *dentry, char *buffer,
+ size_t buffer_size);
static struct mb_cache *ext3_xattr_cache;
-static struct xattr_handler *ext3_xattr_handler_map[] = {
+static const struct xattr_handler *ext3_xattr_handler_map[] = {
[EXT3_XATTR_INDEX_USER] = &ext3_xattr_user_handler,
#ifdef CONFIG_EXT3_FS_POSIX_ACL
- [EXT3_XATTR_INDEX_POSIX_ACL_ACCESS] = &ext3_xattr_acl_access_handler,
- [EXT3_XATTR_INDEX_POSIX_ACL_DEFAULT] = &ext3_xattr_acl_default_handler,
+ [EXT3_XATTR_INDEX_POSIX_ACL_ACCESS] = &posix_acl_access_xattr_handler,
+ [EXT3_XATTR_INDEX_POSIX_ACL_DEFAULT] = &posix_acl_default_xattr_handler,
#endif
[EXT3_XATTR_INDEX_TRUSTED] = &ext3_xattr_trusted_handler,
#ifdef CONFIG_EXT3_FS_SECURITY
@@ -114,12 +111,12 @@ static struct xattr_handler *ext3_xattr_handler_map[] = {
#endif
};
-struct xattr_handler *ext3_xattr_handlers[] = {
+const struct xattr_handler *ext3_xattr_handlers[] = {
&ext3_xattr_user_handler,
&ext3_xattr_trusted_handler,
#ifdef CONFIG_EXT3_FS_POSIX_ACL
- &ext3_xattr_acl_access_handler,
- &ext3_xattr_acl_default_handler,
+ &posix_acl_access_xattr_handler,
+ &posix_acl_default_xattr_handler,
#endif
#ifdef CONFIG_EXT3_FS_SECURITY
&ext3_xattr_security_handler,
@@ -127,10 +124,10 @@ struct xattr_handler *ext3_xattr_handlers[] = {
NULL
};
-static inline struct xattr_handler *
+static inline const struct xattr_handler *
ext3_xattr_handler(int name_index)
{
- struct xattr_handler *handler = NULL;
+ const struct xattr_handler *handler = NULL;
if (name_index > 0 && name_index < ARRAY_SIZE(ext3_xattr_handler_map))
handler = ext3_xattr_handler_map[name_index];
@@ -145,7 +142,7 @@ ext3_xattr_handler(int name_index)
ssize_t
ext3_listxattr(struct dentry *dentry, char *buffer, size_t size)
{
- return ext3_xattr_list(dentry->d_inode, buffer, size);
+ return ext3_xattr_list(dentry, buffer, size);
}
static int
@@ -225,15 +222,15 @@ ext3_xattr_block_get(struct inode *inode, int name_index, const char *name,
error = -ENODATA;
if (!EXT3_I(inode)->i_file_acl)
goto cleanup;
- ea_idebug(inode, "reading block %d", EXT3_I(inode)->i_file_acl);
+ ea_idebug(inode, "reading block %u", EXT3_I(inode)->i_file_acl);
bh = sb_bread(inode->i_sb, EXT3_I(inode)->i_file_acl);
if (!bh)
goto cleanup;
ea_bdebug(bh, "b_count=%d, refcount=%d",
atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount));
if (ext3_xattr_check_block(bh)) {
-bad_block: ext3_error(inode->i_sb, __FUNCTION__,
- "inode %ld: bad block %d", inode->i_ino,
+bad_block: ext3_error(inode->i_sb, __func__,
+ "inode %lu: bad block "E3FSBLK, inode->i_ino,
EXT3_I(inode)->i_file_acl);
error = -EIO;
goto cleanup;
@@ -272,7 +269,7 @@ ext3_xattr_ibody_get(struct inode *inode, int name_index, const char *name,
void *end;
int error;
- if (!(EXT3_I(inode)->i_state & EXT3_STATE_XATTR))
+ if (!ext3_test_inode_state(inode, EXT3_STATE_XATTR))
return -ENODATA;
error = ext3_get_inode_loc(inode, &iloc);
if (error)
@@ -330,19 +327,20 @@ ext3_xattr_get(struct inode *inode, int name_index, const char *name,
}
static int
-ext3_xattr_list_entries(struct inode *inode, struct ext3_xattr_entry *entry,
+ext3_xattr_list_entries(struct dentry *dentry, struct ext3_xattr_entry *entry,
char *buffer, size_t buffer_size)
{
size_t rest = buffer_size;
for (; !IS_LAST_ENTRY(entry); entry = EXT3_XATTR_NEXT(entry)) {
- struct xattr_handler *handler =
+ const struct xattr_handler *handler =
ext3_xattr_handler(entry->e_name_index);
if (handler) {
- size_t size = handler->list(inode, buffer, rest,
+ size_t size = handler->list(dentry, buffer, rest,
entry->e_name,
- entry->e_name_len);
+ entry->e_name_len,
+ handler->flags);
if (buffer) {
if (size > rest)
return -ERANGE;
@@ -355,8 +353,9 @@ ext3_xattr_list_entries(struct inode *inode, struct ext3_xattr_entry *entry,
}
static int
-ext3_xattr_block_list(struct inode *inode, char *buffer, size_t buffer_size)
+ext3_xattr_block_list(struct dentry *dentry, char *buffer, size_t buffer_size)
{
+ struct inode *inode = dentry->d_inode;
struct buffer_head *bh = NULL;
int error;
@@ -366,7 +365,7 @@ ext3_xattr_block_list(struct inode *inode, char *buffer, size_t buffer_size)
error = 0;
if (!EXT3_I(inode)->i_file_acl)
goto cleanup;
- ea_idebug(inode, "reading block %d", EXT3_I(inode)->i_file_acl);
+ ea_idebug(inode, "reading block %u", EXT3_I(inode)->i_file_acl);
bh = sb_bread(inode->i_sb, EXT3_I(inode)->i_file_acl);
error = -EIO;
if (!bh)
@@ -374,14 +373,14 @@ ext3_xattr_block_list(struct inode *inode, char *buffer, size_t buffer_size)
ea_bdebug(bh, "b_count=%d, refcount=%d",
atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount));
if (ext3_xattr_check_block(bh)) {
- ext3_error(inode->i_sb, __FUNCTION__,
- "inode %ld: bad block %d", inode->i_ino,
+ ext3_error(inode->i_sb, __func__,
+ "inode %lu: bad block "E3FSBLK, inode->i_ino,
EXT3_I(inode)->i_file_acl);
error = -EIO;
goto cleanup;
}
ext3_xattr_cache_insert(bh);
- error = ext3_xattr_list_entries(inode, BFIRST(bh), buffer, buffer_size);
+ error = ext3_xattr_list_entries(dentry, BFIRST(bh), buffer, buffer_size);
cleanup:
brelse(bh);
@@ -390,15 +389,16 @@ cleanup:
}
static int
-ext3_xattr_ibody_list(struct inode *inode, char *buffer, size_t buffer_size)
+ext3_xattr_ibody_list(struct dentry *dentry, char *buffer, size_t buffer_size)
{
+ struct inode *inode = dentry->d_inode;
struct ext3_xattr_ibody_header *header;
struct ext3_inode *raw_inode;
struct ext3_iloc iloc;
void *end;
int error;
- if (!(EXT3_I(inode)->i_state & EXT3_STATE_XATTR))
+ if (!ext3_test_inode_state(inode, EXT3_STATE_XATTR))
return 0;
error = ext3_get_inode_loc(inode, &iloc);
if (error)
@@ -409,7 +409,7 @@ ext3_xattr_ibody_list(struct inode *inode, char *buffer, size_t buffer_size)
error = ext3_xattr_check_names(IFIRST(header), end);
if (error)
goto cleanup;
- error = ext3_xattr_list_entries(inode, IFIRST(header),
+ error = ext3_xattr_list_entries(dentry, IFIRST(header),
buffer, buffer_size);
cleanup:
@@ -427,13 +427,13 @@ cleanup:
* Returns a negative error number on failure, or the number of bytes
* used / required on success.
*/
-int
-ext3_xattr_list(struct inode *inode, char *buffer, size_t buffer_size)
+static int
+ext3_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size)
{
int i_error, b_error;
- down_read(&EXT3_I(inode)->xattr_sem);
- i_error = ext3_xattr_ibody_list(inode, buffer, buffer_size);
+ down_read(&EXT3_I(dentry->d_inode)->xattr_sem);
+ i_error = ext3_xattr_ibody_list(dentry, buffer, buffer_size);
if (i_error < 0) {
b_error = 0;
} else {
@@ -441,11 +441,11 @@ ext3_xattr_list(struct inode *inode, char *buffer, size_t buffer_size)
buffer += i_error;
buffer_size -= i_error;
}
- b_error = ext3_xattr_block_list(inode, buffer, buffer_size);
+ b_error = ext3_xattr_block_list(dentry, buffer, buffer_size);
if (b_error < 0)
i_error = 0;
}
- up_read(&EXT3_I(inode)->xattr_sem);
+ up_read(&EXT3_I(dentry->d_inode)->xattr_sem);
return i_error + b_error;
}
@@ -459,14 +459,10 @@ static void ext3_xattr_update_super_block(handle_t *handle,
if (EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_EXT_ATTR))
return;
- lock_super(sb);
if (ext3_journal_get_write_access(handle, EXT3_SB(sb)->s_sbh) == 0) {
- EXT3_SB(sb)->s_es->s_feature_compat |=
- cpu_to_le32(EXT3_FEATURE_COMPAT_EXT_ATTR);
- sb->s_dirt = 1;
+ EXT3_SET_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_EXT_ATTR);
ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh);
}
- unlock_super(sb);
}
/*
@@ -478,8 +474,15 @@ ext3_xattr_release_block(handle_t *handle, struct inode *inode,
struct buffer_head *bh)
{
struct mb_cache_entry *ce = NULL;
+ int error = 0;
ce = mb_cache_entry_get(ext3_xattr_cache, bh->b_bdev, bh->b_blocknr);
+ error = ext3_journal_get_write_access(handle, bh);
+ if (error)
+ goto out;
+
+ lock_buffer(bh);
+
if (BHDR(bh)->h_refcount == cpu_to_le32(1)) {
ea_bdebug(bh, "refcount now=0; freeing");
if (ce)
@@ -488,21 +491,20 @@ ext3_xattr_release_block(handle_t *handle, struct inode *inode,
get_bh(bh);
ext3_forget(handle, 1, inode, bh, bh->b_blocknr);
} else {
- if (ext3_journal_get_write_access(handle, bh) == 0) {
- lock_buffer(bh);
- BHDR(bh)->h_refcount = cpu_to_le32(
- le32_to_cpu(BHDR(bh)->h_refcount) - 1);
- ext3_journal_dirty_metadata(handle, bh);
- if (IS_SYNC(inode))
- handle->h_sync = 1;
- DQUOT_FREE_BLOCK(inode, 1);
- unlock_buffer(bh);
- ea_bdebug(bh, "refcount now=%d; releasing",
- le32_to_cpu(BHDR(bh)->h_refcount));
- }
+ le32_add_cpu(&BHDR(bh)->h_refcount, -1);
+ error = ext3_journal_dirty_metadata(handle, bh);
+ if (IS_SYNC(inode))
+ handle->h_sync = 1;
+ dquot_free_block(inode, 1);
+ ea_bdebug(bh, "refcount now=%d; releasing",
+ le32_to_cpu(BHDR(bh)->h_refcount));
if (ce)
mb_cache_entry_release(ce);
}
+ unlock_buffer(bh);
+out:
+ ext3_std_error(inode->i_sb, error);
+ return;
}
struct ext3_xattr_info {
@@ -646,8 +648,8 @@ ext3_xattr_block_find(struct inode *inode, struct ext3_xattr_info *i,
atomic_read(&(bs->bh->b_count)),
le32_to_cpu(BHDR(bs->bh)->h_refcount));
if (ext3_xattr_check_block(bs->bh)) {
- ext3_error(sb, __FUNCTION__,
- "inode %ld: bad block %d", inode->i_ino,
+ ext3_error(sb, __func__,
+ "inode %lu: bad block "E3FSBLK, inode->i_ino,
EXT3_I(inode)->i_file_acl);
error = -EIO;
goto cleanup;
@@ -678,7 +680,7 @@ ext3_xattr_block_set(handle_t *handle, struct inode *inode,
struct buffer_head *new_bh = NULL;
struct ext3_xattr_search *s = &bs->s;
struct mb_cache_entry *ce = NULL;
- int error;
+ int error = 0;
#define header(x) ((struct ext3_xattr_header *)(x))
@@ -687,16 +689,17 @@ ext3_xattr_block_set(handle_t *handle, struct inode *inode,
if (s->base) {
ce = mb_cache_entry_get(ext3_xattr_cache, bs->bh->b_bdev,
bs->bh->b_blocknr);
+ error = ext3_journal_get_write_access(handle, bs->bh);
+ if (error)
+ goto cleanup;
+ lock_buffer(bs->bh);
+
if (header(s->base)->h_refcount == cpu_to_le32(1)) {
if (ce) {
mb_cache_entry_free(ce);
ce = NULL;
}
ea_bdebug(bs->bh, "modifying in-place");
- error = ext3_journal_get_write_access(handle, bs->bh);
- if (error)
- goto cleanup;
- lock_buffer(bs->bh);
error = ext3_xattr_set_entry(i, s);
if (!error) {
if (!IS_LAST_ENTRY(s->first))
@@ -716,12 +719,15 @@ ext3_xattr_block_set(handle_t *handle, struct inode *inode,
} else {
int offset = (char *)s->here - bs->bh->b_data;
+ unlock_buffer(bs->bh);
+ journal_release_buffer(handle, bs->bh);
+
if (ce) {
mb_cache_entry_release(ce);
ce = NULL;
}
ea_bdebug(bs->bh, "cloning");
- s->base = kmalloc(bs->bh->b_size, GFP_KERNEL);
+ s->base = kmalloc(bs->bh->b_size, GFP_NOFS);
error = -ENOMEM;
if (s->base == NULL)
goto cleanup;
@@ -733,12 +739,11 @@ ext3_xattr_block_set(handle_t *handle, struct inode *inode,
}
} else {
/* Allocate a buffer where we construct the new block. */
- s->base = kmalloc(sb->s_blocksize, GFP_KERNEL);
+ s->base = kzalloc(sb->s_blocksize, GFP_NOFS);
/* assert(header == s->base) */
error = -ENOMEM;
if (s->base == NULL)
goto cleanup;
- memset(s->base, 0, sb->s_blocksize);
header(s->base)->h_magic = cpu_to_le32(EXT3_XATTR_MAGIC);
header(s->base)->h_blocks = cpu_to_le32(1);
header(s->base)->h_refcount = cpu_to_le32(1);
@@ -765,16 +770,15 @@ inserted:
else {
/* The old block is released after updating
the inode. */
- error = -EDQUOT;
- if (DQUOT_ALLOC_BLOCK(inode, 1))
+ error = dquot_alloc_block(inode, 1);
+ if (error)
goto cleanup;
error = ext3_journal_get_write_access(handle,
new_bh);
if (error)
goto cleanup_dquot;
lock_buffer(new_bh);
- BHDR(new_bh)->h_refcount = cpu_to_le32(1 +
- le32_to_cpu(BHDR(new_bh)->h_refcount));
+ le32_add_cpu(&BHDR(new_bh)->h_refcount, 1);
ea_bdebug(new_bh, "reusing; refcount now=%d",
le32_to_cpu(BHDR(new_bh)->h_refcount));
unlock_buffer(new_bh);
@@ -792,20 +796,27 @@ inserted:
get_bh(new_bh);
} else {
/* We need to allocate a new block */
- int goal = le32_to_cpu(
- EXT3_SB(sb)->s_es->s_first_data_block) +
- EXT3_I(inode)->i_block_group *
- EXT3_BLOCKS_PER_GROUP(sb);
- int block = ext3_new_block(handle, inode, goal, &error);
+ ext3_fsblk_t goal = ext3_group_first_block_no(sb,
+ EXT3_I(inode)->i_block_group);
+ ext3_fsblk_t block;
+
+ /*
+ * Protect us against concurrent allocations to the
+ * same inode from ext3_..._writepage(). Reservation
+ * code does not expect racing allocations.
+ */
+ mutex_lock(&EXT3_I(inode)->truncate_mutex);
+ block = ext3_new_block(handle, inode, goal, &error);
+ mutex_unlock(&EXT3_I(inode)->truncate_mutex);
if (error)
goto cleanup;
ea_idebug(inode, "creating block %d", block);
new_bh = sb_getblk(sb, block);
- if (!new_bh) {
+ if (unlikely(!new_bh)) {
getblk_failed:
ext3_free_blocks(handle, inode, block, 1);
- error = -EIO;
+ error = -ENOMEM;
goto cleanup;
}
lock_buffer(new_bh);
@@ -842,12 +853,12 @@ cleanup:
return error;
cleanup_dquot:
- DQUOT_FREE_BLOCK(inode, 1);
+ dquot_free_block(inode, 1);
goto cleanup;
bad_block:
- ext3_error(inode->i_sb, __FUNCTION__,
- "inode %ld: bad block %d", inode->i_ino,
+ ext3_error(inode->i_sb, __func__,
+ "inode %lu: bad block "E3FSBLK, inode->i_ino,
EXT3_I(inode)->i_file_acl);
goto cleanup;
@@ -874,7 +885,7 @@ ext3_xattr_ibody_find(struct inode *inode, struct ext3_xattr_info *i,
is->s.base = is->s.first = IFIRST(header);
is->s.here = is->s.first;
is->s.end = (void *)raw_inode + EXT3_SB(inode->i_sb)->s_inode_size;
- if (EXT3_I(inode)->i_state & EXT3_STATE_XATTR) {
+ if (ext3_test_inode_state(inode, EXT3_STATE_XATTR)) {
error = ext3_xattr_check_names(IFIRST(header), is->s.end);
if (error)
return error;
@@ -906,10 +917,10 @@ ext3_xattr_ibody_set(handle_t *handle, struct inode *inode,
header = IHDR(inode, ext3_raw_inode(&is->iloc));
if (!IS_LAST_ENTRY(s->first)) {
header->h_magic = cpu_to_le32(EXT3_XATTR_MAGIC);
- EXT3_I(inode)->i_state |= EXT3_STATE_XATTR;
+ ext3_set_inode_state(inode, EXT3_STATE_XATTR);
} else {
header->h_magic = cpu_to_le32(0);
- EXT3_I(inode)->i_state &= ~EXT3_STATE_XATTR;
+ ext3_clear_inode_state(inode, EXT3_STATE_XATTR);
}
return 0;
}
@@ -917,7 +928,7 @@ ext3_xattr_ibody_set(handle_t *handle, struct inode *inode,
/*
* ext3_xattr_set_handle()
*
- * Create, replace or remove an extended attribute for this inode. Buffer
+ * Create, replace or remove an extended attribute for this inode. Value
* is NULL to remove an existing extended attribute, and non-NULL to
* either replace an existing extended attribute, or create a new extended
* attribute. The flags XATTR_REPLACE and XATTR_CREATE
@@ -955,10 +966,14 @@ ext3_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
if (error)
goto cleanup;
- if (EXT3_I(inode)->i_state & EXT3_STATE_NEW) {
+ error = ext3_journal_get_write_access(handle, is.iloc.bh);
+ if (error)
+ goto cleanup;
+
+ if (ext3_test_inode_state(inode, EXT3_STATE_NEW)) {
struct ext3_inode *raw_inode = ext3_raw_inode(&is.iloc);
memset(raw_inode, 0, EXT3_SB(inode->i_sb)->s_inode_size);
- EXT3_I(inode)->i_state &= ~EXT3_STATE_NEW;
+ ext3_clear_inode_state(inode, EXT3_STATE_NEW);
}
error = ext3_xattr_ibody_find(inode, &i, &is);
@@ -980,9 +995,6 @@ ext3_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
if (flags & XATTR_CREATE)
goto cleanup;
}
- error = ext3_journal_get_write_access(handle, is.iloc.bh);
- if (error)
- goto cleanup;
if (!value) {
if (!is.s.not_found)
error = ext3_xattr_ibody_set(handle, inode, &i, &is);
@@ -994,6 +1006,11 @@ ext3_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
i.value = NULL;
error = ext3_xattr_block_set(handle, inode, &i, &bs);
} else if (error == -ENOSPC) {
+ if (EXT3_I(inode)->i_file_acl && !bs.s.base) {
+ error = ext3_xattr_block_find(inode, &i, &bs);
+ if (error)
+ goto cleanup;
+ }
error = ext3_xattr_block_set(handle, inode, &i, &bs);
if (error)
goto cleanup;
@@ -1075,15 +1092,15 @@ ext3_xattr_delete_inode(handle_t *handle, struct inode *inode)
goto cleanup;
bh = sb_bread(inode->i_sb, EXT3_I(inode)->i_file_acl);
if (!bh) {
- ext3_error(inode->i_sb, __FUNCTION__,
- "inode %ld: block %d read error", inode->i_ino,
+ ext3_error(inode->i_sb, __func__,
+ "inode %lu: block "E3FSBLK" read error", inode->i_ino,
EXT3_I(inode)->i_file_acl);
goto cleanup;
}
if (BHDR(bh)->h_magic != cpu_to_le32(EXT3_XATTR_MAGIC) ||
BHDR(bh)->h_blocks != cpu_to_le32(1)) {
- ext3_error(inode->i_sb, __FUNCTION__,
- "inode %ld: bad block %d", inode->i_ino,
+ ext3_error(inode->i_sb, __func__,
+ "inode %lu: bad block "E3FSBLK, inode->i_ino,
EXT3_I(inode)->i_file_acl);
goto cleanup;
}
@@ -1120,12 +1137,12 @@ ext3_xattr_cache_insert(struct buffer_head *bh)
struct mb_cache_entry *ce;
int error;
- ce = mb_cache_entry_alloc(ext3_xattr_cache);
+ ce = mb_cache_entry_alloc(ext3_xattr_cache, GFP_NOFS);
if (!ce) {
ea_bdebug(bh, "out of memory");
return;
}
- error = mb_cache_entry_insert(ce, bh->b_bdev, bh->b_blocknr, &hash);
+ error = mb_cache_entry_insert(ce, bh->b_bdev, bh->b_blocknr, hash);
if (error) {
mb_cache_entry_free(ce);
if (error == -EBUSY) {
@@ -1197,8 +1214,8 @@ ext3_xattr_cache_find(struct inode *inode, struct ext3_xattr_header *header,
return NULL; /* never share */
ea_idebug(inode, "looking for cached blocks [%x]", (int)hash);
again:
- ce = mb_cache_entry_find_first(ext3_xattr_cache, 0,
- inode->i_sb->s_bdev, hash);
+ ce = mb_cache_entry_find_first(ext3_xattr_cache, inode->i_sb->s_bdev,
+ hash);
while (ce) {
struct buffer_head *bh;
@@ -1209,12 +1226,12 @@ again:
}
bh = sb_bread(inode->i_sb, ce->e_block);
if (!bh) {
- ext3_error(inode->i_sb, __FUNCTION__,
- "inode %ld: block %ld read error",
+ ext3_error(inode->i_sb, __func__,
+ "inode %lu: block %lu read error",
inode->i_ino, (unsigned long) ce->e_block);
} else if (le32_to_cpu(BHDR(bh)->h_refcount) >=
EXT3_XATTR_REFCOUNT_MAX) {
- ea_idebug(inode, "block %ld refcount %d>=%d",
+ ea_idebug(inode, "block %lu refcount %d>=%d",
(unsigned long) ce->e_block,
le32_to_cpu(BHDR(bh)->h_refcount),
EXT3_XATTR_REFCOUNT_MAX);
@@ -1223,7 +1240,7 @@ again:
return bh;
}
brelse(bh);
- ce = mb_cache_entry_find_next(ce, 0, inode->i_sb->s_bdev, hash);
+ ce = mb_cache_entry_find_next(ce, inode->i_sb->s_bdev, hash);
}
return NULL;
}
@@ -1299,9 +1316,7 @@ static void ext3_xattr_rehash(struct ext3_xattr_header *header,
int __init
init_ext3_xattr(void)
{
- ext3_xattr_cache = mb_cache_create("ext3_xattr", NULL,
- sizeof(struct mb_cache_entry) +
- sizeof(((struct mb_cache_entry *) 0)->e_indexes[0]), 1, 6);
+ ext3_xattr_cache = mb_cache_create("ext3_xattr", 6);
if (!ext3_xattr_cache)
return -ENOMEM;
return 0;
diff --git a/fs/ext3/xattr.h b/fs/ext3/xattr.h
index 2ceae38f3d4..32e93ebf803 100644
--- a/fs/ext3/xattr.h
+++ b/fs/ext3/xattr.h
@@ -6,7 +6,6 @@
(C) 2001 Andreas Gruenbacher, <a.gruenbacher@computer.org>
*/
-#include <linux/config.h>
#include <linux/xattr.h>
/* Magic value in attribute blocks */
@@ -59,16 +58,13 @@ struct ext3_xattr_entry {
# ifdef CONFIG_EXT3_FS_XATTR
-extern struct xattr_handler ext3_xattr_user_handler;
-extern struct xattr_handler ext3_xattr_trusted_handler;
-extern struct xattr_handler ext3_xattr_acl_access_handler;
-extern struct xattr_handler ext3_xattr_acl_default_handler;
-extern struct xattr_handler ext3_xattr_security_handler;
+extern const struct xattr_handler ext3_xattr_user_handler;
+extern const struct xattr_handler ext3_xattr_trusted_handler;
+extern const struct xattr_handler ext3_xattr_security_handler;
extern ssize_t ext3_listxattr(struct dentry *, char *, size_t);
extern int ext3_xattr_get(struct inode *, int, const char *, void *, size_t);
-extern int ext3_xattr_list(struct inode *, char *, size_t);
extern int ext3_xattr_set(struct inode *, int, const char *, const void *, size_t, int);
extern int ext3_xattr_set_handle(handle_t *, struct inode *, int, const char *, const void *, size_t, int);
@@ -78,7 +74,7 @@ extern void ext3_xattr_put_super(struct super_block *);
extern int init_ext3_xattr(void);
extern void exit_ext3_xattr(void);
-extern struct xattr_handler *ext3_xattr_handlers[];
+extern const struct xattr_handler *ext3_xattr_handlers[];
# else /* CONFIG_EXT3_FS_XATTR */
@@ -90,12 +86,6 @@ ext3_xattr_get(struct inode *inode, int name_index, const char *name,
}
static inline int
-ext3_xattr_list(struct inode *inode, void *buffer, size_t size)
-{
- return -EOPNOTSUPP;
-}
-
-static inline int
ext3_xattr_set(struct inode *inode, int name_index, const char *name,
const void *value, size_t size, int flags)
{
@@ -136,10 +126,10 @@ exit_ext3_xattr(void)
#ifdef CONFIG_EXT3_FS_SECURITY
extern int ext3_init_security(handle_t *handle, struct inode *inode,
- struct inode *dir);
+ struct inode *dir, const struct qstr *qstr);
#else
static inline int ext3_init_security(handle_t *handle, struct inode *inode,
- struct inode *dir)
+ struct inode *dir, const struct qstr *qstr)
{
return 0;
}
diff --git a/fs/ext3/xattr_security.c b/fs/ext3/xattr_security.c
index b9c40c15647..722c2bf9645 100644
--- a/fs/ext3/xattr_security.c
+++ b/fs/ext3/xattr_security.c
@@ -3,20 +3,15 @@
* Handler for storing security labels as extended attributes.
*/
-#include <linux/module.h>
-#include <linux/string.h>
-#include <linux/fs.h>
-#include <linux/smp_lock.h>
-#include <linux/ext3_jbd.h>
-#include <linux/ext3_fs.h>
#include <linux/security.h>
+#include "ext3.h"
#include "xattr.h"
static size_t
-ext3_xattr_security_list(struct inode *inode, char *list, size_t list_size,
- const char *name, size_t name_len)
+ext3_xattr_security_list(struct dentry *dentry, char *list, size_t list_size,
+ const char *name, size_t name_len, int type)
{
- const size_t prefix_len = sizeof(XATTR_SECURITY_PREFIX)-1;
+ const size_t prefix_len = XATTR_SECURITY_PREFIX_LEN;
const size_t total_len = prefix_len + name_len + 1;
@@ -29,47 +24,53 @@ ext3_xattr_security_list(struct inode *inode, char *list, size_t list_size,
}
static int
-ext3_xattr_security_get(struct inode *inode, const char *name,
- void *buffer, size_t size)
+ext3_xattr_security_get(struct dentry *dentry, const char *name,
+ void *buffer, size_t size, int type)
{
if (strcmp(name, "") == 0)
return -EINVAL;
- return ext3_xattr_get(inode, EXT3_XATTR_INDEX_SECURITY, name,
- buffer, size);
+ return ext3_xattr_get(dentry->d_inode, EXT3_XATTR_INDEX_SECURITY,
+ name, buffer, size);
}
static int
-ext3_xattr_security_set(struct inode *inode, const char *name,
- const void *value, size_t size, int flags)
+ext3_xattr_security_set(struct dentry *dentry, const char *name,
+ const void *value, size_t size, int flags, int type)
{
if (strcmp(name, "") == 0)
return -EINVAL;
- return ext3_xattr_set(inode, EXT3_XATTR_INDEX_SECURITY, name,
- value, size, flags);
+ return ext3_xattr_set(dentry->d_inode, EXT3_XATTR_INDEX_SECURITY,
+ name, value, size, flags);
}
-int
-ext3_init_security(handle_t *handle, struct inode *inode, struct inode *dir)
+static int ext3_initxattrs(struct inode *inode,
+ const struct xattr *xattr_array,
+ void *fs_info)
{
- int err;
- size_t len;
- void *value;
- char *name;
+ const struct xattr *xattr;
+ handle_t *handle = fs_info;
+ int err = 0;
- err = security_inode_init_security(inode, dir, &name, &value, &len);
- if (err) {
- if (err == -EOPNOTSUPP)
- return 0;
- return err;
+ for (xattr = xattr_array; xattr->name != NULL; xattr++) {
+ err = ext3_xattr_set_handle(handle, inode,
+ EXT3_XATTR_INDEX_SECURITY,
+ xattr->name, xattr->value,
+ xattr->value_len, 0);
+ if (err < 0)
+ break;
}
- err = ext3_xattr_set_handle(handle, inode, EXT3_XATTR_INDEX_SECURITY,
- name, value, len, 0);
- kfree(name);
- kfree(value);
return err;
}
-struct xattr_handler ext3_xattr_security_handler = {
+int
+ext3_init_security(handle_t *handle, struct inode *inode, struct inode *dir,
+ const struct qstr *qstr)
+{
+ return security_inode_init_security(inode, dir, qstr,
+ &ext3_initxattrs, handle);
+}
+
+const struct xattr_handler ext3_xattr_security_handler = {
.prefix = XATTR_SECURITY_PREFIX,
.list = ext3_xattr_security_list,
.get = ext3_xattr_security_get,
diff --git a/fs/ext3/xattr_trusted.c b/fs/ext3/xattr_trusted.c
index 86d91f1186d..d75727cc67f 100644
--- a/fs/ext3/xattr_trusted.c
+++ b/fs/ext3/xattr_trusted.c
@@ -5,22 +5,14 @@
* Copyright (C) 2003 by Andreas Gruenbacher, <a.gruenbacher@computer.org>
*/
-#include <linux/module.h>
-#include <linux/string.h>
-#include <linux/capability.h>
-#include <linux/fs.h>
-#include <linux/smp_lock.h>
-#include <linux/ext3_jbd.h>
-#include <linux/ext3_fs.h>
+#include "ext3.h"
#include "xattr.h"
-#define XATTR_TRUSTED_PREFIX "trusted."
-
static size_t
-ext3_xattr_trusted_list(struct inode *inode, char *list, size_t list_size,
- const char *name, size_t name_len)
+ext3_xattr_trusted_list(struct dentry *dentry, char *list, size_t list_size,
+ const char *name, size_t name_len, int type)
{
- const size_t prefix_len = sizeof(XATTR_TRUSTED_PREFIX)-1;
+ const size_t prefix_len = XATTR_TRUSTED_PREFIX_LEN;
const size_t total_len = prefix_len + name_len + 1;
if (!capable(CAP_SYS_ADMIN))
@@ -35,26 +27,26 @@ ext3_xattr_trusted_list(struct inode *inode, char *list, size_t list_size,
}
static int
-ext3_xattr_trusted_get(struct inode *inode, const char *name,
- void *buffer, size_t size)
+ext3_xattr_trusted_get(struct dentry *dentry, const char *name,
+ void *buffer, size_t size, int type)
{
if (strcmp(name, "") == 0)
return -EINVAL;
- return ext3_xattr_get(inode, EXT3_XATTR_INDEX_TRUSTED, name,
- buffer, size);
+ return ext3_xattr_get(dentry->d_inode, EXT3_XATTR_INDEX_TRUSTED,
+ name, buffer, size);
}
static int
-ext3_xattr_trusted_set(struct inode *inode, const char *name,
- const void *value, size_t size, int flags)
+ext3_xattr_trusted_set(struct dentry *dentry, const char *name,
+ const void *value, size_t size, int flags, int type)
{
if (strcmp(name, "") == 0)
return -EINVAL;
- return ext3_xattr_set(inode, EXT3_XATTR_INDEX_TRUSTED, name,
+ return ext3_xattr_set(dentry->d_inode, EXT3_XATTR_INDEX_TRUSTED, name,
value, size, flags);
}
-struct xattr_handler ext3_xattr_trusted_handler = {
+const struct xattr_handler ext3_xattr_trusted_handler = {
.prefix = XATTR_TRUSTED_PREFIX,
.list = ext3_xattr_trusted_list,
.get = ext3_xattr_trusted_get,
diff --git a/fs/ext3/xattr_user.c b/fs/ext3/xattr_user.c
index a85a0a17c4f..5612af3567e 100644
--- a/fs/ext3/xattr_user.c
+++ b/fs/ext3/xattr_user.c
@@ -5,24 +5,17 @@
* Copyright (C) 2001 by Andreas Gruenbacher, <a.gruenbacher@computer.org>
*/
-#include <linux/module.h>
-#include <linux/string.h>
-#include <linux/fs.h>
-#include <linux/smp_lock.h>
-#include <linux/ext3_jbd.h>
-#include <linux/ext3_fs.h>
+#include "ext3.h"
#include "xattr.h"
-#define XATTR_USER_PREFIX "user."
-
static size_t
-ext3_xattr_user_list(struct inode *inode, char *list, size_t list_size,
- const char *name, size_t name_len)
+ext3_xattr_user_list(struct dentry *dentry, char *list, size_t list_size,
+ const char *name, size_t name_len, int type)
{
- const size_t prefix_len = sizeof(XATTR_USER_PREFIX)-1;
+ const size_t prefix_len = XATTR_USER_PREFIX_LEN;
const size_t total_len = prefix_len + name_len + 1;
- if (!test_opt(inode->i_sb, XATTR_USER))
+ if (!test_opt(dentry->d_sb, XATTR_USER))
return 0;
if (list && total_len <= list_size) {
@@ -34,29 +27,30 @@ ext3_xattr_user_list(struct inode *inode, char *list, size_t list_size,
}
static int
-ext3_xattr_user_get(struct inode *inode, const char *name,
- void *buffer, size_t size)
+ext3_xattr_user_get(struct dentry *dentry, const char *name, void *buffer,
+ size_t size, int type)
{
if (strcmp(name, "") == 0)
return -EINVAL;
- if (!test_opt(inode->i_sb, XATTR_USER))
+ if (!test_opt(dentry->d_sb, XATTR_USER))
return -EOPNOTSUPP;
- return ext3_xattr_get(inode, EXT3_XATTR_INDEX_USER, name, buffer, size);
+ return ext3_xattr_get(dentry->d_inode, EXT3_XATTR_INDEX_USER,
+ name, buffer, size);
}
static int
-ext3_xattr_user_set(struct inode *inode, const char *name,
- const void *value, size_t size, int flags)
+ext3_xattr_user_set(struct dentry *dentry, const char *name,
+ const void *value, size_t size, int flags, int type)
{
if (strcmp(name, "") == 0)
return -EINVAL;
- if (!test_opt(inode->i_sb, XATTR_USER))
+ if (!test_opt(dentry->d_sb, XATTR_USER))
return -EOPNOTSUPP;
- return ext3_xattr_set(inode, EXT3_XATTR_INDEX_USER, name,
- value, size, flags);
+ return ext3_xattr_set(dentry->d_inode, EXT3_XATTR_INDEX_USER,
+ name, value, size, flags);
}
-struct xattr_handler ext3_xattr_user_handler = {
+const struct xattr_handler ext3_xattr_user_handler = {
.prefix = XATTR_USER_PREFIX,
.list = ext3_xattr_user_list,
.get = ext3_xattr_user_get,