diff options
Diffstat (limited to 'fs/ext3')
| -rw-r--r-- | fs/ext3/Kconfig | 89 | ||||
| -rw-r--r-- | fs/ext3/acl.c | 404 | ||||
| -rw-r--r-- | fs/ext3/acl.h | 17 | ||||
| -rw-r--r-- | fs/ext3/balloc.c | 418 | ||||
| -rw-r--r-- | fs/ext3/bitmap.c | 16 | ||||
| -rw-r--r-- | fs/ext3/dir.c | 416 | ||||
| -rw-r--r-- | fs/ext3/ext3.h | 1326 | ||||
| -rw-r--r-- | fs/ext3/ext3_jbd.c | 14 | ||||
| -rw-r--r-- | fs/ext3/file.c | 89 | ||||
| -rw-r--r-- | fs/ext3/fsync.c | 69 | ||||
| -rw-r--r-- | fs/ext3/hash.c | 85 | ||||
| -rw-r--r-- | fs/ext3/ialloc.c | 177 | ||||
| -rw-r--r-- | fs/ext3/inode.c | 1127 | ||||
| -rw-r--r-- | fs/ext3/ioctl.c | 202 | ||||
| -rw-r--r-- | fs/ext3/namei.c | 624 | ||||
| -rw-r--r-- | fs/ext3/namei.h | 19 | ||||
| -rw-r--r-- | fs/ext3/resize.c | 215 | ||||
| -rw-r--r-- | fs/ext3/super.c | 1517 | ||||
| -rw-r--r-- | fs/ext3/symlink.c | 6 | ||||
| -rw-r--r-- | fs/ext3/xattr.c | 145 | ||||
| -rw-r--r-- | fs/ext3/xattr.h | 21 | ||||
| -rw-r--r-- | fs/ext3/xattr_security.c | 66 | ||||
| -rw-r--r-- | fs/ext3/xattr_trusted.c | 31 | ||||
| -rw-r--r-- | fs/ext3/xattr_user.c | 37 |
24 files changed, 4616 insertions, 2514 deletions
diff --git a/fs/ext3/Kconfig b/fs/ext3/Kconfig new file mode 100644 index 00000000000..e8c6ba0e4a3 --- /dev/null +++ b/fs/ext3/Kconfig @@ -0,0 +1,89 @@ +config EXT3_FS + tristate "Ext3 journalling file system support" + select JBD + help + This is the journalling version of the Second extended file system + (often called ext3), the de facto standard Linux file system + (method to organize files on a storage device) for hard disks. + + The journalling code included in this driver means you do not have + to run e2fsck (file system checker) on your file systems after a + crash. The journal keeps track of any changes that were being made + at the time the system crashed, and can ensure that your file system + is consistent without the need for a lengthy check. + + Other than adding the journal to the file system, the on-disk format + of ext3 is identical to ext2. It is possible to freely switch + between using the ext3 driver and the ext2 driver, as long as the + file system has been cleanly unmounted, or e2fsck is run on the file + system. + + To add a journal on an existing ext2 file system or change the + behavior of ext3 file systems, you can use the tune2fs utility ("man + tune2fs"). To modify attributes of files and directories on ext3 + file systems, use chattr ("man chattr"). You need to be using + e2fsprogs version 1.20 or later in order to create ext3 journals + (available at <http://sourceforge.net/projects/e2fsprogs/>). + + To compile this file system support as a module, choose M here: the + module will be called ext3. + +config EXT3_DEFAULTS_TO_ORDERED + bool "Default to 'data=ordered' in ext3" + depends on EXT3_FS + default y + help + The journal mode options for ext3 have different tradeoffs + between when data is guaranteed to be on disk and + performance. The use of "data=writeback" can cause + unwritten data to appear in files after an system crash or + power failure, which can be a security issue. However, + "data=ordered" mode can also result in major performance + problems, including seconds-long delays before an fsync() + call returns. For details, see: + + http://ext4.wiki.kernel.org/index.php/Ext3_data_mode_tradeoffs + + If you have been historically happy with ext3's performance, + data=ordered mode will be a safe choice and you should + answer 'y' here. If you understand the reliability and data + privacy issues of data=writeback and are willing to make + that trade off, answer 'n'. + +config EXT3_FS_XATTR + bool "Ext3 extended attributes" + depends on EXT3_FS + default y + help + Extended attributes are name:value pairs associated with inodes by + the kernel or by users (see the attr(5) manual page, or visit + <http://acl.bestbits.at/> for details). + + If unsure, say N. + + You need this for POSIX ACL support on ext3. + +config EXT3_FS_POSIX_ACL + bool "Ext3 POSIX Access Control Lists" + depends on EXT3_FS_XATTR + select FS_POSIX_ACL + help + Posix Access Control Lists (ACLs) support permissions for users and + groups beyond the owner/group/world scheme. + + To learn more about Access Control Lists, visit the Posix ACLs for + Linux website <http://acl.bestbits.at/>. + + If you don't know what Access Control Lists are, say N + +config EXT3_FS_SECURITY + bool "Ext3 Security Labels" + depends on EXT3_FS_XATTR + help + Security labels support alternative access control models + implemented by security modules like SELinux. This option + enables an extended attribute handler for file security + labels in the ext3 filesystem. + + If you are not using a security module that requires using + extended attributes for file security labels, say N. diff --git a/fs/ext3/acl.c b/fs/ext3/acl.c index a754d184817..8bbaf5bcf98 100644 --- a/fs/ext3/acl.c +++ b/fs/ext3/acl.c @@ -4,13 +4,7 @@ * Copyright (C) 2001-2003 Andreas Gruenbacher, <agruen@suse.de> */ -#include <linux/init.h> -#include <linux/sched.h> -#include <linux/slab.h> -#include <linux/capability.h> -#include <linux/fs.h> -#include <linux/ext3_jbd.h> -#include <linux/ext3_fs.h> +#include "ext3.h" #include "xattr.h" #include "acl.h" @@ -54,16 +48,23 @@ ext3_acl_from_disk(const void *value, size_t size) case ACL_OTHER: value = (char *)value + sizeof(ext3_acl_entry_short); - acl->a_entries[n].e_id = ACL_UNDEFINED_ID; break; case ACL_USER: + value = (char *)value + sizeof(ext3_acl_entry); + if ((char *)value > end) + goto fail; + acl->a_entries[n].e_uid = + make_kuid(&init_user_ns, + le32_to_cpu(entry->e_id)); + break; case ACL_GROUP: value = (char *)value + sizeof(ext3_acl_entry); if ((char *)value > end) goto fail; - acl->a_entries[n].e_id = - le32_to_cpu(entry->e_id); + acl->a_entries[n].e_gid = + make_kgid(&init_user_ns, + le32_to_cpu(entry->e_id)); break; default: @@ -97,14 +98,19 @@ ext3_acl_to_disk(const struct posix_acl *acl, size_t *size) ext_acl->a_version = cpu_to_le32(EXT3_ACL_VERSION); e = (char *)ext_acl + sizeof(ext3_acl_header); for (n=0; n < acl->a_count; n++) { + const struct posix_acl_entry *acl_e = &acl->a_entries[n]; ext3_acl_entry *entry = (ext3_acl_entry *)e; - entry->e_tag = cpu_to_le16(acl->a_entries[n].e_tag); - entry->e_perm = cpu_to_le16(acl->a_entries[n].e_perm); - switch(acl->a_entries[n].e_tag) { + entry->e_tag = cpu_to_le16(acl_e->e_tag); + entry->e_perm = cpu_to_le16(acl_e->e_perm); + switch(acl_e->e_tag) { case ACL_USER: + entry->e_id = cpu_to_le32( + from_kuid(&init_user_ns, acl_e->e_uid)); + e += sizeof(ext3_acl_entry); + break; case ACL_GROUP: - entry->e_id = - cpu_to_le32(acl->a_entries[n].e_id); + entry->e_id = cpu_to_le32( + from_kgid(&init_user_ns, acl_e->e_gid)); e += sizeof(ext3_acl_entry); break; @@ -126,65 +132,30 @@ fail: return ERR_PTR(-EINVAL); } -static inline struct posix_acl * -ext3_iget_acl(struct inode *inode, struct posix_acl **i_acl) -{ - struct posix_acl *acl = EXT3_ACL_NOT_CACHED; - - spin_lock(&inode->i_lock); - if (*i_acl != EXT3_ACL_NOT_CACHED) - acl = posix_acl_dup(*i_acl); - spin_unlock(&inode->i_lock); - - return acl; -} - -static inline void -ext3_iset_acl(struct inode *inode, struct posix_acl **i_acl, - struct posix_acl *acl) -{ - spin_lock(&inode->i_lock); - if (*i_acl != EXT3_ACL_NOT_CACHED) - posix_acl_release(*i_acl); - *i_acl = posix_acl_dup(acl); - spin_unlock(&inode->i_lock); -} - /* * Inode operation get_posix_acl(). * * inode->i_mutex: don't care */ -static struct posix_acl * +struct posix_acl * ext3_get_acl(struct inode *inode, int type) { - struct ext3_inode_info *ei = EXT3_I(inode); int name_index; char *value = NULL; struct posix_acl *acl; int retval; - if (!test_opt(inode->i_sb, POSIX_ACL)) - return NULL; - - switch(type) { - case ACL_TYPE_ACCESS: - acl = ext3_iget_acl(inode, &ei->i_acl); - if (acl != EXT3_ACL_NOT_CACHED) - return acl; - name_index = EXT3_XATTR_INDEX_POSIX_ACL_ACCESS; - break; - - case ACL_TYPE_DEFAULT: - acl = ext3_iget_acl(inode, &ei->i_default_acl); - if (acl != EXT3_ACL_NOT_CACHED) - return acl; - name_index = EXT3_XATTR_INDEX_POSIX_ACL_DEFAULT; - break; - - default: - return ERR_PTR(-EINVAL); + switch (type) { + case ACL_TYPE_ACCESS: + name_index = EXT3_XATTR_INDEX_POSIX_ACL_ACCESS; + break; + case ACL_TYPE_DEFAULT: + name_index = EXT3_XATTR_INDEX_POSIX_ACL_DEFAULT; + break; + default: + BUG(); } + retval = ext3_xattr_get(inode, name_index, "", NULL, 0); if (retval > 0) { value = kmalloc(retval, GFP_NOFS); @@ -200,17 +171,9 @@ ext3_get_acl(struct inode *inode, int type) acl = ERR_PTR(retval); kfree(value); - if (!IS_ERR(acl)) { - switch(type) { - case ACL_TYPE_ACCESS: - ext3_iset_acl(inode, &ei->i_acl, acl); - break; + if (!IS_ERR(acl)) + set_cached_acl(inode, type, acl); - case ACL_TYPE_DEFAULT: - ext3_iset_acl(inode, &ei->i_default_acl, acl); - break; - } - } return acl; } @@ -220,28 +183,23 @@ ext3_get_acl(struct inode *inode, int type) * inode->i_mutex: down unless called from ext3_new_inode */ static int -ext3_set_acl(handle_t *handle, struct inode *inode, int type, +__ext3_set_acl(handle_t *handle, struct inode *inode, int type, struct posix_acl *acl) { - struct ext3_inode_info *ei = EXT3_I(inode); int name_index; void *value = NULL; size_t size = 0; int error; - if (S_ISLNK(inode->i_mode)) - return -EOPNOTSUPP; - switch(type) { case ACL_TYPE_ACCESS: name_index = EXT3_XATTR_INDEX_POSIX_ACL_ACCESS; if (acl) { - mode_t mode = inode->i_mode; - error = posix_acl_equiv_mode(acl, &mode); + error = posix_acl_equiv_mode(acl, &inode->i_mode); if (error < 0) return error; else { - inode->i_mode = mode; + inode->i_ctime = CURRENT_TIME_SEC; ext3_mark_inode_dirty(handle, inode); if (error == 0) acl = NULL; @@ -268,284 +226,56 @@ ext3_set_acl(handle_t *handle, struct inode *inode, int type, value, size, 0); kfree(value); - if (!error) { - switch(type) { - case ACL_TYPE_ACCESS: - ext3_iset_acl(inode, &ei->i_acl, acl); - break; - case ACL_TYPE_DEFAULT: - ext3_iset_acl(inode, &ei->i_default_acl, acl); - break; - } - } - return error; -} - -static int -ext3_check_acl(struct inode *inode, int mask) -{ - struct posix_acl *acl = ext3_get_acl(inode, ACL_TYPE_ACCESS); - - if (IS_ERR(acl)) - return PTR_ERR(acl); - if (acl) { - int error = posix_acl_permission(inode, acl, mask); - posix_acl_release(acl); - return error; - } - - return -EAGAIN; -} - -int -ext3_permission(struct inode *inode, int mask, struct nameidata *nd) -{ - return generic_permission(inode, mask, ext3_check_acl); -} + if (!error) + set_cached_acl(inode, type, acl); -/* - * Initialize the ACLs of a new inode. Called from ext3_new_inode. - * - * dir->i_mutex: down - * inode->i_mutex: up (access to inode is still exclusive) - */ -int -ext3_init_acl(handle_t *handle, struct inode *inode, struct inode *dir) -{ - struct posix_acl *acl = NULL; - int error = 0; - - if (!S_ISLNK(inode->i_mode)) { - if (test_opt(dir->i_sb, POSIX_ACL)) { - acl = ext3_get_acl(dir, ACL_TYPE_DEFAULT); - if (IS_ERR(acl)) - return PTR_ERR(acl); - } - if (!acl) - inode->i_mode &= ~current->fs->umask; - } - if (test_opt(inode->i_sb, POSIX_ACL) && acl) { - struct posix_acl *clone; - mode_t mode; - - if (S_ISDIR(inode->i_mode)) { - error = ext3_set_acl(handle, inode, - ACL_TYPE_DEFAULT, acl); - if (error) - goto cleanup; - } - clone = posix_acl_clone(acl, GFP_NOFS); - error = -ENOMEM; - if (!clone) - goto cleanup; - - mode = inode->i_mode; - error = posix_acl_create_masq(clone, &mode); - if (error >= 0) { - inode->i_mode = mode; - if (error > 0) { - /* This is an extended ACL */ - error = ext3_set_acl(handle, inode, - ACL_TYPE_ACCESS, clone); - } - } - posix_acl_release(clone); - } -cleanup: - posix_acl_release(acl); return error; } -/* - * Does chmod for an inode that may have an Access Control List. The - * inode->i_mode field must be updated to the desired value by the caller - * before calling this function. - * Returns 0 on success, or a negative error number. - * - * We change the ACL rather than storing some ACL entries in the file - * mode permission bits (which would be more efficient), because that - * would break once additional permissions (like ACL_APPEND, ACL_DELETE - * for directories) are added. There are no more bits available in the - * file mode. - * - * inode->i_mutex: down - */ int -ext3_acl_chmod(struct inode *inode) -{ - struct posix_acl *acl, *clone; - int error; - - if (S_ISLNK(inode->i_mode)) - return -EOPNOTSUPP; - if (!test_opt(inode->i_sb, POSIX_ACL)) - return 0; - acl = ext3_get_acl(inode, ACL_TYPE_ACCESS); - if (IS_ERR(acl) || !acl) - return PTR_ERR(acl); - clone = posix_acl_clone(acl, GFP_KERNEL); - posix_acl_release(acl); - if (!clone) - return -ENOMEM; - error = posix_acl_chmod_masq(clone, inode->i_mode); - if (!error) { - handle_t *handle; - int retries = 0; - - retry: - handle = ext3_journal_start(inode, - EXT3_DATA_TRANS_BLOCKS(inode->i_sb)); - if (IS_ERR(handle)) { - error = PTR_ERR(handle); - ext3_std_error(inode->i_sb, error); - goto out; - } - error = ext3_set_acl(handle, inode, ACL_TYPE_ACCESS, clone); - ext3_journal_stop(handle); - if (error == -ENOSPC && - ext3_should_retry_alloc(inode->i_sb, &retries)) - goto retry; - } -out: - posix_acl_release(clone); - return error; -} - -/* - * Extended attribute handlers - */ -static size_t -ext3_xattr_list_acl_access(struct inode *inode, char *list, size_t list_len, - const char *name, size_t name_len) -{ - const size_t size = sizeof(POSIX_ACL_XATTR_ACCESS); - - if (!test_opt(inode->i_sb, POSIX_ACL)) - return 0; - if (list && size <= list_len) - memcpy(list, POSIX_ACL_XATTR_ACCESS, size); - return size; -} - -static size_t -ext3_xattr_list_acl_default(struct inode *inode, char *list, size_t list_len, - const char *name, size_t name_len) -{ - const size_t size = sizeof(POSIX_ACL_XATTR_DEFAULT); - - if (!test_opt(inode->i_sb, POSIX_ACL)) - return 0; - if (list && size <= list_len) - memcpy(list, POSIX_ACL_XATTR_DEFAULT, size); - return size; -} - -static int -ext3_xattr_get_acl(struct inode *inode, int type, void *buffer, size_t size) -{ - struct posix_acl *acl; - int error; - - if (!test_opt(inode->i_sb, POSIX_ACL)) - return -EOPNOTSUPP; - - acl = ext3_get_acl(inode, type); - if (IS_ERR(acl)) - return PTR_ERR(acl); - if (acl == NULL) - return -ENODATA; - error = posix_acl_to_xattr(acl, buffer, size); - posix_acl_release(acl); - - return error; -} - -static int -ext3_xattr_get_acl_access(struct inode *inode, const char *name, - void *buffer, size_t size) -{ - if (strcmp(name, "") != 0) - return -EINVAL; - return ext3_xattr_get_acl(inode, ACL_TYPE_ACCESS, buffer, size); -} - -static int -ext3_xattr_get_acl_default(struct inode *inode, const char *name, - void *buffer, size_t size) -{ - if (strcmp(name, "") != 0) - return -EINVAL; - return ext3_xattr_get_acl(inode, ACL_TYPE_DEFAULT, buffer, size); -} - -static int -ext3_xattr_set_acl(struct inode *inode, int type, const void *value, - size_t size) +ext3_set_acl(struct inode *inode, struct posix_acl *acl, int type) { handle_t *handle; - struct posix_acl *acl; int error, retries = 0; - if (!test_opt(inode->i_sb, POSIX_ACL)) - return -EOPNOTSUPP; - if (!is_owner_or_cap(inode)) - return -EPERM; - - if (value) { - acl = posix_acl_from_xattr(value, size); - if (IS_ERR(acl)) - return PTR_ERR(acl); - else if (acl) { - error = posix_acl_valid(acl); - if (error) - goto release_and_out; - } - } else - acl = NULL; - retry: handle = ext3_journal_start(inode, EXT3_DATA_TRANS_BLOCKS(inode->i_sb)); if (IS_ERR(handle)) return PTR_ERR(handle); - error = ext3_set_acl(handle, inode, type, acl); + error = __ext3_set_acl(handle, inode, type, acl); ext3_journal_stop(handle); if (error == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries)) goto retry; - -release_and_out: - posix_acl_release(acl); return error; } -static int -ext3_xattr_set_acl_access(struct inode *inode, const char *name, - const void *value, size_t size, int flags) -{ - if (strcmp(name, "") != 0) - return -EINVAL; - return ext3_xattr_set_acl(inode, ACL_TYPE_ACCESS, value, size); -} - -static int -ext3_xattr_set_acl_default(struct inode *inode, const char *name, - const void *value, size_t size, int flags) +/* + * Initialize the ACLs of a new inode. Called from ext3_new_inode. + * + * dir->i_mutex: down + * inode->i_mutex: up (access to inode is still exclusive) + */ +int +ext3_init_acl(handle_t *handle, struct inode *inode, struct inode *dir) { - if (strcmp(name, "") != 0) - return -EINVAL; - return ext3_xattr_set_acl(inode, ACL_TYPE_DEFAULT, value, size); -} + struct posix_acl *default_acl, *acl; + int error; -struct xattr_handler ext3_xattr_acl_access_handler = { - .prefix = POSIX_ACL_XATTR_ACCESS, - .list = ext3_xattr_list_acl_access, - .get = ext3_xattr_get_acl_access, - .set = ext3_xattr_set_acl_access, -}; + error = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl); + if (error) + return error; -struct xattr_handler ext3_xattr_acl_default_handler = { - .prefix = POSIX_ACL_XATTR_DEFAULT, - .list = ext3_xattr_list_acl_default, - .get = ext3_xattr_get_acl_default, - .set = ext3_xattr_set_acl_default, -}; + if (default_acl) { + error = __ext3_set_acl(handle, inode, ACL_TYPE_DEFAULT, + default_acl); + posix_acl_release(default_acl); + } + if (acl) { + if (!error) + error = __ext3_set_acl(handle, inode, ACL_TYPE_ACCESS, + acl); + posix_acl_release(acl); + } + return error; +} diff --git a/fs/ext3/acl.h b/fs/ext3/acl.h index 0d1e6279cbf..ea1c69edab9 100644 --- a/fs/ext3/acl.h +++ b/fs/ext3/acl.h @@ -53,24 +53,15 @@ static inline int ext3_acl_count(size_t size) #ifdef CONFIG_EXT3_FS_POSIX_ACL -/* Value for inode->u.ext3_i.i_acl and inode->u.ext3_i.i_default_acl - if the ACL has not been cached */ -#define EXT3_ACL_NOT_CACHED ((void *)-1) - /* acl.c */ -extern int ext3_permission (struct inode *, int, struct nameidata *); -extern int ext3_acl_chmod (struct inode *); +extern struct posix_acl *ext3_get_acl(struct inode *inode, int type); +extern int ext3_set_acl(struct inode *inode, struct posix_acl *acl, int type); extern int ext3_init_acl (handle_t *, struct inode *, struct inode *); #else /* CONFIG_EXT3_FS_POSIX_ACL */ #include <linux/sched.h> -#define ext3_permission NULL - -static inline int -ext3_acl_chmod(struct inode *inode) -{ - return 0; -} +#define ext3_get_acl NULL +#define ext3_set_acl NULL static inline int ext3_init_acl(handle_t *handle, struct inode *inode, struct inode *dir) diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c index da0cb2c0e43..158b5d4ce06 100644 --- a/fs/ext3/balloc.c +++ b/fs/ext3/balloc.c @@ -11,14 +11,9 @@ * David S. Miller (davem@caip.rutgers.edu), 1995 */ -#include <linux/time.h> -#include <linux/capability.h> -#include <linux/fs.h> -#include <linux/jbd.h> -#include <linux/ext3_fs.h> -#include <linux/ext3_jbd.h> #include <linux/quotaops.h> -#include <linux/buffer_head.h> +#include <linux/blkdev.h> +#include "ext3.h" /* * balloc.c contains the blocks allocation and deallocation routines @@ -38,6 +33,21 @@ #define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1) +/* + * Calculate the block group number and offset, given a block number + */ +static void ext3_get_group_no_and_offset(struct super_block *sb, + ext3_fsblk_t blocknr, unsigned long *blockgrpp, ext3_grpblk_t *offsetp) +{ + struct ext3_super_block *es = EXT3_SB(sb)->s_es; + + blocknr = blocknr - le32_to_cpu(es->s_first_data_block); + if (offsetp) + *offsetp = blocknr % EXT3_BLOCKS_PER_GROUP(sb); + if (blockgrpp) + *blockgrpp = blocknr / EXT3_BLOCKS_PER_GROUP(sb); +} + /** * ext3_get_group_desc() -- load group descriptor from disk * @sb: super block @@ -117,7 +127,7 @@ static int ext3_valid_block_bitmap(struct super_block *sb, return 1; err_out: - ext3_error(sb, __FUNCTION__, + ext3_error(sb, __func__, "Invalid block bitmap - " "block_group = %d, block = %lu", block_group, bitmap_blk); @@ -144,10 +154,11 @@ read_block_bitmap(struct super_block *sb, unsigned int block_group) desc = ext3_get_group_desc(sb, block_group, NULL); if (!desc) return NULL; + trace_ext3_read_block_bitmap(sb, block_group); bitmap_blk = le32_to_cpu(desc->bg_block_bitmap); bh = sb_getblk(sb, bitmap_blk); if (unlikely(!bh)) { - ext3_error(sb, __FUNCTION__, + ext3_error(sb, __func__, "Cannot read block bitmap - " "block_group = %d, block_bitmap = %u", block_group, le32_to_cpu(desc->bg_block_bitmap)); @@ -158,16 +169,17 @@ read_block_bitmap(struct super_block *sb, unsigned int block_group) if (bh_submit_read(bh) < 0) { brelse(bh); - ext3_error(sb, __FUNCTION__, + ext3_error(sb, __func__, "Cannot read block bitmap - " "block_group = %d, block_bitmap = %u", block_group, le32_to_cpu(desc->bg_block_bitmap)); return NULL; } - if (!ext3_valid_block_bitmap(sb, desc, block_group, bh)) { - brelse(bh); - return NULL; - } + ext3_valid_block_bitmap(sb, desc, block_group, bh); + /* + * file system mounted not to panic on error, continue with corrupt + * bitmap + */ return bh; } /* @@ -232,11 +244,10 @@ restart: prev = rsv; } printk("Window map complete.\n"); - if (bad) - BUG(); + BUG_ON(bad); } #define rsv_window_dump(root, verbose) \ - __rsv_window_dump((root), (verbose), __FUNCTION__) + __rsv_window_dump((root), (verbose), __func__) #else #define rsv_window_dump(root, verbose) do {} while (0) #endif @@ -334,6 +345,7 @@ void ext3_rsv_window_add(struct super_block *sb, struct rb_node * parent = NULL; struct ext3_reserve_window_node *this; + trace_ext3_rsv_window_add(sb, rsv); while (*p) { parent = *p; @@ -407,7 +419,7 @@ static inline int rsv_is_empty(struct ext3_reserve_window *rsv) void ext3_init_block_alloc_info(struct inode *inode) { struct ext3_inode_info *ei = EXT3_I(inode); - struct ext3_block_alloc_info *block_i = ei->i_block_alloc_info; + struct ext3_block_alloc_info *block_i; struct super_block *sb = inode->i_sb; block_i = kmalloc(sizeof(*block_i), GFP_NOFS); @@ -459,8 +471,10 @@ void ext3_discard_reservation(struct inode *inode) rsv = &block_i->rsv_window_node; if (!rsv_is_empty(&rsv->rsv_window)) { spin_lock(rsv_lock); - if (!rsv_is_empty(&rsv->rsv_window)) + if (!rsv_is_empty(&rsv->rsv_window)) { + trace_ext3_discard_reservation(inode, rsv); rsv_window_remove(inode->i_sb, rsv); + } spin_unlock(rsv_lock); } } @@ -469,7 +483,7 @@ void ext3_discard_reservation(struct inode *inode) * ext3_free_blocks_sb() -- Free given blocks and update quota * @handle: handle to this transaction * @sb: super block - * @block: start physcial block to free + * @block: start physical block to free * @count: number of blocks to free * @pdquot_freed_blocks: pointer to quota */ @@ -573,7 +587,7 @@ do_more: BUFFER_TRACE(debug_bh, "Deleted!"); if (!bh2jh(bitmap_bh)->b_committed_data) BUFFER_TRACE(debug_bh, - "No commited data in bitmap"); + "No committed data in bitmap"); BUFFER_TRACE2(debug_bh, bitmap_bh, "bitmap"); __brelse(debug_bh); } @@ -618,7 +632,7 @@ do_more: if (!ext3_clear_bit_atomic(sb_bgl_lock(sbi, block_group), bit + i, bitmap_bh->b_data)) { jbd_unlock_bh_state(bitmap_bh); - ext3_error(sb, __FUNCTION__, + ext3_error(sb, __func__, "bit already cleared for block "E3FSBLK, block + i); jbd_lock_bh_state(bitmap_bh); @@ -649,7 +663,7 @@ do_more: count = overflow; goto do_more; } - sb->s_dirt = 1; + error_return: brelse(bitmap_bh); ext3_std_error(sb, err); @@ -666,17 +680,13 @@ error_return: void ext3_free_blocks(handle_t *handle, struct inode *inode, ext3_fsblk_t block, unsigned long count) { - struct super_block * sb; + struct super_block *sb = inode->i_sb; unsigned long dquot_freed_blocks; - sb = inode->i_sb; - if (!sb) { - printk ("ext3_free_blocks: nonexistent device"); - return; - } + trace_ext3_free_blocks(inode, block, count); ext3_free_blocks_sb(handle, sb, block, count, &dquot_freed_blocks); if (dquot_freed_blocks) - DQUOT_FREE_BLOCK(inode, dquot_freed_blocks); + dquot_free_block(inode, dquot_freed_blocks); return; } @@ -791,9 +801,9 @@ find_next_usable_block(ext3_grpblk_t start, struct buffer_head *bh, if (here < 0) here = 0; - p = ((char *)bh->b_data) + (here >> 3); + p = bh->b_data + (here >> 3); r = memscan(p, 0, ((maxblocks + 7) >> 3) - (here >> 3)); - next = (r - ((char *)bh->b_data)) << 3; + next = (r - bh->b_data) << 3; if (next < maxblocks && next >= start && ext3_test_allocatable(next, bh)) return next; @@ -809,8 +819,9 @@ find_next_usable_block(ext3_grpblk_t start, struct buffer_head *bh, /** * claim_block() + * @lock: the spin lock for this block group * @block: the free block (group relative) to allocate - * @bh: the bufferhead containts the block group bitmap + * @bh: the buffer_head contains the block group bitmap * * We think we can allocate this block in this bitmap. Try to set the bit. * If that succeeds then check that nobody has allocated and then freed the @@ -955,9 +966,11 @@ fail_access: * but we will shift to the place where start_block is, * then start from there, when looking for a reservable space. * - * @size: the target new reservation window size + * @my_rsv: the reservation window + * + * @sb: the super block * - * @group_first_block: the first block we consider to start + * @start_block: the first block we consider to start * the real search from * * @last_block: @@ -1043,7 +1056,7 @@ static int find_next_reservable_window( rsv_window_remove(sb, my_rsv); /* - * Let's book the whole avaliable window for now. We will check the + * Let's book the whole available window for now. We will check the * disk bitmap later and then, if there are free blocks then we adjust * the window size if it's larger than requested. * Otherwise, we will remove this node from the tree next time @@ -1083,7 +1096,7 @@ static int find_next_reservable_window( * * failed: we failed to find a reservation window in this group * - * @rsv: the reservation + * @my_rsv: the reservation window * * @grp_goal: The goal (group-relative). It is where the search for a * free reservable space should start from. @@ -1116,6 +1129,7 @@ static int alloc_new_reservation(struct ext3_reserve_window_node *my_rsv, else start_block = grp_goal + group_first_block; + trace_ext3_alloc_new_reservation(sb, start_block); size = my_rsv->rsv_goal_size; if (!rsv_is_empty(&my_rsv->rsv_window)) { @@ -1210,8 +1224,11 @@ retry: * check if the first free block is within the * free space we just reserved */ - if (start_block >= my_rsv->rsv_start && start_block <= my_rsv->rsv_end) + if (start_block >= my_rsv->rsv_start && + start_block <= my_rsv->rsv_end) { + trace_ext3_reserved(sb, start_block, my_rsv); return 0; /* success */ + } /* * if the first free bit we found is out of the reservable space * continue search for next reservable space, @@ -1272,8 +1289,8 @@ static void try_to_extend_reservation(struct ext3_reserve_window_node *my_rsv, * @group: given allocation block group * @bitmap_bh: bufferhead holds the block bitmap * @grp_goal: given target block within the group - * @count: target number of blocks to allocate * @my_rsv: reservation window + * @count: target number of blocks to allocate * @errp: pointer to store the error code * * This is the main function used to allocate a new block and its reservation @@ -1415,15 +1432,16 @@ out: * * Check if filesystem has at least 1 free block available for allocation. */ -static int ext3_has_free_blocks(struct ext3_sb_info *sbi) +static int ext3_has_free_blocks(struct ext3_sb_info *sbi, int use_reservation) { ext3_fsblk_t free_blocks, root_blocks; free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter); root_blocks = le32_to_cpu(sbi->s_es->s_r_blocks_count); if (free_blocks < root_blocks + 1 && !capable(CAP_SYS_RESOURCE) && - sbi->s_resuid != current->fsuid && - (sbi->s_resgid == 0 || !in_group_p (sbi->s_resgid))) { + !use_reservation && !uid_eq(sbi->s_resuid, current_fsuid()) && + (gid_eq(sbi->s_resgid, GLOBAL_ROOT_GID) || + !in_group_p (sbi->s_resgid))) { return 0; } return 1; @@ -1436,14 +1454,14 @@ static int ext3_has_free_blocks(struct ext3_sb_info *sbi) * * ext3_should_retry_alloc() is called when ENOSPC is returned, and if * it is profitable to retry the operation, this function will wait - * for the current or commiting transaction to complete, and then + * for the current or committing transaction to complete, and then * return TRUE. * * if the total number of retries exceed three times, return FALSE. */ int ext3_should_retry_alloc(struct super_block *sb, int *retries) { - if (!ext3_has_free_blocks(EXT3_SB(sb)) || (*retries)++ > 3) + if (!ext3_has_free_blocks(EXT3_SB(sb), 0) || (*retries)++ > 3) return 0; jbd_debug(1, "%s: retrying operation after ENOSPC\n", sb->s_id); @@ -1494,21 +1512,20 @@ ext3_fsblk_t ext3_new_blocks(handle_t *handle, struct inode *inode, *errp = -ENOSPC; sb = inode->i_sb; - if (!sb) { - printk("ext3_new_block: nonexistent device"); - return 0; - } /* * Check quota for allocation of this block. */ - if (DQUOT_ALLOC_BLOCK(inode, num)) { - *errp = -EDQUOT; + err = dquot_alloc_block(inode, num); + if (err) { + *errp = err; return 0; } + trace_ext3_request_blocks(inode, goal, num); + sbi = EXT3_SB(sb); - es = EXT3_SB(sb)->s_es; + es = sbi->s_es; ext3_debug("goal=%lu.\n", goal); /* * Allocate a block from reservation only when @@ -1522,7 +1539,7 @@ ext3_fsblk_t ext3_new_blocks(handle_t *handle, struct inode *inode, if (block_i && ((windowsz = block_i->rsv_window_node.rsv_goal_size) > 0)) my_rsv = &block_i->rsv_window_node; - if (!ext3_has_free_blocks(sbi)) { + if (!ext3_has_free_blocks(sbi, IS_NOQUOTA(inode))) { *errp = -ENOSPC; goto out; } @@ -1547,6 +1564,7 @@ retry_alloc: * turn off reservation for this allocation */ if (my_rsv && (free_blocks < windowsz) + && (free_blocks > 0) && (rsv_is_empty(&my_rsv->rsv_window))) my_rsv = NULL; @@ -1581,11 +1599,17 @@ retry_alloc: goto io_error; free_blocks = le16_to_cpu(gdp->bg_free_blocks_count); /* + * skip this group (and avoid loading bitmap) if there + * are no free blocks + */ + if (!free_blocks) + continue; + /* * skip this group if the number of * free blocks is less than half of the reservation * window size. */ - if (free_blocks <= (windowsz/2)) + if (my_rsv && (free_blocks <= (windowsz/2))) continue; brelse(bitmap_bh); @@ -1604,9 +1628,9 @@ retry_alloc: goto allocated; } /* - * We may end up a bogus ealier ENOSPC error due to + * We may end up a bogus earlier ENOSPC error due to * filesystem is "full" of reservations, but - * there maybe indeed free blocks avaliable on disk + * there maybe indeed free blocks available on disk * In this case, we just forget about the reservations * just do block allocation as without reservations. */ @@ -1642,7 +1666,11 @@ allocated: "Allocating block in system zone - " "blocks from "E3FSBLK", length %lu", ret_block, num); - goto out; + /* + * claim_block() marked the blocks we allocated as in use. So we + * may want to selectively mark some of the blocks as free. + */ + goto retry_alloc; } performed_allocation = 1; @@ -1668,7 +1696,7 @@ allocated: if (ext3_test_bit(grp_alloc_blk+i, bh2jh(bitmap_bh)->b_committed_data)) { printk("%s: block was unexpectedly set in " - "b_committed_data\n", __FUNCTION__); + "b_committed_data\n", __func__); } } } @@ -1699,18 +1727,21 @@ allocated: percpu_counter_sub(&sbi->s_freeblocks_counter, num); BUFFER_TRACE(gdp_bh, "journal_dirty_metadata for group descriptor"); - err = ext3_journal_dirty_metadata(handle, gdp_bh); - if (!fatal) - fatal = err; - - sb->s_dirt = 1; + fatal = ext3_journal_dirty_metadata(handle, gdp_bh); if (fatal) goto out; *errp = 0; brelse(bitmap_bh); - DQUOT_FREE_BLOCK(inode, *count-num); - *count = num; + + if (num < *count) { + dquot_free_block(inode, *count-num); + *count = num; + } + + trace_ext3_allocate_blocks(inode, goal, num, + (unsigned long long)ret_block); + return ret_block; io_error: @@ -1724,7 +1755,7 @@ out: * Undo the block allocation */ if (!performed_allocation) - DQUOT_FREE_BLOCK(inode, *count); + dquot_free_block(inode, *count); brelse(bitmap_bh); return 0; } @@ -1779,7 +1810,7 @@ ext3_fsblk_t ext3_count_free_blocks(struct super_block *sb) brelse(bitmap_bh); printk("ext3_count_free_blocks: stored = "E3FSBLK ", computed = "E3FSBLK", "E3FSBLK"\n", - le32_to_cpu(es->s_free_blocks_count), + (ext3_fsblk_t)le32_to_cpu(es->s_free_blocks_count), desc_count, bitmap_count); return bitmap_count; #else @@ -1870,3 +1901,258 @@ unsigned long ext3_bg_num_gdb(struct super_block *sb, int group) return ext3_bg_num_gdb_meta(sb,group); } + +/** + * ext3_trim_all_free -- function to trim all free space in alloc. group + * @sb: super block for file system + * @group: allocation group to trim + * @start: first group block to examine + * @max: last group block to examine + * @gdp: allocation group description structure + * @minblocks: minimum extent block count + * + * ext3_trim_all_free walks through group's block bitmap searching for free + * blocks. When the free block is found, it tries to allocate this block and + * consequent free block to get the biggest free extent possible, until it + * reaches any used block. Then issue a TRIM command on this extent and free + * the extent in the block bitmap. This is done until whole group is scanned. + */ +static ext3_grpblk_t ext3_trim_all_free(struct super_block *sb, + unsigned int group, + ext3_grpblk_t start, ext3_grpblk_t max, + ext3_grpblk_t minblocks) +{ + handle_t *handle; + ext3_grpblk_t next, free_blocks, bit, freed, count = 0; + ext3_fsblk_t discard_block; + struct ext3_sb_info *sbi; + struct buffer_head *gdp_bh, *bitmap_bh = NULL; + struct ext3_group_desc *gdp; + int err = 0, ret = 0; + + /* + * We will update one block bitmap, and one group descriptor + */ + handle = ext3_journal_start_sb(sb, 2); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + bitmap_bh = read_block_bitmap(sb, group); + if (!bitmap_bh) { + err = -EIO; + goto err_out; + } + + BUFFER_TRACE(bitmap_bh, "getting undo access"); + err = ext3_journal_get_undo_access(handle, bitmap_bh); + if (err) + goto err_out; + + gdp = ext3_get_group_desc(sb, group, &gdp_bh); + if (!gdp) { + err = -EIO; + goto err_out; + } + + BUFFER_TRACE(gdp_bh, "get_write_access"); + err = ext3_journal_get_write_access(handle, gdp_bh); + if (err) + goto err_out; + + free_blocks = le16_to_cpu(gdp->bg_free_blocks_count); + sbi = EXT3_SB(sb); + + /* Walk through the whole group */ + while (start <= max) { + start = bitmap_search_next_usable_block(start, bitmap_bh, max); + if (start < 0) + break; + next = start; + + /* + * Allocate contiguous free extents by setting bits in the + * block bitmap + */ + while (next <= max + && claim_block(sb_bgl_lock(sbi, group), + next, bitmap_bh)) { + next++; + } + + /* We did not claim any blocks */ + if (next == start) + continue; + + discard_block = (ext3_fsblk_t)start + + ext3_group_first_block_no(sb, group); + + /* Update counters */ + spin_lock(sb_bgl_lock(sbi, group)); + le16_add_cpu(&gdp->bg_free_blocks_count, start - next); + spin_unlock(sb_bgl_lock(sbi, group)); + percpu_counter_sub(&sbi->s_freeblocks_counter, next - start); + + free_blocks -= next - start; + /* Do not issue a TRIM on extents smaller than minblocks */ + if ((next - start) < minblocks) + goto free_extent; + + trace_ext3_discard_blocks(sb, discard_block, next - start); + /* Send the TRIM command down to the device */ + err = sb_issue_discard(sb, discard_block, next - start, + GFP_NOFS, 0); + count += (next - start); +free_extent: + freed = 0; + + /* + * Clear bits in the bitmap + */ + for (bit = start; bit < next; bit++) { + BUFFER_TRACE(bitmap_bh, "clear bit"); + if (!ext3_clear_bit_atomic(sb_bgl_lock(sbi, group), + bit, bitmap_bh->b_data)) { + ext3_error(sb, __func__, + "bit already cleared for block "E3FSBLK, + (unsigned long)bit); + BUFFER_TRACE(bitmap_bh, "bit already cleared"); + } else { + freed++; + } + } + + /* Update couters */ + spin_lock(sb_bgl_lock(sbi, group)); + le16_add_cpu(&gdp->bg_free_blocks_count, freed); + spin_unlock(sb_bgl_lock(sbi, group)); + percpu_counter_add(&sbi->s_freeblocks_counter, freed); + + start = next; + if (err < 0) { + if (err != -EOPNOTSUPP) + ext3_warning(sb, __func__, "Discard command " + "returned error %d\n", err); + break; + } + + if (fatal_signal_pending(current)) { + err = -ERESTARTSYS; + break; + } + + cond_resched(); + + /* No more suitable extents */ + if (free_blocks < minblocks) + break; + } + + /* We dirtied the bitmap block */ + BUFFER_TRACE(bitmap_bh, "dirtied bitmap block"); + ret = ext3_journal_dirty_metadata(handle, bitmap_bh); + if (!err) + err = ret; + + /* And the group descriptor block */ + BUFFER_TRACE(gdp_bh, "dirtied group descriptor block"); + ret = ext3_journal_dirty_metadata(handle, gdp_bh); + if (!err) + err = ret; + + ext3_debug("trimmed %d blocks in the group %d\n", + count, group); + +err_out: + if (err) + count = err; + ext3_journal_stop(handle); + brelse(bitmap_bh); + + return count; +} + +/** + * ext3_trim_fs() -- trim ioctl handle function + * @sb: superblock for filesystem + * @start: First Byte to trim + * @len: number of Bytes to trim from start + * @minlen: minimum extent length in Bytes + * + * ext3_trim_fs goes through all allocation groups containing Bytes from + * start to start+len. For each such a group ext3_trim_all_free function + * is invoked to trim all free space. + */ +int ext3_trim_fs(struct super_block *sb, struct fstrim_range *range) +{ + ext3_grpblk_t last_block, first_block; + unsigned long group, first_group, last_group; + struct ext3_group_desc *gdp; + struct ext3_super_block *es = EXT3_SB(sb)->s_es; + uint64_t start, minlen, end, trimmed = 0; + ext3_fsblk_t first_data_blk = + le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block); + ext3_fsblk_t max_blks = le32_to_cpu(es->s_blocks_count); + int ret = 0; + + start = range->start >> sb->s_blocksize_bits; + end = start + (range->len >> sb->s_blocksize_bits) - 1; + minlen = range->minlen >> sb->s_blocksize_bits; + + if (minlen > EXT3_BLOCKS_PER_GROUP(sb) || + start >= max_blks || + range->len < sb->s_blocksize) + return -EINVAL; + if (end >= max_blks) + end = max_blks - 1; + if (end <= first_data_blk) + goto out; + if (start < first_data_blk) + start = first_data_blk; + + smp_rmb(); + + /* Determine first and last group to examine based on start and len */ + ext3_get_group_no_and_offset(sb, (ext3_fsblk_t) start, + &first_group, &first_block); + ext3_get_group_no_and_offset(sb, (ext3_fsblk_t) end, + &last_group, &last_block); + + /* end now represents the last block to discard in this group */ + end = EXT3_BLOCKS_PER_GROUP(sb) - 1; + + for (group = first_group; group <= last_group; group++) { + gdp = ext3_get_group_desc(sb, group, NULL); + if (!gdp) + break; + + /* + * For all the groups except the last one, last block will + * always be EXT3_BLOCKS_PER_GROUP(sb)-1, so we only need to + * change it for the last group, note that last_block is + * already computed earlier by ext3_get_group_no_and_offset() + */ + if (group == last_group) + end = last_block; + + if (le16_to_cpu(gdp->bg_free_blocks_count) >= minlen) { + ret = ext3_trim_all_free(sb, group, first_block, + end, minlen); + if (ret < 0) + break; + trimmed += ret; + } + + /* + * For every group except the first one, we are sure + * that the first block to discard will be block #0. + */ + first_block = 0; + } + + if (ret > 0) + ret = 0; + +out: + range->len = trimmed * sb->s_blocksize; + return ret; +} diff --git a/fs/ext3/bitmap.c b/fs/ext3/bitmap.c index 6afc39d8025..ef9c643e8e9 100644 --- a/fs/ext3/bitmap.c +++ b/fs/ext3/bitmap.c @@ -7,25 +7,13 @@ * Universite Pierre et Marie Curie (Paris VI) */ -#include <linux/buffer_head.h> -#include <linux/jbd.h> -#include <linux/ext3_fs.h> +#include "ext3.h" #ifdef EXT3FS_DEBUG -static const int nibblemap[] = {4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0}; - unsigned long ext3_count_free (struct buffer_head * map, unsigned int numchars) { - unsigned int i; - unsigned long sum = 0; - - if (!map) - return (0); - for (i = 0; i < numchars; i++) - sum += nibblemap[map->b_data[i] & 0xf] + - nibblemap[(map->b_data[i] >> 4) & 0xf]; - return (sum); + return numchars * BITS_PER_BYTE - memweight(map->b_data, numchars); } #endif /* EXT3FS_DEBUG */ diff --git a/fs/ext3/dir.c b/fs/ext3/dir.c index 8ca3bfd7242..17742eed2c1 100644 --- a/fs/ext3/dir.c +++ b/fs/ext3/dir.c @@ -21,35 +21,14 @@ * */ -#include <linux/fs.h> -#include <linux/jbd.h> -#include <linux/ext3_fs.h> -#include <linux/buffer_head.h> -#include <linux/slab.h> -#include <linux/rbtree.h> +#include <linux/compat.h> +#include "ext3.h" static unsigned char ext3_filetype_table[] = { DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK }; -static int ext3_readdir(struct file *, void *, filldir_t); -static int ext3_dx_readdir(struct file * filp, - void * dirent, filldir_t filldir); -static int ext3_release_dir (struct inode * inode, - struct file * filp); - -const struct file_operations ext3_dir_operations = { - .llseek = generic_file_llseek, - .read = generic_read_dir, - .readdir = ext3_readdir, /* we take BKL. needed?*/ - .ioctl = ext3_ioctl, /* BKL held */ -#ifdef CONFIG_COMPAT - .compat_ioctl = ext3_compat_ioctl, -#endif - .fsync = ext3_sync_file, /* BKL held */ - .release = ext3_release_dir, -}; - +static int ext3_dx_readdir(struct file *, struct dir_context *); static unsigned char get_dtype(struct super_block *sb, int filetype) { @@ -60,6 +39,25 @@ static unsigned char get_dtype(struct super_block *sb, int filetype) return (ext3_filetype_table[filetype]); } +/** + * Check if the given dir-inode refers to an htree-indexed directory + * (or a directory which could potentially get converted to use htree + * indexing). + * + * Return 1 if it is a dx dir, 0 if not + */ +static int is_dx_dir(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + + if (EXT3_HAS_COMPAT_FEATURE(inode->i_sb, + EXT3_FEATURE_COMPAT_DIR_INDEX) && + ((EXT3_I(inode)->i_flags & EXT3_INDEX_FL) || + ((inode->i_size >> sb->s_blocksize_bits) == 1))) + return 1; + + return 0; +} int ext3_check_dir_entry (const char * function, struct inode * dir, struct ext3_dir_entry_2 * de, @@ -69,77 +67,67 @@ int ext3_check_dir_entry (const char * function, struct inode * dir, const char * error_msg = NULL; const int rlen = ext3_rec_len_from_disk(de->rec_len); - if (rlen < EXT3_DIR_REC_LEN(1)) + if (unlikely(rlen < EXT3_DIR_REC_LEN(1))) error_msg = "rec_len is smaller than minimal"; - else if (rlen % 4 != 0) + else if (unlikely(rlen % 4 != 0)) error_msg = "rec_len % 4 != 0"; - else if (rlen < EXT3_DIR_REC_LEN(de->name_len)) + else if (unlikely(rlen < EXT3_DIR_REC_LEN(de->name_len))) error_msg = "rec_len is too small for name_len"; - else if (((char *) de - bh->b_data) + rlen > dir->i_sb->s_blocksize) + else if (unlikely((((char *) de - bh->b_data) + rlen > dir->i_sb->s_blocksize))) error_msg = "directory entry across blocks"; - else if (le32_to_cpu(de->inode) > - le32_to_cpu(EXT3_SB(dir->i_sb)->s_es->s_inodes_count)) + else if (unlikely(le32_to_cpu(de->inode) > + le32_to_cpu(EXT3_SB(dir->i_sb)->s_es->s_inodes_count))) error_msg = "inode out of bounds"; - if (error_msg != NULL) + if (unlikely(error_msg != NULL)) ext3_error (dir->i_sb, function, "bad entry in directory #%lu: %s - " "offset=%lu, inode=%lu, rec_len=%d, name_len=%d", dir->i_ino, error_msg, offset, (unsigned long) le32_to_cpu(de->inode), rlen, de->name_len); + return error_msg == NULL ? 1 : 0; } -static int ext3_readdir(struct file * filp, - void * dirent, filldir_t filldir) +static int ext3_readdir(struct file *file, struct dir_context *ctx) { - int error = 0; unsigned long offset; - int i, stored; + int i; struct ext3_dir_entry_2 *de; - struct super_block *sb; int err; - struct inode *inode = filp->f_path.dentry->d_inode; - int ret = 0; - - sb = inode->i_sb; - - if (EXT3_HAS_COMPAT_FEATURE(inode->i_sb, - EXT3_FEATURE_COMPAT_DIR_INDEX) && - ((EXT3_I(inode)->i_flags & EXT3_INDEX_FL) || - ((inode->i_size >> sb->s_blocksize_bits) == 1))) { - err = ext3_dx_readdir(filp, dirent, filldir); - if (err != ERR_BAD_DX_DIR) { - ret = err; - goto out; - } + struct inode *inode = file_inode(file); + struct super_block *sb = inode->i_sb; + int dir_has_error = 0; + + if (is_dx_dir(inode)) { + err = ext3_dx_readdir(file, ctx); + if (err != ERR_BAD_DX_DIR) + return err; /* * We don't set the inode dirty flag since it's not * critical that it get flushed back to the disk. */ - EXT3_I(filp->f_path.dentry->d_inode)->i_flags &= ~EXT3_INDEX_FL; + EXT3_I(inode)->i_flags &= ~EXT3_INDEX_FL; } - stored = 0; - offset = filp->f_pos & (sb->s_blocksize - 1); + offset = ctx->pos & (sb->s_blocksize - 1); - while (!error && !stored && filp->f_pos < inode->i_size) { - unsigned long blk = filp->f_pos >> EXT3_BLOCK_SIZE_BITS(sb); + while (ctx->pos < inode->i_size) { + unsigned long blk = ctx->pos >> EXT3_BLOCK_SIZE_BITS(sb); struct buffer_head map_bh; struct buffer_head *bh = NULL; map_bh.b_state = 0; - err = ext3_get_blocks_handle(NULL, inode, blk, 1, - &map_bh, 0, 0); + err = ext3_get_blocks_handle(NULL, inode, blk, 1, &map_bh, 0); if (err > 0) { pgoff_t index = map_bh.b_blocknr >> (PAGE_CACHE_SHIFT - inode->i_blkbits); - if (!ra_has_index(&filp->f_ra, index)) + if (!ra_has_index(&file->f_ra, index)) page_cache_sync_readahead( sb->s_bdev->bd_inode->i_mapping, - &filp->f_ra, filp, + &file->f_ra, file, index, 1); - filp->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT; + file->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT; bh = ext3_bread(NULL, inode, blk, 0, &err); } @@ -148,22 +136,24 @@ static int ext3_readdir(struct file * filp, * of recovering data when there's a bad sector */ if (!bh) { - ext3_error (sb, "ext3_readdir", - "directory #%lu contains a hole at offset %lu", - inode->i_ino, (unsigned long)filp->f_pos); + if (!dir_has_error) { + ext3_error(sb, __func__, "directory #%lu " + "contains a hole at offset %lld", + inode->i_ino, ctx->pos); + dir_has_error = 1; + } /* corrupt size? Maybe no more blocks to read */ - if (filp->f_pos > inode->i_blocks << 9) + if (ctx->pos > inode->i_blocks << 9) break; - filp->f_pos += sb->s_blocksize - offset; + ctx->pos += sb->s_blocksize - offset; continue; } -revalidate: /* If the dir block has changed since the last call to * readdir(2), then we might be pointing to an invalid * dirent right now. Scan from the start of the block * to make sure. */ - if (filp->f_version != inode->i_version) { + if (offset && file->f_version != inode->i_version) { for (i = 0; i < sb->s_blocksize && i < offset; ) { de = (struct ext3_dir_entry_2 *) (bh->b_data + i); @@ -179,71 +169,124 @@ revalidate: i += ext3_rec_len_from_disk(de->rec_len); } offset = i; - filp->f_pos = (filp->f_pos & ~(sb->s_blocksize - 1)) + ctx->pos = (ctx->pos & ~(sb->s_blocksize - 1)) | offset; - filp->f_version = inode->i_version; + file->f_version = inode->i_version; } - while (!error && filp->f_pos < inode->i_size + while (ctx->pos < inode->i_size && offset < sb->s_blocksize) { de = (struct ext3_dir_entry_2 *) (bh->b_data + offset); if (!ext3_check_dir_entry ("ext3_readdir", inode, de, bh, offset)) { - /* On error, skip the f_pos to the + /* On error, skip the to the next block. */ - filp->f_pos = (filp->f_pos | + ctx->pos = (ctx->pos | (sb->s_blocksize - 1)) + 1; - brelse (bh); - ret = stored; - goto out; + break; } offset += ext3_rec_len_from_disk(de->rec_len); if (le32_to_cpu(de->inode)) { - /* We might block in the next section - * if the data destination is - * currently swapped out. So, use a - * version stamp to detect whether or - * not the directory has been modified - * during the copy operation. - */ - u64 version = filp->f_version; - - error = filldir(dirent, de->name, - de->name_len, - filp->f_pos, - le32_to_cpu(de->inode), - get_dtype(sb, de->file_type)); - if (error) - break; - if (version != filp->f_version) - goto revalidate; - stored ++; + if (!dir_emit(ctx, de->name, de->name_len, + le32_to_cpu(de->inode), + get_dtype(sb, de->file_type))) { + brelse(bh); + return 0; + } } - filp->f_pos += ext3_rec_len_from_disk(de->rec_len); + ctx->pos += ext3_rec_len_from_disk(de->rec_len); } offset = 0; brelse (bh); + if (ctx->pos < inode->i_size) + if (!dir_relax(inode)) + return 0; } -out: - return ret; + return 0; +} + +static inline int is_32bit_api(void) +{ +#ifdef CONFIG_COMPAT + return is_compat_task(); +#else + return (BITS_PER_LONG == 32); +#endif } /* * These functions convert from the major/minor hash to an f_pos - * value. + * value for dx directories + * + * Upper layer (for example NFS) should specify FMODE_32BITHASH or + * FMODE_64BITHASH explicitly. On the other hand, we allow ext3 to be mounted + * directly on both 32-bit and 64-bit nodes, under such case, neither + * FMODE_32BITHASH nor FMODE_64BITHASH is specified. + */ +static inline loff_t hash2pos(struct file *filp, __u32 major, __u32 minor) +{ + if ((filp->f_mode & FMODE_32BITHASH) || + (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api())) + return major >> 1; + else + return ((__u64)(major >> 1) << 32) | (__u64)minor; +} + +static inline __u32 pos2maj_hash(struct file *filp, loff_t pos) +{ + if ((filp->f_mode & FMODE_32BITHASH) || + (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api())) + return (pos << 1) & 0xffffffff; + else + return ((pos >> 32) << 1) & 0xffffffff; +} + +static inline __u32 pos2min_hash(struct file *filp, loff_t pos) +{ + if ((filp->f_mode & FMODE_32BITHASH) || + (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api())) + return 0; + else + return pos & 0xffffffff; +} + +/* + * Return 32- or 64-bit end-of-file for dx directories + */ +static inline loff_t ext3_get_htree_eof(struct file *filp) +{ + if ((filp->f_mode & FMODE_32BITHASH) || + (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api())) + return EXT3_HTREE_EOF_32BIT; + else + return EXT3_HTREE_EOF_64BIT; +} + + +/* + * ext3_dir_llseek() calls generic_file_llseek[_size]() to handle both + * non-htree and htree directories, where the "offset" is in terms + * of the filename hash value instead of the byte offset. + * + * Because we may return a 64-bit hash that is well beyond s_maxbytes, + * we need to pass the max hash as the maximum allowable offset in + * the htree directory case. * - * Currently we only use major hash numer. This is unfortunate, but - * on 32-bit machines, the same VFS interface is used for lseek and - * llseek, so if we use the 64 bit offset, then the 32-bit versions of - * lseek/telldir/seekdir will blow out spectacularly, and from within - * the ext2 low-level routine, we don't know if we're being called by - * a 64-bit version of the system call or the 32-bit version of the - * system call. Worse yet, NFSv2 only allows for a 32-bit readdir - * cookie. Sigh. + * NOTE: offsets obtained *before* ext3_set_inode_flag(dir, EXT3_INODE_INDEX) + * will be invalid once the directory was converted into a dx directory */ -#define hash2pos(major, minor) (major >> 1) -#define pos2maj_hash(pos) ((pos << 1) & 0xffffffff) -#define pos2min_hash(pos) (0) +static loff_t ext3_dir_llseek(struct file *file, loff_t offset, int whence) +{ + struct inode *inode = file->f_mapping->host; + int dx_dir = is_dx_dir(inode); + loff_t htree_max = ext3_get_htree_eof(file); + + if (likely(dx_dir)) + return generic_file_llseek_size(file, offset, whence, + htree_max, htree_max); + else + return generic_file_llseek(file, offset, whence); +} /* * This structure holds the nodes of the red-black tree used to store @@ -266,59 +309,28 @@ struct fname { */ static void free_rb_tree_fname(struct rb_root *root) { - struct rb_node *n = root->rb_node; - struct rb_node *parent; - struct fname *fname; - - while (n) { - /* Do the node's children first */ - if ((n)->rb_left) { - n = n->rb_left; - continue; - } - if (n->rb_right) { - n = n->rb_right; - continue; - } - /* - * The node has no children; free it, and then zero - * out parent's link to it. Finally go to the - * beginning of the loop and try to free the parent - * node. - */ - parent = rb_parent(n); - fname = rb_entry(n, struct fname, rb_hash); - while (fname) { - struct fname * old = fname; + struct fname *fname, *next; + + rbtree_postorder_for_each_entry_safe(fname, next, root, rb_hash) + do { + struct fname *old = fname; fname = fname->next; - kfree (old); - } - if (!parent) - root->rb_node = NULL; - else if (parent->rb_left == n) - parent->rb_left = NULL; - else if (parent->rb_right == n) - parent->rb_right = NULL; - n = parent; - } - root->rb_node = NULL; -} + kfree(old); + } while (fname); + *root = RB_ROOT; +} -static struct dir_private_info *create_dir_info(loff_t pos) +static struct dir_private_info *ext3_htree_create_dir_info(struct file *filp, + loff_t pos) { struct dir_private_info *p; - p = kmalloc(sizeof(struct dir_private_info), GFP_KERNEL); + p = kzalloc(sizeof(struct dir_private_info), GFP_KERNEL); if (!p) return NULL; - p->root.rb_node = NULL; - p->curr_node = NULL; - p->extra_fname = NULL; - p->last_pos = 0; - p->curr_hash = pos2maj_hash(pos); - p->curr_minor_hash = pos2min_hash(pos); - p->next_hash = 0; + p->curr_hash = pos2maj_hash(filp, pos); + p->curr_minor_hash = pos2min_hash(filp, pos); return p; } @@ -393,73 +405,66 @@ int ext3_htree_store_dirent(struct file *dir_file, __u32 hash, * for all entres on the fname linked list. (Normally there is only * one entry on the linked list, unless there are 62 bit hash collisions.) */ -static int call_filldir(struct file * filp, void * dirent, - filldir_t filldir, struct fname *fname) +static bool call_filldir(struct file *file, struct dir_context *ctx, + struct fname *fname) { - struct dir_private_info *info = filp->private_data; - loff_t curr_pos; - struct inode *inode = filp->f_path.dentry->d_inode; - struct super_block * sb; - int error; - - sb = inode->i_sb; + struct dir_private_info *info = file->private_data; + struct inode *inode = file_inode(file); + struct super_block *sb = inode->i_sb; if (!fname) { printk("call_filldir: called with null fname?!?\n"); - return 0; + return true; } - curr_pos = hash2pos(fname->hash, fname->minor_hash); + ctx->pos = hash2pos(file, fname->hash, fname->minor_hash); while (fname) { - error = filldir(dirent, fname->name, - fname->name_len, curr_pos, + if (!dir_emit(ctx, fname->name, fname->name_len, fname->inode, - get_dtype(sb, fname->file_type)); - if (error) { - filp->f_pos = curr_pos; - info->extra_fname = fname->next; - return error; + get_dtype(sb, fname->file_type))) { + info->extra_fname = fname; + return false; } fname = fname->next; } - return 0; + return true; } -static int ext3_dx_readdir(struct file * filp, - void * dirent, filldir_t filldir) +static int ext3_dx_readdir(struct file *file, struct dir_context *ctx) { - struct dir_private_info *info = filp->private_data; - struct inode *inode = filp->f_path.dentry->d_inode; + struct dir_private_info *info = file->private_data; + struct inode *inode = file_inode(file); struct fname *fname; int ret; if (!info) { - info = create_dir_info(filp->f_pos); + info = ext3_htree_create_dir_info(file, ctx->pos); if (!info) return -ENOMEM; - filp->private_data = info; + file->private_data = info; } - if (filp->f_pos == EXT3_HTREE_EOF) + if (ctx->pos == ext3_get_htree_eof(file)) return 0; /* EOF */ /* Some one has messed with f_pos; reset the world */ - if (info->last_pos != filp->f_pos) { + if (info->last_pos != ctx->pos) { free_rb_tree_fname(&info->root); info->curr_node = NULL; info->extra_fname = NULL; - info->curr_hash = pos2maj_hash(filp->f_pos); - info->curr_minor_hash = pos2min_hash(filp->f_pos); + info->curr_hash = pos2maj_hash(file, ctx->pos); + info->curr_minor_hash = pos2min_hash(file, ctx->pos); } /* * If there are any leftover names on the hash collision * chain, return them first. */ - if (info->extra_fname && - call_filldir(filp, dirent, filldir, info->extra_fname)) - goto finished; - - if (!info->curr_node) + if (info->extra_fname) { + if (!call_filldir(file, ctx, info->extra_fname)) + goto finished; + info->extra_fname = NULL; + goto next_node; + } else if (!info->curr_node) info->curr_node = rb_first(&info->root); while (1) { @@ -469,17 +474,17 @@ static int ext3_dx_readdir(struct file * filp, * cached entries. */ if ((!info->curr_node) || - (filp->f_version != inode->i_version)) { + (file->f_version != inode->i_version)) { info->curr_node = NULL; free_rb_tree_fname(&info->root); - filp->f_version = inode->i_version; - ret = ext3_htree_fill_tree(filp, info->curr_hash, + file->f_version = inode->i_version; + ret = ext3_htree_fill_tree(file, info->curr_hash, info->curr_minor_hash, &info->next_hash); if (ret < 0) return ret; if (ret == 0) { - filp->f_pos = EXT3_HTREE_EOF; + ctx->pos = ext3_get_htree_eof(file); break; } info->curr_node = rb_first(&info->root); @@ -488,13 +493,18 @@ static int ext3_dx_readdir(struct file * filp, fname = rb_entry(info->curr_node, struct fname, rb_hash); info->curr_hash = fname->hash; info->curr_minor_hash = fname->minor_hash; - if (call_filldir(filp, dirent, filldir, fname)) + if (!call_filldir(file, ctx, fname)) break; - + next_node: info->curr_node = rb_next(info->curr_node); - if (!info->curr_node) { + if (info->curr_node) { + fname = rb_entry(info->curr_node, struct fname, + rb_hash); + info->curr_hash = fname->hash; + info->curr_minor_hash = fname->minor_hash; + } else { if (info->next_hash == ~0) { - filp->f_pos = EXT3_HTREE_EOF; + ctx->pos = ext3_get_htree_eof(file); break; } info->curr_hash = info->next_hash; @@ -502,7 +512,7 @@ static int ext3_dx_readdir(struct file * filp, } } finished: - info->last_pos = filp->f_pos; + info->last_pos = ctx->pos; return 0; } @@ -513,3 +523,15 @@ static int ext3_release_dir (struct inode * inode, struct file * filp) return 0; } + +const struct file_operations ext3_dir_operations = { + .llseek = ext3_dir_llseek, + .read = generic_read_dir, + .iterate = ext3_readdir, + .unlocked_ioctl = ext3_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = ext3_compat_ioctl, +#endif + .fsync = ext3_sync_file, + .release = ext3_release_dir, +}; diff --git a/fs/ext3/ext3.h b/fs/ext3/ext3.h new file mode 100644 index 00000000000..e85ff15a060 --- /dev/null +++ b/fs/ext3/ext3.h @@ -0,0 +1,1326 @@ +/* + * Written by Stephen C. Tweedie <sct@redhat.com>, 1999 + * + * Copyright 1998--1999 Red Hat corp --- All Rights Reserved + * + * This file is part of the Linux kernel and is made available under + * the terms of the GNU General Public License, version 2, or at your + * option, any later version, incorporated herein by reference. + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/include/linux/minix_fs.h + * + * Copyright (C) 1991, 1992 Linus Torvalds + */ + +#include <linux/fs.h> +#include <linux/jbd.h> +#include <linux/magic.h> +#include <linux/bug.h> +#include <linux/blockgroup_lock.h> + +/* + * The second extended filesystem constants/structures + */ + +/* + * Define EXT3FS_DEBUG to produce debug messages + */ +#undef EXT3FS_DEBUG + +/* + * Define EXT3_RESERVATION to reserve data blocks for expanding files + */ +#define EXT3_DEFAULT_RESERVE_BLOCKS 8 +/*max window size: 1024(direct blocks) + 3([t,d]indirect blocks) */ +#define EXT3_MAX_RESERVE_BLOCKS 1027 +#define EXT3_RESERVE_WINDOW_NOT_ALLOCATED 0 + +/* + * Debug code + */ +#ifdef EXT3FS_DEBUG +#define ext3_debug(f, a...) \ + do { \ + printk (KERN_DEBUG "EXT3-fs DEBUG (%s, %d): %s:", \ + __FILE__, __LINE__, __func__); \ + printk (KERN_DEBUG f, ## a); \ + } while (0) +#else +#define ext3_debug(f, a...) do {} while (0) +#endif + +/* + * Special inodes numbers + */ +#define EXT3_BAD_INO 1 /* Bad blocks inode */ +#define EXT3_ROOT_INO 2 /* Root inode */ +#define EXT3_BOOT_LOADER_INO 5 /* Boot loader inode */ +#define EXT3_UNDEL_DIR_INO 6 /* Undelete directory inode */ +#define EXT3_RESIZE_INO 7 /* Reserved group descriptors inode */ +#define EXT3_JOURNAL_INO 8 /* Journal inode */ + +/* First non-reserved inode for old ext3 filesystems */ +#define EXT3_GOOD_OLD_FIRST_INO 11 + +/* + * Maximal count of links to a file + */ +#define EXT3_LINK_MAX 32000 + +/* + * Macro-instructions used to manage several block sizes + */ +#define EXT3_MIN_BLOCK_SIZE 1024 +#define EXT3_MAX_BLOCK_SIZE 65536 +#define EXT3_MIN_BLOCK_LOG_SIZE 10 +#define EXT3_BLOCK_SIZE(s) ((s)->s_blocksize) +#define EXT3_ADDR_PER_BLOCK(s) (EXT3_BLOCK_SIZE(s) / sizeof (__u32)) +#define EXT3_BLOCK_SIZE_BITS(s) ((s)->s_blocksize_bits) +#define EXT3_ADDR_PER_BLOCK_BITS(s) (EXT3_SB(s)->s_addr_per_block_bits) +#define EXT3_INODE_SIZE(s) (EXT3_SB(s)->s_inode_size) +#define EXT3_FIRST_INO(s) (EXT3_SB(s)->s_first_ino) + +/* + * Macro-instructions used to manage fragments + */ +#define EXT3_MIN_FRAG_SIZE 1024 +#define EXT3_MAX_FRAG_SIZE 4096 +#define EXT3_MIN_FRAG_LOG_SIZE 10 +#define EXT3_FRAG_SIZE(s) (EXT3_SB(s)->s_frag_size) +#define EXT3_FRAGS_PER_BLOCK(s) (EXT3_SB(s)->s_frags_per_block) + +/* + * Structure of a blocks group descriptor + */ +struct ext3_group_desc +{ + __le32 bg_block_bitmap; /* Blocks bitmap block */ + __le32 bg_inode_bitmap; /* Inodes bitmap block */ + __le32 bg_inode_table; /* Inodes table block */ + __le16 bg_free_blocks_count; /* Free blocks count */ + __le16 bg_free_inodes_count; /* Free inodes count */ + __le16 bg_used_dirs_count; /* Directories count */ + __u16 bg_pad; + __le32 bg_reserved[3]; +}; + +/* + * Macro-instructions used to manage group descriptors + */ +#define EXT3_BLOCKS_PER_GROUP(s) (EXT3_SB(s)->s_blocks_per_group) +#define EXT3_DESC_PER_BLOCK(s) (EXT3_SB(s)->s_desc_per_block) +#define EXT3_INODES_PER_GROUP(s) (EXT3_SB(s)->s_inodes_per_group) +#define EXT3_DESC_PER_BLOCK_BITS(s) (EXT3_SB(s)->s_desc_per_block_bits) + +/* + * Constants relative to the data blocks + */ +#define EXT3_NDIR_BLOCKS 12 +#define EXT3_IND_BLOCK EXT3_NDIR_BLOCKS +#define EXT3_DIND_BLOCK (EXT3_IND_BLOCK + 1) +#define EXT3_TIND_BLOCK (EXT3_DIND_BLOCK + 1) +#define EXT3_N_BLOCKS (EXT3_TIND_BLOCK + 1) + +/* + * Inode flags + */ +#define EXT3_SECRM_FL 0x00000001 /* Secure deletion */ +#define EXT3_UNRM_FL 0x00000002 /* Undelete */ +#define EXT3_COMPR_FL 0x00000004 /* Compress file */ +#define EXT3_SYNC_FL 0x00000008 /* Synchronous updates */ +#define EXT3_IMMUTABLE_FL 0x00000010 /* Immutable file */ +#define EXT3_APPEND_FL 0x00000020 /* writes to file may only append */ +#define EXT3_NODUMP_FL 0x00000040 /* do not dump file */ +#define EXT3_NOATIME_FL 0x00000080 /* do not update atime */ +/* Reserved for compression usage... */ +#define EXT3_DIRTY_FL 0x00000100 +#define EXT3_COMPRBLK_FL 0x00000200 /* One or more compressed clusters */ +#define EXT3_NOCOMPR_FL 0x00000400 /* Don't compress */ +#define EXT3_ECOMPR_FL 0x00000800 /* Compression error */ +/* End compression flags --- maybe not all used */ +#define EXT3_INDEX_FL 0x00001000 /* hash-indexed directory */ +#define EXT3_IMAGIC_FL 0x00002000 /* AFS directory */ +#define EXT3_JOURNAL_DATA_FL 0x00004000 /* file data should be journaled */ +#define EXT3_NOTAIL_FL 0x00008000 /* file tail should not be merged */ +#define EXT3_DIRSYNC_FL 0x00010000 /* dirsync behaviour (directories only) */ +#define EXT3_TOPDIR_FL 0x00020000 /* Top of directory hierarchies*/ +#define EXT3_RESERVED_FL 0x80000000 /* reserved for ext3 lib */ + +#define EXT3_FL_USER_VISIBLE 0x0003DFFF /* User visible flags */ +#define EXT3_FL_USER_MODIFIABLE 0x000380FF /* User modifiable flags */ + +/* Flags that should be inherited by new inodes from their parent. */ +#define EXT3_FL_INHERITED (EXT3_SECRM_FL | EXT3_UNRM_FL | EXT3_COMPR_FL |\ + EXT3_SYNC_FL | EXT3_NODUMP_FL |\ + EXT3_NOATIME_FL | EXT3_COMPRBLK_FL |\ + EXT3_NOCOMPR_FL | EXT3_JOURNAL_DATA_FL |\ + EXT3_NOTAIL_FL | EXT3_DIRSYNC_FL) + +/* Flags that are appropriate for regular files (all but dir-specific ones). */ +#define EXT3_REG_FLMASK (~(EXT3_DIRSYNC_FL | EXT3_TOPDIR_FL)) + +/* Flags that are appropriate for non-directories/regular files. */ +#define EXT3_OTHER_FLMASK (EXT3_NODUMP_FL | EXT3_NOATIME_FL) + +/* Mask out flags that are inappropriate for the given type of inode. */ +static inline __u32 ext3_mask_flags(umode_t mode, __u32 flags) +{ + if (S_ISDIR(mode)) + return flags; + else if (S_ISREG(mode)) + return flags & EXT3_REG_FLMASK; + else + return flags & EXT3_OTHER_FLMASK; +} + +/* Used to pass group descriptor data when online resize is done */ +struct ext3_new_group_input { + __u32 group; /* Group number for this data */ + __u32 block_bitmap; /* Absolute block number of block bitmap */ + __u32 inode_bitmap; /* Absolute block number of inode bitmap */ + __u32 inode_table; /* Absolute block number of inode table start */ + __u32 blocks_count; /* Total number of blocks in this group */ + __u16 reserved_blocks; /* Number of reserved blocks in this group */ + __u16 unused; +}; + +/* The struct ext3_new_group_input in kernel space, with free_blocks_count */ +struct ext3_new_group_data { + __u32 group; + __u32 block_bitmap; + __u32 inode_bitmap; + __u32 inode_table; + __u32 blocks_count; + __u16 reserved_blocks; + __u16 unused; + __u32 free_blocks_count; +}; + + +/* + * ioctl commands + */ +#define EXT3_IOC_GETFLAGS FS_IOC_GETFLAGS +#define EXT3_IOC_SETFLAGS FS_IOC_SETFLAGS +#define EXT3_IOC_GETVERSION _IOR('f', 3, long) +#define EXT3_IOC_SETVERSION _IOW('f', 4, long) +#define EXT3_IOC_GROUP_EXTEND _IOW('f', 7, unsigned long) +#define EXT3_IOC_GROUP_ADD _IOW('f', 8,struct ext3_new_group_input) +#define EXT3_IOC_GETVERSION_OLD FS_IOC_GETVERSION +#define EXT3_IOC_SETVERSION_OLD FS_IOC_SETVERSION +#ifdef CONFIG_JBD_DEBUG +#define EXT3_IOC_WAIT_FOR_READONLY _IOR('f', 99, long) +#endif +#define EXT3_IOC_GETRSVSZ _IOR('f', 5, long) +#define EXT3_IOC_SETRSVSZ _IOW('f', 6, long) + +/* + * ioctl commands in 32 bit emulation + */ +#define EXT3_IOC32_GETFLAGS FS_IOC32_GETFLAGS +#define EXT3_IOC32_SETFLAGS FS_IOC32_SETFLAGS +#define EXT3_IOC32_GETVERSION _IOR('f', 3, int) +#define EXT3_IOC32_SETVERSION _IOW('f', 4, int) +#define EXT3_IOC32_GETRSVSZ _IOR('f', 5, int) +#define EXT3_IOC32_SETRSVSZ _IOW('f', 6, int) +#define EXT3_IOC32_GROUP_EXTEND _IOW('f', 7, unsigned int) +#ifdef CONFIG_JBD_DEBUG +#define EXT3_IOC32_WAIT_FOR_READONLY _IOR('f', 99, int) +#endif +#define EXT3_IOC32_GETVERSION_OLD FS_IOC32_GETVERSION +#define EXT3_IOC32_SETVERSION_OLD FS_IOC32_SETVERSION + + +/* + * Mount options + */ +struct ext3_mount_options { + unsigned long s_mount_opt; + kuid_t s_resuid; + kgid_t s_resgid; + unsigned long s_commit_interval; +#ifdef CONFIG_QUOTA + int s_jquota_fmt; + char *s_qf_names[MAXQUOTAS]; +#endif +}; + +/* + * Structure of an inode on the disk + */ +struct ext3_inode { + __le16 i_mode; /* File mode */ + __le16 i_uid; /* Low 16 bits of Owner Uid */ + __le32 i_size; /* Size in bytes */ + __le32 i_atime; /* Access time */ + __le32 i_ctime; /* Creation time */ + __le32 i_mtime; /* Modification time */ + __le32 i_dtime; /* Deletion Time */ + __le16 i_gid; /* Low 16 bits of Group Id */ + __le16 i_links_count; /* Links count */ + __le32 i_blocks; /* Blocks count */ + __le32 i_flags; /* File flags */ + union { + struct { + __u32 l_i_reserved1; + } linux1; + struct { + __u32 h_i_translator; + } hurd1; + struct { + __u32 m_i_reserved1; + } masix1; + } osd1; /* OS dependent 1 */ + __le32 i_block[EXT3_N_BLOCKS];/* Pointers to blocks */ + __le32 i_generation; /* File version (for NFS) */ + __le32 i_file_acl; /* File ACL */ + __le32 i_dir_acl; /* Directory ACL */ + __le32 i_faddr; /* Fragment address */ + union { + struct { + __u8 l_i_frag; /* Fragment number */ + __u8 l_i_fsize; /* Fragment size */ + __u16 i_pad1; + __le16 l_i_uid_high; /* these 2 fields */ + __le16 l_i_gid_high; /* were reserved2[0] */ + __u32 l_i_reserved2; + } linux2; + struct { + __u8 h_i_frag; /* Fragment number */ + __u8 h_i_fsize; /* Fragment size */ + __u16 h_i_mode_high; + __u16 h_i_uid_high; + __u16 h_i_gid_high; + __u32 h_i_author; + } hurd2; + struct { + __u8 m_i_frag; /* Fragment number */ + __u8 m_i_fsize; /* Fragment size */ + __u16 m_pad1; + __u32 m_i_reserved2[2]; + } masix2; + } osd2; /* OS dependent 2 */ + __le16 i_extra_isize; + __le16 i_pad1; +}; + +#define i_size_high i_dir_acl + +#define i_reserved1 osd1.linux1.l_i_reserved1 +#define i_frag osd2.linux2.l_i_frag +#define i_fsize osd2.linux2.l_i_fsize +#define i_uid_low i_uid +#define i_gid_low i_gid +#define i_uid_high osd2.linux2.l_i_uid_high +#define i_gid_high osd2.linux2.l_i_gid_high +#define i_reserved2 osd2.linux2.l_i_reserved2 + +/* + * File system states + */ +#define EXT3_VALID_FS 0x0001 /* Unmounted cleanly */ +#define EXT3_ERROR_FS 0x0002 /* Errors detected */ +#define EXT3_ORPHAN_FS 0x0004 /* Orphans being recovered */ + +/* + * Misc. filesystem flags + */ +#define EXT2_FLAGS_SIGNED_HASH 0x0001 /* Signed dirhash in use */ +#define EXT2_FLAGS_UNSIGNED_HASH 0x0002 /* Unsigned dirhash in use */ +#define EXT2_FLAGS_TEST_FILESYS 0x0004 /* to test development code */ + +/* + * Mount flags + */ +#define EXT3_MOUNT_CHECK 0x00001 /* Do mount-time checks */ +/* EXT3_MOUNT_OLDALLOC was there */ +#define EXT3_MOUNT_GRPID 0x00004 /* Create files with directory's group */ +#define EXT3_MOUNT_DEBUG 0x00008 /* Some debugging messages */ +#define EXT3_MOUNT_ERRORS_CONT 0x00010 /* Continue on errors */ +#define EXT3_MOUNT_ERRORS_RO 0x00020 /* Remount fs ro on errors */ +#define EXT3_MOUNT_ERRORS_PANIC 0x00040 /* Panic on errors */ +#define EXT3_MOUNT_MINIX_DF 0x00080 /* Mimics the Minix statfs */ +#define EXT3_MOUNT_NOLOAD 0x00100 /* Don't use existing journal*/ +#define EXT3_MOUNT_ABORT 0x00200 /* Fatal error detected */ +#define EXT3_MOUNT_DATA_FLAGS 0x00C00 /* Mode for data writes: */ +#define EXT3_MOUNT_JOURNAL_DATA 0x00400 /* Write data to journal */ +#define EXT3_MOUNT_ORDERED_DATA 0x00800 /* Flush data before commit */ +#define EXT3_MOUNT_WRITEBACK_DATA 0x00C00 /* No data ordering */ +#define EXT3_MOUNT_UPDATE_JOURNAL 0x01000 /* Update the journal format */ +#define EXT3_MOUNT_NO_UID32 0x02000 /* Disable 32-bit UIDs */ +#define EXT3_MOUNT_XATTR_USER 0x04000 /* Extended user attributes */ +#define EXT3_MOUNT_POSIX_ACL 0x08000 /* POSIX Access Control Lists */ +#define EXT3_MOUNT_RESERVATION 0x10000 /* Preallocation */ +#define EXT3_MOUNT_BARRIER 0x20000 /* Use block barriers */ +#define EXT3_MOUNT_QUOTA 0x80000 /* Some quota option set */ +#define EXT3_MOUNT_USRQUOTA 0x100000 /* "old" user quota */ +#define EXT3_MOUNT_GRPQUOTA 0x200000 /* "old" group quota */ +#define EXT3_MOUNT_DATA_ERR_ABORT 0x400000 /* Abort on file data write + * error in ordered mode */ + +/* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ +#ifndef _LINUX_EXT2_FS_H +#define clear_opt(o, opt) o &= ~EXT3_MOUNT_##opt +#define set_opt(o, opt) o |= EXT3_MOUNT_##opt +#define test_opt(sb, opt) (EXT3_SB(sb)->s_mount_opt & \ + EXT3_MOUNT_##opt) +#else +#define EXT2_MOUNT_NOLOAD EXT3_MOUNT_NOLOAD +#define EXT2_MOUNT_ABORT EXT3_MOUNT_ABORT +#define EXT2_MOUNT_DATA_FLAGS EXT3_MOUNT_DATA_FLAGS +#endif + +#define ext3_set_bit __set_bit_le +#define ext3_set_bit_atomic ext2_set_bit_atomic +#define ext3_clear_bit __clear_bit_le +#define ext3_clear_bit_atomic ext2_clear_bit_atomic +#define ext3_test_bit test_bit_le +#define ext3_find_next_zero_bit find_next_zero_bit_le + +/* + * Maximal mount counts between two filesystem checks + */ +#define EXT3_DFL_MAX_MNT_COUNT 20 /* Allow 20 mounts */ +#define EXT3_DFL_CHECKINTERVAL 0 /* Don't use interval check */ + +/* + * Behaviour when detecting errors + */ +#define EXT3_ERRORS_CONTINUE 1 /* Continue execution */ +#define EXT3_ERRORS_RO 2 /* Remount fs read-only */ +#define EXT3_ERRORS_PANIC 3 /* Panic */ +#define EXT3_ERRORS_DEFAULT EXT3_ERRORS_CONTINUE + +/* + * Structure of the super block + */ +struct ext3_super_block { +/*00*/ __le32 s_inodes_count; /* Inodes count */ + __le32 s_blocks_count; /* Blocks count */ + __le32 s_r_blocks_count; /* Reserved blocks count */ + __le32 s_free_blocks_count; /* Free blocks count */ +/*10*/ __le32 s_free_inodes_count; /* Free inodes count */ + __le32 s_first_data_block; /* First Data Block */ + __le32 s_log_block_size; /* Block size */ + __le32 s_log_frag_size; /* Fragment size */ +/*20*/ __le32 s_blocks_per_group; /* # Blocks per group */ + __le32 s_frags_per_group; /* # Fragments per group */ + __le32 s_inodes_per_group; /* # Inodes per group */ + __le32 s_mtime; /* Mount time */ +/*30*/ __le32 s_wtime; /* Write time */ + __le16 s_mnt_count; /* Mount count */ + __le16 s_max_mnt_count; /* Maximal mount count */ + __le16 s_magic; /* Magic signature */ + __le16 s_state; /* File system state */ + __le16 s_errors; /* Behaviour when detecting errors */ + __le16 s_minor_rev_level; /* minor revision level */ +/*40*/ __le32 s_lastcheck; /* time of last check */ + __le32 s_checkinterval; /* max. time between checks */ + __le32 s_creator_os; /* OS */ + __le32 s_rev_level; /* Revision level */ +/*50*/ __le16 s_def_resuid; /* Default uid for reserved blocks */ + __le16 s_def_resgid; /* Default gid for reserved blocks */ + /* + * These fields are for EXT3_DYNAMIC_REV superblocks only. + * + * Note: the difference between the compatible feature set and + * the incompatible feature set is that if there is a bit set + * in the incompatible feature set that the kernel doesn't + * know about, it should refuse to mount the filesystem. + * + * e2fsck's requirements are more strict; if it doesn't know + * about a feature in either the compatible or incompatible + * feature set, it must abort and not try to meddle with + * things it doesn't understand... + */ + __le32 s_first_ino; /* First non-reserved inode */ + __le16 s_inode_size; /* size of inode structure */ + __le16 s_block_group_nr; /* block group # of this superblock */ + __le32 s_feature_compat; /* compatible feature set */ +/*60*/ __le32 s_feature_incompat; /* incompatible feature set */ + __le32 s_feature_ro_compat; /* readonly-compatible feature set */ +/*68*/ __u8 s_uuid[16]; /* 128-bit uuid for volume */ +/*78*/ char s_volume_name[16]; /* volume name */ +/*88*/ char s_last_mounted[64]; /* directory where last mounted */ +/*C8*/ __le32 s_algorithm_usage_bitmap; /* For compression */ + /* + * Performance hints. Directory preallocation should only + * happen if the EXT3_FEATURE_COMPAT_DIR_PREALLOC flag is on. + */ + __u8 s_prealloc_blocks; /* Nr of blocks to try to preallocate*/ + __u8 s_prealloc_dir_blocks; /* Nr to preallocate for dirs */ + __le16 s_reserved_gdt_blocks; /* Per group desc for online growth */ + /* + * Journaling support valid if EXT3_FEATURE_COMPAT_HAS_JOURNAL set. + */ +/*D0*/ __u8 s_journal_uuid[16]; /* uuid of journal superblock */ +/*E0*/ __le32 s_journal_inum; /* inode number of journal file */ + __le32 s_journal_dev; /* device number of journal file */ + __le32 s_last_orphan; /* start of list of inodes to delete */ + __le32 s_hash_seed[4]; /* HTREE hash seed */ + __u8 s_def_hash_version; /* Default hash version to use */ + __u8 s_reserved_char_pad; + __u16 s_reserved_word_pad; + __le32 s_default_mount_opts; + __le32 s_first_meta_bg; /* First metablock block group */ + __le32 s_mkfs_time; /* When the filesystem was created */ + __le32 s_jnl_blocks[17]; /* Backup of the journal inode */ + /* 64bit support valid if EXT4_FEATURE_COMPAT_64BIT */ +/*150*/ __le32 s_blocks_count_hi; /* Blocks count */ + __le32 s_r_blocks_count_hi; /* Reserved blocks count */ + __le32 s_free_blocks_count_hi; /* Free blocks count */ + __le16 s_min_extra_isize; /* All inodes have at least # bytes */ + __le16 s_want_extra_isize; /* New inodes should reserve # bytes */ + __le32 s_flags; /* Miscellaneous flags */ + __le16 s_raid_stride; /* RAID stride */ + __le16 s_mmp_interval; /* # seconds to wait in MMP checking */ + __le64 s_mmp_block; /* Block for multi-mount protection */ + __le32 s_raid_stripe_width; /* blocks on all data disks (N*stride)*/ + __u8 s_log_groups_per_flex; /* FLEX_BG group size */ + __u8 s_reserved_char_pad2; + __le16 s_reserved_pad; + __u32 s_reserved[162]; /* Padding to the end of the block */ +}; + +/* data type for block offset of block group */ +typedef int ext3_grpblk_t; + +/* data type for filesystem-wide blocks number */ +typedef unsigned long ext3_fsblk_t; + +#define E3FSBLK "%lu" + +struct ext3_reserve_window { + ext3_fsblk_t _rsv_start; /* First byte reserved */ + ext3_fsblk_t _rsv_end; /* Last byte reserved or 0 */ +}; + +struct ext3_reserve_window_node { + struct rb_node rsv_node; + __u32 rsv_goal_size; + __u32 rsv_alloc_hit; + struct ext3_reserve_window rsv_window; +}; + +struct ext3_block_alloc_info { + /* information about reservation window */ + struct ext3_reserve_window_node rsv_window_node; + /* + * was i_next_alloc_block in ext3_inode_info + * is the logical (file-relative) number of the + * most-recently-allocated block in this file. + * We use this for detecting linearly ascending allocation requests. + */ + __u32 last_alloc_logical_block; + /* + * Was i_next_alloc_goal in ext3_inode_info + * is the *physical* companion to i_next_alloc_block. + * it the physical block number of the block which was most-recentl + * allocated to this file. This give us the goal (target) for the next + * allocation when we detect linearly ascending requests. + */ + ext3_fsblk_t last_alloc_physical_block; +}; + +#define rsv_start rsv_window._rsv_start +#define rsv_end rsv_window._rsv_end + +/* + * third extended file system inode data in memory + */ +struct ext3_inode_info { + __le32 i_data[15]; /* unconverted */ + __u32 i_flags; +#ifdef EXT3_FRAGMENTS + __u32 i_faddr; + __u8 i_frag_no; + __u8 i_frag_size; +#endif + ext3_fsblk_t i_file_acl; + __u32 i_dir_acl; + __u32 i_dtime; + + /* + * i_block_group is the number of the block group which contains + * this file's inode. Constant across the lifetime of the inode, + * it is ued for making block allocation decisions - we try to + * place a file's data blocks near its inode block, and new inodes + * near to their parent directory's inode. + */ + __u32 i_block_group; + unsigned long i_state_flags; /* Dynamic state flags for ext3 */ + + /* block reservation info */ + struct ext3_block_alloc_info *i_block_alloc_info; + + __u32 i_dir_start_lookup; +#ifdef CONFIG_EXT3_FS_XATTR + /* + * Extended attributes can be read independently of the main file + * data. Taking i_mutex even when reading would cause contention + * between readers of EAs and writers of regular file data, so + * instead we synchronize on xattr_sem when reading or changing + * EAs. + */ + struct rw_semaphore xattr_sem; +#endif + + struct list_head i_orphan; /* unlinked but open inodes */ + + /* + * i_disksize keeps track of what the inode size is ON DISK, not + * in memory. During truncate, i_size is set to the new size by + * the VFS prior to calling ext3_truncate(), but the filesystem won't + * set i_disksize to 0 until the truncate is actually under way. + * + * The intent is that i_disksize always represents the blocks which + * are used by this file. This allows recovery to restart truncate + * on orphans if we crash during truncate. We actually write i_disksize + * into the on-disk inode when writing inodes out, instead of i_size. + * + * The only time when i_disksize and i_size may be different is when + * a truncate is in progress. The only things which change i_disksize + * are ext3_get_block (growth) and ext3_truncate (shrinkth). + */ + loff_t i_disksize; + + /* on-disk additional length */ + __u16 i_extra_isize; + + /* + * truncate_mutex is for serialising ext3_truncate() against + * ext3_getblock(). In the 2.4 ext2 design, great chunks of inode's + * data tree are chopped off during truncate. We can't do that in + * ext3 because whenever we perform intermediate commits during + * truncate, the inode and all the metadata blocks *must* be in a + * consistent state which allows truncation of the orphans to restart + * during recovery. Hence we must fix the get_block-vs-truncate race + * by other means, so we have truncate_mutex. + */ + struct mutex truncate_mutex; + + /* + * Transactions that contain inode's metadata needed to complete + * fsync and fdatasync, respectively. + */ + atomic_t i_sync_tid; + atomic_t i_datasync_tid; + + struct inode vfs_inode; +}; + +/* + * third extended-fs super-block data in memory + */ +struct ext3_sb_info { + unsigned long s_frag_size; /* Size of a fragment in bytes */ + unsigned long s_frags_per_block;/* Number of fragments per block */ + unsigned long s_inodes_per_block;/* Number of inodes per block */ + unsigned long s_frags_per_group;/* Number of fragments in a group */ + unsigned long s_blocks_per_group;/* Number of blocks in a group */ + unsigned long s_inodes_per_group;/* Number of inodes in a group */ + unsigned long s_itb_per_group; /* Number of inode table blocks per group */ + unsigned long s_gdb_count; /* Number of group descriptor blocks */ + unsigned long s_desc_per_block; /* Number of group descriptors per block */ + unsigned long s_groups_count; /* Number of groups in the fs */ + unsigned long s_overhead_last; /* Last calculated overhead */ + unsigned long s_blocks_last; /* Last seen block count */ + struct buffer_head * s_sbh; /* Buffer containing the super block */ + struct ext3_super_block * s_es; /* Pointer to the super block in the buffer */ + struct buffer_head ** s_group_desc; + unsigned long s_mount_opt; + ext3_fsblk_t s_sb_block; + kuid_t s_resuid; + kgid_t s_resgid; + unsigned short s_mount_state; + unsigned short s_pad; + int s_addr_per_block_bits; + int s_desc_per_block_bits; + int s_inode_size; + int s_first_ino; + spinlock_t s_next_gen_lock; + u32 s_next_generation; + u32 s_hash_seed[4]; + int s_def_hash_version; + int s_hash_unsigned; /* 3 if hash should be signed, 0 if not */ + struct percpu_counter s_freeblocks_counter; + struct percpu_counter s_freeinodes_counter; + struct percpu_counter s_dirs_counter; + struct blockgroup_lock *s_blockgroup_lock; + + /* root of the per fs reservation window tree */ + spinlock_t s_rsv_window_lock; + struct rb_root s_rsv_window_root; + struct ext3_reserve_window_node s_rsv_window_head; + + /* Journaling */ + struct inode * s_journal_inode; + struct journal_s * s_journal; + struct list_head s_orphan; + struct mutex s_orphan_lock; + struct mutex s_resize_lock; + unsigned long s_commit_interval; + struct block_device *journal_bdev; +#ifdef CONFIG_QUOTA + char *s_qf_names[MAXQUOTAS]; /* Names of quota files with journalled quota */ + int s_jquota_fmt; /* Format of quota to use */ +#endif +}; + +static inline spinlock_t * +sb_bgl_lock(struct ext3_sb_info *sbi, unsigned int block_group) +{ + return bgl_lock_ptr(sbi->s_blockgroup_lock, block_group); +} + +static inline struct ext3_sb_info * EXT3_SB(struct super_block *sb) +{ + return sb->s_fs_info; +} +static inline struct ext3_inode_info *EXT3_I(struct inode *inode) +{ + return container_of(inode, struct ext3_inode_info, vfs_inode); +} + +static inline int ext3_valid_inum(struct super_block *sb, unsigned long ino) +{ + return ino == EXT3_ROOT_INO || + ino == EXT3_JOURNAL_INO || + ino == EXT3_RESIZE_INO || + (ino >= EXT3_FIRST_INO(sb) && + ino <= le32_to_cpu(EXT3_SB(sb)->s_es->s_inodes_count)); +} + +/* + * Inode dynamic state flags + */ +enum { + EXT3_STATE_JDATA, /* journaled data exists */ + EXT3_STATE_NEW, /* inode is newly created */ + EXT3_STATE_XATTR, /* has in-inode xattrs */ + EXT3_STATE_FLUSH_ON_CLOSE, /* flush dirty pages on close */ +}; + +static inline int ext3_test_inode_state(struct inode *inode, int bit) +{ + return test_bit(bit, &EXT3_I(inode)->i_state_flags); +} + +static inline void ext3_set_inode_state(struct inode *inode, int bit) +{ + set_bit(bit, &EXT3_I(inode)->i_state_flags); +} + +static inline void ext3_clear_inode_state(struct inode *inode, int bit) +{ + clear_bit(bit, &EXT3_I(inode)->i_state_flags); +} + +#define NEXT_ORPHAN(inode) EXT3_I(inode)->i_dtime + +/* + * Codes for operating systems + */ +#define EXT3_OS_LINUX 0 +#define EXT3_OS_HURD 1 +#define EXT3_OS_MASIX 2 +#define EXT3_OS_FREEBSD 3 +#define EXT3_OS_LITES 4 + +/* + * Revision levels + */ +#define EXT3_GOOD_OLD_REV 0 /* The good old (original) format */ +#define EXT3_DYNAMIC_REV 1 /* V2 format w/ dynamic inode sizes */ + +#define EXT3_CURRENT_REV EXT3_GOOD_OLD_REV +#define EXT3_MAX_SUPP_REV EXT3_DYNAMIC_REV + +#define EXT3_GOOD_OLD_INODE_SIZE 128 + +/* + * Feature set definitions + */ + +#define EXT3_HAS_COMPAT_FEATURE(sb,mask) \ + ( EXT3_SB(sb)->s_es->s_feature_compat & cpu_to_le32(mask) ) +#define EXT3_HAS_RO_COMPAT_FEATURE(sb,mask) \ + ( EXT3_SB(sb)->s_es->s_feature_ro_compat & cpu_to_le32(mask) ) +#define EXT3_HAS_INCOMPAT_FEATURE(sb,mask) \ + ( EXT3_SB(sb)->s_es->s_feature_incompat & cpu_to_le32(mask) ) +#define EXT3_SET_COMPAT_FEATURE(sb,mask) \ + EXT3_SB(sb)->s_es->s_feature_compat |= cpu_to_le32(mask) +#define EXT3_SET_RO_COMPAT_FEATURE(sb,mask) \ + EXT3_SB(sb)->s_es->s_feature_ro_compat |= cpu_to_le32(mask) +#define EXT3_SET_INCOMPAT_FEATURE(sb,mask) \ + EXT3_SB(sb)->s_es->s_feature_incompat |= cpu_to_le32(mask) +#define EXT3_CLEAR_COMPAT_FEATURE(sb,mask) \ + EXT3_SB(sb)->s_es->s_feature_compat &= ~cpu_to_le32(mask) +#define EXT3_CLEAR_RO_COMPAT_FEATURE(sb,mask) \ + EXT3_SB(sb)->s_es->s_feature_ro_compat &= ~cpu_to_le32(mask) +#define EXT3_CLEAR_INCOMPAT_FEATURE(sb,mask) \ + EXT3_SB(sb)->s_es->s_feature_incompat &= ~cpu_to_le32(mask) + +#define EXT3_FEATURE_COMPAT_DIR_PREALLOC 0x0001 +#define EXT3_FEATURE_COMPAT_IMAGIC_INODES 0x0002 +#define EXT3_FEATURE_COMPAT_HAS_JOURNAL 0x0004 +#define EXT3_FEATURE_COMPAT_EXT_ATTR 0x0008 +#define EXT3_FEATURE_COMPAT_RESIZE_INODE 0x0010 +#define EXT3_FEATURE_COMPAT_DIR_INDEX 0x0020 + +#define EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER 0x0001 +#define EXT3_FEATURE_RO_COMPAT_LARGE_FILE 0x0002 +#define EXT3_FEATURE_RO_COMPAT_BTREE_DIR 0x0004 + +#define EXT3_FEATURE_INCOMPAT_COMPRESSION 0x0001 +#define EXT3_FEATURE_INCOMPAT_FILETYPE 0x0002 +#define EXT3_FEATURE_INCOMPAT_RECOVER 0x0004 /* Needs recovery */ +#define EXT3_FEATURE_INCOMPAT_JOURNAL_DEV 0x0008 /* Journal device */ +#define EXT3_FEATURE_INCOMPAT_META_BG 0x0010 + +#define EXT3_FEATURE_COMPAT_SUPP EXT2_FEATURE_COMPAT_EXT_ATTR +#define EXT3_FEATURE_INCOMPAT_SUPP (EXT3_FEATURE_INCOMPAT_FILETYPE| \ + EXT3_FEATURE_INCOMPAT_RECOVER| \ + EXT3_FEATURE_INCOMPAT_META_BG) +#define EXT3_FEATURE_RO_COMPAT_SUPP (EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER| \ + EXT3_FEATURE_RO_COMPAT_LARGE_FILE| \ + EXT3_FEATURE_RO_COMPAT_BTREE_DIR) + +/* + * Default values for user and/or group using reserved blocks + */ +#define EXT3_DEF_RESUID 0 +#define EXT3_DEF_RESGID 0 + +/* + * Default mount options + */ +#define EXT3_DEFM_DEBUG 0x0001 +#define EXT3_DEFM_BSDGROUPS 0x0002 +#define EXT3_DEFM_XATTR_USER 0x0004 +#define EXT3_DEFM_ACL 0x0008 +#define EXT3_DEFM_UID16 0x0010 +#define EXT3_DEFM_JMODE 0x0060 +#define EXT3_DEFM_JMODE_DATA 0x0020 +#define EXT3_DEFM_JMODE_ORDERED 0x0040 +#define EXT3_DEFM_JMODE_WBACK 0x0060 + +/* + * Structure of a directory entry + */ +#define EXT3_NAME_LEN 255 + +struct ext3_dir_entry { + __le32 inode; /* Inode number */ + __le16 rec_len; /* Directory entry length */ + __le16 name_len; /* Name length */ + char name[EXT3_NAME_LEN]; /* File name */ +}; + +/* + * The new version of the directory entry. Since EXT3 structures are + * stored in intel byte order, and the name_len field could never be + * bigger than 255 chars, it's safe to reclaim the extra byte for the + * file_type field. + */ +struct ext3_dir_entry_2 { + __le32 inode; /* Inode number */ + __le16 rec_len; /* Directory entry length */ + __u8 name_len; /* Name length */ + __u8 file_type; + char name[EXT3_NAME_LEN]; /* File name */ +}; + +/* + * Ext3 directory file types. Only the low 3 bits are used. The + * other bits are reserved for now. + */ +#define EXT3_FT_UNKNOWN 0 +#define EXT3_FT_REG_FILE 1 +#define EXT3_FT_DIR 2 +#define EXT3_FT_CHRDEV 3 +#define EXT3_FT_BLKDEV 4 +#define EXT3_FT_FIFO 5 +#define EXT3_FT_SOCK 6 +#define EXT3_FT_SYMLINK 7 + +#define EXT3_FT_MAX 8 + +/* + * EXT3_DIR_PAD defines the directory entries boundaries + * + * NOTE: It must be a multiple of 4 + */ +#define EXT3_DIR_PAD 4 +#define EXT3_DIR_ROUND (EXT3_DIR_PAD - 1) +#define EXT3_DIR_REC_LEN(name_len) (((name_len) + 8 + EXT3_DIR_ROUND) & \ + ~EXT3_DIR_ROUND) +#define EXT3_MAX_REC_LEN ((1<<16)-1) + +/* + * Tests against MAX_REC_LEN etc were put in place for 64k block + * sizes; if that is not possible on this arch, we can skip + * those tests and speed things up. + */ +static inline unsigned ext3_rec_len_from_disk(__le16 dlen) +{ + unsigned len = le16_to_cpu(dlen); + +#if (PAGE_CACHE_SIZE >= 65536) + if (len == EXT3_MAX_REC_LEN) + return 1 << 16; +#endif + return len; +} + +static inline __le16 ext3_rec_len_to_disk(unsigned len) +{ +#if (PAGE_CACHE_SIZE >= 65536) + if (len == (1 << 16)) + return cpu_to_le16(EXT3_MAX_REC_LEN); + else if (len > (1 << 16)) + BUG(); +#endif + return cpu_to_le16(len); +} + +/* + * Hash Tree Directory indexing + * (c) Daniel Phillips, 2001 + */ + +#define is_dx(dir) (EXT3_HAS_COMPAT_FEATURE(dir->i_sb, \ + EXT3_FEATURE_COMPAT_DIR_INDEX) && \ + (EXT3_I(dir)->i_flags & EXT3_INDEX_FL)) +#define EXT3_DIR_LINK_MAX(dir) (!is_dx(dir) && (dir)->i_nlink >= EXT3_LINK_MAX) +#define EXT3_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || (dir)->i_nlink == 1) + +/* Legal values for the dx_root hash_version field: */ + +#define DX_HASH_LEGACY 0 +#define DX_HASH_HALF_MD4 1 +#define DX_HASH_TEA 2 +#define DX_HASH_LEGACY_UNSIGNED 3 +#define DX_HASH_HALF_MD4_UNSIGNED 4 +#define DX_HASH_TEA_UNSIGNED 5 + +/* hash info structure used by the directory hash */ +struct dx_hash_info +{ + u32 hash; + u32 minor_hash; + int hash_version; + u32 *seed; +}; + + +/* 32 and 64 bit signed EOF for dx directories */ +#define EXT3_HTREE_EOF_32BIT ((1UL << (32 - 1)) - 1) +#define EXT3_HTREE_EOF_64BIT ((1ULL << (64 - 1)) - 1) + + +/* + * Control parameters used by ext3_htree_next_block + */ +#define HASH_NB_ALWAYS 1 + + +/* + * Describe an inode's exact location on disk and in memory + */ +struct ext3_iloc +{ + struct buffer_head *bh; + unsigned long offset; + unsigned long block_group; +}; + +static inline struct ext3_inode *ext3_raw_inode(struct ext3_iloc *iloc) +{ + return (struct ext3_inode *) (iloc->bh->b_data + iloc->offset); +} + +/* + * This structure is stuffed into the struct file's private_data field + * for directories. It is where we put information so that we can do + * readdir operations in hash tree order. + */ +struct dir_private_info { + struct rb_root root; + struct rb_node *curr_node; + struct fname *extra_fname; + loff_t last_pos; + __u32 curr_hash; + __u32 curr_minor_hash; + __u32 next_hash; +}; + +/* calculate the first block number of the group */ +static inline ext3_fsblk_t +ext3_group_first_block_no(struct super_block *sb, unsigned long group_no) +{ + return group_no * (ext3_fsblk_t)EXT3_BLOCKS_PER_GROUP(sb) + + le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block); +} + +/* + * Special error return code only used by dx_probe() and its callers. + */ +#define ERR_BAD_DX_DIR -75000 + +/* + * Function prototypes + */ + +/* + * Ok, these declarations are also in <linux/kernel.h> but none of the + * ext3 source programs needs to include it so they are duplicated here. + */ +# define NORET_TYPE /**/ +# define ATTRIB_NORET __attribute__((noreturn)) +# define NORET_AND noreturn, + +/* balloc.c */ +extern int ext3_bg_has_super(struct super_block *sb, int group); +extern unsigned long ext3_bg_num_gdb(struct super_block *sb, int group); +extern ext3_fsblk_t ext3_new_block (handle_t *handle, struct inode *inode, + ext3_fsblk_t goal, int *errp); +extern ext3_fsblk_t ext3_new_blocks (handle_t *handle, struct inode *inode, + ext3_fsblk_t goal, unsigned long *count, int *errp); +extern void ext3_free_blocks (handle_t *handle, struct inode *inode, + ext3_fsblk_t block, unsigned long count); +extern void ext3_free_blocks_sb (handle_t *handle, struct super_block *sb, + ext3_fsblk_t block, unsigned long count, + unsigned long *pdquot_freed_blocks); +extern ext3_fsblk_t ext3_count_free_blocks (struct super_block *); +extern void ext3_check_blocks_bitmap (struct super_block *); +extern struct ext3_group_desc * ext3_get_group_desc(struct super_block * sb, + unsigned int block_group, + struct buffer_head ** bh); +extern int ext3_should_retry_alloc(struct super_block *sb, int *retries); +extern void ext3_init_block_alloc_info(struct inode *); +extern void ext3_rsv_window_add(struct super_block *sb, struct ext3_reserve_window_node *rsv); +extern int ext3_trim_fs(struct super_block *sb, struct fstrim_range *range); + +/* dir.c */ +extern int ext3_check_dir_entry(const char *, struct inode *, + struct ext3_dir_entry_2 *, + struct buffer_head *, unsigned long); +extern int ext3_htree_store_dirent(struct file *dir_file, __u32 hash, + __u32 minor_hash, + struct ext3_dir_entry_2 *dirent); +extern void ext3_htree_free_dir_info(struct dir_private_info *p); + +/* fsync.c */ +extern int ext3_sync_file(struct file *, loff_t, loff_t, int); + +/* hash.c */ +extern int ext3fs_dirhash(const char *name, int len, struct + dx_hash_info *hinfo); + +/* ialloc.c */ +extern struct inode * ext3_new_inode (handle_t *, struct inode *, + const struct qstr *, umode_t); +extern void ext3_free_inode (handle_t *, struct inode *); +extern struct inode * ext3_orphan_get (struct super_block *, unsigned long); +extern unsigned long ext3_count_free_inodes (struct super_block *); +extern unsigned long ext3_count_dirs (struct super_block *); +extern void ext3_check_inodes_bitmap (struct super_block *); +extern unsigned long ext3_count_free (struct buffer_head *, unsigned); + + +/* inode.c */ +int ext3_forget(handle_t *handle, int is_metadata, struct inode *inode, + struct buffer_head *bh, ext3_fsblk_t blocknr); +struct buffer_head * ext3_getblk (handle_t *, struct inode *, long, int, int *); +struct buffer_head * ext3_bread (handle_t *, struct inode *, int, int, int *); +int ext3_get_blocks_handle(handle_t *handle, struct inode *inode, + sector_t iblock, unsigned long maxblocks, struct buffer_head *bh_result, + int create); + +extern struct inode *ext3_iget(struct super_block *, unsigned long); +extern int ext3_write_inode (struct inode *, struct writeback_control *); +extern int ext3_setattr (struct dentry *, struct iattr *); +extern void ext3_evict_inode (struct inode *); +extern int ext3_sync_inode (handle_t *, struct inode *); +extern void ext3_discard_reservation (struct inode *); +extern void ext3_dirty_inode(struct inode *, int); +extern int ext3_change_inode_journal_flag(struct inode *, int); +extern int ext3_get_inode_loc(struct inode *, struct ext3_iloc *); +extern int ext3_can_truncate(struct inode *inode); +extern void ext3_truncate(struct inode *inode); +extern void ext3_set_inode_flags(struct inode *); +extern void ext3_get_inode_flags(struct ext3_inode_info *); +extern void ext3_set_aops(struct inode *inode); +extern int ext3_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, + u64 start, u64 len); + +/* ioctl.c */ +extern long ext3_ioctl(struct file *, unsigned int, unsigned long); +extern long ext3_compat_ioctl(struct file *, unsigned int, unsigned long); + +/* namei.c */ +extern int ext3_orphan_add(handle_t *, struct inode *); +extern int ext3_orphan_del(handle_t *, struct inode *); +extern int ext3_htree_fill_tree(struct file *dir_file, __u32 start_hash, + __u32 start_minor_hash, __u32 *next_hash); + +/* resize.c */ +extern int ext3_group_add(struct super_block *sb, + struct ext3_new_group_data *input); +extern int ext3_group_extend(struct super_block *sb, + struct ext3_super_block *es, + ext3_fsblk_t n_blocks_count); + +/* super.c */ +extern __printf(3, 4) +void ext3_error(struct super_block *, const char *, const char *, ...); +extern void __ext3_std_error (struct super_block *, const char *, int); +extern __printf(3, 4) +void ext3_abort(struct super_block *, const char *, const char *, ...); +extern __printf(3, 4) +void ext3_warning(struct super_block *, const char *, const char *, ...); +extern __printf(3, 4) +void ext3_msg(struct super_block *, const char *, const char *, ...); +extern void ext3_update_dynamic_rev (struct super_block *sb); + +#define ext3_std_error(sb, errno) \ +do { \ + if ((errno)) \ + __ext3_std_error((sb), __func__, (errno)); \ +} while (0) + +/* + * Inodes and files operations + */ + +/* dir.c */ +extern const struct file_operations ext3_dir_operations; + +/* file.c */ +extern const struct inode_operations ext3_file_inode_operations; +extern const struct file_operations ext3_file_operations; + +/* namei.c */ +extern const struct inode_operations ext3_dir_inode_operations; +extern const struct inode_operations ext3_special_inode_operations; + +/* symlink.c */ +extern const struct inode_operations ext3_symlink_inode_operations; +extern const struct inode_operations ext3_fast_symlink_inode_operations; + +#define EXT3_JOURNAL(inode) (EXT3_SB((inode)->i_sb)->s_journal) + +/* Define the number of blocks we need to account to a transaction to + * modify one block of data. + * + * We may have to touch one inode, one bitmap buffer, up to three + * indirection blocks, the group and superblock summaries, and the data + * block to complete the transaction. */ + +#define EXT3_SINGLEDATA_TRANS_BLOCKS 8U + +/* Extended attribute operations touch at most two data buffers, + * two bitmap buffers, and two group summaries, in addition to the inode + * and the superblock, which are already accounted for. */ + +#define EXT3_XATTR_TRANS_BLOCKS 6U + +/* Define the minimum size for a transaction which modifies data. This + * needs to take into account the fact that we may end up modifying two + * quota files too (one for the group, one for the user quota). The + * superblock only gets updated once, of course, so don't bother + * counting that again for the quota updates. */ + +#define EXT3_DATA_TRANS_BLOCKS(sb) (EXT3_SINGLEDATA_TRANS_BLOCKS + \ + EXT3_XATTR_TRANS_BLOCKS - 2 + \ + EXT3_MAXQUOTAS_TRANS_BLOCKS(sb)) + +/* Delete operations potentially hit one directory's namespace plus an + * entire inode, plus arbitrary amounts of bitmap/indirection data. Be + * generous. We can grow the delete transaction later if necessary. */ + +#define EXT3_DELETE_TRANS_BLOCKS(sb) (EXT3_MAXQUOTAS_TRANS_BLOCKS(sb) + 64) + +/* Define an arbitrary limit for the amount of data we will anticipate + * writing to any given transaction. For unbounded transactions such as + * write(2) and truncate(2) we can write more than this, but we always + * start off at the maximum transaction size and grow the transaction + * optimistically as we go. */ + +#define EXT3_MAX_TRANS_DATA 64U + +/* We break up a large truncate or write transaction once the handle's + * buffer credits gets this low, we need either to extend the + * transaction or to start a new one. Reserve enough space here for + * inode, bitmap, superblock, group and indirection updates for at least + * one block, plus two quota updates. Quota allocations are not + * needed. */ + +#define EXT3_RESERVE_TRANS_BLOCKS 12U + +#define EXT3_INDEX_EXTRA_TRANS_BLOCKS 8 + +#ifdef CONFIG_QUOTA +/* Amount of blocks needed for quota update - we know that the structure was + * allocated so we need to update only inode+data */ +#define EXT3_QUOTA_TRANS_BLOCKS(sb) (test_opt(sb, QUOTA) ? 2 : 0) +/* Amount of blocks needed for quota insert/delete - we do some block writes + * but inode, sb and group updates are done only once */ +#define EXT3_QUOTA_INIT_BLOCKS(sb) (test_opt(sb, QUOTA) ? (DQUOT_INIT_ALLOC*\ + (EXT3_SINGLEDATA_TRANS_BLOCKS-3)+3+DQUOT_INIT_REWRITE) : 0) +#define EXT3_QUOTA_DEL_BLOCKS(sb) (test_opt(sb, QUOTA) ? (DQUOT_DEL_ALLOC*\ + (EXT3_SINGLEDATA_TRANS_BLOCKS-3)+3+DQUOT_DEL_REWRITE) : 0) +#else +#define EXT3_QUOTA_TRANS_BLOCKS(sb) 0 +#define EXT3_QUOTA_INIT_BLOCKS(sb) 0 +#define EXT3_QUOTA_DEL_BLOCKS(sb) 0 +#endif +#define EXT3_MAXQUOTAS_TRANS_BLOCKS(sb) (MAXQUOTAS*EXT3_QUOTA_TRANS_BLOCKS(sb)) +#define EXT3_MAXQUOTAS_INIT_BLOCKS(sb) (MAXQUOTAS*EXT3_QUOTA_INIT_BLOCKS(sb)) +#define EXT3_MAXQUOTAS_DEL_BLOCKS(sb) (MAXQUOTAS*EXT3_QUOTA_DEL_BLOCKS(sb)) + +int +ext3_mark_iloc_dirty(handle_t *handle, + struct inode *inode, + struct ext3_iloc *iloc); + +/* + * On success, We end up with an outstanding reference count against + * iloc->bh. This _must_ be cleaned up later. + */ + +int ext3_reserve_inode_write(handle_t *handle, struct inode *inode, + struct ext3_iloc *iloc); + +int ext3_mark_inode_dirty(handle_t *handle, struct inode *inode); + +/* + * Wrapper functions with which ext3 calls into JBD. The intent here is + * to allow these to be turned into appropriate stubs so ext3 can control + * ext2 filesystems, so ext2+ext3 systems only nee one fs. This work hasn't + * been done yet. + */ + +static inline void ext3_journal_release_buffer(handle_t *handle, + struct buffer_head *bh) +{ + journal_release_buffer(handle, bh); +} + +void ext3_journal_abort_handle(const char *caller, const char *err_fn, + struct buffer_head *bh, handle_t *handle, int err); + +int __ext3_journal_get_undo_access(const char *where, handle_t *handle, + struct buffer_head *bh); + +int __ext3_journal_get_write_access(const char *where, handle_t *handle, + struct buffer_head *bh); + +int __ext3_journal_forget(const char *where, handle_t *handle, + struct buffer_head *bh); + +int __ext3_journal_revoke(const char *where, handle_t *handle, + unsigned long blocknr, struct buffer_head *bh); + +int __ext3_journal_get_create_access(const char *where, + handle_t *handle, struct buffer_head *bh); + +int __ext3_journal_dirty_metadata(const char *where, + handle_t *handle, struct buffer_head *bh); + +#define ext3_journal_get_undo_access(handle, bh) \ + __ext3_journal_get_undo_access(__func__, (handle), (bh)) +#define ext3_journal_get_write_access(handle, bh) \ + __ext3_journal_get_write_access(__func__, (handle), (bh)) +#define ext3_journal_revoke(handle, blocknr, bh) \ + __ext3_journal_revoke(__func__, (handle), (blocknr), (bh)) +#define ext3_journal_get_create_access(handle, bh) \ + __ext3_journal_get_create_access(__func__, (handle), (bh)) +#define ext3_journal_dirty_metadata(handle, bh) \ + __ext3_journal_dirty_metadata(__func__, (handle), (bh)) +#define ext3_journal_forget(handle, bh) \ + __ext3_journal_forget(__func__, (handle), (bh)) + +int ext3_journal_dirty_data(handle_t *handle, struct buffer_head *bh); + +handle_t *ext3_journal_start_sb(struct super_block *sb, int nblocks); +int __ext3_journal_stop(const char *where, handle_t *handle); + +static inline handle_t *ext3_journal_start(struct inode *inode, int nblocks) +{ + return ext3_journal_start_sb(inode->i_sb, nblocks); +} + +#define ext3_journal_stop(handle) \ + __ext3_journal_stop(__func__, (handle)) + +static inline handle_t *ext3_journal_current_handle(void) +{ + return journal_current_handle(); +} + +static inline int ext3_journal_extend(handle_t *handle, int nblocks) +{ + return journal_extend(handle, nblocks); +} + +static inline int ext3_journal_restart(handle_t *handle, int nblocks) +{ + return journal_restart(handle, nblocks); +} + +static inline int ext3_journal_blocks_per_page(struct inode *inode) +{ + return journal_blocks_per_page(inode); +} + +static inline int ext3_journal_force_commit(journal_t *journal) +{ + return journal_force_commit(journal); +} + +/* super.c */ +int ext3_force_commit(struct super_block *sb); + +static inline int ext3_should_journal_data(struct inode *inode) +{ + if (!S_ISREG(inode->i_mode)) + return 1; + if (test_opt(inode->i_sb, DATA_FLAGS) == EXT3_MOUNT_JOURNAL_DATA) + return 1; + if (EXT3_I(inode)->i_flags & EXT3_JOURNAL_DATA_FL) + return 1; + return 0; +} + +static inline int ext3_should_order_data(struct inode *inode) +{ + if (!S_ISREG(inode->i_mode)) + return 0; + if (EXT3_I(inode)->i_flags & EXT3_JOURNAL_DATA_FL) + return 0; + if (test_opt(inode->i_sb, DATA_FLAGS) == EXT3_MOUNT_ORDERED_DATA) + return 1; + return 0; +} + +static inline int ext3_should_writeback_data(struct inode *inode) +{ + if (!S_ISREG(inode->i_mode)) + return 0; + if (EXT3_I(inode)->i_flags & EXT3_JOURNAL_DATA_FL) + return 0; + if (test_opt(inode->i_sb, DATA_FLAGS) == EXT3_MOUNT_WRITEBACK_DATA) + return 1; + return 0; +} + +#include <trace/events/ext3.h> diff --git a/fs/ext3/ext3_jbd.c b/fs/ext3/ext3_jbd.c index e1f91fd26a9..785a3261a26 100644 --- a/fs/ext3/ext3_jbd.c +++ b/fs/ext3/ext3_jbd.c @@ -2,14 +2,14 @@ * Interface between ext3 and JBD */ -#include <linux/ext3_jbd.h> +#include "ext3.h" int __ext3_journal_get_undo_access(const char *where, handle_t *handle, struct buffer_head *bh) { int err = journal_get_undo_access(handle, bh); if (err) - ext3_journal_abort_handle(where, __FUNCTION__, bh, handle,err); + ext3_journal_abort_handle(where, __func__, bh, handle,err); return err; } @@ -18,7 +18,7 @@ int __ext3_journal_get_write_access(const char *where, handle_t *handle, { int err = journal_get_write_access(handle, bh); if (err) - ext3_journal_abort_handle(where, __FUNCTION__, bh, handle,err); + ext3_journal_abort_handle(where, __func__, bh, handle,err); return err; } @@ -27,7 +27,7 @@ int __ext3_journal_forget(const char *where, handle_t *handle, { int err = journal_forget(handle, bh); if (err) - ext3_journal_abort_handle(where, __FUNCTION__, bh, handle,err); + ext3_journal_abort_handle(where, __func__, bh, handle,err); return err; } @@ -36,7 +36,7 @@ int __ext3_journal_revoke(const char *where, handle_t *handle, { int err = journal_revoke(handle, blocknr, bh); if (err) - ext3_journal_abort_handle(where, __FUNCTION__, bh, handle,err); + ext3_journal_abort_handle(where, __func__, bh, handle,err); return err; } @@ -45,7 +45,7 @@ int __ext3_journal_get_create_access(const char *where, { int err = journal_get_create_access(handle, bh); if (err) - ext3_journal_abort_handle(where, __FUNCTION__, bh, handle,err); + ext3_journal_abort_handle(where, __func__, bh, handle,err); return err; } @@ -54,6 +54,6 @@ int __ext3_journal_dirty_metadata(const char *where, { int err = journal_dirty_metadata(handle, bh); if (err) - ext3_journal_abort_handle(where, __FUNCTION__, bh, handle,err); + ext3_journal_abort_handle(where, __func__, bh, handle,err); return err; } diff --git a/fs/ext3/file.c b/fs/ext3/file.c index acc4913d301..a062fa1e1b1 100644 --- a/fs/ext3/file.c +++ b/fs/ext3/file.c @@ -18,11 +18,8 @@ * (jj@sunsite.ms.mff.cuni.cz) */ -#include <linux/time.h> -#include <linux/fs.h> -#include <linux/jbd.h> -#include <linux/ext3_fs.h> -#include <linux/ext3_jbd.h> +#include <linux/quotaops.h> +#include "ext3.h" #include "xattr.h" #include "acl.h" @@ -33,6 +30,10 @@ */ static int ext3_release_file (struct inode * inode, struct file * filp) { + if (ext3_test_inode_state(inode, EXT3_STATE_FLUSH_ON_CLOSE)) { + filemap_flush(inode->i_mapping); + ext3_clear_inode_state(inode, EXT3_STATE_FLUSH_ON_CLOSE); + } /* if we are the last writer on the inode, drop the block reservation */ if ((filp->f_mode & FMODE_WRITE) && (atomic_read(&inode->i_writecount) == 1)) @@ -47,85 +48,25 @@ static int ext3_release_file (struct inode * inode, struct file * filp) return 0; } -static ssize_t -ext3_file_write(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos) -{ - struct file *file = iocb->ki_filp; - struct inode *inode = file->f_path.dentry->d_inode; - ssize_t ret; - int err; - - ret = generic_file_aio_write(iocb, iov, nr_segs, pos); - - /* - * Skip flushing if there was an error, or if nothing was written. - */ - if (ret <= 0) - return ret; - - /* - * If the inode is IS_SYNC, or is O_SYNC and we are doing data - * journalling then we need to make sure that we force the transaction - * to disk to keep all metadata uptodate synchronously. - */ - if (file->f_flags & O_SYNC) { - /* - * If we are non-data-journaled, then the dirty data has - * already been flushed to backing store by generic_osync_inode, - * and the inode has been flushed too if there have been any - * modifications other than mere timestamp updates. - * - * Open question --- do we care about flushing timestamps too - * if the inode is IS_SYNC? - */ - if (!ext3_should_journal_data(inode)) - return ret; - - goto force_commit; - } - - /* - * So we know that there has been no forced data flush. If the inode - * is marked IS_SYNC, we need to force one ourselves. - */ - if (!IS_SYNC(inode)) - return ret; - - /* - * Open question #2 --- should we force data to disk here too? If we - * don't, the only impact is that data=writeback filesystems won't - * flush data to disk automatically on IS_SYNC, only metadata (but - * historically, that is what ext2 has done.) - */ - -force_commit: - err = ext3_force_commit(inode->i_sb); - if (err) - return err; - return ret; -} - const struct file_operations ext3_file_operations = { .llseek = generic_file_llseek, - .read = do_sync_read, - .write = do_sync_write, - .aio_read = generic_file_aio_read, - .aio_write = ext3_file_write, - .ioctl = ext3_ioctl, + .read = new_sync_read, + .write = new_sync_write, + .read_iter = generic_file_read_iter, + .write_iter = generic_file_write_iter, + .unlocked_ioctl = ext3_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = ext3_compat_ioctl, #endif .mmap = generic_file_mmap, - .open = generic_file_open, + .open = dquot_file_open, .release = ext3_release_file, .fsync = ext3_sync_file, .splice_read = generic_file_splice_read, - .splice_write = generic_file_splice_write, + .splice_write = iter_file_splice_write, }; const struct inode_operations ext3_file_inode_operations = { - .truncate = ext3_truncate, .setattr = ext3_setattr, #ifdef CONFIG_EXT3_FS_XATTR .setxattr = generic_setxattr, @@ -133,6 +74,8 @@ const struct inode_operations ext3_file_inode_operations = { .listxattr = ext3_listxattr, .removexattr = generic_removexattr, #endif - .permission = ext3_permission, + .get_acl = ext3_get_acl, + .set_acl = ext3_set_acl, + .fiemap = ext3_fiemap, }; diff --git a/fs/ext3/fsync.c b/fs/ext3/fsync.c index a588e23841d..1cb9c7e10c6 100644 --- a/fs/ext3/fsync.c +++ b/fs/ext3/fsync.c @@ -22,13 +22,9 @@ * we can depend on generic_block_fdatasync() to sync the data blocks. */ -#include <linux/time.h> -#include <linux/fs.h> -#include <linux/sched.h> +#include <linux/blkdev.h> #include <linux/writeback.h> -#include <linux/jbd.h> -#include <linux/ext3_fs.h> -#include <linux/ext3_jbd.h> +#include "ext3.h" /* * akpm: A new design for ext3_sync_file(). @@ -42,22 +38,34 @@ * inode to disk. */ -int ext3_sync_file(struct file * file, struct dentry *dentry, int datasync) +int ext3_sync_file(struct file *file, loff_t start, loff_t end, int datasync) { - struct inode *inode = dentry->d_inode; - int ret = 0; + struct inode *inode = file->f_mapping->host; + struct ext3_inode_info *ei = EXT3_I(inode); + journal_t *journal = EXT3_SB(inode->i_sb)->s_journal; + int ret, needs_barrier = 0; + tid_t commit_tid; + + trace_ext3_sync_file_enter(file, datasync); + + if (inode->i_sb->s_flags & MS_RDONLY) { + /* Make sure that we read updated state */ + smp_rmb(); + if (EXT3_SB(inode->i_sb)->s_mount_state & EXT3_ERROR_FS) + return -EROFS; + return 0; + } + ret = filemap_write_and_wait_range(inode->i_mapping, start, end); + if (ret) + goto out; J_ASSERT(ext3_journal_current_handle() == NULL); /* - * data=writeback: + * data=writeback,ordered: * The caller's filemap_fdatawrite()/wait will sync the data. - * sync_inode() will sync the metadata - * - * data=ordered: - * The caller's filemap_fdatawrite() will write the data and - * sync_inode() will write the inode if it is dirty. Then the caller's - * filemap_fdatawait() will wait on the pages. + * Metadata is in the journal, we wait for a proper transaction + * to commit here. * * data=journal: * filemap_fdatawrite won't do anything (the buffers are clean). @@ -72,17 +80,30 @@ int ext3_sync_file(struct file * file, struct dentry *dentry, int datasync) goto out; } + if (datasync) + commit_tid = atomic_read(&ei->i_datasync_tid); + else + commit_tid = atomic_read(&ei->i_sync_tid); + + if (test_opt(inode->i_sb, BARRIER) && + !journal_trans_will_send_data_barrier(journal, commit_tid)) + needs_barrier = 1; + log_start_commit(journal, commit_tid); + ret = log_wait_commit(journal, commit_tid); + /* - * The VFS has written the file data. If the inode is unaltered - * then we need not start a commit. + * In case we didn't commit a transaction, we have to flush + * disk caches manually so that data really is on persistent + * storage */ - if (inode->i_state & (I_DIRTY_SYNC|I_DIRTY_DATASYNC)) { - struct writeback_control wbc = { - .sync_mode = WB_SYNC_ALL, - .nr_to_write = 0, /* sys_fsync did this */ - }; - ret = sync_inode(inode, &wbc); + if (needs_barrier) { + int err; + + err = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); + if (!ret) + ret = err; } out: + trace_ext3_sync_file_exit(inode, ret); return ret; } diff --git a/fs/ext3/hash.c b/fs/ext3/hash.c index c30e149fbd2..ede315cdf12 100644 --- a/fs/ext3/hash.c +++ b/fs/ext3/hash.c @@ -9,9 +9,7 @@ * License. */ -#include <linux/fs.h> -#include <linux/jbd.h> -#include <linux/ext3_fs.h> +#include "ext3.h" #include <linux/cryptohash.h> #define DELTA 0x9E3779B9 @@ -35,23 +33,71 @@ static void TEA_transform(__u32 buf[4], __u32 const in[]) /* The old legacy hash */ -static __u32 dx_hack_hash (const char *name, int len) +static __u32 dx_hack_hash_unsigned(const char *name, int len) { - __u32 hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9; + __u32 hash, hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9; + const unsigned char *ucp = (const unsigned char *) name; + + while (len--) { + hash = hash1 + (hash0 ^ (((int) *ucp++) * 7152373)); + + if (hash & 0x80000000) + hash -= 0x7fffffff; + hash1 = hash0; + hash0 = hash; + } + return hash0 << 1; +} + +static __u32 dx_hack_hash_signed(const char *name, int len) +{ + __u32 hash, hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9; + const signed char *scp = (const signed char *) name; + while (len--) { - __u32 hash = hash1 + (hash0 ^ (*name++ * 7152373)); + hash = hash1 + (hash0 ^ (((int) *scp++) * 7152373)); - if (hash & 0x80000000) hash -= 0x7fffffff; + if (hash & 0x80000000) + hash -= 0x7fffffff; hash1 = hash0; hash0 = hash; } - return (hash0 << 1); + return hash0 << 1; } -static void str2hashbuf(const char *msg, int len, __u32 *buf, int num) +static void str2hashbuf_signed(const char *msg, int len, __u32 *buf, int num) { __u32 pad, val; int i; + const signed char *scp = (const signed char *) msg; + + pad = (__u32)len | ((__u32)len << 8); + pad |= pad << 16; + + val = pad; + if (len > num*4) + len = num * 4; + for (i = 0; i < len; i++) { + if ((i % 4) == 0) + val = pad; + val = ((int) scp[i]) + (val << 8); + if ((i % 4) == 3) { + *buf++ = val; + val = pad; + num--; + } + } + if (--num >= 0) + *buf++ = val; + while (--num >= 0) + *buf++ = pad; +} + +static void str2hashbuf_unsigned(const char *msg, int len, __u32 *buf, int num) +{ + __u32 pad, val; + int i; + const unsigned char *ucp = (const unsigned char *) msg; pad = (__u32)len | ((__u32)len << 8); pad |= pad << 16; @@ -62,7 +108,7 @@ static void str2hashbuf(const char *msg, int len, __u32 *buf, int num) for (i=0; i < len; i++) { if ((i % 4) == 0) val = pad; - val = msg[i] + (val << 8); + val = ((int) ucp[i]) + (val << 8); if ((i % 4) == 3) { *buf++ = val; val = pad; @@ -95,6 +141,8 @@ int ext3fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo) const char *p; int i; __u32 in[8], buf[4]; + void (*str2hashbuf)(const char *, int, __u32 *, int) = + str2hashbuf_signed; /* Initialize the default seed for the hash checksum functions */ buf[0] = 0x67452301; @@ -113,13 +161,18 @@ int ext3fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo) } switch (hinfo->hash_version) { + case DX_HASH_LEGACY_UNSIGNED: + hash = dx_hack_hash_unsigned(name, len); + break; case DX_HASH_LEGACY: - hash = dx_hack_hash(name, len); + hash = dx_hack_hash_signed(name, len); break; + case DX_HASH_HALF_MD4_UNSIGNED: + str2hashbuf = str2hashbuf_unsigned; case DX_HASH_HALF_MD4: p = name; while (len > 0) { - str2hashbuf(p, len, in, 8); + (*str2hashbuf)(p, len, in, 8); half_md4_transform(buf, in); len -= 32; p += 32; @@ -127,10 +180,12 @@ int ext3fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo) minor_hash = buf[2]; hash = buf[1]; break; + case DX_HASH_TEA_UNSIGNED: + str2hashbuf = str2hashbuf_unsigned; case DX_HASH_TEA: p = name; while (len > 0) { - str2hashbuf(p, len, in, 4); + (*str2hashbuf)(p, len, in, 4); TEA_transform(buf, in); len -= 16; p += 16; @@ -143,8 +198,8 @@ int ext3fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo) return -1; } hash = hash & ~1; - if (hash == (EXT3_HTREE_EOF << 1)) - hash = (EXT3_HTREE_EOF-1) << 1; + if (hash == (EXT3_HTREE_EOF_32BIT << 1)) + hash = (EXT3_HTREE_EOF_32BIT - 1) << 1; hinfo->hash = hash; hinfo->minor_hash = minor_hash; return 0; diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c index 4f4020c5468..a1b810230cc 100644 --- a/fs/ext3/ialloc.c +++ b/fs/ext3/ialloc.c @@ -12,20 +12,10 @@ * David S. Miller (davem@caip.rutgers.edu), 1995 */ -#include <linux/time.h> -#include <linux/fs.h> -#include <linux/jbd.h> -#include <linux/ext3_fs.h> -#include <linux/ext3_jbd.h> -#include <linux/stat.h> -#include <linux/string.h> #include <linux/quotaops.h> -#include <linux/buffer_head.h> #include <linux/random.h> -#include <linux/bitops.h> - -#include <asm/byteorder.h> +#include "ext3.h" #include "xattr.h" #include "acl.h" @@ -118,21 +108,10 @@ void ext3_free_inode (handle_t *handle, struct inode * inode) ino = inode->i_ino; ext3_debug ("freeing inode %lu\n", ino); - - /* - * Note: we must free any quota before locking the superblock, - * as writing the quota to disk may need the lock as well. - */ - DQUOT_INIT(inode); - ext3_xattr_delete_inode(handle, inode); - DQUOT_FREE_INODE(inode); - DQUOT_DROP(inode); + trace_ext3_free_inode(inode); is_directory = S_ISDIR(inode->i_mode); - /* Do this BEFORE marking the inode not in use or returning an error */ - clear_inode (inode); - es = EXT3_SB(sb)->s_es; if (ino < EXT3_FIRST_INO(sb) || ino > le32_to_cpu(es->s_inodes_count)) { ext3_error (sb, "ext3_free_inode", @@ -181,49 +160,13 @@ void ext3_free_inode (handle_t *handle, struct inode * inode) err = ext3_journal_dirty_metadata(handle, bitmap_bh); if (!fatal) fatal = err; - sb->s_dirt = 1; + error_return: brelse(bitmap_bh); ext3_std_error(sb, fatal); } /* - * There are two policies for allocating an inode. If the new inode is - * a directory, then a forward search is made for a block group with both - * free space and a low directory-to-inode ratio; if that fails, then of - * the groups with above-average free space, that group with the fewest - * directories already is chosen. - * - * For other inodes, search forward from the parent directory\'s block - * group to find a free inode. - */ -static int find_group_dir(struct super_block *sb, struct inode *parent) -{ - int ngroups = EXT3_SB(sb)->s_groups_count; - unsigned int freei, avefreei; - struct ext3_group_desc *desc, *best_desc = NULL; - int group, best_group = -1; - - freei = percpu_counter_read_positive(&EXT3_SB(sb)->s_freeinodes_counter); - avefreei = freei / ngroups; - - for (group = 0; group < ngroups; group++) { - desc = ext3_get_group_desc (sb, group, NULL); - if (!desc || !desc->bg_free_inodes_count) - continue; - if (le16_to_cpu(desc->bg_free_inodes_count) < avefreei) - continue; - if (!best_desc || - (le16_to_cpu(desc->bg_free_blocks_count) > - le16_to_cpu(best_desc->bg_free_blocks_count))) { - best_group = group; - best_desc = desc; - } - } - return best_group; -} - -/* * Orlov's allocator for directories. * * We always try to spread first-level directories. @@ -237,9 +180,8 @@ static int find_group_dir(struct super_block *sb, struct inode *parent) * It's OK to put directory into a group unless * it has too many directories already (max_dirs) or * it has too few free inodes left (min_inodes) or - * it has too few free blocks left (min_blocks) or - * it's already running too large debt (max_debt). - * Parent's group is prefered, if it doesn't satisfy these + * it has too few free blocks left (min_blocks). + * Parent's group is preferred, if it doesn't satisfy these * conditions we search cyclically through the rest. If none * of the groups look good we just look for a group with more * free inodes than average (starting at parent's group). @@ -248,21 +190,16 @@ static int find_group_dir(struct super_block *sb, struct inode *parent) * when we allocate an inode, within 0--255. */ -#define INODE_COST 64 -#define BLOCK_COST 256 - static int find_group_orlov(struct super_block *sb, struct inode *parent) { int parent_group = EXT3_I(parent)->i_block_group; struct ext3_sb_info *sbi = EXT3_SB(sb); - struct ext3_super_block *es = sbi->s_es; int ngroups = sbi->s_groups_count; int inodes_per_group = EXT3_INODES_PER_GROUP(sb); unsigned int freei, avefreei; ext3_fsblk_t freeb, avefreeb; - ext3_fsblk_t blocks_per_dir; unsigned int ndirs; - int max_debt, max_dirs, min_inodes; + int max_dirs, min_inodes; ext3_grpblk_t min_blocks; int group = -1, i; struct ext3_group_desc *desc; @@ -278,7 +215,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent) int best_ndir = inodes_per_group; int best_group = -1; - get_random_bytes(&group, sizeof(group)); + group = prandom_u32(); parent_group = (unsigned)group % ngroups; for (i = 0; i < ngroups; i++) { group = (parent_group + i) % ngroups; @@ -299,20 +236,10 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent) goto fallback; } - blocks_per_dir = (le32_to_cpu(es->s_blocks_count) - freeb) / ndirs; - max_dirs = ndirs / ngroups + inodes_per_group / 16; min_inodes = avefreei - inodes_per_group / 4; min_blocks = avefreeb - EXT3_BLOCKS_PER_GROUP(sb) / 4; - max_debt = EXT3_BLOCKS_PER_GROUP(sb) / max(blocks_per_dir, (ext3_fsblk_t)BLOCK_COST); - if (max_debt * INODE_COST > inodes_per_group) - max_debt = inodes_per_group / INODE_COST; - if (max_debt > 255) - max_debt = 255; - if (max_debt == 0) - max_debt = 1; - for (i = 0; i < ngroups; i++) { group = (parent_group + i) % ngroups; desc = ext3_get_group_desc (sb, group, NULL); @@ -416,7 +343,8 @@ static int find_group_other(struct super_block *sb, struct inode *parent) * For other inodes, search forward from the parent directory's block * group to find a free inode. */ -struct inode *ext3_new_inode(handle_t *handle, struct inode * dir, int mode) +struct inode *ext3_new_inode(handle_t *handle, struct inode * dir, + const struct qstr *qstr, umode_t mode) { struct super_block *sb; struct buffer_head *bitmap_bh = NULL; @@ -437,6 +365,7 @@ struct inode *ext3_new_inode(handle_t *handle, struct inode * dir, int mode) return ERR_PTR(-EPERM); sb = dir->i_sb; + trace_ext3_request_inode(dir, mode); inode = new_inode(sb); if (!inode) return ERR_PTR(-ENOMEM); @@ -444,12 +373,9 @@ struct inode *ext3_new_inode(handle_t *handle, struct inode * dir, int mode) sbi = EXT3_SB(sb); es = sbi->s_es; - if (S_ISDIR(mode)) { - if (test_opt (sb, OLDALLOC)) - group = find_group_dir(sb, dir); - else - group = find_group_orlov(sb, dir); - } else + if (S_ISDIR(mode)) + group = find_group_orlov(sb, dir); + else group = find_group_other(sb, dir); err = -ENOSPC; @@ -537,18 +463,14 @@ got: percpu_counter_dec(&sbi->s_freeinodes_counter); if (S_ISDIR(mode)) percpu_counter_inc(&sbi->s_dirs_counter); - sb->s_dirt = 1; - inode->i_uid = current->fsuid; - if (test_opt (sb, GRPID)) - inode->i_gid = dir->i_gid; - else if (dir->i_mode & S_ISGID) { + + if (test_opt(sb, GRPID)) { + inode->i_mode = mode; + inode->i_uid = current_fsuid(); inode->i_gid = dir->i_gid; - if (S_ISDIR(mode)) - mode |= S_ISGID; } else - inode->i_gid = current->fsgid; - inode->i_mode = mode; + inode_init_owner(inode, dir, mode); inode->i_ino = ino; /* This is the optimal IO size (for stat), not the fs block size */ @@ -559,12 +481,8 @@ got: ei->i_dir_start_lookup = 0; ei->i_disksize = 0; - ei->i_flags = EXT3_I(dir)->i_flags & ~EXT3_INDEX_FL; - if (S_ISLNK(mode)) - ei->i_flags &= ~(EXT3_IMMUTABLE_FL|EXT3_APPEND_FL); - /* dirsync only applies to directories */ - if (!S_ISDIR(mode)) - ei->i_flags &= ~EXT3_DIRSYNC_FL; + ei->i_flags = + ext3_mask_flags(mode, EXT3_I(dir)->i_flags & EXT3_FL_INHERITED); #ifdef EXT3_FRAGMENTS ei->i_faddr = 0; ei->i_frag_no = 0; @@ -579,27 +497,41 @@ got: ext3_set_inode_flags(inode); if (IS_DIRSYNC(inode)) handle->h_sync = 1; - insert_inode_hash(inode); + if (insert_inode_locked(inode) < 0) { + /* + * Likely a bitmap corruption causing inode to be allocated + * twice. + */ + err = -EIO; + goto fail; + } spin_lock(&sbi->s_next_gen_lock); inode->i_generation = sbi->s_next_generation++; spin_unlock(&sbi->s_next_gen_lock); - ei->i_state = EXT3_STATE_NEW; - ei->i_extra_isize = - (EXT3_INODE_SIZE(inode->i_sb) > EXT3_GOOD_OLD_INODE_SIZE) ? - sizeof(struct ext3_inode) - EXT3_GOOD_OLD_INODE_SIZE : 0; + ei->i_state_flags = 0; + ext3_set_inode_state(inode, EXT3_STATE_NEW); + + /* See comment in ext3_iget for explanation */ + if (ino >= EXT3_FIRST_INO(sb) + 1 && + EXT3_INODE_SIZE(sb) > EXT3_GOOD_OLD_INODE_SIZE) { + ei->i_extra_isize = + sizeof(struct ext3_inode) - EXT3_GOOD_OLD_INODE_SIZE; + } else { + ei->i_extra_isize = 0; + } ret = inode; - if(DQUOT_ALLOC_INODE(inode)) { - err = -EDQUOT; + dquot_initialize(inode); + err = dquot_alloc_inode(inode); + if (err) goto fail_drop; - } err = ext3_init_acl(handle, inode, dir); if (err) goto fail_free_drop; - err = ext3_init_security(handle,inode, dir); + err = ext3_init_security(handle, inode, dir, qstr); if (err) goto fail_free_drop; @@ -610,6 +542,7 @@ got: } ext3_debug("allocating inode %lu\n", inode->i_ino); + trace_ext3_allocate_inode(inode, dir, mode); goto really_out; fail: ext3_std_error(sb, err); @@ -621,12 +554,13 @@ really_out: return ret; fail_free_drop: - DQUOT_FREE_INODE(inode); + dquot_free_inode(inode); fail_drop: - DQUOT_DROP(inode); + dquot_drop(inode); inode->i_flags |= S_NOQUOTA; - inode->i_nlink = 0; + clear_nlink(inode); + unlock_new_inode(inode); iput(inode); brelse(bitmap_bh); return ERR_PTR(err); @@ -644,7 +578,7 @@ struct inode *ext3_orphan_get(struct super_block *sb, unsigned long ino) /* Error cases - e2fsck has already cleaned up for us */ if (ino > max_ino) { - ext3_warning(sb, __FUNCTION__, + ext3_warning(sb, __func__, "bad orphan ino %lu! e2fsck was run?", ino); goto error; } @@ -653,7 +587,7 @@ struct inode *ext3_orphan_get(struct super_block *sb, unsigned long ino) bit = (ino - 1) % EXT3_INODES_PER_GROUP(sb); bitmap_bh = read_inode_bitmap(sb, block_group); if (!bitmap_bh) { - ext3_warning(sb, __FUNCTION__, + ext3_warning(sb, __func__, "inode bitmap error for orphan %lu", ino); goto error; } @@ -669,6 +603,14 @@ struct inode *ext3_orphan_get(struct super_block *sb, unsigned long ino) if (IS_ERR(inode)) goto iget_failed; + /* + * If the orphans has i_nlinks > 0 then it should be able to be + * truncated, otherwise it won't be removed from the orphan list + * during processing and an infinite loop will result. + */ + if (inode->i_nlink && !ext3_can_truncate(inode)) + goto bad_orphan; + if (NEXT_ORPHAN(inode) > max_ino) goto bad_orphan; brelse(bitmap_bh); @@ -678,7 +620,7 @@ iget_failed: err = PTR_ERR(inode); inode = NULL; bad_orphan: - ext3_warning(sb, __FUNCTION__, + ext3_warning(sb, __func__, "bad orphan inode %lu! e2fsck was run?", ino); printk(KERN_NOTICE "ext3_test_bit(bit=%d, block=%llu) = %d\n", bit, (unsigned long long)bitmap_bh->b_blocknr, @@ -690,6 +632,7 @@ bad_orphan: printk(KERN_NOTICE "NEXT_ORPHAN(inode)=%u\n", NEXT_ORPHAN(inode)); printk(KERN_NOTICE "max_ino=%lu\n", max_ino); + printk(KERN_NOTICE "i_nlink=%u\n", inode->i_nlink); /* Avoid freeing blocks if we got a bad deleted inode */ if (inode->i_nlink == 0) inode->i_blocks = 0; diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index eb95670a27e..2c6ccc49ba2 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -22,24 +22,18 @@ * Assorted race fixes, rewrite of ext3_get_block() by Al Viro, 2000 */ -#include <linux/module.h> -#include <linux/fs.h> -#include <linux/time.h> -#include <linux/ext3_jbd.h> -#include <linux/jbd.h> #include <linux/highuid.h> -#include <linux/pagemap.h> #include <linux/quotaops.h> -#include <linux/string.h> -#include <linux/buffer_head.h> #include <linux/writeback.h> #include <linux/mpage.h> -#include <linux/uio.h> -#include <linux/bio.h> +#include <linux/namei.h> +#include <linux/aio.h> +#include "ext3.h" #include "xattr.h" #include "acl.h" static int ext3_writepage_trans_blocks(struct inode *inode); +static int ext3_block_truncate_page(struct inode *inode, loff_t from); /* * Test whether an inode is a fast symlink. @@ -68,6 +62,7 @@ int ext3_forget(handle_t *handle, int is_metadata, struct inode *inode, might_sleep(); + trace_ext3_forget(inode, is_metadata, blocknr); BUFFER_TRACE(bh, "enter"); jbd_debug(4, "forgetting bh %p: is_metadata = %d, mode %o, " @@ -95,7 +90,7 @@ int ext3_forget(handle_t *handle, int is_metadata, struct inode *inode, BUFFER_TRACE(bh, "call ext3_journal_revoke"); err = ext3_journal_revoke(handle, blocknr, bh); if (err) - ext3_abort(inode->i_sb, __FUNCTION__, + ext3_abort(inode->i_sb, __func__, "error %d when attempting revoke", err); BUFFER_TRACE(bh, "exit"); return err; @@ -170,22 +165,78 @@ static int try_to_extend_transaction(handle_t *handle, struct inode *inode) * so before we call here everything must be consistently dirtied against * this transaction. */ -static int ext3_journal_test_restart(handle_t *handle, struct inode *inode) +static int truncate_restart_transaction(handle_t *handle, struct inode *inode) { + int ret; + jbd_debug(2, "restarting handle %p\n", handle); - return ext3_journal_restart(handle, blocks_for_truncate(inode)); + /* + * Drop truncate_mutex to avoid deadlock with ext3_get_blocks_handle + * At this moment, get_block can be called only for blocks inside + * i_size since page cache has been already dropped and writes are + * blocked by i_mutex. So we can safely drop the truncate_mutex. + */ + mutex_unlock(&EXT3_I(inode)->truncate_mutex); + ret = ext3_journal_restart(handle, blocks_for_truncate(inode)); + mutex_lock(&EXT3_I(inode)->truncate_mutex); + return ret; } /* - * Called at the last iput() if i_nlink is zero. + * Called at inode eviction from icache */ -void ext3_delete_inode (struct inode * inode) +void ext3_evict_inode (struct inode *inode) { + struct ext3_inode_info *ei = EXT3_I(inode); + struct ext3_block_alloc_info *rsv; handle_t *handle; + int want_delete = 0; + + trace_ext3_evict_inode(inode); + if (!inode->i_nlink && !is_bad_inode(inode)) { + dquot_initialize(inode); + want_delete = 1; + } + + /* + * When journalling data dirty buffers are tracked only in the journal. + * So although mm thinks everything is clean and ready for reaping the + * inode might still have some pages to write in the running + * transaction or waiting to be checkpointed. Thus calling + * journal_invalidatepage() (via truncate_inode_pages()) to discard + * these buffers can cause data loss. Also even if we did not discard + * these buffers, we would have no way to find them after the inode + * is reaped and thus user could see stale data if he tries to read + * them before the transaction is checkpointed. So be careful and + * force everything to disk here... We use ei->i_datasync_tid to + * store the newest transaction containing inode's data. + * + * Note that directories do not have this problem because they don't + * use page cache. + * + * The s_journal check handles the case when ext3_get_journal() fails + * and puts the journal inode. + */ + if (inode->i_nlink && ext3_should_journal_data(inode) && + EXT3_SB(inode->i_sb)->s_journal && + (S_ISLNK(inode->i_mode) || S_ISREG(inode->i_mode)) && + inode->i_ino != EXT3_JOURNAL_INO) { + tid_t commit_tid = atomic_read(&ei->i_datasync_tid); + journal_t *journal = EXT3_SB(inode->i_sb)->s_journal; + + log_start_commit(journal, commit_tid); + log_wait_commit(journal, commit_tid); + filemap_write_and_wait(&inode->i_data); + } + truncate_inode_pages_final(&inode->i_data); - truncate_inode_pages(&inode->i_data, 0); + ext3_discard_reservation(inode); + rsv = ei->i_block_alloc_info; + ei->i_block_alloc_info = NULL; + if (unlikely(rsv)) + kfree(rsv); - if (is_bad_inode(inode)) + if (!want_delete) goto no_delete; handle = start_transaction(inode); @@ -205,15 +256,13 @@ void ext3_delete_inode (struct inode * inode) if (inode->i_blocks) ext3_truncate(inode); /* - * Kill off the orphan record which ext3_truncate created. - * AKPM: I think this can be inside the above `if'. - * Note that ext3_orphan_del() has to be able to cope with the - * deletion of a non-existent orphan - this is because we don't - * know if ext3_truncate() actually created an orphan record. - * (Well, we could do this if we need to, but heck - it works) + * Kill off the orphan record created when the inode lost the last + * link. Note that ext3_orphan_del() has to be able to cope with the + * deletion of a non-existent orphan - ext3_truncate() could + * have removed the record. */ ext3_orphan_del(handle, inode); - EXT3_I(inode)->i_dtime = get_seconds(); + ei->i_dtime = get_seconds(); /* * One subtle ordering requirement: if anything has gone wrong @@ -222,15 +271,22 @@ void ext3_delete_inode (struct inode * inode) * having errors), but we can't free the inode if the mark_dirty * fails. */ - if (ext3_mark_inode_dirty(handle, inode)) - /* If that failed, just do the required in-core inode clear. */ + if (ext3_mark_inode_dirty(handle, inode)) { + /* If that failed, just dquot_drop() and be done with that */ + dquot_drop(inode); + clear_inode(inode); + } else { + ext3_xattr_delete_inode(handle, inode); + dquot_free_inode(inode); + dquot_drop(inode); clear_inode(inode); - else ext3_free_inode(handle, inode); + } ext3_journal_stop(handle); return; no_delete: - clear_inode(inode); /* We must guarantee clearing of inode... */ + clear_inode(inode); + dquot_drop(inode); } typedef struct { @@ -392,7 +448,7 @@ no_block: * @inode: owner * @ind: descriptor of indirect block. * - * This function returns the prefered place for block allocation. + * This function returns the preferred place for block allocation. * It is used when heuristic for sequential allocation fails. * Rules are: * + if there is a block to the left of our position - allocate near it. @@ -436,12 +492,12 @@ static ext3_fsblk_t ext3_find_near(struct inode *inode, Indirect *ind) } /** - * ext3_find_goal - find a prefered place for allocation. + * ext3_find_goal - find a preferred place for allocation. * @inode: owner * @block: block we want * @partial: pointer to the last triple within a chain * - * Normally this function find the prefered place for block allocation, + * Normally this function find the preferred place for block allocation, * returns it. */ @@ -465,7 +521,7 @@ static ext3_fsblk_t ext3_find_goal(struct inode *inode, long block, } /** - * ext3_blks_to_allocate: Look up the block map and count the number + * ext3_blks_to_allocate - Look up the block map and count the number * of direct blocks need to be allocated for the given branch. * * @branch: chain of indirect blocks @@ -503,14 +559,18 @@ static int ext3_blks_to_allocate(Indirect *branch, int k, unsigned long blks, } /** - * ext3_alloc_blocks: multiple allocate blocks needed for a branch + * ext3_alloc_blocks - multiple allocate blocks needed for a branch + * @handle: handle for this transaction + * @inode: owner + * @goal: preferred place for allocation * @indirect_blks: the number of blocks need to allocate for indirect * blocks - * + * @blks: number of blocks need to allocated for direct blocks * @new_blocks: on return it will store the new block numbers for * the indirect blocks(if needed) and the first direct block, - * @blks: on return it will store the total number of allocated - * direct blocks + * @err: here we store the error value + * + * return the number of direct blocks allocated */ static int ext3_alloc_blocks(handle_t *handle, struct inode *inode, ext3_fsblk_t goal, int indirect_blks, int blks, @@ -565,9 +625,11 @@ failed_out: /** * ext3_alloc_branch - allocate and set up a chain of blocks. + * @handle: handle for this transaction * @inode: owner * @indirect_blks: number of allocated indirect blocks * @blks: number of allocated direct blocks + * @goal: preferred place for allocation * @offsets: offsets (in the blocks) to store the pointers to next. * @branch: place to store the chain in. * @@ -616,6 +678,10 @@ static int ext3_alloc_branch(handle_t *handle, struct inode *inode, * parent to disk. */ bh = sb_getblk(inode->i_sb, new_blocks[n-1]); + if (unlikely(!bh)) { + err = -ENOMEM; + goto failed; + } branch[n].bh = bh; lock_buffer(bh); BUFFER_TRACE(bh, "call get_create_access"); @@ -657,7 +723,7 @@ failed: BUFFER_TRACE(branch[i].bh, "call journal_forget"); ext3_journal_forget(handle, branch[i].bh); } - for (i = 0; i <indirect_blks; i++) + for (i = 0; i < indirect_blks; i++) ext3_free_blocks(handle, inode, new_blocks[i], 1); ext3_free_blocks(handle, inode, new_blocks[i], num); @@ -667,10 +733,9 @@ failed: /** * ext3_splice_branch - splice the allocated branch onto inode. + * @handle: handle for this transaction * @inode: owner * @block: (logical) number of block we are adding - * @chain: chain of indirect blocks (with a missing link - see - * ext3_alloc_branch) * @where: location of missing link * @num: number of indirect blocks we are adding * @blks: number of direct blocks we are adding @@ -686,8 +751,10 @@ static int ext3_splice_branch(handle_t *handle, struct inode *inode, int err = 0; struct ext3_block_alloc_info *block_i; ext3_fsblk_t current_block; + struct ext3_inode_info *ei = EXT3_I(inode); + struct timespec now; - block_i = EXT3_I(inode)->i_block_alloc_info; + block_i = ei->i_block_alloc_info; /* * If we're splicing into a [td]indirect block (as opposed to the * inode) then we need to get write access to the [td]indirect block @@ -725,9 +792,13 @@ static int ext3_splice_branch(handle_t *handle, struct inode *inode, } /* We are done with atomic stuff, now do the rest of housekeeping */ - - inode->i_ctime = CURRENT_TIME_SEC; - ext3_mark_inode_dirty(handle, inode); + now = CURRENT_TIME_SEC; + if (!timespec_equal(&inode->i_ctime, &now) || !where->bh) { + inode->i_ctime = now; + ext3_mark_inode_dirty(handle, inode); + } + /* ext3_mark_inode_dirty already updated i_sync_tid */ + atomic_set(&ei->i_datasync_tid, handle->h_transaction->t_tid); /* had we spliced it onto indirect block? */ if (where->bh) { @@ -786,7 +857,7 @@ err_out: int ext3_get_blocks_handle(handle_t *handle, struct inode *inode, sector_t iblock, unsigned long maxblocks, struct buffer_head *bh_result, - int create, int extend_disksize) + int create) { int err = -EIO; int offsets[4]; @@ -801,6 +872,7 @@ int ext3_get_blocks_handle(handle_t *handle, struct inode *inode, ext3_fsblk_t first_block = 0; + trace_ext3_get_blocks_enter(inode, iblock, maxblocks, create); J_ASSERT(handle != NULL || create == 0); depth = ext3_block_to_path(inode,iblock,offsets,&blocks_to_boundary); @@ -818,7 +890,7 @@ int ext3_get_blocks_handle(handle_t *handle, struct inode *inode, while (count < maxblocks && count <= blocks_to_boundary) { ext3_fsblk_t blk; - if (!verify_chain(chain, partial)) { + if (!verify_chain(chain, chain + depth - 1)) { /* * Indirect block might be removed by * truncate while we were reading it. @@ -845,6 +917,9 @@ int ext3_get_blocks_handle(handle_t *handle, struct inode *inode, if (!create || err == -EIO) goto cleanup; + /* + * Block out ext3_truncate while we alter the tree + */ mutex_lock(&ei->truncate_mutex); /* @@ -893,9 +968,6 @@ int ext3_get_blocks_handle(handle_t *handle, struct inode *inode, */ count = ext3_blks_to_allocate(partial, indirect_blks, maxblocks, blocks_to_boundary); - /* - * Block out ext3_truncate while we alter the tree - */ err = ext3_alloc_branch(handle, inode, indirect_blks, &count, goal, offsets + (partial - chain), partial); @@ -909,13 +981,6 @@ int ext3_get_blocks_handle(handle_t *handle, struct inode *inode, if (!err) err = ext3_splice_branch(handle, inode, iblock, partial, indirect_blks, count); - /* - * i_disksize growing is protected by truncate_mutex. Don't forget to - * protect it if you're about to implement concurrent - * ext3_get_block() -bzzz - */ - if (!err && extend_disksize && inode->i_size > ei->i_disksize) - ei->i_disksize = inode->i_size; mutex_unlock(&ei->truncate_mutex); if (err) goto cleanup; @@ -936,6 +1001,9 @@ cleanup: } BUFFER_TRACE(bh_result, "returned"); out: + trace_ext3_get_blocks_exit(inode, iblock, + depth ? le32_to_cpu(chain[depth-1].key) : 0, + count, err); return err; } @@ -961,7 +1029,7 @@ static int ext3_get_block(struct inode *inode, sector_t iblock, if (max_blocks > DIO_MAX_BLOCKS) max_blocks = DIO_MAX_BLOCKS; handle = ext3_journal_start(inode, DIO_CREDITS + - 2 * EXT3_QUOTA_TRANS_BLOCKS(inode->i_sb)); + EXT3_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb)); if (IS_ERR(handle)) { ret = PTR_ERR(handle); goto out; @@ -970,7 +1038,7 @@ static int ext3_get_block(struct inode *inode, sector_t iblock, } ret = ext3_get_blocks_handle(handle, inode, iblock, - max_blocks, bh_result, create, 0); + max_blocks, bh_result, create); if (ret > 0) { bh_result->b_size = (ret << inode->i_blkbits); ret = 0; @@ -981,6 +1049,13 @@ out: return ret; } +int ext3_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, + u64 start, u64 len) +{ + return generic_block_fiemap(inode, fieinfo, start, len, + ext3_get_block); +} + /* * `handle' can be NULL if create is zero */ @@ -996,22 +1071,21 @@ struct buffer_head *ext3_getblk(handle_t *handle, struct inode *inode, dummy.b_blocknr = -1000; buffer_trace_init(&dummy.b_history); err = ext3_get_blocks_handle(handle, inode, block, 1, - &dummy, create, 1); + &dummy, create); /* * ext3_get_blocks_handle() returns number of blocks * mapped. 0 in case of a HOLE. */ if (err > 0) { - if (err > 1) - WARN_ON(1); + WARN_ON(err > 1); err = 0; } *errp = err; if (!err && buffer_mapped(&dummy)) { struct buffer_head *bh; bh = sb_getblk(inode->i_sb, dummy.b_blocknr); - if (!bh) { - *errp = -EIO; + if (unlikely(!bh)) { + *errp = -ENOMEM; goto err; } if (buffer_new(&dummy)) { @@ -1059,9 +1133,11 @@ struct buffer_head *ext3_bread(handle_t *handle, struct inode *inode, bh = ext3_getblk(handle, inode, block, create, err); if (!bh) return bh; - if (buffer_uptodate(bh)) + if (bh_uptodate_or_lock(bh)) return bh; - ll_rw_block(READ_META, 1, &bh); + get_bh(bh); + bh->b_end_io = end_buffer_read_sync; + submit_bh(READ | REQ_META | REQ_PRIO, bh); wait_on_buffer(bh); if (buffer_uptodate(bh)) return bh; @@ -1130,9 +1206,45 @@ static int walk_page_buffers( handle_t *handle, static int do_journal_get_write_access(handle_t *handle, struct buffer_head *bh) { + int dirty = buffer_dirty(bh); + int ret; + if (!buffer_mapped(bh) || buffer_freed(bh)) return 0; - return ext3_journal_get_write_access(handle, bh); + /* + * __block_prepare_write() could have dirtied some buffers. Clean + * the dirty bit as jbd2_journal_get_write_access() could complain + * otherwise about fs integrity issues. Setting of the dirty bit + * by __block_prepare_write() isn't a real problem here as we clear + * the bit before releasing a page lock and thus writeback cannot + * ever write the buffer. + */ + if (dirty) + clear_buffer_dirty(bh); + ret = ext3_journal_get_write_access(handle, bh); + if (!ret && dirty) + ret = ext3_journal_dirty_metadata(handle, bh); + return ret; +} + +/* + * Truncate blocks that were not used by write. We have to truncate the + * pagecache as well so that corresponding buffers get properly unmapped. + */ +static void ext3_truncate_failed_write(struct inode *inode) +{ + truncate_inode_pages(inode->i_mapping, inode->i_size); + ext3_truncate(inode); +} + +/* + * Truncate blocks that were not used by direct IO write. We have to zero out + * the last file block as well because direct IO might have written to it. + */ +static void ext3_truncate_failed_direct_write(struct inode *inode) +{ + ext3_block_truncate_page(inode, inode->i_size); + ext3_truncate(inode); } static int ext3_write_begin(struct file *file, struct address_space *mapping, @@ -1140,19 +1252,24 @@ static int ext3_write_begin(struct file *file, struct address_space *mapping, struct page **pagep, void **fsdata) { struct inode *inode = mapping->host; - int ret, needed_blocks = ext3_writepage_trans_blocks(inode); + int ret; handle_t *handle; int retries = 0; struct page *page; pgoff_t index; unsigned from, to; + /* Reserve one block more for addition to orphan list in case + * we allocate blocks but write fails for some reason */ + int needed_blocks = ext3_writepage_trans_blocks(inode) + 1; + + trace_ext3_write_begin(inode, pos, len, flags); index = pos >> PAGE_CACHE_SHIFT; from = pos & (PAGE_CACHE_SIZE - 1); to = from + len; retry: - page = __grab_cache_page(mapping, index); + page = grab_cache_page_write_begin(mapping, index, flags); if (!page) return -ENOMEM; *pagep = page; @@ -1164,8 +1281,7 @@ retry: ret = PTR_ERR(handle); goto out; } - ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata, - ext3_get_block); + ret = __block_write_begin(page, pos, len, ext3_get_block); if (ret) goto write_begin_failed; @@ -1175,9 +1291,22 @@ retry: } write_begin_failed: if (ret) { + /* + * block_write_begin may have instantiated a few blocks + * outside i_size. Trim these off again. Don't need + * i_size_read because we hold i_mutex. + * + * Add inode to orphan list in case we crash before truncate + * finishes. Do this only if ext3_can_truncate() agrees so + * that orphan processing code is happy. + */ + if (pos + len > inode->i_size && ext3_can_truncate(inode)) + ext3_orphan_add(handle, inode); ext3_journal_stop(handle); unlock_page(page); page_cache_release(page); + if (pos + len > inode->i_size) + ext3_truncate_failed_write(inode); } if (ret == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries)) goto retry; @@ -1190,11 +1319,23 @@ int ext3_journal_dirty_data(handle_t *handle, struct buffer_head *bh) { int err = journal_dirty_data(handle, bh); if (err) - ext3_journal_abort_handle(__FUNCTION__, __FUNCTION__, + ext3_journal_abort_handle(__func__, __func__, bh, handle, err); return err; } +/* For ordered writepage and write_end functions */ +static int journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh) +{ + /* + * Write could have mapped the buffer but it didn't copy the data in + * yet. So avoid filing such buffer into a transaction. + */ + if (buffer_mapped(bh) && buffer_uptodate(bh)) + return ext3_journal_dirty_data(handle, bh); + return 0; +} + /* For write_end() in data=journal mode */ static int write_end_fn(handle_t *handle, struct buffer_head *bh) { @@ -1205,26 +1346,20 @@ static int write_end_fn(handle_t *handle, struct buffer_head *bh) } /* - * Generic write_end handler for ordered and writeback ext3 journal modes. - * We can't use generic_write_end, because that unlocks the page and we need to - * unlock the page after ext3_journal_stop, but ext3_journal_stop must run - * after block_write_end. + * This is nasty and subtle: ext3_write_begin() could have allocated blocks + * for the whole page but later we failed to copy the data in. Update inode + * size according to what we managed to copy. The rest is going to be + * truncated in write_end function. */ -static int ext3_generic_write_end(struct file *file, - struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) +static void update_file_sizes(struct inode *inode, loff_t pos, unsigned copied) { - struct inode *inode = file->f_mapping->host; - - copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); - - if (pos+copied > inode->i_size) { - i_size_write(inode, pos+copied); + /* What matters to us is i_disksize. We don't write i_size anywhere */ + if (pos + copied > inode->i_size) + i_size_write(inode, pos + copied); + if (pos + copied > EXT3_I(inode)->i_disksize) { + EXT3_I(inode)->i_disksize = pos + copied; mark_inode_dirty(inode); } - - return copied; } /* @@ -1244,34 +1379,30 @@ static int ext3_ordered_write_end(struct file *file, unsigned from, to; int ret = 0, ret2; - from = pos & (PAGE_CACHE_SIZE - 1); - to = from + len; + trace_ext3_ordered_write_end(inode, pos, len, copied); + copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); + from = pos & (PAGE_CACHE_SIZE - 1); + to = from + copied; ret = walk_page_buffers(handle, page_buffers(page), - from, to, NULL, ext3_journal_dirty_data); - - if (ret == 0) { - /* - * generic_write_end() will run mark_inode_dirty() if i_size - * changes. So let's piggyback the i_disksize mark_inode_dirty - * into that. - */ - loff_t new_i_size; + from, to, NULL, journal_dirty_data_fn); - new_i_size = pos + copied; - if (new_i_size > EXT3_I(inode)->i_disksize) - EXT3_I(inode)->i_disksize = new_i_size; - copied = ext3_generic_write_end(file, mapping, pos, len, copied, - page, fsdata); - if (copied < 0) - ret = copied; - } + if (ret == 0) + update_file_sizes(inode, pos, copied); + /* + * There may be allocated blocks outside of i_size because + * we failed to copy some data. Prepare for truncate. + */ + if (pos + len > inode->i_size && ext3_can_truncate(inode)) + ext3_orphan_add(handle, inode); ret2 = ext3_journal_stop(handle); if (!ret) ret = ret2; unlock_page(page); page_cache_release(page); + if (pos + len > inode->i_size) + ext3_truncate_failed_write(inode); return ret ? ret : copied; } @@ -1282,24 +1413,23 @@ static int ext3_writeback_write_end(struct file *file, { handle_t *handle = ext3_journal_current_handle(); struct inode *inode = file->f_mapping->host; - int ret = 0, ret2; - loff_t new_i_size; - - new_i_size = pos + copied; - if (new_i_size > EXT3_I(inode)->i_disksize) - EXT3_I(inode)->i_disksize = new_i_size; - - copied = ext3_generic_write_end(file, mapping, pos, len, copied, - page, fsdata); - if (copied < 0) - ret = copied; + int ret; - ret2 = ext3_journal_stop(handle); - if (!ret) - ret = ret2; + trace_ext3_writeback_write_end(inode, pos, len, copied); + copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); + update_file_sizes(inode, pos, copied); + /* + * There may be allocated blocks outside of i_size because + * we failed to copy some data. Prepare for truncate. + */ + if (pos + len > inode->i_size && ext3_can_truncate(inode)) + ext3_orphan_add(handle, inode); + ret = ext3_journal_stop(handle); unlock_page(page); page_cache_release(page); + if (pos + len > inode->i_size) + ext3_truncate_failed_write(inode); return ret ? ret : copied; } @@ -1310,28 +1440,39 @@ static int ext3_journalled_write_end(struct file *file, { handle_t *handle = ext3_journal_current_handle(); struct inode *inode = mapping->host; + struct ext3_inode_info *ei = EXT3_I(inode); int ret = 0, ret2; int partial = 0; unsigned from, to; + trace_ext3_journalled_write_end(inode, pos, len, copied); from = pos & (PAGE_CACHE_SIZE - 1); to = from + len; if (copied < len) { if (!PageUptodate(page)) copied = 0; - page_zero_new_buffers(page, from+copied, to); + page_zero_new_buffers(page, from + copied, to); + to = from + copied; } ret = walk_page_buffers(handle, page_buffers(page), from, to, &partial, write_end_fn); if (!partial) SetPageUptodate(page); - if (pos+copied > inode->i_size) - i_size_write(inode, pos+copied); - EXT3_I(inode)->i_state |= EXT3_STATE_JDATA; - if (inode->i_size > EXT3_I(inode)->i_disksize) { - EXT3_I(inode)->i_disksize = inode->i_size; + + if (pos + copied > inode->i_size) + i_size_write(inode, pos + copied); + /* + * There may be allocated blocks outside of i_size because + * we failed to copy some data. Prepare for truncate. + */ + if (pos + len > inode->i_size && ext3_can_truncate(inode)) + ext3_orphan_add(handle, inode); + ext3_set_inode_state(inode, EXT3_STATE_JDATA); + atomic_set(&ei->i_datasync_tid, handle->h_transaction->t_tid); + if (inode->i_size > ei->i_disksize) { + ei->i_disksize = inode->i_size; ret2 = ext3_mark_inode_dirty(handle, inode); if (!ret) ret = ret2; @@ -1343,6 +1484,8 @@ static int ext3_journalled_write_end(struct file *file, unlock_page(page); page_cache_release(page); + if (pos + len > inode->i_size) + ext3_truncate_failed_write(inode); return ret ? ret : copied; } @@ -1366,7 +1509,7 @@ static sector_t ext3_bmap(struct address_space *mapping, sector_t block) journal_t *journal; int err; - if (EXT3_I(inode)->i_state & EXT3_STATE_JDATA) { + if (ext3_test_inode_state(inode, EXT3_STATE_JDATA)) { /* * This is a REALLY heavyweight approach, but the use of * bmap on dirty files is expected to be extremely rare: @@ -1385,7 +1528,7 @@ static sector_t ext3_bmap(struct address_space *mapping, sector_t block) * everything they get. */ - EXT3_I(inode)->i_state &= ~EXT3_STATE_JDATA; + ext3_clear_inode_state(inode, EXT3_STATE_JDATA); journal = EXT3_JOURNAL(inode); journal_lock_updates(journal); err = journal_flush(journal); @@ -1410,64 +1553,23 @@ static int bput_one(handle_t *handle, struct buffer_head *bh) return 0; } -static int journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh) +static int buffer_unmapped(handle_t *handle, struct buffer_head *bh) { - if (buffer_mapped(bh)) - return ext3_journal_dirty_data(handle, bh); - return 0; + return !buffer_mapped(bh); } /* - * Note that we always start a transaction even if we're not journalling - * data. This is to preserve ordering: any hole instantiation within - * __block_write_full_page -> ext3_get_block() should be journalled - * along with the data so we don't crash and then get metadata which + * Note that whenever we need to map blocks we start a transaction even if + * we're not journalling data. This is to preserve ordering: any hole + * instantiation within __block_write_full_page -> ext3_get_block() should be + * journalled along with the data so we don't crash and then get metadata which * refers to old data. * * In all journalling modes block_write_full_page() will start the I/O. * - * Problem: - * - * ext3_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() -> - * ext3_writepage() - * - * Similar for: - * - * ext3_file_write() -> generic_file_write() -> __alloc_pages() -> ... - * - * Same applies to ext3_get_block(). We will deadlock on various things like - * lock_journal and i_truncate_mutex. - * - * Setting PF_MEMALLOC here doesn't work - too many internal memory - * allocations fail. - * - * 16May01: If we're reentered then journal_current_handle() will be - * non-zero. We simply *return*. - * - * 1 July 2001: @@@ FIXME: - * In journalled data mode, a data buffer may be metadata against the - * current transaction. But the same file is part of a shared mapping - * and someone does a writepage() on it. - * - * We will move the buffer onto the async_data list, but *after* it has - * been dirtied. So there's a small window where we have dirty data on - * BJ_Metadata. - * - * Note that this only applies to the last partial page in the file. The - * bit which block_write_full_page() uses prepare/commit for. (That's - * broken code anyway: it's wrong for msync()). - * - * It's a rare case: affects the final partial page, for journalled data - * where the file is subject to bith write() and writepage() in the same - * transction. To fix it we'll need a custom block_write_full_page(). - * We'll probably need that anyway for journalling writepage() output. - * * We don't honour synchronous mounts for writepage(). That would be * disastrous. Any write() or metadata operation will sync the fs for * us. - * - * AKPM2: if all the page's buffers are mapped to disk and !data=journal, - * we don't need to open a transaction here. */ static int ext3_ordered_writepage(struct page *page, struct writeback_control *wbc) @@ -1479,6 +1581,13 @@ static int ext3_ordered_writepage(struct page *page, int err; J_ASSERT(PageLocked(page)); + /* + * We don't want to warn for emergency remount. The condition is + * ordered to avoid dereferencing inode->i_sb in non-error case to + * avoid slow-downs. + */ + WARN_ON_ONCE(IS_RDONLY(inode) && + !(EXT3_SB(inode->i_sb)->s_mount_state & EXT3_ERROR_FS)); /* * We give up here if we're reentered, because it might be for a @@ -1487,6 +1596,20 @@ static int ext3_ordered_writepage(struct page *page, if (ext3_journal_current_handle()) goto out_fail; + trace_ext3_ordered_writepage(page); + if (!page_has_buffers(page)) { + create_empty_buffers(page, inode->i_sb->s_blocksize, + (1 << BH_Dirty)|(1 << BH_Uptodate)); + page_bufs = page_buffers(page); + } else { + page_bufs = page_buffers(page); + if (!walk_page_buffers(NULL, page_bufs, 0, PAGE_CACHE_SIZE, + NULL, buffer_unmapped)) { + /* Provide NULL get_block() to catch bugs if buffers + * weren't really mapped */ + return block_write_full_page(page, NULL, wbc); + } + } handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode)); if (IS_ERR(handle)) { @@ -1494,11 +1617,6 @@ static int ext3_ordered_writepage(struct page *page, goto out_fail; } - if (!page_has_buffers(page)) { - create_empty_buffers(page, inode->i_sb->s_blocksize, - (1 << BH_Dirty)|(1 << BH_Uptodate)); - } - page_bufs = page_buffers(page); walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE, NULL, bget_one); @@ -1516,12 +1634,9 @@ static int ext3_ordered_writepage(struct page *page, * block_write_full_page() succeeded. Otherwise they are unmapped, * and generally junk. */ - if (ret == 0) { - err = walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE, + if (ret == 0) + ret = walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE, NULL, journal_dirty_data_fn); - if (!ret) - ret = err; - } walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE, NULL, bput_one); err = ext3_journal_stop(handle); @@ -1543,19 +1658,35 @@ static int ext3_writeback_writepage(struct page *page, int ret = 0; int err; + J_ASSERT(PageLocked(page)); + /* + * We don't want to warn for emergency remount. The condition is + * ordered to avoid dereferencing inode->i_sb in non-error case to + * avoid slow-downs. + */ + WARN_ON_ONCE(IS_RDONLY(inode) && + !(EXT3_SB(inode->i_sb)->s_mount_state & EXT3_ERROR_FS)); + if (ext3_journal_current_handle()) goto out_fail; + trace_ext3_writeback_writepage(page); + if (page_has_buffers(page)) { + if (!walk_page_buffers(NULL, page_buffers(page), 0, + PAGE_CACHE_SIZE, NULL, buffer_unmapped)) { + /* Provide NULL get_block() to catch bugs if buffers + * weren't really mapped */ + return block_write_full_page(page, NULL, wbc); + } + } + handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode)); if (IS_ERR(handle)) { ret = PTR_ERR(handle); goto out_fail; } - if (test_opt(inode->i_sb, NOBH) && ext3_should_writeback_data(inode)) - ret = nobh_writepage(page, ext3_get_block, wbc); - else - ret = block_write_full_page(page, ext3_get_block, wbc); + ret = block_write_full_page(page, ext3_get_block, wbc); err = ext3_journal_stop(handle); if (!ret) @@ -1576,23 +1707,33 @@ static int ext3_journalled_writepage(struct page *page, int ret = 0; int err; - if (ext3_journal_current_handle()) - goto no_write; - - handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode)); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - goto no_write; - } + J_ASSERT(PageLocked(page)); + /* + * We don't want to warn for emergency remount. The condition is + * ordered to avoid dereferencing inode->i_sb in non-error case to + * avoid slow-downs. + */ + WARN_ON_ONCE(IS_RDONLY(inode) && + !(EXT3_SB(inode->i_sb)->s_mount_state & EXT3_ERROR_FS)); + trace_ext3_journalled_writepage(page); if (!page_has_buffers(page) || PageChecked(page)) { + if (ext3_journal_current_handle()) + goto no_write; + + handle = ext3_journal_start(inode, + ext3_writepage_trans_blocks(inode)); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + goto no_write; + } /* * It's mmapped pagecache. Add buffers and journal it. There * doesn't seem much point in redirtying the page here. */ ClearPageChecked(page); - ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE, - ext3_get_block); + ret = __block_write_begin(page, 0, PAGE_CACHE_SIZE, + ext3_get_block); if (ret != 0) { ext3_journal_stop(handle); goto out_unlock; @@ -1604,19 +1745,22 @@ static int ext3_journalled_writepage(struct page *page, PAGE_CACHE_SIZE, NULL, write_end_fn); if (ret == 0) ret = err; - EXT3_I(inode)->i_state |= EXT3_STATE_JDATA; + ext3_set_inode_state(inode, EXT3_STATE_JDATA); + atomic_set(&EXT3_I(inode)->i_datasync_tid, + handle->h_transaction->t_tid); unlock_page(page); + err = ext3_journal_stop(handle); + if (!ret) + ret = err; } else { /* - * It may be a page full of checkpoint-mode buffers. We don't - * really know unless we go poke around in the buffer_heads. - * But block_write_full_page will do the right thing. + * It is a page full of checkpoint-mode buffers. Go and write + * them. They should have been already mapped when they went + * to the journal so provide NULL get_block function to catch + * errors. */ - ret = block_write_full_page(page, ext3_get_block, wbc); + ret = block_write_full_page(page, NULL, wbc); } - err = ext3_journal_stop(handle); - if (!ret) - ret = err; out: return ret; @@ -1629,6 +1773,7 @@ out_unlock: static int ext3_readpage(struct file *file, struct page *page) { + trace_ext3_readpage(page); return mpage_readpage(page, ext3_get_block); } @@ -1639,23 +1784,27 @@ ext3_readpages(struct file *file, struct address_space *mapping, return mpage_readpages(mapping, pages, nr_pages, ext3_get_block); } -static void ext3_invalidatepage(struct page *page, unsigned long offset) +static void ext3_invalidatepage(struct page *page, unsigned int offset, + unsigned int length) { journal_t *journal = EXT3_JOURNAL(page->mapping->host); + trace_ext3_invalidatepage(page, offset, length); + /* * If it's a full truncate we just forget about the pending dirtying */ - if (offset == 0) + if (offset == 0 && length == PAGE_CACHE_SIZE) ClearPageChecked(page); - journal_invalidatepage(journal, page, offset); + journal_invalidatepage(journal, page, offset, length); } static int ext3_releasepage(struct page *page, gfp_t wait) { journal_t *journal = EXT3_JOURNAL(page->mapping->host); + trace_ext3_releasepage(page); WARN_ON(PageChecked(page)); if (!page_has_buffers(page)) return 0; @@ -1672,8 +1821,7 @@ static int ext3_releasepage(struct page *page, gfp_t wait) * VFS code falls back into buffered path in that case so we are safe. */ static ssize_t ext3_direct_IO(int rw, struct kiocb *iocb, - const struct iovec *iov, loff_t offset, - unsigned long nr_segs) + struct iov_iter *iter, loff_t offset) { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; @@ -1681,7 +1829,10 @@ static ssize_t ext3_direct_IO(int rw, struct kiocb *iocb, handle_t *handle; ssize_t ret; int orphan = 0; - size_t count = iov_length(iov, nr_segs); + size_t count = iov_iter_count(iter); + int retries = 0; + + trace_ext3_direct_IO_enter(inode, offset, count, rw); if (rw == WRITE) { loff_t final_size = offset + count; @@ -1704,9 +1855,21 @@ static ssize_t ext3_direct_IO(int rw, struct kiocb *iocb, } } - ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, - offset, nr_segs, - ext3_get_block, NULL); +retry: + ret = blockdev_direct_IO(rw, iocb, inode, iter, offset, ext3_get_block); + /* + * In case of error extending write may have instantiated a few + * blocks outside i_size. Trim these off again. + */ + if (unlikely((rw & WRITE) && ret < 0)) { + loff_t isize = i_size_read(inode); + loff_t end = offset + count; + + if (end > isize) + ext3_truncate_failed_direct_write(inode); + } + if (ret == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries)) + goto retry; if (orphan) { int err; @@ -1715,9 +1878,12 @@ static ssize_t ext3_direct_IO(int rw, struct kiocb *iocb, handle = ext3_journal_start(inode, 2); if (IS_ERR(handle)) { /* This is really bad luck. We've written the data - * but cannot extend i_size. Bail out and pretend - * the write failed... */ + * but cannot extend i_size. Truncate allocated blocks + * and pretend the write failed... */ + ext3_truncate_failed_direct_write(inode); ret = PTR_ERR(handle); + if (inode->i_nlink) + ext3_orphan_del(NULL, inode); goto out; } if (inode->i_nlink) @@ -1742,6 +1908,7 @@ static ssize_t ext3_direct_IO(int rw, struct kiocb *iocb, ret = err; } out: + trace_ext3_direct_IO_exit(inode, offset, count, rw, ret); return ret; } @@ -1765,44 +1932,48 @@ static int ext3_journalled_set_page_dirty(struct page *page) } static const struct address_space_operations ext3_ordered_aops = { - .readpage = ext3_readpage, - .readpages = ext3_readpages, - .writepage = ext3_ordered_writepage, - .sync_page = block_sync_page, - .write_begin = ext3_write_begin, - .write_end = ext3_ordered_write_end, - .bmap = ext3_bmap, - .invalidatepage = ext3_invalidatepage, - .releasepage = ext3_releasepage, - .direct_IO = ext3_direct_IO, - .migratepage = buffer_migrate_page, + .readpage = ext3_readpage, + .readpages = ext3_readpages, + .writepage = ext3_ordered_writepage, + .write_begin = ext3_write_begin, + .write_end = ext3_ordered_write_end, + .bmap = ext3_bmap, + .invalidatepage = ext3_invalidatepage, + .releasepage = ext3_releasepage, + .direct_IO = ext3_direct_IO, + .migratepage = buffer_migrate_page, + .is_partially_uptodate = block_is_partially_uptodate, + .is_dirty_writeback = buffer_check_dirty_writeback, + .error_remove_page = generic_error_remove_page, }; static const struct address_space_operations ext3_writeback_aops = { - .readpage = ext3_readpage, - .readpages = ext3_readpages, - .writepage = ext3_writeback_writepage, - .sync_page = block_sync_page, - .write_begin = ext3_write_begin, - .write_end = ext3_writeback_write_end, - .bmap = ext3_bmap, - .invalidatepage = ext3_invalidatepage, - .releasepage = ext3_releasepage, - .direct_IO = ext3_direct_IO, - .migratepage = buffer_migrate_page, + .readpage = ext3_readpage, + .readpages = ext3_readpages, + .writepage = ext3_writeback_writepage, + .write_begin = ext3_write_begin, + .write_end = ext3_writeback_write_end, + .bmap = ext3_bmap, + .invalidatepage = ext3_invalidatepage, + .releasepage = ext3_releasepage, + .direct_IO = ext3_direct_IO, + .migratepage = buffer_migrate_page, + .is_partially_uptodate = block_is_partially_uptodate, + .error_remove_page = generic_error_remove_page, }; static const struct address_space_operations ext3_journalled_aops = { - .readpage = ext3_readpage, - .readpages = ext3_readpages, - .writepage = ext3_journalled_writepage, - .sync_page = block_sync_page, - .write_begin = ext3_write_begin, - .write_end = ext3_journalled_write_end, - .set_page_dirty = ext3_journalled_set_page_dirty, - .bmap = ext3_bmap, - .invalidatepage = ext3_invalidatepage, - .releasepage = ext3_releasepage, + .readpage = ext3_readpage, + .readpages = ext3_readpages, + .writepage = ext3_journalled_writepage, + .write_begin = ext3_write_begin, + .write_end = ext3_journalled_write_end, + .set_page_dirty = ext3_journalled_set_page_dirty, + .bmap = ext3_bmap, + .invalidatepage = ext3_invalidatepage, + .releasepage = ext3_releasepage, + .is_partially_uptodate = block_is_partially_uptodate, + .error_remove_page = generic_error_remove_page, }; void ext3_set_aops(struct inode *inode) @@ -1821,31 +1992,27 @@ void ext3_set_aops(struct inode *inode) * This required during truncate. We need to physically zero the tail end * of that block so it doesn't yield old data if the file is later grown. */ -static int ext3_block_truncate_page(handle_t *handle, struct page *page, - struct address_space *mapping, loff_t from) +static int ext3_block_truncate_page(struct inode *inode, loff_t from) { ext3_fsblk_t index = from >> PAGE_CACHE_SHIFT; - unsigned offset = from & (PAGE_CACHE_SIZE-1); + unsigned offset = from & (PAGE_CACHE_SIZE - 1); unsigned blocksize, iblock, length, pos; - struct inode *inode = mapping->host; + struct page *page; + handle_t *handle = NULL; struct buffer_head *bh; int err = 0; + /* Truncated on block boundary - nothing to do */ blocksize = inode->i_sb->s_blocksize; + if ((from & (blocksize - 1)) == 0) + return 0; + + page = grab_cache_page(inode->i_mapping, index); + if (!page) + return -ENOMEM; length = blocksize - (offset & (blocksize - 1)); iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); - /* - * For "nobh" option, we can only work if we don't need to - * read-in the page - otherwise we create buffers to do the IO. - */ - if (!page_has_buffers(page) && test_opt(inode->i_sb, NOBH) && - ext3_should_writeback_data(inode) && PageUptodate(page)) { - zero_user(page, offset, length); - set_page_dirty(page); - goto unlock; - } - if (!page_has_buffers(page)) create_empty_buffers(page, blocksize, 0); @@ -1878,20 +2045,30 @@ static int ext3_block_truncate_page(handle_t *handle, struct page *page, if (PageUptodate(page)) set_buffer_uptodate(bh); - if (!buffer_uptodate(bh)) { - err = -EIO; - ll_rw_block(READ, 1, &bh); - wait_on_buffer(bh); + if (!bh_uptodate_or_lock(bh)) { + err = bh_submit_read(bh); /* Uhhuh. Read error. Complain and punt. */ - if (!buffer_uptodate(bh)) + if (err) + goto unlock; + } + + /* data=writeback mode doesn't need transaction to zero-out data */ + if (!ext3_should_writeback_data(inode)) { + /* We journal at most one block */ + handle = ext3_journal_start(inode, 1); + if (IS_ERR(handle)) { + clear_highpage(page); + flush_dcache_page(page); + err = PTR_ERR(handle); goto unlock; + } } if (ext3_should_journal_data(inode)) { BUFFER_TRACE(bh, "get write access"); err = ext3_journal_get_write_access(handle, bh); if (err) - goto unlock; + goto stop; } zero_user(page, offset, length); @@ -1905,6 +2082,9 @@ static int ext3_block_truncate_page(handle_t *handle, struct page *page, err = ext3_journal_dirty_data(handle, bh); mark_buffer_dirty(bh); } +stop: + if (handle) + ext3_journal_stop(handle); unlock: unlock_page(page); @@ -1937,7 +2117,7 @@ static inline int all_zeroes(__le32 *p, __le32 *q) * * When we do truncate() we may have to clean the ends of several * indirect blocks but leave the blocks themselves alive. Block is - * partially truncated if some data below the new i_size is refered + * partially truncated if some data below the new i_size is referred * from it (and it is on the path to the first completely truncated * data block, indeed). We have to free the top of that path along * with everything to the right of the path. Since no allocation @@ -1967,7 +2147,7 @@ static Indirect *ext3_find_shared(struct inode *inode, int depth, int k, err; *top = 0; - /* Make k index the deepest non-null offest + 1 */ + /* Make k index the deepest non-null offset + 1 */ for (k = depth; k > 1 && !offsets[k-1]; k--) ; partial = ext3_get_branch(inode, k, offsets, chain, &err); @@ -2024,13 +2204,15 @@ static void ext3_clear_blocks(handle_t *handle, struct inode *inode, if (try_to_extend_transaction(handle, inode)) { if (bh) { BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); - ext3_journal_dirty_metadata(handle, bh); + if (ext3_journal_dirty_metadata(handle, bh)) + return; } ext3_mark_inode_dirty(handle, inode); - ext3_journal_test_restart(handle, inode); + truncate_restart_transaction(handle, inode); if (bh) { BUFFER_TRACE(bh, "retaking write access"); - ext3_journal_get_write_access(handle, bh); + if (ext3_journal_get_write_access(handle, bh)) + return; } } @@ -2064,7 +2246,7 @@ static void ext3_clear_blocks(handle_t *handle, struct inode *inode, * @first: array of block numbers * @last: points immediately past the end of array * - * We are freeing all blocks refered from that array (numbers are stored as + * We are freeing all blocks referred from that array (numbers are stored as * little-endian 32-bit) and updating @inode->i_blocks appropriately. * * We accumulate contiguous runs of blocks to free. Conveniently, if these @@ -2125,7 +2307,21 @@ static void ext3_free_data(handle_t *handle, struct inode *inode, if (this_bh) { BUFFER_TRACE(this_bh, "call ext3_journal_dirty_metadata"); - ext3_journal_dirty_metadata(handle, this_bh); + + /* + * The buffer head should have an attached journal head at this + * point. However, if the data is corrupted and an indirect + * block pointed to itself, it would have been detached when + * the block was cleared. Check for this instead of OOPSing. + */ + if (bh2jh(this_bh)) + ext3_journal_dirty_metadata(handle, this_bh); + else + ext3_error(inode->i_sb, "ext3_free_data", + "circular indirect block detected, " + "inode=%lu, block=%llu", + inode->i_ino, + (unsigned long long)this_bh->b_blocknr); } } @@ -2138,7 +2334,7 @@ static void ext3_free_data(handle_t *handle, struct inode *inode, * @last: pointer immediately past the end of array * @depth: depth of the branches to free * - * We are freeing all blocks refered from these branches (numbers are + * We are freeing all blocks referred from these branches (numbers are * stored as little-endian 32-bit) and updating @inode->i_blocks * appropriately. */ @@ -2183,27 +2379,6 @@ static void ext3_free_branches(handle_t *handle, struct inode *inode, depth); /* - * We've probably journalled the indirect block several - * times during the truncate. But it's no longer - * needed and we now drop it from the transaction via - * journal_revoke(). - * - * That's easy if it's exclusively part of this - * transaction. But if it's part of the committing - * transaction then journal_forget() will simply - * brelse() it. That means that if the underlying - * block is reallocated in ext3_get_block(), - * unmap_underlying_metadata() will find this block - * and will try to get rid of it. damn, damn. - * - * If this block has already been committed to the - * journal, a revoke record will be written. And - * revoke records must be emitted *before* clearing - * this block's bit in the bitmaps. - */ - ext3_forget(handle, 1, inode, bh, bh->b_blocknr); - - /* * Everything below this this pointer has been * released. Now let this top-of-subtree go. * @@ -2223,9 +2398,34 @@ static void ext3_free_branches(handle_t *handle, struct inode *inode, return; if (try_to_extend_transaction(handle, inode)) { ext3_mark_inode_dirty(handle, inode); - ext3_journal_test_restart(handle, inode); + truncate_restart_transaction(handle, inode); } + /* + * We've probably journalled the indirect block several + * times during the truncate. But it's no longer + * needed and we now drop it from the transaction via + * journal_revoke(). + * + * That's easy if it's exclusively part of this + * transaction. But if it's part of the committing + * transaction then journal_forget() will simply + * brelse() it. That means that if the underlying + * block is reallocated in ext3_get_block(), + * unmap_underlying_metadata() will find this block + * and will try to get rid of it. damn, damn. Thus + * we don't allow a block to be reallocated until + * a transaction freeing it has fully committed. + * + * We also have to make sure journal replay after a + * crash does not overwrite non-journaled data blocks + * with old metadata when the block got reallocated for + * data. Thus we have to store a revoke record for a + * block in the same transaction in which we free the + * block. + */ + ext3_forget(handle, 1, inode, bh, bh->b_blocknr); + ext3_free_blocks(handle, inode, nr, 1); if (parent_bh) { @@ -2251,6 +2451,17 @@ static void ext3_free_branches(handle_t *handle, struct inode *inode, } } +int ext3_can_truncate(struct inode *inode) +{ + if (S_ISREG(inode->i_mode)) + return 1; + if (S_ISDIR(inode->i_mode)) + return 1; + if (S_ISLNK(inode->i_mode)) + return !ext3_inode_is_fast_symlink(inode); + return 0; +} + /* * ext3_truncate() * @@ -2258,7 +2469,7 @@ static void ext3_free_branches(handle_t *handle, struct inode *inode, * transaction, and VFS/VM ensures that ext3_truncate() cannot run * simultaneously on behalf of the same inode. * - * As we work through the truncate and commmit bits of it to the journal there + * As we work through the truncate and commit bits of it to the journal there * is one core, guiding principle: the file's tree must always be consistent on * disk. We must be able to restart the truncate after a crash. * @@ -2285,7 +2496,6 @@ void ext3_truncate(struct inode *inode) struct ext3_inode_info *ei = EXT3_I(inode); __le32 *i_data = ei->i_data; int addr_per_block = EXT3_ADDR_PER_BLOCK(inode->i_sb); - struct address_space *mapping = inode->i_mapping; int offsets[4]; Indirect chain[4]; Indirect *partial; @@ -2293,47 +2503,21 @@ void ext3_truncate(struct inode *inode) int n; long last_block; unsigned blocksize = inode->i_sb->s_blocksize; - struct page *page; - if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || - S_ISLNK(inode->i_mode))) - return; - if (ext3_inode_is_fast_symlink(inode)) - return; - if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) - return; + trace_ext3_truncate_enter(inode); - /* - * We have to lock the EOF page here, because lock_page() nests - * outside journal_start(). - */ - if ((inode->i_size & (blocksize - 1)) == 0) { - /* Block boundary? Nothing to do */ - page = NULL; - } else { - page = grab_cache_page(mapping, - inode->i_size >> PAGE_CACHE_SHIFT); - if (!page) - return; - } + if (!ext3_can_truncate(inode)) + goto out_notrans; + + if (inode->i_size == 0 && ext3_should_writeback_data(inode)) + ext3_set_inode_state(inode, EXT3_STATE_FLUSH_ON_CLOSE); handle = start_transaction(inode); - if (IS_ERR(handle)) { - if (page) { - clear_highpage(page); - flush_dcache_page(page); - unlock_page(page); - page_cache_release(page); - } - return; /* AKPM: return what? */ - } + if (IS_ERR(handle)) + goto out_notrans; last_block = (inode->i_size + blocksize-1) >> EXT3_BLOCK_SIZE_BITS(inode->i_sb); - - if (page) - ext3_block_truncate_page(handle, page, mapping, inode->i_size); - n = ext3_block_to_path(inode, last_block, offsets, NULL); if (n == 0) goto out_stop; /* error */ @@ -2385,7 +2569,6 @@ void ext3_truncate(struct inode *inode) */ } else { /* Shared branch grows from an indirect block */ - BUFFER_TRACE(partial->bh, "get_write_access"); ext3_free_branches(handle, inode, partial->bh, partial->p, partial->p+1, (chain+n-1) - partial); @@ -2442,23 +2625,32 @@ out_stop: * If this was a simple ftruncate(), and the file will remain alive * then we need to clear up the orphan record which we created above. * However, if this was a real unlink then we were called by - * ext3_delete_inode(), and we allow that function to clean up the + * ext3_evict_inode(), and we allow that function to clean up the * orphan info for us. */ if (inode->i_nlink) ext3_orphan_del(handle, inode); ext3_journal_stop(handle); + trace_ext3_truncate_exit(inode); + return; +out_notrans: + /* + * Delete the inode from orphan list so that it doesn't stay there + * forever and trigger assertion on umount. + */ + if (inode->i_nlink) + ext3_orphan_del(NULL, inode); + trace_ext3_truncate_exit(inode); } static ext3_fsblk_t ext3_get_inode_block(struct super_block *sb, unsigned long ino, struct ext3_iloc *iloc) { - unsigned long desc, group_desc, block_group; + unsigned long block_group; unsigned long offset; ext3_fsblk_t block; - struct buffer_head *bh; - struct ext3_group_desc * gdp; + struct ext3_group_desc *gdp; if (!ext3_valid_inum(sb, ino)) { /* @@ -2470,27 +2662,15 @@ static ext3_fsblk_t ext3_get_inode_block(struct super_block *sb, } block_group = (ino - 1) / EXT3_INODES_PER_GROUP(sb); - if (block_group >= EXT3_SB(sb)->s_groups_count) { - ext3_error(sb,"ext3_get_inode_block","group >= groups count"); - return 0; - } - smp_rmb(); - group_desc = block_group >> EXT3_DESC_PER_BLOCK_BITS(sb); - desc = block_group & (EXT3_DESC_PER_BLOCK(sb) - 1); - bh = EXT3_SB(sb)->s_group_desc[group_desc]; - if (!bh) { - ext3_error (sb, "ext3_get_inode_block", - "Descriptor not loaded"); + gdp = ext3_get_group_desc(sb, block_group, NULL); + if (!gdp) return 0; - } - - gdp = (struct ext3_group_desc *)bh->b_data; /* * Figure out the offset within the block group inode table */ offset = ((ino - 1) % EXT3_INODES_PER_GROUP(sb)) * EXT3_INODE_SIZE(sb); - block = le32_to_cpu(gdp[desc].bg_inode_table) + + block = le32_to_cpu(gdp->bg_inode_table) + (offset >> EXT3_BLOCK_SIZE_BITS(sb)); iloc->block_group = block_group; @@ -2515,15 +2695,25 @@ static int __ext3_get_inode_loc(struct inode *inode, return -EIO; bh = sb_getblk(inode->i_sb, block); - if (!bh) { + if (unlikely(!bh)) { ext3_error (inode->i_sb, "ext3_get_inode_loc", "unable to read inode block - " "inode=%lu, block="E3FSBLK, inode->i_ino, block); - return -EIO; + return -ENOMEM; } if (!buffer_uptodate(bh)) { lock_buffer(bh); + + /* + * If the buffer has the write error flag, we have failed + * to write out another inode in the same block. In this + * case, we don't have to read the block because we may + * read the old inode data successfully. + */ + if (buffer_write_io_error(bh) && !buffer_uptodate(bh)) + set_buffer_uptodate(bh); + if (buffer_uptodate(bh)) { /* someone brought it uptodate while we waited */ unlock_buffer(bh); @@ -2559,7 +2749,7 @@ static int __ext3_get_inode_loc(struct inode *inode, bitmap_bh = sb_getblk(inode->i_sb, le32_to_cpu(desc->bg_inode_bitmap)); - if (!bitmap_bh) + if (unlikely(!bitmap_bh)) goto make_io; /* @@ -2593,9 +2783,10 @@ make_io: * has in-inode xattrs, or we don't have this inode in memory. * Read the block from disk. */ + trace_ext3_load_inode(inode); get_bh(bh); bh->b_end_io = end_buffer_read_sync; - submit_bh(READ_META, bh); + submit_bh(READ | REQ_META | REQ_PRIO, bh); wait_on_buffer(bh); if (!buffer_uptodate(bh)) { ext3_error(inode->i_sb, "ext3_get_inode_loc", @@ -2615,7 +2806,7 @@ int ext3_get_inode_loc(struct inode *inode, struct ext3_iloc *iloc) { /* We have all inode data except xattrs in memory here. */ return __ext3_get_inode_loc(inode, iloc, - !(EXT3_I(inode)->i_state & EXT3_STATE_XATTR)); + !ext3_test_inode_state(inode, EXT3_STATE_XATTR)); } void ext3_set_inode_flags(struct inode *inode) @@ -2661,8 +2852,12 @@ struct inode *ext3_iget(struct super_block *sb, unsigned long ino) struct ext3_inode_info *ei; struct buffer_head *bh; struct inode *inode; + journal_t *journal = EXT3_SB(sb)->s_journal; + transaction_t *transaction; long ret; int block; + uid_t i_uid; + gid_t i_gid; inode = iget_locked(sb, ino); if (!inode) @@ -2671,10 +2866,6 @@ struct inode *ext3_iget(struct super_block *sb, unsigned long ino) return inode; ei = EXT3_I(inode); -#ifdef CONFIG_EXT3_FS_POSIX_ACL - ei->i_acl = EXT3_ACL_NOT_CACHED; - ei->i_default_acl = EXT3_ACL_NOT_CACHED; -#endif ei->i_block_alloc_info = NULL; ret = __ext3_get_inode_loc(inode, &iloc, 0); @@ -2683,20 +2874,22 @@ struct inode *ext3_iget(struct super_block *sb, unsigned long ino) bh = iloc.bh; raw_inode = ext3_raw_inode(&iloc); inode->i_mode = le16_to_cpu(raw_inode->i_mode); - inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low); - inode->i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low); + i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low); + i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low); if(!(test_opt (inode->i_sb, NO_UID32))) { - inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16; - inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16; + i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16; + i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16; } - inode->i_nlink = le16_to_cpu(raw_inode->i_links_count); + i_uid_write(inode, i_uid); + i_gid_write(inode, i_gid); + set_nlink(inode, le16_to_cpu(raw_inode->i_links_count)); inode->i_size = le32_to_cpu(raw_inode->i_size); inode->i_atime.tv_sec = (signed)le32_to_cpu(raw_inode->i_atime); inode->i_ctime.tv_sec = (signed)le32_to_cpu(raw_inode->i_ctime); inode->i_mtime.tv_sec = (signed)le32_to_cpu(raw_inode->i_mtime); inode->i_atime.tv_nsec = inode->i_ctime.tv_nsec = inode->i_mtime.tv_nsec = 0; - ei->i_state = 0; + ei->i_state_flags = 0; ei->i_dir_start_lookup = 0; ei->i_dtime = le32_to_cpu(raw_inode->i_dtime); /* We now have enough fields to check if the inode was active or not. @@ -2742,6 +2935,30 @@ struct inode *ext3_iget(struct super_block *sb, unsigned long ino) ei->i_data[block] = raw_inode->i_block[block]; INIT_LIST_HEAD(&ei->i_orphan); + /* + * Set transaction id's of transactions that have to be committed + * to finish f[data]sync. We set them to currently running transaction + * as we cannot be sure that the inode or some of its metadata isn't + * part of the transaction - the inode could have been reclaimed and + * now it is reread from disk. + */ + if (journal) { + tid_t tid; + + spin_lock(&journal->j_state_lock); + if (journal->j_running_transaction) + transaction = journal->j_running_transaction; + else + transaction = journal->j_committing_transaction; + if (transaction) + tid = transaction->t_tid; + else + tid = journal->j_commit_sequence; + spin_unlock(&journal->j_state_lock); + atomic_set(&ei->i_sync_tid, tid); + atomic_set(&ei->i_datasync_tid, tid); + } + if (inode->i_ino >= EXT3_FIRST_INO(inode->i_sb) + 1 && EXT3_INODE_SIZE(inode->i_sb) > EXT3_GOOD_OLD_INODE_SIZE) { /* @@ -2765,7 +2982,7 @@ struct inode *ext3_iget(struct super_block *sb, unsigned long ino) EXT3_GOOD_OLD_INODE_SIZE + ei->i_extra_isize; if (*magic == cpu_to_le32(EXT3_XATTR_MAGIC)) - ei->i_state |= EXT3_STATE_XATTR; + ext3_set_inode_state(inode, EXT3_STATE_XATTR); } } else ei->i_extra_isize = 0; @@ -2778,9 +2995,11 @@ struct inode *ext3_iget(struct super_block *sb, unsigned long ino) inode->i_op = &ext3_dir_inode_operations; inode->i_fop = &ext3_dir_operations; } else if (S_ISLNK(inode->i_mode)) { - if (ext3_inode_is_fast_symlink(inode)) + if (ext3_inode_is_fast_symlink(inode)) { inode->i_op = &ext3_fast_symlink_inode_operations; - else { + nd_terminate_link(ei->i_data, inode->i_size, + sizeof(ei->i_data) - 1); + } else { inode->i_op = &ext3_symlink_inode_operations; ext3_set_aops(inode); } @@ -2818,40 +3037,54 @@ static int ext3_do_update_inode(handle_t *handle, struct ext3_inode_info *ei = EXT3_I(inode); struct buffer_head *bh = iloc->bh; int err = 0, rc, block; + int need_datasync = 0; + __le32 disksize; + uid_t i_uid; + gid_t i_gid; + +again: + /* we can't allow multiple procs in here at once, its a bit racey */ + lock_buffer(bh); /* For fields not not tracking in the in-memory inode, * initialise them to zero for new inodes. */ - if (ei->i_state & EXT3_STATE_NEW) + if (ext3_test_inode_state(inode, EXT3_STATE_NEW)) memset(raw_inode, 0, EXT3_SB(inode->i_sb)->s_inode_size); ext3_get_inode_flags(ei); raw_inode->i_mode = cpu_to_le16(inode->i_mode); + i_uid = i_uid_read(inode); + i_gid = i_gid_read(inode); if(!(test_opt(inode->i_sb, NO_UID32))) { - raw_inode->i_uid_low = cpu_to_le16(low_16_bits(inode->i_uid)); - raw_inode->i_gid_low = cpu_to_le16(low_16_bits(inode->i_gid)); + raw_inode->i_uid_low = cpu_to_le16(low_16_bits(i_uid)); + raw_inode->i_gid_low = cpu_to_le16(low_16_bits(i_gid)); /* * Fix up interoperability with old kernels. Otherwise, old inodes get * re-used with the upper 16 bits of the uid/gid intact */ if(!ei->i_dtime) { raw_inode->i_uid_high = - cpu_to_le16(high_16_bits(inode->i_uid)); + cpu_to_le16(high_16_bits(i_uid)); raw_inode->i_gid_high = - cpu_to_le16(high_16_bits(inode->i_gid)); + cpu_to_le16(high_16_bits(i_gid)); } else { raw_inode->i_uid_high = 0; raw_inode->i_gid_high = 0; } } else { raw_inode->i_uid_low = - cpu_to_le16(fs_high2lowuid(inode->i_uid)); + cpu_to_le16(fs_high2lowuid(i_uid)); raw_inode->i_gid_low = - cpu_to_le16(fs_high2lowgid(inode->i_gid)); + cpu_to_le16(fs_high2lowgid(i_gid)); raw_inode->i_uid_high = 0; raw_inode->i_gid_high = 0; } raw_inode->i_links_count = cpu_to_le16(inode->i_nlink); - raw_inode->i_size = cpu_to_le32(ei->i_disksize); + disksize = cpu_to_le32(ei->i_disksize); + if (disksize != raw_inode->i_size) { + need_datasync = 1; + raw_inode->i_size = disksize; + } raw_inode->i_atime = cpu_to_le32(inode->i_atime.tv_sec); raw_inode->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec); raw_inode->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec); @@ -2867,8 +3100,11 @@ static int ext3_do_update_inode(handle_t *handle, if (!S_ISREG(inode->i_mode)) { raw_inode->i_dir_acl = cpu_to_le32(ei->i_dir_acl); } else { - raw_inode->i_size_high = - cpu_to_le32(ei->i_disksize >> 32); + disksize = cpu_to_le32(ei->i_disksize >> 32); + if (disksize != raw_inode->i_size_high) { + raw_inode->i_size_high = disksize; + need_datasync = 1; + } if (ei->i_disksize > 0x7fffffffULL) { struct super_block *sb = inode->i_sb; if (!EXT3_HAS_RO_COMPAT_FEATURE(sb, @@ -2878,17 +3114,20 @@ static int ext3_do_update_inode(handle_t *handle, /* If this is the first large file * created, add a flag to the superblock. */ + unlock_buffer(bh); err = ext3_journal_get_write_access(handle, EXT3_SB(sb)->s_sbh); if (err) goto out_brelse; + ext3_update_dynamic_rev(sb); EXT3_SET_RO_COMPAT_FEATURE(sb, EXT3_FEATURE_RO_COMPAT_LARGE_FILE); - sb->s_dirt = 1; handle->h_sync = 1; err = ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh); + /* get our lock and start over */ + goto again; } } } @@ -2911,11 +3150,15 @@ static int ext3_do_update_inode(handle_t *handle, raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize); BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); + unlock_buffer(bh); rc = ext3_journal_dirty_metadata(handle, bh); if (!err) err = rc; - ei->i_state &= ~EXT3_STATE_NEW; + ext3_clear_inode_state(inode, EXT3_STATE_NEW); + atomic_set(&ei->i_sync_tid, handle->h_transaction->t_tid); + if (need_datasync) + atomic_set(&ei->i_datasync_tid, handle->h_transaction->t_tid); out_brelse: brelse (bh); ext3_std_error(inode->i_sb, err); @@ -2927,21 +3170,20 @@ out_brelse: * * We are called from a few places: * - * - Within generic_file_write() for O_SYNC files. + * - Within generic_file_aio_write() -> generic_write_sync() for O_SYNC files. * Here, there will be no transaction running. We wait for any running - * trasnaction to commit. + * transaction to commit. * - * - Within sys_sync(), kupdate and such. - * We wait on commit, if tol to. + * - Within flush work (for sys_sync(), kupdate and such). + * We wait on commit, if told to. * - * - Within prune_icache() (PF_MEMALLOC == true) - * Here we simply return. We can't afford to block kswapd on the - * journal commit. + * - Within iput_final() -> write_inode_now() + * We wait on commit, if told to. * * In all cases it is actually safe for us to return without doing anything, * because the inode has been copied into a raw inode buffer in - * ext3_mark_inode_dirty(). This is a correctness thing for O_SYNC and for - * knfsd. + * ext3_mark_inode_dirty(). This is a correctness thing for WB_SYNC_ALL + * writeback. * * Note that we are absolutely dependent upon all inode dirtiers doing the * right thing: they *must* call mark_inode_dirty() after dirtying info in @@ -2953,13 +3195,13 @@ out_brelse: * stuff(); * inode->i_size = expr; * - * is in error because a kswapd-driven write_inode() could occur while - * `stuff()' is running, and the new i_size will be lost. Plus the inode - * will no longer be on the superblock's dirty inode list. + * is in error because write_inode() could occur while `stuff()' is running, + * and the new i_size will be lost. Plus the inode will no longer be on the + * superblock's dirty inode list. */ -int ext3_write_inode(struct inode *inode, int wait) +int ext3_write_inode(struct inode *inode, struct writeback_control *wbc) { - if (current->flags & PF_MEMALLOC) + if (WARN_ON_ONCE(current->flags & PF_MEMALLOC)) return 0; if (ext3_journal_current_handle()) { @@ -2968,7 +3210,12 @@ int ext3_write_inode(struct inode *inode, int wait) return -EIO; } - if (!wait) + /* + * No need to force transaction in WB_SYNC_NONE mode. Also + * ext3_sync_fs() will force the commit after everything is + * written. + */ + if (wbc->sync_mode != WB_SYNC_ALL || wbc->for_sync) return 0; return ext3_force_commit(inode->i_sb); @@ -3001,19 +3248,21 @@ int ext3_setattr(struct dentry *dentry, struct iattr *attr) if (error) return error; - if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) || - (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) { + if (is_quota_modification(inode, attr)) + dquot_initialize(inode); + if ((ia_valid & ATTR_UID && !uid_eq(attr->ia_uid, inode->i_uid)) || + (ia_valid & ATTR_GID && !gid_eq(attr->ia_gid, inode->i_gid))) { handle_t *handle; /* (user+group)*(old+new) structure, inode write (sb, * inode block, ? - but truncate inode update has it) */ - handle = ext3_journal_start(inode, 2*(EXT3_QUOTA_INIT_BLOCKS(inode->i_sb)+ - EXT3_QUOTA_DEL_BLOCKS(inode->i_sb))+3); + handle = ext3_journal_start(inode, EXT3_MAXQUOTAS_INIT_BLOCKS(inode->i_sb)+ + EXT3_MAXQUOTAS_DEL_BLOCKS(inode->i_sb)+3); if (IS_ERR(handle)) { error = PTR_ERR(handle); goto err_out; } - error = DQUOT_TRANSFER(inode, attr) ? -EDQUOT : 0; + error = dquot_transfer(inode, attr); if (error) { ext3_journal_stop(handle); return error; @@ -3028,6 +3277,9 @@ int ext3_setattr(struct dentry *dentry, struct iattr *attr) ext3_journal_stop(handle); } + if (attr->ia_valid & ATTR_SIZE) + inode_dio_wait(inode); + if (S_ISREG(inode->i_mode) && attr->ia_valid & ATTR_SIZE && attr->ia_size < inode->i_size) { handle_t *handle; @@ -3039,23 +3291,43 @@ int ext3_setattr(struct dentry *dentry, struct iattr *attr) } error = ext3_orphan_add(handle, inode); + if (error) { + ext3_journal_stop(handle); + goto err_out; + } EXT3_I(inode)->i_disksize = attr->ia_size; - rc = ext3_mark_inode_dirty(handle, inode); - if (!error) - error = rc; + error = ext3_mark_inode_dirty(handle, inode); ext3_journal_stop(handle); + if (error) { + /* Some hard fs error must have happened. Bail out. */ + ext3_orphan_del(NULL, inode); + goto err_out; + } + rc = ext3_block_truncate_page(inode, attr->ia_size); + if (rc) { + /* Cleanup orphan list and exit */ + handle = ext3_journal_start(inode, 3); + if (IS_ERR(handle)) { + ext3_orphan_del(NULL, inode); + goto err_out; + } + ext3_orphan_del(handle, inode); + ext3_journal_stop(handle); + goto err_out; + } } - rc = inode_setattr(inode, attr); + if ((attr->ia_valid & ATTR_SIZE) && + attr->ia_size != i_size_read(inode)) { + truncate_setsize(inode, attr->ia_size); + ext3_truncate(inode); + } - /* If inode_setattr's call to ext3_truncate failed to get a - * transaction handle at all, we need to clean up the in-core - * orphan list manually. */ - if (inode->i_nlink) - ext3_orphan_del(NULL, inode); + setattr_copy(inode, attr); + mark_inode_dirty(inode); - if (!rc && (ia_valid & ATTR_MODE)) - rc = ext3_acl_chmod(inode); + if (ia_valid & ATTR_MODE) + rc = posix_acl_chmod(inode, inode->i_mode); err_out: ext3_std_error(inode->i_sb, error); @@ -3101,12 +3373,12 @@ static int ext3_writepage_trans_blocks(struct inode *inode) if (ext3_should_journal_data(inode)) ret = 3 * (bpp + indirects) + 2; else - ret = 2 * (bpp + indirects) + 2; + ret = 2 * (bpp + indirects) + indirects + 2; #ifdef CONFIG_QUOTA - /* We know that structure was already allocated during DQUOT_INIT so + /* We know that structure was already allocated during dquot_initialize so * we will be updating only the data blocks + inodes */ - ret += 2*EXT3_QUOTA_TRANS_BLOCKS(inode->i_sb); + ret += EXT3_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb); #endif return ret; @@ -3167,14 +3439,6 @@ ext3_reserve_inode_write(handle_t *handle, struct inode *inode, * inode out, but prune_icache isn't a user-visible syncing function. * Whenever the user wants stuff synced (sys_sync, sys_msync, sys_fsync) * we start and wait on commits. - * - * Is this efficient/effective? Well, we're being nice to the system - * by cleaning up our inodes proactively so they can be reaped - * without I/O. But we are potentially leaving up to five seconds' - * worth of inodes floating about which prune_icache wants us to - * write out. One way to fix that would be to get prune_icache() - * to do a write_super() to free up some memory. It has the desired - * effect. */ int ext3_mark_inode_dirty(handle_t *handle, struct inode *inode) { @@ -3182,6 +3446,7 @@ int ext3_mark_inode_dirty(handle_t *handle, struct inode *inode) int err; might_sleep(); + trace_ext3_mark_inode_dirty(inode, _RET_IP_); err = ext3_reserve_inode_write(handle, inode, &iloc); if (!err) err = ext3_mark_iloc_dirty(handle, inode, &iloc); @@ -3195,14 +3460,14 @@ int ext3_mark_inode_dirty(handle_t *handle, struct inode *inode) * i_size has been changed by generic_commit_write() and we thus need * to include the updated inode in the current transaction. * - * Also, DQUOT_ALLOC_SPACE() will always dirty the inode when blocks + * Also, dquot_alloc_space() will always dirty the inode when blocks * are allocated to the file. * * If the inode is marked synchronous, we don't honour that here - doing * so would cause a commit on atime updates, which we don't bother doing. * We handle synchronous inodes at the highest possible level. */ -void ext3_dirty_inode(struct inode *inode) +void ext3_dirty_inode(struct inode *inode, int flags) { handle_t *current_handle = ext3_journal_current_handle(); handle_t *handle; @@ -3214,7 +3479,7 @@ void ext3_dirty_inode(struct inode *inode) current_handle->h_transaction != handle->h_transaction) { /* This task has a transaction open against a different fs */ printk(KERN_EMERG "%s: transactions do not match!\n", - __FUNCTION__); + __func__); } else { jbd_debug(5, "marking dirty. outer handle=%p\n", current_handle); diff --git a/fs/ext3/ioctl.c b/fs/ext3/ioctl.c index 023a070f55f..4d96e9a6453 100644 --- a/fs/ext3/ioctl.c +++ b/fs/ext3/ioctl.c @@ -7,19 +7,14 @@ * Universite Pierre et Marie Curie (Paris VI) */ -#include <linux/fs.h> -#include <linux/jbd.h> -#include <linux/capability.h> -#include <linux/ext3_fs.h> -#include <linux/ext3_jbd.h> -#include <linux/time.h> +#include <linux/mount.h> #include <linux/compat.h> -#include <linux/smp_lock.h> #include <asm/uaccess.h> +#include "ext3.h" -int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd, - unsigned long arg) +long ext3_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { + struct inode *inode = file_inode(filp); struct ext3_inode_info *ei = EXT3_I(inode); unsigned int flags; unsigned short rsv_window_size; @@ -38,24 +33,25 @@ int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd, unsigned int oldflags; unsigned int jflag; - if (IS_RDONLY(inode)) - return -EROFS; - - if (!is_owner_or_cap(inode)) + if (!inode_owner_or_capable(inode)) return -EACCES; if (get_user(flags, (int __user *) arg)) return -EFAULT; - if (!S_ISDIR(inode->i_mode)) - flags &= ~EXT3_DIRSYNC_FL; + err = mnt_want_write_file(filp); + if (err) + return err; + + flags = ext3_mask_flags(inode->i_mode, flags); mutex_lock(&inode->i_mutex); + /* Is it quota file? Do not allow user to mess with it */ - if (IS_NOQUOTA(inode)) { - mutex_unlock(&inode->i_mutex); - return -EPERM; - } + err = -EPERM; + if (IS_NOQUOTA(inode)) + goto flags_out; + oldflags = ei->i_flags; /* The JOURNAL_DATA flag is modifiable only by root */ @@ -68,10 +64,8 @@ int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd, * This test looks nicer. Thanks to Pauline Middelink */ if ((flags ^ oldflags) & (EXT3_APPEND_FL | EXT3_IMMUTABLE_FL)) { - if (!capable(CAP_LINUX_IMMUTABLE)) { - mutex_unlock(&inode->i_mutex); - return -EPERM; - } + if (!capable(CAP_LINUX_IMMUTABLE)) + goto flags_out; } /* @@ -79,17 +73,14 @@ int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd, * the relevant capability. */ if ((jflag ^ oldflags) & (EXT3_JOURNAL_DATA_FL)) { - if (!capable(CAP_SYS_RESOURCE)) { - mutex_unlock(&inode->i_mutex); - return -EPERM; - } + if (!capable(CAP_SYS_RESOURCE)) + goto flags_out; } - handle = ext3_journal_start(inode, 1); if (IS_ERR(handle)) { - mutex_unlock(&inode->i_mutex); - return PTR_ERR(handle); + err = PTR_ERR(handle); + goto flags_out; } if (IS_SYNC(inode)) handle->h_sync = 1; @@ -107,14 +98,14 @@ int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd, err = ext3_mark_iloc_dirty(handle, inode, &iloc); flags_err: ext3_journal_stop(handle); - if (err) { - mutex_unlock(&inode->i_mutex); - return err; - } + if (err) + goto flags_out; if ((jflag ^ oldflags) & (EXT3_JOURNAL_DATA_FL)) err = ext3_change_inode_journal_flag(inode, jflag); +flags_out: mutex_unlock(&inode->i_mutex); + mnt_drop_write_file(filp); return err; } case EXT3_IOC_GETVERSION: @@ -127,16 +118,23 @@ flags_err: __u32 generation; int err; - if (!is_owner_or_cap(inode)) + if (!inode_owner_or_capable(inode)) return -EPERM; - if (IS_RDONLY(inode)) - return -EROFS; - if (get_user(generation, (int __user *) arg)) - return -EFAULT; + err = mnt_want_write_file(filp); + if (err) + return err; + if (get_user(generation, (int __user *) arg)) { + err = -EFAULT; + goto setversion_out; + } + + mutex_lock(&inode->i_mutex); handle = ext3_journal_start(inode, 1); - if (IS_ERR(handle)) - return PTR_ERR(handle); + if (IS_ERR(handle)) { + err = PTR_ERR(handle); + goto unlock_out; + } err = ext3_reserve_inode_write(handle, inode, &iloc); if (err == 0) { inode->i_ctime = CURRENT_TIME_SEC; @@ -144,32 +142,13 @@ flags_err: err = ext3_mark_iloc_dirty(handle, inode, &iloc); } ext3_journal_stop(handle); + +unlock_out: + mutex_unlock(&inode->i_mutex); +setversion_out: + mnt_drop_write_file(filp); return err; } -#ifdef CONFIG_JBD_DEBUG - case EXT3_IOC_WAIT_FOR_READONLY: - /* - * This is racy - by the time we're woken up and running, - * the superblock could be released. And the module could - * have been unloaded. So sue me. - * - * Returns 1 if it slept, else zero. - */ - { - struct super_block *sb = inode->i_sb; - DECLARE_WAITQUEUE(wait, current); - int ret = 0; - - set_current_state(TASK_INTERRUPTIBLE); - add_wait_queue(&EXT3_SB(sb)->ro_wait_queue, &wait); - if (timer_pending(&EXT3_SB(sb)->turn_ro_timer)) { - schedule(); - ret = 1; - } - remove_wait_queue(&EXT3_SB(sb)->ro_wait_queue, &wait); - return ret; - } -#endif case EXT3_IOC_GETRSVSZ: if (test_opt(inode->i_sb, RESERVATION) && S_ISREG(inode->i_mode) @@ -179,18 +158,24 @@ flags_err: } return -ENOTTY; case EXT3_IOC_SETRSVSZ: { + int err; if (!test_opt(inode->i_sb, RESERVATION) ||!S_ISREG(inode->i_mode)) return -ENOTTY; - if (IS_RDONLY(inode)) - return -EROFS; + err = mnt_want_write_file(filp); + if (err) + return err; - if (!is_owner_or_cap(inode)) - return -EACCES; + if (!inode_owner_or_capable(inode)) { + err = -EACCES; + goto setrsvsz_out; + } - if (get_user(rsv_window_size, (int __user *)arg)) - return -EFAULT; + if (get_user(rsv_window_size, (int __user *)arg)) { + err = -EFAULT; + goto setrsvsz_out; + } if (rsv_window_size > EXT3_MAX_RESERVE_BLOCKS) rsv_window_size = EXT3_MAX_RESERVE_BLOCKS; @@ -208,52 +193,87 @@ flags_err: rsv->rsv_goal_size = rsv_window_size; } mutex_unlock(&ei->truncate_mutex); - return 0; +setrsvsz_out: + mnt_drop_write_file(filp); + return err; } case EXT3_IOC_GROUP_EXTEND: { ext3_fsblk_t n_blocks_count; struct super_block *sb = inode->i_sb; - int err; + int err, err2; if (!capable(CAP_SYS_RESOURCE)) return -EPERM; - if (IS_RDONLY(inode)) - return -EROFS; - - if (get_user(n_blocks_count, (__u32 __user *)arg)) - return -EFAULT; + err = mnt_want_write_file(filp); + if (err) + return err; + if (get_user(n_blocks_count, (__u32 __user *)arg)) { + err = -EFAULT; + goto group_extend_out; + } err = ext3_group_extend(sb, EXT3_SB(sb)->s_es, n_blocks_count); journal_lock_updates(EXT3_SB(sb)->s_journal); - journal_flush(EXT3_SB(sb)->s_journal); + err2 = journal_flush(EXT3_SB(sb)->s_journal); journal_unlock_updates(EXT3_SB(sb)->s_journal); - + if (err == 0) + err = err2; +group_extend_out: + mnt_drop_write_file(filp); return err; } case EXT3_IOC_GROUP_ADD: { struct ext3_new_group_data input; struct super_block *sb = inode->i_sb; - int err; + int err, err2; if (!capable(CAP_SYS_RESOURCE)) return -EPERM; - if (IS_RDONLY(inode)) - return -EROFS; + err = mnt_want_write_file(filp); + if (err) + return err; if (copy_from_user(&input, (struct ext3_new_group_input __user *)arg, - sizeof(input))) - return -EFAULT; + sizeof(input))) { + err = -EFAULT; + goto group_add_out; + } err = ext3_group_add(sb, &input); journal_lock_updates(EXT3_SB(sb)->s_journal); - journal_flush(EXT3_SB(sb)->s_journal); + err2 = journal_flush(EXT3_SB(sb)->s_journal); journal_unlock_updates(EXT3_SB(sb)->s_journal); - + if (err == 0) + err = err2; +group_add_out: + mnt_drop_write_file(filp); return err; } + case FITRIM: { + + struct super_block *sb = inode->i_sb; + struct fstrim_range range; + int ret = 0; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (copy_from_user(&range, (struct fstrim_range __user *)arg, + sizeof(range))) + return -EFAULT; + + ret = ext3_trim_fs(sb, &range); + if (ret < 0) + return ret; + + if (copy_to_user((struct fstrim_range __user *)arg, &range, + sizeof(range))) + return -EFAULT; + return 0; + } default: return -ENOTTY; @@ -263,9 +283,6 @@ flags_err: #ifdef CONFIG_COMPAT long ext3_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { - struct inode *inode = file->f_path.dentry->d_inode; - int ret; - /* These are just misnamed, they actually get/put from/to user an int */ switch (cmd) { case EXT3_IOC32_GETFLAGS: @@ -305,9 +322,6 @@ long ext3_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) default: return -ENOIOCTLCMD; } - lock_kernel(); - ret = ext3_ioctl(inode, file, cmd, (unsigned long) compat_ptr(arg)); - unlock_kernel(); - return ret; + return ext3_ioctl(file, cmd, (unsigned long) compat_ptr(arg)); } #endif diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c index dec3e0d88ab..f197736dccf 100644 --- a/fs/ext3/namei.c +++ b/fs/ext3/namei.c @@ -24,19 +24,8 @@ * Theodore Ts'o, 2002 */ -#include <linux/fs.h> -#include <linux/pagemap.h> -#include <linux/jbd.h> -#include <linux/time.h> -#include <linux/ext3_fs.h> -#include <linux/ext3_jbd.h> -#include <linux/fcntl.h> -#include <linux/stat.h> -#include <linux/string.h> #include <linux/quotaops.h> -#include <linux/buffer_head.h> -#include <linux/bio.h> - +#include "ext3.h" #include "namei.h" #include "xattr.h" #include "acl.h" @@ -47,7 +36,6 @@ #define NAMEI_RA_CHUNKS 2 #define NAMEI_RA_BLOCKS 4 #define NAMEI_RA_SIZE (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS) -#define NAMEI_RA_INDEX(c,b) (((c) * NAMEI_RA_BLOCKS) + (b)) static struct buffer_head *ext3_append(handle_t *handle, struct inode *inode, @@ -57,10 +45,14 @@ static struct buffer_head *ext3_append(handle_t *handle, *block = inode->i_size >> inode->i_sb->s_blocksize_bits; - if ((bh = ext3_bread(handle, inode, *block, 1, err))) { + if ((bh = ext3_dir_bread(handle, inode, *block, 1, err))) { inode->i_size += inode->i_sb->s_blocksize; EXT3_I(inode)->i_disksize = inode->i_size; - ext3_journal_get_write_access(handle,bh); + *err = ext3_journal_get_write_access(handle, bh); + if (*err) { + brelse(bh); + bh = NULL; + } } return bh; } @@ -69,10 +61,6 @@ static struct buffer_head *ext3_append(handle_t *handle, #define assert(test) J_ASSERT(test) #endif -#ifndef swap -#define swap(x, y) do { typeof(x) z = x; x = y; y = z; } while (0) -#endif - #ifdef DX_DEBUG #define dxtrace(command) command #else @@ -154,25 +142,26 @@ static void dx_set_count (struct dx_entry *entries, unsigned value); static void dx_set_limit (struct dx_entry *entries, unsigned value); static unsigned dx_root_limit (struct inode *dir, unsigned infosize); static unsigned dx_node_limit (struct inode *dir); -static struct dx_frame *dx_probe(struct dentry *dentry, +static struct dx_frame *dx_probe(struct qstr *entry, struct inode *dir, struct dx_hash_info *hinfo, struct dx_frame *frame, int *err); static void dx_release (struct dx_frame *frames); -static int dx_make_map (struct ext3_dir_entry_2 *de, int size, +static int dx_make_map(struct ext3_dir_entry_2 *de, unsigned blocksize, struct dx_hash_info *hinfo, struct dx_map_entry map[]); static void dx_sort_map(struct dx_map_entry *map, unsigned count); static struct ext3_dir_entry_2 *dx_move_dirents (char *from, char *to, struct dx_map_entry *offsets, int count); -static struct ext3_dir_entry_2* dx_pack_dirents (char *base, int size); +static struct ext3_dir_entry_2 *dx_pack_dirents(char *base, unsigned blocksize); static void dx_insert_block (struct dx_frame *frame, u32 hash, u32 block); static int ext3_htree_next_block(struct inode *dir, __u32 hash, struct dx_frame *frame, struct dx_frame *frames, __u32 *start_hash); -static struct buffer_head * ext3_dx_find_entry(struct dentry *dentry, - struct ext3_dir_entry_2 **res_dir, int *err); +static struct buffer_head * ext3_dx_find_entry(struct inode *dir, + struct qstr *entry, struct ext3_dir_entry_2 **res_dir, + int *err); static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry, struct inode *inode); @@ -235,13 +224,13 @@ static inline unsigned dx_root_limit (struct inode *dir, unsigned infosize) { unsigned entry_space = dir->i_sb->s_blocksize - EXT3_DIR_REC_LEN(1) - EXT3_DIR_REC_LEN(2) - infosize; - return 0? 20: entry_space / sizeof(struct dx_entry); + return entry_space / sizeof(struct dx_entry); } static inline unsigned dx_node_limit (struct inode *dir) { unsigned entry_space = dir->i_sb->s_blocksize - EXT3_DIR_REC_LEN(0); - return 0? 22: entry_space / sizeof(struct dx_entry); + return entry_space / sizeof(struct dx_entry); } /* @@ -285,7 +274,7 @@ static struct stats dx_show_leaf(struct dx_hash_info *hinfo, struct ext3_dir_ent while (len--) printk("%c", *name++); ext3fs_dirhash(de->name, de->name_len, &h); printk(":%x.%u ", h.hash, - ((char *) de - base)); + (unsigned) ((char *) de - base)); } space += EXT3_DIR_REC_LEN(de->name_len); names++; @@ -337,7 +326,7 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir, * back to userspace. */ static struct dx_frame * -dx_probe(struct dentry *dentry, struct inode *dir, +dx_probe(struct qstr *entry, struct inode *dir, struct dx_hash_info *hinfo, struct dx_frame *frame_in, int *err) { unsigned count, indirect; @@ -348,15 +337,15 @@ dx_probe(struct dentry *dentry, struct inode *dir, u32 hash; frame->bh = NULL; - if (dentry) - dir = dentry->d_parent->d_inode; - if (!(bh = ext3_bread (NULL,dir, 0, 0, err))) + if (!(bh = ext3_dir_bread(NULL, dir, 0, 0, err))) { + *err = ERR_BAD_DX_DIR; goto fail; + } root = (struct dx_root *) bh->b_data; if (root->info.hash_version != DX_HASH_TEA && root->info.hash_version != DX_HASH_HALF_MD4 && root->info.hash_version != DX_HASH_LEGACY) { - ext3_warning(dir->i_sb, __FUNCTION__, + ext3_warning(dir->i_sb, __func__, "Unrecognised inode hash code %d", root->info.hash_version); brelse(bh); @@ -364,13 +353,15 @@ dx_probe(struct dentry *dentry, struct inode *dir, goto fail; } hinfo->hash_version = root->info.hash_version; + if (hinfo->hash_version <= DX_HASH_TEA) + hinfo->hash_version += EXT3_SB(dir->i_sb)->s_hash_unsigned; hinfo->seed = EXT3_SB(dir->i_sb)->s_hash_seed; - if (dentry) - ext3fs_dirhash(dentry->d_name.name, dentry->d_name.len, hinfo); + if (entry) + ext3fs_dirhash(entry->name, entry->len, hinfo); hash = hinfo->hash; if (root->info.unused_flags & 1) { - ext3_warning(dir->i_sb, __FUNCTION__, + ext3_warning(dir->i_sb, __func__, "Unimplemented inode hash flags: %#06x", root->info.unused_flags); brelse(bh); @@ -379,7 +370,7 @@ dx_probe(struct dentry *dentry, struct inode *dir, } if ((indirect = root->info.indirect_levels) > 1) { - ext3_warning(dir->i_sb, __FUNCTION__, + ext3_warning(dir->i_sb, __func__, "Unimplemented inode hash depth: %#06x", root->info.indirect_levels); brelse(bh); @@ -392,7 +383,7 @@ dx_probe(struct dentry *dentry, struct inode *dir, if (dx_get_limit(entries) != dx_root_limit(dir, root->info.info_length)) { - ext3_warning(dir->i_sb, __FUNCTION__, + ext3_warning(dir->i_sb, __func__, "dx entry: limit != root limit"); brelse(bh); *err = ERR_BAD_DX_DIR; @@ -404,7 +395,7 @@ dx_probe(struct dentry *dentry, struct inode *dir, { count = dx_get_count(entries); if (!count || count > dx_get_limit(entries)) { - ext3_warning(dir->i_sb, __FUNCTION__, + ext3_warning(dir->i_sb, __func__, "dx entry: no count or count > limit"); brelse(bh); *err = ERR_BAD_DX_DIR; @@ -445,11 +436,13 @@ dx_probe(struct dentry *dentry, struct inode *dir, frame->entries = entries; frame->at = at; if (!indirect--) return frame; - if (!(bh = ext3_bread (NULL,dir, dx_get_block(at), 0, err))) + if (!(bh = ext3_dir_bread(NULL, dir, dx_get_block(at), 0, err))) { + *err = ERR_BAD_DX_DIR; goto fail2; + } at = entries = ((struct dx_node *) bh->b_data)->entries; if (dx_get_limit(entries) != dx_node_limit (dir)) { - ext3_warning(dir->i_sb, __FUNCTION__, + ext3_warning(dir->i_sb, __func__, "dx entry: limit != node limit"); brelse(bh); *err = ERR_BAD_DX_DIR; @@ -465,7 +458,7 @@ fail2: } fail: if (*err == ERR_BAD_DX_DIR) - ext3_warning(dir->i_sb, __FUNCTION__, + ext3_warning(dir->i_sb, __func__, "Corrupt dir inode %ld, running e2fsck is " "recommended.", dir->i_ino); return NULL; @@ -544,8 +537,8 @@ static int ext3_htree_next_block(struct inode *dir, __u32 hash, * block so no check is necessary */ while (num_frames--) { - if (!(bh = ext3_bread(NULL, dir, dx_get_block(p->at), - 0, &err))) + if (!(bh = ext3_dir_bread(NULL, dir, dx_get_block(p->at), + 0, &err))) return err; /* Failure */ p++; brelse (p->bh); @@ -568,10 +561,11 @@ static int htree_dirblock_to_tree(struct file *dir_file, { struct buffer_head *bh; struct ext3_dir_entry_2 *de, *top; - int err, count = 0; + int err = 0, count = 0; dxtrace(printk("In htree dirblock_to_tree: block %d\n", block)); - if (!(bh = ext3_bread (NULL, dir, block, 0, &err))) + + if (!(bh = ext3_dir_bread(NULL, dir, block, 0, &err))) return err; de = (struct ext3_dir_entry_2 *) bh->b_data; @@ -582,11 +576,8 @@ static int htree_dirblock_to_tree(struct file *dir_file, if (!ext3_check_dir_entry("htree_dirblock_to_tree", dir, de, bh, (block<<EXT3_BLOCK_SIZE_BITS(dir->i_sb)) +((char *)de - bh->b_data))) { - /* On error, skip the f_pos to the next block. */ - dir_file->f_pos = (dir_file->f_pos | - (dir->i_sb->s_blocksize - 1)) + 1; - brelse (bh); - return count; + /* silently ignore the rest of the block */ + break; } ext3fs_dirhash(de->name, de->name_len, hinfo); if ((hinfo->hash < start_hash) || @@ -629,9 +620,12 @@ int ext3_htree_fill_tree(struct file *dir_file, __u32 start_hash, dxtrace(printk("In htree_fill_tree, start hash: %x:%x\n", start_hash, start_minor_hash)); - dir = dir_file->f_path.dentry->d_inode; + dir = file_inode(dir_file); if (!(EXT3_I(dir)->i_flags & EXT3_INDEX_FL)) { hinfo.hash_version = EXT3_SB(dir->i_sb)->s_def_hash_version; + if (hinfo.hash_version <= DX_HASH_TEA) + hinfo.hash_version += + EXT3_SB(dir->i_sb)->s_hash_unsigned; hinfo.seed = EXT3_SB(dir->i_sb)->s_hash_seed; count = htree_dirblock_to_tree(dir_file, dir, 0, &hinfo, start_hash, start_minor_hash); @@ -640,7 +634,7 @@ int ext3_htree_fill_tree(struct file *dir_file, __u32 start_hash, } hinfo.hash = start_hash; hinfo.minor_hash = 0; - frame = dx_probe(NULL, dir_file->f_path.dentry->d_inode, &hinfo, frames, &err); + frame = dx_probe(NULL, file_inode(dir_file), &hinfo, frames, &err); if (!frame) return err; @@ -703,14 +697,14 @@ errout: * Create map of hash values, offsets, and sizes, stored at end of block. * Returns number of entries mapped. */ -static int dx_make_map (struct ext3_dir_entry_2 *de, int size, - struct dx_hash_info *hinfo, struct dx_map_entry *map_tail) +static int dx_make_map(struct ext3_dir_entry_2 *de, unsigned blocksize, + struct dx_hash_info *hinfo, struct dx_map_entry *map_tail) { int count = 0; char *base = (char *) de; struct dx_hash_info h = *hinfo; - while ((char *) de < base + size) + while ((char *) de < base + blocksize) { if (de->name_len && de->inode) { ext3fs_dirhash(de->name, de->name_len, &h); @@ -798,15 +792,15 @@ static inline int ext3_match (int len, const char * const name, */ static inline int search_dirblock(struct buffer_head * bh, struct inode *dir, - struct dentry *dentry, + struct qstr *child, unsigned long offset, struct ext3_dir_entry_2 ** res_dir) { struct ext3_dir_entry_2 * de; char * dlimit; int de_len; - const char *name = dentry->d_name.name; - int namelen = dentry->d_name.len; + const char *name = child->name; + int namelen = child->len; de = (struct ext3_dir_entry_2 *) bh->b_data; dlimit = bh->b_data + dir->i_sb->s_blocksize; @@ -845,29 +839,40 @@ static inline int search_dirblock(struct buffer_head * bh, * The returned buffer_head has ->b_count elevated. The caller is expected * to brelse() it when appropriate. */ -static struct buffer_head * ext3_find_entry (struct dentry *dentry, - struct ext3_dir_entry_2 ** res_dir) +static struct buffer_head *ext3_find_entry(struct inode *dir, + struct qstr *entry, + struct ext3_dir_entry_2 **res_dir) { struct super_block * sb; struct buffer_head * bh_use[NAMEI_RA_SIZE]; struct buffer_head * bh, *ret = NULL; unsigned long start, block, b; + const u8 *name = entry->name; int ra_max = 0; /* Number of bh's in the readahead buffer, bh_use[] */ int ra_ptr = 0; /* Current index into readahead buffer */ int num = 0; int nblocks, i, err; - struct inode *dir = dentry->d_parent->d_inode; int namelen; *res_dir = NULL; sb = dir->i_sb; - namelen = dentry->d_name.len; + namelen = entry->len; if (namelen > EXT3_NAME_LEN) return NULL; + if ((namelen <= 2) && (name[0] == '.') && + (name[1] == '.' || name[1] == 0)) { + /* + * "." or ".." will only be in the first block + * NFS may look up ".."; "." should be handled by the VFS + */ + block = start = 0; + nblocks = 1; + goto restart; + } if (is_dx(dir)) { - bh = ext3_dx_find_entry(dentry, res_dir, &err); + bh = ext3_dx_find_entry(dir, entry, res_dir, &err); /* * On success, or if the error was file not found, * return. Otherwise, fall back to doing a search the @@ -904,8 +909,12 @@ restart: num++; bh = ext3_getblk(NULL, dir, b++, 0, &err); bh_use[ra_max] = bh; - if (bh) - ll_rw_block(READ_META, 1, &bh); + if (bh && !bh_uptodate_or_lock(bh)) { + get_bh(bh); + bh->b_end_io = end_buffer_read_sync; + submit_bh(READ | REQ_META | REQ_PRIO, + bh); + } } } if ((bh = bh_use[ra_ptr++]) == NULL) @@ -913,12 +922,12 @@ restart: wait_on_buffer(bh); if (!buffer_uptodate(bh)) { /* read error, skip block & hope for the best */ - ext3_error(sb, __FUNCTION__, "reading directory #%lu " + ext3_error(sb, __func__, "reading directory #%lu " "offset %lu", dir->i_ino, block); brelse(bh); goto next; } - i = search_dirblock(bh, dir, dentry, + i = search_dirblock(bh, dir, entry, block << EXT3_BLOCK_SIZE_BITS(sb), res_dir); if (i == 1) { EXT3_I(dir)->i_dir_start_lookup = block; @@ -952,60 +961,42 @@ cleanup_and_exit: return ret; } -static struct buffer_head * ext3_dx_find_entry(struct dentry *dentry, - struct ext3_dir_entry_2 **res_dir, int *err) +static struct buffer_head * ext3_dx_find_entry(struct inode *dir, + struct qstr *entry, struct ext3_dir_entry_2 **res_dir, + int *err) { - struct super_block * sb; + struct super_block *sb = dir->i_sb; struct dx_hash_info hinfo; - u32 hash; struct dx_frame frames[2], *frame; - struct ext3_dir_entry_2 *de, *top; struct buffer_head *bh; unsigned long block; int retval; - int namelen = dentry->d_name.len; - const u8 *name = dentry->d_name.name; - struct inode *dir = dentry->d_parent->d_inode; - sb = dir->i_sb; - /* NFS may look up ".." - look at dx_root directory block */ - if (namelen > 2 || name[0] != '.'||(name[1] != '.' && name[1] != '\0')){ - if (!(frame = dx_probe(dentry, NULL, &hinfo, frames, err))) - return NULL; - } else { - frame = frames; - frame->bh = NULL; /* for dx_release() */ - frame->at = (struct dx_entry *)frames; /* hack for zero entry*/ - dx_set_block(frame->at, 0); /* dx_root block is 0 */ - } - hash = hinfo.hash; + if (!(frame = dx_probe(entry, dir, &hinfo, frames, err))) + return NULL; do { block = dx_get_block(frame->at); - if (!(bh = ext3_bread (NULL,dir, block, 0, err))) + if (!(bh = ext3_dir_bread (NULL, dir, block, 0, err))) goto errout; - de = (struct ext3_dir_entry_2 *) bh->b_data; - top = (struct ext3_dir_entry_2 *) ((char *) de + sb->s_blocksize - - EXT3_DIR_REC_LEN(0)); - for (; de < top; de = ext3_next_entry(de)) - if (ext3_match (namelen, name, de)) { - if (!ext3_check_dir_entry("ext3_find_entry", - dir, de, bh, - (block<<EXT3_BLOCK_SIZE_BITS(sb)) - +((char *)de - bh->b_data))) { - brelse (bh); - *err = ERR_BAD_DX_DIR; - goto errout; - } - *res_dir = de; - dx_release (frames); + + retval = search_dirblock(bh, dir, entry, + block << EXT3_BLOCK_SIZE_BITS(sb), + res_dir); + if (retval == 1) { + dx_release(frames); return bh; } - brelse (bh); + brelse(bh); + if (retval == -1) { + *err = ERR_BAD_DX_DIR; + goto errout; + } + /* Check to see if we should continue to search */ - retval = ext3_htree_next_block(dir, hash, frame, + retval = ext3_htree_next_block(dir, hinfo.hash, frame, frames, NULL); if (retval < 0) { - ext3_warning(sb, __FUNCTION__, + ext3_warning(sb, __func__, "error reading index page in directory #%lu", dir->i_ino); *err = retval; @@ -1015,12 +1006,12 @@ static struct buffer_head * ext3_dx_find_entry(struct dentry *dentry, *err = -ENOENT; errout: - dxtrace(printk("%s not found\n", name)); + dxtrace(printk("%s not found\n", entry->name)); dx_release (frames); return NULL; } -static struct dentry *ext3_lookup(struct inode * dir, struct dentry *dentry, struct nameidata *nd) +static struct dentry *ext3_lookup(struct inode * dir, struct dentry *dentry, unsigned int flags) { struct inode * inode; struct ext3_dir_entry_2 * de; @@ -1029,7 +1020,7 @@ static struct dentry *ext3_lookup(struct inode * dir, struct dentry *dentry, str if (dentry->d_name.len > EXT3_NAME_LEN) return ERR_PTR(-ENAMETOOLONG); - bh = ext3_find_entry(dentry, &de); + bh = ext3_find_entry(dir, &dentry->d_name, &de); inode = NULL; if (bh) { unsigned long ino = le32_to_cpu(de->inode); @@ -1040,8 +1031,12 @@ static struct dentry *ext3_lookup(struct inode * dir, struct dentry *dentry, str return ERR_PTR(-EIO); } inode = ext3_iget(dir->i_sb, ino); - if (IS_ERR(inode)) - return ERR_CAST(inode); + if (inode == ERR_PTR(-ESTALE)) { + ext3_error(dir->i_sb, __func__, + "deleted inode referenced: %lu", + ino); + return ERR_PTR(-EIO); + } } return d_splice_alias(inode, dentry); } @@ -1050,18 +1045,11 @@ static struct dentry *ext3_lookup(struct inode * dir, struct dentry *dentry, str struct dentry *ext3_get_parent(struct dentry *child) { unsigned long ino; - struct dentry *parent; - struct inode *inode; - struct dentry dotdot; + struct qstr dotdot = QSTR_INIT("..", 2); struct ext3_dir_entry_2 * de; struct buffer_head *bh; - dotdot.d_name.name = ".."; - dotdot.d_name.len = 2; - dotdot.d_parent = child; /* confusing, isn't it! */ - - bh = ext3_find_entry(&dotdot, &de); - inode = NULL; + bh = ext3_find_entry(child->d_inode, &dotdot, &de); if (!bh) return ERR_PTR(-ENOENT); ino = le32_to_cpu(de->inode); @@ -1073,16 +1061,7 @@ struct dentry *ext3_get_parent(struct dentry *child) return ERR_PTR(-EIO); } - inode = ext3_iget(child->d_inode->i_sb, ino); - if (IS_ERR(inode)) - return ERR_CAST(inode); - - parent = d_alloc_anon(inode); - if (!parent) { - iput(inode); - parent = ERR_PTR(-ENOMEM); - } - return parent; + return d_obtain_alias(ext3_iget(child->d_inode->i_sb, ino)); } #define S_SHIFT 12 @@ -1129,13 +1108,14 @@ dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count) * Compact each dir entry in the range to the minimal rec_len. * Returns pointer to last entry in range. */ -static struct ext3_dir_entry_2* dx_pack_dirents(char *base, int size) +static struct ext3_dir_entry_2 *dx_pack_dirents(char *base, unsigned blocksize) { - struct ext3_dir_entry_2 *next, *to, *prev, *de = (struct ext3_dir_entry_2 *) base; + struct ext3_dir_entry_2 *next, *to, *prev; + struct ext3_dir_entry_2 *de = (struct ext3_dir_entry_2 *)base; unsigned rec_len = 0; prev = to = de; - while ((char*)de < base + size) { + while ((char *)de < base + blocksize) { next = ext3_next_entry(de); if (de->inode && de->name_len) { rec_len = EXT3_DIR_REC_LEN(de->name_len); @@ -1166,9 +1146,9 @@ static struct ext3_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, u32 hash2; struct dx_map_entry *map; char *data1 = (*bh)->b_data, *data2; - unsigned split, move, size, i; + unsigned split, move, size; struct ext3_dir_entry_2 *de = NULL, *de2; - int err = 0; + int err = 0, i; bh2 = ext3_append (handle, dir, &newblock, &err); if (!(bh2)) { @@ -1367,7 +1347,7 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry, struct fake_dirent *fde; blocksize = dir->i_sb->s_blocksize; - dxtrace(printk("Creating index\n")); + dxtrace(printk(KERN_DEBUG "Creating index: inode %lu\n", dir->i_ino)); retval = ext3_journal_get_write_access(handle, bh); if (retval) { ext3_std_error(dir->i_sb, retval); @@ -1376,6 +1356,19 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry, } root = (struct dx_root *) bh->b_data; + /* The 0th block becomes the root, move the dirents out */ + fde = &root->dotdot; + de = (struct ext3_dir_entry_2 *)((char *)fde + + ext3_rec_len_from_disk(fde->rec_len)); + if ((char *) de >= (((char *) root) + blocksize)) { + ext3_error(dir->i_sb, __func__, + "invalid rec_len for '..' in inode %lu", + dir->i_ino); + brelse(bh); + return -EIO; + } + len = ((char *) root) + blocksize - (char *) de; + bh2 = ext3_append (handle, dir, &block, &retval); if (!(bh2)) { brelse(bh); @@ -1384,11 +1377,6 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry, EXT3_I(dir)->i_flags |= EXT3_INDEX_FL; data1 = bh2->b_data; - /* The 0th block becomes the root, move the dirents out */ - fde = &root->dotdot; - de = (struct ext3_dir_entry_2 *)((char *)fde + - ext3_rec_len_from_disk(fde->rec_len)); - len = ((char *) root) + blocksize - (char *) de; memcpy (data1, de, len); de = (struct ext3_dir_entry_2 *) data1; top = data1 + len; @@ -1408,6 +1396,8 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry, /* Initialize as for dx_probe */ hinfo.hash_version = root->info.hash_version; + if (hinfo.hash_version <= DX_HASH_TEA) + hinfo.hash_version += EXT3_SB(dir->i_sb)->s_hash_unsigned; hinfo.seed = EXT3_SB(dir->i_sb)->s_hash_seed; ext3fs_dirhash(name, namelen, &hinfo); frame = frames; @@ -1415,10 +1405,19 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry, frame->at = entries; frame->bh = bh; bh = bh2; + /* + * Mark buffers dirty here so that if do_split() fails we write a + * consistent set of buffers to disk. + */ + ext3_journal_dirty_metadata(handle, frame->bh); + ext3_journal_dirty_metadata(handle, bh); de = do_split(handle,dir, &bh, frame, &hinfo, &retval); - dx_release (frames); - if (!(de)) + if (!de) { + ext3_mark_inode_dirty(handle, dir); + dx_release(frames); return retval; + } + dx_release(frames); return add_dirent_to_buf(handle, dentry, inode, de, bh); } @@ -1437,7 +1436,6 @@ static int ext3_add_entry (handle_t *handle, struct dentry *dentry, struct inode *inode) { struct inode *dir = dentry->d_parent->d_inode; - unsigned long offset; struct buffer_head * bh; struct ext3_dir_entry_2 *de; struct super_block * sb; @@ -1459,10 +1457,10 @@ static int ext3_add_entry (handle_t *handle, struct dentry *dentry, ext3_mark_inode_dirty(handle, dir); } blocks = dir->i_size >> sb->s_blocksize_bits; - for (block = 0, offset = 0; block < blocks; block++) { - bh = ext3_bread(handle, dir, block, 0, &retval); - if(!bh) + for (block = 0; block < blocks; block++) { + if (!(bh = ext3_dir_bread(handle, dir, block, 0, &retval))) return retval; + retval = add_dirent_to_buf(handle, dentry, inode, NULL, bh); if (retval != -ENOSPC) return retval; @@ -1496,13 +1494,13 @@ static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry, struct ext3_dir_entry_2 *de; int err; - frame = dx_probe(dentry, NULL, &hinfo, frames, &err); + frame = dx_probe(&dentry->d_name, dir, &hinfo, frames, &err); if (!frame) return err; entries = frame->entries; at = frame->at; - if (!(bh = ext3_bread(handle,dir, dx_get_block(frame->at), 0, &err))) + if (!(bh = ext3_dir_bread(handle, dir, dx_get_block(frame->at), 0, &err))) goto cleanup; BUFFER_TRACE(bh, "get_write_access"); @@ -1530,7 +1528,7 @@ static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry, if (levels && (dx_get_count(frames->entries) == dx_get_limit(frames->entries))) { - ext3_warning(sb, __FUNCTION__, + ext3_warning(sb, __func__, "Directory index full!"); err = -ENOSPC; goto cleanup; @@ -1540,8 +1538,8 @@ static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry, goto cleanup; node2 = (struct dx_node *)(bh2->b_data); entries2 = node2->entries; + memset(&node2->fake, 0, sizeof(struct fake_dirent)); node2->fake.rec_len = ext3_rec_len_to_disk(sb->s_blocksize); - node2->fake.inode = 0; BUFFER_TRACE(frame->bh, "get_write_access"); err = ext3_journal_get_write_access(handle, frame->bh); if (err) @@ -1598,7 +1596,9 @@ static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry, if (err) goto journal_error; } - ext3_journal_dirty_metadata(handle, frames[0].bh); + err = ext3_journal_dirty_metadata(handle, frames[0].bh); + if (err) + goto journal_error; } de = do_split(handle, dir, &bh, frame, &hinfo, &err); if (!de) @@ -1635,8 +1635,13 @@ static int ext3_delete_entry (handle_t *handle, if (!ext3_check_dir_entry("ext3_delete_entry", dir, de, bh, i)) return -EIO; if (de == de_del) { + int err; + BUFFER_TRACE(bh, "get_write_access"); - ext3_journal_get_write_access(handle, bh); + err = ext3_journal_get_write_access(handle, bh); + if (err) + goto journal_error; + if (pde) pde->rec_len = ext3_rec_len_to_disk( ext3_rec_len_from_disk(pde->rec_len) + @@ -1645,7 +1650,12 @@ static int ext3_delete_entry (handle_t *handle, de->inode = 0; dir->i_version++; BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); - ext3_journal_dirty_metadata(handle, bh); + err = ext3_journal_dirty_metadata(handle, bh); + if (err) { +journal_error: + ext3_std_error(dir->i_sb, err); + return err; + } return 0; } i += ext3_rec_len_from_disk(de->rec_len); @@ -1661,10 +1671,12 @@ static int ext3_add_nondir(handle_t *handle, int err = ext3_add_entry(handle, dentry, inode); if (!err) { ext3_mark_inode_dirty(handle, inode); + unlock_new_inode(inode); d_instantiate(dentry, inode); return 0; } drop_nlink(inode); + unlock_new_inode(inode); iput(inode); return err; } @@ -1677,24 +1689,26 @@ static int ext3_add_nondir(handle_t *handle, * If the create succeeds, we fill in the inode information * with d_instantiate(). */ -static int ext3_create (struct inode * dir, struct dentry * dentry, int mode, - struct nameidata *nd) +static int ext3_create (struct inode * dir, struct dentry * dentry, umode_t mode, + bool excl) { handle_t *handle; struct inode * inode; int err, retries = 0; + dquot_initialize(dir); + retry: handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) + EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3 + - 2*EXT3_QUOTA_INIT_BLOCKS(dir->i_sb)); + EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb)); if (IS_ERR(handle)) return PTR_ERR(handle); if (IS_DIRSYNC(dir)) handle->h_sync = 1; - inode = ext3_new_inode (handle, dir, mode); + inode = ext3_new_inode (handle, dir, &dentry->d_name, mode); err = PTR_ERR(inode); if (!IS_ERR(inode)) { inode->i_op = &ext3_file_inode_operations; @@ -1709,7 +1723,7 @@ retry: } static int ext3_mknod (struct inode * dir, struct dentry *dentry, - int mode, dev_t rdev) + umode_t mode, dev_t rdev) { handle_t *handle; struct inode *inode; @@ -1718,17 +1732,19 @@ static int ext3_mknod (struct inode * dir, struct dentry *dentry, if (!new_valid_dev(rdev)) return -EINVAL; + dquot_initialize(dir); + retry: handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) + EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3 + - 2*EXT3_QUOTA_INIT_BLOCKS(dir->i_sb)); + EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb)); if (IS_ERR(handle)) return PTR_ERR(handle); if (IS_DIRSYNC(dir)) handle->h_sync = 1; - inode = ext3_new_inode (handle, dir, mode); + inode = ext3_new_inode (handle, dir, &dentry->d_name, mode); err = PTR_ERR(inode); if (!IS_ERR(inode)) { init_special_inode(inode, inode->i_mode, rdev); @@ -1743,28 +1759,68 @@ retry: return err; } -static int ext3_mkdir(struct inode * dir, struct dentry * dentry, int mode) +static int ext3_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) +{ + handle_t *handle; + struct inode *inode; + int err, retries = 0; + + dquot_initialize(dir); + +retry: + handle = ext3_journal_start(dir, EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb) + + 4 + EXT3_XATTR_TRANS_BLOCKS); + + if (IS_ERR(handle)) + return PTR_ERR(handle); + + inode = ext3_new_inode (handle, dir, NULL, mode); + err = PTR_ERR(inode); + if (!IS_ERR(inode)) { + inode->i_op = &ext3_file_inode_operations; + inode->i_fop = &ext3_file_operations; + ext3_set_aops(inode); + d_tmpfile(dentry, inode); + err = ext3_orphan_add(handle, inode); + if (err) + goto err_unlock_inode; + mark_inode_dirty(inode); + unlock_new_inode(inode); + } + ext3_journal_stop(handle); + if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries)) + goto retry; + return err; +err_unlock_inode: + ext3_journal_stop(handle); + unlock_new_inode(inode); + return err; +} + +static int ext3_mkdir(struct inode * dir, struct dentry * dentry, umode_t mode) { handle_t *handle; struct inode * inode; - struct buffer_head * dir_block; + struct buffer_head * dir_block = NULL; struct ext3_dir_entry_2 * de; int err, retries = 0; if (dir->i_nlink >= EXT3_LINK_MAX) return -EMLINK; + dquot_initialize(dir); + retry: handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) + EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3 + - 2*EXT3_QUOTA_INIT_BLOCKS(dir->i_sb)); + EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb)); if (IS_ERR(handle)) return PTR_ERR(handle); if (IS_DIRSYNC(dir)) handle->h_sync = 1; - inode = ext3_new_inode (handle, dir, S_IFDIR | mode); + inode = ext3_new_inode (handle, dir, &dentry->d_name, S_IFDIR | mode); err = PTR_ERR(inode); if (IS_ERR(inode)) goto out_stop; @@ -1772,15 +1828,14 @@ retry: inode->i_op = &ext3_dir_inode_operations; inode->i_fop = &ext3_dir_operations; inode->i_size = EXT3_I(inode)->i_disksize = inode->i_sb->s_blocksize; - dir_block = ext3_bread (handle, inode, 0, 1, &err); - if (!dir_block) { - drop_nlink(inode); /* is this nlink == 0? */ - ext3_mark_inode_dirty(handle, inode); - iput (inode); - goto out_stop; - } + if (!(dir_block = ext3_dir_bread(handle, inode, 0, 1, &err))) + goto out_clear_inode; + BUFFER_TRACE(dir_block, "get_write_access"); - ext3_journal_get_write_access(handle, dir_block); + err = ext3_journal_get_write_access(handle, dir_block); + if (err) + goto out_clear_inode; + de = (struct ext3_dir_entry_2 *) dir_block->b_data; de->inode = cpu_to_le32(inode->i_ino); de->name_len = 1; @@ -1794,23 +1849,34 @@ retry: de->name_len = 2; strcpy (de->name, ".."); ext3_set_de_type(dir->i_sb, de, S_IFDIR); - inode->i_nlink = 2; + set_nlink(inode, 2); BUFFER_TRACE(dir_block, "call ext3_journal_dirty_metadata"); - ext3_journal_dirty_metadata(handle, dir_block); - brelse (dir_block); - ext3_mark_inode_dirty(handle, inode); - err = ext3_add_entry (handle, dentry, inode); + err = ext3_journal_dirty_metadata(handle, dir_block); + if (err) + goto out_clear_inode; + + err = ext3_mark_inode_dirty(handle, inode); + if (!err) + err = ext3_add_entry (handle, dentry, inode); + if (err) { - inode->i_nlink = 0; +out_clear_inode: + clear_nlink(inode); + unlock_new_inode(inode); ext3_mark_inode_dirty(handle, inode); iput (inode); goto out_stop; } inc_nlink(dir); ext3_update_dx_flag(dir); - ext3_mark_inode_dirty(handle, dir); + err = ext3_mark_inode_dirty(handle, dir); + if (err) + goto out_clear_inode; + + unlock_new_inode(inode); d_instantiate(dentry, inode); out_stop: + brelse(dir_block); ext3_journal_stop(handle); if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries)) goto retry; @@ -1830,13 +1896,13 @@ static int empty_dir (struct inode * inode) sb = inode->i_sb; if (inode->i_size < EXT3_DIR_REC_LEN(1) + EXT3_DIR_REC_LEN(2) || - !(bh = ext3_bread (NULL, inode, 0, 0, &err))) { + !(bh = ext3_dir_bread(NULL, inode, 0, 0, &err))) { if (err) - ext3_error(inode->i_sb, __FUNCTION__, + ext3_error(inode->i_sb, __func__, "error %d reading directory #%lu offset 0", err, inode->i_ino); else - ext3_warning(inode->i_sb, __FUNCTION__, + ext3_warning(inode->i_sb, __func__, "bad directory (dir #%lu) - no data block", inode->i_ino); return 1; @@ -1861,11 +1927,10 @@ static int empty_dir (struct inode * inode) (void *) de >= (void *) (bh->b_data+sb->s_blocksize)) { err = 0; brelse (bh); - bh = ext3_bread (NULL, inode, - offset >> EXT3_BLOCK_SIZE_BITS(sb), 0, &err); - if (!bh) { + if (!(bh = ext3_dir_bread (NULL, inode, + offset >> EXT3_BLOCK_SIZE_BITS(sb), 0, &err))) { if (err) - ext3_error(sb, __FUNCTION__, + ext3_error(sb, __func__, "error %d reading directory" " #%lu offset %lu", err, inode->i_ino, offset); @@ -1905,7 +1970,7 @@ int ext3_orphan_add(handle_t *handle, struct inode *inode) struct ext3_iloc iloc; int err = 0, rc; - lock_super(sb); + mutex_lock(&EXT3_SB(sb)->s_orphan_lock); if (!list_empty(&EXT3_I(inode)->i_orphan)) goto out_unlock; @@ -1914,9 +1979,13 @@ int ext3_orphan_add(handle_t *handle, struct inode *inode) /* @@@ FIXME: Observation from aviro: * I think I can trigger J_ASSERT in ext3_orphan_add(). We block - * here (on lock_super()), so race with ext3_link() which might bump + * here (on s_orphan_lock), so race with ext3_link() which might bump * ->i_nlink. For, say it, character device. Not a regular file, * not a directory, not a symlink and ->i_nlink > 0. + * + * tytso, 4/25/2009: I'm not sure how that could happen; + * shouldn't the fs core protect us from these sort of + * unlink()/link() races? */ J_ASSERT ((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) || inode->i_nlink == 0); @@ -1953,7 +2022,7 @@ int ext3_orphan_add(handle_t *handle, struct inode *inode) jbd_debug(4, "orphan inode %lu will point to %d\n", inode->i_ino, NEXT_ORPHAN(inode)); out_unlock: - unlock_super(sb); + mutex_unlock(&EXT3_SB(sb)->s_orphan_lock); ext3_std_error(inode->i_sb, err); return err; } @@ -1971,11 +2040,9 @@ int ext3_orphan_del(handle_t *handle, struct inode *inode) struct ext3_iloc iloc; int err = 0; - lock_super(inode->i_sb); - if (list_empty(&ei->i_orphan)) { - unlock_super(inode->i_sb); - return 0; - } + mutex_lock(&EXT3_SB(inode->i_sb)->s_orphan_lock); + if (list_empty(&ei->i_orphan)) + goto out; ino_next = NEXT_ORPHAN(inode); prev = ei->i_orphan.prev; @@ -2025,7 +2092,7 @@ int ext3_orphan_del(handle_t *handle, struct inode *inode) out_err: ext3_std_error(inode->i_sb, err); out: - unlock_super(inode->i_sb); + mutex_unlock(&EXT3_SB(inode->i_sb)->s_orphan_lock); return err; out_brelse: @@ -2043,13 +2110,15 @@ static int ext3_rmdir (struct inode * dir, struct dentry *dentry) /* Initialize quotas before so that eventual writes go in * separate transaction */ - DQUOT_INIT(dentry->d_inode); + dquot_initialize(dir); + dquot_initialize(dentry->d_inode); + handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS(dir->i_sb)); if (IS_ERR(handle)) return PTR_ERR(handle); retval = -ENOENT; - bh = ext3_find_entry (dentry, &de); + bh = ext3_find_entry(dir, &dentry->d_name, &de); if (!bh) goto end_rmdir; @@ -2100,9 +2169,12 @@ static int ext3_unlink(struct inode * dir, struct dentry *dentry) struct ext3_dir_entry_2 * de; handle_t *handle; + trace_ext3_unlink_enter(dir, dentry); /* Initialize quotas before so that eventual writes go * in separate transaction */ - DQUOT_INIT(dentry->d_inode); + dquot_initialize(dir); + dquot_initialize(dentry->d_inode); + handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS(dir->i_sb)); if (IS_ERR(handle)) return PTR_ERR(handle); @@ -2111,7 +2183,7 @@ static int ext3_unlink(struct inode * dir, struct dentry *dentry) handle->h_sync = 1; retval = -ENOENT; - bh = ext3_find_entry (dentry, &de); + bh = ext3_find_entry(dir, &dentry->d_name, &de); if (!bh) goto end_unlink; @@ -2125,7 +2197,7 @@ static int ext3_unlink(struct inode * dir, struct dentry *dentry) ext3_warning (inode->i_sb, "ext3_unlink", "Deleting nonexistent file (%lu), %d", inode->i_ino, inode->i_nlink); - inode->i_nlink = 1; + set_nlink(inode, 1); } retval = ext3_delete_entry(handle, dir, de, bh); if (retval) @@ -2143,6 +2215,7 @@ static int ext3_unlink(struct inode * dir, struct dentry *dentry) end_unlink: ext3_journal_stop(handle); brelse (bh); + trace_ext3_unlink_exit(dentry, retval); return retval; } @@ -2152,41 +2225,86 @@ static int ext3_symlink (struct inode * dir, handle_t *handle; struct inode * inode; int l, err, retries = 0; + int credits; l = strlen(symname)+1; if (l > dir->i_sb->s_blocksize) return -ENAMETOOLONG; + dquot_initialize(dir); + + if (l > EXT3_N_BLOCKS * 4) { + /* + * For non-fast symlinks, we just allocate inode and put it on + * orphan list in the first transaction => we need bitmap, + * group descriptor, sb, inode block, quota blocks, and + * possibly selinux xattr blocks. + */ + credits = 4 + EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb) + + EXT3_XATTR_TRANS_BLOCKS; + } else { + /* + * Fast symlink. We have to add entry to directory + * (EXT3_DATA_TRANS_BLOCKS + EXT3_INDEX_EXTRA_TRANS_BLOCKS), + * allocate new inode (bitmap, group descriptor, inode block, + * quota blocks, sb is already counted in previous macros). + */ + credits = EXT3_DATA_TRANS_BLOCKS(dir->i_sb) + + EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3 + + EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb); + } retry: - handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) + - EXT3_INDEX_EXTRA_TRANS_BLOCKS + 5 + - 2*EXT3_QUOTA_INIT_BLOCKS(dir->i_sb)); + handle = ext3_journal_start(dir, credits); if (IS_ERR(handle)) return PTR_ERR(handle); if (IS_DIRSYNC(dir)) handle->h_sync = 1; - inode = ext3_new_inode (handle, dir, S_IFLNK|S_IRWXUGO); + inode = ext3_new_inode (handle, dir, &dentry->d_name, S_IFLNK|S_IRWXUGO); err = PTR_ERR(inode); if (IS_ERR(inode)) goto out_stop; - if (l > sizeof (EXT3_I(inode)->i_data)) { + if (l > EXT3_N_BLOCKS * 4) { inode->i_op = &ext3_symlink_inode_operations; ext3_set_aops(inode); /* - * page_symlink() calls into ext3_prepare/commit_write. - * We have a transaction open. All is sweetness. It also sets - * i_size in generic_commit_write(). + * We cannot call page_symlink() with transaction started + * because it calls into ext3_write_begin() which acquires page + * lock which ranks below transaction start (and it can also + * wait for journal commit if we are running out of space). So + * we have to stop transaction now and restart it when symlink + * contents is written. + * + * To keep fs consistent in case of crash, we have to put inode + * to orphan list in the mean time. + */ + drop_nlink(inode); + err = ext3_orphan_add(handle, inode); + ext3_journal_stop(handle); + if (err) + goto err_drop_inode; + err = __page_symlink(inode, symname, l, 1); + if (err) + goto err_drop_inode; + /* + * Now inode is being linked into dir (EXT3_DATA_TRANS_BLOCKS + * + EXT3_INDEX_EXTRA_TRANS_BLOCKS), inode is also modified */ - err = __page_symlink(inode, symname, l, - mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS); + handle = ext3_journal_start(dir, + EXT3_DATA_TRANS_BLOCKS(dir->i_sb) + + EXT3_INDEX_EXTRA_TRANS_BLOCKS + 1); + if (IS_ERR(handle)) { + err = PTR_ERR(handle); + goto err_drop_inode; + } + set_nlink(inode, 1); + err = ext3_orphan_del(handle, inode); if (err) { + ext3_journal_stop(handle); drop_nlink(inode); - ext3_mark_inode_dirty(handle, inode); - iput (inode); - goto out_stop; + goto err_drop_inode; } } else { inode->i_op = &ext3_fast_symlink_inode_operations; @@ -2200,6 +2318,10 @@ out_stop: if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries)) goto retry; return err; +err_drop_inode: + unlock_new_inode(inode); + iput(inode); + return err; } static int ext3_link (struct dentry * old_dentry, @@ -2211,16 +2333,12 @@ static int ext3_link (struct dentry * old_dentry, if (inode->i_nlink >= EXT3_LINK_MAX) return -EMLINK; - /* - * Return -ENOENT if we've raced with unlink and i_nlink is 0. Doing - * otherwise has the potential to corrupt the orphan inode list. - */ - if (inode->i_nlink == 0) - return -ENOENT; + + dquot_initialize(dir); retry: handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) + - EXT3_INDEX_EXTRA_TRANS_BLOCKS); + EXT3_INDEX_EXTRA_TRANS_BLOCKS + 1); if (IS_ERR(handle)) return PTR_ERR(handle); @@ -2229,9 +2347,21 @@ retry: inode->i_ctime = CURRENT_TIME_SEC; inc_nlink(inode); - atomic_inc(&inode->i_count); + ihold(inode); - err = ext3_add_nondir(handle, dentry, inode); + err = ext3_add_entry(handle, dentry, inode); + if (!err) { + ext3_mark_inode_dirty(handle, inode); + /* this can happen only for tmpfile being + * linked the first time + */ + if (inode->i_nlink == 1) + ext3_orphan_del(handle, inode); + d_instantiate(dentry, inode); + } else { + drop_nlink(inode); + iput(inode); + } ext3_journal_stop(handle); if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries)) goto retry; @@ -2252,14 +2382,17 @@ static int ext3_rename (struct inode * old_dir, struct dentry *old_dentry, struct inode * old_inode, * new_inode; struct buffer_head * old_bh, * new_bh, * dir_bh; struct ext3_dir_entry_2 * old_de, * new_de; - int retval; + int retval, flush_file = 0; + + dquot_initialize(old_dir); + dquot_initialize(new_dir); old_bh = new_bh = dir_bh = NULL; /* Initialize quotas before so that eventual writes go * in separate transaction */ if (new_dentry->d_inode) - DQUOT_INIT(new_dentry->d_inode); + dquot_initialize(new_dentry->d_inode); handle = ext3_journal_start(old_dir, 2 * EXT3_DATA_TRANS_BLOCKS(old_dir->i_sb) + EXT3_INDEX_EXTRA_TRANS_BLOCKS + 2); @@ -2269,7 +2402,7 @@ static int ext3_rename (struct inode * old_dir, struct dentry *old_dentry, if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir)) handle->h_sync = 1; - old_bh = ext3_find_entry (old_dentry, &old_de); + old_bh = ext3_find_entry(old_dir, &old_dentry->d_name, &old_de); /* * Check for inode number is _not_ due to possible IO errors. * We might rmdir the source, keep it as pwd of some process @@ -2282,7 +2415,7 @@ static int ext3_rename (struct inode * old_dir, struct dentry *old_dentry, goto end_rename; new_inode = new_dentry->d_inode; - new_bh = ext3_find_entry (new_dentry, &new_de); + new_bh = ext3_find_entry(new_dir, &new_dentry->d_name, &new_de); if (new_bh) { if (!new_inode) { brelse (new_bh); @@ -2296,7 +2429,7 @@ static int ext3_rename (struct inode * old_dir, struct dentry *old_dentry, goto end_rename; } retval = -EIO; - dir_bh = ext3_bread (handle, old_inode, 0, 0, &retval); + dir_bh = ext3_dir_bread(handle, old_inode, 0, 0, &retval); if (!dir_bh) goto end_rename; if (le32_to_cpu(PARENT_INO(dir_bh->b_data)) != old_dir->i_ino) @@ -2312,14 +2445,20 @@ static int ext3_rename (struct inode * old_dir, struct dentry *old_dentry, goto end_rename; } else { BUFFER_TRACE(new_bh, "get write access"); - ext3_journal_get_write_access(handle, new_bh); + retval = ext3_journal_get_write_access(handle, new_bh); + if (retval) + goto journal_error; new_de->inode = cpu_to_le32(old_inode->i_ino); if (EXT3_HAS_INCOMPAT_FEATURE(new_dir->i_sb, EXT3_FEATURE_INCOMPAT_FILETYPE)) new_de->file_type = old_de->file_type; new_dir->i_version++; + new_dir->i_ctime = new_dir->i_mtime = CURRENT_TIME_SEC; + ext3_mark_inode_dirty(handle, new_dir); BUFFER_TRACE(new_bh, "call ext3_journal_dirty_metadata"); - ext3_journal_dirty_metadata(handle, new_bh); + retval = ext3_journal_dirty_metadata(handle, new_bh); + if (retval) + goto journal_error; brelse(new_bh); new_bh = NULL; } @@ -2346,7 +2485,8 @@ static int ext3_rename (struct inode * old_dir, struct dentry *old_dentry, struct buffer_head *old_bh2; struct ext3_dir_entry_2 *old_de2; - old_bh2 = ext3_find_entry(old_dentry, &old_de2); + old_bh2 = ext3_find_entry(old_dir, &old_dentry->d_name, + &old_de2); if (old_bh2) { retval = ext3_delete_entry(handle, old_dir, old_de2, old_bh2); @@ -2367,10 +2507,17 @@ static int ext3_rename (struct inode * old_dir, struct dentry *old_dentry, ext3_update_dx_flag(old_dir); if (dir_bh) { BUFFER_TRACE(dir_bh, "get_write_access"); - ext3_journal_get_write_access(handle, dir_bh); + retval = ext3_journal_get_write_access(handle, dir_bh); + if (retval) + goto journal_error; PARENT_INO(dir_bh->b_data) = cpu_to_le32(new_dir->i_ino); BUFFER_TRACE(dir_bh, "call ext3_journal_dirty_metadata"); - ext3_journal_dirty_metadata(handle, dir_bh); + retval = ext3_journal_dirty_metadata(handle, dir_bh); + if (retval) { +journal_error: + ext3_std_error(new_dir->i_sb, retval); + goto end_rename; + } drop_nlink(old_dir); if (new_inode) { drop_nlink(new_inode); @@ -2385,6 +2532,8 @@ static int ext3_rename (struct inode * old_dir, struct dentry *old_dentry, ext3_mark_inode_dirty(handle, new_inode); if (!new_inode->i_nlink) ext3_orphan_add(handle, new_inode); + if (ext3_should_writeback_data(new_inode)) + flush_file = 1; } retval = 0; @@ -2393,6 +2542,8 @@ end_rename: brelse (old_bh); brelse (new_bh); ext3_journal_stop(handle); + if (retval == 0 && flush_file) + filemap_flush(old_inode->i_mapping); return retval; } @@ -2408,6 +2559,7 @@ const struct inode_operations ext3_dir_inode_operations = { .mkdir = ext3_mkdir, .rmdir = ext3_rmdir, .mknod = ext3_mknod, + .tmpfile = ext3_tmpfile, .rename = ext3_rename, .setattr = ext3_setattr, #ifdef CONFIG_EXT3_FS_XATTR @@ -2416,7 +2568,8 @@ const struct inode_operations ext3_dir_inode_operations = { .listxattr = ext3_listxattr, .removexattr = generic_removexattr, #endif - .permission = ext3_permission, + .get_acl = ext3_get_acl, + .set_acl = ext3_set_acl, }; const struct inode_operations ext3_special_inode_operations = { @@ -2427,5 +2580,6 @@ const struct inode_operations ext3_special_inode_operations = { .listxattr = ext3_listxattr, .removexattr = generic_removexattr, #endif - .permission = ext3_permission, + .get_acl = ext3_get_acl, + .set_acl = ext3_set_acl, }; diff --git a/fs/ext3/namei.h b/fs/ext3/namei.h index f2ce2b0065c..46304d8c9f0 100644 --- a/fs/ext3/namei.h +++ b/fs/ext3/namei.h @@ -6,3 +6,22 @@ */ extern struct dentry *ext3_get_parent(struct dentry *child); + +static inline struct buffer_head *ext3_dir_bread(handle_t *handle, + struct inode *inode, + int block, int create, + int *err) +{ + struct buffer_head *bh; + + bh = ext3_bread(handle, inode, block, create, err); + + if (!bh && !(*err)) { + *err = -EIO; + ext3_error(inode->i_sb, __func__, + "Directory hole detected on inode %lu\n", + inode->i_ino); + return NULL; + } + return bh; +} diff --git a/fs/ext3/resize.c b/fs/ext3/resize.c index 0e97b6e07cb..27105655502 100644 --- a/fs/ext3/resize.c +++ b/fs/ext3/resize.c @@ -11,10 +11,7 @@ #define EXT3FS_DEBUG -#include <linux/ext3_jbd.h> - -#include <linux/errno.h> -#include <linux/slab.h> +#include "ext3.h" #define outside(b, first, last) ((b) < (first) || (b) >= (last)) @@ -48,60 +45,60 @@ static int verify_group_input(struct super_block *sb, free_blocks_count, input->reserved_blocks); if (group != sbi->s_groups_count) - ext3_warning(sb, __FUNCTION__, + ext3_warning(sb, __func__, "Cannot add at group %u (only %lu groups)", input->group, sbi->s_groups_count); else if ((start - le32_to_cpu(es->s_first_data_block)) % EXT3_BLOCKS_PER_GROUP(sb)) - ext3_warning(sb, __FUNCTION__, "Last group not full"); + ext3_warning(sb, __func__, "Last group not full"); else if (input->reserved_blocks > input->blocks_count / 5) - ext3_warning(sb, __FUNCTION__, "Reserved blocks too high (%u)", + ext3_warning(sb, __func__, "Reserved blocks too high (%u)", input->reserved_blocks); else if (free_blocks_count < 0) - ext3_warning(sb, __FUNCTION__, "Bad blocks count %u", + ext3_warning(sb, __func__, "Bad blocks count %u", input->blocks_count); else if (!(bh = sb_bread(sb, end - 1))) - ext3_warning(sb, __FUNCTION__, + ext3_warning(sb, __func__, "Cannot read last block ("E3FSBLK")", end - 1); else if (outside(input->block_bitmap, start, end)) - ext3_warning(sb, __FUNCTION__, + ext3_warning(sb, __func__, "Block bitmap not in group (block %u)", input->block_bitmap); else if (outside(input->inode_bitmap, start, end)) - ext3_warning(sb, __FUNCTION__, + ext3_warning(sb, __func__, "Inode bitmap not in group (block %u)", input->inode_bitmap); else if (outside(input->inode_table, start, end) || outside(itend - 1, start, end)) - ext3_warning(sb, __FUNCTION__, + ext3_warning(sb, __func__, "Inode table not in group (blocks %u-"E3FSBLK")", input->inode_table, itend - 1); else if (input->inode_bitmap == input->block_bitmap) - ext3_warning(sb, __FUNCTION__, + ext3_warning(sb, __func__, "Block bitmap same as inode bitmap (%u)", input->block_bitmap); else if (inside(input->block_bitmap, input->inode_table, itend)) - ext3_warning(sb, __FUNCTION__, + ext3_warning(sb, __func__, "Block bitmap (%u) in inode table (%u-"E3FSBLK")", input->block_bitmap, input->inode_table, itend-1); else if (inside(input->inode_bitmap, input->inode_table, itend)) - ext3_warning(sb, __FUNCTION__, + ext3_warning(sb, __func__, "Inode bitmap (%u) in inode table (%u-"E3FSBLK")", input->inode_bitmap, input->inode_table, itend-1); else if (inside(input->block_bitmap, start, metaend)) - ext3_warning(sb, __FUNCTION__, + ext3_warning(sb, __func__, "Block bitmap (%u) in GDT table" " ("E3FSBLK"-"E3FSBLK")", input->block_bitmap, start, metaend - 1); else if (inside(input->inode_bitmap, start, metaend)) - ext3_warning(sb, __FUNCTION__, + ext3_warning(sb, __func__, "Inode bitmap (%u) in GDT table" " ("E3FSBLK"-"E3FSBLK")", input->inode_bitmap, start, metaend - 1); else if (inside(input->inode_table, start, metaend) || inside(itend - 1, start, metaend)) - ext3_warning(sb, __FUNCTION__, + ext3_warning(sb, __func__, "Inode table (%u-"E3FSBLK") overlaps" "GDT table ("E3FSBLK"-"E3FSBLK")", input->inode_table, itend - 1, start, metaend - 1); @@ -119,8 +116,8 @@ static struct buffer_head *bclean(handle_t *handle, struct super_block *sb, int err; bh = sb_getblk(sb, blk); - if (!bh) - return ERR_PTR(-EIO); + if (unlikely(!bh)) + return ERR_PTR(-ENOMEM); if ((err = ext3_journal_get_write_access(handle, bh))) { brelse(bh); bh = ERR_PTR(err); @@ -209,7 +206,7 @@ static int setup_new_group_blocks(struct super_block *sb, if (IS_ERR(handle)) return PTR_ERR(handle); - lock_super(sb); + mutex_lock(&sbi->s_resize_lock); if (input->group != sbi->s_groups_count) { err = -EBUSY; goto exit_journal; @@ -237,8 +234,8 @@ static int setup_new_group_blocks(struct super_block *sb, goto exit_bh; gdb = sb_getblk(sb, block); - if (!gdb) { - err = -EIO; + if (unlikely(!gdb)) { + err = -ENOMEM; goto exit_bh; } if ((err = ext3_journal_get_write_access(handle, gdb))) { @@ -249,7 +246,11 @@ static int setup_new_group_blocks(struct super_block *sb, memcpy(gdb->b_data, sbi->s_group_desc[i]->b_data, gdb->b_size); set_buffer_uptodate(gdb); unlock_buffer(gdb); - ext3_journal_dirty_metadata(handle, gdb); + err = ext3_journal_dirty_metadata(handle, gdb); + if (err) { + brelse(gdb); + goto exit_bh; + } ext3_set_bit(bit, bh->b_data); brelse(gdb); } @@ -266,10 +267,14 @@ static int setup_new_group_blocks(struct super_block *sb, goto exit_bh; if (IS_ERR(gdb = bclean(handle, sb, block))) { - err = PTR_ERR(bh); + err = PTR_ERR(gdb); + goto exit_bh; + } + err = ext3_journal_dirty_metadata(handle, gdb); + if (err) { + brelse(gdb); goto exit_bh; } - ext3_journal_dirty_metadata(handle, gdb); ext3_set_bit(bit, bh->b_data); brelse(gdb); } @@ -295,7 +300,11 @@ static int setup_new_group_blocks(struct super_block *sb, err = PTR_ERR(it); goto exit_bh; } - ext3_journal_dirty_metadata(handle, it); + err = ext3_journal_dirty_metadata(handle, it); + if (err) { + brelse(it); + goto exit_bh; + } brelse(it); ext3_set_bit(bit, bh->b_data); } @@ -306,7 +315,9 @@ static int setup_new_group_blocks(struct super_block *sb, mark_bitmap_end(input->blocks_count, EXT3_BLOCKS_PER_GROUP(sb), bh->b_data); - ext3_journal_dirty_metadata(handle, bh); + err = ext3_journal_dirty_metadata(handle, bh); + if (err) + goto exit_bh; brelse(bh); /* Mark unused entries in inode bitmap used */ @@ -319,12 +330,12 @@ static int setup_new_group_blocks(struct super_block *sb, mark_bitmap_end(EXT3_INODES_PER_GROUP(sb), EXT3_BLOCKS_PER_GROUP(sb), bh->b_data); - ext3_journal_dirty_metadata(handle, bh); + err = ext3_journal_dirty_metadata(handle, bh); exit_bh: brelse(bh); exit_journal: - unlock_super(sb); + mutex_unlock(&sbi->s_resize_lock); if ((err2 = ext3_journal_stop(handle)) && !err) err = err2; @@ -386,7 +397,7 @@ static int verify_reserved_gdb(struct super_block *sb, while ((grp = ext3_list_backups(sb, &three, &five, &seven)) < end) { if (le32_to_cpu(*p++) != grp * EXT3_BLOCKS_PER_GROUP(sb) + blk){ - ext3_warning(sb, __FUNCTION__, + ext3_warning(sb, __func__, "reserved GDT "E3FSBLK " missing grp %d ("E3FSBLK")", blk, grp, @@ -440,7 +451,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, */ if (EXT3_SB(sb)->s_sbh->b_blocknr != le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block)) { - ext3_warning(sb, __FUNCTION__, + ext3_warning(sb, __func__, "won't resize using backup superblock at %llu", (unsigned long long)EXT3_SB(sb)->s_sbh->b_blocknr); return -EPERM; @@ -464,7 +475,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, data = (__le32 *)dind->b_data; if (le32_to_cpu(data[gdb_num % EXT3_ADDR_PER_BLOCK(sb)]) != gdblock) { - ext3_warning(sb, __FUNCTION__, + ext3_warning(sb, __func__, "new group %u GDT block "E3FSBLK" not reserved", input->group, gdblock); err = -EINVAL; @@ -488,7 +499,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, GFP_NOFS); if (!n_group_desc) { err = -ENOMEM; - ext3_warning (sb, __FUNCTION__, + ext3_warning (sb, __func__, "not enough memory for %lu groups", gdb_num + 1); goto exit_inode; } @@ -503,12 +514,19 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, * reserved inode, and will become GDT blocks (primary and backup). */ data[gdb_num % EXT3_ADDR_PER_BLOCK(sb)] = 0; - ext3_journal_dirty_metadata(handle, dind); + err = ext3_journal_dirty_metadata(handle, dind); + if (err) + goto exit_group_desc; brelse(dind); + dind = NULL; inode->i_blocks -= (gdbackups + 1) * sb->s_blocksize >> 9; - ext3_mark_iloc_dirty(handle, inode, &iloc); + err = ext3_mark_iloc_dirty(handle, inode, &iloc); + if (err) + goto exit_group_desc; memset((*primary)->b_data, 0, sb->s_blocksize); - ext3_journal_dirty_metadata(handle, *primary); + err = ext3_journal_dirty_metadata(handle, *primary); + if (err) + goto exit_group_desc; o_group_desc = EXT3_SB(sb)->s_group_desc; memcpy(n_group_desc, o_group_desc, @@ -519,10 +537,14 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, kfree(o_group_desc); le16_add_cpu(&es->s_reserved_gdt_blocks, -1); - ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh); + err = ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh); + if (err) + goto exit_inode; return 0; +exit_group_desc: + kfree(n_group_desc); exit_inode: //ext3_journal_release_buffer(handle, iloc.bh); brelse(iloc.bh); @@ -580,13 +602,14 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode, } blk = EXT3_SB(sb)->s_sbh->b_blocknr + 1 + EXT3_SB(sb)->s_gdb_count; - data = (__le32 *)dind->b_data + EXT3_SB(sb)->s_gdb_count; + data = (__le32 *)dind->b_data + (EXT3_SB(sb)->s_gdb_count % + EXT3_ADDR_PER_BLOCK(sb)); end = (__le32 *)dind->b_data + EXT3_ADDR_PER_BLOCK(sb); /* Get each reserved primary GDT block and verify it holds backups */ for (res = 0; res < reserved_gdb; res++, blk++) { if (le32_to_cpu(*data) != blk) { - ext3_warning(sb, __FUNCTION__, + ext3_warning(sb, __func__, "reserved block "E3FSBLK " not at offset %ld", blk, @@ -661,11 +684,12 @@ exit_free: * important part is that the new block and inode counts are in the backup * superblocks, and the location of the new group metadata in the GDT backups. * - * We do not need lock_super() for this, because these blocks are not - * otherwise touched by the filesystem code when it is mounted. We don't - * need to worry about last changing from sbi->s_groups_count, because the - * worst that can happen is that we do not copy the full number of backups - * at this time. The resize which changed s_groups_count will backup again. + * We do not need take the s_resize_lock for this, because these + * blocks are not otherwise touched by the filesystem code when it is + * mounted. We don't need to worry about last changing from + * sbi->s_groups_count, because the worst that can happen is that we + * do not copy the full number of backups at this time. The resize + * which changed s_groups_count will backup again. */ static void update_backups(struct super_block *sb, int blk_off, char *data, int size) @@ -698,22 +722,26 @@ static void update_backups(struct super_block *sb, break; bh = sb_getblk(sb, group * bpg + blk_off); - if (!bh) { - err = -EIO; + if (unlikely(!bh)) { + err = -ENOMEM; break; } ext3_debug("update metadata backup %#04lx\n", (unsigned long)bh->b_blocknr); - if ((err = ext3_journal_get_write_access(handle, bh))) + if ((err = ext3_journal_get_write_access(handle, bh))) { + brelse(bh); break; + } lock_buffer(bh); memcpy(bh->b_data, data, size); if (rest) memset(bh->b_data + size, 0, rest); set_buffer_uptodate(bh); unlock_buffer(bh); - ext3_journal_dirty_metadata(handle, bh); + err = ext3_journal_dirty_metadata(handle, bh); brelse(bh); + if (err) + break; } if ((err2 = ext3_journal_stop(handle)) && !err) err = err2; @@ -730,7 +758,7 @@ static void update_backups(struct super_block *sb, */ exit_err: if (err) { - ext3_warning(sb, __FUNCTION__, + ext3_warning(sb, __func__, "can't update backup for group %d (err %d), " "forcing fsck on next reboot", group, err); sbi->s_mount_state &= ~EXT3_VALID_FS; @@ -770,33 +798,34 @@ int ext3_group_add(struct super_block *sb, struct ext3_new_group_data *input) if (gdb_off == 0 && !EXT3_HAS_RO_COMPAT_FEATURE(sb, EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER)) { - ext3_warning(sb, __FUNCTION__, + ext3_warning(sb, __func__, "Can't resize non-sparse filesystem further"); return -EPERM; } if (le32_to_cpu(es->s_blocks_count) + input->blocks_count < le32_to_cpu(es->s_blocks_count)) { - ext3_warning(sb, __FUNCTION__, "blocks_count overflow\n"); + ext3_warning(sb, __func__, "blocks_count overflow\n"); return -EINVAL; } if (le32_to_cpu(es->s_inodes_count) + EXT3_INODES_PER_GROUP(sb) < le32_to_cpu(es->s_inodes_count)) { - ext3_warning(sb, __FUNCTION__, "inodes_count overflow\n"); + ext3_warning(sb, __func__, "inodes_count overflow\n"); return -EINVAL; } if (reserved_gdb || gdb_off == 0) { if (!EXT3_HAS_COMPAT_FEATURE(sb, - EXT3_FEATURE_COMPAT_RESIZE_INODE)){ - ext3_warning(sb, __FUNCTION__, + EXT3_FEATURE_COMPAT_RESIZE_INODE) + || !le16_to_cpu(es->s_reserved_gdt_blocks)) { + ext3_warning(sb, __func__, "No reserved GDT blocks, can't resize"); return -EPERM; } inode = ext3_iget(sb, EXT3_RESIZE_INO); if (IS_ERR(inode)) { - ext3_warning(sb, __FUNCTION__, + ext3_warning(sb, __func__, "Error opening resize inode"); return PTR_ERR(inode); } @@ -823,9 +852,9 @@ int ext3_group_add(struct super_block *sb, struct ext3_new_group_data *input) goto exit_put; } - lock_super(sb); + mutex_lock(&sbi->s_resize_lock); if (input->group != sbi->s_groups_count) { - ext3_warning(sb, __FUNCTION__, + ext3_warning(sb, __func__, "multiple resizers run on filesystem!"); err = -EBUSY; goto exit_journal; @@ -854,7 +883,7 @@ int ext3_group_add(struct super_block *sb, struct ext3_new_group_data *input) /* * OK, now we've set up the new group. Time to make it active. * - * Current kernels don't lock all allocations via lock_super(), + * We do not lock all allocations via s_resize_lock * so we have to be safe wrt. concurrent accesses the group * data. So we need to be careful to set all of the relevant * group descriptor data etc. *before* we enable the group. @@ -898,12 +927,12 @@ int ext3_group_add(struct super_block *sb, struct ext3_new_group_data *input) * * The precise rules we use are: * - * * Writers of s_groups_count *must* hold lock_super + * * Writers of s_groups_count *must* hold s_resize_lock * AND * * Writers must perform a smp_wmb() after updating all dependent * data and before modifying the groups count * - * * Readers must hold lock_super() over the access + * * Readers must hold s_resize_lock over the access * OR * * Readers must perform an smp_rmb() after reading the groups count * and before reading any dependent data. @@ -919,7 +948,9 @@ int ext3_group_add(struct super_block *sb, struct ext3_new_group_data *input) /* Update the global fs size fields */ sbi->s_groups_count++; - ext3_journal_dirty_metadata(handle, primary); + err = ext3_journal_dirty_metadata(handle, primary); + if (err) + goto exit_journal; /* Update the reserved block counts only once the new group is * active. */ @@ -931,11 +962,10 @@ int ext3_group_add(struct super_block *sb, struct ext3_new_group_data *input) percpu_counter_add(&sbi->s_freeinodes_counter, EXT3_INODES_PER_GROUP(sb)); - ext3_journal_dirty_metadata(handle, sbi->s_sbh); - sb->s_dirt = 1; + err = ext3_journal_dirty_metadata(handle, sbi->s_sbh); exit_journal: - unlock_super(sb); + mutex_unlock(&sbi->s_resize_lock); if ((err2 = ext3_journal_stop(handle)) && !err) err = err2; if (!err) { @@ -962,7 +992,6 @@ int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es, ext3_fsblk_t n_blocks_count) { ext3_fsblk_t o_blocks_count; - unsigned long o_groups_count; ext3_grpblk_t last; ext3_grpblk_t add; struct buffer_head * bh; @@ -972,12 +1001,12 @@ int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es, /* We don't need to worry about locking wrt other resizers just * yet: we're going to revalidate es->s_blocks_count after - * taking lock_super() below. */ + * taking the s_resize_lock below. */ o_blocks_count = le32_to_cpu(es->s_blocks_count); - o_groups_count = EXT3_SB(sb)->s_groups_count; if (test_opt(sb, DEBUG)) - printk(KERN_DEBUG "EXT3-fs: extending last group from "E3FSBLK" uto "E3FSBLK" blocks\n", + printk(KERN_DEBUG "EXT3-fs: extending last group from "E3FSBLK + " up to "E3FSBLK" blocks\n", o_blocks_count, n_blocks_count); if (n_blocks_count == 0 || n_blocks_count == o_blocks_count) @@ -985,16 +1014,16 @@ int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es, if (n_blocks_count > (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) { printk(KERN_ERR "EXT3-fs: filesystem on %s:" - " too large to resize to %lu blocks safely\n", + " too large to resize to "E3FSBLK" blocks safely\n", sb->s_id, n_blocks_count); if (sizeof(sector_t) < 8) - ext3_warning(sb, __FUNCTION__, - "CONFIG_LBD not enabled\n"); + ext3_warning(sb, __func__, + "CONFIG_LBDAF not enabled\n"); return -EINVAL; } if (n_blocks_count < o_blocks_count) { - ext3_warning(sb, __FUNCTION__, + ext3_warning(sb, __func__, "can't shrink FS - resize aborted"); return -EBUSY; } @@ -1004,7 +1033,7 @@ int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es, EXT3_BLOCKS_PER_GROUP(sb); if (last == 0) { - ext3_warning(sb, __FUNCTION__, + ext3_warning(sb, __func__, "need to use ext2online to resize further"); return -EPERM; } @@ -1012,7 +1041,7 @@ int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es, add = EXT3_BLOCKS_PER_GROUP(sb) - last; if (o_blocks_count + add < o_blocks_count) { - ext3_warning(sb, __FUNCTION__, "blocks_count overflow"); + ext3_warning(sb, __func__, "blocks_count overflow"); return -EINVAL; } @@ -1020,7 +1049,7 @@ int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es, add = n_blocks_count - o_blocks_count; if (o_blocks_count + add < n_blocks_count) - ext3_warning(sb, __FUNCTION__, + ext3_warning(sb, __func__, "will only finish group ("E3FSBLK " blocks, %u new)", o_blocks_count + add, add); @@ -1028,7 +1057,7 @@ int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es, /* See if the device is actually as big as what was requested */ bh = sb_bread(sb, o_blocks_count + add -1); if (!bh) { - ext3_warning(sb, __FUNCTION__, + ext3_warning(sb, __func__, "can't read last block, resize aborted"); return -ENOSPC; } @@ -1040,36 +1069,42 @@ int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es, handle = ext3_journal_start_sb(sb, 3); if (IS_ERR(handle)) { err = PTR_ERR(handle); - ext3_warning(sb, __FUNCTION__, "error %d on journal start",err); + ext3_warning(sb, __func__, "error %d on journal start",err); goto exit_put; } - lock_super(sb); + mutex_lock(&EXT3_SB(sb)->s_resize_lock); if (o_blocks_count != le32_to_cpu(es->s_blocks_count)) { - ext3_warning(sb, __FUNCTION__, + ext3_warning(sb, __func__, "multiple resizers run on filesystem!"); - unlock_super(sb); + mutex_unlock(&EXT3_SB(sb)->s_resize_lock); + ext3_journal_stop(handle); err = -EBUSY; goto exit_put; } if ((err = ext3_journal_get_write_access(handle, EXT3_SB(sb)->s_sbh))) { - ext3_warning(sb, __FUNCTION__, + ext3_warning(sb, __func__, "error %d on journal write access", err); - unlock_super(sb); + mutex_unlock(&EXT3_SB(sb)->s_resize_lock); ext3_journal_stop(handle); goto exit_put; } es->s_blocks_count = cpu_to_le32(o_blocks_count + add); - ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh); - sb->s_dirt = 1; - unlock_super(sb); - ext3_debug("freeing blocks %lu through "E3FSBLK"\n", o_blocks_count, - o_blocks_count + add); + err = ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh); + mutex_unlock(&EXT3_SB(sb)->s_resize_lock); + if (err) { + ext3_warning(sb, __func__, + "error %d on journal dirty metadata", err); + ext3_journal_stop(handle); + goto exit_put; + } + ext3_debug("freeing blocks "E3FSBLK" through "E3FSBLK"\n", + o_blocks_count, o_blocks_count + add); ext3_free_blocks_sb(handle, sb, o_blocks_count, add, &freed_blocks); - ext3_debug("freed blocks "E3FSBLK" through "E3FSBLK"\n", o_blocks_count, - o_blocks_count + add); + ext3_debug("freed blocks "E3FSBLK" through "E3FSBLK"\n", + o_blocks_count, o_blocks_count + add); if ((err = ext3_journal_stop(handle))) goto exit_put; if (test_opt(sb, DEBUG)) diff --git a/fs/ext3/super.c b/fs/ext3/super.c index ad536066408..08cdfe5461e 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -17,39 +17,39 @@ */ #include <linux/module.h> -#include <linux/string.h> -#include <linux/fs.h> -#include <linux/time.h> -#include <linux/jbd.h> -#include <linux/ext3_fs.h> -#include <linux/ext3_jbd.h> -#include <linux/slab.h> -#include <linux/init.h> #include <linux/blkdev.h> #include <linux/parser.h> -#include <linux/smp_lock.h> -#include <linux/buffer_head.h> #include <linux/exportfs.h> -#include <linux/vfs.h> +#include <linux/statfs.h> #include <linux/random.h> #include <linux/mount.h> -#include <linux/namei.h> #include <linux/quotaops.h> #include <linux/seq_file.h> #include <linux/log2.h> +#include <linux/cleancache.h> +#include <linux/namei.h> #include <asm/uaccess.h> +#define CREATE_TRACE_POINTS + +#include "ext3.h" #include "xattr.h" #include "acl.h" #include "namei.h" +#ifdef CONFIG_EXT3_DEFAULTS_TO_ORDERED + #define EXT3_MOUNT_DEFAULT_DATA_MODE EXT3_MOUNT_ORDERED_DATA +#else + #define EXT3_MOUNT_DEFAULT_DATA_MODE EXT3_MOUNT_WRITEBACK_DATA +#endif + static int ext3_load_journal(struct super_block *, struct ext3_super_block *, unsigned long journal_devnum); static int ext3_create_journal(struct super_block *, struct ext3_super_block *, unsigned int); -static void ext3_commit_super (struct super_block * sb, - struct ext3_super_block * es, +static int ext3_commit_super(struct super_block *sb, + struct ext3_super_block *es, int sync); static void ext3_mark_recovery_complete(struct super_block * sb, struct ext3_super_block * es); @@ -60,17 +60,11 @@ static const char *ext3_decode_error(struct super_block * sb, int errno, char nbuf[16]); static int ext3_remount (struct super_block * sb, int * flags, char * data); static int ext3_statfs (struct dentry * dentry, struct kstatfs * buf); -static void ext3_unlockfs(struct super_block *sb); -static void ext3_write_super (struct super_block * sb); -static void ext3_write_super_lockfs(struct super_block *sb); +static int ext3_unfreeze(struct super_block *sb); +static int ext3_freeze(struct super_block *sb); /* * Wrappers for journal_start/end. - * - * The only special thing we need to do here is to make sure that all - * journal_end calls result in the superblock being marked dirty, so - * that sync() will call the filesystem's write_super callback if - * appropriate. */ handle_t *ext3_journal_start_sb(struct super_block *sb, int nblocks) { @@ -84,7 +78,7 @@ handle_t *ext3_journal_start_sb(struct super_block *sb, int nblocks) * take the FS itself readonly cleanly. */ journal = EXT3_SB(sb)->s_journal; if (is_journal_aborted(journal)) { - ext3_abort(sb, __FUNCTION__, + ext3_abort(sb, __func__, "Detected aborted journal"); return ERR_PTR(-EROFS); } @@ -92,12 +86,6 @@ handle_t *ext3_journal_start_sb(struct super_block *sb, int nblocks) return journal_start(journal, nblocks); } -/* - * The only special thing we need to do here is to make sure that all - * journal_stop calls result in the superblock being marked dirty, so - * that sync() will call the filesystem's write_super callback if - * appropriate. - */ int __ext3_journal_stop(const char *where, handle_t *handle) { struct super_block *sb; @@ -130,12 +118,28 @@ void ext3_journal_abort_handle(const char *caller, const char *err_fn, if (is_handle_aborted(handle)) return; - printk(KERN_ERR "%s: aborting transaction: %s in %s\n", - caller, errstr, err_fn); + printk(KERN_ERR "EXT3-fs: %s: aborting transaction: %s in %s\n", + caller, errstr, err_fn); journal_abort_handle(handle); } +void ext3_msg(struct super_block *sb, const char *prefix, + const char *fmt, ...) +{ + struct va_format vaf; + va_list args; + + va_start(args, fmt); + + vaf.fmt = fmt; + vaf.va = &args; + + printk("%sEXT3-fs (%s): %pV\n", prefix, sb->s_id, &vaf); + + va_end(args); +} + /* Deal with the reporting of failure conditions on a filesystem such as * inconsistencies detected or read IO failures. * @@ -147,7 +151,7 @@ void ext3_journal_abort_handle(const char *caller, const char *err_fn, * write out the superblock safely. * * We'll just use the journal_abort() error code to record an error in - * the journal instead. On recovery, the journal will compain about + * the journal instead. On recovery, the journal will complain about * that error until we've noted it down and cleared it. */ @@ -164,29 +168,40 @@ static void ext3_handle_error(struct super_block *sb) if (!test_opt (sb, ERRORS_CONT)) { journal_t *journal = EXT3_SB(sb)->s_journal; - EXT3_SB(sb)->s_mount_opt |= EXT3_MOUNT_ABORT; + set_opt(EXT3_SB(sb)->s_mount_opt, ABORT); if (journal) journal_abort(journal, -EIO); } if (test_opt (sb, ERRORS_RO)) { - printk (KERN_CRIT "Remounting filesystem read-only\n"); + ext3_msg(sb, KERN_CRIT, + "error: remounting filesystem read-only"); + /* + * Make sure updated value of ->s_mount_state will be visible + * before ->s_flags update. + */ + smp_wmb(); sb->s_flags |= MS_RDONLY; } ext3_commit_super(sb, es, 1); if (test_opt(sb, ERRORS_PANIC)) - panic("EXT3-fs (device %s): panic forced after error\n", + panic("EXT3-fs (%s): panic forced after error\n", sb->s_id); } -void ext3_error (struct super_block * sb, const char * function, - const char * fmt, ...) +void ext3_error(struct super_block *sb, const char *function, + const char *fmt, ...) { + struct va_format vaf; va_list args; va_start(args, fmt); - printk(KERN_CRIT "EXT3-fs error (device %s): %s: ",sb->s_id, function); - vprintk(fmt, args); - printk("\n"); + + vaf.fmt = fmt; + vaf.va = &args; + + printk(KERN_CRIT "EXT3-fs error (device %s): %s: %pV\n", + sb->s_id, function, &vaf); + va_end(args); ext3_handle_error(sb); @@ -242,8 +257,7 @@ void __ext3_std_error (struct super_block * sb, const char * function, return; errstr = ext3_decode_error(sb, errno, nbuf); - printk (KERN_CRIT "EXT3-fs error (device %s) in %s: %s\n", - sb->s_id, function, errstr); + ext3_msg(sb, KERN_CRIT, "error in %s: %s", function, errstr); ext3_handle_error(sb); } @@ -258,42 +272,57 @@ void __ext3_std_error (struct super_block * sb, const char * function, * case we take the easy way out and panic immediately. */ -void ext3_abort (struct super_block * sb, const char * function, - const char * fmt, ...) +void ext3_abort(struct super_block *sb, const char *function, + const char *fmt, ...) { + struct va_format vaf; va_list args; - printk (KERN_CRIT "ext3_abort called.\n"); - va_start(args, fmt); - printk(KERN_CRIT "EXT3-fs error (device %s): %s: ",sb->s_id, function); - vprintk(fmt, args); - printk("\n"); + + vaf.fmt = fmt; + vaf.va = &args; + + printk(KERN_CRIT "EXT3-fs (%s): error: %s: %pV\n", + sb->s_id, function, &vaf); + va_end(args); if (test_opt(sb, ERRORS_PANIC)) - panic("EXT3-fs panic from previous error\n"); + panic("EXT3-fs: panic from previous error\n"); if (sb->s_flags & MS_RDONLY) return; - printk(KERN_CRIT "Remounting filesystem read-only\n"); + ext3_msg(sb, KERN_CRIT, + "error: remounting filesystem read-only"); EXT3_SB(sb)->s_mount_state |= EXT3_ERROR_FS; + set_opt(EXT3_SB(sb)->s_mount_opt, ABORT); + /* + * Make sure updated value of ->s_mount_state will be visible + * before ->s_flags update. + */ + smp_wmb(); sb->s_flags |= MS_RDONLY; - EXT3_SB(sb)->s_mount_opt |= EXT3_MOUNT_ABORT; - journal_abort(EXT3_SB(sb)->s_journal, -EIO); + + if (EXT3_SB(sb)->s_journal) + journal_abort(EXT3_SB(sb)->s_journal, -EIO); } -void ext3_warning (struct super_block * sb, const char * function, - const char * fmt, ...) +void ext3_warning(struct super_block *sb, const char *function, + const char *fmt, ...) { + struct va_format vaf; va_list args; va_start(args, fmt); - printk(KERN_WARNING "EXT3-fs warning (device %s): %s: ", - sb->s_id, function); - vprintk(fmt, args); - printk("\n"); + + vaf.fmt = fmt; + vaf.va = &args; + + printk(KERN_WARNING "EXT3-fs (%s): warning: %s: %pV\n", + sb->s_id, function, &vaf); + va_end(args); } @@ -304,10 +333,10 @@ void ext3_update_dynamic_rev(struct super_block *sb) if (le32_to_cpu(es->s_rev_level) > EXT3_GOOD_OLD_REV) return; - ext3_warning(sb, __FUNCTION__, - "updating to rev %d because of new feature flag, " - "running e2fsck is recommended", - EXT3_DYNAMIC_REV); + ext3_msg(sb, KERN_WARNING, + "warning: updating to rev %d because of " + "new feature flag, running e2fsck is recommended", + EXT3_DYNAMIC_REV); es->s_first_ino = cpu_to_le32(EXT3_GOOD_OLD_FIRST_INO); es->s_inode_size = cpu_to_le16(EXT3_GOOD_OLD_INODE_SIZE); @@ -325,42 +354,39 @@ void ext3_update_dynamic_rev(struct super_block *sb) /* * Open the external journal device */ -static struct block_device *ext3_blkdev_get(dev_t dev) +static struct block_device *ext3_blkdev_get(dev_t dev, struct super_block *sb) { struct block_device *bdev; char b[BDEVNAME_SIZE]; - bdev = open_by_devnum(dev, FMODE_READ|FMODE_WRITE); + bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL, sb); if (IS_ERR(bdev)) goto fail; return bdev; fail: - printk(KERN_ERR "EXT3: failed to open journal device %s: %ld\n", - __bdevname(dev, b), PTR_ERR(bdev)); + ext3_msg(sb, KERN_ERR, "error: failed to open journal device %s: %ld", + __bdevname(dev, b), PTR_ERR(bdev)); + return NULL; } /* * Release the journal device */ -static int ext3_blkdev_put(struct block_device *bdev) +static void ext3_blkdev_put(struct block_device *bdev) { - bd_release(bdev); - return blkdev_put(bdev); + blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); } -static int ext3_blkdev_remove(struct ext3_sb_info *sbi) +static void ext3_blkdev_remove(struct ext3_sb_info *sbi) { struct block_device *bdev; - int ret = -ENODEV; - bdev = sbi->journal_bdev; if (bdev) { - ret = ext3_blkdev_put(bdev); + ext3_blkdev_put(bdev); sbi->journal_bdev = NULL; } - return ret; } static inline struct inode *orphan_list_entry(struct list_head *l) @@ -372,13 +398,13 @@ static void dump_orphan_list(struct super_block *sb, struct ext3_sb_info *sbi) { struct list_head *l; - printk(KERN_ERR "sb orphan head is %d\n", + ext3_msg(sb, KERN_ERR, "error: sb orphan head is %d", le32_to_cpu(sbi->s_es->s_last_orphan)); - printk(KERN_ERR "sb_info orphan list:\n"); + ext3_msg(sb, KERN_ERR, "sb_info orphan list:"); list_for_each(l, &sbi->s_orphan) { struct inode *inode = orphan_list_entry(l); - printk(KERN_ERR " " + ext3_msg(sb, KERN_ERR, " " "inode %s:%lu at %p: mode %o, nlink %d, next %d\n", inode->i_sb->s_id, inode->i_ino, inode, inode->i_mode, inode->i_nlink, @@ -390,10 +416,15 @@ static void ext3_put_super (struct super_block * sb) { struct ext3_sb_info *sbi = EXT3_SB(sb); struct ext3_super_block *es = sbi->s_es; - int i; + int i, err; + dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED); ext3_xattr_put_super(sb); - journal_destroy(sbi->s_journal); + err = journal_destroy(sbi->s_journal); + sbi->s_journal = NULL; + if (err < 0) + ext3_abort(sb, __func__, "Couldn't clean up the journal"); + if (!(sb->s_flags & MS_RDONLY)) { EXT3_CLEAR_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER); es->s_state = cpu_to_le16(sbi->s_mount_state); @@ -434,8 +465,8 @@ static void ext3_put_super (struct super_block * sb) ext3_blkdev_remove(sbi); } sb->s_fs_info = NULL; + kfree(sbi->s_blockgroup_lock); kfree(sbi); - return; } static struct kmem_cache *ext3_inode_cachep; @@ -450,15 +481,27 @@ static struct inode *ext3_alloc_inode(struct super_block *sb) ei = kmem_cache_alloc(ext3_inode_cachep, GFP_NOFS); if (!ei) return NULL; -#ifdef CONFIG_EXT3_FS_POSIX_ACL - ei->i_acl = EXT3_ACL_NOT_CACHED; - ei->i_default_acl = EXT3_ACL_NOT_CACHED; -#endif ei->i_block_alloc_info = NULL; ei->vfs_inode.i_version = 1; + atomic_set(&ei->i_datasync_tid, 0); + atomic_set(&ei->i_sync_tid, 0); return &ei->vfs_inode; } +static int ext3_drop_inode(struct inode *inode) +{ + int drop = generic_drop_inode(inode); + + trace_ext3_drop_inode(inode, drop); + return drop; +} + +static void ext3_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + kmem_cache_free(ext3_inode_cachep, EXT3_I(inode)); +} + static void ext3_destroy_inode(struct inode *inode) { if (!list_empty(&(EXT3_I(inode)->i_orphan))) { @@ -469,10 +512,10 @@ static void ext3_destroy_inode(struct inode *inode) false); dump_stack(); } - kmem_cache_free(ext3_inode_cachep, EXT3_I(inode)); + call_rcu(&inode->i_rcu, ext3_i_callback); } -static void init_once(struct kmem_cache * cachep, void *foo) +static void init_once(void *foo) { struct ext3_inode_info *ei = (struct ext3_inode_info *) foo; @@ -484,7 +527,7 @@ static void init_once(struct kmem_cache * cachep, void *foo) inode_init_once(&ei->vfs_inode); } -static int init_inodecache(void) +static int __init init_inodecache(void) { ext3_inode_cachep = kmem_cache_create("ext3_inode_cache", sizeof(struct ext3_inode_info), @@ -498,38 +541,35 @@ static int init_inodecache(void) static void destroy_inodecache(void) { + /* + * Make sure all delayed rcu free inodes are flushed before we + * destroy cache. + */ + rcu_barrier(); kmem_cache_destroy(ext3_inode_cachep); } -static void ext3_clear_inode(struct inode *inode) -{ - struct ext3_block_alloc_info *rsv = EXT3_I(inode)->i_block_alloc_info; -#ifdef CONFIG_EXT3_FS_POSIX_ACL - if (EXT3_I(inode)->i_acl && - EXT3_I(inode)->i_acl != EXT3_ACL_NOT_CACHED) { - posix_acl_release(EXT3_I(inode)->i_acl); - EXT3_I(inode)->i_acl = EXT3_ACL_NOT_CACHED; - } - if (EXT3_I(inode)->i_default_acl && - EXT3_I(inode)->i_default_acl != EXT3_ACL_NOT_CACHED) { - posix_acl_release(EXT3_I(inode)->i_default_acl); - EXT3_I(inode)->i_default_acl = EXT3_ACL_NOT_CACHED; - } -#endif - ext3_discard_reservation(inode); - EXT3_I(inode)->i_block_alloc_info = NULL; - if (unlikely(rsv)) - kfree(rsv); -} - static inline void ext3_show_quota_options(struct seq_file *seq, struct super_block *sb) { #if defined(CONFIG_QUOTA) struct ext3_sb_info *sbi = EXT3_SB(sb); - if (sbi->s_jquota_fmt) - seq_printf(seq, ",jqfmt=%s", - (sbi->s_jquota_fmt == QFMT_VFS_OLD) ? "vfsold": "vfsv0"); + if (sbi->s_jquota_fmt) { + char *fmtname = ""; + + switch (sbi->s_jquota_fmt) { + case QFMT_VFS_OLD: + fmtname = "vfsold"; + break; + case QFMT_VFS_V0: + fmtname = "vfsv0"; + break; + case QFMT_VFS_V1: + fmtname = "vfsv1"; + break; + } + seq_printf(seq, ",jqfmt=%s", fmtname); + } if (sbi->s_qf_names[USRQUOTA]) seq_printf(seq, ",usrjquota=%s", sbi->s_qf_names[USRQUOTA]); @@ -537,22 +577,35 @@ static inline void ext3_show_quota_options(struct seq_file *seq, struct super_bl if (sbi->s_qf_names[GRPQUOTA]) seq_printf(seq, ",grpjquota=%s", sbi->s_qf_names[GRPQUOTA]); - if (sbi->s_mount_opt & EXT3_MOUNT_USRQUOTA) + if (test_opt(sb, USRQUOTA)) seq_puts(seq, ",usrquota"); - if (sbi->s_mount_opt & EXT3_MOUNT_GRPQUOTA) + if (test_opt(sb, GRPQUOTA)) seq_puts(seq, ",grpquota"); #endif } +static char *data_mode_string(unsigned long mode) +{ + switch (mode) { + case EXT3_MOUNT_JOURNAL_DATA: + return "journal"; + case EXT3_MOUNT_ORDERED_DATA: + return "ordered"; + case EXT3_MOUNT_WRITEBACK_DATA: + return "writeback"; + } + return "unknown"; +} + /* * Show an option if * - it's set to a non-default value OR * - if the per-sb default is different from the global default */ -static int ext3_show_options(struct seq_file *seq, struct vfsmount *vfs) +static int ext3_show_options(struct seq_file *seq, struct dentry *root) { - struct super_block *sb = vfs->mnt_sb; + struct super_block *sb = root->d_sb; struct ext3_sb_info *sbi = EXT3_SB(sb); struct ext3_super_block *es = sbi->s_es; unsigned long def_mount_opts; @@ -567,13 +620,15 @@ static int ext3_show_options(struct seq_file *seq, struct vfsmount *vfs) seq_puts(seq, ",grpid"); if (!test_opt(sb, GRPID) && (def_mount_opts & EXT3_DEFM_BSDGROUPS)) seq_puts(seq, ",nogrpid"); - if (sbi->s_resuid != EXT3_DEF_RESUID || + if (!uid_eq(sbi->s_resuid, make_kuid(&init_user_ns, EXT3_DEF_RESUID)) || le16_to_cpu(es->s_def_resuid) != EXT3_DEF_RESUID) { - seq_printf(seq, ",resuid=%u", sbi->s_resuid); + seq_printf(seq, ",resuid=%u", + from_kuid_munged(&init_user_ns, sbi->s_resuid)); } - if (sbi->s_resgid != EXT3_DEF_RESGID || + if (!gid_eq(sbi->s_resgid, make_kgid(&init_user_ns, EXT3_DEF_RESGID)) || le16_to_cpu(es->s_def_resgid) != EXT3_DEF_RESGID) { - seq_printf(seq, ",resgid=%u", sbi->s_resgid); + seq_printf(seq, ",resgid=%u", + from_kgid_munged(&init_user_ns, sbi->s_resgid)); } if (test_opt(sb, ERRORS_RO)) { int def_errors = le16_to_cpu(es->s_errors); @@ -591,8 +646,6 @@ static int ext3_show_options(struct seq_file *seq, struct vfsmount *vfs) seq_puts(seq, ",nouid32"); if (test_opt(sb, DEBUG)) seq_puts(seq, ",debug"); - if (test_opt(sb, OLDALLOC)) - seq_puts(seq, ",oldalloc"); #ifdef CONFIG_EXT3_FS_XATTR if (test_opt(sb, XATTR_USER)) seq_puts(seq, ",user_xattr"); @@ -613,17 +666,18 @@ static int ext3_show_options(struct seq_file *seq, struct vfsmount *vfs) seq_printf(seq, ",commit=%u", (unsigned) (sbi->s_commit_interval / HZ)); } - if (test_opt(sb, BARRIER)) - seq_puts(seq, ",barrier=1"); - if (test_opt(sb, NOBH)) - seq_puts(seq, ",nobh"); - if (test_opt(sb, DATA_FLAGS) == EXT3_MOUNT_JOURNAL_DATA) - seq_puts(seq, ",data=journal"); - else if (test_opt(sb, DATA_FLAGS) == EXT3_MOUNT_ORDERED_DATA) - seq_puts(seq, ",data=ordered"); - else if (test_opt(sb, DATA_FLAGS) == EXT3_MOUNT_WRITEBACK_DATA) - seq_puts(seq, ",data=writeback"); + /* + * Always display barrier state so it's clear what the status is. + */ + seq_puts(seq, ",barrier="); + seq_puts(seq, test_opt(sb, BARRIER) ? "1" : "0"); + seq_printf(seq, ",data=%s", data_mode_string(test_opt(sb, DATA_FLAGS))); + if (test_opt(sb, DATA_ERR_ABORT)) + seq_puts(seq, ",data_err=abort"); + + if (test_opt(sb, NOLOAD)) + seq_puts(seq, ",norecovery"); ext3_show_quota_options(seq, sb); @@ -674,47 +728,61 @@ static struct dentry *ext3_fh_to_parent(struct super_block *sb, struct fid *fid, ext3_nfs_get_inode); } +/* + * Try to release metadata pages (indirect blocks, directories) which are + * mapped via the block device. Since these pages could have journal heads + * which would prevent try_to_free_buffers() from freeing them, we must use + * jbd layer's try_to_free_buffers() function to release them. + */ +static int bdev_try_to_free_page(struct super_block *sb, struct page *page, + gfp_t wait) +{ + journal_t *journal = EXT3_SB(sb)->s_journal; + + WARN_ON(PageChecked(page)); + if (!page_has_buffers(page)) + return 0; + if (journal) + return journal_try_to_free_buffers(journal, page, + wait & ~__GFP_WAIT); + return try_to_free_buffers(page); +} + #ifdef CONFIG_QUOTA #define QTYPE2NAME(t) ((t)==USRQUOTA?"user":"group") #define QTYPE2MOPT(on, t) ((t)==USRQUOTA?((on)##USRJQUOTA):((on)##GRPJQUOTA)) -static int ext3_dquot_initialize(struct inode *inode, int type); -static int ext3_dquot_drop(struct inode *inode); static int ext3_write_dquot(struct dquot *dquot); static int ext3_acquire_dquot(struct dquot *dquot); static int ext3_release_dquot(struct dquot *dquot); static int ext3_mark_dquot_dirty(struct dquot *dquot); static int ext3_write_info(struct super_block *sb, int type); -static int ext3_quota_on(struct super_block *sb, int type, int format_id, char *path); +static int ext3_quota_on(struct super_block *sb, int type, int format_id, + struct path *path); static int ext3_quota_on_mount(struct super_block *sb, int type); static ssize_t ext3_quota_read(struct super_block *sb, int type, char *data, size_t len, loff_t off); static ssize_t ext3_quota_write(struct super_block *sb, int type, const char *data, size_t len, loff_t off); -static struct dquot_operations ext3_quota_operations = { - .initialize = ext3_dquot_initialize, - .drop = ext3_dquot_drop, - .alloc_space = dquot_alloc_space, - .alloc_inode = dquot_alloc_inode, - .free_space = dquot_free_space, - .free_inode = dquot_free_inode, - .transfer = dquot_transfer, +static const struct dquot_operations ext3_quota_operations = { .write_dquot = ext3_write_dquot, .acquire_dquot = ext3_acquire_dquot, .release_dquot = ext3_release_dquot, .mark_dirty = ext3_mark_dquot_dirty, - .write_info = ext3_write_info + .write_info = ext3_write_info, + .alloc_dquot = dquot_alloc, + .destroy_dquot = dquot_destroy, }; -static struct quotactl_ops ext3_qctl_operations = { +static const struct quotactl_ops ext3_qctl_operations = { .quota_on = ext3_quota_on, - .quota_off = vfs_quota_off, - .quota_sync = vfs_quota_sync, - .get_info = vfs_get_dqinfo, - .set_info = vfs_set_dqinfo, - .get_dqblk = vfs_get_dqblk, - .set_dqblk = vfs_set_dqblk + .quota_off = dquot_quota_off, + .quota_sync = dquot_quota_sync, + .get_info = dquot_get_dqinfo, + .set_info = dquot_set_dqinfo, + .get_dqblk = dquot_get_dqblk, + .set_dqblk = dquot_set_dqblk }; #endif @@ -723,20 +791,20 @@ static const struct super_operations ext3_sops = { .destroy_inode = ext3_destroy_inode, .write_inode = ext3_write_inode, .dirty_inode = ext3_dirty_inode, - .delete_inode = ext3_delete_inode, + .drop_inode = ext3_drop_inode, + .evict_inode = ext3_evict_inode, .put_super = ext3_put_super, - .write_super = ext3_write_super, .sync_fs = ext3_sync_fs, - .write_super_lockfs = ext3_write_super_lockfs, - .unlockfs = ext3_unlockfs, + .freeze_fs = ext3_freeze, + .unfreeze_fs = ext3_unfreeze, .statfs = ext3_statfs, .remount_fs = ext3_remount, - .clear_inode = ext3_clear_inode, .show_options = ext3_show_options, #ifdef CONFIG_QUOTA .quota_read = ext3_quota_read, .quota_write = ext3_quota_write, #endif + .bdev_try_to_free_page = bdev_try_to_free_page, }; static const struct export_operations ext3_export_ops = { @@ -752,14 +820,16 @@ enum { Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl, Opt_reservation, Opt_noreservation, Opt_noload, Opt_nobh, Opt_bh, Opt_commit, Opt_journal_update, Opt_journal_inum, Opt_journal_dev, + Opt_journal_path, Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, + Opt_data_err_abort, Opt_data_err_ignore, Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, - Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota, - Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota, - Opt_grpquota + Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota, + Opt_noquota, Opt_ignore, Opt_barrier, Opt_nobarrier, Opt_err, + Opt_resize, Opt_usrquota, Opt_grpquota }; -static match_table_t tokens = { +static const match_table_t tokens = { {Opt_bsd_df, "bsddf"}, {Opt_minix_df, "minixdf"}, {Opt_grpid, "grpid"}, @@ -785,32 +855,39 @@ static match_table_t tokens = { {Opt_reservation, "reservation"}, {Opt_noreservation, "noreservation"}, {Opt_noload, "noload"}, + {Opt_noload, "norecovery"}, {Opt_nobh, "nobh"}, {Opt_bh, "bh"}, {Opt_commit, "commit=%u"}, {Opt_journal_update, "journal=update"}, {Opt_journal_inum, "journal=%u"}, {Opt_journal_dev, "journal_dev=%u"}, + {Opt_journal_path, "journal_path=%s"}, {Opt_abort, "abort"}, {Opt_data_journal, "data=journal"}, {Opt_data_ordered, "data=ordered"}, {Opt_data_writeback, "data=writeback"}, + {Opt_data_err_abort, "data_err=abort"}, + {Opt_data_err_ignore, "data_err=ignore"}, {Opt_offusrjquota, "usrjquota="}, {Opt_usrjquota, "usrjquota=%s"}, {Opt_offgrpjquota, "grpjquota="}, {Opt_grpjquota, "grpjquota=%s"}, {Opt_jqfmt_vfsold, "jqfmt=vfsold"}, {Opt_jqfmt_vfsv0, "jqfmt=vfsv0"}, + {Opt_jqfmt_vfsv1, "jqfmt=vfsv1"}, {Opt_grpquota, "grpquota"}, {Opt_noquota, "noquota"}, {Opt_quota, "quota"}, {Opt_usrquota, "usrquota"}, {Opt_barrier, "barrier=%u"}, + {Opt_barrier, "barrier"}, + {Opt_nobarrier, "nobarrier"}, {Opt_resize, "resize"}, {Opt_err, NULL}, }; -static ext3_fsblk_t get_sb_block(void **data) +static ext3_fsblk_t get_sb_block(void **data, struct super_block *sb) { ext3_fsblk_t sb_block; char *options = (char *) *data; @@ -821,7 +898,7 @@ static ext3_fsblk_t get_sb_block(void **data) /*todo: use simple_strtoll with >32bit ext3 */ sb_block = simple_strtoul(options, &options, 0); if (*options && *options != ',') { - printk("EXT3-fs: Invalid sb specification: %s\n", + ext3_msg(sb, KERN_ERR, "error: invalid sb specification: %s", (char *) *data); return 1; } @@ -831,6 +908,65 @@ static ext3_fsblk_t get_sb_block(void **data) return sb_block; } +#ifdef CONFIG_QUOTA +static int set_qf_name(struct super_block *sb, int qtype, substring_t *args) +{ + struct ext3_sb_info *sbi = EXT3_SB(sb); + char *qname; + + if (sb_any_quota_loaded(sb) && + !sbi->s_qf_names[qtype]) { + ext3_msg(sb, KERN_ERR, + "Cannot change journaled " + "quota options when quota turned on"); + return 0; + } + qname = match_strdup(args); + if (!qname) { + ext3_msg(sb, KERN_ERR, + "Not enough memory for storing quotafile name"); + return 0; + } + if (sbi->s_qf_names[qtype]) { + int same = !strcmp(sbi->s_qf_names[qtype], qname); + + kfree(qname); + if (!same) { + ext3_msg(sb, KERN_ERR, + "%s quota file already specified", + QTYPE2NAME(qtype)); + } + return same; + } + if (strchr(qname, '/')) { + ext3_msg(sb, KERN_ERR, + "quotafile must be on filesystem root"); + kfree(qname); + return 0; + } + sbi->s_qf_names[qtype] = qname; + set_opt(sbi->s_mount_opt, QUOTA); + return 1; +} + +static int clear_qf_name(struct super_block *sb, int qtype) { + + struct ext3_sb_info *sbi = EXT3_SB(sb); + + if (sb_any_quota_loaded(sb) && + sbi->s_qf_names[qtype]) { + ext3_msg(sb, KERN_ERR, "Cannot change journaled quota options" + " when quota turned on"); + return 0; + } + if (sbi->s_qf_names[qtype]) { + kfree(sbi->s_qf_names[qtype]); + sbi->s_qf_names[qtype] = NULL; + } + return 1; +} +#endif + static int parse_options (char *options, struct super_block *sb, unsigned int *inum, unsigned long *journal_devnum, ext3_fsblk_t *n_blocks_count, int is_remount) @@ -840,9 +976,15 @@ static int parse_options (char *options, struct super_block *sb, substring_t args[MAX_OPT_ARGS]; int data_opt = 0; int option; + kuid_t uid; + kgid_t gid; + char *journal_path; + struct inode *journal_inode; + struct path path; + int error; + #ifdef CONFIG_QUOTA - int qtype; - char *qname; + int qfmt; #endif if (!options) @@ -852,7 +994,11 @@ static int parse_options (char *options, struct super_block *sb, int token; if (!*p) continue; - + /* + * Initialize args struct so we know whether arg was + * found; some options take optional arguments. + */ + args[0].to = args[0].from = NULL; token = match_token(p, tokens, args); switch (token) { case Opt_bsd_df: @@ -870,12 +1016,23 @@ static int parse_options (char *options, struct super_block *sb, case Opt_resuid: if (match_int(&args[0], &option)) return 0; - sbi->s_resuid = option; + uid = make_kuid(current_user_ns(), option); + if (!uid_valid(uid)) { + ext3_msg(sb, KERN_ERR, "Invalid uid value %d", option); + return 0; + + } + sbi->s_resuid = uid; break; case Opt_resgid: if (match_int(&args[0], &option)) return 0; - sbi->s_resgid = option; + gid = make_kgid(current_user_ns(), option); + if (!gid_valid(gid)) { + ext3_msg(sb, KERN_ERR, "Invalid gid value %d", option); + return 0; + } + sbi->s_resgid = gid; break; case Opt_sb: /* handled by get_sb_block() instead of here */ @@ -906,10 +1063,12 @@ static int parse_options (char *options, struct super_block *sb, set_opt (sbi->s_mount_opt, DEBUG); break; case Opt_oldalloc: - set_opt (sbi->s_mount_opt, OLDALLOC); + ext3_msg(sb, KERN_WARNING, + "Ignoring deprecated oldalloc option"); break; case Opt_orlov: - clear_opt (sbi->s_mount_opt, OLDALLOC); + ext3_msg(sb, KERN_WARNING, + "Ignoring deprecated orlov option"); break; #ifdef CONFIG_EXT3_FS_XATTR case Opt_user_xattr: @@ -921,7 +1080,8 @@ static int parse_options (char *options, struct super_block *sb, #else case Opt_user_xattr: case Opt_nouser_xattr: - printk("EXT3 (no)user_xattr options not supported\n"); + ext3_msg(sb, KERN_INFO, + "(no)user_xattr options not supported"); break; #endif #ifdef CONFIG_EXT3_FS_POSIX_ACL @@ -934,7 +1094,8 @@ static int parse_options (char *options, struct super_block *sb, #else case Opt_acl: case Opt_noacl: - printk("EXT3 (no)acl options not supported\n"); + ext3_msg(sb, KERN_INFO, + "(no)acl options not supported"); break; #endif case Opt_reservation: @@ -950,16 +1111,16 @@ static int parse_options (char *options, struct super_block *sb, user to specify an existing inode to be the journal file. */ if (is_remount) { - printk(KERN_ERR "EXT3-fs: cannot specify " - "journal on remount\n"); + ext3_msg(sb, KERN_ERR, "error: cannot specify " + "journal on remount"); return 0; } set_opt (sbi->s_mount_opt, UPDATE_JOURNAL); break; case Opt_journal_inum: if (is_remount) { - printk(KERN_ERR "EXT3-fs: cannot specify " - "journal on remount\n"); + ext3_msg(sb, KERN_ERR, "error: cannot specify " + "journal on remount"); return 0; } if (match_int(&args[0], &option)) @@ -968,14 +1129,49 @@ static int parse_options (char *options, struct super_block *sb, break; case Opt_journal_dev: if (is_remount) { - printk(KERN_ERR "EXT3-fs: cannot specify " - "journal on remount\n"); + ext3_msg(sb, KERN_ERR, "error: cannot specify " + "journal on remount"); return 0; } if (match_int(&args[0], &option)) return 0; *journal_devnum = option; break; + case Opt_journal_path: + if (is_remount) { + ext3_msg(sb, KERN_ERR, "error: cannot specify " + "journal on remount"); + return 0; + } + + journal_path = match_strdup(&args[0]); + if (!journal_path) { + ext3_msg(sb, KERN_ERR, "error: could not dup " + "journal device string"); + return 0; + } + + error = kern_path(journal_path, LOOKUP_FOLLOW, &path); + if (error) { + ext3_msg(sb, KERN_ERR, "error: could not find " + "journal device path: error %d", error); + kfree(journal_path); + return 0; + } + + journal_inode = path.dentry->d_inode; + if (!S_ISBLK(journal_inode->i_mode)) { + ext3_msg(sb, KERN_ERR, "error: journal path %s " + "is not a block device", journal_path); + path_put(&path); + kfree(journal_path); + return 0; + } + + *journal_devnum = new_encode_dev(journal_inode->i_rdev); + path_put(&path); + kfree(journal_path); + break; case Opt_noload: set_opt (sbi->s_mount_opt, NOLOAD); break; @@ -998,80 +1194,62 @@ static int parse_options (char *options, struct super_block *sb, data_opt = EXT3_MOUNT_WRITEBACK_DATA; datacheck: if (is_remount) { - if ((sbi->s_mount_opt & EXT3_MOUNT_DATA_FLAGS) - != data_opt) { - printk(KERN_ERR - "EXT3-fs: cannot change data " - "mode on remount\n"); - return 0; - } + if (test_opt(sb, DATA_FLAGS) == data_opt) + break; + ext3_msg(sb, KERN_ERR, + "error: cannot change " + "data mode on remount. The filesystem " + "is mounted in data=%s mode and you " + "try to remount it in data=%s mode.", + data_mode_string(test_opt(sb, + DATA_FLAGS)), + data_mode_string(data_opt)); + return 0; } else { - sbi->s_mount_opt &= ~EXT3_MOUNT_DATA_FLAGS; + clear_opt(sbi->s_mount_opt, DATA_FLAGS); sbi->s_mount_opt |= data_opt; } break; + case Opt_data_err_abort: + set_opt(sbi->s_mount_opt, DATA_ERR_ABORT); + break; + case Opt_data_err_ignore: + clear_opt(sbi->s_mount_opt, DATA_ERR_ABORT); + break; #ifdef CONFIG_QUOTA case Opt_usrjquota: - qtype = USRQUOTA; - goto set_qf_name; - case Opt_grpjquota: - qtype = GRPQUOTA; -set_qf_name: - if (sb_any_quota_enabled(sb)) { - printk(KERN_ERR - "EXT3-fs: Cannot change journalled " - "quota options when quota turned on.\n"); - return 0; - } - qname = match_strdup(&args[0]); - if (!qname) { - printk(KERN_ERR - "EXT3-fs: not enough memory for " - "storing quotafile name.\n"); - return 0; - } - if (sbi->s_qf_names[qtype] && - strcmp(sbi->s_qf_names[qtype], qname)) { - printk(KERN_ERR - "EXT3-fs: %s quota file already " - "specified.\n", QTYPE2NAME(qtype)); - kfree(qname); + if (!set_qf_name(sb, USRQUOTA, &args[0])) return 0; - } - sbi->s_qf_names[qtype] = qname; - if (strchr(sbi->s_qf_names[qtype], '/')) { - printk(KERN_ERR - "EXT3-fs: quotafile must be on " - "filesystem root.\n"); - kfree(sbi->s_qf_names[qtype]); - sbi->s_qf_names[qtype] = NULL; + break; + case Opt_grpjquota: + if (!set_qf_name(sb, GRPQUOTA, &args[0])) return 0; - } - set_opt(sbi->s_mount_opt, QUOTA); break; case Opt_offusrjquota: - qtype = USRQUOTA; - goto clear_qf_name; + if (!clear_qf_name(sb, USRQUOTA)) + return 0; + break; case Opt_offgrpjquota: - qtype = GRPQUOTA; -clear_qf_name: - if (sb_any_quota_enabled(sb)) { - printk(KERN_ERR "EXT3-fs: Cannot change " - "journalled quota options when " - "quota turned on.\n"); + if (!clear_qf_name(sb, GRPQUOTA)) return 0; - } - /* - * The space will be released later when all options - * are confirmed to be correct - */ - sbi->s_qf_names[qtype] = NULL; break; case Opt_jqfmt_vfsold: - sbi->s_jquota_fmt = QFMT_VFS_OLD; - break; + qfmt = QFMT_VFS_OLD; + goto set_qf_format; case Opt_jqfmt_vfsv0: - sbi->s_jquota_fmt = QFMT_VFS_V0; + qfmt = QFMT_VFS_V0; + goto set_qf_format; + case Opt_jqfmt_vfsv1: + qfmt = QFMT_VFS_V1; +set_qf_format: + if (sb_any_quota_loaded(sb) && + sbi->s_jquota_fmt != qfmt) { + ext3_msg(sb, KERN_ERR, "error: cannot change " + "journaled quota options when " + "quota turned on."); + return 0; + } + sbi->s_jquota_fmt = qfmt; break; case Opt_quota: case Opt_usrquota: @@ -1083,9 +1261,9 @@ clear_qf_name: set_opt(sbi->s_mount_opt, GRPQUOTA); break; case Opt_noquota: - if (sb_any_quota_enabled(sb)) { - printk(KERN_ERR "EXT3-fs: Cannot change quota " - "options when quota turned on.\n"); + if (sb_any_quota_loaded(sb)) { + ext3_msg(sb, KERN_ERR, "error: cannot change " + "quota options when quota turned on."); return 0; } clear_opt(sbi->s_mount_opt, QUOTA); @@ -1096,15 +1274,19 @@ clear_qf_name: case Opt_quota: case Opt_usrquota: case Opt_grpquota: + ext3_msg(sb, KERN_ERR, + "error: quota options not supported."); + break; case Opt_usrjquota: case Opt_grpjquota: case Opt_offusrjquota: case Opt_offgrpjquota: case Opt_jqfmt_vfsold: case Opt_jqfmt_vfsv0: - printk(KERN_ERR - "EXT3-fs: journalled quota options not " - "supported.\n"); + case Opt_jqfmt_vfsv1: + ext3_msg(sb, KERN_ERR, + "error: journaled quota options not " + "supported."); break; case Opt_noquota: break; @@ -1112,9 +1294,15 @@ clear_qf_name: case Opt_abort: set_opt(sbi->s_mount_opt, ABORT); break; + case Opt_nobarrier: + clear_opt(sbi->s_mount_opt, BARRIER); + break; case Opt_barrier: - if (match_int(&args[0], &option)) - return 0; + if (args[0].from) { + if (match_int(&args[0], &option)) + return 0; + } else + option = 1; /* No argument, default to 1 */ if (option) set_opt(sbi->s_mount_opt, BARRIER); else @@ -1124,8 +1312,9 @@ clear_qf_name: break; case Opt_resize: if (!is_remount) { - printk("EXT3-fs: resize option only available " - "for remount\n"); + ext3_msg(sb, KERN_ERR, + "error: resize option only available " + "for remount"); return 0; } if (match_int(&args[0], &option) != 0) @@ -1133,47 +1322,43 @@ clear_qf_name: *n_blocks_count = option; break; case Opt_nobh: - set_opt(sbi->s_mount_opt, NOBH); + ext3_msg(sb, KERN_WARNING, + "warning: ignoring deprecated nobh option"); break; case Opt_bh: - clear_opt(sbi->s_mount_opt, NOBH); + ext3_msg(sb, KERN_WARNING, + "warning: ignoring deprecated bh option"); break; default: - printk (KERN_ERR - "EXT3-fs: Unrecognized mount option \"%s\" " - "or missing value\n", p); + ext3_msg(sb, KERN_ERR, + "error: unrecognized mount option \"%s\" " + "or missing value", p); return 0; } } #ifdef CONFIG_QUOTA if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) { - if ((sbi->s_mount_opt & EXT3_MOUNT_USRQUOTA) && - sbi->s_qf_names[USRQUOTA]) + if (test_opt(sb, USRQUOTA) && sbi->s_qf_names[USRQUOTA]) clear_opt(sbi->s_mount_opt, USRQUOTA); - - if ((sbi->s_mount_opt & EXT3_MOUNT_GRPQUOTA) && - sbi->s_qf_names[GRPQUOTA]) + if (test_opt(sb, GRPQUOTA) && sbi->s_qf_names[GRPQUOTA]) clear_opt(sbi->s_mount_opt, GRPQUOTA); - if ((sbi->s_qf_names[USRQUOTA] && - (sbi->s_mount_opt & EXT3_MOUNT_GRPQUOTA)) || - (sbi->s_qf_names[GRPQUOTA] && - (sbi->s_mount_opt & EXT3_MOUNT_USRQUOTA))) { - printk(KERN_ERR "EXT3-fs: old and new quota " - "format mixing.\n"); + if (test_opt(sb, GRPQUOTA) || test_opt(sb, USRQUOTA)) { + ext3_msg(sb, KERN_ERR, "error: old and new quota " + "format mixing."); return 0; } if (!sbi->s_jquota_fmt) { - printk(KERN_ERR "EXT3-fs: journalled quota format " - "not specified.\n"); + ext3_msg(sb, KERN_ERR, "error: journaled quota format " + "not specified."); return 0; } } else { if (sbi->s_jquota_fmt) { - printk(KERN_ERR "EXT3-fs: journalled quota format " - "specified with no journalling " - "enabled.\n"); + ext3_msg(sb, KERN_ERR, "error: journaled quota format " + "specified with no journaling " + "enabled."); return 0; } } @@ -1188,39 +1373,41 @@ static int ext3_setup_super(struct super_block *sb, struct ext3_super_block *es, int res = 0; if (le32_to_cpu(es->s_rev_level) > EXT3_MAX_SUPP_REV) { - printk (KERN_ERR "EXT3-fs warning: revision level too high, " - "forcing read-only mode\n"); + ext3_msg(sb, KERN_ERR, + "error: revision level too high, " + "forcing read-only mode"); res = MS_RDONLY; } if (read_only) return res; if (!(sbi->s_mount_state & EXT3_VALID_FS)) - printk (KERN_WARNING "EXT3-fs warning: mounting unchecked fs, " - "running e2fsck is recommended\n"); + ext3_msg(sb, KERN_WARNING, + "warning: mounting unchecked fs, " + "running e2fsck is recommended"); else if ((sbi->s_mount_state & EXT3_ERROR_FS)) - printk (KERN_WARNING - "EXT3-fs warning: mounting fs with errors, " - "running e2fsck is recommended\n"); - else if ((__s16) le16_to_cpu(es->s_max_mnt_count) >= 0 && + ext3_msg(sb, KERN_WARNING, + "warning: mounting fs with errors, " + "running e2fsck is recommended"); + else if ((__s16) le16_to_cpu(es->s_max_mnt_count) > 0 && le16_to_cpu(es->s_mnt_count) >= - (unsigned short) (__s16) le16_to_cpu(es->s_max_mnt_count)) - printk (KERN_WARNING - "EXT3-fs warning: maximal mount count reached, " - "running e2fsck is recommended\n"); + le16_to_cpu(es->s_max_mnt_count)) + ext3_msg(sb, KERN_WARNING, + "warning: maximal mount count reached, " + "running e2fsck is recommended"); else if (le32_to_cpu(es->s_checkinterval) && (le32_to_cpu(es->s_lastcheck) + le32_to_cpu(es->s_checkinterval) <= get_seconds())) - printk (KERN_WARNING - "EXT3-fs warning: checktime reached, " - "running e2fsck is recommended\n"); + ext3_msg(sb, KERN_WARNING, + "warning: checktime reached, " + "running e2fsck is recommended"); #if 0 /* @@@ We _will_ want to clear the valid bit if we find inconsistencies, to force a fsck at reboot. But for a plain journaled filesystem we can keep it set as valid forever! :) */ - es->s_state = cpu_to_le16(le16_to_cpu(es->s_state) & ~EXT3_VALID_FS); + es->s_state &= cpu_to_le16(~EXT3_VALID_FS); #endif - if (!(__s16) le16_to_cpu(es->s_max_mnt_count)) + if (!le16_to_cpu(es->s_max_mnt_count)) es->s_max_mnt_count = cpu_to_le16(EXT3_DFL_MAX_MNT_COUNT); le16_add_cpu(&es->s_mnt_count, 1); es->s_mtime = cpu_to_le32(get_seconds()); @@ -1229,23 +1416,22 @@ static int ext3_setup_super(struct super_block *sb, struct ext3_super_block *es, ext3_commit_super(sb, es, 1); if (test_opt(sb, DEBUG)) - printk(KERN_INFO "[EXT3 FS bs=%lu, gc=%lu, " - "bpg=%lu, ipg=%lu, mo=%04lx]\n", + ext3_msg(sb, KERN_INFO, "[bs=%lu, gc=%lu, " + "bpg=%lu, ipg=%lu, mo=%04lx]", sb->s_blocksize, sbi->s_groups_count, EXT3_BLOCKS_PER_GROUP(sb), EXT3_INODES_PER_GROUP(sb), sbi->s_mount_opt); - printk(KERN_INFO "EXT3 FS on %s, ", sb->s_id); if (EXT3_SB(sb)->s_journal->j_inode == NULL) { char b[BDEVNAME_SIZE]; - - printk("external journal on %s\n", + ext3_msg(sb, KERN_INFO, "using external journal on %s", bdevname(EXT3_SB(sb)->s_journal->j_dev, b)); } else { - printk("internal journal\n"); + ext3_msg(sb, KERN_INFO, "using internal journal"); } + cleancache_init_fs(sb); return res; } @@ -1253,14 +1439,14 @@ static int ext3_setup_super(struct super_block *sb, struct ext3_super_block *es, static int ext3_check_descriptors(struct super_block *sb) { struct ext3_sb_info *sbi = EXT3_SB(sb); - ext3_fsblk_t first_block = le32_to_cpu(sbi->s_es->s_first_data_block); - ext3_fsblk_t last_block; int i; ext3_debug ("Checking group descriptors"); for (i = 0; i < sbi->s_groups_count; i++) { struct ext3_group_desc *gdp = ext3_get_group_desc(sb, i, NULL); + ext3_fsblk_t first_block = ext3_group_first_block_no(sb, i); + ext3_fsblk_t last_block; if (i == sbi->s_groups_count - 1) last_block = le32_to_cpu(sbi->s_es->s_blocks_count) - 1; @@ -1299,7 +1485,6 @@ static int ext3_check_descriptors(struct super_block *sb) le32_to_cpu(gdp->bg_inode_table)); return 0; } - first_block += EXT3_BLOCKS_PER_GROUP(sb); } sbi->s_es->s_free_blocks_count=cpu_to_le32(ext3_count_free_blocks(sb)); @@ -1339,23 +1524,31 @@ static void ext3_orphan_cleanup (struct super_block * sb, } if (bdev_read_only(sb->s_bdev)) { - printk(KERN_ERR "EXT3-fs: write access " - "unavailable, skipping orphan cleanup.\n"); + ext3_msg(sb, KERN_ERR, "error: write access " + "unavailable, skipping orphan cleanup."); + return; + } + + /* Check if feature set allows readwrite operations */ + if (EXT3_HAS_RO_COMPAT_FEATURE(sb, ~EXT3_FEATURE_RO_COMPAT_SUPP)) { + ext3_msg(sb, KERN_INFO, "Skipping orphan cleanup due to " + "unknown ROCOMPAT features"); return; } if (EXT3_SB(sb)->s_mount_state & EXT3_ERROR_FS) { - if (es->s_last_orphan) + /* don't clear list on RO mount w/ errors */ + if (es->s_last_orphan && !(s_flags & MS_RDONLY)) { jbd_debug(1, "Errors on filesystem, " "clearing orphan list.\n"); - es->s_last_orphan = 0; + es->s_last_orphan = 0; + } jbd_debug(1, "Skipping orphan recovery on fs with errors.\n"); return; } if (s_flags & MS_RDONLY) { - printk(KERN_INFO "EXT3-fs: %s: orphan cleanup on readonly fs\n", - sb->s_id); + ext3_msg(sb, KERN_INFO, "orphan cleanup on readonly fs"); sb->s_flags &= ~MS_RDONLY; } #ifdef CONFIG_QUOTA @@ -1366,9 +1559,9 @@ static void ext3_orphan_cleanup (struct super_block * sb, if (EXT3_SB(sb)->s_qf_names[i]) { int ret = ext3_quota_on_mount(sb, i); if (ret < 0) - printk(KERN_ERR - "EXT3-fs: Cannot turn on journalled " - "quota: error %d\n", ret); + ext3_msg(sb, KERN_ERR, + "error: cannot turn on journaled " + "quota: %d", ret); } } #endif @@ -1383,11 +1576,11 @@ static void ext3_orphan_cleanup (struct super_block * sb, } list_add(&EXT3_I(inode)->i_orphan, &EXT3_SB(sb)->s_orphan); - DQUOT_INIT(inode); + dquot_initialize(inode); if (inode->i_nlink) { printk(KERN_DEBUG "%s: truncating inode %lu to %Ld bytes\n", - __FUNCTION__, inode->i_ino, inode->i_size); + __func__, inode->i_ino, inode->i_size); jbd_debug(2, "truncating inode %lu to %Ld bytes\n", inode->i_ino, inode->i_size); ext3_truncate(inode); @@ -1395,7 +1588,7 @@ static void ext3_orphan_cleanup (struct super_block * sb, } else { printk(KERN_DEBUG "%s: deleting unreferenced inode %lu\n", - __FUNCTION__, inode->i_ino); + __func__, inode->i_ino); jbd_debug(2, "deleting unreferenced inode %lu\n", inode->i_ino); nr_orphans++; @@ -1406,16 +1599,16 @@ static void ext3_orphan_cleanup (struct super_block * sb, #define PLURAL(x) (x), ((x)==1) ? "" : "s" if (nr_orphans) - printk(KERN_INFO "EXT3-fs: %s: %d orphan inode%s deleted\n", - sb->s_id, PLURAL(nr_orphans)); + ext3_msg(sb, KERN_INFO, "%d orphan inode%s deleted", + PLURAL(nr_orphans)); if (nr_truncates) - printk(KERN_INFO "EXT3-fs: %s: %d truncate%s cleaned up\n", - sb->s_id, PLURAL(nr_truncates)); + ext3_msg(sb, KERN_INFO, "%d truncate%s cleaned up", + PLURAL(nr_truncates)); #ifdef CONFIG_QUOTA /* Turn quotas off */ for (i = 0; i < MAXQUOTAS; i++) { if (sb_dqopt(sb)->files[i]) - vfs_quota_off(sb, i); + dquot_quota_off(sb, i); } #endif sb->s_flags = s_flags; /* Restore MS_RDONLY status */ @@ -1494,7 +1687,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) struct ext3_super_block *es = NULL; struct ext3_sb_info *sbi; ext3_fsblk_t block; - ext3_fsblk_t sb_block = get_sb_block(&data); + ext3_fsblk_t sb_block = get_sb_block(&data, sb); ext3_fsblk_t logic_sb_block; unsigned long offset = 0; unsigned int journal_inum = 0; @@ -1513,17 +1706,19 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); if (!sbi) return -ENOMEM; + + sbi->s_blockgroup_lock = + kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL); + if (!sbi->s_blockgroup_lock) { + kfree(sbi); + return -ENOMEM; + } sb->s_fs_info = sbi; - sbi->s_mount_opt = 0; - sbi->s_resuid = EXT3_DEF_RESUID; - sbi->s_resgid = EXT3_DEF_RESGID; sbi->s_sb_block = sb_block; - unlock_kernel(); - blocksize = sb_min_blocksize(sb, EXT3_MIN_BLOCK_SIZE); if (!blocksize) { - printk(KERN_ERR "EXT3-fs: unable to set blocksize\n"); + ext3_msg(sb, KERN_ERR, "error: unable to set blocksize"); goto out_fail; } @@ -1539,14 +1734,14 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) } if (!(bh = sb_bread(sb, logic_sb_block))) { - printk (KERN_ERR "EXT3-fs: unable to read superblock\n"); + ext3_msg(sb, KERN_ERR, "error: unable to read superblock"); goto out_fail; } /* * Note: s_es must be initialized as soon as possible because * some ext3 macro-instructions depend on its value */ - es = (struct ext3_super_block *) (((char *)bh->b_data) + offset); + es = (struct ext3_super_block *) (bh->b_data + offset); sbi->s_es = es; sb->s_magic = le16_to_cpu(es->s_magic); if (sb->s_magic != EXT3_SUPER_MAGIC) @@ -1569,11 +1764,11 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) set_opt(sbi->s_mount_opt, POSIX_ACL); #endif if ((def_mount_opts & EXT3_DEFM_JMODE) == EXT3_DEFM_JMODE_DATA) - sbi->s_mount_opt |= EXT3_MOUNT_JOURNAL_DATA; + set_opt(sbi->s_mount_opt, JOURNAL_DATA); else if ((def_mount_opts & EXT3_DEFM_JMODE) == EXT3_DEFM_JMODE_ORDERED) - sbi->s_mount_opt |= EXT3_MOUNT_ORDERED_DATA; + set_opt(sbi->s_mount_opt, ORDERED_DATA); else if ((def_mount_opts & EXT3_DEFM_JMODE) == EXT3_DEFM_JMODE_WBACK) - sbi->s_mount_opt |= EXT3_MOUNT_WRITEBACK_DATA; + set_opt(sbi->s_mount_opt, WRITEBACK_DATA); if (le16_to_cpu(sbi->s_es->s_errors) == EXT3_ERRORS_PANIC) set_opt(sbi->s_mount_opt, ERRORS_PANIC); @@ -1582,9 +1777,11 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) else set_opt(sbi->s_mount_opt, ERRORS_RO); - sbi->s_resuid = le16_to_cpu(es->s_def_resuid); - sbi->s_resgid = le16_to_cpu(es->s_def_resgid); + sbi->s_resuid = make_kuid(&init_user_ns, le16_to_cpu(es->s_def_resuid)); + sbi->s_resgid = make_kgid(&init_user_ns, le16_to_cpu(es->s_def_resgid)); + /* enable barriers by default */ + set_opt(sbi->s_mount_opt, BARRIER); set_opt(sbi->s_mount_opt, RESERVATION); if (!parse_options ((char *) data, sb, &journal_inum, &journal_devnum, @@ -1592,15 +1789,15 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) goto failed_mount; sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | - ((sbi->s_mount_opt & EXT3_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0); + (test_opt(sb, POSIX_ACL) ? MS_POSIXACL : 0); if (le32_to_cpu(es->s_rev_level) == EXT3_GOOD_OLD_REV && (EXT3_HAS_COMPAT_FEATURE(sb, ~0U) || EXT3_HAS_RO_COMPAT_FEATURE(sb, ~0U) || EXT3_HAS_INCOMPAT_FEATURE(sb, ~0U))) - printk(KERN_WARNING - "EXT3-fs warning: feature flags set on rev 0 fs, " - "running e2fsck is recommended\n"); + ext3_msg(sb, KERN_WARNING, + "warning: feature flags set on rev 0 fs, " + "running e2fsck is recommended"); /* * Check feature flags regardless of the revision level, since we * previously didn't change the revision level when setting the flags, @@ -1608,59 +1805,60 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) */ features = EXT3_HAS_INCOMPAT_FEATURE(sb, ~EXT3_FEATURE_INCOMPAT_SUPP); if (features) { - printk(KERN_ERR "EXT3-fs: %s: couldn't mount because of " - "unsupported optional features (%x).\n", - sb->s_id, le32_to_cpu(features)); + ext3_msg(sb, KERN_ERR, + "error: couldn't mount because of unsupported " + "optional features (%x)", le32_to_cpu(features)); goto failed_mount; } features = EXT3_HAS_RO_COMPAT_FEATURE(sb, ~EXT3_FEATURE_RO_COMPAT_SUPP); if (!(sb->s_flags & MS_RDONLY) && features) { - printk(KERN_ERR "EXT3-fs: %s: couldn't mount RDWR because of " - "unsupported optional features (%x).\n", - sb->s_id, le32_to_cpu(features)); + ext3_msg(sb, KERN_ERR, + "error: couldn't mount RDWR because of unsupported " + "optional features (%x)", le32_to_cpu(features)); goto failed_mount; } blocksize = BLOCK_SIZE << le32_to_cpu(es->s_log_block_size); if (blocksize < EXT3_MIN_BLOCK_SIZE || blocksize > EXT3_MAX_BLOCK_SIZE) { - printk(KERN_ERR - "EXT3-fs: Unsupported filesystem blocksize %d on %s.\n", - blocksize, sb->s_id); + ext3_msg(sb, KERN_ERR, + "error: couldn't mount because of unsupported " + "filesystem blocksize %d", blocksize); goto failed_mount; } - hblock = bdev_hardsect_size(sb->s_bdev); + hblock = bdev_logical_block_size(sb->s_bdev); if (sb->s_blocksize != blocksize) { /* * Make sure the blocksize for the filesystem is larger * than the hardware sectorsize for the machine. */ if (blocksize < hblock) { - printk(KERN_ERR "EXT3-fs: blocksize %d too small for " - "device blocksize %d.\n", blocksize, hblock); + ext3_msg(sb, KERN_ERR, + "error: fsblocksize %d too small for " + "hardware sectorsize %d", blocksize, hblock); goto failed_mount; } brelse (bh); if (!sb_set_blocksize(sb, blocksize)) { - printk(KERN_ERR "EXT3-fs: bad blocksize %d.\n", - blocksize); + ext3_msg(sb, KERN_ERR, + "error: bad blocksize %d", blocksize); goto out_fail; } logic_sb_block = (sb_block * EXT3_MIN_BLOCK_SIZE) / blocksize; offset = (sb_block * EXT3_MIN_BLOCK_SIZE) % blocksize; bh = sb_bread(sb, logic_sb_block); if (!bh) { - printk(KERN_ERR - "EXT3-fs: Can't read superblock on 2nd try.\n"); + ext3_msg(sb, KERN_ERR, + "error: can't read superblock on 2nd try"); goto failed_mount; } - es = (struct ext3_super_block *)(((char *)bh->b_data) + offset); + es = (struct ext3_super_block *)(bh->b_data + offset); sbi->s_es = es; if (es->s_magic != cpu_to_le16(EXT3_SUPER_MAGIC)) { - printk (KERN_ERR - "EXT3-fs: Magic mismatch, very weird !\n"); + ext3_msg(sb, KERN_ERR, + "error: magic mismatch"); goto failed_mount; } } @@ -1676,8 +1874,8 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) if ((sbi->s_inode_size < EXT3_GOOD_OLD_INODE_SIZE) || (!is_power_of_2(sbi->s_inode_size)) || (sbi->s_inode_size > blocksize)) { - printk (KERN_ERR - "EXT3-fs: unsupported inode size: %d\n", + ext3_msg(sb, KERN_ERR, + "error: unsupported inode size: %d", sbi->s_inode_size); goto failed_mount; } @@ -1685,8 +1883,8 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) sbi->s_frag_size = EXT3_MIN_FRAG_SIZE << le32_to_cpu(es->s_log_frag_size); if (blocksize != sbi->s_frag_size) { - printk(KERN_ERR - "EXT3-fs: fragsize %lu != blocksize %u (unsupported)\n", + ext3_msg(sb, KERN_ERR, + "error: fragsize %lu != blocksize %u (unsupported)", sbi->s_frag_size, blocksize); goto failed_mount; } @@ -1709,33 +1907,46 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) for (i=0; i < 4; i++) sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]); sbi->s_def_hash_version = es->s_def_hash_version; + i = le32_to_cpu(es->s_flags); + if (i & EXT2_FLAGS_UNSIGNED_HASH) + sbi->s_hash_unsigned = 3; + else if ((i & EXT2_FLAGS_SIGNED_HASH) == 0) { +#ifdef __CHAR_UNSIGNED__ + es->s_flags |= cpu_to_le32(EXT2_FLAGS_UNSIGNED_HASH); + sbi->s_hash_unsigned = 3; +#else + es->s_flags |= cpu_to_le32(EXT2_FLAGS_SIGNED_HASH); +#endif + } if (sbi->s_blocks_per_group > blocksize * 8) { - printk (KERN_ERR - "EXT3-fs: #blocks per group too big: %lu\n", + ext3_msg(sb, KERN_ERR, + "#blocks per group too big: %lu", sbi->s_blocks_per_group); goto failed_mount; } if (sbi->s_frags_per_group > blocksize * 8) { - printk (KERN_ERR - "EXT3-fs: #fragments per group too big: %lu\n", + ext3_msg(sb, KERN_ERR, + "error: #fragments per group too big: %lu", sbi->s_frags_per_group); goto failed_mount; } if (sbi->s_inodes_per_group > blocksize * 8) { - printk (KERN_ERR - "EXT3-fs: #inodes per group too big: %lu\n", + ext3_msg(sb, KERN_ERR, + "error: #inodes per group too big: %lu", sbi->s_inodes_per_group); goto failed_mount; } - if (le32_to_cpu(es->s_blocks_count) > - (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) { - printk(KERN_ERR "EXT3-fs: filesystem on %s:" - " too large to mount safely\n", sb->s_id); + err = generic_check_addressable(sb->s_blocksize_bits, + le32_to_cpu(es->s_blocks_count)); + if (err) { + ext3_msg(sb, KERN_ERR, + "error: filesystem is too large to mount safely"); if (sizeof(sector_t) < 8) - printk(KERN_WARNING "EXT3-fs: CONFIG_LBD not " - "enabled\n"); + ext3_msg(sb, KERN_ERR, + "error: CONFIG_LBDAF not enabled"); + ret = err; goto failed_mount; } @@ -1744,50 +1955,37 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) sbi->s_groups_count = ((le32_to_cpu(es->s_blocks_count) - le32_to_cpu(es->s_first_data_block) - 1) / EXT3_BLOCKS_PER_GROUP(sb)) + 1; - db_count = (sbi->s_groups_count + EXT3_DESC_PER_BLOCK(sb) - 1) / - EXT3_DESC_PER_BLOCK(sb); + db_count = DIV_ROUND_UP(sbi->s_groups_count, EXT3_DESC_PER_BLOCK(sb)); sbi->s_group_desc = kmalloc(db_count * sizeof (struct buffer_head *), GFP_KERNEL); if (sbi->s_group_desc == NULL) { - printk (KERN_ERR "EXT3-fs: not enough memory\n"); + ext3_msg(sb, KERN_ERR, + "error: not enough memory"); + ret = -ENOMEM; goto failed_mount; } - bgl_lock_init(&sbi->s_blockgroup_lock); + bgl_lock_init(sbi->s_blockgroup_lock); for (i = 0; i < db_count; i++) { block = descriptor_loc(sb, logic_sb_block, i); sbi->s_group_desc[i] = sb_bread(sb, block); if (!sbi->s_group_desc[i]) { - printk (KERN_ERR "EXT3-fs: " - "can't read group descriptor %d\n", i); + ext3_msg(sb, KERN_ERR, + "error: can't read group descriptor %d", i); db_count = i; goto failed_mount2; } } if (!ext3_check_descriptors (sb)) { - printk(KERN_ERR "EXT3-fs: group descriptors corrupted!\n"); + ext3_msg(sb, KERN_ERR, + "error: group descriptors corrupted"); goto failed_mount2; } sbi->s_gdb_count = db_count; get_random_bytes(&sbi->s_next_generation, sizeof(u32)); spin_lock_init(&sbi->s_next_gen_lock); - err = percpu_counter_init(&sbi->s_freeblocks_counter, - ext3_count_free_blocks(sb)); - if (!err) { - err = percpu_counter_init(&sbi->s_freeinodes_counter, - ext3_count_free_inodes(sb)); - } - if (!err) { - err = percpu_counter_init(&sbi->s_dirs_counter, - ext3_count_dirs(sb)); - } - if (err) { - printk(KERN_ERR "EXT3-fs: insufficient memory\n"); - goto failed_mount3; - } - /* per fileystem reservation list head & lock */ spin_lock_init(&sbi->s_rsv_window_lock); sbi->s_rsv_window_root = RB_ROOT; @@ -1811,7 +2009,10 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) sb->s_qcop = &ext3_qctl_operations; sb->dq_op = &ext3_quota_operations; #endif + memcpy(sb->s_uuid, es->s_uuid, sizeof(es->s_uuid)); INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */ + mutex_init(&sbi->s_orphan_lock); + mutex_init(&sbi->s_resize_lock); sb->s_root = NULL; @@ -1826,15 +2027,30 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) if (!test_opt(sb, NOLOAD) && EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_HAS_JOURNAL)) { if (ext3_load_journal(sb, es, journal_devnum)) - goto failed_mount3; + goto failed_mount2; } else if (journal_inum) { if (ext3_create_journal(sb, es, journal_inum)) - goto failed_mount3; + goto failed_mount2; } else { if (!silent) - printk (KERN_ERR - "ext3: No journal on filesystem on %s\n", - sb->s_id); + ext3_msg(sb, KERN_ERR, + "error: no journal found. " + "mounting ext3 over ext2?"); + goto failed_mount2; + } + err = percpu_counter_init(&sbi->s_freeblocks_counter, + ext3_count_free_blocks(sb)); + if (!err) { + err = percpu_counter_init(&sbi->s_freeinodes_counter, + ext3_count_free_inodes(sb)); + } + if (!err) { + err = percpu_counter_init(&sbi->s_dirs_counter, + ext3_count_dirs(sb)); + } + if (err) { + ext3_msg(sb, KERN_ERR, "error: insufficient memory"); + ret = err; goto failed_mount3; } @@ -1847,7 +2063,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) cope, else JOURNAL_DATA */ if (journal_check_available_features (sbi->s_journal, 0, 0, JFS_FEATURE_INCOMPAT_REVOKE)) - set_opt(sbi->s_mount_opt, ORDERED_DATA); + set_opt(sbi->s_mount_opt, DEFAULT_DATA_MODE); else set_opt(sbi->s_mount_opt, JOURNAL_DATA); break; @@ -1856,21 +2072,15 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) case EXT3_MOUNT_WRITEBACK_DATA: if (!journal_check_available_features (sbi->s_journal, 0, 0, JFS_FEATURE_INCOMPAT_REVOKE)) { - printk(KERN_ERR "EXT3-fs: Journal does not support " - "requested data journaling mode\n"); - goto failed_mount4; + ext3_msg(sb, KERN_ERR, + "error: journal does not support " + "requested data journaling mode"); + goto failed_mount3; } default: break; } - if (test_opt(sb, NOBH)) { - if (!(test_opt(sb, DATA_FLAGS) == EXT3_MOUNT_WRITEBACK_DATA)) { - printk(KERN_WARNING "EXT3-fs: Ignoring nobh option - " - "its supported only with writeback mode\n"); - clear_opt(sbi->s_mount_opt, NOBH); - } - } /* * The journal_load will have done any necessary log recovery, * so we can safely mount the rest of the filesystem now. @@ -1878,58 +2088,51 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) root = ext3_iget(sb, EXT3_ROOT_INO); if (IS_ERR(root)) { - printk(KERN_ERR "EXT3-fs: get root inode failed\n"); + ext3_msg(sb, KERN_ERR, "error: get root inode failed"); ret = PTR_ERR(root); - goto failed_mount4; + goto failed_mount3; } if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) { iput(root); - printk(KERN_ERR "EXT3-fs: corrupt root inode, run e2fsck\n"); - goto failed_mount4; + ext3_msg(sb, KERN_ERR, "error: corrupt root inode, run e2fsck"); + goto failed_mount3; } - sb->s_root = d_alloc_root(root); + sb->s_root = d_make_root(root); if (!sb->s_root) { - printk(KERN_ERR "EXT3-fs: get root dentry failed\n"); - iput(root); + ext3_msg(sb, KERN_ERR, "error: get root dentry failed"); ret = -ENOMEM; - goto failed_mount4; + goto failed_mount3; } - ext3_setup_super (sb, es, sb->s_flags & MS_RDONLY); - /* - * akpm: core read_super() calls in here with the superblock locked. - * That deadlocks, because orphan cleanup needs to lock the superblock - * in numerous places. Here we just pop the lock - it's relatively - * harmless, because we are now ready to accept write_super() requests, - * and aviro says that's the only reason for hanging onto the - * superblock lock. - */ + if (ext3_setup_super(sb, es, sb->s_flags & MS_RDONLY)) + sb->s_flags |= MS_RDONLY; + EXT3_SB(sb)->s_mount_state |= EXT3_ORPHAN_FS; ext3_orphan_cleanup(sb, es); EXT3_SB(sb)->s_mount_state &= ~EXT3_ORPHAN_FS; - if (needs_recovery) - printk (KERN_INFO "EXT3-fs: recovery complete.\n"); - ext3_mark_recovery_complete(sb, es); - printk (KERN_INFO "EXT3-fs: mounted filesystem with %s data mode.\n", + if (needs_recovery) { + ext3_mark_recovery_complete(sb, es); + ext3_msg(sb, KERN_INFO, "recovery complete"); + } + ext3_msg(sb, KERN_INFO, "mounted filesystem with %s data mode", test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_JOURNAL_DATA ? "journal": test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_ORDERED_DATA ? "ordered": "writeback"); - lock_kernel(); return 0; cantfind_ext3: if (!silent) - printk(KERN_ERR "VFS: Can't find ext3 filesystem on dev %s.\n", + ext3_msg(sb, KERN_INFO, + "error: can't find ext3 filesystem on dev %s.", sb->s_id); goto failed_mount; -failed_mount4: - journal_destroy(sbi->s_journal); failed_mount3: percpu_counter_destroy(&sbi->s_freeblocks_counter); percpu_counter_destroy(&sbi->s_freeinodes_counter); percpu_counter_destroy(&sbi->s_dirs_counter); + journal_destroy(sbi->s_journal); failed_mount2: for (i = 0; i < db_count; i++) brelse(sbi->s_group_desc[i]); @@ -1943,8 +2146,8 @@ failed_mount: brelse(bh); out_fail: sb->s_fs_info = NULL; + kfree(sbi->s_blockgroup_lock); kfree(sbi); - lock_kernel(); return ret; } @@ -1968,6 +2171,10 @@ static void ext3_init_journal_params(struct super_block *sb, journal_t *journal) journal->j_flags |= JFS_BARRIER; else journal->j_flags &= ~JFS_BARRIER; + if (test_opt(sb, DATA_ERR_ABORT)) + journal->j_flags |= JFS_ABORT_ON_SYNCDATA_ERR; + else + journal->j_flags &= ~JFS_ABORT_ON_SYNCDATA_ERR; spin_unlock(&journal->j_state_lock); } @@ -1983,27 +2190,27 @@ static journal_t *ext3_get_journal(struct super_block *sb, journal_inode = ext3_iget(sb, journal_inum); if (IS_ERR(journal_inode)) { - printk(KERN_ERR "EXT3-fs: no journal found.\n"); + ext3_msg(sb, KERN_ERR, "error: no journal found"); return NULL; } if (!journal_inode->i_nlink) { make_bad_inode(journal_inode); iput(journal_inode); - printk(KERN_ERR "EXT3-fs: journal inode is deleted.\n"); + ext3_msg(sb, KERN_ERR, "error: journal inode is deleted"); return NULL; } jbd_debug(2, "Journal inode found at %p: %Ld bytes\n", journal_inode, journal_inode->i_size); if (!S_ISREG(journal_inode->i_mode)) { - printk(KERN_ERR "EXT3-fs: invalid journal inode.\n"); + ext3_msg(sb, KERN_ERR, "error: invalid journal inode"); iput(journal_inode); return NULL; } journal = journal_init_inode(journal_inode); if (!journal) { - printk(KERN_ERR "EXT3-fs: Could not load journal inode\n"); + ext3_msg(sb, KERN_ERR, "error: could not load journal inode"); iput(journal_inode); return NULL; } @@ -2025,22 +2232,15 @@ static journal_t *ext3_get_dev_journal(struct super_block *sb, struct ext3_super_block * es; struct block_device *bdev; - bdev = ext3_blkdev_get(j_dev); + bdev = ext3_blkdev_get(j_dev, sb); if (bdev == NULL) return NULL; - if (bd_claim(bdev, sb)) { - printk(KERN_ERR - "EXT3: failed to claim external journal device.\n"); - blkdev_put(bdev); - return NULL; - } - blocksize = sb->s_blocksize; - hblock = bdev_hardsect_size(bdev); + hblock = bdev_logical_block_size(bdev); if (blocksize < hblock) { - printk(KERN_ERR - "EXT3-fs: blocksize too small for journal device.\n"); + ext3_msg(sb, KERN_ERR, + "error: blocksize too small for journal device"); goto out_bdev; } @@ -2048,23 +2248,23 @@ static journal_t *ext3_get_dev_journal(struct super_block *sb, offset = EXT3_MIN_BLOCK_SIZE % blocksize; set_blocksize(bdev, blocksize); if (!(bh = __bread(bdev, sb_block, blocksize))) { - printk(KERN_ERR "EXT3-fs: couldn't read superblock of " - "external journal\n"); + ext3_msg(sb, KERN_ERR, "error: couldn't read superblock of " + "external journal"); goto out_bdev; } - es = (struct ext3_super_block *) (((char *)bh->b_data) + offset); + es = (struct ext3_super_block *) (bh->b_data + offset); if ((le16_to_cpu(es->s_magic) != EXT3_SUPER_MAGIC) || !(le32_to_cpu(es->s_feature_incompat) & EXT3_FEATURE_INCOMPAT_JOURNAL_DEV)) { - printk(KERN_ERR "EXT3-fs: external journal has " - "bad superblock\n"); + ext3_msg(sb, KERN_ERR, "error: external journal has " + "bad superblock"); brelse(bh); goto out_bdev; } if (memcmp(EXT3_SB(sb)->s_es->s_journal_uuid, es->s_uuid, 16)) { - printk(KERN_ERR "EXT3-fs: journal UUID does not match\n"); + ext3_msg(sb, KERN_ERR, "error: journal UUID does not match"); brelse(bh); goto out_bdev; } @@ -2076,19 +2276,21 @@ static journal_t *ext3_get_dev_journal(struct super_block *sb, journal = journal_init_dev(bdev, sb->s_bdev, start, len, blocksize); if (!journal) { - printk(KERN_ERR "EXT3-fs: failed to create device journal\n"); + ext3_msg(sb, KERN_ERR, + "error: failed to create device journal"); goto out_bdev; } journal->j_private = sb; - ll_rw_block(READ, 1, &journal->j_sb_buffer); - wait_on_buffer(journal->j_sb_buffer); - if (!buffer_uptodate(journal->j_sb_buffer)) { - printk(KERN_ERR "EXT3-fs: I/O error on journal device\n"); - goto out_journal; + if (!bh_uptodate_or_lock(journal->j_sb_buffer)) { + if (bh_submit_read(journal->j_sb_buffer)) { + ext3_msg(sb, KERN_ERR, "I/O error on journal device"); + goto out_journal; + } } if (be32_to_cpu(journal->j_superblock->s_nr_users) != 1) { - printk(KERN_ERR "EXT3-fs: External journal has more than one " - "user (unsupported) - %d\n", + ext3_msg(sb, KERN_ERR, + "error: external journal has more than one " + "user (unsupported) - %d", be32_to_cpu(journal->j_superblock->s_nr_users)); goto out_journal; } @@ -2114,8 +2316,8 @@ static int ext3_load_journal(struct super_block *sb, if (journal_devnum && journal_devnum != le32_to_cpu(es->s_journal_dev)) { - printk(KERN_INFO "EXT3-fs: external journal device major/minor " - "numbers have changed\n"); + ext3_msg(sb, KERN_INFO, "external journal device major/minor " + "numbers have changed"); journal_dev = new_decode_dev(journal_devnum); } else journal_dev = new_decode_dev(le32_to_cpu(es->s_journal_dev)); @@ -2130,21 +2332,21 @@ static int ext3_load_journal(struct super_block *sb, if (EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER)) { if (sb->s_flags & MS_RDONLY) { - printk(KERN_INFO "EXT3-fs: INFO: recovery " - "required on readonly filesystem.\n"); + ext3_msg(sb, KERN_INFO, + "recovery required on readonly filesystem"); if (really_read_only) { - printk(KERN_ERR "EXT3-fs: write access " - "unavailable, cannot proceed.\n"); + ext3_msg(sb, KERN_ERR, "error: write access " + "unavailable, cannot proceed"); return -EROFS; } - printk (KERN_INFO "EXT3-fs: write access will " - "be enabled during recovery.\n"); + ext3_msg(sb, KERN_INFO, + "write access will be enabled during recovery"); } } if (journal_inum && journal_dev) { - printk(KERN_ERR "EXT3-fs: filesystem has both journal " - "and inode journals!\n"); + ext3_msg(sb, KERN_ERR, "error: filesystem has both journal " + "and inode journals"); return -EINVAL; } @@ -2156,10 +2358,13 @@ static int ext3_load_journal(struct super_block *sb, return -EINVAL; } + if (!(journal->j_flags & JFS_BARRIER)) + printk(KERN_INFO "EXT3-fs: barriers not enabled\n"); + if (!really_read_only && test_opt(sb, UPDATE_JOURNAL)) { err = journal_update_format(journal); if (err) { - printk(KERN_ERR "EXT3-fs: error updating journal.\n"); + ext3_msg(sb, KERN_ERR, "error updating journal"); journal_destroy(journal); return err; } @@ -2171,7 +2376,7 @@ static int ext3_load_journal(struct super_block *sb, err = journal_load(journal); if (err) { - printk(KERN_ERR "EXT3-fs: error loading journal.\n"); + ext3_msg(sb, KERN_ERR, "error loading journal"); journal_destroy(journal); return err; } @@ -2179,10 +2384,9 @@ static int ext3_load_journal(struct super_block *sb, EXT3_SB(sb)->s_journal = journal; ext3_clear_journal_err(sb, es); - if (journal_devnum && + if (!really_read_only && journal_devnum && journal_devnum != le32_to_cpu(es->s_journal_dev)) { es->s_journal_dev = cpu_to_le32(journal_devnum); - sb->s_dirt = 1; /* Make sure we flush the recovery flag to disk. */ ext3_commit_super(sb, es, 1); @@ -2191,16 +2395,17 @@ static int ext3_load_journal(struct super_block *sb, return 0; } -static int ext3_create_journal(struct super_block * sb, - struct ext3_super_block * es, +static int ext3_create_journal(struct super_block *sb, + struct ext3_super_block *es, unsigned int journal_inum) { journal_t *journal; int err; if (sb->s_flags & MS_RDONLY) { - printk(KERN_ERR "EXT3-fs: readonly filesystem when trying to " - "create journal.\n"); + ext3_msg(sb, KERN_ERR, + "error: readonly filesystem when trying to " + "create journal"); return -EROFS; } @@ -2208,12 +2413,12 @@ static int ext3_create_journal(struct super_block * sb, if (!journal) return -EINVAL; - printk(KERN_INFO "EXT3-fs: creating new journal on inode %u\n", + ext3_msg(sb, KERN_INFO, "creating new journal on inode %u", journal_inum); err = journal_create(journal); if (err) { - printk(KERN_ERR "EXT3-fs: error creating journal.\n"); + ext3_msg(sb, KERN_ERR, "error creating journal"); journal_destroy(journal); return -EIO; } @@ -2225,7 +2430,6 @@ static int ext3_create_journal(struct super_block * sb, EXT3_SET_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_HAS_JOURNAL); es->s_journal_inum = cpu_to_le32(journal_inum); - sb->s_dirt = 1; /* Make sure we flush the recovery flag to disk. */ ext3_commit_super(sb, es, 1); @@ -2233,21 +2437,56 @@ static int ext3_create_journal(struct super_block * sb, return 0; } -static void ext3_commit_super (struct super_block * sb, - struct ext3_super_block * es, +static int ext3_commit_super(struct super_block *sb, + struct ext3_super_block *es, int sync) { struct buffer_head *sbh = EXT3_SB(sb)->s_sbh; + int error = 0; if (!sbh) - return; - es->s_wtime = cpu_to_le32(get_seconds()); + return error; + + if (buffer_write_io_error(sbh)) { + /* + * Oh, dear. A previous attempt to write the + * superblock failed. This could happen because the + * USB device was yanked out. Or it could happen to + * be a transient write error and maybe the block will + * be remapped. Nothing we can do but to retry the + * write and hope for the best. + */ + ext3_msg(sb, KERN_ERR, "previous I/O error to " + "superblock detected"); + clear_buffer_write_io_error(sbh); + set_buffer_uptodate(sbh); + } + /* + * If the file system is mounted read-only, don't update the + * superblock write time. This avoids updating the superblock + * write time when we are mounting the root file system + * read/only but we need to replay the journal; at that point, + * for people who are east of GMT and who make their clock + * tick in localtime for Windows bug-for-bug compatibility, + * the clock is set in the future, and this will cause e2fsck + * to complain and force a full file system check. + */ + if (!(sb->s_flags & MS_RDONLY)) + es->s_wtime = cpu_to_le32(get_seconds()); es->s_free_blocks_count = cpu_to_le32(ext3_count_free_blocks(sb)); es->s_free_inodes_count = cpu_to_le32(ext3_count_free_inodes(sb)); BUFFER_TRACE(sbh, "marking dirty"); mark_buffer_dirty(sbh); - if (sync) - sync_dirty_buffer(sbh); + if (sync) { + error = sync_dirty_buffer(sbh); + if (buffer_write_io_error(sbh)) { + ext3_msg(sb, KERN_ERR, "I/O error while writing " + "superblock"); + clear_buffer_write_io_error(sbh); + set_buffer_uptodate(sbh); + } + } + return error; } @@ -2262,15 +2501,16 @@ static void ext3_mark_recovery_complete(struct super_block * sb, journal_t *journal = EXT3_SB(sb)->s_journal; journal_lock_updates(journal); - journal_flush(journal); - lock_super(sb); + if (journal_flush(journal) < 0) + goto out; + if (EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER) && sb->s_flags & MS_RDONLY) { EXT3_CLEAR_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER); - sb->s_dirt = 0; ext3_commit_super(sb, es, 1); } - unlock_super(sb); + +out: journal_unlock_updates(journal); } @@ -2279,8 +2519,8 @@ static void ext3_mark_recovery_complete(struct super_block * sb, * has recorded an error from a previous lifetime, move that error to the * main filesystem now. */ -static void ext3_clear_journal_err(struct super_block * sb, - struct ext3_super_block * es) +static void ext3_clear_journal_err(struct super_block *sb, + struct ext3_super_block *es) { journal_t *journal; int j_errno; @@ -2298,9 +2538,9 @@ static void ext3_clear_journal_err(struct super_block * sb, char nbuf[16]; errstr = ext3_decode_error(sb, j_errno, nbuf); - ext3_warning(sb, __FUNCTION__, "Filesystem error recorded " + ext3_warning(sb, __func__, "Filesystem error recorded " "from previous mount: %s", errstr); - ext3_warning(sb, __FUNCTION__, "Marking fs in need of " + ext3_warning(sb, __func__, "Marking fs in need of " "filesystem check."); EXT3_SB(sb)->s_mount_state |= EXT3_ERROR_FS; @@ -2324,32 +2564,20 @@ int ext3_force_commit(struct super_block *sb) return 0; journal = EXT3_SB(sb)->s_journal; - sb->s_dirt = 0; ret = ext3_journal_force_commit(journal); return ret; } -/* - * Ext3 always journals updates to the superblock itself, so we don't - * have to propagate any other updates to the superblock on disk at this - * point. Just start an async writeback to get the buffers on their way - * to the disk. - * - * This implicitly triggers the writebehind on sync(). - */ - -static void ext3_write_super (struct super_block * sb) -{ - if (mutex_trylock(&sb->s_lock) != 0) - BUG(); - sb->s_dirt = 0; -} - static int ext3_sync_fs(struct super_block *sb, int wait) { tid_t target; - sb->s_dirt = 0; + trace_ext3_sync_fs(sb, wait); + /* + * Writeback quota in non-journalled quota case - journalled quota has + * no dirty dquots + */ + dquot_writeback_dquots(sb, -1); if (journal_start_commit(EXT3_SB(sb)->s_journal, &target)) { if (wait) log_wait_commit(EXT3_SB(sb)->s_journal, target); @@ -2361,37 +2589,51 @@ static int ext3_sync_fs(struct super_block *sb, int wait) * LVM calls this function before a (read-only) snapshot is created. This * gives us a chance to flush the journal completely and mark the fs clean. */ -static void ext3_write_super_lockfs(struct super_block *sb) +static int ext3_freeze(struct super_block *sb) { - sb->s_dirt = 0; + int error = 0; + journal_t *journal; if (!(sb->s_flags & MS_RDONLY)) { - journal_t *journal = EXT3_SB(sb)->s_journal; + journal = EXT3_SB(sb)->s_journal; /* Now we set up the journal barrier. */ journal_lock_updates(journal); - journal_flush(journal); + + /* + * We don't want to clear needs_recovery flag when we failed + * to flush the journal. + */ + error = journal_flush(journal); + if (error < 0) + goto out; /* Journal blocked and flushed, clear needs_recovery flag. */ EXT3_CLEAR_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER); - ext3_commit_super(sb, EXT3_SB(sb)->s_es, 1); + error = ext3_commit_super(sb, EXT3_SB(sb)->s_es, 1); + if (error) + goto out; } + return 0; + +out: + journal_unlock_updates(journal); + return error; } /* * Called by LVM after the snapshot is done. We need to reset the RECOVER * flag here, even though the filesystem is not technically dirty yet. */ -static void ext3_unlockfs(struct super_block *sb) +static int ext3_unfreeze(struct super_block *sb) { if (!(sb->s_flags & MS_RDONLY)) { - lock_super(sb); /* Reser the needs_recovery flag before the fs is unlocked. */ EXT3_SET_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER); ext3_commit_super(sb, EXT3_SB(sb)->s_es, 1); - unlock_super(sb); journal_unlock_updates(EXT3_SB(sb)->s_journal); } + return 0; } static int ext3_remount (struct super_block * sb, int * flags, char * data) @@ -2401,11 +2643,14 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data) ext3_fsblk_t n_blocks_count = 0; unsigned long old_sb_flags; struct ext3_mount_options old_opts; + int enable_quota = 0; int err; #ifdef CONFIG_QUOTA int i; #endif + sync_filesystem(sb); + /* Store the original options */ old_sb_flags = sb->s_flags; old_opts.s_mount_opt = sbi->s_mount_opt; @@ -2415,7 +2660,18 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data) #ifdef CONFIG_QUOTA old_opts.s_jquota_fmt = sbi->s_jquota_fmt; for (i = 0; i < MAXQUOTAS; i++) - old_opts.s_qf_names[i] = sbi->s_qf_names[i]; + if (sbi->s_qf_names[i]) { + old_opts.s_qf_names[i] = kstrdup(sbi->s_qf_names[i], + GFP_KERNEL); + if (!old_opts.s_qf_names[i]) { + int j; + + for (j = 0; j < i; j++) + kfree(old_opts.s_qf_names[j]); + return -ENOMEM; + } + } else + old_opts.s_qf_names[i] = NULL; #endif /* @@ -2426,11 +2682,11 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data) goto restore_opts; } - if (sbi->s_mount_opt & EXT3_MOUNT_ABORT) - ext3_abort(sb, __FUNCTION__, "Abort forced by user"); + if (test_opt(sb, ABORT)) + ext3_abort(sb, __func__, "Abort forced by user"); sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | - ((sbi->s_mount_opt & EXT3_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0); + (test_opt(sb, POSIX_ACL) ? MS_POSIXACL : 0); es = sbi->s_es; @@ -2438,12 +2694,16 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data) if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY) || n_blocks_count > le32_to_cpu(es->s_blocks_count)) { - if (sbi->s_mount_opt & EXT3_MOUNT_ABORT) { + if (test_opt(sb, ABORT)) { err = -EROFS; goto restore_opts; } if (*flags & MS_RDONLY) { + err = dquot_suspend(sb, -1); + if (err < 0) + goto restore_opts; + /* * First of all, the unconditional stuff we have to do * to disable replay of the journal when we next remount @@ -2459,21 +2719,15 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data) (sbi->s_mount_state & EXT3_VALID_FS)) es->s_state = cpu_to_le16(sbi->s_mount_state); - /* - * We have to unlock super so that we can wait for - * transactions. - */ - unlock_super(sb); ext3_mark_recovery_complete(sb, es); - lock_super(sb); } else { __le32 ret; if ((ret = EXT3_HAS_RO_COMPAT_FEATURE(sb, ~EXT3_FEATURE_RO_COMPAT_SUPP))) { - printk(KERN_WARNING "EXT3-fs: %s: couldn't " - "remount RDWR because of unsupported " - "optional features (%x).\n", - sb->s_id, le32_to_cpu(ret)); + ext3_msg(sb, KERN_WARNING, + "warning: couldn't remount RDWR " + "because of unsupported optional " + "features (%x)", le32_to_cpu(ret)); err = -EROFS; goto restore_opts; } @@ -2481,14 +2735,13 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data) /* * If we have an unprocessed orphan list hanging * around from a previously readonly bdev mount, - * require a full umount/remount for now. + * require a full umount & mount for now. */ if (es->s_last_orphan) { - printk(KERN_WARNING "EXT3-fs: %s: couldn't " + ext3_msg(sb, KERN_WARNING, "warning: couldn't " "remount RDWR because of unprocessed " "orphan inode list. Please " - "umount/remount instead.\n", - sb->s_id); + "umount & mount instead."); err = -EINVAL; goto restore_opts; } @@ -2505,15 +2758,16 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data) goto restore_opts; if (!ext3_setup_super (sb, es, 0)) sb->s_flags &= ~MS_RDONLY; + enable_quota = 1; } } #ifdef CONFIG_QUOTA /* Release old quota file names */ for (i = 0; i < MAXQUOTAS; i++) - if (old_opts.s_qf_names[i] && - old_opts.s_qf_names[i] != sbi->s_qf_names[i]) - kfree(old_opts.s_qf_names[i]); + kfree(old_opts.s_qf_names[i]); #endif + if (enable_quota) + dquot_resume(sb, -1); return 0; restore_opts: sb->s_flags = old_sb_flags; @@ -2524,9 +2778,7 @@ restore_opts: #ifdef CONFIG_QUOTA sbi->s_jquota_fmt = old_opts.s_jquota_fmt; for (i = 0; i < MAXQUOTAS; i++) { - if (sbi->s_qf_names[i] && - old_opts.s_qf_names[i] != sbi->s_qf_names[i]) - kfree(sbi->s_qf_names[i]); + kfree(sbi->s_qf_names[i]); sbi->s_qf_names[i] = old_opts.s_qf_names[i]; } #endif @@ -2575,6 +2827,10 @@ static int ext3_statfs (struct dentry * dentry, struct kstatfs * buf) * bitmap, and an inode table. */ overhead += ngroups * (2 + sbi->s_itb_per_group); + + /* Add the journal blocks as well */ + overhead += sbi->s_journal->j_maxlen; + sbi->s_overhead_last = overhead; smp_wmb(); sbi->s_blocks_last = le32_to_cpu(es->s_blocks_count); @@ -2584,13 +2840,11 @@ static int ext3_statfs (struct dentry * dentry, struct kstatfs * buf) buf->f_bsize = sb->s_blocksize; buf->f_blocks = le32_to_cpu(es->s_blocks_count) - sbi->s_overhead_last; buf->f_bfree = percpu_counter_sum_positive(&sbi->s_freeblocks_counter); - es->s_free_blocks_count = cpu_to_le32(buf->f_bfree); buf->f_bavail = buf->f_bfree - le32_to_cpu(es->s_r_blocks_count); if (buf->f_bfree < le32_to_cpu(es->s_r_blocks_count)) buf->f_bavail = 0; buf->f_files = le32_to_cpu(es->s_inodes_count); buf->f_ffree = percpu_counter_sum_positive(&sbi->s_freeinodes_counter); - es->s_free_inodes_count = cpu_to_le32(buf->f_ffree); buf->f_namelen = EXT3_NAME_LEN; fsid = le64_to_cpup((void *)es->s_uuid) ^ le64_to_cpup((void *)es->s_uuid + sizeof(u64)); @@ -2604,7 +2858,7 @@ static int ext3_statfs (struct dentry * dentry, struct kstatfs * buf) * Process 1 Process 2 * ext3_create() quota_sync() * journal_start() write_dquot() - * DQUOT_INIT() down(dqio_mutex) + * dquot_initialize() down(dqio_mutex) * down(dqio_mutex) journal_start() * */ @@ -2613,39 +2867,7 @@ static int ext3_statfs (struct dentry * dentry, struct kstatfs * buf) static inline struct inode *dquot_to_inode(struct dquot *dquot) { - return sb_dqopt(dquot->dq_sb)->files[dquot->dq_type]; -} - -static int ext3_dquot_initialize(struct inode *inode, int type) -{ - handle_t *handle; - int ret, err; - - /* We may create quota structure so we need to reserve enough blocks */ - handle = ext3_journal_start(inode, 2*EXT3_QUOTA_INIT_BLOCKS(inode->i_sb)); - if (IS_ERR(handle)) - return PTR_ERR(handle); - ret = dquot_initialize(inode, type); - err = ext3_journal_stop(handle); - if (!ret) - ret = err; - return ret; -} - -static int ext3_dquot_drop(struct inode *inode) -{ - handle_t *handle; - int ret, err; - - /* We may delete quota structure so we need to reserve enough blocks */ - handle = ext3_journal_start(inode, 2*EXT3_QUOTA_DEL_BLOCKS(inode->i_sb)); - if (IS_ERR(handle)) - return PTR_ERR(handle); - ret = dquot_drop(inode); - err = ext3_journal_stop(handle); - if (!ret) - ret = err; - return ret; + return sb_dqopt(dquot->dq_sb)->files[dquot->dq_id.type]; } static int ext3_write_dquot(struct dquot *dquot) @@ -2703,7 +2925,7 @@ static int ext3_release_dquot(struct dquot *dquot) static int ext3_mark_dquot_dirty(struct dquot *dquot) { - /* Are we journalling quotas? */ + /* Are we journaling quotas? */ if (EXT3_SB(dquot->dq_sb)->s_qf_names[USRQUOTA] || EXT3_SB(dquot->dq_sb)->s_qf_names[GRPQUOTA]) { dquot_mark_dquot_dirty(dquot); @@ -2735,45 +2957,55 @@ static int ext3_write_info(struct super_block *sb, int type) */ static int ext3_quota_on_mount(struct super_block *sb, int type) { - return vfs_quota_on_mount(sb, EXT3_SB(sb)->s_qf_names[type], - EXT3_SB(sb)->s_jquota_fmt, type); + return dquot_quota_on_mount(sb, EXT3_SB(sb)->s_qf_names[type], + EXT3_SB(sb)->s_jquota_fmt, type); } /* * Standard function to be called on quota_on */ static int ext3_quota_on(struct super_block *sb, int type, int format_id, - char *path) + struct path *path) { int err; - struct nameidata nd; if (!test_opt(sb, QUOTA)) return -EINVAL; - /* Not journalling quota? */ - if (!EXT3_SB(sb)->s_qf_names[USRQUOTA] && - !EXT3_SB(sb)->s_qf_names[GRPQUOTA]) - return vfs_quota_on(sb, type, format_id, path); - err = path_lookup(path, LOOKUP_FOLLOW, &nd); - if (err) - return err; + /* Quotafile not on the same filesystem? */ - if (nd.path.mnt->mnt_sb != sb) { - path_put(&nd.path); + if (path->dentry->d_sb != sb) return -EXDEV; + /* Journaling quota? */ + if (EXT3_SB(sb)->s_qf_names[type]) { + /* Quotafile not of fs root? */ + if (path->dentry->d_parent != sb->s_root) + ext3_msg(sb, KERN_WARNING, + "warning: Quota file not on filesystem root. " + "Journaled quota will not work."); + } + + /* + * When we journal data on quota file, we have to flush journal to see + * all updates to the file when we bypass pagecache... + */ + if (ext3_should_journal_data(path->dentry->d_inode)) { + /* + * We don't need to lock updates but journal_flush() could + * otherwise be livelocked... + */ + journal_lock_updates(EXT3_SB(sb)->s_journal); + err = journal_flush(EXT3_SB(sb)->s_journal); + journal_unlock_updates(EXT3_SB(sb)->s_journal); + if (err) + return err; } - /* Quotafile not of fs root? */ - if (nd.path.dentry->d_parent->d_inode != sb->s_root->d_inode) - printk(KERN_WARNING - "EXT3-fs: Quota file not on filesystem root. " - "Journalled quota will not work.\n"); - path_put(&nd.path); - return vfs_quota_on(sb, type, format_id, path); + + return dquot_quota_on(sb, type, format_id, path); } /* Read data from quotafile - avoid pagecache and such because we cannot afford * acquiring the locks... As quota files are never truncated and quota code - * itself serializes the operations (and noone else should touch the files) + * itself serializes the operations (and no one else should touch the files) * we don't have to be afraid of races */ static ssize_t ext3_quota_read(struct super_block *sb, int type, char *data, size_t len, loff_t off) @@ -2820,80 +3052,79 @@ static ssize_t ext3_quota_write(struct super_block *sb, int type, sector_t blk = off >> EXT3_BLOCK_SIZE_BITS(sb); int err = 0; int offset = off & (sb->s_blocksize - 1); - int tocopy; int journal_quota = EXT3_SB(sb)->s_qf_names[type] != NULL; - size_t towrite = len; struct buffer_head *bh; handle_t *handle = journal_current_handle(); if (!handle) { - printk(KERN_WARNING "EXT3-fs: Quota write (off=%Lu, len=%Lu)" - " cancelled because transaction is not started.\n", + ext3_msg(sb, KERN_WARNING, + "warning: quota write (off=%llu, len=%llu)" + " cancelled because transaction is not started.", (unsigned long long)off, (unsigned long long)len); return -EIO; } - mutex_lock_nested(&inode->i_mutex, I_MUTEX_QUOTA); - while (towrite > 0) { - tocopy = sb->s_blocksize - offset < towrite ? - sb->s_blocksize - offset : towrite; - bh = ext3_bread(handle, inode, blk, 1, &err); - if (!bh) + + /* + * Since we account only one data block in transaction credits, + * then it is impossible to cross a block boundary. + */ + if (sb->s_blocksize - offset < len) { + ext3_msg(sb, KERN_WARNING, "Quota write (off=%llu, len=%llu)" + " cancelled because not block aligned", + (unsigned long long)off, (unsigned long long)len); + return -EIO; + } + bh = ext3_bread(handle, inode, blk, 1, &err); + if (!bh) + goto out; + if (journal_quota) { + err = ext3_journal_get_write_access(handle, bh); + if (err) { + brelse(bh); goto out; - if (journal_quota) { - err = ext3_journal_get_write_access(handle, bh); - if (err) { - brelse(bh); - goto out; - } - } - lock_buffer(bh); - memcpy(bh->b_data+offset, data, tocopy); - flush_dcache_page(bh->b_page); - unlock_buffer(bh); - if (journal_quota) - err = ext3_journal_dirty_metadata(handle, bh); - else { - /* Always do at least ordered writes for quotas */ - err = ext3_journal_dirty_data(handle, bh); - mark_buffer_dirty(bh); } - brelse(bh); - if (err) - goto out; - offset = 0; - towrite -= tocopy; - data += tocopy; - blk++; } + lock_buffer(bh); + memcpy(bh->b_data+offset, data, len); + flush_dcache_page(bh->b_page); + unlock_buffer(bh); + if (journal_quota) + err = ext3_journal_dirty_metadata(handle, bh); + else { + /* Always do at least ordered writes for quotas */ + err = ext3_journal_dirty_data(handle, bh); + mark_buffer_dirty(bh); + } + brelse(bh); out: - if (len == towrite) + if (err) return err; - if (inode->i_size < off+len-towrite) { - i_size_write(inode, off+len-towrite); + if (inode->i_size < off + len) { + i_size_write(inode, off + len); EXT3_I(inode)->i_disksize = inode->i_size; } inode->i_version++; inode->i_mtime = inode->i_ctime = CURRENT_TIME; ext3_mark_inode_dirty(handle, inode); - mutex_unlock(&inode->i_mutex); - return len - towrite; + return len; } #endif -static int ext3_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data, struct vfsmount *mnt) +static struct dentry *ext3_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) { - return get_sb_bdev(fs_type, flags, dev_name, data, ext3_fill_super, mnt); + return mount_bdev(fs_type, flags, dev_name, data, ext3_fill_super); } static struct file_system_type ext3_fs_type = { .owner = THIS_MODULE, .name = "ext3", - .get_sb = ext3_get_sb, + .mount = ext3_mount, .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; +MODULE_ALIAS_FS("ext3"); static int __init init_ext3_fs(void) { diff --git a/fs/ext3/symlink.c b/fs/ext3/symlink.c index ff7b4ccd898..6b01c3eab1f 100644 --- a/fs/ext3/symlink.c +++ b/fs/ext3/symlink.c @@ -17,10 +17,8 @@ * ext3 symlink handling code */ -#include <linux/fs.h> -#include <linux/jbd.h> -#include <linux/ext3_fs.h> #include <linux/namei.h> +#include "ext3.h" #include "xattr.h" static void * ext3_follow_link(struct dentry *dentry, struct nameidata *nd) @@ -34,6 +32,7 @@ const struct inode_operations ext3_symlink_inode_operations = { .readlink = generic_readlink, .follow_link = page_follow_link_light, .put_link = page_put_link, + .setattr = ext3_setattr, #ifdef CONFIG_EXT3_FS_XATTR .setxattr = generic_setxattr, .getxattr = generic_getxattr, @@ -45,6 +44,7 @@ const struct inode_operations ext3_symlink_inode_operations = { const struct inode_operations ext3_fast_symlink_inode_operations = { .readlink = generic_readlink, .follow_link = ext3_follow_link, + .setattr = ext3_setattr, #ifdef CONFIG_EXT3_FS_XATTR .setxattr = generic_setxattr, .getxattr = generic_getxattr, diff --git a/fs/ext3/xattr.c b/fs/ext3/xattr.c index a6ea4d6a8bb..c6874be6d58 100644 --- a/fs/ext3/xattr.c +++ b/fs/ext3/xattr.c @@ -50,14 +50,9 @@ * by the buffer lock. */ -#include <linux/init.h> -#include <linux/fs.h> -#include <linux/slab.h> -#include <linux/ext3_jbd.h> -#include <linux/ext3_fs.h> +#include "ext3.h" #include <linux/mbcache.h> #include <linux/quotaops.h> -#include <linux/rwsem.h> #include "xattr.h" #include "acl.h" @@ -99,14 +94,16 @@ static struct buffer_head *ext3_xattr_cache_find(struct inode *, struct mb_cache_entry **); static void ext3_xattr_rehash(struct ext3_xattr_header *, struct ext3_xattr_entry *); +static int ext3_xattr_list(struct dentry *dentry, char *buffer, + size_t buffer_size); static struct mb_cache *ext3_xattr_cache; -static struct xattr_handler *ext3_xattr_handler_map[] = { +static const struct xattr_handler *ext3_xattr_handler_map[] = { [EXT3_XATTR_INDEX_USER] = &ext3_xattr_user_handler, #ifdef CONFIG_EXT3_FS_POSIX_ACL - [EXT3_XATTR_INDEX_POSIX_ACL_ACCESS] = &ext3_xattr_acl_access_handler, - [EXT3_XATTR_INDEX_POSIX_ACL_DEFAULT] = &ext3_xattr_acl_default_handler, + [EXT3_XATTR_INDEX_POSIX_ACL_ACCESS] = &posix_acl_access_xattr_handler, + [EXT3_XATTR_INDEX_POSIX_ACL_DEFAULT] = &posix_acl_default_xattr_handler, #endif [EXT3_XATTR_INDEX_TRUSTED] = &ext3_xattr_trusted_handler, #ifdef CONFIG_EXT3_FS_SECURITY @@ -114,12 +111,12 @@ static struct xattr_handler *ext3_xattr_handler_map[] = { #endif }; -struct xattr_handler *ext3_xattr_handlers[] = { +const struct xattr_handler *ext3_xattr_handlers[] = { &ext3_xattr_user_handler, &ext3_xattr_trusted_handler, #ifdef CONFIG_EXT3_FS_POSIX_ACL - &ext3_xattr_acl_access_handler, - &ext3_xattr_acl_default_handler, + &posix_acl_access_xattr_handler, + &posix_acl_default_xattr_handler, #endif #ifdef CONFIG_EXT3_FS_SECURITY &ext3_xattr_security_handler, @@ -127,10 +124,10 @@ struct xattr_handler *ext3_xattr_handlers[] = { NULL }; -static inline struct xattr_handler * +static inline const struct xattr_handler * ext3_xattr_handler(int name_index) { - struct xattr_handler *handler = NULL; + const struct xattr_handler *handler = NULL; if (name_index > 0 && name_index < ARRAY_SIZE(ext3_xattr_handler_map)) handler = ext3_xattr_handler_map[name_index]; @@ -145,7 +142,7 @@ ext3_xattr_handler(int name_index) ssize_t ext3_listxattr(struct dentry *dentry, char *buffer, size_t size) { - return ext3_xattr_list(dentry->d_inode, buffer, size); + return ext3_xattr_list(dentry, buffer, size); } static int @@ -232,7 +229,7 @@ ext3_xattr_block_get(struct inode *inode, int name_index, const char *name, ea_bdebug(bh, "b_count=%d, refcount=%d", atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount)); if (ext3_xattr_check_block(bh)) { -bad_block: ext3_error(inode->i_sb, __FUNCTION__, +bad_block: ext3_error(inode->i_sb, __func__, "inode %lu: bad block "E3FSBLK, inode->i_ino, EXT3_I(inode)->i_file_acl); error = -EIO; @@ -272,7 +269,7 @@ ext3_xattr_ibody_get(struct inode *inode, int name_index, const char *name, void *end; int error; - if (!(EXT3_I(inode)->i_state & EXT3_STATE_XATTR)) + if (!ext3_test_inode_state(inode, EXT3_STATE_XATTR)) return -ENODATA; error = ext3_get_inode_loc(inode, &iloc); if (error) @@ -330,19 +327,20 @@ ext3_xattr_get(struct inode *inode, int name_index, const char *name, } static int -ext3_xattr_list_entries(struct inode *inode, struct ext3_xattr_entry *entry, +ext3_xattr_list_entries(struct dentry *dentry, struct ext3_xattr_entry *entry, char *buffer, size_t buffer_size) { size_t rest = buffer_size; for (; !IS_LAST_ENTRY(entry); entry = EXT3_XATTR_NEXT(entry)) { - struct xattr_handler *handler = + const struct xattr_handler *handler = ext3_xattr_handler(entry->e_name_index); if (handler) { - size_t size = handler->list(inode, buffer, rest, + size_t size = handler->list(dentry, buffer, rest, entry->e_name, - entry->e_name_len); + entry->e_name_len, + handler->flags); if (buffer) { if (size > rest) return -ERANGE; @@ -355,8 +353,9 @@ ext3_xattr_list_entries(struct inode *inode, struct ext3_xattr_entry *entry, } static int -ext3_xattr_block_list(struct inode *inode, char *buffer, size_t buffer_size) +ext3_xattr_block_list(struct dentry *dentry, char *buffer, size_t buffer_size) { + struct inode *inode = dentry->d_inode; struct buffer_head *bh = NULL; int error; @@ -374,14 +373,14 @@ ext3_xattr_block_list(struct inode *inode, char *buffer, size_t buffer_size) ea_bdebug(bh, "b_count=%d, refcount=%d", atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount)); if (ext3_xattr_check_block(bh)) { - ext3_error(inode->i_sb, __FUNCTION__, + ext3_error(inode->i_sb, __func__, "inode %lu: bad block "E3FSBLK, inode->i_ino, EXT3_I(inode)->i_file_acl); error = -EIO; goto cleanup; } ext3_xattr_cache_insert(bh); - error = ext3_xattr_list_entries(inode, BFIRST(bh), buffer, buffer_size); + error = ext3_xattr_list_entries(dentry, BFIRST(bh), buffer, buffer_size); cleanup: brelse(bh); @@ -390,15 +389,16 @@ cleanup: } static int -ext3_xattr_ibody_list(struct inode *inode, char *buffer, size_t buffer_size) +ext3_xattr_ibody_list(struct dentry *dentry, char *buffer, size_t buffer_size) { + struct inode *inode = dentry->d_inode; struct ext3_xattr_ibody_header *header; struct ext3_inode *raw_inode; struct ext3_iloc iloc; void *end; int error; - if (!(EXT3_I(inode)->i_state & EXT3_STATE_XATTR)) + if (!ext3_test_inode_state(inode, EXT3_STATE_XATTR)) return 0; error = ext3_get_inode_loc(inode, &iloc); if (error) @@ -409,7 +409,7 @@ ext3_xattr_ibody_list(struct inode *inode, char *buffer, size_t buffer_size) error = ext3_xattr_check_names(IFIRST(header), end); if (error) goto cleanup; - error = ext3_xattr_list_entries(inode, IFIRST(header), + error = ext3_xattr_list_entries(dentry, IFIRST(header), buffer, buffer_size); cleanup: @@ -427,13 +427,13 @@ cleanup: * Returns a negative error number on failure, or the number of bytes * used / required on success. */ -int -ext3_xattr_list(struct inode *inode, char *buffer, size_t buffer_size) +static int +ext3_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size) { int i_error, b_error; - down_read(&EXT3_I(inode)->xattr_sem); - i_error = ext3_xattr_ibody_list(inode, buffer, buffer_size); + down_read(&EXT3_I(dentry->d_inode)->xattr_sem); + i_error = ext3_xattr_ibody_list(dentry, buffer, buffer_size); if (i_error < 0) { b_error = 0; } else { @@ -441,11 +441,11 @@ ext3_xattr_list(struct inode *inode, char *buffer, size_t buffer_size) buffer += i_error; buffer_size -= i_error; } - b_error = ext3_xattr_block_list(inode, buffer, buffer_size); + b_error = ext3_xattr_block_list(dentry, buffer, buffer_size); if (b_error < 0) i_error = 0; } - up_read(&EXT3_I(inode)->xattr_sem); + up_read(&EXT3_I(dentry->d_inode)->xattr_sem); return i_error + b_error; } @@ -461,7 +461,6 @@ static void ext3_xattr_update_super_block(handle_t *handle, if (ext3_journal_get_write_access(handle, EXT3_SB(sb)->s_sbh) == 0) { EXT3_SET_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_EXT_ATTR); - sb->s_dirt = 1; ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh); } } @@ -496,7 +495,7 @@ ext3_xattr_release_block(handle_t *handle, struct inode *inode, error = ext3_journal_dirty_metadata(handle, bh); if (IS_SYNC(inode)) handle->h_sync = 1; - DQUOT_FREE_BLOCK(inode, 1); + dquot_free_block(inode, 1); ea_bdebug(bh, "refcount now=%d; releasing", le32_to_cpu(BHDR(bh)->h_refcount)); if (ce) @@ -649,7 +648,7 @@ ext3_xattr_block_find(struct inode *inode, struct ext3_xattr_info *i, atomic_read(&(bs->bh->b_count)), le32_to_cpu(BHDR(bs->bh)->h_refcount)); if (ext3_xattr_check_block(bs->bh)) { - ext3_error(sb, __FUNCTION__, + ext3_error(sb, __func__, "inode %lu: bad block "E3FSBLK, inode->i_ino, EXT3_I(inode)->i_file_acl); error = -EIO; @@ -771,8 +770,8 @@ inserted: else { /* The old block is released after updating the inode. */ - error = -EDQUOT; - if (DQUOT_ALLOC_BLOCK(inode, 1)) + error = dquot_alloc_block(inode, 1); + if (error) goto cleanup; error = ext3_journal_get_write_access(handle, new_bh); @@ -797,21 +796,27 @@ inserted: get_bh(new_bh); } else { /* We need to allocate a new block */ - ext3_fsblk_t goal = le32_to_cpu( - EXT3_SB(sb)->s_es->s_first_data_block) + - (ext3_fsblk_t)EXT3_I(inode)->i_block_group * - EXT3_BLOCKS_PER_GROUP(sb); - ext3_fsblk_t block = ext3_new_block(handle, inode, - goal, &error); + ext3_fsblk_t goal = ext3_group_first_block_no(sb, + EXT3_I(inode)->i_block_group); + ext3_fsblk_t block; + + /* + * Protect us agaist concurrent allocations to the + * same inode from ext3_..._writepage(). Reservation + * code does not expect racing allocations. + */ + mutex_lock(&EXT3_I(inode)->truncate_mutex); + block = ext3_new_block(handle, inode, goal, &error); + mutex_unlock(&EXT3_I(inode)->truncate_mutex); if (error) goto cleanup; ea_idebug(inode, "creating block %d", block); new_bh = sb_getblk(sb, block); - if (!new_bh) { + if (unlikely(!new_bh)) { getblk_failed: ext3_free_blocks(handle, inode, block, 1); - error = -EIO; + error = -ENOMEM; goto cleanup; } lock_buffer(new_bh); @@ -848,11 +853,11 @@ cleanup: return error; cleanup_dquot: - DQUOT_FREE_BLOCK(inode, 1); + dquot_free_block(inode, 1); goto cleanup; bad_block: - ext3_error(inode->i_sb, __FUNCTION__, + ext3_error(inode->i_sb, __func__, "inode %lu: bad block "E3FSBLK, inode->i_ino, EXT3_I(inode)->i_file_acl); goto cleanup; @@ -880,7 +885,7 @@ ext3_xattr_ibody_find(struct inode *inode, struct ext3_xattr_info *i, is->s.base = is->s.first = IFIRST(header); is->s.here = is->s.first; is->s.end = (void *)raw_inode + EXT3_SB(inode->i_sb)->s_inode_size; - if (EXT3_I(inode)->i_state & EXT3_STATE_XATTR) { + if (ext3_test_inode_state(inode, EXT3_STATE_XATTR)) { error = ext3_xattr_check_names(IFIRST(header), is->s.end); if (error) return error; @@ -912,10 +917,10 @@ ext3_xattr_ibody_set(handle_t *handle, struct inode *inode, header = IHDR(inode, ext3_raw_inode(&is->iloc)); if (!IS_LAST_ENTRY(s->first)) { header->h_magic = cpu_to_le32(EXT3_XATTR_MAGIC); - EXT3_I(inode)->i_state |= EXT3_STATE_XATTR; + ext3_set_inode_state(inode, EXT3_STATE_XATTR); } else { header->h_magic = cpu_to_le32(0); - EXT3_I(inode)->i_state &= ~EXT3_STATE_XATTR; + ext3_clear_inode_state(inode, EXT3_STATE_XATTR); } return 0; } @@ -923,7 +928,7 @@ ext3_xattr_ibody_set(handle_t *handle, struct inode *inode, /* * ext3_xattr_set_handle() * - * Create, replace or remove an extended attribute for this inode. Buffer + * Create, replace or remove an extended attribute for this inode. Value * is NULL to remove an existing extended attribute, and non-NULL to * either replace an existing extended attribute, or create a new extended * attribute. The flags XATTR_REPLACE and XATTR_CREATE @@ -961,10 +966,14 @@ ext3_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, if (error) goto cleanup; - if (EXT3_I(inode)->i_state & EXT3_STATE_NEW) { + error = ext3_journal_get_write_access(handle, is.iloc.bh); + if (error) + goto cleanup; + + if (ext3_test_inode_state(inode, EXT3_STATE_NEW)) { struct ext3_inode *raw_inode = ext3_raw_inode(&is.iloc); memset(raw_inode, 0, EXT3_SB(inode->i_sb)->s_inode_size); - EXT3_I(inode)->i_state &= ~EXT3_STATE_NEW; + ext3_clear_inode_state(inode, EXT3_STATE_NEW); } error = ext3_xattr_ibody_find(inode, &i, &is); @@ -986,9 +995,6 @@ ext3_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, if (flags & XATTR_CREATE) goto cleanup; } - error = ext3_journal_get_write_access(handle, is.iloc.bh); - if (error) - goto cleanup; if (!value) { if (!is.s.not_found) error = ext3_xattr_ibody_set(handle, inode, &i, &is); @@ -1000,6 +1006,11 @@ ext3_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, i.value = NULL; error = ext3_xattr_block_set(handle, inode, &i, &bs); } else if (error == -ENOSPC) { + if (EXT3_I(inode)->i_file_acl && !bs.s.base) { + error = ext3_xattr_block_find(inode, &i, &bs); + if (error) + goto cleanup; + } error = ext3_xattr_block_set(handle, inode, &i, &bs); if (error) goto cleanup; @@ -1081,14 +1092,14 @@ ext3_xattr_delete_inode(handle_t *handle, struct inode *inode) goto cleanup; bh = sb_bread(inode->i_sb, EXT3_I(inode)->i_file_acl); if (!bh) { - ext3_error(inode->i_sb, __FUNCTION__, + ext3_error(inode->i_sb, __func__, "inode %lu: block "E3FSBLK" read error", inode->i_ino, EXT3_I(inode)->i_file_acl); goto cleanup; } if (BHDR(bh)->h_magic != cpu_to_le32(EXT3_XATTR_MAGIC) || BHDR(bh)->h_blocks != cpu_to_le32(1)) { - ext3_error(inode->i_sb, __FUNCTION__, + ext3_error(inode->i_sb, __func__, "inode %lu: bad block "E3FSBLK, inode->i_ino, EXT3_I(inode)->i_file_acl); goto cleanup; @@ -1126,12 +1137,12 @@ ext3_xattr_cache_insert(struct buffer_head *bh) struct mb_cache_entry *ce; int error; - ce = mb_cache_entry_alloc(ext3_xattr_cache); + ce = mb_cache_entry_alloc(ext3_xattr_cache, GFP_NOFS); if (!ce) { ea_bdebug(bh, "out of memory"); return; } - error = mb_cache_entry_insert(ce, bh->b_bdev, bh->b_blocknr, &hash); + error = mb_cache_entry_insert(ce, bh->b_bdev, bh->b_blocknr, hash); if (error) { mb_cache_entry_free(ce); if (error == -EBUSY) { @@ -1203,8 +1214,8 @@ ext3_xattr_cache_find(struct inode *inode, struct ext3_xattr_header *header, return NULL; /* never share */ ea_idebug(inode, "looking for cached blocks [%x]", (int)hash); again: - ce = mb_cache_entry_find_first(ext3_xattr_cache, 0, - inode->i_sb->s_bdev, hash); + ce = mb_cache_entry_find_first(ext3_xattr_cache, inode->i_sb->s_bdev, + hash); while (ce) { struct buffer_head *bh; @@ -1215,7 +1226,7 @@ again: } bh = sb_bread(inode->i_sb, ce->e_block); if (!bh) { - ext3_error(inode->i_sb, __FUNCTION__, + ext3_error(inode->i_sb, __func__, "inode %lu: block %lu read error", inode->i_ino, (unsigned long) ce->e_block); } else if (le32_to_cpu(BHDR(bh)->h_refcount) >= @@ -1229,7 +1240,7 @@ again: return bh; } brelse(bh); - ce = mb_cache_entry_find_next(ce, 0, inode->i_sb->s_bdev, hash); + ce = mb_cache_entry_find_next(ce, inode->i_sb->s_bdev, hash); } return NULL; } @@ -1305,9 +1316,7 @@ static void ext3_xattr_rehash(struct ext3_xattr_header *header, int __init init_ext3_xattr(void) { - ext3_xattr_cache = mb_cache_create("ext3_xattr", NULL, - sizeof(struct mb_cache_entry) + - sizeof(((struct mb_cache_entry *) 0)->e_indexes[0]), 1, 6); + ext3_xattr_cache = mb_cache_create("ext3_xattr", 6); if (!ext3_xattr_cache) return -ENOMEM; return 0; diff --git a/fs/ext3/xattr.h b/fs/ext3/xattr.h index 6b1ae1c6182..32e93ebf803 100644 --- a/fs/ext3/xattr.h +++ b/fs/ext3/xattr.h @@ -58,16 +58,13 @@ struct ext3_xattr_entry { # ifdef CONFIG_EXT3_FS_XATTR -extern struct xattr_handler ext3_xattr_user_handler; -extern struct xattr_handler ext3_xattr_trusted_handler; -extern struct xattr_handler ext3_xattr_acl_access_handler; -extern struct xattr_handler ext3_xattr_acl_default_handler; -extern struct xattr_handler ext3_xattr_security_handler; +extern const struct xattr_handler ext3_xattr_user_handler; +extern const struct xattr_handler ext3_xattr_trusted_handler; +extern const struct xattr_handler ext3_xattr_security_handler; extern ssize_t ext3_listxattr(struct dentry *, char *, size_t); extern int ext3_xattr_get(struct inode *, int, const char *, void *, size_t); -extern int ext3_xattr_list(struct inode *, char *, size_t); extern int ext3_xattr_set(struct inode *, int, const char *, const void *, size_t, int); extern int ext3_xattr_set_handle(handle_t *, struct inode *, int, const char *, const void *, size_t, int); @@ -77,7 +74,7 @@ extern void ext3_xattr_put_super(struct super_block *); extern int init_ext3_xattr(void); extern void exit_ext3_xattr(void); -extern struct xattr_handler *ext3_xattr_handlers[]; +extern const struct xattr_handler *ext3_xattr_handlers[]; # else /* CONFIG_EXT3_FS_XATTR */ @@ -89,12 +86,6 @@ ext3_xattr_get(struct inode *inode, int name_index, const char *name, } static inline int -ext3_xattr_list(struct inode *inode, void *buffer, size_t size) -{ - return -EOPNOTSUPP; -} - -static inline int ext3_xattr_set(struct inode *inode, int name_index, const char *name, const void *value, size_t size, int flags) { @@ -135,10 +126,10 @@ exit_ext3_xattr(void) #ifdef CONFIG_EXT3_FS_SECURITY extern int ext3_init_security(handle_t *handle, struct inode *inode, - struct inode *dir); + struct inode *dir, const struct qstr *qstr); #else static inline int ext3_init_security(handle_t *handle, struct inode *inode, - struct inode *dir) + struct inode *dir, const struct qstr *qstr) { return 0; } diff --git a/fs/ext3/xattr_security.c b/fs/ext3/xattr_security.c index 821efaf2b94..722c2bf9645 100644 --- a/fs/ext3/xattr_security.c +++ b/fs/ext3/xattr_security.c @@ -3,19 +3,15 @@ * Handler for storing security labels as extended attributes. */ -#include <linux/module.h> -#include <linux/string.h> -#include <linux/fs.h> -#include <linux/ext3_jbd.h> -#include <linux/ext3_fs.h> #include <linux/security.h> +#include "ext3.h" #include "xattr.h" static size_t -ext3_xattr_security_list(struct inode *inode, char *list, size_t list_size, - const char *name, size_t name_len) +ext3_xattr_security_list(struct dentry *dentry, char *list, size_t list_size, + const char *name, size_t name_len, int type) { - const size_t prefix_len = sizeof(XATTR_SECURITY_PREFIX)-1; + const size_t prefix_len = XATTR_SECURITY_PREFIX_LEN; const size_t total_len = prefix_len + name_len + 1; @@ -28,47 +24,53 @@ ext3_xattr_security_list(struct inode *inode, char *list, size_t list_size, } static int -ext3_xattr_security_get(struct inode *inode, const char *name, - void *buffer, size_t size) +ext3_xattr_security_get(struct dentry *dentry, const char *name, + void *buffer, size_t size, int type) { if (strcmp(name, "") == 0) return -EINVAL; - return ext3_xattr_get(inode, EXT3_XATTR_INDEX_SECURITY, name, - buffer, size); + return ext3_xattr_get(dentry->d_inode, EXT3_XATTR_INDEX_SECURITY, + name, buffer, size); } static int -ext3_xattr_security_set(struct inode *inode, const char *name, - const void *value, size_t size, int flags) +ext3_xattr_security_set(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags, int type) { if (strcmp(name, "") == 0) return -EINVAL; - return ext3_xattr_set(inode, EXT3_XATTR_INDEX_SECURITY, name, - value, size, flags); + return ext3_xattr_set(dentry->d_inode, EXT3_XATTR_INDEX_SECURITY, + name, value, size, flags); } -int -ext3_init_security(handle_t *handle, struct inode *inode, struct inode *dir) +static int ext3_initxattrs(struct inode *inode, + const struct xattr *xattr_array, + void *fs_info) { - int err; - size_t len; - void *value; - char *name; + const struct xattr *xattr; + handle_t *handle = fs_info; + int err = 0; - err = security_inode_init_security(inode, dir, &name, &value, &len); - if (err) { - if (err == -EOPNOTSUPP) - return 0; - return err; + for (xattr = xattr_array; xattr->name != NULL; xattr++) { + err = ext3_xattr_set_handle(handle, inode, + EXT3_XATTR_INDEX_SECURITY, + xattr->name, xattr->value, + xattr->value_len, 0); + if (err < 0) + break; } - err = ext3_xattr_set_handle(handle, inode, EXT3_XATTR_INDEX_SECURITY, - name, value, len, 0); - kfree(name); - kfree(value); return err; } -struct xattr_handler ext3_xattr_security_handler = { +int +ext3_init_security(handle_t *handle, struct inode *inode, struct inode *dir, + const struct qstr *qstr) +{ + return security_inode_init_security(inode, dir, qstr, + &ext3_initxattrs, handle); +} + +const struct xattr_handler ext3_xattr_security_handler = { .prefix = XATTR_SECURITY_PREFIX, .list = ext3_xattr_security_list, .get = ext3_xattr_security_get, diff --git a/fs/ext3/xattr_trusted.c b/fs/ext3/xattr_trusted.c index 0327497a55c..d75727cc67f 100644 --- a/fs/ext3/xattr_trusted.c +++ b/fs/ext3/xattr_trusted.c @@ -5,21 +5,14 @@ * Copyright (C) 2003 by Andreas Gruenbacher, <a.gruenbacher@computer.org> */ -#include <linux/module.h> -#include <linux/string.h> -#include <linux/capability.h> -#include <linux/fs.h> -#include <linux/ext3_jbd.h> -#include <linux/ext3_fs.h> +#include "ext3.h" #include "xattr.h" -#define XATTR_TRUSTED_PREFIX "trusted." - static size_t -ext3_xattr_trusted_list(struct inode *inode, char *list, size_t list_size, - const char *name, size_t name_len) +ext3_xattr_trusted_list(struct dentry *dentry, char *list, size_t list_size, + const char *name, size_t name_len, int type) { - const size_t prefix_len = sizeof(XATTR_TRUSTED_PREFIX)-1; + const size_t prefix_len = XATTR_TRUSTED_PREFIX_LEN; const size_t total_len = prefix_len + name_len + 1; if (!capable(CAP_SYS_ADMIN)) @@ -34,26 +27,26 @@ ext3_xattr_trusted_list(struct inode *inode, char *list, size_t list_size, } static int -ext3_xattr_trusted_get(struct inode *inode, const char *name, - void *buffer, size_t size) +ext3_xattr_trusted_get(struct dentry *dentry, const char *name, + void *buffer, size_t size, int type) { if (strcmp(name, "") == 0) return -EINVAL; - return ext3_xattr_get(inode, EXT3_XATTR_INDEX_TRUSTED, name, - buffer, size); + return ext3_xattr_get(dentry->d_inode, EXT3_XATTR_INDEX_TRUSTED, + name, buffer, size); } static int -ext3_xattr_trusted_set(struct inode *inode, const char *name, - const void *value, size_t size, int flags) +ext3_xattr_trusted_set(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags, int type) { if (strcmp(name, "") == 0) return -EINVAL; - return ext3_xattr_set(inode, EXT3_XATTR_INDEX_TRUSTED, name, + return ext3_xattr_set(dentry->d_inode, EXT3_XATTR_INDEX_TRUSTED, name, value, size, flags); } -struct xattr_handler ext3_xattr_trusted_handler = { +const struct xattr_handler ext3_xattr_trusted_handler = { .prefix = XATTR_TRUSTED_PREFIX, .list = ext3_xattr_trusted_list, .get = ext3_xattr_trusted_get, diff --git a/fs/ext3/xattr_user.c b/fs/ext3/xattr_user.c index 1abd8f92c44..5612af3567e 100644 --- a/fs/ext3/xattr_user.c +++ b/fs/ext3/xattr_user.c @@ -5,23 +5,17 @@ * Copyright (C) 2001 by Andreas Gruenbacher, <a.gruenbacher@computer.org> */ -#include <linux/module.h> -#include <linux/string.h> -#include <linux/fs.h> -#include <linux/ext3_jbd.h> -#include <linux/ext3_fs.h> +#include "ext3.h" #include "xattr.h" -#define XATTR_USER_PREFIX "user." - static size_t -ext3_xattr_user_list(struct inode *inode, char *list, size_t list_size, - const char *name, size_t name_len) +ext3_xattr_user_list(struct dentry *dentry, char *list, size_t list_size, + const char *name, size_t name_len, int type) { - const size_t prefix_len = sizeof(XATTR_USER_PREFIX)-1; + const size_t prefix_len = XATTR_USER_PREFIX_LEN; const size_t total_len = prefix_len + name_len + 1; - if (!test_opt(inode->i_sb, XATTR_USER)) + if (!test_opt(dentry->d_sb, XATTR_USER)) return 0; if (list && total_len <= list_size) { @@ -33,29 +27,30 @@ ext3_xattr_user_list(struct inode *inode, char *list, size_t list_size, } static int -ext3_xattr_user_get(struct inode *inode, const char *name, - void *buffer, size_t size) +ext3_xattr_user_get(struct dentry *dentry, const char *name, void *buffer, + size_t size, int type) { if (strcmp(name, "") == 0) return -EINVAL; - if (!test_opt(inode->i_sb, XATTR_USER)) + if (!test_opt(dentry->d_sb, XATTR_USER)) return -EOPNOTSUPP; - return ext3_xattr_get(inode, EXT3_XATTR_INDEX_USER, name, buffer, size); + return ext3_xattr_get(dentry->d_inode, EXT3_XATTR_INDEX_USER, + name, buffer, size); } static int -ext3_xattr_user_set(struct inode *inode, const char *name, - const void *value, size_t size, int flags) +ext3_xattr_user_set(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags, int type) { if (strcmp(name, "") == 0) return -EINVAL; - if (!test_opt(inode->i_sb, XATTR_USER)) + if (!test_opt(dentry->d_sb, XATTR_USER)) return -EOPNOTSUPP; - return ext3_xattr_set(inode, EXT3_XATTR_INDEX_USER, name, - value, size, flags); + return ext3_xattr_set(dentry->d_inode, EXT3_XATTR_INDEX_USER, + name, value, size, flags); } -struct xattr_handler ext3_xattr_user_handler = { +const struct xattr_handler ext3_xattr_user_handler = { .prefix = XATTR_USER_PREFIX, .list = ext3_xattr_user_list, .get = ext3_xattr_user_get, |
