aboutsummaryrefslogtreecommitdiff
path: root/fs/ext3
diff options
context:
space:
mode:
Diffstat (limited to 'fs/ext3')
-rw-r--r--fs/ext3/acl.c293
-rw-r--r--fs/ext3/acl.h13
-rw-r--r--fs/ext3/balloc.c351
-rw-r--r--fs/ext3/bitmap.c16
-rw-r--r--fs/ext3/dir.c375
-rw-r--r--fs/ext3/ext3.h1326
-rw-r--r--fs/ext3/ext3_jbd.c2
-rw-r--r--fs/ext3/file.c20
-rw-r--r--fs/ext3/fsync.c38
-rw-r--r--fs/ext3/hash.c8
-rw-r--r--fs/ext3/ialloc.c97
-rw-r--r--fs/ext3/inode.c510
-rw-r--r--fs/ext3/ioctl.c87
-rw-r--r--fs/ext3/namei.c400
-rw-r--r--fs/ext3/namei.h19
-rw-r--r--fs/ext3/resize.c84
-rw-r--r--fs/ext3/super.c392
-rw-r--r--fs/ext3/symlink.c4
-rw-r--r--fs/ext3/xattr.c33
-rw-r--r--fs/ext3/xattr.h6
-rw-r--r--fs/ext3/xattr_security.c43
-rw-r--r--fs/ext3/xattr_trusted.c7
-rw-r--r--fs/ext3/xattr_user.c6
23 files changed, 2900 insertions, 1230 deletions
diff --git a/fs/ext3/acl.c b/fs/ext3/acl.c
index 8a11fe21218..8bbaf5bcf98 100644
--- a/fs/ext3/acl.c
+++ b/fs/ext3/acl.c
@@ -4,13 +4,7 @@
* Copyright (C) 2001-2003 Andreas Gruenbacher, <agruen@suse.de>
*/
-#include <linux/init.h>
-#include <linux/sched.h>
-#include <linux/slab.h>
-#include <linux/capability.h>
-#include <linux/fs.h>
-#include <linux/ext3_jbd.h>
-#include <linux/ext3_fs.h>
+#include "ext3.h"
#include "xattr.h"
#include "acl.h"
@@ -54,16 +48,23 @@ ext3_acl_from_disk(const void *value, size_t size)
case ACL_OTHER:
value = (char *)value +
sizeof(ext3_acl_entry_short);
- acl->a_entries[n].e_id = ACL_UNDEFINED_ID;
break;
case ACL_USER:
+ value = (char *)value + sizeof(ext3_acl_entry);
+ if ((char *)value > end)
+ goto fail;
+ acl->a_entries[n].e_uid =
+ make_kuid(&init_user_ns,
+ le32_to_cpu(entry->e_id));
+ break;
case ACL_GROUP:
value = (char *)value + sizeof(ext3_acl_entry);
if ((char *)value > end)
goto fail;
- acl->a_entries[n].e_id =
- le32_to_cpu(entry->e_id);
+ acl->a_entries[n].e_gid =
+ make_kgid(&init_user_ns,
+ le32_to_cpu(entry->e_id));
break;
default:
@@ -97,14 +98,19 @@ ext3_acl_to_disk(const struct posix_acl *acl, size_t *size)
ext_acl->a_version = cpu_to_le32(EXT3_ACL_VERSION);
e = (char *)ext_acl + sizeof(ext3_acl_header);
for (n=0; n < acl->a_count; n++) {
+ const struct posix_acl_entry *acl_e = &acl->a_entries[n];
ext3_acl_entry *entry = (ext3_acl_entry *)e;
- entry->e_tag = cpu_to_le16(acl->a_entries[n].e_tag);
- entry->e_perm = cpu_to_le16(acl->a_entries[n].e_perm);
- switch(acl->a_entries[n].e_tag) {
+ entry->e_tag = cpu_to_le16(acl_e->e_tag);
+ entry->e_perm = cpu_to_le16(acl_e->e_perm);
+ switch(acl_e->e_tag) {
case ACL_USER:
+ entry->e_id = cpu_to_le32(
+ from_kuid(&init_user_ns, acl_e->e_uid));
+ e += sizeof(ext3_acl_entry);
+ break;
case ACL_GROUP:
- entry->e_id =
- cpu_to_le32(acl->a_entries[n].e_id);
+ entry->e_id = cpu_to_le32(
+ from_kgid(&init_user_ns, acl_e->e_gid));
e += sizeof(ext3_acl_entry);
break;
@@ -131,7 +137,7 @@ fail:
*
* inode->i_mutex: don't care
*/
-static struct posix_acl *
+struct posix_acl *
ext3_get_acl(struct inode *inode, int type)
{
int name_index;
@@ -139,13 +145,6 @@ ext3_get_acl(struct inode *inode, int type)
struct posix_acl *acl;
int retval;
- if (!test_opt(inode->i_sb, POSIX_ACL))
- return NULL;
-
- acl = get_cached_acl(inode, type);
- if (acl != ACL_NOT_CACHED)
- return acl;
-
switch (type) {
case ACL_TYPE_ACCESS:
name_index = EXT3_XATTR_INDEX_POSIX_ACL_ACCESS;
@@ -184,7 +183,7 @@ ext3_get_acl(struct inode *inode, int type)
* inode->i_mutex: down unless called from ext3_new_inode
*/
static int
-ext3_set_acl(handle_t *handle, struct inode *inode, int type,
+__ext3_set_acl(handle_t *handle, struct inode *inode, int type,
struct posix_acl *acl)
{
int name_index;
@@ -192,19 +191,14 @@ ext3_set_acl(handle_t *handle, struct inode *inode, int type,
size_t size = 0;
int error;
- if (S_ISLNK(inode->i_mode))
- return -EOPNOTSUPP;
-
switch(type) {
case ACL_TYPE_ACCESS:
name_index = EXT3_XATTR_INDEX_POSIX_ACL_ACCESS;
if (acl) {
- mode_t mode = inode->i_mode;
- error = posix_acl_equiv_mode(acl, &mode);
+ error = posix_acl_equiv_mode(acl, &inode->i_mode);
if (error < 0)
return error;
else {
- inode->i_mode = mode;
inode->i_ctime = CURRENT_TIME_SEC;
ext3_mark_inode_dirty(handle, inode);
if (error == 0)
@@ -240,19 +234,20 @@ ext3_set_acl(handle_t *handle, struct inode *inode, int type,
}
int
-ext3_check_acl(struct inode *inode, int mask)
+ext3_set_acl(struct inode *inode, struct posix_acl *acl, int type)
{
- struct posix_acl *acl = ext3_get_acl(inode, ACL_TYPE_ACCESS);
-
- if (IS_ERR(acl))
- return PTR_ERR(acl);
- if (acl) {
- int error = posix_acl_permission(inode, acl, mask);
- posix_acl_release(acl);
- return error;
- }
+ handle_t *handle;
+ int error, retries = 0;
- return -EAGAIN;
+retry:
+ handle = ext3_journal_start(inode, EXT3_DATA_TRANS_BLOCKS(inode->i_sb));
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
+ error = __ext3_set_acl(handle, inode, type, acl);
+ ext3_journal_stop(handle);
+ if (error == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries))
+ goto retry;
+ return error;
}
/*
@@ -264,211 +259,23 @@ ext3_check_acl(struct inode *inode, int mask)
int
ext3_init_acl(handle_t *handle, struct inode *inode, struct inode *dir)
{
- struct posix_acl *acl = NULL;
- int error = 0;
-
- if (!S_ISLNK(inode->i_mode)) {
- if (test_opt(dir->i_sb, POSIX_ACL)) {
- acl = ext3_get_acl(dir, ACL_TYPE_DEFAULT);
- if (IS_ERR(acl))
- return PTR_ERR(acl);
- }
- if (!acl)
- inode->i_mode &= ~current_umask();
- }
- if (test_opt(inode->i_sb, POSIX_ACL) && acl) {
- struct posix_acl *clone;
- mode_t mode;
+ struct posix_acl *default_acl, *acl;
+ int error;
- if (S_ISDIR(inode->i_mode)) {
- error = ext3_set_acl(handle, inode,
- ACL_TYPE_DEFAULT, acl);
- if (error)
- goto cleanup;
- }
- clone = posix_acl_clone(acl, GFP_NOFS);
- error = -ENOMEM;
- if (!clone)
- goto cleanup;
+ error = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl);
+ if (error)
+ return error;
- mode = inode->i_mode;
- error = posix_acl_create_masq(clone, &mode);
- if (error >= 0) {
- inode->i_mode = mode;
- if (error > 0) {
- /* This is an extended ACL */
- error = ext3_set_acl(handle, inode,
- ACL_TYPE_ACCESS, clone);
- }
- }
- posix_acl_release(clone);
+ if (default_acl) {
+ error = __ext3_set_acl(handle, inode, ACL_TYPE_DEFAULT,
+ default_acl);
+ posix_acl_release(default_acl);
}
-cleanup:
- posix_acl_release(acl);
- return error;
-}
-
-/*
- * Does chmod for an inode that may have an Access Control List. The
- * inode->i_mode field must be updated to the desired value by the caller
- * before calling this function.
- * Returns 0 on success, or a negative error number.
- *
- * We change the ACL rather than storing some ACL entries in the file
- * mode permission bits (which would be more efficient), because that
- * would break once additional permissions (like ACL_APPEND, ACL_DELETE
- * for directories) are added. There are no more bits available in the
- * file mode.
- *
- * inode->i_mutex: down
- */
-int
-ext3_acl_chmod(struct inode *inode)
-{
- struct posix_acl *acl, *clone;
- int error;
-
- if (S_ISLNK(inode->i_mode))
- return -EOPNOTSUPP;
- if (!test_opt(inode->i_sb, POSIX_ACL))
- return 0;
- acl = ext3_get_acl(inode, ACL_TYPE_ACCESS);
- if (IS_ERR(acl) || !acl)
- return PTR_ERR(acl);
- clone = posix_acl_clone(acl, GFP_KERNEL);
- posix_acl_release(acl);
- if (!clone)
- return -ENOMEM;
- error = posix_acl_chmod_masq(clone, inode->i_mode);
- if (!error) {
- handle_t *handle;
- int retries = 0;
-
- retry:
- handle = ext3_journal_start(inode,
- EXT3_DATA_TRANS_BLOCKS(inode->i_sb));
- if (IS_ERR(handle)) {
- error = PTR_ERR(handle);
- ext3_std_error(inode->i_sb, error);
- goto out;
- }
- error = ext3_set_acl(handle, inode, ACL_TYPE_ACCESS, clone);
- ext3_journal_stop(handle);
- if (error == -ENOSPC &&
- ext3_should_retry_alloc(inode->i_sb, &retries))
- goto retry;
+ if (acl) {
+ if (!error)
+ error = __ext3_set_acl(handle, inode, ACL_TYPE_ACCESS,
+ acl);
+ posix_acl_release(acl);
}
-out:
- posix_acl_release(clone);
- return error;
-}
-
-/*
- * Extended attribute handlers
- */
-static size_t
-ext3_xattr_list_acl_access(struct dentry *dentry, char *list, size_t list_len,
- const char *name, size_t name_len, int type)
-{
- const size_t size = sizeof(POSIX_ACL_XATTR_ACCESS);
-
- if (!test_opt(dentry->d_sb, POSIX_ACL))
- return 0;
- if (list && size <= list_len)
- memcpy(list, POSIX_ACL_XATTR_ACCESS, size);
- return size;
-}
-
-static size_t
-ext3_xattr_list_acl_default(struct dentry *dentry, char *list, size_t list_len,
- const char *name, size_t name_len, int type)
-{
- const size_t size = sizeof(POSIX_ACL_XATTR_DEFAULT);
-
- if (!test_opt(dentry->d_sb, POSIX_ACL))
- return 0;
- if (list && size <= list_len)
- memcpy(list, POSIX_ACL_XATTR_DEFAULT, size);
- return size;
-}
-
-static int
-ext3_xattr_get_acl(struct dentry *dentry, const char *name, void *buffer,
- size_t size, int type)
-{
- struct posix_acl *acl;
- int error;
-
- if (strcmp(name, "") != 0)
- return -EINVAL;
- if (!test_opt(dentry->d_sb, POSIX_ACL))
- return -EOPNOTSUPP;
-
- acl = ext3_get_acl(dentry->d_inode, type);
- if (IS_ERR(acl))
- return PTR_ERR(acl);
- if (acl == NULL)
- return -ENODATA;
- error = posix_acl_to_xattr(acl, buffer, size);
- posix_acl_release(acl);
-
return error;
}
-
-static int
-ext3_xattr_set_acl(struct dentry *dentry, const char *name, const void *value,
- size_t size, int flags, int type)
-{
- struct inode *inode = dentry->d_inode;
- handle_t *handle;
- struct posix_acl *acl;
- int error, retries = 0;
-
- if (strcmp(name, "") != 0)
- return -EINVAL;
- if (!test_opt(inode->i_sb, POSIX_ACL))
- return -EOPNOTSUPP;
- if (!is_owner_or_cap(inode))
- return -EPERM;
-
- if (value) {
- acl = posix_acl_from_xattr(value, size);
- if (IS_ERR(acl))
- return PTR_ERR(acl);
- else if (acl) {
- error = posix_acl_valid(acl);
- if (error)
- goto release_and_out;
- }
- } else
- acl = NULL;
-
-retry:
- handle = ext3_journal_start(inode, EXT3_DATA_TRANS_BLOCKS(inode->i_sb));
- if (IS_ERR(handle))
- return PTR_ERR(handle);
- error = ext3_set_acl(handle, inode, type, acl);
- ext3_journal_stop(handle);
- if (error == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries))
- goto retry;
-
-release_and_out:
- posix_acl_release(acl);
- return error;
-}
-
-const struct xattr_handler ext3_xattr_acl_access_handler = {
- .prefix = POSIX_ACL_XATTR_ACCESS,
- .flags = ACL_TYPE_ACCESS,
- .list = ext3_xattr_list_acl_access,
- .get = ext3_xattr_get_acl,
- .set = ext3_xattr_set_acl,
-};
-
-const struct xattr_handler ext3_xattr_acl_default_handler = {
- .prefix = POSIX_ACL_XATTR_DEFAULT,
- .flags = ACL_TYPE_DEFAULT,
- .list = ext3_xattr_list_acl_default,
- .get = ext3_xattr_get_acl,
- .set = ext3_xattr_set_acl,
-};
diff --git a/fs/ext3/acl.h b/fs/ext3/acl.h
index 597334626de..ea1c69edab9 100644
--- a/fs/ext3/acl.h
+++ b/fs/ext3/acl.h
@@ -54,19 +54,14 @@ static inline int ext3_acl_count(size_t size)
#ifdef CONFIG_EXT3_FS_POSIX_ACL
/* acl.c */
-extern int ext3_check_acl (struct inode *, int);
-extern int ext3_acl_chmod (struct inode *);
+extern struct posix_acl *ext3_get_acl(struct inode *inode, int type);
+extern int ext3_set_acl(struct inode *inode, struct posix_acl *acl, int type);
extern int ext3_init_acl (handle_t *, struct inode *, struct inode *);
#else /* CONFIG_EXT3_FS_POSIX_ACL */
#include <linux/sched.h>
-#define ext3_check_acl NULL
-
-static inline int
-ext3_acl_chmod(struct inode *inode)
-{
- return 0;
-}
+#define ext3_get_acl NULL
+#define ext3_set_acl NULL
static inline int
ext3_init_acl(handle_t *handle, struct inode *inode, struct inode *dir)
diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c
index b3db2264942..158b5d4ce06 100644
--- a/fs/ext3/balloc.c
+++ b/fs/ext3/balloc.c
@@ -11,15 +11,9 @@
* David S. Miller (davem@caip.rutgers.edu), 1995
*/
-#include <linux/time.h>
-#include <linux/capability.h>
-#include <linux/fs.h>
-#include <linux/slab.h>
-#include <linux/jbd.h>
-#include <linux/ext3_fs.h>
-#include <linux/ext3_jbd.h>
#include <linux/quotaops.h>
-#include <linux/buffer_head.h>
+#include <linux/blkdev.h>
+#include "ext3.h"
/*
* balloc.c contains the blocks allocation and deallocation routines
@@ -39,6 +33,21 @@
#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1)
+/*
+ * Calculate the block group number and offset, given a block number
+ */
+static void ext3_get_group_no_and_offset(struct super_block *sb,
+ ext3_fsblk_t blocknr, unsigned long *blockgrpp, ext3_grpblk_t *offsetp)
+{
+ struct ext3_super_block *es = EXT3_SB(sb)->s_es;
+
+ blocknr = blocknr - le32_to_cpu(es->s_first_data_block);
+ if (offsetp)
+ *offsetp = blocknr % EXT3_BLOCKS_PER_GROUP(sb);
+ if (blockgrpp)
+ *blockgrpp = blocknr / EXT3_BLOCKS_PER_GROUP(sb);
+}
+
/**
* ext3_get_group_desc() -- load group descriptor from disk
* @sb: super block
@@ -145,6 +154,7 @@ read_block_bitmap(struct super_block *sb, unsigned int block_group)
desc = ext3_get_group_desc(sb, block_group, NULL);
if (!desc)
return NULL;
+ trace_ext3_read_block_bitmap(sb, block_group);
bitmap_blk = le32_to_cpu(desc->bg_block_bitmap);
bh = sb_getblk(sb, bitmap_blk);
if (unlikely(!bh)) {
@@ -335,6 +345,7 @@ void ext3_rsv_window_add(struct super_block *sb,
struct rb_node * parent = NULL;
struct ext3_reserve_window_node *this;
+ trace_ext3_rsv_window_add(sb, rsv);
while (*p)
{
parent = *p;
@@ -408,7 +419,7 @@ static inline int rsv_is_empty(struct ext3_reserve_window *rsv)
void ext3_init_block_alloc_info(struct inode *inode)
{
struct ext3_inode_info *ei = EXT3_I(inode);
- struct ext3_block_alloc_info *block_i = ei->i_block_alloc_info;
+ struct ext3_block_alloc_info *block_i;
struct super_block *sb = inode->i_sb;
block_i = kmalloc(sizeof(*block_i), GFP_NOFS);
@@ -460,8 +471,10 @@ void ext3_discard_reservation(struct inode *inode)
rsv = &block_i->rsv_window_node;
if (!rsv_is_empty(&rsv->rsv_window)) {
spin_lock(rsv_lock);
- if (!rsv_is_empty(&rsv->rsv_window))
+ if (!rsv_is_empty(&rsv->rsv_window)) {
+ trace_ext3_discard_reservation(inode, rsv);
rsv_window_remove(inode->i_sb, rsv);
+ }
spin_unlock(rsv_lock);
}
}
@@ -470,7 +483,7 @@ void ext3_discard_reservation(struct inode *inode)
* ext3_free_blocks_sb() -- Free given blocks and update quota
* @handle: handle to this transaction
* @sb: super block
- * @block: start physcial block to free
+ * @block: start physical block to free
* @count: number of blocks to free
* @pdquot_freed_blocks: pointer to quota
*/
@@ -574,7 +587,7 @@ do_more:
BUFFER_TRACE(debug_bh, "Deleted!");
if (!bh2jh(bitmap_bh)->b_committed_data)
BUFFER_TRACE(debug_bh,
- "No commited data in bitmap");
+ "No committed data in bitmap");
BUFFER_TRACE2(debug_bh, bitmap_bh, "bitmap");
__brelse(debug_bh);
}
@@ -667,14 +680,10 @@ error_return:
void ext3_free_blocks(handle_t *handle, struct inode *inode,
ext3_fsblk_t block, unsigned long count)
{
- struct super_block * sb;
+ struct super_block *sb = inode->i_sb;
unsigned long dquot_freed_blocks;
- sb = inode->i_sb;
- if (!sb) {
- printk ("ext3_free_blocks: nonexistent device");
- return;
- }
+ trace_ext3_free_blocks(inode, block, count);
ext3_free_blocks_sb(handle, sb, block, count, &dquot_freed_blocks);
if (dquot_freed_blocks)
dquot_free_block(inode, dquot_freed_blocks);
@@ -1047,7 +1056,7 @@ static int find_next_reservable_window(
rsv_window_remove(sb, my_rsv);
/*
- * Let's book the whole avaliable window for now. We will check the
+ * Let's book the whole available window for now. We will check the
* disk bitmap later and then, if there are free blocks then we adjust
* the window size if it's larger than requested.
* Otherwise, we will remove this node from the tree next time
@@ -1120,6 +1129,7 @@ static int alloc_new_reservation(struct ext3_reserve_window_node *my_rsv,
else
start_block = grp_goal + group_first_block;
+ trace_ext3_alloc_new_reservation(sb, start_block);
size = my_rsv->rsv_goal_size;
if (!rsv_is_empty(&my_rsv->rsv_window)) {
@@ -1214,8 +1224,11 @@ retry:
* check if the first free block is within the
* free space we just reserved
*/
- if (start_block >= my_rsv->rsv_start && start_block <= my_rsv->rsv_end)
+ if (start_block >= my_rsv->rsv_start &&
+ start_block <= my_rsv->rsv_end) {
+ trace_ext3_reserved(sb, start_block, my_rsv);
return 0; /* success */
+ }
/*
* if the first free bit we found is out of the reservable space
* continue search for next reservable space,
@@ -1419,15 +1432,16 @@ out:
*
* Check if filesystem has at least 1 free block available for allocation.
*/
-static int ext3_has_free_blocks(struct ext3_sb_info *sbi)
+static int ext3_has_free_blocks(struct ext3_sb_info *sbi, int use_reservation)
{
ext3_fsblk_t free_blocks, root_blocks;
free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter);
root_blocks = le32_to_cpu(sbi->s_es->s_r_blocks_count);
if (free_blocks < root_blocks + 1 && !capable(CAP_SYS_RESOURCE) &&
- sbi->s_resuid != current_fsuid() &&
- (sbi->s_resgid == 0 || !in_group_p (sbi->s_resgid))) {
+ !use_reservation && !uid_eq(sbi->s_resuid, current_fsuid()) &&
+ (gid_eq(sbi->s_resgid, GLOBAL_ROOT_GID) ||
+ !in_group_p (sbi->s_resgid))) {
return 0;
}
return 1;
@@ -1440,14 +1454,14 @@ static int ext3_has_free_blocks(struct ext3_sb_info *sbi)
*
* ext3_should_retry_alloc() is called when ENOSPC is returned, and if
* it is profitable to retry the operation, this function will wait
- * for the current or commiting transaction to complete, and then
+ * for the current or committing transaction to complete, and then
* return TRUE.
*
* if the total number of retries exceed three times, return FALSE.
*/
int ext3_should_retry_alloc(struct super_block *sb, int *retries)
{
- if (!ext3_has_free_blocks(EXT3_SB(sb)) || (*retries)++ > 3)
+ if (!ext3_has_free_blocks(EXT3_SB(sb), 0) || (*retries)++ > 3)
return 0;
jbd_debug(1, "%s: retrying operation after ENOSPC\n", sb->s_id);
@@ -1498,10 +1512,6 @@ ext3_fsblk_t ext3_new_blocks(handle_t *handle, struct inode *inode,
*errp = -ENOSPC;
sb = inode->i_sb;
- if (!sb) {
- printk("ext3_new_block: nonexistent device");
- return 0;
- }
/*
* Check quota for allocation of this block.
@@ -1512,8 +1522,10 @@ ext3_fsblk_t ext3_new_blocks(handle_t *handle, struct inode *inode,
return 0;
}
+ trace_ext3_request_blocks(inode, goal, num);
+
sbi = EXT3_SB(sb);
- es = EXT3_SB(sb)->s_es;
+ es = sbi->s_es;
ext3_debug("goal=%lu.\n", goal);
/*
* Allocate a block from reservation only when
@@ -1527,7 +1539,7 @@ ext3_fsblk_t ext3_new_blocks(handle_t *handle, struct inode *inode,
if (block_i && ((windowsz = block_i->rsv_window_node.rsv_goal_size) > 0))
my_rsv = &block_i->rsv_window_node;
- if (!ext3_has_free_blocks(sbi)) {
+ if (!ext3_has_free_blocks(sbi, IS_NOQUOTA(inode))) {
*errp = -ENOSPC;
goto out;
}
@@ -1616,9 +1628,9 @@ retry_alloc:
goto allocated;
}
/*
- * We may end up a bogus ealier ENOSPC error due to
+ * We may end up a bogus earlier ENOSPC error due to
* filesystem is "full" of reservations, but
- * there maybe indeed free blocks avaliable on disk
+ * there maybe indeed free blocks available on disk
* In this case, we just forget about the reservations
* just do block allocation as without reservations.
*/
@@ -1715,17 +1727,21 @@ allocated:
percpu_counter_sub(&sbi->s_freeblocks_counter, num);
BUFFER_TRACE(gdp_bh, "journal_dirty_metadata for group descriptor");
- err = ext3_journal_dirty_metadata(handle, gdp_bh);
- if (!fatal)
- fatal = err;
-
+ fatal = ext3_journal_dirty_metadata(handle, gdp_bh);
if (fatal)
goto out;
*errp = 0;
brelse(bitmap_bh);
- dquot_free_block(inode, *count-num);
- *count = num;
+
+ if (num < *count) {
+ dquot_free_block(inode, *count-num);
+ *count = num;
+ }
+
+ trace_ext3_allocate_blocks(inode, goal, num,
+ (unsigned long long)ret_block);
+
return ret_block;
io_error:
@@ -1794,7 +1810,7 @@ ext3_fsblk_t ext3_count_free_blocks(struct super_block *sb)
brelse(bitmap_bh);
printk("ext3_count_free_blocks: stored = "E3FSBLK
", computed = "E3FSBLK", "E3FSBLK"\n",
- le32_to_cpu(es->s_free_blocks_count),
+ (ext3_fsblk_t)le32_to_cpu(es->s_free_blocks_count),
desc_count, bitmap_count);
return bitmap_count;
#else
@@ -1885,3 +1901,258 @@ unsigned long ext3_bg_num_gdb(struct super_block *sb, int group)
return ext3_bg_num_gdb_meta(sb,group);
}
+
+/**
+ * ext3_trim_all_free -- function to trim all free space in alloc. group
+ * @sb: super block for file system
+ * @group: allocation group to trim
+ * @start: first group block to examine
+ * @max: last group block to examine
+ * @gdp: allocation group description structure
+ * @minblocks: minimum extent block count
+ *
+ * ext3_trim_all_free walks through group's block bitmap searching for free
+ * blocks. When the free block is found, it tries to allocate this block and
+ * consequent free block to get the biggest free extent possible, until it
+ * reaches any used block. Then issue a TRIM command on this extent and free
+ * the extent in the block bitmap. This is done until whole group is scanned.
+ */
+static ext3_grpblk_t ext3_trim_all_free(struct super_block *sb,
+ unsigned int group,
+ ext3_grpblk_t start, ext3_grpblk_t max,
+ ext3_grpblk_t minblocks)
+{
+ handle_t *handle;
+ ext3_grpblk_t next, free_blocks, bit, freed, count = 0;
+ ext3_fsblk_t discard_block;
+ struct ext3_sb_info *sbi;
+ struct buffer_head *gdp_bh, *bitmap_bh = NULL;
+ struct ext3_group_desc *gdp;
+ int err = 0, ret = 0;
+
+ /*
+ * We will update one block bitmap, and one group descriptor
+ */
+ handle = ext3_journal_start_sb(sb, 2);
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
+
+ bitmap_bh = read_block_bitmap(sb, group);
+ if (!bitmap_bh) {
+ err = -EIO;
+ goto err_out;
+ }
+
+ BUFFER_TRACE(bitmap_bh, "getting undo access");
+ err = ext3_journal_get_undo_access(handle, bitmap_bh);
+ if (err)
+ goto err_out;
+
+ gdp = ext3_get_group_desc(sb, group, &gdp_bh);
+ if (!gdp) {
+ err = -EIO;
+ goto err_out;
+ }
+
+ BUFFER_TRACE(gdp_bh, "get_write_access");
+ err = ext3_journal_get_write_access(handle, gdp_bh);
+ if (err)
+ goto err_out;
+
+ free_blocks = le16_to_cpu(gdp->bg_free_blocks_count);
+ sbi = EXT3_SB(sb);
+
+ /* Walk through the whole group */
+ while (start <= max) {
+ start = bitmap_search_next_usable_block(start, bitmap_bh, max);
+ if (start < 0)
+ break;
+ next = start;
+
+ /*
+ * Allocate contiguous free extents by setting bits in the
+ * block bitmap
+ */
+ while (next <= max
+ && claim_block(sb_bgl_lock(sbi, group),
+ next, bitmap_bh)) {
+ next++;
+ }
+
+ /* We did not claim any blocks */
+ if (next == start)
+ continue;
+
+ discard_block = (ext3_fsblk_t)start +
+ ext3_group_first_block_no(sb, group);
+
+ /* Update counters */
+ spin_lock(sb_bgl_lock(sbi, group));
+ le16_add_cpu(&gdp->bg_free_blocks_count, start - next);
+ spin_unlock(sb_bgl_lock(sbi, group));
+ percpu_counter_sub(&sbi->s_freeblocks_counter, next - start);
+
+ free_blocks -= next - start;
+ /* Do not issue a TRIM on extents smaller than minblocks */
+ if ((next - start) < minblocks)
+ goto free_extent;
+
+ trace_ext3_discard_blocks(sb, discard_block, next - start);
+ /* Send the TRIM command down to the device */
+ err = sb_issue_discard(sb, discard_block, next - start,
+ GFP_NOFS, 0);
+ count += (next - start);
+free_extent:
+ freed = 0;
+
+ /*
+ * Clear bits in the bitmap
+ */
+ for (bit = start; bit < next; bit++) {
+ BUFFER_TRACE(bitmap_bh, "clear bit");
+ if (!ext3_clear_bit_atomic(sb_bgl_lock(sbi, group),
+ bit, bitmap_bh->b_data)) {
+ ext3_error(sb, __func__,
+ "bit already cleared for block "E3FSBLK,
+ (unsigned long)bit);
+ BUFFER_TRACE(bitmap_bh, "bit already cleared");
+ } else {
+ freed++;
+ }
+ }
+
+ /* Update couters */
+ spin_lock(sb_bgl_lock(sbi, group));
+ le16_add_cpu(&gdp->bg_free_blocks_count, freed);
+ spin_unlock(sb_bgl_lock(sbi, group));
+ percpu_counter_add(&sbi->s_freeblocks_counter, freed);
+
+ start = next;
+ if (err < 0) {
+ if (err != -EOPNOTSUPP)
+ ext3_warning(sb, __func__, "Discard command "
+ "returned error %d\n", err);
+ break;
+ }
+
+ if (fatal_signal_pending(current)) {
+ err = -ERESTARTSYS;
+ break;
+ }
+
+ cond_resched();
+
+ /* No more suitable extents */
+ if (free_blocks < minblocks)
+ break;
+ }
+
+ /* We dirtied the bitmap block */
+ BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
+ ret = ext3_journal_dirty_metadata(handle, bitmap_bh);
+ if (!err)
+ err = ret;
+
+ /* And the group descriptor block */
+ BUFFER_TRACE(gdp_bh, "dirtied group descriptor block");
+ ret = ext3_journal_dirty_metadata(handle, gdp_bh);
+ if (!err)
+ err = ret;
+
+ ext3_debug("trimmed %d blocks in the group %d\n",
+ count, group);
+
+err_out:
+ if (err)
+ count = err;
+ ext3_journal_stop(handle);
+ brelse(bitmap_bh);
+
+ return count;
+}
+
+/**
+ * ext3_trim_fs() -- trim ioctl handle function
+ * @sb: superblock for filesystem
+ * @start: First Byte to trim
+ * @len: number of Bytes to trim from start
+ * @minlen: minimum extent length in Bytes
+ *
+ * ext3_trim_fs goes through all allocation groups containing Bytes from
+ * start to start+len. For each such a group ext3_trim_all_free function
+ * is invoked to trim all free space.
+ */
+int ext3_trim_fs(struct super_block *sb, struct fstrim_range *range)
+{
+ ext3_grpblk_t last_block, first_block;
+ unsigned long group, first_group, last_group;
+ struct ext3_group_desc *gdp;
+ struct ext3_super_block *es = EXT3_SB(sb)->s_es;
+ uint64_t start, minlen, end, trimmed = 0;
+ ext3_fsblk_t first_data_blk =
+ le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block);
+ ext3_fsblk_t max_blks = le32_to_cpu(es->s_blocks_count);
+ int ret = 0;
+
+ start = range->start >> sb->s_blocksize_bits;
+ end = start + (range->len >> sb->s_blocksize_bits) - 1;
+ minlen = range->minlen >> sb->s_blocksize_bits;
+
+ if (minlen > EXT3_BLOCKS_PER_GROUP(sb) ||
+ start >= max_blks ||
+ range->len < sb->s_blocksize)
+ return -EINVAL;
+ if (end >= max_blks)
+ end = max_blks - 1;
+ if (end <= first_data_blk)
+ goto out;
+ if (start < first_data_blk)
+ start = first_data_blk;
+
+ smp_rmb();
+
+ /* Determine first and last group to examine based on start and len */
+ ext3_get_group_no_and_offset(sb, (ext3_fsblk_t) start,
+ &first_group, &first_block);
+ ext3_get_group_no_and_offset(sb, (ext3_fsblk_t) end,
+ &last_group, &last_block);
+
+ /* end now represents the last block to discard in this group */
+ end = EXT3_BLOCKS_PER_GROUP(sb) - 1;
+
+ for (group = first_group; group <= last_group; group++) {
+ gdp = ext3_get_group_desc(sb, group, NULL);
+ if (!gdp)
+ break;
+
+ /*
+ * For all the groups except the last one, last block will
+ * always be EXT3_BLOCKS_PER_GROUP(sb)-1, so we only need to
+ * change it for the last group, note that last_block is
+ * already computed earlier by ext3_get_group_no_and_offset()
+ */
+ if (group == last_group)
+ end = last_block;
+
+ if (le16_to_cpu(gdp->bg_free_blocks_count) >= minlen) {
+ ret = ext3_trim_all_free(sb, group, first_block,
+ end, minlen);
+ if (ret < 0)
+ break;
+ trimmed += ret;
+ }
+
+ /*
+ * For every group except the first one, we are sure
+ * that the first block to discard will be block #0.
+ */
+ first_block = 0;
+ }
+
+ if (ret > 0)
+ ret = 0;
+
+out:
+ range->len = trimmed * sb->s_blocksize;
+ return ret;
+}
diff --git a/fs/ext3/bitmap.c b/fs/ext3/bitmap.c
index 6afc39d8025..ef9c643e8e9 100644
--- a/fs/ext3/bitmap.c
+++ b/fs/ext3/bitmap.c
@@ -7,25 +7,13 @@
* Universite Pierre et Marie Curie (Paris VI)
*/
-#include <linux/buffer_head.h>
-#include <linux/jbd.h>
-#include <linux/ext3_fs.h>
+#include "ext3.h"
#ifdef EXT3FS_DEBUG
-static const int nibblemap[] = {4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0};
-
unsigned long ext3_count_free (struct buffer_head * map, unsigned int numchars)
{
- unsigned int i;
- unsigned long sum = 0;
-
- if (!map)
- return (0);
- for (i = 0; i < numchars; i++)
- sum += nibblemap[map->b_data[i] & 0xf] +
- nibblemap[(map->b_data[i] >> 4) & 0xf];
- return (sum);
+ return numchars * BITS_PER_BYTE - memweight(map->b_data, numchars);
}
#endif /* EXT3FS_DEBUG */
diff --git a/fs/ext3/dir.c b/fs/ext3/dir.c
index e2e72c367cf..17742eed2c1 100644
--- a/fs/ext3/dir.c
+++ b/fs/ext3/dir.c
@@ -21,35 +21,14 @@
*
*/
-#include <linux/fs.h>
-#include <linux/jbd.h>
-#include <linux/ext3_fs.h>
-#include <linux/buffer_head.h>
-#include <linux/slab.h>
-#include <linux/rbtree.h>
+#include <linux/compat.h>
+#include "ext3.h"
static unsigned char ext3_filetype_table[] = {
DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
};
-static int ext3_readdir(struct file *, void *, filldir_t);
-static int ext3_dx_readdir(struct file * filp,
- void * dirent, filldir_t filldir);
-static int ext3_release_dir (struct inode * inode,
- struct file * filp);
-
-const struct file_operations ext3_dir_operations = {
- .llseek = generic_file_llseek,
- .read = generic_read_dir,
- .readdir = ext3_readdir, /* we take BKL. needed?*/
- .unlocked_ioctl = ext3_ioctl,
-#ifdef CONFIG_COMPAT
- .compat_ioctl = ext3_compat_ioctl,
-#endif
- .fsync = ext3_sync_file, /* BKL held */
- .release = ext3_release_dir,
-};
-
+static int ext3_dx_readdir(struct file *, struct dir_context *);
static unsigned char get_dtype(struct super_block *sb, int filetype)
{
@@ -60,6 +39,25 @@ static unsigned char get_dtype(struct super_block *sb, int filetype)
return (ext3_filetype_table[filetype]);
}
+/**
+ * Check if the given dir-inode refers to an htree-indexed directory
+ * (or a directory which could potentially get converted to use htree
+ * indexing).
+ *
+ * Return 1 if it is a dx dir, 0 if not
+ */
+static int is_dx_dir(struct inode *inode)
+{
+ struct super_block *sb = inode->i_sb;
+
+ if (EXT3_HAS_COMPAT_FEATURE(inode->i_sb,
+ EXT3_FEATURE_COMPAT_DIR_INDEX) &&
+ ((EXT3_I(inode)->i_flags & EXT3_INDEX_FL) ||
+ ((inode->i_size >> sb->s_blocksize_bits) == 1)))
+ return 1;
+
+ return 0;
+}
int ext3_check_dir_entry (const char * function, struct inode * dir,
struct ext3_dir_entry_2 * de,
@@ -69,63 +67,53 @@ int ext3_check_dir_entry (const char * function, struct inode * dir,
const char * error_msg = NULL;
const int rlen = ext3_rec_len_from_disk(de->rec_len);
- if (rlen < EXT3_DIR_REC_LEN(1))
+ if (unlikely(rlen < EXT3_DIR_REC_LEN(1)))
error_msg = "rec_len is smaller than minimal";
- else if (rlen % 4 != 0)
+ else if (unlikely(rlen % 4 != 0))
error_msg = "rec_len % 4 != 0";
- else if (rlen < EXT3_DIR_REC_LEN(de->name_len))
+ else if (unlikely(rlen < EXT3_DIR_REC_LEN(de->name_len)))
error_msg = "rec_len is too small for name_len";
- else if (((char *) de - bh->b_data) + rlen > dir->i_sb->s_blocksize)
+ else if (unlikely((((char *) de - bh->b_data) + rlen > dir->i_sb->s_blocksize)))
error_msg = "directory entry across blocks";
- else if (le32_to_cpu(de->inode) >
- le32_to_cpu(EXT3_SB(dir->i_sb)->s_es->s_inodes_count))
+ else if (unlikely(le32_to_cpu(de->inode) >
+ le32_to_cpu(EXT3_SB(dir->i_sb)->s_es->s_inodes_count)))
error_msg = "inode out of bounds";
- if (error_msg != NULL)
+ if (unlikely(error_msg != NULL))
ext3_error (dir->i_sb, function,
"bad entry in directory #%lu: %s - "
"offset=%lu, inode=%lu, rec_len=%d, name_len=%d",
dir->i_ino, error_msg, offset,
(unsigned long) le32_to_cpu(de->inode),
rlen, de->name_len);
+
return error_msg == NULL ? 1 : 0;
}
-static int ext3_readdir(struct file * filp,
- void * dirent, filldir_t filldir)
+static int ext3_readdir(struct file *file, struct dir_context *ctx)
{
- int error = 0;
unsigned long offset;
- int i, stored;
+ int i;
struct ext3_dir_entry_2 *de;
- struct super_block *sb;
int err;
- struct inode *inode = filp->f_path.dentry->d_inode;
- int ret = 0;
+ struct inode *inode = file_inode(file);
+ struct super_block *sb = inode->i_sb;
int dir_has_error = 0;
- sb = inode->i_sb;
-
- if (EXT3_HAS_COMPAT_FEATURE(inode->i_sb,
- EXT3_FEATURE_COMPAT_DIR_INDEX) &&
- ((EXT3_I(inode)->i_flags & EXT3_INDEX_FL) ||
- ((inode->i_size >> sb->s_blocksize_bits) == 1))) {
- err = ext3_dx_readdir(filp, dirent, filldir);
- if (err != ERR_BAD_DX_DIR) {
- ret = err;
- goto out;
- }
+ if (is_dx_dir(inode)) {
+ err = ext3_dx_readdir(file, ctx);
+ if (err != ERR_BAD_DX_DIR)
+ return err;
/*
* We don't set the inode dirty flag since it's not
* critical that it get flushed back to the disk.
*/
- EXT3_I(filp->f_path.dentry->d_inode)->i_flags &= ~EXT3_INDEX_FL;
+ EXT3_I(inode)->i_flags &= ~EXT3_INDEX_FL;
}
- stored = 0;
- offset = filp->f_pos & (sb->s_blocksize - 1);
+ offset = ctx->pos & (sb->s_blocksize - 1);
- while (!error && !stored && filp->f_pos < inode->i_size) {
- unsigned long blk = filp->f_pos >> EXT3_BLOCK_SIZE_BITS(sb);
+ while (ctx->pos < inode->i_size) {
+ unsigned long blk = ctx->pos >> EXT3_BLOCK_SIZE_BITS(sb);
struct buffer_head map_bh;
struct buffer_head *bh = NULL;
@@ -134,12 +122,12 @@ static int ext3_readdir(struct file * filp,
if (err > 0) {
pgoff_t index = map_bh.b_blocknr >>
(PAGE_CACHE_SHIFT - inode->i_blkbits);
- if (!ra_has_index(&filp->f_ra, index))
+ if (!ra_has_index(&file->f_ra, index))
page_cache_sync_readahead(
sb->s_bdev->bd_inode->i_mapping,
- &filp->f_ra, filp,
+ &file->f_ra, file,
index, 1);
- filp->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT;
+ file->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT;
bh = ext3_bread(NULL, inode, blk, 0, &err);
}
@@ -151,22 +139,21 @@ static int ext3_readdir(struct file * filp,
if (!dir_has_error) {
ext3_error(sb, __func__, "directory #%lu "
"contains a hole at offset %lld",
- inode->i_ino, filp->f_pos);
+ inode->i_ino, ctx->pos);
dir_has_error = 1;
}
/* corrupt size? Maybe no more blocks to read */
- if (filp->f_pos > inode->i_blocks << 9)
+ if (ctx->pos > inode->i_blocks << 9)
break;
- filp->f_pos += sb->s_blocksize - offset;
+ ctx->pos += sb->s_blocksize - offset;
continue;
}
-revalidate:
/* If the dir block has changed since the last call to
* readdir(2), then we might be pointing to an invalid
* dirent right now. Scan from the start of the block
* to make sure. */
- if (filp->f_version != inode->i_version) {
+ if (offset && file->f_version != inode->i_version) {
for (i = 0; i < sb->s_blocksize && i < offset; ) {
de = (struct ext3_dir_entry_2 *)
(bh->b_data + i);
@@ -182,71 +169,124 @@ revalidate:
i += ext3_rec_len_from_disk(de->rec_len);
}
offset = i;
- filp->f_pos = (filp->f_pos & ~(sb->s_blocksize - 1))
+ ctx->pos = (ctx->pos & ~(sb->s_blocksize - 1))
| offset;
- filp->f_version = inode->i_version;
+ file->f_version = inode->i_version;
}
- while (!error && filp->f_pos < inode->i_size
+ while (ctx->pos < inode->i_size
&& offset < sb->s_blocksize) {
de = (struct ext3_dir_entry_2 *) (bh->b_data + offset);
if (!ext3_check_dir_entry ("ext3_readdir", inode, de,
bh, offset)) {
- /* On error, skip the f_pos to the
+ /* On error, skip the to the
next block. */
- filp->f_pos = (filp->f_pos |
+ ctx->pos = (ctx->pos |
(sb->s_blocksize - 1)) + 1;
- brelse (bh);
- ret = stored;
- goto out;
+ break;
}
offset += ext3_rec_len_from_disk(de->rec_len);
if (le32_to_cpu(de->inode)) {
- /* We might block in the next section
- * if the data destination is
- * currently swapped out. So, use a
- * version stamp to detect whether or
- * not the directory has been modified
- * during the copy operation.
- */
- u64 version = filp->f_version;
-
- error = filldir(dirent, de->name,
- de->name_len,
- filp->f_pos,
- le32_to_cpu(de->inode),
- get_dtype(sb, de->file_type));
- if (error)
- break;
- if (version != filp->f_version)
- goto revalidate;
- stored ++;
+ if (!dir_emit(ctx, de->name, de->name_len,
+ le32_to_cpu(de->inode),
+ get_dtype(sb, de->file_type))) {
+ brelse(bh);
+ return 0;
+ }
}
- filp->f_pos += ext3_rec_len_from_disk(de->rec_len);
+ ctx->pos += ext3_rec_len_from_disk(de->rec_len);
}
offset = 0;
brelse (bh);
+ if (ctx->pos < inode->i_size)
+ if (!dir_relax(inode))
+ return 0;
}
-out:
- return ret;
+ return 0;
+}
+
+static inline int is_32bit_api(void)
+{
+#ifdef CONFIG_COMPAT
+ return is_compat_task();
+#else
+ return (BITS_PER_LONG == 32);
+#endif
}
/*
* These functions convert from the major/minor hash to an f_pos
- * value.
+ * value for dx directories
*
- * Currently we only use major hash numer. This is unfortunate, but
- * on 32-bit machines, the same VFS interface is used for lseek and
- * llseek, so if we use the 64 bit offset, then the 32-bit versions of
- * lseek/telldir/seekdir will blow out spectacularly, and from within
- * the ext2 low-level routine, we don't know if we're being called by
- * a 64-bit version of the system call or the 32-bit version of the
- * system call. Worse yet, NFSv2 only allows for a 32-bit readdir
- * cookie. Sigh.
+ * Upper layer (for example NFS) should specify FMODE_32BITHASH or
+ * FMODE_64BITHASH explicitly. On the other hand, we allow ext3 to be mounted
+ * directly on both 32-bit and 64-bit nodes, under such case, neither
+ * FMODE_32BITHASH nor FMODE_64BITHASH is specified.
*/
-#define hash2pos(major, minor) (major >> 1)
-#define pos2maj_hash(pos) ((pos << 1) & 0xffffffff)
-#define pos2min_hash(pos) (0)
+static inline loff_t hash2pos(struct file *filp, __u32 major, __u32 minor)
+{
+ if ((filp->f_mode & FMODE_32BITHASH) ||
+ (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api()))
+ return major >> 1;
+ else
+ return ((__u64)(major >> 1) << 32) | (__u64)minor;
+}
+
+static inline __u32 pos2maj_hash(struct file *filp, loff_t pos)
+{
+ if ((filp->f_mode & FMODE_32BITHASH) ||
+ (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api()))
+ return (pos << 1) & 0xffffffff;
+ else
+ return ((pos >> 32) << 1) & 0xffffffff;
+}
+
+static inline __u32 pos2min_hash(struct file *filp, loff_t pos)
+{
+ if ((filp->f_mode & FMODE_32BITHASH) ||
+ (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api()))
+ return 0;
+ else
+ return pos & 0xffffffff;
+}
+
+/*
+ * Return 32- or 64-bit end-of-file for dx directories
+ */
+static inline loff_t ext3_get_htree_eof(struct file *filp)
+{
+ if ((filp->f_mode & FMODE_32BITHASH) ||
+ (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api()))
+ return EXT3_HTREE_EOF_32BIT;
+ else
+ return EXT3_HTREE_EOF_64BIT;
+}
+
+
+/*
+ * ext3_dir_llseek() calls generic_file_llseek[_size]() to handle both
+ * non-htree and htree directories, where the "offset" is in terms
+ * of the filename hash value instead of the byte offset.
+ *
+ * Because we may return a 64-bit hash that is well beyond s_maxbytes,
+ * we need to pass the max hash as the maximum allowable offset in
+ * the htree directory case.
+ *
+ * NOTE: offsets obtained *before* ext3_set_inode_flag(dir, EXT3_INODE_INDEX)
+ * will be invalid once the directory was converted into a dx directory
+ */
+static loff_t ext3_dir_llseek(struct file *file, loff_t offset, int whence)
+{
+ struct inode *inode = file->f_mapping->host;
+ int dx_dir = is_dx_dir(inode);
+ loff_t htree_max = ext3_get_htree_eof(file);
+
+ if (likely(dx_dir))
+ return generic_file_llseek_size(file, offset, whence,
+ htree_max, htree_max);
+ else
+ return generic_file_llseek(file, offset, whence);
+}
/*
* This structure holds the nodes of the red-black tree used to store
@@ -269,53 +309,28 @@ struct fname {
*/
static void free_rb_tree_fname(struct rb_root *root)
{
- struct rb_node *n = root->rb_node;
- struct rb_node *parent;
- struct fname *fname;
-
- while (n) {
- /* Do the node's children first */
- if (n->rb_left) {
- n = n->rb_left;
- continue;
- }
- if (n->rb_right) {
- n = n->rb_right;
- continue;
- }
- /*
- * The node has no children; free it, and then zero
- * out parent's link to it. Finally go to the
- * beginning of the loop and try to free the parent
- * node.
- */
- parent = rb_parent(n);
- fname = rb_entry(n, struct fname, rb_hash);
- while (fname) {
- struct fname * old = fname;
+ struct fname *fname, *next;
+
+ rbtree_postorder_for_each_entry_safe(fname, next, root, rb_hash)
+ do {
+ struct fname *old = fname;
fname = fname->next;
- kfree (old);
- }
- if (!parent)
- *root = RB_ROOT;
- else if (parent->rb_left == n)
- parent->rb_left = NULL;
- else if (parent->rb_right == n)
- parent->rb_right = NULL;
- n = parent;
- }
-}
+ kfree(old);
+ } while (fname);
+ *root = RB_ROOT;
+}
-static struct dir_private_info *ext3_htree_create_dir_info(loff_t pos)
+static struct dir_private_info *ext3_htree_create_dir_info(struct file *filp,
+ loff_t pos)
{
struct dir_private_info *p;
p = kzalloc(sizeof(struct dir_private_info), GFP_KERNEL);
if (!p)
return NULL;
- p->curr_hash = pos2maj_hash(pos);
- p->curr_minor_hash = pos2min_hash(pos);
+ p->curr_hash = pos2maj_hash(filp, pos);
+ p->curr_minor_hash = pos2min_hash(filp, pos);
return p;
}
@@ -390,62 +405,54 @@ int ext3_htree_store_dirent(struct file *dir_file, __u32 hash,
* for all entres on the fname linked list. (Normally there is only
* one entry on the linked list, unless there are 62 bit hash collisions.)
*/
-static int call_filldir(struct file * filp, void * dirent,
- filldir_t filldir, struct fname *fname)
+static bool call_filldir(struct file *file, struct dir_context *ctx,
+ struct fname *fname)
{
- struct dir_private_info *info = filp->private_data;
- loff_t curr_pos;
- struct inode *inode = filp->f_path.dentry->d_inode;
- struct super_block * sb;
- int error;
-
- sb = inode->i_sb;
+ struct dir_private_info *info = file->private_data;
+ struct inode *inode = file_inode(file);
+ struct super_block *sb = inode->i_sb;
if (!fname) {
printk("call_filldir: called with null fname?!?\n");
- return 0;
+ return true;
}
- curr_pos = hash2pos(fname->hash, fname->minor_hash);
+ ctx->pos = hash2pos(file, fname->hash, fname->minor_hash);
while (fname) {
- error = filldir(dirent, fname->name,
- fname->name_len, curr_pos,
+ if (!dir_emit(ctx, fname->name, fname->name_len,
fname->inode,
- get_dtype(sb, fname->file_type));
- if (error) {
- filp->f_pos = curr_pos;
+ get_dtype(sb, fname->file_type))) {
info->extra_fname = fname;
- return error;
+ return false;
}
fname = fname->next;
}
- return 0;
+ return true;
}
-static int ext3_dx_readdir(struct file * filp,
- void * dirent, filldir_t filldir)
+static int ext3_dx_readdir(struct file *file, struct dir_context *ctx)
{
- struct dir_private_info *info = filp->private_data;
- struct inode *inode = filp->f_path.dentry->d_inode;
+ struct dir_private_info *info = file->private_data;
+ struct inode *inode = file_inode(file);
struct fname *fname;
int ret;
if (!info) {
- info = ext3_htree_create_dir_info(filp->f_pos);
+ info = ext3_htree_create_dir_info(file, ctx->pos);
if (!info)
return -ENOMEM;
- filp->private_data = info;
+ file->private_data = info;
}
- if (filp->f_pos == EXT3_HTREE_EOF)
+ if (ctx->pos == ext3_get_htree_eof(file))
return 0; /* EOF */
/* Some one has messed with f_pos; reset the world */
- if (info->last_pos != filp->f_pos) {
+ if (info->last_pos != ctx->pos) {
free_rb_tree_fname(&info->root);
info->curr_node = NULL;
info->extra_fname = NULL;
- info->curr_hash = pos2maj_hash(filp->f_pos);
- info->curr_minor_hash = pos2min_hash(filp->f_pos);
+ info->curr_hash = pos2maj_hash(file, ctx->pos);
+ info->curr_minor_hash = pos2min_hash(file, ctx->pos);
}
/*
@@ -453,7 +460,7 @@ static int ext3_dx_readdir(struct file * filp,
* chain, return them first.
*/
if (info->extra_fname) {
- if (call_filldir(filp, dirent, filldir, info->extra_fname))
+ if (!call_filldir(file, ctx, info->extra_fname))
goto finished;
info->extra_fname = NULL;
goto next_node;
@@ -467,17 +474,17 @@ static int ext3_dx_readdir(struct file * filp,
* cached entries.
*/
if ((!info->curr_node) ||
- (filp->f_version != inode->i_version)) {
+ (file->f_version != inode->i_version)) {
info->curr_node = NULL;
free_rb_tree_fname(&info->root);
- filp->f_version = inode->i_version;
- ret = ext3_htree_fill_tree(filp, info->curr_hash,
+ file->f_version = inode->i_version;
+ ret = ext3_htree_fill_tree(file, info->curr_hash,
info->curr_minor_hash,
&info->next_hash);
if (ret < 0)
return ret;
if (ret == 0) {
- filp->f_pos = EXT3_HTREE_EOF;
+ ctx->pos = ext3_get_htree_eof(file);
break;
}
info->curr_node = rb_first(&info->root);
@@ -486,7 +493,7 @@ static int ext3_dx_readdir(struct file * filp,
fname = rb_entry(info->curr_node, struct fname, rb_hash);
info->curr_hash = fname->hash;
info->curr_minor_hash = fname->minor_hash;
- if (call_filldir(filp, dirent, filldir, fname))
+ if (!call_filldir(file, ctx, fname))
break;
next_node:
info->curr_node = rb_next(info->curr_node);
@@ -497,7 +504,7 @@ static int ext3_dx_readdir(struct file * filp,
info->curr_minor_hash = fname->minor_hash;
} else {
if (info->next_hash == ~0) {
- filp->f_pos = EXT3_HTREE_EOF;
+ ctx->pos = ext3_get_htree_eof(file);
break;
}
info->curr_hash = info->next_hash;
@@ -505,7 +512,7 @@ static int ext3_dx_readdir(struct file * filp,
}
}
finished:
- info->last_pos = filp->f_pos;
+ info->last_pos = ctx->pos;
return 0;
}
@@ -516,3 +523,15 @@ static int ext3_release_dir (struct inode * inode, struct file * filp)
return 0;
}
+
+const struct file_operations ext3_dir_operations = {
+ .llseek = ext3_dir_llseek,
+ .read = generic_read_dir,
+ .iterate = ext3_readdir,
+ .unlocked_ioctl = ext3_ioctl,
+#ifdef CONFIG_COMPAT
+ .compat_ioctl = ext3_compat_ioctl,
+#endif
+ .fsync = ext3_sync_file,
+ .release = ext3_release_dir,
+};
diff --git a/fs/ext3/ext3.h b/fs/ext3/ext3.h
new file mode 100644
index 00000000000..e85ff15a060
--- /dev/null
+++ b/fs/ext3/ext3.h
@@ -0,0 +1,1326 @@
+/*
+ * Written by Stephen C. Tweedie <sct@redhat.com>, 1999
+ *
+ * Copyright 1998--1999 Red Hat corp --- All Rights Reserved
+ *
+ * This file is part of the Linux kernel and is made available under
+ * the terms of the GNU General Public License, version 2, or at your
+ * option, any later version, incorporated herein by reference.
+ *
+ * Copyright (C) 1992, 1993, 1994, 1995
+ * Remy Card (card@masi.ibp.fr)
+ * Laboratoire MASI - Institut Blaise Pascal
+ * Universite Pierre et Marie Curie (Paris VI)
+ *
+ * from
+ *
+ * linux/include/linux/minix_fs.h
+ *
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ */
+
+#include <linux/fs.h>
+#include <linux/jbd.h>
+#include <linux/magic.h>
+#include <linux/bug.h>
+#include <linux/blockgroup_lock.h>
+
+/*
+ * The second extended filesystem constants/structures
+ */
+
+/*
+ * Define EXT3FS_DEBUG to produce debug messages
+ */
+#undef EXT3FS_DEBUG
+
+/*
+ * Define EXT3_RESERVATION to reserve data blocks for expanding files
+ */
+#define EXT3_DEFAULT_RESERVE_BLOCKS 8
+/*max window size: 1024(direct blocks) + 3([t,d]indirect blocks) */
+#define EXT3_MAX_RESERVE_BLOCKS 1027
+#define EXT3_RESERVE_WINDOW_NOT_ALLOCATED 0
+
+/*
+ * Debug code
+ */
+#ifdef EXT3FS_DEBUG
+#define ext3_debug(f, a...) \
+ do { \
+ printk (KERN_DEBUG "EXT3-fs DEBUG (%s, %d): %s:", \
+ __FILE__, __LINE__, __func__); \
+ printk (KERN_DEBUG f, ## a); \
+ } while (0)
+#else
+#define ext3_debug(f, a...) do {} while (0)
+#endif
+
+/*
+ * Special inodes numbers
+ */
+#define EXT3_BAD_INO 1 /* Bad blocks inode */
+#define EXT3_ROOT_INO 2 /* Root inode */
+#define EXT3_BOOT_LOADER_INO 5 /* Boot loader inode */
+#define EXT3_UNDEL_DIR_INO 6 /* Undelete directory inode */
+#define EXT3_RESIZE_INO 7 /* Reserved group descriptors inode */
+#define EXT3_JOURNAL_INO 8 /* Journal inode */
+
+/* First non-reserved inode for old ext3 filesystems */
+#define EXT3_GOOD_OLD_FIRST_INO 11
+
+/*
+ * Maximal count of links to a file
+ */
+#define EXT3_LINK_MAX 32000
+
+/*
+ * Macro-instructions used to manage several block sizes
+ */
+#define EXT3_MIN_BLOCK_SIZE 1024
+#define EXT3_MAX_BLOCK_SIZE 65536
+#define EXT3_MIN_BLOCK_LOG_SIZE 10
+#define EXT3_BLOCK_SIZE(s) ((s)->s_blocksize)
+#define EXT3_ADDR_PER_BLOCK(s) (EXT3_BLOCK_SIZE(s) / sizeof (__u32))
+#define EXT3_BLOCK_SIZE_BITS(s) ((s)->s_blocksize_bits)
+#define EXT3_ADDR_PER_BLOCK_BITS(s) (EXT3_SB(s)->s_addr_per_block_bits)
+#define EXT3_INODE_SIZE(s) (EXT3_SB(s)->s_inode_size)
+#define EXT3_FIRST_INO(s) (EXT3_SB(s)->s_first_ino)
+
+/*
+ * Macro-instructions used to manage fragments
+ */
+#define EXT3_MIN_FRAG_SIZE 1024
+#define EXT3_MAX_FRAG_SIZE 4096
+#define EXT3_MIN_FRAG_LOG_SIZE 10
+#define EXT3_FRAG_SIZE(s) (EXT3_SB(s)->s_frag_size)
+#define EXT3_FRAGS_PER_BLOCK(s) (EXT3_SB(s)->s_frags_per_block)
+
+/*
+ * Structure of a blocks group descriptor
+ */
+struct ext3_group_desc
+{
+ __le32 bg_block_bitmap; /* Blocks bitmap block */
+ __le32 bg_inode_bitmap; /* Inodes bitmap block */
+ __le32 bg_inode_table; /* Inodes table block */
+ __le16 bg_free_blocks_count; /* Free blocks count */
+ __le16 bg_free_inodes_count; /* Free inodes count */
+ __le16 bg_used_dirs_count; /* Directories count */
+ __u16 bg_pad;
+ __le32 bg_reserved[3];
+};
+
+/*
+ * Macro-instructions used to manage group descriptors
+ */
+#define EXT3_BLOCKS_PER_GROUP(s) (EXT3_SB(s)->s_blocks_per_group)
+#define EXT3_DESC_PER_BLOCK(s) (EXT3_SB(s)->s_desc_per_block)
+#define EXT3_INODES_PER_GROUP(s) (EXT3_SB(s)->s_inodes_per_group)
+#define EXT3_DESC_PER_BLOCK_BITS(s) (EXT3_SB(s)->s_desc_per_block_bits)
+
+/*
+ * Constants relative to the data blocks
+ */
+#define EXT3_NDIR_BLOCKS 12
+#define EXT3_IND_BLOCK EXT3_NDIR_BLOCKS
+#define EXT3_DIND_BLOCK (EXT3_IND_BLOCK + 1)
+#define EXT3_TIND_BLOCK (EXT3_DIND_BLOCK + 1)
+#define EXT3_N_BLOCKS (EXT3_TIND_BLOCK + 1)
+
+/*
+ * Inode flags
+ */
+#define EXT3_SECRM_FL 0x00000001 /* Secure deletion */
+#define EXT3_UNRM_FL 0x00000002 /* Undelete */
+#define EXT3_COMPR_FL 0x00000004 /* Compress file */
+#define EXT3_SYNC_FL 0x00000008 /* Synchronous updates */
+#define EXT3_IMMUTABLE_FL 0x00000010 /* Immutable file */
+#define EXT3_APPEND_FL 0x00000020 /* writes to file may only append */
+#define EXT3_NODUMP_FL 0x00000040 /* do not dump file */
+#define EXT3_NOATIME_FL 0x00000080 /* do not update atime */
+/* Reserved for compression usage... */
+#define EXT3_DIRTY_FL 0x00000100
+#define EXT3_COMPRBLK_FL 0x00000200 /* One or more compressed clusters */
+#define EXT3_NOCOMPR_FL 0x00000400 /* Don't compress */
+#define EXT3_ECOMPR_FL 0x00000800 /* Compression error */
+/* End compression flags --- maybe not all used */
+#define EXT3_INDEX_FL 0x00001000 /* hash-indexed directory */
+#define EXT3_IMAGIC_FL 0x00002000 /* AFS directory */
+#define EXT3_JOURNAL_DATA_FL 0x00004000 /* file data should be journaled */
+#define EXT3_NOTAIL_FL 0x00008000 /* file tail should not be merged */
+#define EXT3_DIRSYNC_FL 0x00010000 /* dirsync behaviour (directories only) */
+#define EXT3_TOPDIR_FL 0x00020000 /* Top of directory hierarchies*/
+#define EXT3_RESERVED_FL 0x80000000 /* reserved for ext3 lib */
+
+#define EXT3_FL_USER_VISIBLE 0x0003DFFF /* User visible flags */
+#define EXT3_FL_USER_MODIFIABLE 0x000380FF /* User modifiable flags */
+
+/* Flags that should be inherited by new inodes from their parent. */
+#define EXT3_FL_INHERITED (EXT3_SECRM_FL | EXT3_UNRM_FL | EXT3_COMPR_FL |\
+ EXT3_SYNC_FL | EXT3_NODUMP_FL |\
+ EXT3_NOATIME_FL | EXT3_COMPRBLK_FL |\
+ EXT3_NOCOMPR_FL | EXT3_JOURNAL_DATA_FL |\
+ EXT3_NOTAIL_FL | EXT3_DIRSYNC_FL)
+
+/* Flags that are appropriate for regular files (all but dir-specific ones). */
+#define EXT3_REG_FLMASK (~(EXT3_DIRSYNC_FL | EXT3_TOPDIR_FL))
+
+/* Flags that are appropriate for non-directories/regular files. */
+#define EXT3_OTHER_FLMASK (EXT3_NODUMP_FL | EXT3_NOATIME_FL)
+
+/* Mask out flags that are inappropriate for the given type of inode. */
+static inline __u32 ext3_mask_flags(umode_t mode, __u32 flags)
+{
+ if (S_ISDIR(mode))
+ return flags;
+ else if (S_ISREG(mode))
+ return flags & EXT3_REG_FLMASK;
+ else
+ return flags & EXT3_OTHER_FLMASK;
+}
+
+/* Used to pass group descriptor data when online resize is done */
+struct ext3_new_group_input {
+ __u32 group; /* Group number for this data */
+ __u32 block_bitmap; /* Absolute block number of block bitmap */
+ __u32 inode_bitmap; /* Absolute block number of inode bitmap */
+ __u32 inode_table; /* Absolute block number of inode table start */
+ __u32 blocks_count; /* Total number of blocks in this group */
+ __u16 reserved_blocks; /* Number of reserved blocks in this group */
+ __u16 unused;
+};
+
+/* The struct ext3_new_group_input in kernel space, with free_blocks_count */
+struct ext3_new_group_data {
+ __u32 group;
+ __u32 block_bitmap;
+ __u32 inode_bitmap;
+ __u32 inode_table;
+ __u32 blocks_count;
+ __u16 reserved_blocks;
+ __u16 unused;
+ __u32 free_blocks_count;
+};
+
+
+/*
+ * ioctl commands
+ */
+#define EXT3_IOC_GETFLAGS FS_IOC_GETFLAGS
+#define EXT3_IOC_SETFLAGS FS_IOC_SETFLAGS
+#define EXT3_IOC_GETVERSION _IOR('f', 3, long)
+#define EXT3_IOC_SETVERSION _IOW('f', 4, long)
+#define EXT3_IOC_GROUP_EXTEND _IOW('f', 7, unsigned long)
+#define EXT3_IOC_GROUP_ADD _IOW('f', 8,struct ext3_new_group_input)
+#define EXT3_IOC_GETVERSION_OLD FS_IOC_GETVERSION
+#define EXT3_IOC_SETVERSION_OLD FS_IOC_SETVERSION
+#ifdef CONFIG_JBD_DEBUG
+#define EXT3_IOC_WAIT_FOR_READONLY _IOR('f', 99, long)
+#endif
+#define EXT3_IOC_GETRSVSZ _IOR('f', 5, long)
+#define EXT3_IOC_SETRSVSZ _IOW('f', 6, long)
+
+/*
+ * ioctl commands in 32 bit emulation
+ */
+#define EXT3_IOC32_GETFLAGS FS_IOC32_GETFLAGS
+#define EXT3_IOC32_SETFLAGS FS_IOC32_SETFLAGS
+#define EXT3_IOC32_GETVERSION _IOR('f', 3, int)
+#define EXT3_IOC32_SETVERSION _IOW('f', 4, int)
+#define EXT3_IOC32_GETRSVSZ _IOR('f', 5, int)
+#define EXT3_IOC32_SETRSVSZ _IOW('f', 6, int)
+#define EXT3_IOC32_GROUP_EXTEND _IOW('f', 7, unsigned int)
+#ifdef CONFIG_JBD_DEBUG
+#define EXT3_IOC32_WAIT_FOR_READONLY _IOR('f', 99, int)
+#endif
+#define EXT3_IOC32_GETVERSION_OLD FS_IOC32_GETVERSION
+#define EXT3_IOC32_SETVERSION_OLD FS_IOC32_SETVERSION
+
+
+/*
+ * Mount options
+ */
+struct ext3_mount_options {
+ unsigned long s_mount_opt;
+ kuid_t s_resuid;
+ kgid_t s_resgid;
+ unsigned long s_commit_interval;
+#ifdef CONFIG_QUOTA
+ int s_jquota_fmt;
+ char *s_qf_names[MAXQUOTAS];
+#endif
+};
+
+/*
+ * Structure of an inode on the disk
+ */
+struct ext3_inode {
+ __le16 i_mode; /* File mode */
+ __le16 i_uid; /* Low 16 bits of Owner Uid */
+ __le32 i_size; /* Size in bytes */
+ __le32 i_atime; /* Access time */
+ __le32 i_ctime; /* Creation time */
+ __le32 i_mtime; /* Modification time */
+ __le32 i_dtime; /* Deletion Time */
+ __le16 i_gid; /* Low 16 bits of Group Id */
+ __le16 i_links_count; /* Links count */
+ __le32 i_blocks; /* Blocks count */
+ __le32 i_flags; /* File flags */
+ union {
+ struct {
+ __u32 l_i_reserved1;
+ } linux1;
+ struct {
+ __u32 h_i_translator;
+ } hurd1;
+ struct {
+ __u32 m_i_reserved1;
+ } masix1;
+ } osd1; /* OS dependent 1 */
+ __le32 i_block[EXT3_N_BLOCKS];/* Pointers to blocks */
+ __le32 i_generation; /* File version (for NFS) */
+ __le32 i_file_acl; /* File ACL */
+ __le32 i_dir_acl; /* Directory ACL */
+ __le32 i_faddr; /* Fragment address */
+ union {
+ struct {
+ __u8 l_i_frag; /* Fragment number */
+ __u8 l_i_fsize; /* Fragment size */
+ __u16 i_pad1;
+ __le16 l_i_uid_high; /* these 2 fields */
+ __le16 l_i_gid_high; /* were reserved2[0] */
+ __u32 l_i_reserved2;
+ } linux2;
+ struct {
+ __u8 h_i_frag; /* Fragment number */
+ __u8 h_i_fsize; /* Fragment size */
+ __u16 h_i_mode_high;
+ __u16 h_i_uid_high;
+ __u16 h_i_gid_high;
+ __u32 h_i_author;
+ } hurd2;
+ struct {
+ __u8 m_i_frag; /* Fragment number */
+ __u8 m_i_fsize; /* Fragment size */
+ __u16 m_pad1;
+ __u32 m_i_reserved2[2];
+ } masix2;
+ } osd2; /* OS dependent 2 */
+ __le16 i_extra_isize;
+ __le16 i_pad1;
+};
+
+#define i_size_high i_dir_acl
+
+#define i_reserved1 osd1.linux1.l_i_reserved1
+#define i_frag osd2.linux2.l_i_frag
+#define i_fsize osd2.linux2.l_i_fsize
+#define i_uid_low i_uid
+#define i_gid_low i_gid
+#define i_uid_high osd2.linux2.l_i_uid_high
+#define i_gid_high osd2.linux2.l_i_gid_high
+#define i_reserved2 osd2.linux2.l_i_reserved2
+
+/*
+ * File system states
+ */
+#define EXT3_VALID_FS 0x0001 /* Unmounted cleanly */
+#define EXT3_ERROR_FS 0x0002 /* Errors detected */
+#define EXT3_ORPHAN_FS 0x0004 /* Orphans being recovered */
+
+/*
+ * Misc. filesystem flags
+ */
+#define EXT2_FLAGS_SIGNED_HASH 0x0001 /* Signed dirhash in use */
+#define EXT2_FLAGS_UNSIGNED_HASH 0x0002 /* Unsigned dirhash in use */
+#define EXT2_FLAGS_TEST_FILESYS 0x0004 /* to test development code */
+
+/*
+ * Mount flags
+ */
+#define EXT3_MOUNT_CHECK 0x00001 /* Do mount-time checks */
+/* EXT3_MOUNT_OLDALLOC was there */
+#define EXT3_MOUNT_GRPID 0x00004 /* Create files with directory's group */
+#define EXT3_MOUNT_DEBUG 0x00008 /* Some debugging messages */
+#define EXT3_MOUNT_ERRORS_CONT 0x00010 /* Continue on errors */
+#define EXT3_MOUNT_ERRORS_RO 0x00020 /* Remount fs ro on errors */
+#define EXT3_MOUNT_ERRORS_PANIC 0x00040 /* Panic on errors */
+#define EXT3_MOUNT_MINIX_DF 0x00080 /* Mimics the Minix statfs */
+#define EXT3_MOUNT_NOLOAD 0x00100 /* Don't use existing journal*/
+#define EXT3_MOUNT_ABORT 0x00200 /* Fatal error detected */
+#define EXT3_MOUNT_DATA_FLAGS 0x00C00 /* Mode for data writes: */
+#define EXT3_MOUNT_JOURNAL_DATA 0x00400 /* Write data to journal */
+#define EXT3_MOUNT_ORDERED_DATA 0x00800 /* Flush data before commit */
+#define EXT3_MOUNT_WRITEBACK_DATA 0x00C00 /* No data ordering */
+#define EXT3_MOUNT_UPDATE_JOURNAL 0x01000 /* Update the journal format */
+#define EXT3_MOUNT_NO_UID32 0x02000 /* Disable 32-bit UIDs */
+#define EXT3_MOUNT_XATTR_USER 0x04000 /* Extended user attributes */
+#define EXT3_MOUNT_POSIX_ACL 0x08000 /* POSIX Access Control Lists */
+#define EXT3_MOUNT_RESERVATION 0x10000 /* Preallocation */
+#define EXT3_MOUNT_BARRIER 0x20000 /* Use block barriers */
+#define EXT3_MOUNT_QUOTA 0x80000 /* Some quota option set */
+#define EXT3_MOUNT_USRQUOTA 0x100000 /* "old" user quota */
+#define EXT3_MOUNT_GRPQUOTA 0x200000 /* "old" group quota */
+#define EXT3_MOUNT_DATA_ERR_ABORT 0x400000 /* Abort on file data write
+ * error in ordered mode */
+
+/* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */
+#ifndef _LINUX_EXT2_FS_H
+#define clear_opt(o, opt) o &= ~EXT3_MOUNT_##opt
+#define set_opt(o, opt) o |= EXT3_MOUNT_##opt
+#define test_opt(sb, opt) (EXT3_SB(sb)->s_mount_opt & \
+ EXT3_MOUNT_##opt)
+#else
+#define EXT2_MOUNT_NOLOAD EXT3_MOUNT_NOLOAD
+#define EXT2_MOUNT_ABORT EXT3_MOUNT_ABORT
+#define EXT2_MOUNT_DATA_FLAGS EXT3_MOUNT_DATA_FLAGS
+#endif
+
+#define ext3_set_bit __set_bit_le
+#define ext3_set_bit_atomic ext2_set_bit_atomic
+#define ext3_clear_bit __clear_bit_le
+#define ext3_clear_bit_atomic ext2_clear_bit_atomic
+#define ext3_test_bit test_bit_le
+#define ext3_find_next_zero_bit find_next_zero_bit_le
+
+/*
+ * Maximal mount counts between two filesystem checks
+ */
+#define EXT3_DFL_MAX_MNT_COUNT 20 /* Allow 20 mounts */
+#define EXT3_DFL_CHECKINTERVAL 0 /* Don't use interval check */
+
+/*
+ * Behaviour when detecting errors
+ */
+#define EXT3_ERRORS_CONTINUE 1 /* Continue execution */
+#define EXT3_ERRORS_RO 2 /* Remount fs read-only */
+#define EXT3_ERRORS_PANIC 3 /* Panic */
+#define EXT3_ERRORS_DEFAULT EXT3_ERRORS_CONTINUE
+
+/*
+ * Structure of the super block
+ */
+struct ext3_super_block {
+/*00*/ __le32 s_inodes_count; /* Inodes count */
+ __le32 s_blocks_count; /* Blocks count */
+ __le32 s_r_blocks_count; /* Reserved blocks count */
+ __le32 s_free_blocks_count; /* Free blocks count */
+/*10*/ __le32 s_free_inodes_count; /* Free inodes count */
+ __le32 s_first_data_block; /* First Data Block */
+ __le32 s_log_block_size; /* Block size */
+ __le32 s_log_frag_size; /* Fragment size */
+/*20*/ __le32 s_blocks_per_group; /* # Blocks per group */
+ __le32 s_frags_per_group; /* # Fragments per group */
+ __le32 s_inodes_per_group; /* # Inodes per group */
+ __le32 s_mtime; /* Mount time */
+/*30*/ __le32 s_wtime; /* Write time */
+ __le16 s_mnt_count; /* Mount count */
+ __le16 s_max_mnt_count; /* Maximal mount count */
+ __le16 s_magic; /* Magic signature */
+ __le16 s_state; /* File system state */
+ __le16 s_errors; /* Behaviour when detecting errors */
+ __le16 s_minor_rev_level; /* minor revision level */
+/*40*/ __le32 s_lastcheck; /* time of last check */
+ __le32 s_checkinterval; /* max. time between checks */
+ __le32 s_creator_os; /* OS */
+ __le32 s_rev_level; /* Revision level */
+/*50*/ __le16 s_def_resuid; /* Default uid for reserved blocks */
+ __le16 s_def_resgid; /* Default gid for reserved blocks */
+ /*
+ * These fields are for EXT3_DYNAMIC_REV superblocks only.
+ *
+ * Note: the difference between the compatible feature set and
+ * the incompatible feature set is that if there is a bit set
+ * in the incompatible feature set that the kernel doesn't
+ * know about, it should refuse to mount the filesystem.
+ *
+ * e2fsck's requirements are more strict; if it doesn't know
+ * about a feature in either the compatible or incompatible
+ * feature set, it must abort and not try to meddle with
+ * things it doesn't understand...
+ */
+ __le32 s_first_ino; /* First non-reserved inode */
+ __le16 s_inode_size; /* size of inode structure */
+ __le16 s_block_group_nr; /* block group # of this superblock */
+ __le32 s_feature_compat; /* compatible feature set */
+/*60*/ __le32 s_feature_incompat; /* incompatible feature set */
+ __le32 s_feature_ro_compat; /* readonly-compatible feature set */
+/*68*/ __u8 s_uuid[16]; /* 128-bit uuid for volume */
+/*78*/ char s_volume_name[16]; /* volume name */
+/*88*/ char s_last_mounted[64]; /* directory where last mounted */
+/*C8*/ __le32 s_algorithm_usage_bitmap; /* For compression */
+ /*
+ * Performance hints. Directory preallocation should only
+ * happen if the EXT3_FEATURE_COMPAT_DIR_PREALLOC flag is on.
+ */
+ __u8 s_prealloc_blocks; /* Nr of blocks to try to preallocate*/
+ __u8 s_prealloc_dir_blocks; /* Nr to preallocate for dirs */
+ __le16 s_reserved_gdt_blocks; /* Per group desc for online growth */
+ /*
+ * Journaling support valid if EXT3_FEATURE_COMPAT_HAS_JOURNAL set.
+ */
+/*D0*/ __u8 s_journal_uuid[16]; /* uuid of journal superblock */
+/*E0*/ __le32 s_journal_inum; /* inode number of journal file */
+ __le32 s_journal_dev; /* device number of journal file */
+ __le32 s_last_orphan; /* start of list of inodes to delete */
+ __le32 s_hash_seed[4]; /* HTREE hash seed */
+ __u8 s_def_hash_version; /* Default hash version to use */
+ __u8 s_reserved_char_pad;
+ __u16 s_reserved_word_pad;
+ __le32 s_default_mount_opts;
+ __le32 s_first_meta_bg; /* First metablock block group */
+ __le32 s_mkfs_time; /* When the filesystem was created */
+ __le32 s_jnl_blocks[17]; /* Backup of the journal inode */
+ /* 64bit support valid if EXT4_FEATURE_COMPAT_64BIT */
+/*150*/ __le32 s_blocks_count_hi; /* Blocks count */
+ __le32 s_r_blocks_count_hi; /* Reserved blocks count */
+ __le32 s_free_blocks_count_hi; /* Free blocks count */
+ __le16 s_min_extra_isize; /* All inodes have at least # bytes */
+ __le16 s_want_extra_isize; /* New inodes should reserve # bytes */
+ __le32 s_flags; /* Miscellaneous flags */
+ __le16 s_raid_stride; /* RAID stride */
+ __le16 s_mmp_interval; /* # seconds to wait in MMP checking */
+ __le64 s_mmp_block; /* Block for multi-mount protection */
+ __le32 s_raid_stripe_width; /* blocks on all data disks (N*stride)*/
+ __u8 s_log_groups_per_flex; /* FLEX_BG group size */
+ __u8 s_reserved_char_pad2;
+ __le16 s_reserved_pad;
+ __u32 s_reserved[162]; /* Padding to the end of the block */
+};
+
+/* data type for block offset of block group */
+typedef int ext3_grpblk_t;
+
+/* data type for filesystem-wide blocks number */
+typedef unsigned long ext3_fsblk_t;
+
+#define E3FSBLK "%lu"
+
+struct ext3_reserve_window {
+ ext3_fsblk_t _rsv_start; /* First byte reserved */
+ ext3_fsblk_t _rsv_end; /* Last byte reserved or 0 */
+};
+
+struct ext3_reserve_window_node {
+ struct rb_node rsv_node;
+ __u32 rsv_goal_size;
+ __u32 rsv_alloc_hit;
+ struct ext3_reserve_window rsv_window;
+};
+
+struct ext3_block_alloc_info {
+ /* information about reservation window */
+ struct ext3_reserve_window_node rsv_window_node;
+ /*
+ * was i_next_alloc_block in ext3_inode_info
+ * is the logical (file-relative) number of the
+ * most-recently-allocated block in this file.
+ * We use this for detecting linearly ascending allocation requests.
+ */
+ __u32 last_alloc_logical_block;
+ /*
+ * Was i_next_alloc_goal in ext3_inode_info
+ * is the *physical* companion to i_next_alloc_block.
+ * it the physical block number of the block which was most-recentl
+ * allocated to this file. This give us the goal (target) for the next
+ * allocation when we detect linearly ascending requests.
+ */
+ ext3_fsblk_t last_alloc_physical_block;
+};
+
+#define rsv_start rsv_window._rsv_start
+#define rsv_end rsv_window._rsv_end
+
+/*
+ * third extended file system inode data in memory
+ */
+struct ext3_inode_info {
+ __le32 i_data[15]; /* unconverted */
+ __u32 i_flags;
+#ifdef EXT3_FRAGMENTS
+ __u32 i_faddr;
+ __u8 i_frag_no;
+ __u8 i_frag_size;
+#endif
+ ext3_fsblk_t i_file_acl;
+ __u32 i_dir_acl;
+ __u32 i_dtime;
+
+ /*
+ * i_block_group is the number of the block group which contains
+ * this file's inode. Constant across the lifetime of the inode,
+ * it is ued for making block allocation decisions - we try to
+ * place a file's data blocks near its inode block, and new inodes
+ * near to their parent directory's inode.
+ */
+ __u32 i_block_group;
+ unsigned long i_state_flags; /* Dynamic state flags for ext3 */
+
+ /* block reservation info */
+ struct ext3_block_alloc_info *i_block_alloc_info;
+
+ __u32 i_dir_start_lookup;
+#ifdef CONFIG_EXT3_FS_XATTR
+ /*
+ * Extended attributes can be read independently of the main file
+ * data. Taking i_mutex even when reading would cause contention
+ * between readers of EAs and writers of regular file data, so
+ * instead we synchronize on xattr_sem when reading or changing
+ * EAs.
+ */
+ struct rw_semaphore xattr_sem;
+#endif
+
+ struct list_head i_orphan; /* unlinked but open inodes */
+
+ /*
+ * i_disksize keeps track of what the inode size is ON DISK, not
+ * in memory. During truncate, i_size is set to the new size by
+ * the VFS prior to calling ext3_truncate(), but the filesystem won't
+ * set i_disksize to 0 until the truncate is actually under way.
+ *
+ * The intent is that i_disksize always represents the blocks which
+ * are used by this file. This allows recovery to restart truncate
+ * on orphans if we crash during truncate. We actually write i_disksize
+ * into the on-disk inode when writing inodes out, instead of i_size.
+ *
+ * The only time when i_disksize and i_size may be different is when
+ * a truncate is in progress. The only things which change i_disksize
+ * are ext3_get_block (growth) and ext3_truncate (shrinkth).
+ */
+ loff_t i_disksize;
+
+ /* on-disk additional length */
+ __u16 i_extra_isize;
+
+ /*
+ * truncate_mutex is for serialising ext3_truncate() against
+ * ext3_getblock(). In the 2.4 ext2 design, great chunks of inode's
+ * data tree are chopped off during truncate. We can't do that in
+ * ext3 because whenever we perform intermediate commits during
+ * truncate, the inode and all the metadata blocks *must* be in a
+ * consistent state which allows truncation of the orphans to restart
+ * during recovery. Hence we must fix the get_block-vs-truncate race
+ * by other means, so we have truncate_mutex.
+ */
+ struct mutex truncate_mutex;
+
+ /*
+ * Transactions that contain inode's metadata needed to complete
+ * fsync and fdatasync, respectively.
+ */
+ atomic_t i_sync_tid;
+ atomic_t i_datasync_tid;
+
+ struct inode vfs_inode;
+};
+
+/*
+ * third extended-fs super-block data in memory
+ */
+struct ext3_sb_info {
+ unsigned long s_frag_size; /* Size of a fragment in bytes */
+ unsigned long s_frags_per_block;/* Number of fragments per block */
+ unsigned long s_inodes_per_block;/* Number of inodes per block */
+ unsigned long s_frags_per_group;/* Number of fragments in a group */
+ unsigned long s_blocks_per_group;/* Number of blocks in a group */
+ unsigned long s_inodes_per_group;/* Number of inodes in a group */
+ unsigned long s_itb_per_group; /* Number of inode table blocks per group */
+ unsigned long s_gdb_count; /* Number of group descriptor blocks */
+ unsigned long s_desc_per_block; /* Number of group descriptors per block */
+ unsigned long s_groups_count; /* Number of groups in the fs */
+ unsigned long s_overhead_last; /* Last calculated overhead */
+ unsigned long s_blocks_last; /* Last seen block count */
+ struct buffer_head * s_sbh; /* Buffer containing the super block */
+ struct ext3_super_block * s_es; /* Pointer to the super block in the buffer */
+ struct buffer_head ** s_group_desc;
+ unsigned long s_mount_opt;
+ ext3_fsblk_t s_sb_block;
+ kuid_t s_resuid;
+ kgid_t s_resgid;
+ unsigned short s_mount_state;
+ unsigned short s_pad;
+ int s_addr_per_block_bits;
+ int s_desc_per_block_bits;
+ int s_inode_size;
+ int s_first_ino;
+ spinlock_t s_next_gen_lock;
+ u32 s_next_generation;
+ u32 s_hash_seed[4];
+ int s_def_hash_version;
+ int s_hash_unsigned; /* 3 if hash should be signed, 0 if not */
+ struct percpu_counter s_freeblocks_counter;
+ struct percpu_counter s_freeinodes_counter;
+ struct percpu_counter s_dirs_counter;
+ struct blockgroup_lock *s_blockgroup_lock;
+
+ /* root of the per fs reservation window tree */
+ spinlock_t s_rsv_window_lock;
+ struct rb_root s_rsv_window_root;
+ struct ext3_reserve_window_node s_rsv_window_head;
+
+ /* Journaling */
+ struct inode * s_journal_inode;
+ struct journal_s * s_journal;
+ struct list_head s_orphan;
+ struct mutex s_orphan_lock;
+ struct mutex s_resize_lock;
+ unsigned long s_commit_interval;
+ struct block_device *journal_bdev;
+#ifdef CONFIG_QUOTA
+ char *s_qf_names[MAXQUOTAS]; /* Names of quota files with journalled quota */
+ int s_jquota_fmt; /* Format of quota to use */
+#endif
+};
+
+static inline spinlock_t *
+sb_bgl_lock(struct ext3_sb_info *sbi, unsigned int block_group)
+{
+ return bgl_lock_ptr(sbi->s_blockgroup_lock, block_group);
+}
+
+static inline struct ext3_sb_info * EXT3_SB(struct super_block *sb)
+{
+ return sb->s_fs_info;
+}
+static inline struct ext3_inode_info *EXT3_I(struct inode *inode)
+{
+ return container_of(inode, struct ext3_inode_info, vfs_inode);
+}
+
+static inline int ext3_valid_inum(struct super_block *sb, unsigned long ino)
+{
+ return ino == EXT3_ROOT_INO ||
+ ino == EXT3_JOURNAL_INO ||
+ ino == EXT3_RESIZE_INO ||
+ (ino >= EXT3_FIRST_INO(sb) &&
+ ino <= le32_to_cpu(EXT3_SB(sb)->s_es->s_inodes_count));
+}
+
+/*
+ * Inode dynamic state flags
+ */
+enum {
+ EXT3_STATE_JDATA, /* journaled data exists */
+ EXT3_STATE_NEW, /* inode is newly created */
+ EXT3_STATE_XATTR, /* has in-inode xattrs */
+ EXT3_STATE_FLUSH_ON_CLOSE, /* flush dirty pages on close */
+};
+
+static inline int ext3_test_inode_state(struct inode *inode, int bit)
+{
+ return test_bit(bit, &EXT3_I(inode)->i_state_flags);
+}
+
+static inline void ext3_set_inode_state(struct inode *inode, int bit)
+{
+ set_bit(bit, &EXT3_I(inode)->i_state_flags);
+}
+
+static inline void ext3_clear_inode_state(struct inode *inode, int bit)
+{
+ clear_bit(bit, &EXT3_I(inode)->i_state_flags);
+}
+
+#define NEXT_ORPHAN(inode) EXT3_I(inode)->i_dtime
+
+/*
+ * Codes for operating systems
+ */
+#define EXT3_OS_LINUX 0
+#define EXT3_OS_HURD 1
+#define EXT3_OS_MASIX 2
+#define EXT3_OS_FREEBSD 3
+#define EXT3_OS_LITES 4
+
+/*
+ * Revision levels
+ */
+#define EXT3_GOOD_OLD_REV 0 /* The good old (original) format */
+#define EXT3_DYNAMIC_REV 1 /* V2 format w/ dynamic inode sizes */
+
+#define EXT3_CURRENT_REV EXT3_GOOD_OLD_REV
+#define EXT3_MAX_SUPP_REV EXT3_DYNAMIC_REV
+
+#define EXT3_GOOD_OLD_INODE_SIZE 128
+
+/*
+ * Feature set definitions
+ */
+
+#define EXT3_HAS_COMPAT_FEATURE(sb,mask) \
+ ( EXT3_SB(sb)->s_es->s_feature_compat & cpu_to_le32(mask) )
+#define EXT3_HAS_RO_COMPAT_FEATURE(sb,mask) \
+ ( EXT3_SB(sb)->s_es->s_feature_ro_compat & cpu_to_le32(mask) )
+#define EXT3_HAS_INCOMPAT_FEATURE(sb,mask) \
+ ( EXT3_SB(sb)->s_es->s_feature_incompat & cpu_to_le32(mask) )
+#define EXT3_SET_COMPAT_FEATURE(sb,mask) \
+ EXT3_SB(sb)->s_es->s_feature_compat |= cpu_to_le32(mask)
+#define EXT3_SET_RO_COMPAT_FEATURE(sb,mask) \
+ EXT3_SB(sb)->s_es->s_feature_ro_compat |= cpu_to_le32(mask)
+#define EXT3_SET_INCOMPAT_FEATURE(sb,mask) \
+ EXT3_SB(sb)->s_es->s_feature_incompat |= cpu_to_le32(mask)
+#define EXT3_CLEAR_COMPAT_FEATURE(sb,mask) \
+ EXT3_SB(sb)->s_es->s_feature_compat &= ~cpu_to_le32(mask)
+#define EXT3_CLEAR_RO_COMPAT_FEATURE(sb,mask) \
+ EXT3_SB(sb)->s_es->s_feature_ro_compat &= ~cpu_to_le32(mask)
+#define EXT3_CLEAR_INCOMPAT_FEATURE(sb,mask) \
+ EXT3_SB(sb)->s_es->s_feature_incompat &= ~cpu_to_le32(mask)
+
+#define EXT3_FEATURE_COMPAT_DIR_PREALLOC 0x0001
+#define EXT3_FEATURE_COMPAT_IMAGIC_INODES 0x0002
+#define EXT3_FEATURE_COMPAT_HAS_JOURNAL 0x0004
+#define EXT3_FEATURE_COMPAT_EXT_ATTR 0x0008
+#define EXT3_FEATURE_COMPAT_RESIZE_INODE 0x0010
+#define EXT3_FEATURE_COMPAT_DIR_INDEX 0x0020
+
+#define EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER 0x0001
+#define EXT3_FEATURE_RO_COMPAT_LARGE_FILE 0x0002
+#define EXT3_FEATURE_RO_COMPAT_BTREE_DIR 0x0004
+
+#define EXT3_FEATURE_INCOMPAT_COMPRESSION 0x0001
+#define EXT3_FEATURE_INCOMPAT_FILETYPE 0x0002
+#define EXT3_FEATURE_INCOMPAT_RECOVER 0x0004 /* Needs recovery */
+#define EXT3_FEATURE_INCOMPAT_JOURNAL_DEV 0x0008 /* Journal device */
+#define EXT3_FEATURE_INCOMPAT_META_BG 0x0010
+
+#define EXT3_FEATURE_COMPAT_SUPP EXT2_FEATURE_COMPAT_EXT_ATTR
+#define EXT3_FEATURE_INCOMPAT_SUPP (EXT3_FEATURE_INCOMPAT_FILETYPE| \
+ EXT3_FEATURE_INCOMPAT_RECOVER| \
+ EXT3_FEATURE_INCOMPAT_META_BG)
+#define EXT3_FEATURE_RO_COMPAT_SUPP (EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER| \
+ EXT3_FEATURE_RO_COMPAT_LARGE_FILE| \
+ EXT3_FEATURE_RO_COMPAT_BTREE_DIR)
+
+/*
+ * Default values for user and/or group using reserved blocks
+ */
+#define EXT3_DEF_RESUID 0
+#define EXT3_DEF_RESGID 0
+
+/*
+ * Default mount options
+ */
+#define EXT3_DEFM_DEBUG 0x0001
+#define EXT3_DEFM_BSDGROUPS 0x0002
+#define EXT3_DEFM_XATTR_USER 0x0004
+#define EXT3_DEFM_ACL 0x0008
+#define EXT3_DEFM_UID16 0x0010
+#define EXT3_DEFM_JMODE 0x0060
+#define EXT3_DEFM_JMODE_DATA 0x0020
+#define EXT3_DEFM_JMODE_ORDERED 0x0040
+#define EXT3_DEFM_JMODE_WBACK 0x0060
+
+/*
+ * Structure of a directory entry
+ */
+#define EXT3_NAME_LEN 255
+
+struct ext3_dir_entry {
+ __le32 inode; /* Inode number */
+ __le16 rec_len; /* Directory entry length */
+ __le16 name_len; /* Name length */
+ char name[EXT3_NAME_LEN]; /* File name */
+};
+
+/*
+ * The new version of the directory entry. Since EXT3 structures are
+ * stored in intel byte order, and the name_len field could never be
+ * bigger than 255 chars, it's safe to reclaim the extra byte for the
+ * file_type field.
+ */
+struct ext3_dir_entry_2 {
+ __le32 inode; /* Inode number */
+ __le16 rec_len; /* Directory entry length */
+ __u8 name_len; /* Name length */
+ __u8 file_type;
+ char name[EXT3_NAME_LEN]; /* File name */
+};
+
+/*
+ * Ext3 directory file types. Only the low 3 bits are used. The
+ * other bits are reserved for now.
+ */
+#define EXT3_FT_UNKNOWN 0
+#define EXT3_FT_REG_FILE 1
+#define EXT3_FT_DIR 2
+#define EXT3_FT_CHRDEV 3
+#define EXT3_FT_BLKDEV 4
+#define EXT3_FT_FIFO 5
+#define EXT3_FT_SOCK 6
+#define EXT3_FT_SYMLINK 7
+
+#define EXT3_FT_MAX 8
+
+/*
+ * EXT3_DIR_PAD defines the directory entries boundaries
+ *
+ * NOTE: It must be a multiple of 4
+ */
+#define EXT3_DIR_PAD 4
+#define EXT3_DIR_ROUND (EXT3_DIR_PAD - 1)
+#define EXT3_DIR_REC_LEN(name_len) (((name_len) + 8 + EXT3_DIR_ROUND) & \
+ ~EXT3_DIR_ROUND)
+#define EXT3_MAX_REC_LEN ((1<<16)-1)
+
+/*
+ * Tests against MAX_REC_LEN etc were put in place for 64k block
+ * sizes; if that is not possible on this arch, we can skip
+ * those tests and speed things up.
+ */
+static inline unsigned ext3_rec_len_from_disk(__le16 dlen)
+{
+ unsigned len = le16_to_cpu(dlen);
+
+#if (PAGE_CACHE_SIZE >= 65536)
+ if (len == EXT3_MAX_REC_LEN)
+ return 1 << 16;
+#endif
+ return len;
+}
+
+static inline __le16 ext3_rec_len_to_disk(unsigned len)
+{
+#if (PAGE_CACHE_SIZE >= 65536)
+ if (len == (1 << 16))
+ return cpu_to_le16(EXT3_MAX_REC_LEN);
+ else if (len > (1 << 16))
+ BUG();
+#endif
+ return cpu_to_le16(len);
+}
+
+/*
+ * Hash Tree Directory indexing
+ * (c) Daniel Phillips, 2001
+ */
+
+#define is_dx(dir) (EXT3_HAS_COMPAT_FEATURE(dir->i_sb, \
+ EXT3_FEATURE_COMPAT_DIR_INDEX) && \
+ (EXT3_I(dir)->i_flags & EXT3_INDEX_FL))
+#define EXT3_DIR_LINK_MAX(dir) (!is_dx(dir) && (dir)->i_nlink >= EXT3_LINK_MAX)
+#define EXT3_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || (dir)->i_nlink == 1)
+
+/* Legal values for the dx_root hash_version field: */
+
+#define DX_HASH_LEGACY 0
+#define DX_HASH_HALF_MD4 1
+#define DX_HASH_TEA 2
+#define DX_HASH_LEGACY_UNSIGNED 3
+#define DX_HASH_HALF_MD4_UNSIGNED 4
+#define DX_HASH_TEA_UNSIGNED 5
+
+/* hash info structure used by the directory hash */
+struct dx_hash_info
+{
+ u32 hash;
+ u32 minor_hash;
+ int hash_version;
+ u32 *seed;
+};
+
+
+/* 32 and 64 bit signed EOF for dx directories */
+#define EXT3_HTREE_EOF_32BIT ((1UL << (32 - 1)) - 1)
+#define EXT3_HTREE_EOF_64BIT ((1ULL << (64 - 1)) - 1)
+
+
+/*
+ * Control parameters used by ext3_htree_next_block
+ */
+#define HASH_NB_ALWAYS 1
+
+
+/*
+ * Describe an inode's exact location on disk and in memory
+ */
+struct ext3_iloc
+{
+ struct buffer_head *bh;
+ unsigned long offset;
+ unsigned long block_group;
+};
+
+static inline struct ext3_inode *ext3_raw_inode(struct ext3_iloc *iloc)
+{
+ return (struct ext3_inode *) (iloc->bh->b_data + iloc->offset);
+}
+
+/*
+ * This structure is stuffed into the struct file's private_data field
+ * for directories. It is where we put information so that we can do
+ * readdir operations in hash tree order.
+ */
+struct dir_private_info {
+ struct rb_root root;
+ struct rb_node *curr_node;
+ struct fname *extra_fname;
+ loff_t last_pos;
+ __u32 curr_hash;
+ __u32 curr_minor_hash;
+ __u32 next_hash;
+};
+
+/* calculate the first block number of the group */
+static inline ext3_fsblk_t
+ext3_group_first_block_no(struct super_block *sb, unsigned long group_no)
+{
+ return group_no * (ext3_fsblk_t)EXT3_BLOCKS_PER_GROUP(sb) +
+ le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block);
+}
+
+/*
+ * Special error return code only used by dx_probe() and its callers.
+ */
+#define ERR_BAD_DX_DIR -75000
+
+/*
+ * Function prototypes
+ */
+
+/*
+ * Ok, these declarations are also in <linux/kernel.h> but none of the
+ * ext3 source programs needs to include it so they are duplicated here.
+ */
+# define NORET_TYPE /**/
+# define ATTRIB_NORET __attribute__((noreturn))
+# define NORET_AND noreturn,
+
+/* balloc.c */
+extern int ext3_bg_has_super(struct super_block *sb, int group);
+extern unsigned long ext3_bg_num_gdb(struct super_block *sb, int group);
+extern ext3_fsblk_t ext3_new_block (handle_t *handle, struct inode *inode,
+ ext3_fsblk_t goal, int *errp);
+extern ext3_fsblk_t ext3_new_blocks (handle_t *handle, struct inode *inode,
+ ext3_fsblk_t goal, unsigned long *count, int *errp);
+extern void ext3_free_blocks (handle_t *handle, struct inode *inode,
+ ext3_fsblk_t block, unsigned long count);
+extern void ext3_free_blocks_sb (handle_t *handle, struct super_block *sb,
+ ext3_fsblk_t block, unsigned long count,
+ unsigned long *pdquot_freed_blocks);
+extern ext3_fsblk_t ext3_count_free_blocks (struct super_block *);
+extern void ext3_check_blocks_bitmap (struct super_block *);
+extern struct ext3_group_desc * ext3_get_group_desc(struct super_block * sb,
+ unsigned int block_group,
+ struct buffer_head ** bh);
+extern int ext3_should_retry_alloc(struct super_block *sb, int *retries);
+extern void ext3_init_block_alloc_info(struct inode *);
+extern void ext3_rsv_window_add(struct super_block *sb, struct ext3_reserve_window_node *rsv);
+extern int ext3_trim_fs(struct super_block *sb, struct fstrim_range *range);
+
+/* dir.c */
+extern int ext3_check_dir_entry(const char *, struct inode *,
+ struct ext3_dir_entry_2 *,
+ struct buffer_head *, unsigned long);
+extern int ext3_htree_store_dirent(struct file *dir_file, __u32 hash,
+ __u32 minor_hash,
+ struct ext3_dir_entry_2 *dirent);
+extern void ext3_htree_free_dir_info(struct dir_private_info *p);
+
+/* fsync.c */
+extern int ext3_sync_file(struct file *, loff_t, loff_t, int);
+
+/* hash.c */
+extern int ext3fs_dirhash(const char *name, int len, struct
+ dx_hash_info *hinfo);
+
+/* ialloc.c */
+extern struct inode * ext3_new_inode (handle_t *, struct inode *,
+ const struct qstr *, umode_t);
+extern void ext3_free_inode (handle_t *, struct inode *);
+extern struct inode * ext3_orphan_get (struct super_block *, unsigned long);
+extern unsigned long ext3_count_free_inodes (struct super_block *);
+extern unsigned long ext3_count_dirs (struct super_block *);
+extern void ext3_check_inodes_bitmap (struct super_block *);
+extern unsigned long ext3_count_free (struct buffer_head *, unsigned);
+
+
+/* inode.c */
+int ext3_forget(handle_t *handle, int is_metadata, struct inode *inode,
+ struct buffer_head *bh, ext3_fsblk_t blocknr);
+struct buffer_head * ext3_getblk (handle_t *, struct inode *, long, int, int *);
+struct buffer_head * ext3_bread (handle_t *, struct inode *, int, int, int *);
+int ext3_get_blocks_handle(handle_t *handle, struct inode *inode,
+ sector_t iblock, unsigned long maxblocks, struct buffer_head *bh_result,
+ int create);
+
+extern struct inode *ext3_iget(struct super_block *, unsigned long);
+extern int ext3_write_inode (struct inode *, struct writeback_control *);
+extern int ext3_setattr (struct dentry *, struct iattr *);
+extern void ext3_evict_inode (struct inode *);
+extern int ext3_sync_inode (handle_t *, struct inode *);
+extern void ext3_discard_reservation (struct inode *);
+extern void ext3_dirty_inode(struct inode *, int);
+extern int ext3_change_inode_journal_flag(struct inode *, int);
+extern int ext3_get_inode_loc(struct inode *, struct ext3_iloc *);
+extern int ext3_can_truncate(struct inode *inode);
+extern void ext3_truncate(struct inode *inode);
+extern void ext3_set_inode_flags(struct inode *);
+extern void ext3_get_inode_flags(struct ext3_inode_info *);
+extern void ext3_set_aops(struct inode *inode);
+extern int ext3_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
+ u64 start, u64 len);
+
+/* ioctl.c */
+extern long ext3_ioctl(struct file *, unsigned int, unsigned long);
+extern long ext3_compat_ioctl(struct file *, unsigned int, unsigned long);
+
+/* namei.c */
+extern int ext3_orphan_add(handle_t *, struct inode *);
+extern int ext3_orphan_del(handle_t *, struct inode *);
+extern int ext3_htree_fill_tree(struct file *dir_file, __u32 start_hash,
+ __u32 start_minor_hash, __u32 *next_hash);
+
+/* resize.c */
+extern int ext3_group_add(struct super_block *sb,
+ struct ext3_new_group_data *input);
+extern int ext3_group_extend(struct super_block *sb,
+ struct ext3_super_block *es,
+ ext3_fsblk_t n_blocks_count);
+
+/* super.c */
+extern __printf(3, 4)
+void ext3_error(struct super_block *, const char *, const char *, ...);
+extern void __ext3_std_error (struct super_block *, const char *, int);
+extern __printf(3, 4)
+void ext3_abort(struct super_block *, const char *, const char *, ...);
+extern __printf(3, 4)
+void ext3_warning(struct super_block *, const char *, const char *, ...);
+extern __printf(3, 4)
+void ext3_msg(struct super_block *, const char *, const char *, ...);
+extern void ext3_update_dynamic_rev (struct super_block *sb);
+
+#define ext3_std_error(sb, errno) \
+do { \
+ if ((errno)) \
+ __ext3_std_error((sb), __func__, (errno)); \
+} while (0)
+
+/*
+ * Inodes and files operations
+ */
+
+/* dir.c */
+extern const struct file_operations ext3_dir_operations;
+
+/* file.c */
+extern const struct inode_operations ext3_file_inode_operations;
+extern const struct file_operations ext3_file_operations;
+
+/* namei.c */
+extern const struct inode_operations ext3_dir_inode_operations;
+extern const struct inode_operations ext3_special_inode_operations;
+
+/* symlink.c */
+extern const struct inode_operations ext3_symlink_inode_operations;
+extern const struct inode_operations ext3_fast_symlink_inode_operations;
+
+#define EXT3_JOURNAL(inode) (EXT3_SB((inode)->i_sb)->s_journal)
+
+/* Define the number of blocks we need to account to a transaction to
+ * modify one block of data.
+ *
+ * We may have to touch one inode, one bitmap buffer, up to three
+ * indirection blocks, the group and superblock summaries, and the data
+ * block to complete the transaction. */
+
+#define EXT3_SINGLEDATA_TRANS_BLOCKS 8U
+
+/* Extended attribute operations touch at most two data buffers,
+ * two bitmap buffers, and two group summaries, in addition to the inode
+ * and the superblock, which are already accounted for. */
+
+#define EXT3_XATTR_TRANS_BLOCKS 6U
+
+/* Define the minimum size for a transaction which modifies data. This
+ * needs to take into account the fact that we may end up modifying two
+ * quota files too (one for the group, one for the user quota). The
+ * superblock only gets updated once, of course, so don't bother
+ * counting that again for the quota updates. */
+
+#define EXT3_DATA_TRANS_BLOCKS(sb) (EXT3_SINGLEDATA_TRANS_BLOCKS + \
+ EXT3_XATTR_TRANS_BLOCKS - 2 + \
+ EXT3_MAXQUOTAS_TRANS_BLOCKS(sb))
+
+/* Delete operations potentially hit one directory's namespace plus an
+ * entire inode, plus arbitrary amounts of bitmap/indirection data. Be
+ * generous. We can grow the delete transaction later if necessary. */
+
+#define EXT3_DELETE_TRANS_BLOCKS(sb) (EXT3_MAXQUOTAS_TRANS_BLOCKS(sb) + 64)
+
+/* Define an arbitrary limit for the amount of data we will anticipate
+ * writing to any given transaction. For unbounded transactions such as
+ * write(2) and truncate(2) we can write more than this, but we always
+ * start off at the maximum transaction size and grow the transaction
+ * optimistically as we go. */
+
+#define EXT3_MAX_TRANS_DATA 64U
+
+/* We break up a large truncate or write transaction once the handle's
+ * buffer credits gets this low, we need either to extend the
+ * transaction or to start a new one. Reserve enough space here for
+ * inode, bitmap, superblock, group and indirection updates for at least
+ * one block, plus two quota updates. Quota allocations are not
+ * needed. */
+
+#define EXT3_RESERVE_TRANS_BLOCKS 12U
+
+#define EXT3_INDEX_EXTRA_TRANS_BLOCKS 8
+
+#ifdef CONFIG_QUOTA
+/* Amount of blocks needed for quota update - we know that the structure was
+ * allocated so we need to update only inode+data */
+#define EXT3_QUOTA_TRANS_BLOCKS(sb) (test_opt(sb, QUOTA) ? 2 : 0)
+/* Amount of blocks needed for quota insert/delete - we do some block writes
+ * but inode, sb and group updates are done only once */
+#define EXT3_QUOTA_INIT_BLOCKS(sb) (test_opt(sb, QUOTA) ? (DQUOT_INIT_ALLOC*\
+ (EXT3_SINGLEDATA_TRANS_BLOCKS-3)+3+DQUOT_INIT_REWRITE) : 0)
+#define EXT3_QUOTA_DEL_BLOCKS(sb) (test_opt(sb, QUOTA) ? (DQUOT_DEL_ALLOC*\
+ (EXT3_SINGLEDATA_TRANS_BLOCKS-3)+3+DQUOT_DEL_REWRITE) : 0)
+#else
+#define EXT3_QUOTA_TRANS_BLOCKS(sb) 0
+#define EXT3_QUOTA_INIT_BLOCKS(sb) 0
+#define EXT3_QUOTA_DEL_BLOCKS(sb) 0
+#endif
+#define EXT3_MAXQUOTAS_TRANS_BLOCKS(sb) (MAXQUOTAS*EXT3_QUOTA_TRANS_BLOCKS(sb))
+#define EXT3_MAXQUOTAS_INIT_BLOCKS(sb) (MAXQUOTAS*EXT3_QUOTA_INIT_BLOCKS(sb))
+#define EXT3_MAXQUOTAS_DEL_BLOCKS(sb) (MAXQUOTAS*EXT3_QUOTA_DEL_BLOCKS(sb))
+
+int
+ext3_mark_iloc_dirty(handle_t *handle,
+ struct inode *inode,
+ struct ext3_iloc *iloc);
+
+/*
+ * On success, We end up with an outstanding reference count against
+ * iloc->bh. This _must_ be cleaned up later.
+ */
+
+int ext3_reserve_inode_write(handle_t *handle, struct inode *inode,
+ struct ext3_iloc *iloc);
+
+int ext3_mark_inode_dirty(handle_t *handle, struct inode *inode);
+
+/*
+ * Wrapper functions with which ext3 calls into JBD. The intent here is
+ * to allow these to be turned into appropriate stubs so ext3 can control
+ * ext2 filesystems, so ext2+ext3 systems only nee one fs. This work hasn't
+ * been done yet.
+ */
+
+static inline void ext3_journal_release_buffer(handle_t *handle,
+ struct buffer_head *bh)
+{
+ journal_release_buffer(handle, bh);
+}
+
+void ext3_journal_abort_handle(const char *caller, const char *err_fn,
+ struct buffer_head *bh, handle_t *handle, int err);
+
+int __ext3_journal_get_undo_access(const char *where, handle_t *handle,
+ struct buffer_head *bh);
+
+int __ext3_journal_get_write_access(const char *where, handle_t *handle,
+ struct buffer_head *bh);
+
+int __ext3_journal_forget(const char *where, handle_t *handle,
+ struct buffer_head *bh);
+
+int __ext3_journal_revoke(const char *where, handle_t *handle,
+ unsigned long blocknr, struct buffer_head *bh);
+
+int __ext3_journal_get_create_access(const char *where,
+ handle_t *handle, struct buffer_head *bh);
+
+int __ext3_journal_dirty_metadata(const char *where,
+ handle_t *handle, struct buffer_head *bh);
+
+#define ext3_journal_get_undo_access(handle, bh) \
+ __ext3_journal_get_undo_access(__func__, (handle), (bh))
+#define ext3_journal_get_write_access(handle, bh) \
+ __ext3_journal_get_write_access(__func__, (handle), (bh))
+#define ext3_journal_revoke(handle, blocknr, bh) \
+ __ext3_journal_revoke(__func__, (handle), (blocknr), (bh))
+#define ext3_journal_get_create_access(handle, bh) \
+ __ext3_journal_get_create_access(__func__, (handle), (bh))
+#define ext3_journal_dirty_metadata(handle, bh) \
+ __ext3_journal_dirty_metadata(__func__, (handle), (bh))
+#define ext3_journal_forget(handle, bh) \
+ __ext3_journal_forget(__func__, (handle), (bh))
+
+int ext3_journal_dirty_data(handle_t *handle, struct buffer_head *bh);
+
+handle_t *ext3_journal_start_sb(struct super_block *sb, int nblocks);
+int __ext3_journal_stop(const char *where, handle_t *handle);
+
+static inline handle_t *ext3_journal_start(struct inode *inode, int nblocks)
+{
+ return ext3_journal_start_sb(inode->i_sb, nblocks);
+}
+
+#define ext3_journal_stop(handle) \
+ __ext3_journal_stop(__func__, (handle))
+
+static inline handle_t *ext3_journal_current_handle(void)
+{
+ return journal_current_handle();
+}
+
+static inline int ext3_journal_extend(handle_t *handle, int nblocks)
+{
+ return journal_extend(handle, nblocks);
+}
+
+static inline int ext3_journal_restart(handle_t *handle, int nblocks)
+{
+ return journal_restart(handle, nblocks);
+}
+
+static inline int ext3_journal_blocks_per_page(struct inode *inode)
+{
+ return journal_blocks_per_page(inode);
+}
+
+static inline int ext3_journal_force_commit(journal_t *journal)
+{
+ return journal_force_commit(journal);
+}
+
+/* super.c */
+int ext3_force_commit(struct super_block *sb);
+
+static inline int ext3_should_journal_data(struct inode *inode)
+{
+ if (!S_ISREG(inode->i_mode))
+ return 1;
+ if (test_opt(inode->i_sb, DATA_FLAGS) == EXT3_MOUNT_JOURNAL_DATA)
+ return 1;
+ if (EXT3_I(inode)->i_flags & EXT3_JOURNAL_DATA_FL)
+ return 1;
+ return 0;
+}
+
+static inline int ext3_should_order_data(struct inode *inode)
+{
+ if (!S_ISREG(inode->i_mode))
+ return 0;
+ if (EXT3_I(inode)->i_flags & EXT3_JOURNAL_DATA_FL)
+ return 0;
+ if (test_opt(inode->i_sb, DATA_FLAGS) == EXT3_MOUNT_ORDERED_DATA)
+ return 1;
+ return 0;
+}
+
+static inline int ext3_should_writeback_data(struct inode *inode)
+{
+ if (!S_ISREG(inode->i_mode))
+ return 0;
+ if (EXT3_I(inode)->i_flags & EXT3_JOURNAL_DATA_FL)
+ return 0;
+ if (test_opt(inode->i_sb, DATA_FLAGS) == EXT3_MOUNT_WRITEBACK_DATA)
+ return 1;
+ return 0;
+}
+
+#include <trace/events/ext3.h>
diff --git a/fs/ext3/ext3_jbd.c b/fs/ext3/ext3_jbd.c
index d401f148d74..785a3261a26 100644
--- a/fs/ext3/ext3_jbd.c
+++ b/fs/ext3/ext3_jbd.c
@@ -2,7 +2,7 @@
* Interface between ext3 and JBD
*/
-#include <linux/ext3_jbd.h>
+#include "ext3.h"
int __ext3_journal_get_undo_access(const char *where, handle_t *handle,
struct buffer_head *bh)
diff --git a/fs/ext3/file.c b/fs/ext3/file.c
index f55df0e61cb..a062fa1e1b1 100644
--- a/fs/ext3/file.c
+++ b/fs/ext3/file.c
@@ -18,12 +18,8 @@
* (jj@sunsite.ms.mff.cuni.cz)
*/
-#include <linux/time.h>
-#include <linux/fs.h>
-#include <linux/jbd.h>
#include <linux/quotaops.h>
-#include <linux/ext3_fs.h>
-#include <linux/ext3_jbd.h>
+#include "ext3.h"
#include "xattr.h"
#include "acl.h"
@@ -54,10 +50,10 @@ static int ext3_release_file (struct inode * inode, struct file * filp)
const struct file_operations ext3_file_operations = {
.llseek = generic_file_llseek,
- .read = do_sync_read,
- .write = do_sync_write,
- .aio_read = generic_file_aio_read,
- .aio_write = generic_file_aio_write,
+ .read = new_sync_read,
+ .write = new_sync_write,
+ .read_iter = generic_file_read_iter,
+ .write_iter = generic_file_write_iter,
.unlocked_ioctl = ext3_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = ext3_compat_ioctl,
@@ -67,11 +63,10 @@ const struct file_operations ext3_file_operations = {
.release = ext3_release_file,
.fsync = ext3_sync_file,
.splice_read = generic_file_splice_read,
- .splice_write = generic_file_splice_write,
+ .splice_write = iter_file_splice_write,
};
const struct inode_operations ext3_file_inode_operations = {
- .truncate = ext3_truncate,
.setattr = ext3_setattr,
#ifdef CONFIG_EXT3_FS_XATTR
.setxattr = generic_setxattr,
@@ -79,7 +74,8 @@ const struct inode_operations ext3_file_inode_operations = {
.listxattr = ext3_listxattr,
.removexattr = generic_removexattr,
#endif
- .check_acl = ext3_check_acl,
+ .get_acl = ext3_get_acl,
+ .set_acl = ext3_set_acl,
.fiemap = ext3_fiemap,
};
diff --git a/fs/ext3/fsync.c b/fs/ext3/fsync.c
index 09b13bb34c9..1cb9c7e10c6 100644
--- a/fs/ext3/fsync.c
+++ b/fs/ext3/fsync.c
@@ -22,14 +22,9 @@
* we can depend on generic_block_fdatasync() to sync the data blocks.
*/
-#include <linux/time.h>
#include <linux/blkdev.h>
-#include <linux/fs.h>
-#include <linux/sched.h>
#include <linux/writeback.h>
-#include <linux/jbd.h>
-#include <linux/ext3_fs.h>
-#include <linux/ext3_jbd.h>
+#include "ext3.h"
/*
* akpm: A new design for ext3_sync_file().
@@ -43,7 +38,7 @@
* inode to disk.
*/
-int ext3_sync_file(struct file *file, int datasync)
+int ext3_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
{
struct inode *inode = file->f_mapping->host;
struct ext3_inode_info *ei = EXT3_I(inode);
@@ -51,8 +46,18 @@ int ext3_sync_file(struct file *file, int datasync)
int ret, needs_barrier = 0;
tid_t commit_tid;
- if (inode->i_sb->s_flags & MS_RDONLY)
+ trace_ext3_sync_file_enter(file, datasync);
+
+ if (inode->i_sb->s_flags & MS_RDONLY) {
+ /* Make sure that we read updated state */
+ smp_rmb();
+ if (EXT3_SB(inode->i_sb)->s_mount_state & EXT3_ERROR_FS)
+ return -EROFS;
return 0;
+ }
+ ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
+ if (ret)
+ goto out;
J_ASSERT(ext3_journal_current_handle() == NULL);
@@ -70,8 +75,10 @@ int ext3_sync_file(struct file *file, int datasync)
* (they were dirtied by commit). But that's OK - the blocks are
* safe in-journal, which is all fsync() needs to ensure.
*/
- if (ext3_should_journal_data(inode))
- return ext3_force_commit(inode->i_sb);
+ if (ext3_should_journal_data(inode)) {
+ ret = ext3_force_commit(inode->i_sb);
+ goto out;
+ }
if (datasync)
commit_tid = atomic_read(&ei->i_datasync_tid);
@@ -89,7 +96,14 @@ int ext3_sync_file(struct file *file, int datasync)
* disk caches manually so that data really is on persistent
* storage
*/
- if (needs_barrier)
- blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
+ if (needs_barrier) {
+ int err;
+
+ err = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
+ if (!ret)
+ ret = err;
+ }
+out:
+ trace_ext3_sync_file_exit(inode, ret);
return ret;
}
diff --git a/fs/ext3/hash.c b/fs/ext3/hash.c
index 7d215b4d4f2..ede315cdf12 100644
--- a/fs/ext3/hash.c
+++ b/fs/ext3/hash.c
@@ -9,9 +9,7 @@
* License.
*/
-#include <linux/fs.h>
-#include <linux/jbd.h>
-#include <linux/ext3_fs.h>
+#include "ext3.h"
#include <linux/cryptohash.h>
#define DELTA 0x9E3779B9
@@ -200,8 +198,8 @@ int ext3fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo)
return -1;
}
hash = hash & ~1;
- if (hash == (EXT3_HTREE_EOF << 1))
- hash = (EXT3_HTREE_EOF-1) << 1;
+ if (hash == (EXT3_HTREE_EOF_32BIT << 1))
+ hash = (EXT3_HTREE_EOF_32BIT - 1) << 1;
hinfo->hash = hash;
hinfo->minor_hash = minor_hash;
return 0;
diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c
index 9724aef2246..a1b810230cc 100644
--- a/fs/ext3/ialloc.c
+++ b/fs/ext3/ialloc.c
@@ -12,20 +12,10 @@
* David S. Miller (davem@caip.rutgers.edu), 1995
*/
-#include <linux/time.h>
-#include <linux/fs.h>
-#include <linux/jbd.h>
-#include <linux/ext3_fs.h>
-#include <linux/ext3_jbd.h>
-#include <linux/stat.h>
-#include <linux/string.h>
#include <linux/quotaops.h>
-#include <linux/buffer_head.h>
#include <linux/random.h>
-#include <linux/bitops.h>
-
-#include <asm/byteorder.h>
+#include "ext3.h"
#include "xattr.h"
#include "acl.h"
@@ -118,6 +108,7 @@ void ext3_free_inode (handle_t *handle, struct inode * inode)
ino = inode->i_ino;
ext3_debug ("freeing inode %lu\n", ino);
+ trace_ext3_free_inode(inode);
is_directory = S_ISDIR(inode->i_mode);
@@ -176,42 +167,6 @@ error_return:
}
/*
- * There are two policies for allocating an inode. If the new inode is
- * a directory, then a forward search is made for a block group with both
- * free space and a low directory-to-inode ratio; if that fails, then of
- * the groups with above-average free space, that group with the fewest
- * directories already is chosen.
- *
- * For other inodes, search forward from the parent directory\'s block
- * group to find a free inode.
- */
-static int find_group_dir(struct super_block *sb, struct inode *parent)
-{
- int ngroups = EXT3_SB(sb)->s_groups_count;
- unsigned int freei, avefreei;
- struct ext3_group_desc *desc, *best_desc = NULL;
- int group, best_group = -1;
-
- freei = percpu_counter_read_positive(&EXT3_SB(sb)->s_freeinodes_counter);
- avefreei = freei / ngroups;
-
- for (group = 0; group < ngroups; group++) {
- desc = ext3_get_group_desc (sb, group, NULL);
- if (!desc || !desc->bg_free_inodes_count)
- continue;
- if (le16_to_cpu(desc->bg_free_inodes_count) < avefreei)
- continue;
- if (!best_desc ||
- (le16_to_cpu(desc->bg_free_blocks_count) >
- le16_to_cpu(best_desc->bg_free_blocks_count))) {
- best_group = group;
- best_desc = desc;
- }
- }
- return best_group;
-}
-
-/*
* Orlov's allocator for directories.
*
* We always try to spread first-level directories.
@@ -225,8 +180,7 @@ static int find_group_dir(struct super_block *sb, struct inode *parent)
* It's OK to put directory into a group unless
* it has too many directories already (max_dirs) or
* it has too few free inodes left (min_inodes) or
- * it has too few free blocks left (min_blocks) or
- * it's already running too large debt (max_debt).
+ * it has too few free blocks left (min_blocks).
* Parent's group is preferred, if it doesn't satisfy these
* conditions we search cyclically through the rest. If none
* of the groups look good we just look for a group with more
@@ -236,21 +190,16 @@ static int find_group_dir(struct super_block *sb, struct inode *parent)
* when we allocate an inode, within 0--255.
*/
-#define INODE_COST 64
-#define BLOCK_COST 256
-
static int find_group_orlov(struct super_block *sb, struct inode *parent)
{
int parent_group = EXT3_I(parent)->i_block_group;
struct ext3_sb_info *sbi = EXT3_SB(sb);
- struct ext3_super_block *es = sbi->s_es;
int ngroups = sbi->s_groups_count;
int inodes_per_group = EXT3_INODES_PER_GROUP(sb);
unsigned int freei, avefreei;
ext3_fsblk_t freeb, avefreeb;
- ext3_fsblk_t blocks_per_dir;
unsigned int ndirs;
- int max_debt, max_dirs, min_inodes;
+ int max_dirs, min_inodes;
ext3_grpblk_t min_blocks;
int group = -1, i;
struct ext3_group_desc *desc;
@@ -266,7 +215,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent)
int best_ndir = inodes_per_group;
int best_group = -1;
- get_random_bytes(&group, sizeof(group));
+ group = prandom_u32();
parent_group = (unsigned)group % ngroups;
for (i = 0; i < ngroups; i++) {
group = (parent_group + i) % ngroups;
@@ -287,20 +236,10 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent)
goto fallback;
}
- blocks_per_dir = (le32_to_cpu(es->s_blocks_count) - freeb) / ndirs;
-
max_dirs = ndirs / ngroups + inodes_per_group / 16;
min_inodes = avefreei - inodes_per_group / 4;
min_blocks = avefreeb - EXT3_BLOCKS_PER_GROUP(sb) / 4;
- max_debt = EXT3_BLOCKS_PER_GROUP(sb) / max(blocks_per_dir, (ext3_fsblk_t)BLOCK_COST);
- if (max_debt * INODE_COST > inodes_per_group)
- max_debt = inodes_per_group / INODE_COST;
- if (max_debt > 255)
- max_debt = 255;
- if (max_debt == 0)
- max_debt = 1;
-
for (i = 0; i < ngroups; i++) {
group = (parent_group + i) % ngroups;
desc = ext3_get_group_desc (sb, group, NULL);
@@ -404,7 +343,8 @@ static int find_group_other(struct super_block *sb, struct inode *parent)
* For other inodes, search forward from the parent directory's block
* group to find a free inode.
*/
-struct inode *ext3_new_inode(handle_t *handle, struct inode * dir, int mode)
+struct inode *ext3_new_inode(handle_t *handle, struct inode * dir,
+ const struct qstr *qstr, umode_t mode)
{
struct super_block *sb;
struct buffer_head *bitmap_bh = NULL;
@@ -425,6 +365,7 @@ struct inode *ext3_new_inode(handle_t *handle, struct inode * dir, int mode)
return ERR_PTR(-EPERM);
sb = dir->i_sb;
+ trace_ext3_request_inode(dir, mode);
inode = new_inode(sb);
if (!inode)
return ERR_PTR(-ENOMEM);
@@ -432,12 +373,9 @@ struct inode *ext3_new_inode(handle_t *handle, struct inode * dir, int mode)
sbi = EXT3_SB(sb);
es = sbi->s_es;
- if (S_ISDIR(mode)) {
- if (test_opt (sb, OLDALLOC))
- group = find_group_dir(sb, dir);
- else
- group = find_group_orlov(sb, dir);
- } else
+ if (S_ISDIR(mode))
+ group = find_group_orlov(sb, dir);
+ else
group = find_group_other(sb, dir);
err = -ENOSPC;
@@ -560,8 +498,12 @@ got:
if (IS_DIRSYNC(inode))
handle->h_sync = 1;
if (insert_inode_locked(inode) < 0) {
- err = -EINVAL;
- goto fail_drop;
+ /*
+ * Likely a bitmap corruption causing inode to be allocated
+ * twice.
+ */
+ err = -EIO;
+ goto fail;
}
spin_lock(&sbi->s_next_gen_lock);
inode->i_generation = sbi->s_next_generation++;
@@ -589,7 +531,7 @@ got:
if (err)
goto fail_free_drop;
- err = ext3_init_security(handle,inode, dir);
+ err = ext3_init_security(handle, inode, dir, qstr);
if (err)
goto fail_free_drop;
@@ -600,6 +542,7 @@ got:
}
ext3_debug("allocating inode %lu\n", inode->i_ino);
+ trace_ext3_allocate_inode(inode, dir, mode);
goto really_out;
fail:
ext3_std_error(sb, err);
@@ -616,7 +559,7 @@ fail_free_drop:
fail_drop:
dquot_drop(inode);
inode->i_flags |= S_NOQUOTA;
- inode->i_nlink = 0;
+ clear_nlink(inode);
unlock_new_inode(inode);
iput(inode);
brelse(bitmap_bh);
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index a9580617edd..2c6ccc49ba2 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -22,26 +22,18 @@
* Assorted race fixes, rewrite of ext3_get_block() by Al Viro, 2000
*/
-#include <linux/module.h>
-#include <linux/fs.h>
-#include <linux/time.h>
-#include <linux/ext3_jbd.h>
-#include <linux/jbd.h>
#include <linux/highuid.h>
-#include <linux/pagemap.h>
#include <linux/quotaops.h>
-#include <linux/string.h>
-#include <linux/buffer_head.h>
#include <linux/writeback.h>
#include <linux/mpage.h>
-#include <linux/uio.h>
-#include <linux/bio.h>
-#include <linux/fiemap.h>
#include <linux/namei.h>
+#include <linux/aio.h>
+#include "ext3.h"
#include "xattr.h"
#include "acl.h"
static int ext3_writepage_trans_blocks(struct inode *inode);
+static int ext3_block_truncate_page(struct inode *inode, loff_t from);
/*
* Test whether an inode is a fast symlink.
@@ -70,6 +62,7 @@ int ext3_forget(handle_t *handle, int is_metadata, struct inode *inode,
might_sleep();
+ trace_ext3_forget(inode, is_metadata, blocknr);
BUFFER_TRACE(bh, "enter");
jbd_debug(4, "forgetting bh %p: is_metadata = %d, mode %o, "
@@ -194,20 +187,52 @@ static int truncate_restart_transaction(handle_t *handle, struct inode *inode)
*/
void ext3_evict_inode (struct inode *inode)
{
+ struct ext3_inode_info *ei = EXT3_I(inode);
struct ext3_block_alloc_info *rsv;
handle_t *handle;
int want_delete = 0;
+ trace_ext3_evict_inode(inode);
if (!inode->i_nlink && !is_bad_inode(inode)) {
dquot_initialize(inode);
want_delete = 1;
}
- truncate_inode_pages(&inode->i_data, 0);
+ /*
+ * When journalling data dirty buffers are tracked only in the journal.
+ * So although mm thinks everything is clean and ready for reaping the
+ * inode might still have some pages to write in the running
+ * transaction or waiting to be checkpointed. Thus calling
+ * journal_invalidatepage() (via truncate_inode_pages()) to discard
+ * these buffers can cause data loss. Also even if we did not discard
+ * these buffers, we would have no way to find them after the inode
+ * is reaped and thus user could see stale data if he tries to read
+ * them before the transaction is checkpointed. So be careful and
+ * force everything to disk here... We use ei->i_datasync_tid to
+ * store the newest transaction containing inode's data.
+ *
+ * Note that directories do not have this problem because they don't
+ * use page cache.
+ *
+ * The s_journal check handles the case when ext3_get_journal() fails
+ * and puts the journal inode.
+ */
+ if (inode->i_nlink && ext3_should_journal_data(inode) &&
+ EXT3_SB(inode->i_sb)->s_journal &&
+ (S_ISLNK(inode->i_mode) || S_ISREG(inode->i_mode)) &&
+ inode->i_ino != EXT3_JOURNAL_INO) {
+ tid_t commit_tid = atomic_read(&ei->i_datasync_tid);
+ journal_t *journal = EXT3_SB(inode->i_sb)->s_journal;
+
+ log_start_commit(journal, commit_tid);
+ log_wait_commit(journal, commit_tid);
+ filemap_write_and_wait(&inode->i_data);
+ }
+ truncate_inode_pages_final(&inode->i_data);
ext3_discard_reservation(inode);
- rsv = EXT3_I(inode)->i_block_alloc_info;
- EXT3_I(inode)->i_block_alloc_info = NULL;
+ rsv = ei->i_block_alloc_info;
+ ei->i_block_alloc_info = NULL;
if (unlikely(rsv))
kfree(rsv);
@@ -231,15 +256,13 @@ void ext3_evict_inode (struct inode *inode)
if (inode->i_blocks)
ext3_truncate(inode);
/*
- * Kill off the orphan record which ext3_truncate created.
- * AKPM: I think this can be inside the above `if'.
- * Note that ext3_orphan_del() has to be able to cope with the
- * deletion of a non-existent orphan - this is because we don't
- * know if ext3_truncate() actually created an orphan record.
- * (Well, we could do this if we need to, but heck - it works)
+ * Kill off the orphan record created when the inode lost the last
+ * link. Note that ext3_orphan_del() has to be able to cope with the
+ * deletion of a non-existent orphan - ext3_truncate() could
+ * have removed the record.
*/
ext3_orphan_del(handle, inode);
- EXT3_I(inode)->i_dtime = get_seconds();
+ ei->i_dtime = get_seconds();
/*
* One subtle ordering requirement: if anything has gone wrong
@@ -251,18 +274,18 @@ void ext3_evict_inode (struct inode *inode)
if (ext3_mark_inode_dirty(handle, inode)) {
/* If that failed, just dquot_drop() and be done with that */
dquot_drop(inode);
- end_writeback(inode);
+ clear_inode(inode);
} else {
ext3_xattr_delete_inode(handle, inode);
dquot_free_inode(inode);
dquot_drop(inode);
- end_writeback(inode);
+ clear_inode(inode);
ext3_free_inode(handle, inode);
}
ext3_journal_stop(handle);
return;
no_delete:
- end_writeback(inode);
+ clear_inode(inode);
dquot_drop(inode);
}
@@ -655,6 +678,10 @@ static int ext3_alloc_branch(handle_t *handle, struct inode *inode,
* parent to disk.
*/
bh = sb_getblk(inode->i_sb, new_blocks[n-1]);
+ if (unlikely(!bh)) {
+ err = -ENOMEM;
+ goto failed;
+ }
branch[n].bh = bh;
lock_buffer(bh);
BUFFER_TRACE(bh, "call get_create_access");
@@ -696,7 +723,7 @@ failed:
BUFFER_TRACE(branch[i].bh, "call journal_forget");
ext3_journal_forget(handle, branch[i].bh);
}
- for (i = 0; i <indirect_blks; i++)
+ for (i = 0; i < indirect_blks; i++)
ext3_free_blocks(handle, inode, new_blocks[i], 1);
ext3_free_blocks(handle, inode, new_blocks[i], num);
@@ -725,6 +752,7 @@ static int ext3_splice_branch(handle_t *handle, struct inode *inode,
struct ext3_block_alloc_info *block_i;
ext3_fsblk_t current_block;
struct ext3_inode_info *ei = EXT3_I(inode);
+ struct timespec now;
block_i = ei->i_block_alloc_info;
/*
@@ -764,9 +792,11 @@ static int ext3_splice_branch(handle_t *handle, struct inode *inode,
}
/* We are done with atomic stuff, now do the rest of housekeeping */
-
- inode->i_ctime = CURRENT_TIME_SEC;
- ext3_mark_inode_dirty(handle, inode);
+ now = CURRENT_TIME_SEC;
+ if (!timespec_equal(&inode->i_ctime, &now) || !where->bh) {
+ inode->i_ctime = now;
+ ext3_mark_inode_dirty(handle, inode);
+ }
/* ext3_mark_inode_dirty already updated i_sync_tid */
atomic_set(&ei->i_datasync_tid, handle->h_transaction->t_tid);
@@ -842,6 +872,7 @@ int ext3_get_blocks_handle(handle_t *handle, struct inode *inode,
ext3_fsblk_t first_block = 0;
+ trace_ext3_get_blocks_enter(inode, iblock, maxblocks, create);
J_ASSERT(handle != NULL || create == 0);
depth = ext3_block_to_path(inode,iblock,offsets,&blocks_to_boundary);
@@ -886,6 +917,9 @@ int ext3_get_blocks_handle(handle_t *handle, struct inode *inode,
if (!create || err == -EIO)
goto cleanup;
+ /*
+ * Block out ext3_truncate while we alter the tree
+ */
mutex_lock(&ei->truncate_mutex);
/*
@@ -934,9 +968,6 @@ int ext3_get_blocks_handle(handle_t *handle, struct inode *inode,
*/
count = ext3_blks_to_allocate(partial, indirect_blks,
maxblocks, blocks_to_boundary);
- /*
- * Block out ext3_truncate while we alter the tree
- */
err = ext3_alloc_branch(handle, inode, indirect_blks, &count, goal,
offsets + (partial - chain), partial);
@@ -970,6 +1001,9 @@ cleanup:
}
BUFFER_TRACE(bh_result, "returned");
out:
+ trace_ext3_get_blocks_exit(inode, iblock,
+ depth ? le32_to_cpu(chain[depth-1].key) : 0,
+ count, err);
return err;
}
@@ -1043,16 +1077,15 @@ struct buffer_head *ext3_getblk(handle_t *handle, struct inode *inode,
* mapped. 0 in case of a HOLE.
*/
if (err > 0) {
- if (err > 1)
- WARN_ON(1);
+ WARN_ON(err > 1);
err = 0;
}
*errp = err;
if (!err && buffer_mapped(&dummy)) {
struct buffer_head *bh;
bh = sb_getblk(inode->i_sb, dummy.b_blocknr);
- if (!bh) {
- *errp = -EIO;
+ if (unlikely(!bh)) {
+ *errp = -ENOMEM;
goto err;
}
if (buffer_new(&dummy)) {
@@ -1100,9 +1133,11 @@ struct buffer_head *ext3_bread(handle_t *handle, struct inode *inode,
bh = ext3_getblk(handle, inode, block, create, err);
if (!bh)
return bh;
- if (buffer_uptodate(bh))
+ if (bh_uptodate_or_lock(bh))
return bh;
- ll_rw_block(READ_META, 1, &bh);
+ get_bh(bh);
+ bh->b_end_io = end_buffer_read_sync;
+ submit_bh(READ | REQ_META | REQ_PRIO, bh);
wait_on_buffer(bh);
if (buffer_uptodate(bh))
return bh;
@@ -1202,6 +1237,16 @@ static void ext3_truncate_failed_write(struct inode *inode)
ext3_truncate(inode);
}
+/*
+ * Truncate blocks that were not used by direct IO write. We have to zero out
+ * the last file block as well because direct IO might have written to it.
+ */
+static void ext3_truncate_failed_direct_write(struct inode *inode)
+{
+ ext3_block_truncate_page(inode, inode->i_size);
+ ext3_truncate(inode);
+}
+
static int ext3_write_begin(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned flags,
struct page **pagep, void **fsdata)
@@ -1217,6 +1262,8 @@ static int ext3_write_begin(struct file *file, struct address_space *mapping,
* we allocate blocks but write fails for some reason */
int needed_blocks = ext3_writepage_trans_blocks(inode) + 1;
+ trace_ext3_write_begin(inode, pos, len, flags);
+
index = pos >> PAGE_CACHE_SHIFT;
from = pos & (PAGE_CACHE_SIZE - 1);
to = from + len;
@@ -1332,6 +1379,7 @@ static int ext3_ordered_write_end(struct file *file,
unsigned from, to;
int ret = 0, ret2;
+ trace_ext3_ordered_write_end(inode, pos, len, copied);
copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
from = pos & (PAGE_CACHE_SIZE - 1);
@@ -1367,6 +1415,7 @@ static int ext3_writeback_write_end(struct file *file,
struct inode *inode = file->f_mapping->host;
int ret;
+ trace_ext3_writeback_write_end(inode, pos, len, copied);
copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
update_file_sizes(inode, pos, copied);
/*
@@ -1391,10 +1440,12 @@ static int ext3_journalled_write_end(struct file *file,
{
handle_t *handle = ext3_journal_current_handle();
struct inode *inode = mapping->host;
+ struct ext3_inode_info *ei = EXT3_I(inode);
int ret = 0, ret2;
int partial = 0;
unsigned from, to;
+ trace_ext3_journalled_write_end(inode, pos, len, copied);
from = pos & (PAGE_CACHE_SIZE - 1);
to = from + len;
@@ -1419,8 +1470,9 @@ static int ext3_journalled_write_end(struct file *file,
if (pos + len > inode->i_size && ext3_can_truncate(inode))
ext3_orphan_add(handle, inode);
ext3_set_inode_state(inode, EXT3_STATE_JDATA);
- if (inode->i_size > EXT3_I(inode)->i_disksize) {
- EXT3_I(inode)->i_disksize = inode->i_size;
+ atomic_set(&ei->i_datasync_tid, handle->h_transaction->t_tid);
+ if (inode->i_size > ei->i_disksize) {
+ ei->i_disksize = inode->i_size;
ret2 = ext3_mark_inode_dirty(handle, inode);
if (!ret)
ret = ret2;
@@ -1507,56 +1559,17 @@ static int buffer_unmapped(handle_t *handle, struct buffer_head *bh)
}
/*
- * Note that we always start a transaction even if we're not journalling
- * data. This is to preserve ordering: any hole instantiation within
- * __block_write_full_page -> ext3_get_block() should be journalled
- * along with the data so we don't crash and then get metadata which
+ * Note that whenever we need to map blocks we start a transaction even if
+ * we're not journalling data. This is to preserve ordering: any hole
+ * instantiation within __block_write_full_page -> ext3_get_block() should be
+ * journalled along with the data so we don't crash and then get metadata which
* refers to old data.
*
* In all journalling modes block_write_full_page() will start the I/O.
*
- * Problem:
- *
- * ext3_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() ->
- * ext3_writepage()
- *
- * Similar for:
- *
- * ext3_file_write() -> generic_file_write() -> __alloc_pages() -> ...
- *
- * Same applies to ext3_get_block(). We will deadlock on various things like
- * lock_journal and i_truncate_mutex.
- *
- * Setting PF_MEMALLOC here doesn't work - too many internal memory
- * allocations fail.
- *
- * 16May01: If we're reentered then journal_current_handle() will be
- * non-zero. We simply *return*.
- *
- * 1 July 2001: @@@ FIXME:
- * In journalled data mode, a data buffer may be metadata against the
- * current transaction. But the same file is part of a shared mapping
- * and someone does a writepage() on it.
- *
- * We will move the buffer onto the async_data list, but *after* it has
- * been dirtied. So there's a small window where we have dirty data on
- * BJ_Metadata.
- *
- * Note that this only applies to the last partial page in the file. The
- * bit which block_write_full_page() uses prepare/commit for. (That's
- * broken code anyway: it's wrong for msync()).
- *
- * It's a rare case: affects the final partial page, for journalled data
- * where the file is subject to bith write() and writepage() in the same
- * transction. To fix it we'll need a custom block_write_full_page().
- * We'll probably need that anyway for journalling writepage() output.
- *
* We don't honour synchronous mounts for writepage(). That would be
* disastrous. Any write() or metadata operation will sync the fs for
* us.
- *
- * AKPM2: if all the page's buffers are mapped to disk and !data=journal,
- * we don't need to open a transaction here.
*/
static int ext3_ordered_writepage(struct page *page,
struct writeback_control *wbc)
@@ -1568,7 +1581,13 @@ static int ext3_ordered_writepage(struct page *page,
int err;
J_ASSERT(PageLocked(page));
- WARN_ON_ONCE(IS_RDONLY(inode));
+ /*
+ * We don't want to warn for emergency remount. The condition is
+ * ordered to avoid dereferencing inode->i_sb in non-error case to
+ * avoid slow-downs.
+ */
+ WARN_ON_ONCE(IS_RDONLY(inode) &&
+ !(EXT3_SB(inode->i_sb)->s_mount_state & EXT3_ERROR_FS));
/*
* We give up here if we're reentered, because it might be for a
@@ -1577,6 +1596,7 @@ static int ext3_ordered_writepage(struct page *page,
if (ext3_journal_current_handle())
goto out_fail;
+ trace_ext3_ordered_writepage(page);
if (!page_has_buffers(page)) {
create_empty_buffers(page, inode->i_sb->s_blocksize,
(1 << BH_Dirty)|(1 << BH_Uptodate));
@@ -1614,12 +1634,9 @@ static int ext3_ordered_writepage(struct page *page,
* block_write_full_page() succeeded. Otherwise they are unmapped,
* and generally junk.
*/
- if (ret == 0) {
- err = walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE,
+ if (ret == 0)
+ ret = walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE,
NULL, journal_dirty_data_fn);
- if (!ret)
- ret = err;
- }
walk_page_buffers(handle, page_bufs, 0,
PAGE_CACHE_SIZE, NULL, bput_one);
err = ext3_journal_stop(handle);
@@ -1642,11 +1659,18 @@ static int ext3_writeback_writepage(struct page *page,
int err;
J_ASSERT(PageLocked(page));
- WARN_ON_ONCE(IS_RDONLY(inode));
+ /*
+ * We don't want to warn for emergency remount. The condition is
+ * ordered to avoid dereferencing inode->i_sb in non-error case to
+ * avoid slow-downs.
+ */
+ WARN_ON_ONCE(IS_RDONLY(inode) &&
+ !(EXT3_SB(inode->i_sb)->s_mount_state & EXT3_ERROR_FS));
if (ext3_journal_current_handle())
goto out_fail;
+ trace_ext3_writeback_writepage(page);
if (page_has_buffers(page)) {
if (!walk_page_buffers(NULL, page_buffers(page), 0,
PAGE_CACHE_SIZE, NULL, buffer_unmapped)) {
@@ -1684,18 +1708,25 @@ static int ext3_journalled_writepage(struct page *page,
int err;
J_ASSERT(PageLocked(page));
- WARN_ON_ONCE(IS_RDONLY(inode));
-
- if (ext3_journal_current_handle())
- goto no_write;
-
- handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode));
- if (IS_ERR(handle)) {
- ret = PTR_ERR(handle);
- goto no_write;
- }
+ /*
+ * We don't want to warn for emergency remount. The condition is
+ * ordered to avoid dereferencing inode->i_sb in non-error case to
+ * avoid slow-downs.
+ */
+ WARN_ON_ONCE(IS_RDONLY(inode) &&
+ !(EXT3_SB(inode->i_sb)->s_mount_state & EXT3_ERROR_FS));
+ trace_ext3_journalled_writepage(page);
if (!page_has_buffers(page) || PageChecked(page)) {
+ if (ext3_journal_current_handle())
+ goto no_write;
+
+ handle = ext3_journal_start(inode,
+ ext3_writepage_trans_blocks(inode));
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ goto no_write;
+ }
/*
* It's mmapped pagecache. Add buffers and journal it. There
* doesn't seem much point in redirtying the page here.
@@ -1715,18 +1746,21 @@ static int ext3_journalled_writepage(struct page *page,
if (ret == 0)
ret = err;
ext3_set_inode_state(inode, EXT3_STATE_JDATA);
+ atomic_set(&EXT3_I(inode)->i_datasync_tid,
+ handle->h_transaction->t_tid);
unlock_page(page);
+ err = ext3_journal_stop(handle);
+ if (!ret)
+ ret = err;
} else {
/*
- * It may be a page full of checkpoint-mode buffers. We don't
- * really know unless we go poke around in the buffer_heads.
- * But block_write_full_page will do the right thing.
+ * It is a page full of checkpoint-mode buffers. Go and write
+ * them. They should have been already mapped when they went
+ * to the journal so provide NULL get_block function to catch
+ * errors.
*/
- ret = block_write_full_page(page, ext3_get_block, wbc);
+ ret = block_write_full_page(page, NULL, wbc);
}
- err = ext3_journal_stop(handle);
- if (!ret)
- ret = err;
out:
return ret;
@@ -1739,6 +1773,7 @@ out_unlock:
static int ext3_readpage(struct file *file, struct page *page)
{
+ trace_ext3_readpage(page);
return mpage_readpage(page, ext3_get_block);
}
@@ -1749,23 +1784,27 @@ ext3_readpages(struct file *file, struct address_space *mapping,
return mpage_readpages(mapping, pages, nr_pages, ext3_get_block);
}
-static void ext3_invalidatepage(struct page *page, unsigned long offset)
+static void ext3_invalidatepage(struct page *page, unsigned int offset,
+ unsigned int length)
{
journal_t *journal = EXT3_JOURNAL(page->mapping->host);
+ trace_ext3_invalidatepage(page, offset, length);
+
/*
* If it's a full truncate we just forget about the pending dirtying
*/
- if (offset == 0)
+ if (offset == 0 && length == PAGE_CACHE_SIZE)
ClearPageChecked(page);
- journal_invalidatepage(journal, page, offset);
+ journal_invalidatepage(journal, page, offset, length);
}
static int ext3_releasepage(struct page *page, gfp_t wait)
{
journal_t *journal = EXT3_JOURNAL(page->mapping->host);
+ trace_ext3_releasepage(page);
WARN_ON(PageChecked(page));
if (!page_has_buffers(page))
return 0;
@@ -1782,8 +1821,7 @@ static int ext3_releasepage(struct page *page, gfp_t wait)
* VFS code falls back into buffered path in that case so we are safe.
*/
static ssize_t ext3_direct_IO(int rw, struct kiocb *iocb,
- const struct iovec *iov, loff_t offset,
- unsigned long nr_segs)
+ struct iov_iter *iter, loff_t offset)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file->f_mapping->host;
@@ -1791,9 +1829,11 @@ static ssize_t ext3_direct_IO(int rw, struct kiocb *iocb,
handle_t *handle;
ssize_t ret;
int orphan = 0;
- size_t count = iov_length(iov, nr_segs);
+ size_t count = iov_iter_count(iter);
int retries = 0;
+ trace_ext3_direct_IO_enter(inode, offset, count, rw);
+
if (rw == WRITE) {
loff_t final_size = offset + count;
@@ -1816,19 +1856,17 @@ static ssize_t ext3_direct_IO(int rw, struct kiocb *iocb,
}
retry:
- ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
- offset, nr_segs,
- ext3_get_block, NULL);
+ ret = blockdev_direct_IO(rw, iocb, inode, iter, offset, ext3_get_block);
/*
* In case of error extending write may have instantiated a few
* blocks outside i_size. Trim these off again.
*/
if (unlikely((rw & WRITE) && ret < 0)) {
loff_t isize = i_size_read(inode);
- loff_t end = offset + iov_length(iov, nr_segs);
+ loff_t end = offset + count;
if (end > isize)
- vmtruncate(inode, isize);
+ ext3_truncate_failed_direct_write(inode);
}
if (ret == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries))
goto retry;
@@ -1842,8 +1880,10 @@ retry:
/* This is really bad luck. We've written the data
* but cannot extend i_size. Truncate allocated blocks
* and pretend the write failed... */
- ext3_truncate(inode);
+ ext3_truncate_failed_direct_write(inode);
ret = PTR_ERR(handle);
+ if (inode->i_nlink)
+ ext3_orphan_del(NULL, inode);
goto out;
}
if (inode->i_nlink)
@@ -1868,6 +1908,7 @@ retry:
ret = err;
}
out:
+ trace_ext3_direct_IO_exit(inode, offset, count, rw, ret);
return ret;
}
@@ -1894,7 +1935,6 @@ static const struct address_space_operations ext3_ordered_aops = {
.readpage = ext3_readpage,
.readpages = ext3_readpages,
.writepage = ext3_ordered_writepage,
- .sync_page = block_sync_page,
.write_begin = ext3_write_begin,
.write_end = ext3_ordered_write_end,
.bmap = ext3_bmap,
@@ -1903,6 +1943,7 @@ static const struct address_space_operations ext3_ordered_aops = {
.direct_IO = ext3_direct_IO,
.migratepage = buffer_migrate_page,
.is_partially_uptodate = block_is_partially_uptodate,
+ .is_dirty_writeback = buffer_check_dirty_writeback,
.error_remove_page = generic_error_remove_page,
};
@@ -1910,7 +1951,6 @@ static const struct address_space_operations ext3_writeback_aops = {
.readpage = ext3_readpage,
.readpages = ext3_readpages,
.writepage = ext3_writeback_writepage,
- .sync_page = block_sync_page,
.write_begin = ext3_write_begin,
.write_end = ext3_writeback_write_end,
.bmap = ext3_bmap,
@@ -1926,7 +1966,6 @@ static const struct address_space_operations ext3_journalled_aops = {
.readpage = ext3_readpage,
.readpages = ext3_readpages,
.writepage = ext3_journalled_writepage,
- .sync_page = block_sync_page,
.write_begin = ext3_write_begin,
.write_end = ext3_journalled_write_end,
.set_page_dirty = ext3_journalled_set_page_dirty,
@@ -1953,17 +1992,24 @@ void ext3_set_aops(struct inode *inode)
* This required during truncate. We need to physically zero the tail end
* of that block so it doesn't yield old data if the file is later grown.
*/
-static int ext3_block_truncate_page(handle_t *handle, struct page *page,
- struct address_space *mapping, loff_t from)
+static int ext3_block_truncate_page(struct inode *inode, loff_t from)
{
ext3_fsblk_t index = from >> PAGE_CACHE_SHIFT;
- unsigned offset = from & (PAGE_CACHE_SIZE-1);
+ unsigned offset = from & (PAGE_CACHE_SIZE - 1);
unsigned blocksize, iblock, length, pos;
- struct inode *inode = mapping->host;
+ struct page *page;
+ handle_t *handle = NULL;
struct buffer_head *bh;
int err = 0;
+ /* Truncated on block boundary - nothing to do */
blocksize = inode->i_sb->s_blocksize;
+ if ((from & (blocksize - 1)) == 0)
+ return 0;
+
+ page = grab_cache_page(inode->i_mapping, index);
+ if (!page)
+ return -ENOMEM;
length = blocksize - (offset & (blocksize - 1));
iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
@@ -1999,20 +2045,30 @@ static int ext3_block_truncate_page(handle_t *handle, struct page *page,
if (PageUptodate(page))
set_buffer_uptodate(bh);
- if (!buffer_uptodate(bh)) {
- err = -EIO;
- ll_rw_block(READ, 1, &bh);
- wait_on_buffer(bh);
+ if (!bh_uptodate_or_lock(bh)) {
+ err = bh_submit_read(bh);
/* Uhhuh. Read error. Complain and punt. */
- if (!buffer_uptodate(bh))
+ if (err)
goto unlock;
}
+ /* data=writeback mode doesn't need transaction to zero-out data */
+ if (!ext3_should_writeback_data(inode)) {
+ /* We journal at most one block */
+ handle = ext3_journal_start(inode, 1);
+ if (IS_ERR(handle)) {
+ clear_highpage(page);
+ flush_dcache_page(page);
+ err = PTR_ERR(handle);
+ goto unlock;
+ }
+ }
+
if (ext3_should_journal_data(inode)) {
BUFFER_TRACE(bh, "get write access");
err = ext3_journal_get_write_access(handle, bh);
if (err)
- goto unlock;
+ goto stop;
}
zero_user(page, offset, length);
@@ -2026,6 +2082,9 @@ static int ext3_block_truncate_page(handle_t *handle, struct page *page,
err = ext3_journal_dirty_data(handle, bh);
mark_buffer_dirty(bh);
}
+stop:
+ if (handle)
+ ext3_journal_stop(handle);
unlock:
unlock_page(page);
@@ -2058,7 +2117,7 @@ static inline int all_zeroes(__le32 *p, __le32 *q)
*
* When we do truncate() we may have to clean the ends of several
* indirect blocks but leave the blocks themselves alive. Block is
- * partially truncated if some data below the new i_size is refered
+ * partially truncated if some data below the new i_size is referred
* from it (and it is on the path to the first completely truncated
* data block, indeed). We have to free the top of that path along
* with everything to the right of the path. Since no allocation
@@ -2145,13 +2204,15 @@ static void ext3_clear_blocks(handle_t *handle, struct inode *inode,
if (try_to_extend_transaction(handle, inode)) {
if (bh) {
BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
- ext3_journal_dirty_metadata(handle, bh);
+ if (ext3_journal_dirty_metadata(handle, bh))
+ return;
}
ext3_mark_inode_dirty(handle, inode);
truncate_restart_transaction(handle, inode);
if (bh) {
BUFFER_TRACE(bh, "retaking write access");
- ext3_journal_get_write_access(handle, bh);
+ if (ext3_journal_get_write_access(handle, bh))
+ return;
}
}
@@ -2185,7 +2246,7 @@ static void ext3_clear_blocks(handle_t *handle, struct inode *inode,
* @first: array of block numbers
* @last: points immediately past the end of array
*
- * We are freeing all blocks refered from that array (numbers are stored as
+ * We are freeing all blocks referred from that array (numbers are stored as
* little-endian 32-bit) and updating @inode->i_blocks appropriately.
*
* We accumulate contiguous runs of blocks to free. Conveniently, if these
@@ -2273,7 +2334,7 @@ static void ext3_free_data(handle_t *handle, struct inode *inode,
* @last: pointer immediately past the end of array
* @depth: depth of the branches to free
*
- * We are freeing all blocks refered from these branches (numbers are
+ * We are freeing all blocks referred from these branches (numbers are
* stored as little-endian 32-bit) and updating @inode->i_blocks
* appropriately.
*/
@@ -2392,8 +2453,6 @@ static void ext3_free_branches(handle_t *handle, struct inode *inode,
int ext3_can_truncate(struct inode *inode)
{
- if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
- return 0;
if (S_ISREG(inode->i_mode))
return 1;
if (S_ISDIR(inode->i_mode))
@@ -2410,7 +2469,7 @@ int ext3_can_truncate(struct inode *inode)
* transaction, and VFS/VM ensures that ext3_truncate() cannot run
* simultaneously on behalf of the same inode.
*
- * As we work through the truncate and commmit bits of it to the journal there
+ * As we work through the truncate and commit bits of it to the journal there
* is one core, guiding principle: the file's tree must always be consistent on
* disk. We must be able to restart the truncate after a crash.
*
@@ -2437,7 +2496,6 @@ void ext3_truncate(struct inode *inode)
struct ext3_inode_info *ei = EXT3_I(inode);
__le32 *i_data = ei->i_data;
int addr_per_block = EXT3_ADDR_PER_BLOCK(inode->i_sb);
- struct address_space *mapping = inode->i_mapping;
int offsets[4];
Indirect chain[4];
Indirect *partial;
@@ -2445,7 +2503,8 @@ void ext3_truncate(struct inode *inode)
int n;
long last_block;
unsigned blocksize = inode->i_sb->s_blocksize;
- struct page *page;
+
+ trace_ext3_truncate_enter(inode);
if (!ext3_can_truncate(inode))
goto out_notrans;
@@ -2453,37 +2512,12 @@ void ext3_truncate(struct inode *inode)
if (inode->i_size == 0 && ext3_should_writeback_data(inode))
ext3_set_inode_state(inode, EXT3_STATE_FLUSH_ON_CLOSE);
- /*
- * We have to lock the EOF page here, because lock_page() nests
- * outside journal_start().
- */
- if ((inode->i_size & (blocksize - 1)) == 0) {
- /* Block boundary? Nothing to do */
- page = NULL;
- } else {
- page = grab_cache_page(mapping,
- inode->i_size >> PAGE_CACHE_SHIFT);
- if (!page)
- goto out_notrans;
- }
-
handle = start_transaction(inode);
- if (IS_ERR(handle)) {
- if (page) {
- clear_highpage(page);
- flush_dcache_page(page);
- unlock_page(page);
- page_cache_release(page);
- }
+ if (IS_ERR(handle))
goto out_notrans;
- }
last_block = (inode->i_size + blocksize-1)
>> EXT3_BLOCK_SIZE_BITS(inode->i_sb);
-
- if (page)
- ext3_block_truncate_page(handle, page, mapping, inode->i_size);
-
n = ext3_block_to_path(inode, last_block, offsets, NULL);
if (n == 0)
goto out_stop; /* error */
@@ -2598,6 +2632,7 @@ out_stop:
ext3_orphan_del(handle, inode);
ext3_journal_stop(handle);
+ trace_ext3_truncate_exit(inode);
return;
out_notrans:
/*
@@ -2606,6 +2641,7 @@ out_notrans:
*/
if (inode->i_nlink)
ext3_orphan_del(NULL, inode);
+ trace_ext3_truncate_exit(inode);
}
static ext3_fsblk_t ext3_get_inode_block(struct super_block *sb,
@@ -2659,12 +2695,12 @@ static int __ext3_get_inode_loc(struct inode *inode,
return -EIO;
bh = sb_getblk(inode->i_sb, block);
- if (!bh) {
+ if (unlikely(!bh)) {
ext3_error (inode->i_sb, "ext3_get_inode_loc",
"unable to read inode block - "
"inode=%lu, block="E3FSBLK,
inode->i_ino, block);
- return -EIO;
+ return -ENOMEM;
}
if (!buffer_uptodate(bh)) {
lock_buffer(bh);
@@ -2713,7 +2749,7 @@ static int __ext3_get_inode_loc(struct inode *inode,
bitmap_bh = sb_getblk(inode->i_sb,
le32_to_cpu(desc->bg_inode_bitmap));
- if (!bitmap_bh)
+ if (unlikely(!bitmap_bh))
goto make_io;
/*
@@ -2747,9 +2783,10 @@ make_io:
* has in-inode xattrs, or we don't have this inode in memory.
* Read the block from disk.
*/
+ trace_ext3_load_inode(inode);
get_bh(bh);
bh->b_end_io = end_buffer_read_sync;
- submit_bh(READ_META, bh);
+ submit_bh(READ | REQ_META | REQ_PRIO, bh);
wait_on_buffer(bh);
if (!buffer_uptodate(bh)) {
ext3_error(inode->i_sb, "ext3_get_inode_loc",
@@ -2819,6 +2856,8 @@ struct inode *ext3_iget(struct super_block *sb, unsigned long ino)
transaction_t *transaction;
long ret;
int block;
+ uid_t i_uid;
+ gid_t i_gid;
inode = iget_locked(sb, ino);
if (!inode)
@@ -2835,13 +2874,15 @@ struct inode *ext3_iget(struct super_block *sb, unsigned long ino)
bh = iloc.bh;
raw_inode = ext3_raw_inode(&iloc);
inode->i_mode = le16_to_cpu(raw_inode->i_mode);
- inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low);
- inode->i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low);
+ i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low);
+ i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low);
if(!(test_opt (inode->i_sb, NO_UID32))) {
- inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16;
- inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16;
+ i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16;
+ i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16;
}
- inode->i_nlink = le16_to_cpu(raw_inode->i_links_count);
+ i_uid_write(inode, i_uid);
+ i_gid_write(inode, i_gid);
+ set_nlink(inode, le16_to_cpu(raw_inode->i_links_count));
inode->i_size = le32_to_cpu(raw_inode->i_size);
inode->i_atime.tv_sec = (signed)le32_to_cpu(raw_inode->i_atime);
inode->i_ctime.tv_sec = (signed)le32_to_cpu(raw_inode->i_ctime);
@@ -2996,6 +3037,10 @@ static int ext3_do_update_inode(handle_t *handle,
struct ext3_inode_info *ei = EXT3_I(inode);
struct buffer_head *bh = iloc->bh;
int err = 0, rc, block;
+ int need_datasync = 0;
+ __le32 disksize;
+ uid_t i_uid;
+ gid_t i_gid;
again:
/* we can't allow multiple procs in here at once, its a bit racey */
@@ -3008,32 +3053,38 @@ again:
ext3_get_inode_flags(ei);
raw_inode->i_mode = cpu_to_le16(inode->i_mode);
+ i_uid = i_uid_read(inode);
+ i_gid = i_gid_read(inode);
if(!(test_opt(inode->i_sb, NO_UID32))) {
- raw_inode->i_uid_low = cpu_to_le16(low_16_bits(inode->i_uid));
- raw_inode->i_gid_low = cpu_to_le16(low_16_bits(inode->i_gid));
+ raw_inode->i_uid_low = cpu_to_le16(low_16_bits(i_uid));
+ raw_inode->i_gid_low = cpu_to_le16(low_16_bits(i_gid));
/*
* Fix up interoperability with old kernels. Otherwise, old inodes get
* re-used with the upper 16 bits of the uid/gid intact
*/
if(!ei->i_dtime) {
raw_inode->i_uid_high =
- cpu_to_le16(high_16_bits(inode->i_uid));
+ cpu_to_le16(high_16_bits(i_uid));
raw_inode->i_gid_high =
- cpu_to_le16(high_16_bits(inode->i_gid));
+ cpu_to_le16(high_16_bits(i_gid));
} else {
raw_inode->i_uid_high = 0;
raw_inode->i_gid_high = 0;
}
} else {
raw_inode->i_uid_low =
- cpu_to_le16(fs_high2lowuid(inode->i_uid));
+ cpu_to_le16(fs_high2lowuid(i_uid));
raw_inode->i_gid_low =
- cpu_to_le16(fs_high2lowgid(inode->i_gid));
+ cpu_to_le16(fs_high2lowgid(i_gid));
raw_inode->i_uid_high = 0;
raw_inode->i_gid_high = 0;
}
raw_inode->i_links_count = cpu_to_le16(inode->i_nlink);
- raw_inode->i_size = cpu_to_le32(ei->i_disksize);
+ disksize = cpu_to_le32(ei->i_disksize);
+ if (disksize != raw_inode->i_size) {
+ need_datasync = 1;
+ raw_inode->i_size = disksize;
+ }
raw_inode->i_atime = cpu_to_le32(inode->i_atime.tv_sec);
raw_inode->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec);
raw_inode->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec);
@@ -3049,8 +3100,11 @@ again:
if (!S_ISREG(inode->i_mode)) {
raw_inode->i_dir_acl = cpu_to_le32(ei->i_dir_acl);
} else {
- raw_inode->i_size_high =
- cpu_to_le32(ei->i_disksize >> 32);
+ disksize = cpu_to_le32(ei->i_disksize >> 32);
+ if (disksize != raw_inode->i_size_high) {
+ raw_inode->i_size_high = disksize;
+ need_datasync = 1;
+ }
if (ei->i_disksize > 0x7fffffffULL) {
struct super_block *sb = inode->i_sb;
if (!EXT3_HAS_RO_COMPAT_FEATURE(sb,
@@ -3103,6 +3157,8 @@ again:
ext3_clear_inode_state(inode, EXT3_STATE_NEW);
atomic_set(&ei->i_sync_tid, handle->h_transaction->t_tid);
+ if (need_datasync)
+ atomic_set(&ei->i_datasync_tid, handle->h_transaction->t_tid);
out_brelse:
brelse (bh);
ext3_std_error(inode->i_sb, err);
@@ -3114,21 +3170,20 @@ out_brelse:
*
* We are called from a few places:
*
- * - Within generic_file_write() for O_SYNC files.
+ * - Within generic_file_aio_write() -> generic_write_sync() for O_SYNC files.
* Here, there will be no transaction running. We wait for any running
- * trasnaction to commit.
+ * transaction to commit.
*
- * - Within sys_sync(), kupdate and such.
- * We wait on commit, if tol to.
+ * - Within flush work (for sys_sync(), kupdate and such).
+ * We wait on commit, if told to.
*
- * - Within prune_icache() (PF_MEMALLOC == true)
- * Here we simply return. We can't afford to block kswapd on the
- * journal commit.
+ * - Within iput_final() -> write_inode_now()
+ * We wait on commit, if told to.
*
* In all cases it is actually safe for us to return without doing anything,
* because the inode has been copied into a raw inode buffer in
- * ext3_mark_inode_dirty(). This is a correctness thing for O_SYNC and for
- * knfsd.
+ * ext3_mark_inode_dirty(). This is a correctness thing for WB_SYNC_ALL
+ * writeback.
*
* Note that we are absolutely dependent upon all inode dirtiers doing the
* right thing: they *must* call mark_inode_dirty() after dirtying info in
@@ -3140,13 +3195,13 @@ out_brelse:
* stuff();
* inode->i_size = expr;
*
- * is in error because a kswapd-driven write_inode() could occur while
- * `stuff()' is running, and the new i_size will be lost. Plus the inode
- * will no longer be on the superblock's dirty inode list.
+ * is in error because write_inode() could occur while `stuff()' is running,
+ * and the new i_size will be lost. Plus the inode will no longer be on the
+ * superblock's dirty inode list.
*/
int ext3_write_inode(struct inode *inode, struct writeback_control *wbc)
{
- if (current->flags & PF_MEMALLOC)
+ if (WARN_ON_ONCE(current->flags & PF_MEMALLOC))
return 0;
if (ext3_journal_current_handle()) {
@@ -3155,7 +3210,12 @@ int ext3_write_inode(struct inode *inode, struct writeback_control *wbc)
return -EIO;
}
- if (wbc->sync_mode != WB_SYNC_ALL)
+ /*
+ * No need to force transaction in WB_SYNC_NONE mode. Also
+ * ext3_sync_fs() will force the commit after everything is
+ * written.
+ */
+ if (wbc->sync_mode != WB_SYNC_ALL || wbc->for_sync)
return 0;
return ext3_force_commit(inode->i_sb);
@@ -3190,8 +3250,8 @@ int ext3_setattr(struct dentry *dentry, struct iattr *attr)
if (is_quota_modification(inode, attr))
dquot_initialize(inode);
- if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
- (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) {
+ if ((ia_valid & ATTR_UID && !uid_eq(attr->ia_uid, inode->i_uid)) ||
+ (ia_valid & ATTR_GID && !gid_eq(attr->ia_gid, inode->i_gid))) {
handle_t *handle;
/* (user+group)*(old+new) structure, inode write (sb,
@@ -3217,6 +3277,9 @@ int ext3_setattr(struct dentry *dentry, struct iattr *attr)
ext3_journal_stop(handle);
}
+ if (attr->ia_valid & ATTR_SIZE)
+ inode_dio_wait(inode);
+
if (S_ISREG(inode->i_mode) &&
attr->ia_valid & ATTR_SIZE && attr->ia_size < inode->i_size) {
handle_t *handle;
@@ -3228,25 +3291,43 @@ int ext3_setattr(struct dentry *dentry, struct iattr *attr)
}
error = ext3_orphan_add(handle, inode);
+ if (error) {
+ ext3_journal_stop(handle);
+ goto err_out;
+ }
EXT3_I(inode)->i_disksize = attr->ia_size;
- rc = ext3_mark_inode_dirty(handle, inode);
- if (!error)
- error = rc;
+ error = ext3_mark_inode_dirty(handle, inode);
ext3_journal_stop(handle);
+ if (error) {
+ /* Some hard fs error must have happened. Bail out. */
+ ext3_orphan_del(NULL, inode);
+ goto err_out;
+ }
+ rc = ext3_block_truncate_page(inode, attr->ia_size);
+ if (rc) {
+ /* Cleanup orphan list and exit */
+ handle = ext3_journal_start(inode, 3);
+ if (IS_ERR(handle)) {
+ ext3_orphan_del(NULL, inode);
+ goto err_out;
+ }
+ ext3_orphan_del(handle, inode);
+ ext3_journal_stop(handle);
+ goto err_out;
+ }
}
if ((attr->ia_valid & ATTR_SIZE) &&
attr->ia_size != i_size_read(inode)) {
- rc = vmtruncate(inode, attr->ia_size);
- if (rc)
- goto err_out;
+ truncate_setsize(inode, attr->ia_size);
+ ext3_truncate(inode);
}
setattr_copy(inode, attr);
mark_inode_dirty(inode);
if (ia_valid & ATTR_MODE)
- rc = ext3_acl_chmod(inode);
+ rc = posix_acl_chmod(inode, inode->i_mode);
err_out:
ext3_std_error(inode->i_sb, error);
@@ -3292,7 +3373,7 @@ static int ext3_writepage_trans_blocks(struct inode *inode)
if (ext3_should_journal_data(inode))
ret = 3 * (bpp + indirects) + 2;
else
- ret = 2 * (bpp + indirects) + 2;
+ ret = 2 * (bpp + indirects) + indirects + 2;
#ifdef CONFIG_QUOTA
/* We know that structure was already allocated during dquot_initialize so
@@ -3358,14 +3439,6 @@ ext3_reserve_inode_write(handle_t *handle, struct inode *inode,
* inode out, but prune_icache isn't a user-visible syncing function.
* Whenever the user wants stuff synced (sys_sync, sys_msync, sys_fsync)
* we start and wait on commits.
- *
- * Is this efficient/effective? Well, we're being nice to the system
- * by cleaning up our inodes proactively so they can be reaped
- * without I/O. But we are potentially leaving up to five seconds'
- * worth of inodes floating about which prune_icache wants us to
- * write out. One way to fix that would be to get prune_icache()
- * to do a write_super() to free up some memory. It has the desired
- * effect.
*/
int ext3_mark_inode_dirty(handle_t *handle, struct inode *inode)
{
@@ -3373,6 +3446,7 @@ int ext3_mark_inode_dirty(handle_t *handle, struct inode *inode)
int err;
might_sleep();
+ trace_ext3_mark_inode_dirty(inode, _RET_IP_);
err = ext3_reserve_inode_write(handle, inode, &iloc);
if (!err)
err = ext3_mark_iloc_dirty(handle, inode, &iloc);
@@ -3393,7 +3467,7 @@ int ext3_mark_inode_dirty(handle_t *handle, struct inode *inode)
* so would cause a commit on atime updates, which we don't bother doing.
* We handle synchronous inodes at the highest possible level.
*/
-void ext3_dirty_inode(struct inode *inode)
+void ext3_dirty_inode(struct inode *inode, int flags)
{
handle_t *current_handle = ext3_journal_current_handle();
handle_t *handle;
diff --git a/fs/ext3/ioctl.c b/fs/ext3/ioctl.c
index 88974814783..4d96e9a6453 100644
--- a/fs/ext3/ioctl.c
+++ b/fs/ext3/ioctl.c
@@ -7,19 +7,14 @@
* Universite Pierre et Marie Curie (Paris VI)
*/
-#include <linux/fs.h>
-#include <linux/jbd.h>
-#include <linux/capability.h>
-#include <linux/ext3_fs.h>
-#include <linux/ext3_jbd.h>
#include <linux/mount.h>
-#include <linux/time.h>
#include <linux/compat.h>
#include <asm/uaccess.h>
+#include "ext3.h"
long ext3_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
- struct inode *inode = filp->f_dentry->d_inode;
+ struct inode *inode = file_inode(filp);
struct ext3_inode_info *ei = EXT3_I(inode);
unsigned int flags;
unsigned short rsv_window_size;
@@ -38,13 +33,13 @@ long ext3_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
unsigned int oldflags;
unsigned int jflag;
- if (!is_owner_or_cap(inode))
+ if (!inode_owner_or_capable(inode))
return -EACCES;
if (get_user(flags, (int __user *) arg))
return -EFAULT;
- err = mnt_want_write(filp->f_path.mnt);
+ err = mnt_want_write_file(filp);
if (err)
return err;
@@ -110,7 +105,7 @@ flags_err:
err = ext3_change_inode_journal_flag(inode, jflag);
flags_out:
mutex_unlock(&inode->i_mutex);
- mnt_drop_write(filp->f_path.mnt);
+ mnt_drop_write_file(filp);
return err;
}
case EXT3_IOC_GETVERSION:
@@ -123,10 +118,10 @@ flags_out:
__u32 generation;
int err;
- if (!is_owner_or_cap(inode))
+ if (!inode_owner_or_capable(inode))
return -EPERM;
- err = mnt_want_write(filp->f_path.mnt);
+ err = mnt_want_write_file(filp);
if (err)
return err;
if (get_user(generation, (int __user *) arg)) {
@@ -134,10 +129,11 @@ flags_out:
goto setversion_out;
}
+ mutex_lock(&inode->i_mutex);
handle = ext3_journal_start(inode, 1);
if (IS_ERR(handle)) {
err = PTR_ERR(handle);
- goto setversion_out;
+ goto unlock_out;
}
err = ext3_reserve_inode_write(handle, inode, &iloc);
if (err == 0) {
@@ -146,34 +142,13 @@ flags_out:
err = ext3_mark_iloc_dirty(handle, inode, &iloc);
}
ext3_journal_stop(handle);
+
+unlock_out:
+ mutex_unlock(&inode->i_mutex);
setversion_out:
- mnt_drop_write(filp->f_path.mnt);
+ mnt_drop_write_file(filp);
return err;
}
-#ifdef CONFIG_JBD_DEBUG
- case EXT3_IOC_WAIT_FOR_READONLY:
- /*
- * This is racy - by the time we're woken up and running,
- * the superblock could be released. And the module could
- * have been unloaded. So sue me.
- *
- * Returns 1 if it slept, else zero.
- */
- {
- struct super_block *sb = inode->i_sb;
- DECLARE_WAITQUEUE(wait, current);
- int ret = 0;
-
- set_current_state(TASK_INTERRUPTIBLE);
- add_wait_queue(&EXT3_SB(sb)->ro_wait_queue, &wait);
- if (timer_pending(&EXT3_SB(sb)->turn_ro_timer)) {
- schedule();
- ret = 1;
- }
- remove_wait_queue(&EXT3_SB(sb)->ro_wait_queue, &wait);
- return ret;
- }
-#endif
case EXT3_IOC_GETRSVSZ:
if (test_opt(inode->i_sb, RESERVATION)
&& S_ISREG(inode->i_mode)
@@ -188,11 +163,11 @@ setversion_out:
if (!test_opt(inode->i_sb, RESERVATION) ||!S_ISREG(inode->i_mode))
return -ENOTTY;
- err = mnt_want_write(filp->f_path.mnt);
+ err = mnt_want_write_file(filp);
if (err)
return err;
- if (!is_owner_or_cap(inode)) {
+ if (!inode_owner_or_capable(inode)) {
err = -EACCES;
goto setrsvsz_out;
}
@@ -219,7 +194,7 @@ setversion_out:
}
mutex_unlock(&ei->truncate_mutex);
setrsvsz_out:
- mnt_drop_write(filp->f_path.mnt);
+ mnt_drop_write_file(filp);
return err;
}
case EXT3_IOC_GROUP_EXTEND: {
@@ -230,7 +205,7 @@ setrsvsz_out:
if (!capable(CAP_SYS_RESOURCE))
return -EPERM;
- err = mnt_want_write(filp->f_path.mnt);
+ err = mnt_want_write_file(filp);
if (err)
return err;
@@ -245,7 +220,7 @@ setrsvsz_out:
if (err == 0)
err = err2;
group_extend_out:
- mnt_drop_write(filp->f_path.mnt);
+ mnt_drop_write_file(filp);
return err;
}
case EXT3_IOC_GROUP_ADD: {
@@ -256,7 +231,7 @@ group_extend_out:
if (!capable(CAP_SYS_RESOURCE))
return -EPERM;
- err = mnt_want_write(filp->f_path.mnt);
+ err = mnt_want_write_file(filp);
if (err)
return err;
@@ -273,10 +248,32 @@ group_extend_out:
if (err == 0)
err = err2;
group_add_out:
- mnt_drop_write(filp->f_path.mnt);
+ mnt_drop_write_file(filp);
return err;
}
+ case FITRIM: {
+ struct super_block *sb = inode->i_sb;
+ struct fstrim_range range;
+ int ret = 0;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (copy_from_user(&range, (struct fstrim_range __user *)arg,
+ sizeof(range)))
+ return -EFAULT;
+
+ ret = ext3_trim_fs(sb, &range);
+ if (ret < 0)
+ return ret;
+
+ if (copy_to_user((struct fstrim_range __user *)arg, &range,
+ sizeof(range)))
+ return -EFAULT;
+
+ return 0;
+ }
default:
return -ENOTTY;
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index bce9dce639b..f197736dccf 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -24,19 +24,8 @@
* Theodore Ts'o, 2002
*/
-#include <linux/fs.h>
-#include <linux/pagemap.h>
-#include <linux/jbd.h>
-#include <linux/time.h>
-#include <linux/ext3_fs.h>
-#include <linux/ext3_jbd.h>
-#include <linux/fcntl.h>
-#include <linux/stat.h>
-#include <linux/string.h>
#include <linux/quotaops.h>
-#include <linux/buffer_head.h>
-#include <linux/bio.h>
-
+#include "ext3.h"
#include "namei.h"
#include "xattr.h"
#include "acl.h"
@@ -47,7 +36,6 @@
#define NAMEI_RA_CHUNKS 2
#define NAMEI_RA_BLOCKS 4
#define NAMEI_RA_SIZE (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS)
-#define NAMEI_RA_INDEX(c,b) (((c) * NAMEI_RA_BLOCKS) + (b))
static struct buffer_head *ext3_append(handle_t *handle,
struct inode *inode,
@@ -57,8 +45,7 @@ static struct buffer_head *ext3_append(handle_t *handle,
*block = inode->i_size >> inode->i_sb->s_blocksize_bits;
- bh = ext3_bread(handle, inode, *block, 1, err);
- if (bh) {
+ if ((bh = ext3_dir_bread(handle, inode, *block, 1, err))) {
inode->i_size += inode->i_sb->s_blocksize;
EXT3_I(inode)->i_disksize = inode->i_size;
*err = ext3_journal_get_write_access(handle, bh);
@@ -287,7 +274,7 @@ static struct stats dx_show_leaf(struct dx_hash_info *hinfo, struct ext3_dir_ent
while (len--) printk("%c", *name++);
ext3fs_dirhash(de->name, de->name_len, &h);
printk(":%x.%u ", h.hash,
- ((char *) de - base));
+ (unsigned) ((char *) de - base));
}
space += EXT3_DIR_REC_LEN(de->name_len);
names++;
@@ -350,8 +337,10 @@ dx_probe(struct qstr *entry, struct inode *dir,
u32 hash;
frame->bh = NULL;
- if (!(bh = ext3_bread (NULL,dir, 0, 0, err)))
+ if (!(bh = ext3_dir_bread(NULL, dir, 0, 0, err))) {
+ *err = ERR_BAD_DX_DIR;
goto fail;
+ }
root = (struct dx_root *) bh->b_data;
if (root->info.hash_version != DX_HASH_TEA &&
root->info.hash_version != DX_HASH_HALF_MD4 &&
@@ -447,8 +436,10 @@ dx_probe(struct qstr *entry, struct inode *dir,
frame->entries = entries;
frame->at = at;
if (!indirect--) return frame;
- if (!(bh = ext3_bread (NULL,dir, dx_get_block(at), 0, err)))
+ if (!(bh = ext3_dir_bread(NULL, dir, dx_get_block(at), 0, err))) {
+ *err = ERR_BAD_DX_DIR;
goto fail2;
+ }
at = entries = ((struct dx_node *) bh->b_data)->entries;
if (dx_get_limit(entries) != dx_node_limit (dir)) {
ext3_warning(dir->i_sb, __func__,
@@ -546,8 +537,8 @@ static int ext3_htree_next_block(struct inode *dir, __u32 hash,
* block so no check is necessary
*/
while (num_frames--) {
- if (!(bh = ext3_bread(NULL, dir, dx_get_block(p->at),
- 0, &err)))
+ if (!(bh = ext3_dir_bread(NULL, dir, dx_get_block(p->at),
+ 0, &err)))
return err; /* Failure */
p++;
brelse (p->bh);
@@ -570,10 +561,11 @@ static int htree_dirblock_to_tree(struct file *dir_file,
{
struct buffer_head *bh;
struct ext3_dir_entry_2 *de, *top;
- int err, count = 0;
+ int err = 0, count = 0;
dxtrace(printk("In htree dirblock_to_tree: block %d\n", block));
- if (!(bh = ext3_bread (NULL, dir, block, 0, &err)))
+
+ if (!(bh = ext3_dir_bread(NULL, dir, block, 0, &err)))
return err;
de = (struct ext3_dir_entry_2 *) bh->b_data;
@@ -584,11 +576,8 @@ static int htree_dirblock_to_tree(struct file *dir_file,
if (!ext3_check_dir_entry("htree_dirblock_to_tree", dir, de, bh,
(block<<EXT3_BLOCK_SIZE_BITS(dir->i_sb))
+((char *)de - bh->b_data))) {
- /* On error, skip the f_pos to the next block. */
- dir_file->f_pos = (dir_file->f_pos |
- (dir->i_sb->s_blocksize - 1)) + 1;
- brelse (bh);
- return count;
+ /* silently ignore the rest of the block */
+ break;
}
ext3fs_dirhash(de->name, de->name_len, hinfo);
if ((hinfo->hash < start_hash) ||
@@ -631,7 +620,7 @@ int ext3_htree_fill_tree(struct file *dir_file, __u32 start_hash,
dxtrace(printk("In htree_fill_tree, start hash: %x:%x\n", start_hash,
start_minor_hash));
- dir = dir_file->f_path.dentry->d_inode;
+ dir = file_inode(dir_file);
if (!(EXT3_I(dir)->i_flags & EXT3_INDEX_FL)) {
hinfo.hash_version = EXT3_SB(dir->i_sb)->s_def_hash_version;
if (hinfo.hash_version <= DX_HASH_TEA)
@@ -645,7 +634,7 @@ int ext3_htree_fill_tree(struct file *dir_file, __u32 start_hash,
}
hinfo.hash = start_hash;
hinfo.minor_hash = 0;
- frame = dx_probe(NULL, dir_file->f_path.dentry->d_inode, &hinfo, frames, &err);
+ frame = dx_probe(NULL, file_inode(dir_file), &hinfo, frames, &err);
if (!frame)
return err;
@@ -858,6 +847,7 @@ static struct buffer_head *ext3_find_entry(struct inode *dir,
struct buffer_head * bh_use[NAMEI_RA_SIZE];
struct buffer_head * bh, *ret = NULL;
unsigned long start, block, b;
+ const u8 *name = entry->name;
int ra_max = 0; /* Number of bh's in the readahead
buffer, bh_use[] */
int ra_ptr = 0; /* Current index into readahead
@@ -871,6 +861,16 @@ static struct buffer_head *ext3_find_entry(struct inode *dir,
namelen = entry->len;
if (namelen > EXT3_NAME_LEN)
return NULL;
+ if ((namelen <= 2) && (name[0] == '.') &&
+ (name[1] == '.' || name[1] == 0)) {
+ /*
+ * "." or ".." will only be in the first block
+ * NFS may look up ".."; "." should be handled by the VFS
+ */
+ block = start = 0;
+ nblocks = 1;
+ goto restart;
+ }
if (is_dx(dir)) {
bh = ext3_dx_find_entry(dir, entry, res_dir, &err);
/*
@@ -909,8 +909,12 @@ restart:
num++;
bh = ext3_getblk(NULL, dir, b++, 0, &err);
bh_use[ra_max] = bh;
- if (bh)
- ll_rw_block(READ_META, 1, &bh);
+ if (bh && !bh_uptodate_or_lock(bh)) {
+ get_bh(bh);
+ bh->b_end_io = end_buffer_read_sync;
+ submit_bh(READ | REQ_META | REQ_PRIO,
+ bh);
+ }
}
}
if ((bh = bh_use[ra_ptr++]) == NULL)
@@ -961,55 +965,35 @@ static struct buffer_head * ext3_dx_find_entry(struct inode *dir,
struct qstr *entry, struct ext3_dir_entry_2 **res_dir,
int *err)
{
- struct super_block * sb;
+ struct super_block *sb = dir->i_sb;
struct dx_hash_info hinfo;
- u32 hash;
struct dx_frame frames[2], *frame;
- struct ext3_dir_entry_2 *de, *top;
struct buffer_head *bh;
unsigned long block;
int retval;
- int namelen = entry->len;
- const u8 *name = entry->name;
- sb = dir->i_sb;
- /* NFS may look up ".." - look at dx_root directory block */
- if (namelen > 2 || name[0] != '.'|| (namelen == 2 && name[1] != '.')) {
- if (!(frame = dx_probe(entry, dir, &hinfo, frames, err)))
- return NULL;
- } else {
- frame = frames;
- frame->bh = NULL; /* for dx_release() */
- frame->at = (struct dx_entry *)frames; /* hack for zero entry*/
- dx_set_block(frame->at, 0); /* dx_root block is 0 */
- }
- hash = hinfo.hash;
+ if (!(frame = dx_probe(entry, dir, &hinfo, frames, err)))
+ return NULL;
do {
block = dx_get_block(frame->at);
- if (!(bh = ext3_bread (NULL,dir, block, 0, err)))
+ if (!(bh = ext3_dir_bread (NULL, dir, block, 0, err)))
goto errout;
- de = (struct ext3_dir_entry_2 *) bh->b_data;
- top = (struct ext3_dir_entry_2 *) ((char *) de + sb->s_blocksize -
- EXT3_DIR_REC_LEN(0));
- for (; de < top; de = ext3_next_entry(de)) {
- int off = (block << EXT3_BLOCK_SIZE_BITS(sb))
- + ((char *) de - bh->b_data);
-
- if (!ext3_check_dir_entry(__func__, dir, de, bh, off)) {
- brelse(bh);
- *err = ERR_BAD_DX_DIR;
- goto errout;
- }
- if (ext3_match(namelen, name, de)) {
- *res_dir = de;
- dx_release(frames);
- return bh;
- }
+ retval = search_dirblock(bh, dir, entry,
+ block << EXT3_BLOCK_SIZE_BITS(sb),
+ res_dir);
+ if (retval == 1) {
+ dx_release(frames);
+ return bh;
}
- brelse (bh);
+ brelse(bh);
+ if (retval == -1) {
+ *err = ERR_BAD_DX_DIR;
+ goto errout;
+ }
+
/* Check to see if we should continue to search */
- retval = ext3_htree_next_block(dir, hash, frame,
+ retval = ext3_htree_next_block(dir, hinfo.hash, frame,
frames, NULL);
if (retval < 0) {
ext3_warning(sb, __func__,
@@ -1022,12 +1006,12 @@ static struct buffer_head * ext3_dx_find_entry(struct inode *dir,
*err = -ENOENT;
errout:
- dxtrace(printk("%s not found\n", name));
+ dxtrace(printk("%s not found\n", entry->name));
dx_release (frames);
return NULL;
}
-static struct dentry *ext3_lookup(struct inode * dir, struct dentry *dentry, struct nameidata *nd)
+static struct dentry *ext3_lookup(struct inode * dir, struct dentry *dentry, unsigned int flags)
{
struct inode * inode;
struct ext3_dir_entry_2 * de;
@@ -1047,15 +1031,11 @@ static struct dentry *ext3_lookup(struct inode * dir, struct dentry *dentry, str
return ERR_PTR(-EIO);
}
inode = ext3_iget(dir->i_sb, ino);
- if (unlikely(IS_ERR(inode))) {
- if (PTR_ERR(inode) == -ESTALE) {
- ext3_error(dir->i_sb, __func__,
- "deleted inode referenced: %lu",
- ino);
- return ERR_PTR(-EIO);
- } else {
- return ERR_CAST(inode);
- }
+ if (inode == ERR_PTR(-ESTALE)) {
+ ext3_error(dir->i_sb, __func__,
+ "deleted inode referenced: %lu",
+ ino);
+ return ERR_PTR(-EIO);
}
}
return d_splice_alias(inode, dentry);
@@ -1065,7 +1045,7 @@ static struct dentry *ext3_lookup(struct inode * dir, struct dentry *dentry, str
struct dentry *ext3_get_parent(struct dentry *child)
{
unsigned long ino;
- struct qstr dotdot = {.name = "..", .len = 2};
+ struct qstr dotdot = QSTR_INIT("..", 2);
struct ext3_dir_entry_2 * de;
struct buffer_head *bh;
@@ -1425,10 +1405,19 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
frame->at = entries;
frame->bh = bh;
bh = bh2;
+ /*
+ * Mark buffers dirty here so that if do_split() fails we write a
+ * consistent set of buffers to disk.
+ */
+ ext3_journal_dirty_metadata(handle, frame->bh);
+ ext3_journal_dirty_metadata(handle, bh);
de = do_split(handle,dir, &bh, frame, &hinfo, &retval);
- dx_release (frames);
- if (!(de))
+ if (!de) {
+ ext3_mark_inode_dirty(handle, dir);
+ dx_release(frames);
return retval;
+ }
+ dx_release(frames);
return add_dirent_to_buf(handle, dentry, inode, de, bh);
}
@@ -1469,9 +1458,9 @@ static int ext3_add_entry (handle_t *handle, struct dentry *dentry,
}
blocks = dir->i_size >> sb->s_blocksize_bits;
for (block = 0; block < blocks; block++) {
- bh = ext3_bread(handle, dir, block, 0, &retval);
- if(!bh)
+ if (!(bh = ext3_dir_bread(handle, dir, block, 0, &retval)))
return retval;
+
retval = add_dirent_to_buf(handle, dentry, inode, NULL, bh);
if (retval != -ENOSPC)
return retval;
@@ -1511,7 +1500,7 @@ static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry,
entries = frame->entries;
at = frame->at;
- if (!(bh = ext3_bread(handle,dir, dx_get_block(frame->at), 0, &err)))
+ if (!(bh = ext3_dir_bread(handle, dir, dx_get_block(frame->at), 0, &err)))
goto cleanup;
BUFFER_TRACE(bh, "get_write_access");
@@ -1549,8 +1538,8 @@ static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry,
goto cleanup;
node2 = (struct dx_node *)(bh2->b_data);
entries2 = node2->entries;
+ memset(&node2->fake, 0, sizeof(struct fake_dirent));
node2->fake.rec_len = ext3_rec_len_to_disk(sb->s_blocksize);
- node2->fake.inode = 0;
BUFFER_TRACE(frame->bh, "get_write_access");
err = ext3_journal_get_write_access(handle, frame->bh);
if (err)
@@ -1607,7 +1596,9 @@ static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry,
if (err)
goto journal_error;
}
- ext3_journal_dirty_metadata(handle, frames[0].bh);
+ err = ext3_journal_dirty_metadata(handle, frames[0].bh);
+ if (err)
+ goto journal_error;
}
de = do_split(handle, dir, &bh, frame, &hinfo, &err);
if (!de)
@@ -1644,8 +1635,13 @@ static int ext3_delete_entry (handle_t *handle,
if (!ext3_check_dir_entry("ext3_delete_entry", dir, de, bh, i))
return -EIO;
if (de == de_del) {
+ int err;
+
BUFFER_TRACE(bh, "get_write_access");
- ext3_journal_get_write_access(handle, bh);
+ err = ext3_journal_get_write_access(handle, bh);
+ if (err)
+ goto journal_error;
+
if (pde)
pde->rec_len = ext3_rec_len_to_disk(
ext3_rec_len_from_disk(pde->rec_len) +
@@ -1654,7 +1650,12 @@ static int ext3_delete_entry (handle_t *handle,
de->inode = 0;
dir->i_version++;
BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
- ext3_journal_dirty_metadata(handle, bh);
+ err = ext3_journal_dirty_metadata(handle, bh);
+ if (err) {
+journal_error:
+ ext3_std_error(dir->i_sb, err);
+ return err;
+ }
return 0;
}
i += ext3_rec_len_from_disk(de->rec_len);
@@ -1670,8 +1671,8 @@ static int ext3_add_nondir(handle_t *handle,
int err = ext3_add_entry(handle, dentry, inode);
if (!err) {
ext3_mark_inode_dirty(handle, inode);
- d_instantiate(dentry, inode);
unlock_new_inode(inode);
+ d_instantiate(dentry, inode);
return 0;
}
drop_nlink(inode);
@@ -1688,8 +1689,8 @@ static int ext3_add_nondir(handle_t *handle,
* If the create succeeds, we fill in the inode information
* with d_instantiate().
*/
-static int ext3_create (struct inode * dir, struct dentry * dentry, int mode,
- struct nameidata *nd)
+static int ext3_create (struct inode * dir, struct dentry * dentry, umode_t mode,
+ bool excl)
{
handle_t *handle;
struct inode * inode;
@@ -1707,7 +1708,7 @@ retry:
if (IS_DIRSYNC(dir))
handle->h_sync = 1;
- inode = ext3_new_inode (handle, dir, mode);
+ inode = ext3_new_inode (handle, dir, &dentry->d_name, mode);
err = PTR_ERR(inode);
if (!IS_ERR(inode)) {
inode->i_op = &ext3_file_inode_operations;
@@ -1722,7 +1723,7 @@ retry:
}
static int ext3_mknod (struct inode * dir, struct dentry *dentry,
- int mode, dev_t rdev)
+ umode_t mode, dev_t rdev)
{
handle_t *handle;
struct inode *inode;
@@ -1743,7 +1744,7 @@ retry:
if (IS_DIRSYNC(dir))
handle->h_sync = 1;
- inode = ext3_new_inode (handle, dir, mode);
+ inode = ext3_new_inode (handle, dir, &dentry->d_name, mode);
err = PTR_ERR(inode);
if (!IS_ERR(inode)) {
init_special_inode(inode, inode->i_mode, rdev);
@@ -1758,11 +1759,49 @@ retry:
return err;
}
-static int ext3_mkdir(struct inode * dir, struct dentry * dentry, int mode)
+static int ext3_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
+{
+ handle_t *handle;
+ struct inode *inode;
+ int err, retries = 0;
+
+ dquot_initialize(dir);
+
+retry:
+ handle = ext3_journal_start(dir, EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb) +
+ 4 + EXT3_XATTR_TRANS_BLOCKS);
+
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
+
+ inode = ext3_new_inode (handle, dir, NULL, mode);
+ err = PTR_ERR(inode);
+ if (!IS_ERR(inode)) {
+ inode->i_op = &ext3_file_inode_operations;
+ inode->i_fop = &ext3_file_operations;
+ ext3_set_aops(inode);
+ d_tmpfile(dentry, inode);
+ err = ext3_orphan_add(handle, inode);
+ if (err)
+ goto err_unlock_inode;
+ mark_inode_dirty(inode);
+ unlock_new_inode(inode);
+ }
+ ext3_journal_stop(handle);
+ if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries))
+ goto retry;
+ return err;
+err_unlock_inode:
+ ext3_journal_stop(handle);
+ unlock_new_inode(inode);
+ return err;
+}
+
+static int ext3_mkdir(struct inode * dir, struct dentry * dentry, umode_t mode)
{
handle_t *handle;
struct inode * inode;
- struct buffer_head * dir_block;
+ struct buffer_head * dir_block = NULL;
struct ext3_dir_entry_2 * de;
int err, retries = 0;
@@ -1781,7 +1820,7 @@ retry:
if (IS_DIRSYNC(dir))
handle->h_sync = 1;
- inode = ext3_new_inode (handle, dir, S_IFDIR | mode);
+ inode = ext3_new_inode (handle, dir, &dentry->d_name, S_IFDIR | mode);
err = PTR_ERR(inode);
if (IS_ERR(inode))
goto out_stop;
@@ -1789,16 +1828,14 @@ retry:
inode->i_op = &ext3_dir_inode_operations;
inode->i_fop = &ext3_dir_operations;
inode->i_size = EXT3_I(inode)->i_disksize = inode->i_sb->s_blocksize;
- dir_block = ext3_bread (handle, inode, 0, 1, &err);
- if (!dir_block) {
- drop_nlink(inode); /* is this nlink == 0? */
- unlock_new_inode(inode);
- ext3_mark_inode_dirty(handle, inode);
- iput (inode);
- goto out_stop;
- }
+ if (!(dir_block = ext3_dir_bread(handle, inode, 0, 1, &err)))
+ goto out_clear_inode;
+
BUFFER_TRACE(dir_block, "get_write_access");
- ext3_journal_get_write_access(handle, dir_block);
+ err = ext3_journal_get_write_access(handle, dir_block);
+ if (err)
+ goto out_clear_inode;
+
de = (struct ext3_dir_entry_2 *) dir_block->b_data;
de->inode = cpu_to_le32(inode->i_ino);
de->name_len = 1;
@@ -1812,14 +1849,19 @@ retry:
de->name_len = 2;
strcpy (de->name, "..");
ext3_set_de_type(dir->i_sb, de, S_IFDIR);
- inode->i_nlink = 2;
+ set_nlink(inode, 2);
BUFFER_TRACE(dir_block, "call ext3_journal_dirty_metadata");
- ext3_journal_dirty_metadata(handle, dir_block);
- brelse (dir_block);
- ext3_mark_inode_dirty(handle, inode);
- err = ext3_add_entry (handle, dentry, inode);
+ err = ext3_journal_dirty_metadata(handle, dir_block);
+ if (err)
+ goto out_clear_inode;
+
+ err = ext3_mark_inode_dirty(handle, inode);
+ if (!err)
+ err = ext3_add_entry (handle, dentry, inode);
+
if (err) {
- inode->i_nlink = 0;
+out_clear_inode:
+ clear_nlink(inode);
unlock_new_inode(inode);
ext3_mark_inode_dirty(handle, inode);
iput (inode);
@@ -1827,10 +1869,14 @@ retry:
}
inc_nlink(dir);
ext3_update_dx_flag(dir);
- ext3_mark_inode_dirty(handle, dir);
- d_instantiate(dentry, inode);
+ err = ext3_mark_inode_dirty(handle, dir);
+ if (err)
+ goto out_clear_inode;
+
unlock_new_inode(inode);
+ d_instantiate(dentry, inode);
out_stop:
+ brelse(dir_block);
ext3_journal_stop(handle);
if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries))
goto retry;
@@ -1850,7 +1896,7 @@ static int empty_dir (struct inode * inode)
sb = inode->i_sb;
if (inode->i_size < EXT3_DIR_REC_LEN(1) + EXT3_DIR_REC_LEN(2) ||
- !(bh = ext3_bread (NULL, inode, 0, 0, &err))) {
+ !(bh = ext3_dir_bread(NULL, inode, 0, 0, &err))) {
if (err)
ext3_error(inode->i_sb, __func__,
"error %d reading directory #%lu offset 0",
@@ -1881,9 +1927,8 @@ static int empty_dir (struct inode * inode)
(void *) de >= (void *) (bh->b_data+sb->s_blocksize)) {
err = 0;
brelse (bh);
- bh = ext3_bread (NULL, inode,
- offset >> EXT3_BLOCK_SIZE_BITS(sb), 0, &err);
- if (!bh) {
+ if (!(bh = ext3_dir_bread (NULL, inode,
+ offset >> EXT3_BLOCK_SIZE_BITS(sb), 0, &err))) {
if (err)
ext3_error(sb, __func__,
"error %d reading directory"
@@ -2124,6 +2169,7 @@ static int ext3_unlink(struct inode * dir, struct dentry *dentry)
struct ext3_dir_entry_2 * de;
handle_t *handle;
+ trace_ext3_unlink_enter(dir, dentry);
/* Initialize quotas before so that eventual writes go
* in separate transaction */
dquot_initialize(dir);
@@ -2151,7 +2197,7 @@ static int ext3_unlink(struct inode * dir, struct dentry *dentry)
ext3_warning (inode->i_sb, "ext3_unlink",
"Deleting nonexistent file (%lu), %d",
inode->i_ino, inode->i_nlink);
- inode->i_nlink = 1;
+ set_nlink(inode, 1);
}
retval = ext3_delete_entry(handle, dir, de, bh);
if (retval)
@@ -2169,6 +2215,7 @@ static int ext3_unlink(struct inode * dir, struct dentry *dentry)
end_unlink:
ext3_journal_stop(handle);
brelse (bh);
+ trace_ext3_unlink_exit(dentry, retval);
return retval;
}
@@ -2178,6 +2225,7 @@ static int ext3_symlink (struct inode * dir,
handle_t *handle;
struct inode * inode;
int l, err, retries = 0;
+ int credits;
l = strlen(symname)+1;
if (l > dir->i_sb->s_blocksize)
@@ -2185,36 +2233,78 @@ static int ext3_symlink (struct inode * dir,
dquot_initialize(dir);
+ if (l > EXT3_N_BLOCKS * 4) {
+ /*
+ * For non-fast symlinks, we just allocate inode and put it on
+ * orphan list in the first transaction => we need bitmap,
+ * group descriptor, sb, inode block, quota blocks, and
+ * possibly selinux xattr blocks.
+ */
+ credits = 4 + EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb) +
+ EXT3_XATTR_TRANS_BLOCKS;
+ } else {
+ /*
+ * Fast symlink. We have to add entry to directory
+ * (EXT3_DATA_TRANS_BLOCKS + EXT3_INDEX_EXTRA_TRANS_BLOCKS),
+ * allocate new inode (bitmap, group descriptor, inode block,
+ * quota blocks, sb is already counted in previous macros).
+ */
+ credits = EXT3_DATA_TRANS_BLOCKS(dir->i_sb) +
+ EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3 +
+ EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb);
+ }
retry:
- handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) +
- EXT3_INDEX_EXTRA_TRANS_BLOCKS + 5 +
- EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
+ handle = ext3_journal_start(dir, credits);
if (IS_ERR(handle))
return PTR_ERR(handle);
if (IS_DIRSYNC(dir))
handle->h_sync = 1;
- inode = ext3_new_inode (handle, dir, S_IFLNK|S_IRWXUGO);
+ inode = ext3_new_inode (handle, dir, &dentry->d_name, S_IFLNK|S_IRWXUGO);
err = PTR_ERR(inode);
if (IS_ERR(inode))
goto out_stop;
- if (l > sizeof (EXT3_I(inode)->i_data)) {
+ if (l > EXT3_N_BLOCKS * 4) {
inode->i_op = &ext3_symlink_inode_operations;
ext3_set_aops(inode);
/*
- * page_symlink() calls into ext3_prepare/commit_write.
- * We have a transaction open. All is sweetness. It also sets
- * i_size in generic_commit_write().
+ * We cannot call page_symlink() with transaction started
+ * because it calls into ext3_write_begin() which acquires page
+ * lock which ranks below transaction start (and it can also
+ * wait for journal commit if we are running out of space). So
+ * we have to stop transaction now and restart it when symlink
+ * contents is written.
+ *
+ * To keep fs consistent in case of crash, we have to put inode
+ * to orphan list in the mean time.
*/
+ drop_nlink(inode);
+ err = ext3_orphan_add(handle, inode);
+ ext3_journal_stop(handle);
+ if (err)
+ goto err_drop_inode;
err = __page_symlink(inode, symname, l, 1);
+ if (err)
+ goto err_drop_inode;
+ /*
+ * Now inode is being linked into dir (EXT3_DATA_TRANS_BLOCKS
+ * + EXT3_INDEX_EXTRA_TRANS_BLOCKS), inode is also modified
+ */
+ handle = ext3_journal_start(dir,
+ EXT3_DATA_TRANS_BLOCKS(dir->i_sb) +
+ EXT3_INDEX_EXTRA_TRANS_BLOCKS + 1);
+ if (IS_ERR(handle)) {
+ err = PTR_ERR(handle);
+ goto err_drop_inode;
+ }
+ set_nlink(inode, 1);
+ err = ext3_orphan_del(handle, inode);
if (err) {
+ ext3_journal_stop(handle);
drop_nlink(inode);
- unlock_new_inode(inode);
- ext3_mark_inode_dirty(handle, inode);
- iput (inode);
- goto out_stop;
+ goto err_drop_inode;
}
} else {
inode->i_op = &ext3_fast_symlink_inode_operations;
@@ -2228,6 +2318,10 @@ out_stop:
if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries))
goto retry;
return err;
+err_drop_inode:
+ unlock_new_inode(inode);
+ iput(inode);
+ return err;
}
static int ext3_link (struct dentry * old_dentry,
@@ -2242,16 +2336,9 @@ static int ext3_link (struct dentry * old_dentry,
dquot_initialize(dir);
- /*
- * Return -ENOENT if we've raced with unlink and i_nlink is 0. Doing
- * otherwise has the potential to corrupt the orphan inode list.
- */
- if (inode->i_nlink == 0)
- return -ENOENT;
-
retry:
handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) +
- EXT3_INDEX_EXTRA_TRANS_BLOCKS);
+ EXT3_INDEX_EXTRA_TRANS_BLOCKS + 1);
if (IS_ERR(handle))
return PTR_ERR(handle);
@@ -2265,6 +2352,11 @@ retry:
err = ext3_add_entry(handle, dentry, inode);
if (!err) {
ext3_mark_inode_dirty(handle, inode);
+ /* this can happen only for tmpfile being
+ * linked the first time
+ */
+ if (inode->i_nlink == 1)
+ ext3_orphan_del(handle, inode);
d_instantiate(dentry, inode);
} else {
drop_nlink(inode);
@@ -2337,7 +2429,7 @@ static int ext3_rename (struct inode * old_dir, struct dentry *old_dentry,
goto end_rename;
}
retval = -EIO;
- dir_bh = ext3_bread (handle, old_inode, 0, 0, &retval);
+ dir_bh = ext3_dir_bread(handle, old_inode, 0, 0, &retval);
if (!dir_bh)
goto end_rename;
if (le32_to_cpu(PARENT_INO(dir_bh->b_data)) != old_dir->i_ino)
@@ -2353,7 +2445,9 @@ static int ext3_rename (struct inode * old_dir, struct dentry *old_dentry,
goto end_rename;
} else {
BUFFER_TRACE(new_bh, "get write access");
- ext3_journal_get_write_access(handle, new_bh);
+ retval = ext3_journal_get_write_access(handle, new_bh);
+ if (retval)
+ goto journal_error;
new_de->inode = cpu_to_le32(old_inode->i_ino);
if (EXT3_HAS_INCOMPAT_FEATURE(new_dir->i_sb,
EXT3_FEATURE_INCOMPAT_FILETYPE))
@@ -2362,7 +2456,9 @@ static int ext3_rename (struct inode * old_dir, struct dentry *old_dentry,
new_dir->i_ctime = new_dir->i_mtime = CURRENT_TIME_SEC;
ext3_mark_inode_dirty(handle, new_dir);
BUFFER_TRACE(new_bh, "call ext3_journal_dirty_metadata");
- ext3_journal_dirty_metadata(handle, new_bh);
+ retval = ext3_journal_dirty_metadata(handle, new_bh);
+ if (retval)
+ goto journal_error;
brelse(new_bh);
new_bh = NULL;
}
@@ -2411,10 +2507,17 @@ static int ext3_rename (struct inode * old_dir, struct dentry *old_dentry,
ext3_update_dx_flag(old_dir);
if (dir_bh) {
BUFFER_TRACE(dir_bh, "get_write_access");
- ext3_journal_get_write_access(handle, dir_bh);
+ retval = ext3_journal_get_write_access(handle, dir_bh);
+ if (retval)
+ goto journal_error;
PARENT_INO(dir_bh->b_data) = cpu_to_le32(new_dir->i_ino);
BUFFER_TRACE(dir_bh, "call ext3_journal_dirty_metadata");
- ext3_journal_dirty_metadata(handle, dir_bh);
+ retval = ext3_journal_dirty_metadata(handle, dir_bh);
+ if (retval) {
+journal_error:
+ ext3_std_error(new_dir->i_sb, retval);
+ goto end_rename;
+ }
drop_nlink(old_dir);
if (new_inode) {
drop_nlink(new_inode);
@@ -2456,6 +2559,7 @@ const struct inode_operations ext3_dir_inode_operations = {
.mkdir = ext3_mkdir,
.rmdir = ext3_rmdir,
.mknod = ext3_mknod,
+ .tmpfile = ext3_tmpfile,
.rename = ext3_rename,
.setattr = ext3_setattr,
#ifdef CONFIG_EXT3_FS_XATTR
@@ -2464,7 +2568,8 @@ const struct inode_operations ext3_dir_inode_operations = {
.listxattr = ext3_listxattr,
.removexattr = generic_removexattr,
#endif
- .check_acl = ext3_check_acl,
+ .get_acl = ext3_get_acl,
+ .set_acl = ext3_set_acl,
};
const struct inode_operations ext3_special_inode_operations = {
@@ -2475,5 +2580,6 @@ const struct inode_operations ext3_special_inode_operations = {
.listxattr = ext3_listxattr,
.removexattr = generic_removexattr,
#endif
- .check_acl = ext3_check_acl,
+ .get_acl = ext3_get_acl,
+ .set_acl = ext3_set_acl,
};
diff --git a/fs/ext3/namei.h b/fs/ext3/namei.h
index f2ce2b0065c..46304d8c9f0 100644
--- a/fs/ext3/namei.h
+++ b/fs/ext3/namei.h
@@ -6,3 +6,22 @@
*/
extern struct dentry *ext3_get_parent(struct dentry *child);
+
+static inline struct buffer_head *ext3_dir_bread(handle_t *handle,
+ struct inode *inode,
+ int block, int create,
+ int *err)
+{
+ struct buffer_head *bh;
+
+ bh = ext3_bread(handle, inode, block, create, err);
+
+ if (!bh && !(*err)) {
+ *err = -EIO;
+ ext3_error(inode->i_sb, __func__,
+ "Directory hole detected on inode %lu\n",
+ inode->i_ino);
+ return NULL;
+ }
+ return bh;
+}
diff --git a/fs/ext3/resize.c b/fs/ext3/resize.c
index e746d30b123..27105655502 100644
--- a/fs/ext3/resize.c
+++ b/fs/ext3/resize.c
@@ -11,10 +11,7 @@
#define EXT3FS_DEBUG
-#include <linux/ext3_jbd.h>
-
-#include <linux/errno.h>
-#include <linux/slab.h>
+#include "ext3.h"
#define outside(b, first, last) ((b) < (first) || (b) >= (last))
@@ -119,8 +116,8 @@ static struct buffer_head *bclean(handle_t *handle, struct super_block *sb,
int err;
bh = sb_getblk(sb, blk);
- if (!bh)
- return ERR_PTR(-EIO);
+ if (unlikely(!bh))
+ return ERR_PTR(-ENOMEM);
if ((err = ext3_journal_get_write_access(handle, bh))) {
brelse(bh);
bh = ERR_PTR(err);
@@ -237,8 +234,8 @@ static int setup_new_group_blocks(struct super_block *sb,
goto exit_bh;
gdb = sb_getblk(sb, block);
- if (!gdb) {
- err = -EIO;
+ if (unlikely(!gdb)) {
+ err = -ENOMEM;
goto exit_bh;
}
if ((err = ext3_journal_get_write_access(handle, gdb))) {
@@ -249,7 +246,11 @@ static int setup_new_group_blocks(struct super_block *sb,
memcpy(gdb->b_data, sbi->s_group_desc[i]->b_data, gdb->b_size);
set_buffer_uptodate(gdb);
unlock_buffer(gdb);
- ext3_journal_dirty_metadata(handle, gdb);
+ err = ext3_journal_dirty_metadata(handle, gdb);
+ if (err) {
+ brelse(gdb);
+ goto exit_bh;
+ }
ext3_set_bit(bit, bh->b_data);
brelse(gdb);
}
@@ -269,7 +270,11 @@ static int setup_new_group_blocks(struct super_block *sb,
err = PTR_ERR(gdb);
goto exit_bh;
}
- ext3_journal_dirty_metadata(handle, gdb);
+ err = ext3_journal_dirty_metadata(handle, gdb);
+ if (err) {
+ brelse(gdb);
+ goto exit_bh;
+ }
ext3_set_bit(bit, bh->b_data);
brelse(gdb);
}
@@ -295,7 +300,11 @@ static int setup_new_group_blocks(struct super_block *sb,
err = PTR_ERR(it);
goto exit_bh;
}
- ext3_journal_dirty_metadata(handle, it);
+ err = ext3_journal_dirty_metadata(handle, it);
+ if (err) {
+ brelse(it);
+ goto exit_bh;
+ }
brelse(it);
ext3_set_bit(bit, bh->b_data);
}
@@ -306,7 +315,9 @@ static int setup_new_group_blocks(struct super_block *sb,
mark_bitmap_end(input->blocks_count, EXT3_BLOCKS_PER_GROUP(sb),
bh->b_data);
- ext3_journal_dirty_metadata(handle, bh);
+ err = ext3_journal_dirty_metadata(handle, bh);
+ if (err)
+ goto exit_bh;
brelse(bh);
/* Mark unused entries in inode bitmap used */
@@ -319,7 +330,7 @@ static int setup_new_group_blocks(struct super_block *sb,
mark_bitmap_end(EXT3_INODES_PER_GROUP(sb), EXT3_BLOCKS_PER_GROUP(sb),
bh->b_data);
- ext3_journal_dirty_metadata(handle, bh);
+ err = ext3_journal_dirty_metadata(handle, bh);
exit_bh:
brelse(bh);
@@ -503,12 +514,19 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
* reserved inode, and will become GDT blocks (primary and backup).
*/
data[gdb_num % EXT3_ADDR_PER_BLOCK(sb)] = 0;
- ext3_journal_dirty_metadata(handle, dind);
+ err = ext3_journal_dirty_metadata(handle, dind);
+ if (err)
+ goto exit_group_desc;
brelse(dind);
+ dind = NULL;
inode->i_blocks -= (gdbackups + 1) * sb->s_blocksize >> 9;
- ext3_mark_iloc_dirty(handle, inode, &iloc);
+ err = ext3_mark_iloc_dirty(handle, inode, &iloc);
+ if (err)
+ goto exit_group_desc;
memset((*primary)->b_data, 0, sb->s_blocksize);
- ext3_journal_dirty_metadata(handle, *primary);
+ err = ext3_journal_dirty_metadata(handle, *primary);
+ if (err)
+ goto exit_group_desc;
o_group_desc = EXT3_SB(sb)->s_group_desc;
memcpy(n_group_desc, o_group_desc,
@@ -519,10 +537,14 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
kfree(o_group_desc);
le16_add_cpu(&es->s_reserved_gdt_blocks, -1);
- ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh);
+ err = ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh);
+ if (err)
+ goto exit_inode;
return 0;
+exit_group_desc:
+ kfree(n_group_desc);
exit_inode:
//ext3_journal_release_buffer(handle, iloc.bh);
brelse(iloc.bh);
@@ -700,22 +722,26 @@ static void update_backups(struct super_block *sb,
break;
bh = sb_getblk(sb, group * bpg + blk_off);
- if (!bh) {
- err = -EIO;
+ if (unlikely(!bh)) {
+ err = -ENOMEM;
break;
}
ext3_debug("update metadata backup %#04lx\n",
(unsigned long)bh->b_blocknr);
- if ((err = ext3_journal_get_write_access(handle, bh)))
+ if ((err = ext3_journal_get_write_access(handle, bh))) {
+ brelse(bh);
break;
+ }
lock_buffer(bh);
memcpy(bh->b_data, data, size);
if (rest)
memset(bh->b_data + size, 0, rest);
set_buffer_uptodate(bh);
unlock_buffer(bh);
- ext3_journal_dirty_metadata(handle, bh);
+ err = ext3_journal_dirty_metadata(handle, bh);
brelse(bh);
+ if (err)
+ break;
}
if ((err2 = ext3_journal_stop(handle)) && !err)
err = err2;
@@ -922,7 +948,9 @@ int ext3_group_add(struct super_block *sb, struct ext3_new_group_data *input)
/* Update the global fs size fields */
sbi->s_groups_count++;
- ext3_journal_dirty_metadata(handle, primary);
+ err = ext3_journal_dirty_metadata(handle, primary);
+ if (err)
+ goto exit_journal;
/* Update the reserved block counts only once the new group is
* active. */
@@ -934,7 +962,7 @@ int ext3_group_add(struct super_block *sb, struct ext3_new_group_data *input)
percpu_counter_add(&sbi->s_freeinodes_counter,
EXT3_INODES_PER_GROUP(sb));
- ext3_journal_dirty_metadata(handle, sbi->s_sbh);
+ err = ext3_journal_dirty_metadata(handle, sbi->s_sbh);
exit_journal:
mutex_unlock(&sbi->s_resize_lock);
@@ -978,7 +1006,7 @@ int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es,
if (test_opt(sb, DEBUG))
printk(KERN_DEBUG "EXT3-fs: extending last group from "E3FSBLK
- " upto "E3FSBLK" blocks\n",
+ " up to "E3FSBLK" blocks\n",
o_blocks_count, n_blocks_count);
if (n_blocks_count == 0 || n_blocks_count == o_blocks_count)
@@ -1064,8 +1092,14 @@ int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es,
goto exit_put;
}
es->s_blocks_count = cpu_to_le32(o_blocks_count + add);
- ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh);
+ err = ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh);
mutex_unlock(&EXT3_SB(sb)->s_resize_lock);
+ if (err) {
+ ext3_warning(sb, __func__,
+ "error %d on journal dirty metadata", err);
+ ext3_journal_stop(handle);
+ goto exit_put;
+ }
ext3_debug("freeing blocks "E3FSBLK" through "E3FSBLK"\n",
o_blocks_count, o_blocks_count + add);
ext3_free_blocks_sb(handle, sb, o_blocks_count, add, &freed_blocks);
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 2fedaf8b501..08cdfe5461e 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -17,29 +17,23 @@
*/
#include <linux/module.h>
-#include <linux/string.h>
-#include <linux/fs.h>
-#include <linux/time.h>
-#include <linux/jbd.h>
-#include <linux/ext3_fs.h>
-#include <linux/ext3_jbd.h>
-#include <linux/slab.h>
-#include <linux/init.h>
#include <linux/blkdev.h>
#include <linux/parser.h>
-#include <linux/smp_lock.h>
-#include <linux/buffer_head.h>
#include <linux/exportfs.h>
-#include <linux/vfs.h>
+#include <linux/statfs.h>
#include <linux/random.h>
#include <linux/mount.h>
-#include <linux/namei.h>
#include <linux/quotaops.h>
#include <linux/seq_file.h>
#include <linux/log2.h>
+#include <linux/cleancache.h>
+#include <linux/namei.h>
#include <asm/uaccess.h>
+#define CREATE_TRACE_POINTS
+
+#include "ext3.h"
#include "xattr.h"
#include "acl.h"
#include "namei.h"
@@ -71,11 +65,6 @@ static int ext3_freeze(struct super_block *sb);
/*
* Wrappers for journal_start/end.
- *
- * The only special thing we need to do here is to make sure that all
- * journal_end calls result in the superblock being marked dirty, so
- * that sync() will call the filesystem's write_super callback if
- * appropriate.
*/
handle_t *ext3_journal_start_sb(struct super_block *sb, int nblocks)
{
@@ -97,12 +86,6 @@ handle_t *ext3_journal_start_sb(struct super_block *sb, int nblocks)
return journal_start(journal, nblocks);
}
-/*
- * The only special thing we need to do here is to make sure that all
- * journal_stop calls result in the superblock being marked dirty, so
- * that sync() will call the filesystem's write_super callback if
- * appropriate.
- */
int __ext3_journal_stop(const char *where, handle_t *handle)
{
struct super_block *sb;
@@ -144,12 +127,16 @@ void ext3_journal_abort_handle(const char *caller, const char *err_fn,
void ext3_msg(struct super_block *sb, const char *prefix,
const char *fmt, ...)
{
+ struct va_format vaf;
va_list args;
va_start(args, fmt);
- printk("%sEXT3-fs (%s): ", prefix, sb->s_id);
- vprintk(fmt, args);
- printk("\n");
+
+ vaf.fmt = fmt;
+ vaf.va = &args;
+
+ printk("%sEXT3-fs (%s): %pV\n", prefix, sb->s_id, &vaf);
+
va_end(args);
}
@@ -188,6 +175,11 @@ static void ext3_handle_error(struct super_block *sb)
if (test_opt (sb, ERRORS_RO)) {
ext3_msg(sb, KERN_CRIT,
"error: remounting filesystem read-only");
+ /*
+ * Make sure updated value of ->s_mount_state will be visible
+ * before ->s_flags update.
+ */
+ smp_wmb();
sb->s_flags |= MS_RDONLY;
}
ext3_commit_super(sb, es, 1);
@@ -196,15 +188,20 @@ static void ext3_handle_error(struct super_block *sb)
sb->s_id);
}
-void ext3_error (struct super_block * sb, const char * function,
- const char * fmt, ...)
+void ext3_error(struct super_block *sb, const char *function,
+ const char *fmt, ...)
{
+ struct va_format vaf;
va_list args;
va_start(args, fmt);
- printk(KERN_CRIT "EXT3-fs error (device %s): %s: ",sb->s_id, function);
- vprintk(fmt, args);
- printk("\n");
+
+ vaf.fmt = fmt;
+ vaf.va = &args;
+
+ printk(KERN_CRIT "EXT3-fs error (device %s): %s: %pV\n",
+ sb->s_id, function, &vaf);
+
va_end(args);
ext3_handle_error(sb);
@@ -275,15 +272,20 @@ void __ext3_std_error (struct super_block * sb, const char * function,
* case we take the easy way out and panic immediately.
*/
-void ext3_abort (struct super_block * sb, const char * function,
- const char * fmt, ...)
+void ext3_abort(struct super_block *sb, const char *function,
+ const char *fmt, ...)
{
+ struct va_format vaf;
va_list args;
va_start(args, fmt);
- printk(KERN_CRIT "EXT3-fs (%s): error: %s: ", sb->s_id, function);
- vprintk(fmt, args);
- printk("\n");
+
+ vaf.fmt = fmt;
+ vaf.va = &args;
+
+ printk(KERN_CRIT "EXT3-fs (%s): error: %s: %pV\n",
+ sb->s_id, function, &vaf);
+
va_end(args);
if (test_opt(sb, ERRORS_PANIC))
@@ -295,22 +297,32 @@ void ext3_abort (struct super_block * sb, const char * function,
ext3_msg(sb, KERN_CRIT,
"error: remounting filesystem read-only");
EXT3_SB(sb)->s_mount_state |= EXT3_ERROR_FS;
- sb->s_flags |= MS_RDONLY;
set_opt(EXT3_SB(sb)->s_mount_opt, ABORT);
+ /*
+ * Make sure updated value of ->s_mount_state will be visible
+ * before ->s_flags update.
+ */
+ smp_wmb();
+ sb->s_flags |= MS_RDONLY;
+
if (EXT3_SB(sb)->s_journal)
journal_abort(EXT3_SB(sb)->s_journal, -EIO);
}
-void ext3_warning (struct super_block * sb, const char * function,
- const char * fmt, ...)
+void ext3_warning(struct super_block *sb, const char *function,
+ const char *fmt, ...)
{
+ struct va_format vaf;
va_list args;
va_start(args, fmt);
- printk(KERN_WARNING "EXT3-fs (%s): warning: %s: ",
- sb->s_id, function);
- vprintk(fmt, args);
- printk("\n");
+
+ vaf.fmt = fmt;
+ vaf.va = &args;
+
+ printk(KERN_WARNING "EXT3-fs (%s): warning: %s: %pV\n",
+ sb->s_id, function, &vaf);
+
va_end(args);
}
@@ -347,13 +359,13 @@ static struct block_device *ext3_blkdev_get(dev_t dev, struct super_block *sb)
struct block_device *bdev;
char b[BDEVNAME_SIZE];
- bdev = open_by_devnum(dev, FMODE_READ|FMODE_WRITE);
+ bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL, sb);
if (IS_ERR(bdev))
goto fail;
return bdev;
fail:
- ext3_msg(sb, "error: failed to open journal device %s: %ld",
+ ext3_msg(sb, KERN_ERR, "error: failed to open journal device %s: %ld",
__bdevname(dev, b), PTR_ERR(bdev));
return NULL;
@@ -362,23 +374,19 @@ fail:
/*
* Release the journal device
*/
-static int ext3_blkdev_put(struct block_device *bdev)
+static void ext3_blkdev_put(struct block_device *bdev)
{
- bd_release(bdev);
- return blkdev_put(bdev, FMODE_READ|FMODE_WRITE);
+ blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
}
-static int ext3_blkdev_remove(struct ext3_sb_info *sbi)
+static void ext3_blkdev_remove(struct ext3_sb_info *sbi)
{
struct block_device *bdev;
- int ret = -ENODEV;
-
bdev = sbi->journal_bdev;
if (bdev) {
- ret = ext3_blkdev_put(bdev);
+ ext3_blkdev_put(bdev);
sbi->journal_bdev = NULL;
}
- return ret;
}
static inline struct inode *orphan_list_entry(struct list_head *l)
@@ -480,6 +488,20 @@ static struct inode *ext3_alloc_inode(struct super_block *sb)
return &ei->vfs_inode;
}
+static int ext3_drop_inode(struct inode *inode)
+{
+ int drop = generic_drop_inode(inode);
+
+ trace_ext3_drop_inode(inode, drop);
+ return drop;
+}
+
+static void ext3_i_callback(struct rcu_head *head)
+{
+ struct inode *inode = container_of(head, struct inode, i_rcu);
+ kmem_cache_free(ext3_inode_cachep, EXT3_I(inode));
+}
+
static void ext3_destroy_inode(struct inode *inode)
{
if (!list_empty(&(EXT3_I(inode)->i_orphan))) {
@@ -490,7 +512,7 @@ static void ext3_destroy_inode(struct inode *inode)
false);
dump_stack();
}
- kmem_cache_free(ext3_inode_cachep, EXT3_I(inode));
+ call_rcu(&inode->i_rcu, ext3_i_callback);
}
static void init_once(void *foo)
@@ -505,7 +527,7 @@ static void init_once(void *foo)
inode_init_once(&ei->vfs_inode);
}
-static int init_inodecache(void)
+static int __init init_inodecache(void)
{
ext3_inode_cachep = kmem_cache_create("ext3_inode_cache",
sizeof(struct ext3_inode_info),
@@ -519,6 +541,11 @@ static int init_inodecache(void)
static void destroy_inodecache(void)
{
+ /*
+ * Make sure all delayed rcu free inodes are flushed before we
+ * destroy cache.
+ */
+ rcu_barrier();
kmem_cache_destroy(ext3_inode_cachep);
}
@@ -576,9 +603,9 @@ static char *data_mode_string(unsigned long mode)
* - it's set to a non-default value OR
* - if the per-sb default is different from the global default
*/
-static int ext3_show_options(struct seq_file *seq, struct vfsmount *vfs)
+static int ext3_show_options(struct seq_file *seq, struct dentry *root)
{
- struct super_block *sb = vfs->mnt_sb;
+ struct super_block *sb = root->d_sb;
struct ext3_sb_info *sbi = EXT3_SB(sb);
struct ext3_super_block *es = sbi->s_es;
unsigned long def_mount_opts;
@@ -593,13 +620,15 @@ static int ext3_show_options(struct seq_file *seq, struct vfsmount *vfs)
seq_puts(seq, ",grpid");
if (!test_opt(sb, GRPID) && (def_mount_opts & EXT3_DEFM_BSDGROUPS))
seq_puts(seq, ",nogrpid");
- if (sbi->s_resuid != EXT3_DEF_RESUID ||
+ if (!uid_eq(sbi->s_resuid, make_kuid(&init_user_ns, EXT3_DEF_RESUID)) ||
le16_to_cpu(es->s_def_resuid) != EXT3_DEF_RESUID) {
- seq_printf(seq, ",resuid=%u", sbi->s_resuid);
+ seq_printf(seq, ",resuid=%u",
+ from_kuid_munged(&init_user_ns, sbi->s_resuid));
}
- if (sbi->s_resgid != EXT3_DEF_RESGID ||
+ if (!gid_eq(sbi->s_resgid, make_kgid(&init_user_ns, EXT3_DEF_RESGID)) ||
le16_to_cpu(es->s_def_resgid) != EXT3_DEF_RESGID) {
- seq_printf(seq, ",resgid=%u", sbi->s_resgid);
+ seq_printf(seq, ",resgid=%u",
+ from_kgid_munged(&init_user_ns, sbi->s_resgid));
}
if (test_opt(sb, ERRORS_RO)) {
int def_errors = le16_to_cpu(es->s_errors);
@@ -617,8 +646,6 @@ static int ext3_show_options(struct seq_file *seq, struct vfsmount *vfs)
seq_puts(seq, ",nouid32");
if (test_opt(sb, DEBUG))
seq_puts(seq, ",debug");
- if (test_opt(sb, OLDALLOC))
- seq_puts(seq, ",oldalloc");
#ifdef CONFIG_EXT3_FS_XATTR
if (test_opt(sb, XATTR_USER))
seq_puts(seq, ",user_xattr");
@@ -731,7 +758,7 @@ static int ext3_release_dquot(struct dquot *dquot);
static int ext3_mark_dquot_dirty(struct dquot *dquot);
static int ext3_write_info(struct super_block *sb, int type);
static int ext3_quota_on(struct super_block *sb, int type, int format_id,
- char *path);
+ struct path *path);
static int ext3_quota_on_mount(struct super_block *sb, int type);
static ssize_t ext3_quota_read(struct super_block *sb, int type, char *data,
size_t len, loff_t off);
@@ -764,6 +791,7 @@ static const struct super_operations ext3_sops = {
.destroy_inode = ext3_destroy_inode,
.write_inode = ext3_write_inode,
.dirty_inode = ext3_dirty_inode,
+ .drop_inode = ext3_drop_inode,
.evict_inode = ext3_evict_inode,
.put_super = ext3_put_super,
.sync_fs = ext3_sync_fs,
@@ -792,6 +820,7 @@ enum {
Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl,
Opt_reservation, Opt_noreservation, Opt_noload, Opt_nobh, Opt_bh,
Opt_commit, Opt_journal_update, Opt_journal_inum, Opt_journal_dev,
+ Opt_journal_path,
Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
Opt_data_err_abort, Opt_data_err_ignore,
Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
@@ -833,6 +862,7 @@ static const match_table_t tokens = {
{Opt_journal_update, "journal=update"},
{Opt_journal_inum, "journal=%u"},
{Opt_journal_dev, "journal_dev=%u"},
+ {Opt_journal_path, "journal_path=%s"},
{Opt_abort, "abort"},
{Opt_data_journal, "data=journal"},
{Opt_data_ordered, "data=ordered"},
@@ -868,7 +898,7 @@ static ext3_fsblk_t get_sb_block(void **data, struct super_block *sb)
/*todo: use simple_strtoll with >32bit ext3 */
sb_block = simple_strtoul(options, &options, 0);
if (*options && *options != ',') {
- ext3_msg(sb, "error: invalid sb specification: %s",
+ ext3_msg(sb, KERN_ERR, "error: invalid sb specification: %s",
(char *) *data);
return 1;
}
@@ -897,21 +927,24 @@ static int set_qf_name(struct super_block *sb, int qtype, substring_t *args)
"Not enough memory for storing quotafile name");
return 0;
}
- if (sbi->s_qf_names[qtype] &&
- strcmp(sbi->s_qf_names[qtype], qname)) {
- ext3_msg(sb, KERN_ERR,
- "%s quota file already specified", QTYPE2NAME(qtype));
+ if (sbi->s_qf_names[qtype]) {
+ int same = !strcmp(sbi->s_qf_names[qtype], qname);
+
kfree(qname);
- return 0;
+ if (!same) {
+ ext3_msg(sb, KERN_ERR,
+ "%s quota file already specified",
+ QTYPE2NAME(qtype));
+ }
+ return same;
}
- sbi->s_qf_names[qtype] = qname;
- if (strchr(sbi->s_qf_names[qtype], '/')) {
+ if (strchr(qname, '/')) {
ext3_msg(sb, KERN_ERR,
"quotafile must be on filesystem root");
- kfree(sbi->s_qf_names[qtype]);
- sbi->s_qf_names[qtype] = NULL;
+ kfree(qname);
return 0;
}
+ sbi->s_qf_names[qtype] = qname;
set_opt(sbi->s_mount_opt, QUOTA);
return 1;
}
@@ -926,11 +959,10 @@ static int clear_qf_name(struct super_block *sb, int qtype) {
" when quota turned on");
return 0;
}
- /*
- * The space will be released later when all options are confirmed
- * to be correct
- */
- sbi->s_qf_names[qtype] = NULL;
+ if (sbi->s_qf_names[qtype]) {
+ kfree(sbi->s_qf_names[qtype]);
+ sbi->s_qf_names[qtype] = NULL;
+ }
return 1;
}
#endif
@@ -944,6 +976,13 @@ static int parse_options (char *options, struct super_block *sb,
substring_t args[MAX_OPT_ARGS];
int data_opt = 0;
int option;
+ kuid_t uid;
+ kgid_t gid;
+ char *journal_path;
+ struct inode *journal_inode;
+ struct path path;
+ int error;
+
#ifdef CONFIG_QUOTA
int qfmt;
#endif
@@ -959,7 +998,7 @@ static int parse_options (char *options, struct super_block *sb,
* Initialize args struct so we know whether arg was
* found; some options take optional arguments.
*/
- args[0].to = args[0].from = 0;
+ args[0].to = args[0].from = NULL;
token = match_token(p, tokens, args);
switch (token) {
case Opt_bsd_df:
@@ -977,12 +1016,23 @@ static int parse_options (char *options, struct super_block *sb,
case Opt_resuid:
if (match_int(&args[0], &option))
return 0;
- sbi->s_resuid = option;
+ uid = make_kuid(current_user_ns(), option);
+ if (!uid_valid(uid)) {
+ ext3_msg(sb, KERN_ERR, "Invalid uid value %d", option);
+ return 0;
+
+ }
+ sbi->s_resuid = uid;
break;
case Opt_resgid:
if (match_int(&args[0], &option))
return 0;
- sbi->s_resgid = option;
+ gid = make_kgid(current_user_ns(), option);
+ if (!gid_valid(gid)) {
+ ext3_msg(sb, KERN_ERR, "Invalid gid value %d", option);
+ return 0;
+ }
+ sbi->s_resgid = gid;
break;
case Opt_sb:
/* handled by get_sb_block() instead of here */
@@ -1013,10 +1063,12 @@ static int parse_options (char *options, struct super_block *sb,
set_opt (sbi->s_mount_opt, DEBUG);
break;
case Opt_oldalloc:
- set_opt (sbi->s_mount_opt, OLDALLOC);
+ ext3_msg(sb, KERN_WARNING,
+ "Ignoring deprecated oldalloc option");
break;
case Opt_orlov:
- clear_opt (sbi->s_mount_opt, OLDALLOC);
+ ext3_msg(sb, KERN_WARNING,
+ "Ignoring deprecated orlov option");
break;
#ifdef CONFIG_EXT3_FS_XATTR
case Opt_user_xattr:
@@ -1085,6 +1137,41 @@ static int parse_options (char *options, struct super_block *sb,
return 0;
*journal_devnum = option;
break;
+ case Opt_journal_path:
+ if (is_remount) {
+ ext3_msg(sb, KERN_ERR, "error: cannot specify "
+ "journal on remount");
+ return 0;
+ }
+
+ journal_path = match_strdup(&args[0]);
+ if (!journal_path) {
+ ext3_msg(sb, KERN_ERR, "error: could not dup "
+ "journal device string");
+ return 0;
+ }
+
+ error = kern_path(journal_path, LOOKUP_FOLLOW, &path);
+ if (error) {
+ ext3_msg(sb, KERN_ERR, "error: could not find "
+ "journal device path: error %d", error);
+ kfree(journal_path);
+ return 0;
+ }
+
+ journal_inode = path.dentry->d_inode;
+ if (!S_ISBLK(journal_inode->i_mode)) {
+ ext3_msg(sb, KERN_ERR, "error: journal path %s "
+ "is not a block device", journal_path);
+ path_put(&path);
+ kfree(journal_path);
+ return 0;
+ }
+
+ *journal_devnum = new_encode_dev(journal_inode->i_rdev);
+ path_put(&path);
+ kfree(journal_path);
+ break;
case Opt_noload:
set_opt (sbi->s_mount_opt, NOLOAD);
break;
@@ -1344,6 +1431,7 @@ static int ext3_setup_super(struct super_block *sb, struct ext3_super_block *es,
} else {
ext3_msg(sb, KERN_INFO, "using internal journal");
}
+ cleancache_init_fs(sb);
return res;
}
@@ -1441,11 +1529,20 @@ static void ext3_orphan_cleanup (struct super_block * sb,
return;
}
+ /* Check if feature set allows readwrite operations */
+ if (EXT3_HAS_RO_COMPAT_FEATURE(sb, ~EXT3_FEATURE_RO_COMPAT_SUPP)) {
+ ext3_msg(sb, KERN_INFO, "Skipping orphan cleanup due to "
+ "unknown ROCOMPAT features");
+ return;
+ }
+
if (EXT3_SB(sb)->s_mount_state & EXT3_ERROR_FS) {
- if (es->s_last_orphan)
+ /* don't clear list on RO mount w/ errors */
+ if (es->s_last_orphan && !(s_flags & MS_RDONLY)) {
jbd_debug(1, "Errors on filesystem, "
"clearing orphan list.\n");
- es->s_last_orphan = 0;
+ es->s_last_orphan = 0;
+ }
jbd_debug(1, "Skipping orphan recovery on fs with errors.\n");
return;
}
@@ -1617,9 +1714,6 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
return -ENOMEM;
}
sb->s_fs_info = sbi;
- sbi->s_mount_opt = 0;
- sbi->s_resuid = EXT3_DEF_RESUID;
- sbi->s_resgid = EXT3_DEF_RESGID;
sbi->s_sb_block = sb_block;
blocksize = sb_min_blocksize(sb, EXT3_MIN_BLOCK_SIZE);
@@ -1683,9 +1777,11 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
else
set_opt(sbi->s_mount_opt, ERRORS_RO);
- sbi->s_resuid = le16_to_cpu(es->s_def_resuid);
- sbi->s_resgid = le16_to_cpu(es->s_def_resgid);
+ sbi->s_resuid = make_kuid(&init_user_ns, le16_to_cpu(es->s_def_resuid));
+ sbi->s_resgid = make_kgid(&init_user_ns, le16_to_cpu(es->s_def_resgid));
+ /* enable barriers by default */
+ set_opt(sbi->s_mount_opt, BARRIER);
set_opt(sbi->s_mount_opt, RESERVATION);
if (!parse_options ((char *) data, sb, &journal_inum, &journal_devnum,
@@ -1842,13 +1938,15 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
goto failed_mount;
}
- if (generic_check_addressable(sb->s_blocksize_bits,
- le32_to_cpu(es->s_blocks_count))) {
+ err = generic_check_addressable(sb->s_blocksize_bits,
+ le32_to_cpu(es->s_blocks_count));
+ if (err) {
ext3_msg(sb, KERN_ERR,
"error: filesystem is too large to mount safely");
if (sizeof(sector_t) < 8)
ext3_msg(sb, KERN_ERR,
"error: CONFIG_LBDAF not enabled");
+ ret = err;
goto failed_mount;
}
@@ -1911,6 +2009,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
sb->s_qcop = &ext3_qctl_operations;
sb->dq_op = &ext3_quota_operations;
#endif
+ memcpy(sb->s_uuid, es->s_uuid, sizeof(es->s_uuid));
INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */
mutex_init(&sbi->s_orphan_lock);
mutex_init(&sbi->s_resize_lock);
@@ -1998,22 +2097,23 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
ext3_msg(sb, KERN_ERR, "error: corrupt root inode, run e2fsck");
goto failed_mount3;
}
- sb->s_root = d_alloc_root(root);
+ sb->s_root = d_make_root(root);
if (!sb->s_root) {
ext3_msg(sb, KERN_ERR, "error: get root dentry failed");
- iput(root);
ret = -ENOMEM;
goto failed_mount3;
}
- ext3_setup_super (sb, es, sb->s_flags & MS_RDONLY);
+ if (ext3_setup_super(sb, es, sb->s_flags & MS_RDONLY))
+ sb->s_flags |= MS_RDONLY;
EXT3_SB(sb)->s_mount_state |= EXT3_ORPHAN_FS;
ext3_orphan_cleanup(sb, es);
EXT3_SB(sb)->s_mount_state &= ~EXT3_ORPHAN_FS;
- if (needs_recovery)
+ if (needs_recovery) {
+ ext3_mark_recovery_complete(sb, es);
ext3_msg(sb, KERN_INFO, "recovery complete");
- ext3_mark_recovery_complete(sb, es);
+ }
ext3_msg(sb, KERN_INFO, "mounted filesystem with %s data mode",
test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_JOURNAL_DATA ? "journal":
test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_ORDERED_DATA ? "ordered":
@@ -2136,13 +2236,6 @@ static journal_t *ext3_get_dev_journal(struct super_block *sb,
if (bdev == NULL)
return NULL;
- if (bd_claim(bdev, sb)) {
- ext3_msg(sb, KERN_ERR,
- "error: failed to claim external journal device");
- blkdev_put(bdev, FMODE_READ|FMODE_WRITE);
- return NULL;
- }
-
blocksize = sb->s_blocksize;
hblock = bdev_logical_block_size(bdev);
if (blocksize < hblock) {
@@ -2188,11 +2281,11 @@ static journal_t *ext3_get_dev_journal(struct super_block *sb,
goto out_bdev;
}
journal->j_private = sb;
- ll_rw_block(READ, 1, &journal->j_sb_buffer);
- wait_on_buffer(journal->j_sb_buffer);
- if (!buffer_uptodate(journal->j_sb_buffer)) {
- ext3_msg(sb, KERN_ERR, "I/O error on journal device");
- goto out_journal;
+ if (!bh_uptodate_or_lock(journal->j_sb_buffer)) {
+ if (bh_submit_read(journal->j_sb_buffer)) {
+ ext3_msg(sb, KERN_ERR, "I/O error on journal device");
+ goto out_journal;
+ }
}
if (be32_to_cpu(journal->j_superblock->s_nr_users) != 1) {
ext3_msg(sb, KERN_ERR,
@@ -2291,7 +2384,7 @@ static int ext3_load_journal(struct super_block *sb,
EXT3_SB(sb)->s_journal = journal;
ext3_clear_journal_err(sb, es);
- if (journal_devnum &&
+ if (!really_read_only && journal_devnum &&
journal_devnum != le32_to_cpu(es->s_journal_dev)) {
es->s_journal_dev = cpu_to_le32(journal_devnum);
@@ -2479,6 +2572,12 @@ static int ext3_sync_fs(struct super_block *sb, int wait)
{
tid_t target;
+ trace_ext3_sync_fs(sb, wait);
+ /*
+ * Writeback quota in non-journalled quota case - journalled quota has
+ * no dirty dquots
+ */
+ dquot_writeback_dquots(sb, -1);
if (journal_start_commit(EXT3_SB(sb)->s_journal, &target)) {
if (wait)
log_wait_commit(EXT3_SB(sb)->s_journal, target);
@@ -2529,11 +2628,9 @@ out:
static int ext3_unfreeze(struct super_block *sb)
{
if (!(sb->s_flags & MS_RDONLY)) {
- lock_super(sb);
/* Reser the needs_recovery flag before the fs is unlocked. */
EXT3_SET_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER);
ext3_commit_super(sb, EXT3_SB(sb)->s_es, 1);
- unlock_super(sb);
journal_unlock_updates(EXT3_SB(sb)->s_journal);
}
return 0;
@@ -2552,8 +2649,9 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data)
int i;
#endif
+ sync_filesystem(sb);
+
/* Store the original options */
- lock_super(sb);
old_sb_flags = sb->s_flags;
old_opts.s_mount_opt = sbi->s_mount_opt;
old_opts.s_resuid = sbi->s_resuid;
@@ -2562,7 +2660,18 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data)
#ifdef CONFIG_QUOTA
old_opts.s_jquota_fmt = sbi->s_jquota_fmt;
for (i = 0; i < MAXQUOTAS; i++)
- old_opts.s_qf_names[i] = sbi->s_qf_names[i];
+ if (sbi->s_qf_names[i]) {
+ old_opts.s_qf_names[i] = kstrdup(sbi->s_qf_names[i],
+ GFP_KERNEL);
+ if (!old_opts.s_qf_names[i]) {
+ int j;
+
+ for (j = 0; j < i; j++)
+ kfree(old_opts.s_qf_names[j]);
+ return -ENOMEM;
+ }
+ } else
+ old_opts.s_qf_names[i] = NULL;
#endif
/*
@@ -2626,13 +2735,13 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data)
/*
* If we have an unprocessed orphan list hanging
* around from a previously readonly bdev mount,
- * require a full umount/remount for now.
+ * require a full umount & mount for now.
*/
if (es->s_last_orphan) {
ext3_msg(sb, KERN_WARNING, "warning: couldn't "
"remount RDWR because of unprocessed "
"orphan inode list. Please "
- "umount/remount instead.");
+ "umount & mount instead.");
err = -EINVAL;
goto restore_opts;
}
@@ -2655,12 +2764,8 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data)
#ifdef CONFIG_QUOTA
/* Release old quota file names */
for (i = 0; i < MAXQUOTAS; i++)
- if (old_opts.s_qf_names[i] &&
- old_opts.s_qf_names[i] != sbi->s_qf_names[i])
- kfree(old_opts.s_qf_names[i]);
+ kfree(old_opts.s_qf_names[i]);
#endif
- unlock_super(sb);
-
if (enable_quota)
dquot_resume(sb, -1);
return 0;
@@ -2673,13 +2778,10 @@ restore_opts:
#ifdef CONFIG_QUOTA
sbi->s_jquota_fmt = old_opts.s_jquota_fmt;
for (i = 0; i < MAXQUOTAS; i++) {
- if (sbi->s_qf_names[i] &&
- old_opts.s_qf_names[i] != sbi->s_qf_names[i])
- kfree(sbi->s_qf_names[i]);
+ kfree(sbi->s_qf_names[i]);
sbi->s_qf_names[i] = old_opts.s_qf_names[i];
}
#endif
- unlock_super(sb);
return err;
}
@@ -2725,6 +2827,10 @@ static int ext3_statfs (struct dentry * dentry, struct kstatfs * buf)
* bitmap, and an inode table.
*/
overhead += ngroups * (2 + sbi->s_itb_per_group);
+
+ /* Add the journal blocks as well */
+ overhead += sbi->s_journal->j_maxlen;
+
sbi->s_overhead_last = overhead;
smp_wmb();
sbi->s_blocks_last = le32_to_cpu(es->s_blocks_count);
@@ -2761,7 +2867,7 @@ static int ext3_statfs (struct dentry * dentry, struct kstatfs * buf)
static inline struct inode *dquot_to_inode(struct dquot *dquot)
{
- return sb_dqopt(dquot->dq_sb)->files[dquot->dq_type];
+ return sb_dqopt(dquot->dq_sb)->files[dquot->dq_id.type];
}
static int ext3_write_dquot(struct dquot *dquot)
@@ -2859,27 +2965,20 @@ static int ext3_quota_on_mount(struct super_block *sb, int type)
* Standard function to be called on quota_on
*/
static int ext3_quota_on(struct super_block *sb, int type, int format_id,
- char *name)
+ struct path *path)
{
int err;
- struct path path;
if (!test_opt(sb, QUOTA))
return -EINVAL;
- err = kern_path(name, LOOKUP_FOLLOW, &path);
- if (err)
- return err;
-
/* Quotafile not on the same filesystem? */
- if (path.mnt->mnt_sb != sb) {
- path_put(&path);
+ if (path->dentry->d_sb != sb)
return -EXDEV;
- }
/* Journaling quota? */
if (EXT3_SB(sb)->s_qf_names[type]) {
/* Quotafile not of fs root? */
- if (path.dentry->d_parent != sb->s_root)
+ if (path->dentry->d_parent != sb->s_root)
ext3_msg(sb, KERN_WARNING,
"warning: Quota file not on filesystem root. "
"Journaled quota will not work.");
@@ -2889,7 +2988,7 @@ static int ext3_quota_on(struct super_block *sb, int type, int format_id,
* When we journal data on quota file, we have to flush journal to see
* all updates to the file when we bypass pagecache...
*/
- if (ext3_should_journal_data(path.dentry->d_inode)) {
+ if (ext3_should_journal_data(path->dentry->d_inode)) {
/*
* We don't need to lock updates but journal_flush() could
* otherwise be livelocked...
@@ -2897,20 +2996,16 @@ static int ext3_quota_on(struct super_block *sb, int type, int format_id,
journal_lock_updates(EXT3_SB(sb)->s_journal);
err = journal_flush(EXT3_SB(sb)->s_journal);
journal_unlock_updates(EXT3_SB(sb)->s_journal);
- if (err) {
- path_put(&path);
+ if (err)
return err;
- }
}
- err = dquot_quota_on_path(sb, type, format_id, &path);
- path_put(&path);
- return err;
+ return dquot_quota_on(sb, type, format_id, path);
}
/* Read data from quotafile - avoid pagecache and such because we cannot afford
* acquiring the locks... As quota files are never truncated and quota code
- * itself serializes the operations (and noone else should touch the files)
+ * itself serializes the operations (and no one else should touch the files)
* we don't have to be afraid of races */
static ssize_t ext3_quota_read(struct super_block *sb, int type, char *data,
size_t len, loff_t off)
@@ -2979,7 +3074,6 @@ static ssize_t ext3_quota_write(struct super_block *sb, int type,
(unsigned long long)off, (unsigned long long)len);
return -EIO;
}
- mutex_lock_nested(&inode->i_mutex, I_MUTEX_QUOTA);
bh = ext3_bread(handle, inode, blk, 1, &err);
if (!bh)
goto out;
@@ -3003,10 +3097,8 @@ static ssize_t ext3_quota_write(struct super_block *sb, int type,
}
brelse(bh);
out:
- if (err) {
- mutex_unlock(&inode->i_mutex);
+ if (err)
return err;
- }
if (inode->i_size < off + len) {
i_size_write(inode, off + len);
EXT3_I(inode)->i_disksize = inode->i_size;
@@ -3014,7 +3106,6 @@ out:
inode->i_version++;
inode->i_mtime = inode->i_ctime = CURRENT_TIME;
ext3_mark_inode_dirty(handle, inode);
- mutex_unlock(&inode->i_mutex);
return len;
}
@@ -3033,6 +3124,7 @@ static struct file_system_type ext3_fs_type = {
.kill_sb = kill_block_super,
.fs_flags = FS_REQUIRES_DEV,
};
+MODULE_ALIAS_FS("ext3");
static int __init init_ext3_fs(void)
{
diff --git a/fs/ext3/symlink.c b/fs/ext3/symlink.c
index 7c489820777..6b01c3eab1f 100644
--- a/fs/ext3/symlink.c
+++ b/fs/ext3/symlink.c
@@ -17,10 +17,8 @@
* ext3 symlink handling code
*/
-#include <linux/fs.h>
-#include <linux/jbd.h>
-#include <linux/ext3_fs.h>
#include <linux/namei.h>
+#include "ext3.h"
#include "xattr.h"
static void * ext3_follow_link(struct dentry *dentry, struct nameidata *nd)
diff --git a/fs/ext3/xattr.c b/fs/ext3/xattr.c
index e69dc6dfaa8..c6874be6d58 100644
--- a/fs/ext3/xattr.c
+++ b/fs/ext3/xattr.c
@@ -50,14 +50,9 @@
* by the buffer lock.
*/
-#include <linux/init.h>
-#include <linux/fs.h>
-#include <linux/slab.h>
-#include <linux/ext3_jbd.h>
-#include <linux/ext3_fs.h>
+#include "ext3.h"
#include <linux/mbcache.h>
#include <linux/quotaops.h>
-#include <linux/rwsem.h>
#include "xattr.h"
#include "acl.h"
@@ -107,8 +102,8 @@ static struct mb_cache *ext3_xattr_cache;
static const struct xattr_handler *ext3_xattr_handler_map[] = {
[EXT3_XATTR_INDEX_USER] = &ext3_xattr_user_handler,
#ifdef CONFIG_EXT3_FS_POSIX_ACL
- [EXT3_XATTR_INDEX_POSIX_ACL_ACCESS] = &ext3_xattr_acl_access_handler,
- [EXT3_XATTR_INDEX_POSIX_ACL_DEFAULT] = &ext3_xattr_acl_default_handler,
+ [EXT3_XATTR_INDEX_POSIX_ACL_ACCESS] = &posix_acl_access_xattr_handler,
+ [EXT3_XATTR_INDEX_POSIX_ACL_DEFAULT] = &posix_acl_default_xattr_handler,
#endif
[EXT3_XATTR_INDEX_TRUSTED] = &ext3_xattr_trusted_handler,
#ifdef CONFIG_EXT3_FS_SECURITY
@@ -120,8 +115,8 @@ const struct xattr_handler *ext3_xattr_handlers[] = {
&ext3_xattr_user_handler,
&ext3_xattr_trusted_handler,
#ifdef CONFIG_EXT3_FS_POSIX_ACL
- &ext3_xattr_acl_access_handler,
- &ext3_xattr_acl_default_handler,
+ &posix_acl_access_xattr_handler,
+ &posix_acl_default_xattr_handler,
#endif
#ifdef CONFIG_EXT3_FS_SECURITY
&ext3_xattr_security_handler,
@@ -803,17 +798,25 @@ inserted:
/* We need to allocate a new block */
ext3_fsblk_t goal = ext3_group_first_block_no(sb,
EXT3_I(inode)->i_block_group);
- ext3_fsblk_t block = ext3_new_block(handle, inode,
- goal, &error);
+ ext3_fsblk_t block;
+
+ /*
+ * Protect us agaist concurrent allocations to the
+ * same inode from ext3_..._writepage(). Reservation
+ * code does not expect racing allocations.
+ */
+ mutex_lock(&EXT3_I(inode)->truncate_mutex);
+ block = ext3_new_block(handle, inode, goal, &error);
+ mutex_unlock(&EXT3_I(inode)->truncate_mutex);
if (error)
goto cleanup;
ea_idebug(inode, "creating block %d", block);
new_bh = sb_getblk(sb, block);
- if (!new_bh) {
+ if (unlikely(!new_bh)) {
getblk_failed:
ext3_free_blocks(handle, inode, block, 1);
- error = -EIO;
+ error = -ENOMEM;
goto cleanup;
}
lock_buffer(new_bh);
@@ -925,7 +928,7 @@ ext3_xattr_ibody_set(handle_t *handle, struct inode *inode,
/*
* ext3_xattr_set_handle()
*
- * Create, replace or remove an extended attribute for this inode. Buffer
+ * Create, replace or remove an extended attribute for this inode. Value
* is NULL to remove an existing extended attribute, and non-NULL to
* either replace an existing extended attribute, or create a new extended
* attribute. The flags XATTR_REPLACE and XATTR_CREATE
diff --git a/fs/ext3/xattr.h b/fs/ext3/xattr.h
index 377fe720116..32e93ebf803 100644
--- a/fs/ext3/xattr.h
+++ b/fs/ext3/xattr.h
@@ -60,8 +60,6 @@ struct ext3_xattr_entry {
extern const struct xattr_handler ext3_xattr_user_handler;
extern const struct xattr_handler ext3_xattr_trusted_handler;
-extern const struct xattr_handler ext3_xattr_acl_access_handler;
-extern const struct xattr_handler ext3_xattr_acl_default_handler;
extern const struct xattr_handler ext3_xattr_security_handler;
extern ssize_t ext3_listxattr(struct dentry *, char *, size_t);
@@ -128,10 +126,10 @@ exit_ext3_xattr(void)
#ifdef CONFIG_EXT3_FS_SECURITY
extern int ext3_init_security(handle_t *handle, struct inode *inode,
- struct inode *dir);
+ struct inode *dir, const struct qstr *qstr);
#else
static inline int ext3_init_security(handle_t *handle, struct inode *inode,
- struct inode *dir)
+ struct inode *dir, const struct qstr *qstr)
{
return 0;
}
diff --git a/fs/ext3/xattr_security.c b/fs/ext3/xattr_security.c
index 03a99bfc59f..722c2bf9645 100644
--- a/fs/ext3/xattr_security.c
+++ b/fs/ext3/xattr_security.c
@@ -3,13 +3,8 @@
* Handler for storing security labels as extended attributes.
*/
-#include <linux/module.h>
-#include <linux/slab.h>
-#include <linux/string.h>
-#include <linux/fs.h>
-#include <linux/ext3_jbd.h>
-#include <linux/ext3_fs.h>
#include <linux/security.h>
+#include "ext3.h"
#include "xattr.h"
static size_t
@@ -48,27 +43,33 @@ ext3_xattr_security_set(struct dentry *dentry, const char *name,
name, value, size, flags);
}
-int
-ext3_init_security(handle_t *handle, struct inode *inode, struct inode *dir)
+static int ext3_initxattrs(struct inode *inode,
+ const struct xattr *xattr_array,
+ void *fs_info)
{
- int err;
- size_t len;
- void *value;
- char *name;
+ const struct xattr *xattr;
+ handle_t *handle = fs_info;
+ int err = 0;
- err = security_inode_init_security(inode, dir, &name, &value, &len);
- if (err) {
- if (err == -EOPNOTSUPP)
- return 0;
- return err;
+ for (xattr = xattr_array; xattr->name != NULL; xattr++) {
+ err = ext3_xattr_set_handle(handle, inode,
+ EXT3_XATTR_INDEX_SECURITY,
+ xattr->name, xattr->value,
+ xattr->value_len, 0);
+ if (err < 0)
+ break;
}
- err = ext3_xattr_set_handle(handle, inode, EXT3_XATTR_INDEX_SECURITY,
- name, value, len, 0);
- kfree(name);
- kfree(value);
return err;
}
+int
+ext3_init_security(handle_t *handle, struct inode *inode, struct inode *dir,
+ const struct qstr *qstr)
+{
+ return security_inode_init_security(inode, dir, qstr,
+ &ext3_initxattrs, handle);
+}
+
const struct xattr_handler ext3_xattr_security_handler = {
.prefix = XATTR_SECURITY_PREFIX,
.list = ext3_xattr_security_list,
diff --git a/fs/ext3/xattr_trusted.c b/fs/ext3/xattr_trusted.c
index dc8edda9ffe..d75727cc67f 100644
--- a/fs/ext3/xattr_trusted.c
+++ b/fs/ext3/xattr_trusted.c
@@ -5,12 +5,7 @@
* Copyright (C) 2003 by Andreas Gruenbacher, <a.gruenbacher@computer.org>
*/
-#include <linux/module.h>
-#include <linux/string.h>
-#include <linux/capability.h>
-#include <linux/fs.h>
-#include <linux/ext3_jbd.h>
-#include <linux/ext3_fs.h>
+#include "ext3.h"
#include "xattr.h"
static size_t
diff --git a/fs/ext3/xattr_user.c b/fs/ext3/xattr_user.c
index 7a321974d58..5612af3567e 100644
--- a/fs/ext3/xattr_user.c
+++ b/fs/ext3/xattr_user.c
@@ -5,11 +5,7 @@
* Copyright (C) 2001 by Andreas Gruenbacher, <a.gruenbacher@computer.org>
*/
-#include <linux/module.h>
-#include <linux/string.h>
-#include <linux/fs.h>
-#include <linux/ext3_jbd.h>
-#include <linux/ext3_fs.h>
+#include "ext3.h"
#include "xattr.h"
static size_t