diff options
Diffstat (limited to 'fs/ext3')
| -rw-r--r-- | fs/ext3/acl.c | 293 | ||||
| -rw-r--r-- | fs/ext3/acl.h | 13 | ||||
| -rw-r--r-- | fs/ext3/balloc.c | 351 | ||||
| -rw-r--r-- | fs/ext3/bitmap.c | 16 | ||||
| -rw-r--r-- | fs/ext3/dir.c | 375 | ||||
| -rw-r--r-- | fs/ext3/ext3.h | 1326 | ||||
| -rw-r--r-- | fs/ext3/ext3_jbd.c | 2 | ||||
| -rw-r--r-- | fs/ext3/file.c | 20 | ||||
| -rw-r--r-- | fs/ext3/fsync.c | 38 | ||||
| -rw-r--r-- | fs/ext3/hash.c | 8 | ||||
| -rw-r--r-- | fs/ext3/ialloc.c | 97 | ||||
| -rw-r--r-- | fs/ext3/inode.c | 510 | ||||
| -rw-r--r-- | fs/ext3/ioctl.c | 87 | ||||
| -rw-r--r-- | fs/ext3/namei.c | 400 | ||||
| -rw-r--r-- | fs/ext3/namei.h | 19 | ||||
| -rw-r--r-- | fs/ext3/resize.c | 84 | ||||
| -rw-r--r-- | fs/ext3/super.c | 392 | ||||
| -rw-r--r-- | fs/ext3/symlink.c | 4 | ||||
| -rw-r--r-- | fs/ext3/xattr.c | 33 | ||||
| -rw-r--r-- | fs/ext3/xattr.h | 6 | ||||
| -rw-r--r-- | fs/ext3/xattr_security.c | 43 | ||||
| -rw-r--r-- | fs/ext3/xattr_trusted.c | 7 | ||||
| -rw-r--r-- | fs/ext3/xattr_user.c | 6 |
23 files changed, 2900 insertions, 1230 deletions
diff --git a/fs/ext3/acl.c b/fs/ext3/acl.c index 8a11fe21218..8bbaf5bcf98 100644 --- a/fs/ext3/acl.c +++ b/fs/ext3/acl.c @@ -4,13 +4,7 @@ * Copyright (C) 2001-2003 Andreas Gruenbacher, <agruen@suse.de> */ -#include <linux/init.h> -#include <linux/sched.h> -#include <linux/slab.h> -#include <linux/capability.h> -#include <linux/fs.h> -#include <linux/ext3_jbd.h> -#include <linux/ext3_fs.h> +#include "ext3.h" #include "xattr.h" #include "acl.h" @@ -54,16 +48,23 @@ ext3_acl_from_disk(const void *value, size_t size) case ACL_OTHER: value = (char *)value + sizeof(ext3_acl_entry_short); - acl->a_entries[n].e_id = ACL_UNDEFINED_ID; break; case ACL_USER: + value = (char *)value + sizeof(ext3_acl_entry); + if ((char *)value > end) + goto fail; + acl->a_entries[n].e_uid = + make_kuid(&init_user_ns, + le32_to_cpu(entry->e_id)); + break; case ACL_GROUP: value = (char *)value + sizeof(ext3_acl_entry); if ((char *)value > end) goto fail; - acl->a_entries[n].e_id = - le32_to_cpu(entry->e_id); + acl->a_entries[n].e_gid = + make_kgid(&init_user_ns, + le32_to_cpu(entry->e_id)); break; default: @@ -97,14 +98,19 @@ ext3_acl_to_disk(const struct posix_acl *acl, size_t *size) ext_acl->a_version = cpu_to_le32(EXT3_ACL_VERSION); e = (char *)ext_acl + sizeof(ext3_acl_header); for (n=0; n < acl->a_count; n++) { + const struct posix_acl_entry *acl_e = &acl->a_entries[n]; ext3_acl_entry *entry = (ext3_acl_entry *)e; - entry->e_tag = cpu_to_le16(acl->a_entries[n].e_tag); - entry->e_perm = cpu_to_le16(acl->a_entries[n].e_perm); - switch(acl->a_entries[n].e_tag) { + entry->e_tag = cpu_to_le16(acl_e->e_tag); + entry->e_perm = cpu_to_le16(acl_e->e_perm); + switch(acl_e->e_tag) { case ACL_USER: + entry->e_id = cpu_to_le32( + from_kuid(&init_user_ns, acl_e->e_uid)); + e += sizeof(ext3_acl_entry); + break; case ACL_GROUP: - entry->e_id = - cpu_to_le32(acl->a_entries[n].e_id); + entry->e_id = cpu_to_le32( + from_kgid(&init_user_ns, acl_e->e_gid)); e += sizeof(ext3_acl_entry); break; @@ -131,7 +137,7 @@ fail: * * inode->i_mutex: don't care */ -static struct posix_acl * +struct posix_acl * ext3_get_acl(struct inode *inode, int type) { int name_index; @@ -139,13 +145,6 @@ ext3_get_acl(struct inode *inode, int type) struct posix_acl *acl; int retval; - if (!test_opt(inode->i_sb, POSIX_ACL)) - return NULL; - - acl = get_cached_acl(inode, type); - if (acl != ACL_NOT_CACHED) - return acl; - switch (type) { case ACL_TYPE_ACCESS: name_index = EXT3_XATTR_INDEX_POSIX_ACL_ACCESS; @@ -184,7 +183,7 @@ ext3_get_acl(struct inode *inode, int type) * inode->i_mutex: down unless called from ext3_new_inode */ static int -ext3_set_acl(handle_t *handle, struct inode *inode, int type, +__ext3_set_acl(handle_t *handle, struct inode *inode, int type, struct posix_acl *acl) { int name_index; @@ -192,19 +191,14 @@ ext3_set_acl(handle_t *handle, struct inode *inode, int type, size_t size = 0; int error; - if (S_ISLNK(inode->i_mode)) - return -EOPNOTSUPP; - switch(type) { case ACL_TYPE_ACCESS: name_index = EXT3_XATTR_INDEX_POSIX_ACL_ACCESS; if (acl) { - mode_t mode = inode->i_mode; - error = posix_acl_equiv_mode(acl, &mode); + error = posix_acl_equiv_mode(acl, &inode->i_mode); if (error < 0) return error; else { - inode->i_mode = mode; inode->i_ctime = CURRENT_TIME_SEC; ext3_mark_inode_dirty(handle, inode); if (error == 0) @@ -240,19 +234,20 @@ ext3_set_acl(handle_t *handle, struct inode *inode, int type, } int -ext3_check_acl(struct inode *inode, int mask) +ext3_set_acl(struct inode *inode, struct posix_acl *acl, int type) { - struct posix_acl *acl = ext3_get_acl(inode, ACL_TYPE_ACCESS); - - if (IS_ERR(acl)) - return PTR_ERR(acl); - if (acl) { - int error = posix_acl_permission(inode, acl, mask); - posix_acl_release(acl); - return error; - } + handle_t *handle; + int error, retries = 0; - return -EAGAIN; +retry: + handle = ext3_journal_start(inode, EXT3_DATA_TRANS_BLOCKS(inode->i_sb)); + if (IS_ERR(handle)) + return PTR_ERR(handle); + error = __ext3_set_acl(handle, inode, type, acl); + ext3_journal_stop(handle); + if (error == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries)) + goto retry; + return error; } /* @@ -264,211 +259,23 @@ ext3_check_acl(struct inode *inode, int mask) int ext3_init_acl(handle_t *handle, struct inode *inode, struct inode *dir) { - struct posix_acl *acl = NULL; - int error = 0; - - if (!S_ISLNK(inode->i_mode)) { - if (test_opt(dir->i_sb, POSIX_ACL)) { - acl = ext3_get_acl(dir, ACL_TYPE_DEFAULT); - if (IS_ERR(acl)) - return PTR_ERR(acl); - } - if (!acl) - inode->i_mode &= ~current_umask(); - } - if (test_opt(inode->i_sb, POSIX_ACL) && acl) { - struct posix_acl *clone; - mode_t mode; + struct posix_acl *default_acl, *acl; + int error; - if (S_ISDIR(inode->i_mode)) { - error = ext3_set_acl(handle, inode, - ACL_TYPE_DEFAULT, acl); - if (error) - goto cleanup; - } - clone = posix_acl_clone(acl, GFP_NOFS); - error = -ENOMEM; - if (!clone) - goto cleanup; + error = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl); + if (error) + return error; - mode = inode->i_mode; - error = posix_acl_create_masq(clone, &mode); - if (error >= 0) { - inode->i_mode = mode; - if (error > 0) { - /* This is an extended ACL */ - error = ext3_set_acl(handle, inode, - ACL_TYPE_ACCESS, clone); - } - } - posix_acl_release(clone); + if (default_acl) { + error = __ext3_set_acl(handle, inode, ACL_TYPE_DEFAULT, + default_acl); + posix_acl_release(default_acl); } -cleanup: - posix_acl_release(acl); - return error; -} - -/* - * Does chmod for an inode that may have an Access Control List. The - * inode->i_mode field must be updated to the desired value by the caller - * before calling this function. - * Returns 0 on success, or a negative error number. - * - * We change the ACL rather than storing some ACL entries in the file - * mode permission bits (which would be more efficient), because that - * would break once additional permissions (like ACL_APPEND, ACL_DELETE - * for directories) are added. There are no more bits available in the - * file mode. - * - * inode->i_mutex: down - */ -int -ext3_acl_chmod(struct inode *inode) -{ - struct posix_acl *acl, *clone; - int error; - - if (S_ISLNK(inode->i_mode)) - return -EOPNOTSUPP; - if (!test_opt(inode->i_sb, POSIX_ACL)) - return 0; - acl = ext3_get_acl(inode, ACL_TYPE_ACCESS); - if (IS_ERR(acl) || !acl) - return PTR_ERR(acl); - clone = posix_acl_clone(acl, GFP_KERNEL); - posix_acl_release(acl); - if (!clone) - return -ENOMEM; - error = posix_acl_chmod_masq(clone, inode->i_mode); - if (!error) { - handle_t *handle; - int retries = 0; - - retry: - handle = ext3_journal_start(inode, - EXT3_DATA_TRANS_BLOCKS(inode->i_sb)); - if (IS_ERR(handle)) { - error = PTR_ERR(handle); - ext3_std_error(inode->i_sb, error); - goto out; - } - error = ext3_set_acl(handle, inode, ACL_TYPE_ACCESS, clone); - ext3_journal_stop(handle); - if (error == -ENOSPC && - ext3_should_retry_alloc(inode->i_sb, &retries)) - goto retry; + if (acl) { + if (!error) + error = __ext3_set_acl(handle, inode, ACL_TYPE_ACCESS, + acl); + posix_acl_release(acl); } -out: - posix_acl_release(clone); - return error; -} - -/* - * Extended attribute handlers - */ -static size_t -ext3_xattr_list_acl_access(struct dentry *dentry, char *list, size_t list_len, - const char *name, size_t name_len, int type) -{ - const size_t size = sizeof(POSIX_ACL_XATTR_ACCESS); - - if (!test_opt(dentry->d_sb, POSIX_ACL)) - return 0; - if (list && size <= list_len) - memcpy(list, POSIX_ACL_XATTR_ACCESS, size); - return size; -} - -static size_t -ext3_xattr_list_acl_default(struct dentry *dentry, char *list, size_t list_len, - const char *name, size_t name_len, int type) -{ - const size_t size = sizeof(POSIX_ACL_XATTR_DEFAULT); - - if (!test_opt(dentry->d_sb, POSIX_ACL)) - return 0; - if (list && size <= list_len) - memcpy(list, POSIX_ACL_XATTR_DEFAULT, size); - return size; -} - -static int -ext3_xattr_get_acl(struct dentry *dentry, const char *name, void *buffer, - size_t size, int type) -{ - struct posix_acl *acl; - int error; - - if (strcmp(name, "") != 0) - return -EINVAL; - if (!test_opt(dentry->d_sb, POSIX_ACL)) - return -EOPNOTSUPP; - - acl = ext3_get_acl(dentry->d_inode, type); - if (IS_ERR(acl)) - return PTR_ERR(acl); - if (acl == NULL) - return -ENODATA; - error = posix_acl_to_xattr(acl, buffer, size); - posix_acl_release(acl); - return error; } - -static int -ext3_xattr_set_acl(struct dentry *dentry, const char *name, const void *value, - size_t size, int flags, int type) -{ - struct inode *inode = dentry->d_inode; - handle_t *handle; - struct posix_acl *acl; - int error, retries = 0; - - if (strcmp(name, "") != 0) - return -EINVAL; - if (!test_opt(inode->i_sb, POSIX_ACL)) - return -EOPNOTSUPP; - if (!is_owner_or_cap(inode)) - return -EPERM; - - if (value) { - acl = posix_acl_from_xattr(value, size); - if (IS_ERR(acl)) - return PTR_ERR(acl); - else if (acl) { - error = posix_acl_valid(acl); - if (error) - goto release_and_out; - } - } else - acl = NULL; - -retry: - handle = ext3_journal_start(inode, EXT3_DATA_TRANS_BLOCKS(inode->i_sb)); - if (IS_ERR(handle)) - return PTR_ERR(handle); - error = ext3_set_acl(handle, inode, type, acl); - ext3_journal_stop(handle); - if (error == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries)) - goto retry; - -release_and_out: - posix_acl_release(acl); - return error; -} - -const struct xattr_handler ext3_xattr_acl_access_handler = { - .prefix = POSIX_ACL_XATTR_ACCESS, - .flags = ACL_TYPE_ACCESS, - .list = ext3_xattr_list_acl_access, - .get = ext3_xattr_get_acl, - .set = ext3_xattr_set_acl, -}; - -const struct xattr_handler ext3_xattr_acl_default_handler = { - .prefix = POSIX_ACL_XATTR_DEFAULT, - .flags = ACL_TYPE_DEFAULT, - .list = ext3_xattr_list_acl_default, - .get = ext3_xattr_get_acl, - .set = ext3_xattr_set_acl, -}; diff --git a/fs/ext3/acl.h b/fs/ext3/acl.h index 597334626de..ea1c69edab9 100644 --- a/fs/ext3/acl.h +++ b/fs/ext3/acl.h @@ -54,19 +54,14 @@ static inline int ext3_acl_count(size_t size) #ifdef CONFIG_EXT3_FS_POSIX_ACL /* acl.c */ -extern int ext3_check_acl (struct inode *, int); -extern int ext3_acl_chmod (struct inode *); +extern struct posix_acl *ext3_get_acl(struct inode *inode, int type); +extern int ext3_set_acl(struct inode *inode, struct posix_acl *acl, int type); extern int ext3_init_acl (handle_t *, struct inode *, struct inode *); #else /* CONFIG_EXT3_FS_POSIX_ACL */ #include <linux/sched.h> -#define ext3_check_acl NULL - -static inline int -ext3_acl_chmod(struct inode *inode) -{ - return 0; -} +#define ext3_get_acl NULL +#define ext3_set_acl NULL static inline int ext3_init_acl(handle_t *handle, struct inode *inode, struct inode *dir) diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c index b3db2264942..158b5d4ce06 100644 --- a/fs/ext3/balloc.c +++ b/fs/ext3/balloc.c @@ -11,15 +11,9 @@ * David S. Miller (davem@caip.rutgers.edu), 1995 */ -#include <linux/time.h> -#include <linux/capability.h> -#include <linux/fs.h> -#include <linux/slab.h> -#include <linux/jbd.h> -#include <linux/ext3_fs.h> -#include <linux/ext3_jbd.h> #include <linux/quotaops.h> -#include <linux/buffer_head.h> +#include <linux/blkdev.h> +#include "ext3.h" /* * balloc.c contains the blocks allocation and deallocation routines @@ -39,6 +33,21 @@ #define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1) +/* + * Calculate the block group number and offset, given a block number + */ +static void ext3_get_group_no_and_offset(struct super_block *sb, + ext3_fsblk_t blocknr, unsigned long *blockgrpp, ext3_grpblk_t *offsetp) +{ + struct ext3_super_block *es = EXT3_SB(sb)->s_es; + + blocknr = blocknr - le32_to_cpu(es->s_first_data_block); + if (offsetp) + *offsetp = blocknr % EXT3_BLOCKS_PER_GROUP(sb); + if (blockgrpp) + *blockgrpp = blocknr / EXT3_BLOCKS_PER_GROUP(sb); +} + /** * ext3_get_group_desc() -- load group descriptor from disk * @sb: super block @@ -145,6 +154,7 @@ read_block_bitmap(struct super_block *sb, unsigned int block_group) desc = ext3_get_group_desc(sb, block_group, NULL); if (!desc) return NULL; + trace_ext3_read_block_bitmap(sb, block_group); bitmap_blk = le32_to_cpu(desc->bg_block_bitmap); bh = sb_getblk(sb, bitmap_blk); if (unlikely(!bh)) { @@ -335,6 +345,7 @@ void ext3_rsv_window_add(struct super_block *sb, struct rb_node * parent = NULL; struct ext3_reserve_window_node *this; + trace_ext3_rsv_window_add(sb, rsv); while (*p) { parent = *p; @@ -408,7 +419,7 @@ static inline int rsv_is_empty(struct ext3_reserve_window *rsv) void ext3_init_block_alloc_info(struct inode *inode) { struct ext3_inode_info *ei = EXT3_I(inode); - struct ext3_block_alloc_info *block_i = ei->i_block_alloc_info; + struct ext3_block_alloc_info *block_i; struct super_block *sb = inode->i_sb; block_i = kmalloc(sizeof(*block_i), GFP_NOFS); @@ -460,8 +471,10 @@ void ext3_discard_reservation(struct inode *inode) rsv = &block_i->rsv_window_node; if (!rsv_is_empty(&rsv->rsv_window)) { spin_lock(rsv_lock); - if (!rsv_is_empty(&rsv->rsv_window)) + if (!rsv_is_empty(&rsv->rsv_window)) { + trace_ext3_discard_reservation(inode, rsv); rsv_window_remove(inode->i_sb, rsv); + } spin_unlock(rsv_lock); } } @@ -470,7 +483,7 @@ void ext3_discard_reservation(struct inode *inode) * ext3_free_blocks_sb() -- Free given blocks and update quota * @handle: handle to this transaction * @sb: super block - * @block: start physcial block to free + * @block: start physical block to free * @count: number of blocks to free * @pdquot_freed_blocks: pointer to quota */ @@ -574,7 +587,7 @@ do_more: BUFFER_TRACE(debug_bh, "Deleted!"); if (!bh2jh(bitmap_bh)->b_committed_data) BUFFER_TRACE(debug_bh, - "No commited data in bitmap"); + "No committed data in bitmap"); BUFFER_TRACE2(debug_bh, bitmap_bh, "bitmap"); __brelse(debug_bh); } @@ -667,14 +680,10 @@ error_return: void ext3_free_blocks(handle_t *handle, struct inode *inode, ext3_fsblk_t block, unsigned long count) { - struct super_block * sb; + struct super_block *sb = inode->i_sb; unsigned long dquot_freed_blocks; - sb = inode->i_sb; - if (!sb) { - printk ("ext3_free_blocks: nonexistent device"); - return; - } + trace_ext3_free_blocks(inode, block, count); ext3_free_blocks_sb(handle, sb, block, count, &dquot_freed_blocks); if (dquot_freed_blocks) dquot_free_block(inode, dquot_freed_blocks); @@ -1047,7 +1056,7 @@ static int find_next_reservable_window( rsv_window_remove(sb, my_rsv); /* - * Let's book the whole avaliable window for now. We will check the + * Let's book the whole available window for now. We will check the * disk bitmap later and then, if there are free blocks then we adjust * the window size if it's larger than requested. * Otherwise, we will remove this node from the tree next time @@ -1120,6 +1129,7 @@ static int alloc_new_reservation(struct ext3_reserve_window_node *my_rsv, else start_block = grp_goal + group_first_block; + trace_ext3_alloc_new_reservation(sb, start_block); size = my_rsv->rsv_goal_size; if (!rsv_is_empty(&my_rsv->rsv_window)) { @@ -1214,8 +1224,11 @@ retry: * check if the first free block is within the * free space we just reserved */ - if (start_block >= my_rsv->rsv_start && start_block <= my_rsv->rsv_end) + if (start_block >= my_rsv->rsv_start && + start_block <= my_rsv->rsv_end) { + trace_ext3_reserved(sb, start_block, my_rsv); return 0; /* success */ + } /* * if the first free bit we found is out of the reservable space * continue search for next reservable space, @@ -1419,15 +1432,16 @@ out: * * Check if filesystem has at least 1 free block available for allocation. */ -static int ext3_has_free_blocks(struct ext3_sb_info *sbi) +static int ext3_has_free_blocks(struct ext3_sb_info *sbi, int use_reservation) { ext3_fsblk_t free_blocks, root_blocks; free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter); root_blocks = le32_to_cpu(sbi->s_es->s_r_blocks_count); if (free_blocks < root_blocks + 1 && !capable(CAP_SYS_RESOURCE) && - sbi->s_resuid != current_fsuid() && - (sbi->s_resgid == 0 || !in_group_p (sbi->s_resgid))) { + !use_reservation && !uid_eq(sbi->s_resuid, current_fsuid()) && + (gid_eq(sbi->s_resgid, GLOBAL_ROOT_GID) || + !in_group_p (sbi->s_resgid))) { return 0; } return 1; @@ -1440,14 +1454,14 @@ static int ext3_has_free_blocks(struct ext3_sb_info *sbi) * * ext3_should_retry_alloc() is called when ENOSPC is returned, and if * it is profitable to retry the operation, this function will wait - * for the current or commiting transaction to complete, and then + * for the current or committing transaction to complete, and then * return TRUE. * * if the total number of retries exceed three times, return FALSE. */ int ext3_should_retry_alloc(struct super_block *sb, int *retries) { - if (!ext3_has_free_blocks(EXT3_SB(sb)) || (*retries)++ > 3) + if (!ext3_has_free_blocks(EXT3_SB(sb), 0) || (*retries)++ > 3) return 0; jbd_debug(1, "%s: retrying operation after ENOSPC\n", sb->s_id); @@ -1498,10 +1512,6 @@ ext3_fsblk_t ext3_new_blocks(handle_t *handle, struct inode *inode, *errp = -ENOSPC; sb = inode->i_sb; - if (!sb) { - printk("ext3_new_block: nonexistent device"); - return 0; - } /* * Check quota for allocation of this block. @@ -1512,8 +1522,10 @@ ext3_fsblk_t ext3_new_blocks(handle_t *handle, struct inode *inode, return 0; } + trace_ext3_request_blocks(inode, goal, num); + sbi = EXT3_SB(sb); - es = EXT3_SB(sb)->s_es; + es = sbi->s_es; ext3_debug("goal=%lu.\n", goal); /* * Allocate a block from reservation only when @@ -1527,7 +1539,7 @@ ext3_fsblk_t ext3_new_blocks(handle_t *handle, struct inode *inode, if (block_i && ((windowsz = block_i->rsv_window_node.rsv_goal_size) > 0)) my_rsv = &block_i->rsv_window_node; - if (!ext3_has_free_blocks(sbi)) { + if (!ext3_has_free_blocks(sbi, IS_NOQUOTA(inode))) { *errp = -ENOSPC; goto out; } @@ -1616,9 +1628,9 @@ retry_alloc: goto allocated; } /* - * We may end up a bogus ealier ENOSPC error due to + * We may end up a bogus earlier ENOSPC error due to * filesystem is "full" of reservations, but - * there maybe indeed free blocks avaliable on disk + * there maybe indeed free blocks available on disk * In this case, we just forget about the reservations * just do block allocation as without reservations. */ @@ -1715,17 +1727,21 @@ allocated: percpu_counter_sub(&sbi->s_freeblocks_counter, num); BUFFER_TRACE(gdp_bh, "journal_dirty_metadata for group descriptor"); - err = ext3_journal_dirty_metadata(handle, gdp_bh); - if (!fatal) - fatal = err; - + fatal = ext3_journal_dirty_metadata(handle, gdp_bh); if (fatal) goto out; *errp = 0; brelse(bitmap_bh); - dquot_free_block(inode, *count-num); - *count = num; + + if (num < *count) { + dquot_free_block(inode, *count-num); + *count = num; + } + + trace_ext3_allocate_blocks(inode, goal, num, + (unsigned long long)ret_block); + return ret_block; io_error: @@ -1794,7 +1810,7 @@ ext3_fsblk_t ext3_count_free_blocks(struct super_block *sb) brelse(bitmap_bh); printk("ext3_count_free_blocks: stored = "E3FSBLK ", computed = "E3FSBLK", "E3FSBLK"\n", - le32_to_cpu(es->s_free_blocks_count), + (ext3_fsblk_t)le32_to_cpu(es->s_free_blocks_count), desc_count, bitmap_count); return bitmap_count; #else @@ -1885,3 +1901,258 @@ unsigned long ext3_bg_num_gdb(struct super_block *sb, int group) return ext3_bg_num_gdb_meta(sb,group); } + +/** + * ext3_trim_all_free -- function to trim all free space in alloc. group + * @sb: super block for file system + * @group: allocation group to trim + * @start: first group block to examine + * @max: last group block to examine + * @gdp: allocation group description structure + * @minblocks: minimum extent block count + * + * ext3_trim_all_free walks through group's block bitmap searching for free + * blocks. When the free block is found, it tries to allocate this block and + * consequent free block to get the biggest free extent possible, until it + * reaches any used block. Then issue a TRIM command on this extent and free + * the extent in the block bitmap. This is done until whole group is scanned. + */ +static ext3_grpblk_t ext3_trim_all_free(struct super_block *sb, + unsigned int group, + ext3_grpblk_t start, ext3_grpblk_t max, + ext3_grpblk_t minblocks) +{ + handle_t *handle; + ext3_grpblk_t next, free_blocks, bit, freed, count = 0; + ext3_fsblk_t discard_block; + struct ext3_sb_info *sbi; + struct buffer_head *gdp_bh, *bitmap_bh = NULL; + struct ext3_group_desc *gdp; + int err = 0, ret = 0; + + /* + * We will update one block bitmap, and one group descriptor + */ + handle = ext3_journal_start_sb(sb, 2); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + bitmap_bh = read_block_bitmap(sb, group); + if (!bitmap_bh) { + err = -EIO; + goto err_out; + } + + BUFFER_TRACE(bitmap_bh, "getting undo access"); + err = ext3_journal_get_undo_access(handle, bitmap_bh); + if (err) + goto err_out; + + gdp = ext3_get_group_desc(sb, group, &gdp_bh); + if (!gdp) { + err = -EIO; + goto err_out; + } + + BUFFER_TRACE(gdp_bh, "get_write_access"); + err = ext3_journal_get_write_access(handle, gdp_bh); + if (err) + goto err_out; + + free_blocks = le16_to_cpu(gdp->bg_free_blocks_count); + sbi = EXT3_SB(sb); + + /* Walk through the whole group */ + while (start <= max) { + start = bitmap_search_next_usable_block(start, bitmap_bh, max); + if (start < 0) + break; + next = start; + + /* + * Allocate contiguous free extents by setting bits in the + * block bitmap + */ + while (next <= max + && claim_block(sb_bgl_lock(sbi, group), + next, bitmap_bh)) { + next++; + } + + /* We did not claim any blocks */ + if (next == start) + continue; + + discard_block = (ext3_fsblk_t)start + + ext3_group_first_block_no(sb, group); + + /* Update counters */ + spin_lock(sb_bgl_lock(sbi, group)); + le16_add_cpu(&gdp->bg_free_blocks_count, start - next); + spin_unlock(sb_bgl_lock(sbi, group)); + percpu_counter_sub(&sbi->s_freeblocks_counter, next - start); + + free_blocks -= next - start; + /* Do not issue a TRIM on extents smaller than minblocks */ + if ((next - start) < minblocks) + goto free_extent; + + trace_ext3_discard_blocks(sb, discard_block, next - start); + /* Send the TRIM command down to the device */ + err = sb_issue_discard(sb, discard_block, next - start, + GFP_NOFS, 0); + count += (next - start); +free_extent: + freed = 0; + + /* + * Clear bits in the bitmap + */ + for (bit = start; bit < next; bit++) { + BUFFER_TRACE(bitmap_bh, "clear bit"); + if (!ext3_clear_bit_atomic(sb_bgl_lock(sbi, group), + bit, bitmap_bh->b_data)) { + ext3_error(sb, __func__, + "bit already cleared for block "E3FSBLK, + (unsigned long)bit); + BUFFER_TRACE(bitmap_bh, "bit already cleared"); + } else { + freed++; + } + } + + /* Update couters */ + spin_lock(sb_bgl_lock(sbi, group)); + le16_add_cpu(&gdp->bg_free_blocks_count, freed); + spin_unlock(sb_bgl_lock(sbi, group)); + percpu_counter_add(&sbi->s_freeblocks_counter, freed); + + start = next; + if (err < 0) { + if (err != -EOPNOTSUPP) + ext3_warning(sb, __func__, "Discard command " + "returned error %d\n", err); + break; + } + + if (fatal_signal_pending(current)) { + err = -ERESTARTSYS; + break; + } + + cond_resched(); + + /* No more suitable extents */ + if (free_blocks < minblocks) + break; + } + + /* We dirtied the bitmap block */ + BUFFER_TRACE(bitmap_bh, "dirtied bitmap block"); + ret = ext3_journal_dirty_metadata(handle, bitmap_bh); + if (!err) + err = ret; + + /* And the group descriptor block */ + BUFFER_TRACE(gdp_bh, "dirtied group descriptor block"); + ret = ext3_journal_dirty_metadata(handle, gdp_bh); + if (!err) + err = ret; + + ext3_debug("trimmed %d blocks in the group %d\n", + count, group); + +err_out: + if (err) + count = err; + ext3_journal_stop(handle); + brelse(bitmap_bh); + + return count; +} + +/** + * ext3_trim_fs() -- trim ioctl handle function + * @sb: superblock for filesystem + * @start: First Byte to trim + * @len: number of Bytes to trim from start + * @minlen: minimum extent length in Bytes + * + * ext3_trim_fs goes through all allocation groups containing Bytes from + * start to start+len. For each such a group ext3_trim_all_free function + * is invoked to trim all free space. + */ +int ext3_trim_fs(struct super_block *sb, struct fstrim_range *range) +{ + ext3_grpblk_t last_block, first_block; + unsigned long group, first_group, last_group; + struct ext3_group_desc *gdp; + struct ext3_super_block *es = EXT3_SB(sb)->s_es; + uint64_t start, minlen, end, trimmed = 0; + ext3_fsblk_t first_data_blk = + le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block); + ext3_fsblk_t max_blks = le32_to_cpu(es->s_blocks_count); + int ret = 0; + + start = range->start >> sb->s_blocksize_bits; + end = start + (range->len >> sb->s_blocksize_bits) - 1; + minlen = range->minlen >> sb->s_blocksize_bits; + + if (minlen > EXT3_BLOCKS_PER_GROUP(sb) || + start >= max_blks || + range->len < sb->s_blocksize) + return -EINVAL; + if (end >= max_blks) + end = max_blks - 1; + if (end <= first_data_blk) + goto out; + if (start < first_data_blk) + start = first_data_blk; + + smp_rmb(); + + /* Determine first and last group to examine based on start and len */ + ext3_get_group_no_and_offset(sb, (ext3_fsblk_t) start, + &first_group, &first_block); + ext3_get_group_no_and_offset(sb, (ext3_fsblk_t) end, + &last_group, &last_block); + + /* end now represents the last block to discard in this group */ + end = EXT3_BLOCKS_PER_GROUP(sb) - 1; + + for (group = first_group; group <= last_group; group++) { + gdp = ext3_get_group_desc(sb, group, NULL); + if (!gdp) + break; + + /* + * For all the groups except the last one, last block will + * always be EXT3_BLOCKS_PER_GROUP(sb)-1, so we only need to + * change it for the last group, note that last_block is + * already computed earlier by ext3_get_group_no_and_offset() + */ + if (group == last_group) + end = last_block; + + if (le16_to_cpu(gdp->bg_free_blocks_count) >= minlen) { + ret = ext3_trim_all_free(sb, group, first_block, + end, minlen); + if (ret < 0) + break; + trimmed += ret; + } + + /* + * For every group except the first one, we are sure + * that the first block to discard will be block #0. + */ + first_block = 0; + } + + if (ret > 0) + ret = 0; + +out: + range->len = trimmed * sb->s_blocksize; + return ret; +} diff --git a/fs/ext3/bitmap.c b/fs/ext3/bitmap.c index 6afc39d8025..ef9c643e8e9 100644 --- a/fs/ext3/bitmap.c +++ b/fs/ext3/bitmap.c @@ -7,25 +7,13 @@ * Universite Pierre et Marie Curie (Paris VI) */ -#include <linux/buffer_head.h> -#include <linux/jbd.h> -#include <linux/ext3_fs.h> +#include "ext3.h" #ifdef EXT3FS_DEBUG -static const int nibblemap[] = {4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0}; - unsigned long ext3_count_free (struct buffer_head * map, unsigned int numchars) { - unsigned int i; - unsigned long sum = 0; - - if (!map) - return (0); - for (i = 0; i < numchars; i++) - sum += nibblemap[map->b_data[i] & 0xf] + - nibblemap[(map->b_data[i] >> 4) & 0xf]; - return (sum); + return numchars * BITS_PER_BYTE - memweight(map->b_data, numchars); } #endif /* EXT3FS_DEBUG */ diff --git a/fs/ext3/dir.c b/fs/ext3/dir.c index e2e72c367cf..17742eed2c1 100644 --- a/fs/ext3/dir.c +++ b/fs/ext3/dir.c @@ -21,35 +21,14 @@ * */ -#include <linux/fs.h> -#include <linux/jbd.h> -#include <linux/ext3_fs.h> -#include <linux/buffer_head.h> -#include <linux/slab.h> -#include <linux/rbtree.h> +#include <linux/compat.h> +#include "ext3.h" static unsigned char ext3_filetype_table[] = { DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK }; -static int ext3_readdir(struct file *, void *, filldir_t); -static int ext3_dx_readdir(struct file * filp, - void * dirent, filldir_t filldir); -static int ext3_release_dir (struct inode * inode, - struct file * filp); - -const struct file_operations ext3_dir_operations = { - .llseek = generic_file_llseek, - .read = generic_read_dir, - .readdir = ext3_readdir, /* we take BKL. needed?*/ - .unlocked_ioctl = ext3_ioctl, -#ifdef CONFIG_COMPAT - .compat_ioctl = ext3_compat_ioctl, -#endif - .fsync = ext3_sync_file, /* BKL held */ - .release = ext3_release_dir, -}; - +static int ext3_dx_readdir(struct file *, struct dir_context *); static unsigned char get_dtype(struct super_block *sb, int filetype) { @@ -60,6 +39,25 @@ static unsigned char get_dtype(struct super_block *sb, int filetype) return (ext3_filetype_table[filetype]); } +/** + * Check if the given dir-inode refers to an htree-indexed directory + * (or a directory which could potentially get converted to use htree + * indexing). + * + * Return 1 if it is a dx dir, 0 if not + */ +static int is_dx_dir(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + + if (EXT3_HAS_COMPAT_FEATURE(inode->i_sb, + EXT3_FEATURE_COMPAT_DIR_INDEX) && + ((EXT3_I(inode)->i_flags & EXT3_INDEX_FL) || + ((inode->i_size >> sb->s_blocksize_bits) == 1))) + return 1; + + return 0; +} int ext3_check_dir_entry (const char * function, struct inode * dir, struct ext3_dir_entry_2 * de, @@ -69,63 +67,53 @@ int ext3_check_dir_entry (const char * function, struct inode * dir, const char * error_msg = NULL; const int rlen = ext3_rec_len_from_disk(de->rec_len); - if (rlen < EXT3_DIR_REC_LEN(1)) + if (unlikely(rlen < EXT3_DIR_REC_LEN(1))) error_msg = "rec_len is smaller than minimal"; - else if (rlen % 4 != 0) + else if (unlikely(rlen % 4 != 0)) error_msg = "rec_len % 4 != 0"; - else if (rlen < EXT3_DIR_REC_LEN(de->name_len)) + else if (unlikely(rlen < EXT3_DIR_REC_LEN(de->name_len))) error_msg = "rec_len is too small for name_len"; - else if (((char *) de - bh->b_data) + rlen > dir->i_sb->s_blocksize) + else if (unlikely((((char *) de - bh->b_data) + rlen > dir->i_sb->s_blocksize))) error_msg = "directory entry across blocks"; - else if (le32_to_cpu(de->inode) > - le32_to_cpu(EXT3_SB(dir->i_sb)->s_es->s_inodes_count)) + else if (unlikely(le32_to_cpu(de->inode) > + le32_to_cpu(EXT3_SB(dir->i_sb)->s_es->s_inodes_count))) error_msg = "inode out of bounds"; - if (error_msg != NULL) + if (unlikely(error_msg != NULL)) ext3_error (dir->i_sb, function, "bad entry in directory #%lu: %s - " "offset=%lu, inode=%lu, rec_len=%d, name_len=%d", dir->i_ino, error_msg, offset, (unsigned long) le32_to_cpu(de->inode), rlen, de->name_len); + return error_msg == NULL ? 1 : 0; } -static int ext3_readdir(struct file * filp, - void * dirent, filldir_t filldir) +static int ext3_readdir(struct file *file, struct dir_context *ctx) { - int error = 0; unsigned long offset; - int i, stored; + int i; struct ext3_dir_entry_2 *de; - struct super_block *sb; int err; - struct inode *inode = filp->f_path.dentry->d_inode; - int ret = 0; + struct inode *inode = file_inode(file); + struct super_block *sb = inode->i_sb; int dir_has_error = 0; - sb = inode->i_sb; - - if (EXT3_HAS_COMPAT_FEATURE(inode->i_sb, - EXT3_FEATURE_COMPAT_DIR_INDEX) && - ((EXT3_I(inode)->i_flags & EXT3_INDEX_FL) || - ((inode->i_size >> sb->s_blocksize_bits) == 1))) { - err = ext3_dx_readdir(filp, dirent, filldir); - if (err != ERR_BAD_DX_DIR) { - ret = err; - goto out; - } + if (is_dx_dir(inode)) { + err = ext3_dx_readdir(file, ctx); + if (err != ERR_BAD_DX_DIR) + return err; /* * We don't set the inode dirty flag since it's not * critical that it get flushed back to the disk. */ - EXT3_I(filp->f_path.dentry->d_inode)->i_flags &= ~EXT3_INDEX_FL; + EXT3_I(inode)->i_flags &= ~EXT3_INDEX_FL; } - stored = 0; - offset = filp->f_pos & (sb->s_blocksize - 1); + offset = ctx->pos & (sb->s_blocksize - 1); - while (!error && !stored && filp->f_pos < inode->i_size) { - unsigned long blk = filp->f_pos >> EXT3_BLOCK_SIZE_BITS(sb); + while (ctx->pos < inode->i_size) { + unsigned long blk = ctx->pos >> EXT3_BLOCK_SIZE_BITS(sb); struct buffer_head map_bh; struct buffer_head *bh = NULL; @@ -134,12 +122,12 @@ static int ext3_readdir(struct file * filp, if (err > 0) { pgoff_t index = map_bh.b_blocknr >> (PAGE_CACHE_SHIFT - inode->i_blkbits); - if (!ra_has_index(&filp->f_ra, index)) + if (!ra_has_index(&file->f_ra, index)) page_cache_sync_readahead( sb->s_bdev->bd_inode->i_mapping, - &filp->f_ra, filp, + &file->f_ra, file, index, 1); - filp->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT; + file->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT; bh = ext3_bread(NULL, inode, blk, 0, &err); } @@ -151,22 +139,21 @@ static int ext3_readdir(struct file * filp, if (!dir_has_error) { ext3_error(sb, __func__, "directory #%lu " "contains a hole at offset %lld", - inode->i_ino, filp->f_pos); + inode->i_ino, ctx->pos); dir_has_error = 1; } /* corrupt size? Maybe no more blocks to read */ - if (filp->f_pos > inode->i_blocks << 9) + if (ctx->pos > inode->i_blocks << 9) break; - filp->f_pos += sb->s_blocksize - offset; + ctx->pos += sb->s_blocksize - offset; continue; } -revalidate: /* If the dir block has changed since the last call to * readdir(2), then we might be pointing to an invalid * dirent right now. Scan from the start of the block * to make sure. */ - if (filp->f_version != inode->i_version) { + if (offset && file->f_version != inode->i_version) { for (i = 0; i < sb->s_blocksize && i < offset; ) { de = (struct ext3_dir_entry_2 *) (bh->b_data + i); @@ -182,71 +169,124 @@ revalidate: i += ext3_rec_len_from_disk(de->rec_len); } offset = i; - filp->f_pos = (filp->f_pos & ~(sb->s_blocksize - 1)) + ctx->pos = (ctx->pos & ~(sb->s_blocksize - 1)) | offset; - filp->f_version = inode->i_version; + file->f_version = inode->i_version; } - while (!error && filp->f_pos < inode->i_size + while (ctx->pos < inode->i_size && offset < sb->s_blocksize) { de = (struct ext3_dir_entry_2 *) (bh->b_data + offset); if (!ext3_check_dir_entry ("ext3_readdir", inode, de, bh, offset)) { - /* On error, skip the f_pos to the + /* On error, skip the to the next block. */ - filp->f_pos = (filp->f_pos | + ctx->pos = (ctx->pos | (sb->s_blocksize - 1)) + 1; - brelse (bh); - ret = stored; - goto out; + break; } offset += ext3_rec_len_from_disk(de->rec_len); if (le32_to_cpu(de->inode)) { - /* We might block in the next section - * if the data destination is - * currently swapped out. So, use a - * version stamp to detect whether or - * not the directory has been modified - * during the copy operation. - */ - u64 version = filp->f_version; - - error = filldir(dirent, de->name, - de->name_len, - filp->f_pos, - le32_to_cpu(de->inode), - get_dtype(sb, de->file_type)); - if (error) - break; - if (version != filp->f_version) - goto revalidate; - stored ++; + if (!dir_emit(ctx, de->name, de->name_len, + le32_to_cpu(de->inode), + get_dtype(sb, de->file_type))) { + brelse(bh); + return 0; + } } - filp->f_pos += ext3_rec_len_from_disk(de->rec_len); + ctx->pos += ext3_rec_len_from_disk(de->rec_len); } offset = 0; brelse (bh); + if (ctx->pos < inode->i_size) + if (!dir_relax(inode)) + return 0; } -out: - return ret; + return 0; +} + +static inline int is_32bit_api(void) +{ +#ifdef CONFIG_COMPAT + return is_compat_task(); +#else + return (BITS_PER_LONG == 32); +#endif } /* * These functions convert from the major/minor hash to an f_pos - * value. + * value for dx directories * - * Currently we only use major hash numer. This is unfortunate, but - * on 32-bit machines, the same VFS interface is used for lseek and - * llseek, so if we use the 64 bit offset, then the 32-bit versions of - * lseek/telldir/seekdir will blow out spectacularly, and from within - * the ext2 low-level routine, we don't know if we're being called by - * a 64-bit version of the system call or the 32-bit version of the - * system call. Worse yet, NFSv2 only allows for a 32-bit readdir - * cookie. Sigh. + * Upper layer (for example NFS) should specify FMODE_32BITHASH or + * FMODE_64BITHASH explicitly. On the other hand, we allow ext3 to be mounted + * directly on both 32-bit and 64-bit nodes, under such case, neither + * FMODE_32BITHASH nor FMODE_64BITHASH is specified. */ -#define hash2pos(major, minor) (major >> 1) -#define pos2maj_hash(pos) ((pos << 1) & 0xffffffff) -#define pos2min_hash(pos) (0) +static inline loff_t hash2pos(struct file *filp, __u32 major, __u32 minor) +{ + if ((filp->f_mode & FMODE_32BITHASH) || + (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api())) + return major >> 1; + else + return ((__u64)(major >> 1) << 32) | (__u64)minor; +} + +static inline __u32 pos2maj_hash(struct file *filp, loff_t pos) +{ + if ((filp->f_mode & FMODE_32BITHASH) || + (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api())) + return (pos << 1) & 0xffffffff; + else + return ((pos >> 32) << 1) & 0xffffffff; +} + +static inline __u32 pos2min_hash(struct file *filp, loff_t pos) +{ + if ((filp->f_mode & FMODE_32BITHASH) || + (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api())) + return 0; + else + return pos & 0xffffffff; +} + +/* + * Return 32- or 64-bit end-of-file for dx directories + */ +static inline loff_t ext3_get_htree_eof(struct file *filp) +{ + if ((filp->f_mode & FMODE_32BITHASH) || + (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api())) + return EXT3_HTREE_EOF_32BIT; + else + return EXT3_HTREE_EOF_64BIT; +} + + +/* + * ext3_dir_llseek() calls generic_file_llseek[_size]() to handle both + * non-htree and htree directories, where the "offset" is in terms + * of the filename hash value instead of the byte offset. + * + * Because we may return a 64-bit hash that is well beyond s_maxbytes, + * we need to pass the max hash as the maximum allowable offset in + * the htree directory case. + * + * NOTE: offsets obtained *before* ext3_set_inode_flag(dir, EXT3_INODE_INDEX) + * will be invalid once the directory was converted into a dx directory + */ +static loff_t ext3_dir_llseek(struct file *file, loff_t offset, int whence) +{ + struct inode *inode = file->f_mapping->host; + int dx_dir = is_dx_dir(inode); + loff_t htree_max = ext3_get_htree_eof(file); + + if (likely(dx_dir)) + return generic_file_llseek_size(file, offset, whence, + htree_max, htree_max); + else + return generic_file_llseek(file, offset, whence); +} /* * This structure holds the nodes of the red-black tree used to store @@ -269,53 +309,28 @@ struct fname { */ static void free_rb_tree_fname(struct rb_root *root) { - struct rb_node *n = root->rb_node; - struct rb_node *parent; - struct fname *fname; - - while (n) { - /* Do the node's children first */ - if (n->rb_left) { - n = n->rb_left; - continue; - } - if (n->rb_right) { - n = n->rb_right; - continue; - } - /* - * The node has no children; free it, and then zero - * out parent's link to it. Finally go to the - * beginning of the loop and try to free the parent - * node. - */ - parent = rb_parent(n); - fname = rb_entry(n, struct fname, rb_hash); - while (fname) { - struct fname * old = fname; + struct fname *fname, *next; + + rbtree_postorder_for_each_entry_safe(fname, next, root, rb_hash) + do { + struct fname *old = fname; fname = fname->next; - kfree (old); - } - if (!parent) - *root = RB_ROOT; - else if (parent->rb_left == n) - parent->rb_left = NULL; - else if (parent->rb_right == n) - parent->rb_right = NULL; - n = parent; - } -} + kfree(old); + } while (fname); + *root = RB_ROOT; +} -static struct dir_private_info *ext3_htree_create_dir_info(loff_t pos) +static struct dir_private_info *ext3_htree_create_dir_info(struct file *filp, + loff_t pos) { struct dir_private_info *p; p = kzalloc(sizeof(struct dir_private_info), GFP_KERNEL); if (!p) return NULL; - p->curr_hash = pos2maj_hash(pos); - p->curr_minor_hash = pos2min_hash(pos); + p->curr_hash = pos2maj_hash(filp, pos); + p->curr_minor_hash = pos2min_hash(filp, pos); return p; } @@ -390,62 +405,54 @@ int ext3_htree_store_dirent(struct file *dir_file, __u32 hash, * for all entres on the fname linked list. (Normally there is only * one entry on the linked list, unless there are 62 bit hash collisions.) */ -static int call_filldir(struct file * filp, void * dirent, - filldir_t filldir, struct fname *fname) +static bool call_filldir(struct file *file, struct dir_context *ctx, + struct fname *fname) { - struct dir_private_info *info = filp->private_data; - loff_t curr_pos; - struct inode *inode = filp->f_path.dentry->d_inode; - struct super_block * sb; - int error; - - sb = inode->i_sb; + struct dir_private_info *info = file->private_data; + struct inode *inode = file_inode(file); + struct super_block *sb = inode->i_sb; if (!fname) { printk("call_filldir: called with null fname?!?\n"); - return 0; + return true; } - curr_pos = hash2pos(fname->hash, fname->minor_hash); + ctx->pos = hash2pos(file, fname->hash, fname->minor_hash); while (fname) { - error = filldir(dirent, fname->name, - fname->name_len, curr_pos, + if (!dir_emit(ctx, fname->name, fname->name_len, fname->inode, - get_dtype(sb, fname->file_type)); - if (error) { - filp->f_pos = curr_pos; + get_dtype(sb, fname->file_type))) { info->extra_fname = fname; - return error; + return false; } fname = fname->next; } - return 0; + return true; } -static int ext3_dx_readdir(struct file * filp, - void * dirent, filldir_t filldir) +static int ext3_dx_readdir(struct file *file, struct dir_context *ctx) { - struct dir_private_info *info = filp->private_data; - struct inode *inode = filp->f_path.dentry->d_inode; + struct dir_private_info *info = file->private_data; + struct inode *inode = file_inode(file); struct fname *fname; int ret; if (!info) { - info = ext3_htree_create_dir_info(filp->f_pos); + info = ext3_htree_create_dir_info(file, ctx->pos); if (!info) return -ENOMEM; - filp->private_data = info; + file->private_data = info; } - if (filp->f_pos == EXT3_HTREE_EOF) + if (ctx->pos == ext3_get_htree_eof(file)) return 0; /* EOF */ /* Some one has messed with f_pos; reset the world */ - if (info->last_pos != filp->f_pos) { + if (info->last_pos != ctx->pos) { free_rb_tree_fname(&info->root); info->curr_node = NULL; info->extra_fname = NULL; - info->curr_hash = pos2maj_hash(filp->f_pos); - info->curr_minor_hash = pos2min_hash(filp->f_pos); + info->curr_hash = pos2maj_hash(file, ctx->pos); + info->curr_minor_hash = pos2min_hash(file, ctx->pos); } /* @@ -453,7 +460,7 @@ static int ext3_dx_readdir(struct file * filp, * chain, return them first. */ if (info->extra_fname) { - if (call_filldir(filp, dirent, filldir, info->extra_fname)) + if (!call_filldir(file, ctx, info->extra_fname)) goto finished; info->extra_fname = NULL; goto next_node; @@ -467,17 +474,17 @@ static int ext3_dx_readdir(struct file * filp, * cached entries. */ if ((!info->curr_node) || - (filp->f_version != inode->i_version)) { + (file->f_version != inode->i_version)) { info->curr_node = NULL; free_rb_tree_fname(&info->root); - filp->f_version = inode->i_version; - ret = ext3_htree_fill_tree(filp, info->curr_hash, + file->f_version = inode->i_version; + ret = ext3_htree_fill_tree(file, info->curr_hash, info->curr_minor_hash, &info->next_hash); if (ret < 0) return ret; if (ret == 0) { - filp->f_pos = EXT3_HTREE_EOF; + ctx->pos = ext3_get_htree_eof(file); break; } info->curr_node = rb_first(&info->root); @@ -486,7 +493,7 @@ static int ext3_dx_readdir(struct file * filp, fname = rb_entry(info->curr_node, struct fname, rb_hash); info->curr_hash = fname->hash; info->curr_minor_hash = fname->minor_hash; - if (call_filldir(filp, dirent, filldir, fname)) + if (!call_filldir(file, ctx, fname)) break; next_node: info->curr_node = rb_next(info->curr_node); @@ -497,7 +504,7 @@ static int ext3_dx_readdir(struct file * filp, info->curr_minor_hash = fname->minor_hash; } else { if (info->next_hash == ~0) { - filp->f_pos = EXT3_HTREE_EOF; + ctx->pos = ext3_get_htree_eof(file); break; } info->curr_hash = info->next_hash; @@ -505,7 +512,7 @@ static int ext3_dx_readdir(struct file * filp, } } finished: - info->last_pos = filp->f_pos; + info->last_pos = ctx->pos; return 0; } @@ -516,3 +523,15 @@ static int ext3_release_dir (struct inode * inode, struct file * filp) return 0; } + +const struct file_operations ext3_dir_operations = { + .llseek = ext3_dir_llseek, + .read = generic_read_dir, + .iterate = ext3_readdir, + .unlocked_ioctl = ext3_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = ext3_compat_ioctl, +#endif + .fsync = ext3_sync_file, + .release = ext3_release_dir, +}; diff --git a/fs/ext3/ext3.h b/fs/ext3/ext3.h new file mode 100644 index 00000000000..e85ff15a060 --- /dev/null +++ b/fs/ext3/ext3.h @@ -0,0 +1,1326 @@ +/* + * Written by Stephen C. Tweedie <sct@redhat.com>, 1999 + * + * Copyright 1998--1999 Red Hat corp --- All Rights Reserved + * + * This file is part of the Linux kernel and is made available under + * the terms of the GNU General Public License, version 2, or at your + * option, any later version, incorporated herein by reference. + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/include/linux/minix_fs.h + * + * Copyright (C) 1991, 1992 Linus Torvalds + */ + +#include <linux/fs.h> +#include <linux/jbd.h> +#include <linux/magic.h> +#include <linux/bug.h> +#include <linux/blockgroup_lock.h> + +/* + * The second extended filesystem constants/structures + */ + +/* + * Define EXT3FS_DEBUG to produce debug messages + */ +#undef EXT3FS_DEBUG + +/* + * Define EXT3_RESERVATION to reserve data blocks for expanding files + */ +#define EXT3_DEFAULT_RESERVE_BLOCKS 8 +/*max window size: 1024(direct blocks) + 3([t,d]indirect blocks) */ +#define EXT3_MAX_RESERVE_BLOCKS 1027 +#define EXT3_RESERVE_WINDOW_NOT_ALLOCATED 0 + +/* + * Debug code + */ +#ifdef EXT3FS_DEBUG +#define ext3_debug(f, a...) \ + do { \ + printk (KERN_DEBUG "EXT3-fs DEBUG (%s, %d): %s:", \ + __FILE__, __LINE__, __func__); \ + printk (KERN_DEBUG f, ## a); \ + } while (0) +#else +#define ext3_debug(f, a...) do {} while (0) +#endif + +/* + * Special inodes numbers + */ +#define EXT3_BAD_INO 1 /* Bad blocks inode */ +#define EXT3_ROOT_INO 2 /* Root inode */ +#define EXT3_BOOT_LOADER_INO 5 /* Boot loader inode */ +#define EXT3_UNDEL_DIR_INO 6 /* Undelete directory inode */ +#define EXT3_RESIZE_INO 7 /* Reserved group descriptors inode */ +#define EXT3_JOURNAL_INO 8 /* Journal inode */ + +/* First non-reserved inode for old ext3 filesystems */ +#define EXT3_GOOD_OLD_FIRST_INO 11 + +/* + * Maximal count of links to a file + */ +#define EXT3_LINK_MAX 32000 + +/* + * Macro-instructions used to manage several block sizes + */ +#define EXT3_MIN_BLOCK_SIZE 1024 +#define EXT3_MAX_BLOCK_SIZE 65536 +#define EXT3_MIN_BLOCK_LOG_SIZE 10 +#define EXT3_BLOCK_SIZE(s) ((s)->s_blocksize) +#define EXT3_ADDR_PER_BLOCK(s) (EXT3_BLOCK_SIZE(s) / sizeof (__u32)) +#define EXT3_BLOCK_SIZE_BITS(s) ((s)->s_blocksize_bits) +#define EXT3_ADDR_PER_BLOCK_BITS(s) (EXT3_SB(s)->s_addr_per_block_bits) +#define EXT3_INODE_SIZE(s) (EXT3_SB(s)->s_inode_size) +#define EXT3_FIRST_INO(s) (EXT3_SB(s)->s_first_ino) + +/* + * Macro-instructions used to manage fragments + */ +#define EXT3_MIN_FRAG_SIZE 1024 +#define EXT3_MAX_FRAG_SIZE 4096 +#define EXT3_MIN_FRAG_LOG_SIZE 10 +#define EXT3_FRAG_SIZE(s) (EXT3_SB(s)->s_frag_size) +#define EXT3_FRAGS_PER_BLOCK(s) (EXT3_SB(s)->s_frags_per_block) + +/* + * Structure of a blocks group descriptor + */ +struct ext3_group_desc +{ + __le32 bg_block_bitmap; /* Blocks bitmap block */ + __le32 bg_inode_bitmap; /* Inodes bitmap block */ + __le32 bg_inode_table; /* Inodes table block */ + __le16 bg_free_blocks_count; /* Free blocks count */ + __le16 bg_free_inodes_count; /* Free inodes count */ + __le16 bg_used_dirs_count; /* Directories count */ + __u16 bg_pad; + __le32 bg_reserved[3]; +}; + +/* + * Macro-instructions used to manage group descriptors + */ +#define EXT3_BLOCKS_PER_GROUP(s) (EXT3_SB(s)->s_blocks_per_group) +#define EXT3_DESC_PER_BLOCK(s) (EXT3_SB(s)->s_desc_per_block) +#define EXT3_INODES_PER_GROUP(s) (EXT3_SB(s)->s_inodes_per_group) +#define EXT3_DESC_PER_BLOCK_BITS(s) (EXT3_SB(s)->s_desc_per_block_bits) + +/* + * Constants relative to the data blocks + */ +#define EXT3_NDIR_BLOCKS 12 +#define EXT3_IND_BLOCK EXT3_NDIR_BLOCKS +#define EXT3_DIND_BLOCK (EXT3_IND_BLOCK + 1) +#define EXT3_TIND_BLOCK (EXT3_DIND_BLOCK + 1) +#define EXT3_N_BLOCKS (EXT3_TIND_BLOCK + 1) + +/* + * Inode flags + */ +#define EXT3_SECRM_FL 0x00000001 /* Secure deletion */ +#define EXT3_UNRM_FL 0x00000002 /* Undelete */ +#define EXT3_COMPR_FL 0x00000004 /* Compress file */ +#define EXT3_SYNC_FL 0x00000008 /* Synchronous updates */ +#define EXT3_IMMUTABLE_FL 0x00000010 /* Immutable file */ +#define EXT3_APPEND_FL 0x00000020 /* writes to file may only append */ +#define EXT3_NODUMP_FL 0x00000040 /* do not dump file */ +#define EXT3_NOATIME_FL 0x00000080 /* do not update atime */ +/* Reserved for compression usage... */ +#define EXT3_DIRTY_FL 0x00000100 +#define EXT3_COMPRBLK_FL 0x00000200 /* One or more compressed clusters */ +#define EXT3_NOCOMPR_FL 0x00000400 /* Don't compress */ +#define EXT3_ECOMPR_FL 0x00000800 /* Compression error */ +/* End compression flags --- maybe not all used */ +#define EXT3_INDEX_FL 0x00001000 /* hash-indexed directory */ +#define EXT3_IMAGIC_FL 0x00002000 /* AFS directory */ +#define EXT3_JOURNAL_DATA_FL 0x00004000 /* file data should be journaled */ +#define EXT3_NOTAIL_FL 0x00008000 /* file tail should not be merged */ +#define EXT3_DIRSYNC_FL 0x00010000 /* dirsync behaviour (directories only) */ +#define EXT3_TOPDIR_FL 0x00020000 /* Top of directory hierarchies*/ +#define EXT3_RESERVED_FL 0x80000000 /* reserved for ext3 lib */ + +#define EXT3_FL_USER_VISIBLE 0x0003DFFF /* User visible flags */ +#define EXT3_FL_USER_MODIFIABLE 0x000380FF /* User modifiable flags */ + +/* Flags that should be inherited by new inodes from their parent. */ +#define EXT3_FL_INHERITED (EXT3_SECRM_FL | EXT3_UNRM_FL | EXT3_COMPR_FL |\ + EXT3_SYNC_FL | EXT3_NODUMP_FL |\ + EXT3_NOATIME_FL | EXT3_COMPRBLK_FL |\ + EXT3_NOCOMPR_FL | EXT3_JOURNAL_DATA_FL |\ + EXT3_NOTAIL_FL | EXT3_DIRSYNC_FL) + +/* Flags that are appropriate for regular files (all but dir-specific ones). */ +#define EXT3_REG_FLMASK (~(EXT3_DIRSYNC_FL | EXT3_TOPDIR_FL)) + +/* Flags that are appropriate for non-directories/regular files. */ +#define EXT3_OTHER_FLMASK (EXT3_NODUMP_FL | EXT3_NOATIME_FL) + +/* Mask out flags that are inappropriate for the given type of inode. */ +static inline __u32 ext3_mask_flags(umode_t mode, __u32 flags) +{ + if (S_ISDIR(mode)) + return flags; + else if (S_ISREG(mode)) + return flags & EXT3_REG_FLMASK; + else + return flags & EXT3_OTHER_FLMASK; +} + +/* Used to pass group descriptor data when online resize is done */ +struct ext3_new_group_input { + __u32 group; /* Group number for this data */ + __u32 block_bitmap; /* Absolute block number of block bitmap */ + __u32 inode_bitmap; /* Absolute block number of inode bitmap */ + __u32 inode_table; /* Absolute block number of inode table start */ + __u32 blocks_count; /* Total number of blocks in this group */ + __u16 reserved_blocks; /* Number of reserved blocks in this group */ + __u16 unused; +}; + +/* The struct ext3_new_group_input in kernel space, with free_blocks_count */ +struct ext3_new_group_data { + __u32 group; + __u32 block_bitmap; + __u32 inode_bitmap; + __u32 inode_table; + __u32 blocks_count; + __u16 reserved_blocks; + __u16 unused; + __u32 free_blocks_count; +}; + + +/* + * ioctl commands + */ +#define EXT3_IOC_GETFLAGS FS_IOC_GETFLAGS +#define EXT3_IOC_SETFLAGS FS_IOC_SETFLAGS +#define EXT3_IOC_GETVERSION _IOR('f', 3, long) +#define EXT3_IOC_SETVERSION _IOW('f', 4, long) +#define EXT3_IOC_GROUP_EXTEND _IOW('f', 7, unsigned long) +#define EXT3_IOC_GROUP_ADD _IOW('f', 8,struct ext3_new_group_input) +#define EXT3_IOC_GETVERSION_OLD FS_IOC_GETVERSION +#define EXT3_IOC_SETVERSION_OLD FS_IOC_SETVERSION +#ifdef CONFIG_JBD_DEBUG +#define EXT3_IOC_WAIT_FOR_READONLY _IOR('f', 99, long) +#endif +#define EXT3_IOC_GETRSVSZ _IOR('f', 5, long) +#define EXT3_IOC_SETRSVSZ _IOW('f', 6, long) + +/* + * ioctl commands in 32 bit emulation + */ +#define EXT3_IOC32_GETFLAGS FS_IOC32_GETFLAGS +#define EXT3_IOC32_SETFLAGS FS_IOC32_SETFLAGS +#define EXT3_IOC32_GETVERSION _IOR('f', 3, int) +#define EXT3_IOC32_SETVERSION _IOW('f', 4, int) +#define EXT3_IOC32_GETRSVSZ _IOR('f', 5, int) +#define EXT3_IOC32_SETRSVSZ _IOW('f', 6, int) +#define EXT3_IOC32_GROUP_EXTEND _IOW('f', 7, unsigned int) +#ifdef CONFIG_JBD_DEBUG +#define EXT3_IOC32_WAIT_FOR_READONLY _IOR('f', 99, int) +#endif +#define EXT3_IOC32_GETVERSION_OLD FS_IOC32_GETVERSION +#define EXT3_IOC32_SETVERSION_OLD FS_IOC32_SETVERSION + + +/* + * Mount options + */ +struct ext3_mount_options { + unsigned long s_mount_opt; + kuid_t s_resuid; + kgid_t s_resgid; + unsigned long s_commit_interval; +#ifdef CONFIG_QUOTA + int s_jquota_fmt; + char *s_qf_names[MAXQUOTAS]; +#endif +}; + +/* + * Structure of an inode on the disk + */ +struct ext3_inode { + __le16 i_mode; /* File mode */ + __le16 i_uid; /* Low 16 bits of Owner Uid */ + __le32 i_size; /* Size in bytes */ + __le32 i_atime; /* Access time */ + __le32 i_ctime; /* Creation time */ + __le32 i_mtime; /* Modification time */ + __le32 i_dtime; /* Deletion Time */ + __le16 i_gid; /* Low 16 bits of Group Id */ + __le16 i_links_count; /* Links count */ + __le32 i_blocks; /* Blocks count */ + __le32 i_flags; /* File flags */ + union { + struct { + __u32 l_i_reserved1; + } linux1; + struct { + __u32 h_i_translator; + } hurd1; + struct { + __u32 m_i_reserved1; + } masix1; + } osd1; /* OS dependent 1 */ + __le32 i_block[EXT3_N_BLOCKS];/* Pointers to blocks */ + __le32 i_generation; /* File version (for NFS) */ + __le32 i_file_acl; /* File ACL */ + __le32 i_dir_acl; /* Directory ACL */ + __le32 i_faddr; /* Fragment address */ + union { + struct { + __u8 l_i_frag; /* Fragment number */ + __u8 l_i_fsize; /* Fragment size */ + __u16 i_pad1; + __le16 l_i_uid_high; /* these 2 fields */ + __le16 l_i_gid_high; /* were reserved2[0] */ + __u32 l_i_reserved2; + } linux2; + struct { + __u8 h_i_frag; /* Fragment number */ + __u8 h_i_fsize; /* Fragment size */ + __u16 h_i_mode_high; + __u16 h_i_uid_high; + __u16 h_i_gid_high; + __u32 h_i_author; + } hurd2; + struct { + __u8 m_i_frag; /* Fragment number */ + __u8 m_i_fsize; /* Fragment size */ + __u16 m_pad1; + __u32 m_i_reserved2[2]; + } masix2; + } osd2; /* OS dependent 2 */ + __le16 i_extra_isize; + __le16 i_pad1; +}; + +#define i_size_high i_dir_acl + +#define i_reserved1 osd1.linux1.l_i_reserved1 +#define i_frag osd2.linux2.l_i_frag +#define i_fsize osd2.linux2.l_i_fsize +#define i_uid_low i_uid +#define i_gid_low i_gid +#define i_uid_high osd2.linux2.l_i_uid_high +#define i_gid_high osd2.linux2.l_i_gid_high +#define i_reserved2 osd2.linux2.l_i_reserved2 + +/* + * File system states + */ +#define EXT3_VALID_FS 0x0001 /* Unmounted cleanly */ +#define EXT3_ERROR_FS 0x0002 /* Errors detected */ +#define EXT3_ORPHAN_FS 0x0004 /* Orphans being recovered */ + +/* + * Misc. filesystem flags + */ +#define EXT2_FLAGS_SIGNED_HASH 0x0001 /* Signed dirhash in use */ +#define EXT2_FLAGS_UNSIGNED_HASH 0x0002 /* Unsigned dirhash in use */ +#define EXT2_FLAGS_TEST_FILESYS 0x0004 /* to test development code */ + +/* + * Mount flags + */ +#define EXT3_MOUNT_CHECK 0x00001 /* Do mount-time checks */ +/* EXT3_MOUNT_OLDALLOC was there */ +#define EXT3_MOUNT_GRPID 0x00004 /* Create files with directory's group */ +#define EXT3_MOUNT_DEBUG 0x00008 /* Some debugging messages */ +#define EXT3_MOUNT_ERRORS_CONT 0x00010 /* Continue on errors */ +#define EXT3_MOUNT_ERRORS_RO 0x00020 /* Remount fs ro on errors */ +#define EXT3_MOUNT_ERRORS_PANIC 0x00040 /* Panic on errors */ +#define EXT3_MOUNT_MINIX_DF 0x00080 /* Mimics the Minix statfs */ +#define EXT3_MOUNT_NOLOAD 0x00100 /* Don't use existing journal*/ +#define EXT3_MOUNT_ABORT 0x00200 /* Fatal error detected */ +#define EXT3_MOUNT_DATA_FLAGS 0x00C00 /* Mode for data writes: */ +#define EXT3_MOUNT_JOURNAL_DATA 0x00400 /* Write data to journal */ +#define EXT3_MOUNT_ORDERED_DATA 0x00800 /* Flush data before commit */ +#define EXT3_MOUNT_WRITEBACK_DATA 0x00C00 /* No data ordering */ +#define EXT3_MOUNT_UPDATE_JOURNAL 0x01000 /* Update the journal format */ +#define EXT3_MOUNT_NO_UID32 0x02000 /* Disable 32-bit UIDs */ +#define EXT3_MOUNT_XATTR_USER 0x04000 /* Extended user attributes */ +#define EXT3_MOUNT_POSIX_ACL 0x08000 /* POSIX Access Control Lists */ +#define EXT3_MOUNT_RESERVATION 0x10000 /* Preallocation */ +#define EXT3_MOUNT_BARRIER 0x20000 /* Use block barriers */ +#define EXT3_MOUNT_QUOTA 0x80000 /* Some quota option set */ +#define EXT3_MOUNT_USRQUOTA 0x100000 /* "old" user quota */ +#define EXT3_MOUNT_GRPQUOTA 0x200000 /* "old" group quota */ +#define EXT3_MOUNT_DATA_ERR_ABORT 0x400000 /* Abort on file data write + * error in ordered mode */ + +/* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ +#ifndef _LINUX_EXT2_FS_H +#define clear_opt(o, opt) o &= ~EXT3_MOUNT_##opt +#define set_opt(o, opt) o |= EXT3_MOUNT_##opt +#define test_opt(sb, opt) (EXT3_SB(sb)->s_mount_opt & \ + EXT3_MOUNT_##opt) +#else +#define EXT2_MOUNT_NOLOAD EXT3_MOUNT_NOLOAD +#define EXT2_MOUNT_ABORT EXT3_MOUNT_ABORT +#define EXT2_MOUNT_DATA_FLAGS EXT3_MOUNT_DATA_FLAGS +#endif + +#define ext3_set_bit __set_bit_le +#define ext3_set_bit_atomic ext2_set_bit_atomic +#define ext3_clear_bit __clear_bit_le +#define ext3_clear_bit_atomic ext2_clear_bit_atomic +#define ext3_test_bit test_bit_le +#define ext3_find_next_zero_bit find_next_zero_bit_le + +/* + * Maximal mount counts between two filesystem checks + */ +#define EXT3_DFL_MAX_MNT_COUNT 20 /* Allow 20 mounts */ +#define EXT3_DFL_CHECKINTERVAL 0 /* Don't use interval check */ + +/* + * Behaviour when detecting errors + */ +#define EXT3_ERRORS_CONTINUE 1 /* Continue execution */ +#define EXT3_ERRORS_RO 2 /* Remount fs read-only */ +#define EXT3_ERRORS_PANIC 3 /* Panic */ +#define EXT3_ERRORS_DEFAULT EXT3_ERRORS_CONTINUE + +/* + * Structure of the super block + */ +struct ext3_super_block { +/*00*/ __le32 s_inodes_count; /* Inodes count */ + __le32 s_blocks_count; /* Blocks count */ + __le32 s_r_blocks_count; /* Reserved blocks count */ + __le32 s_free_blocks_count; /* Free blocks count */ +/*10*/ __le32 s_free_inodes_count; /* Free inodes count */ + __le32 s_first_data_block; /* First Data Block */ + __le32 s_log_block_size; /* Block size */ + __le32 s_log_frag_size; /* Fragment size */ +/*20*/ __le32 s_blocks_per_group; /* # Blocks per group */ + __le32 s_frags_per_group; /* # Fragments per group */ + __le32 s_inodes_per_group; /* # Inodes per group */ + __le32 s_mtime; /* Mount time */ +/*30*/ __le32 s_wtime; /* Write time */ + __le16 s_mnt_count; /* Mount count */ + __le16 s_max_mnt_count; /* Maximal mount count */ + __le16 s_magic; /* Magic signature */ + __le16 s_state; /* File system state */ + __le16 s_errors; /* Behaviour when detecting errors */ + __le16 s_minor_rev_level; /* minor revision level */ +/*40*/ __le32 s_lastcheck; /* time of last check */ + __le32 s_checkinterval; /* max. time between checks */ + __le32 s_creator_os; /* OS */ + __le32 s_rev_level; /* Revision level */ +/*50*/ __le16 s_def_resuid; /* Default uid for reserved blocks */ + __le16 s_def_resgid; /* Default gid for reserved blocks */ + /* + * These fields are for EXT3_DYNAMIC_REV superblocks only. + * + * Note: the difference between the compatible feature set and + * the incompatible feature set is that if there is a bit set + * in the incompatible feature set that the kernel doesn't + * know about, it should refuse to mount the filesystem. + * + * e2fsck's requirements are more strict; if it doesn't know + * about a feature in either the compatible or incompatible + * feature set, it must abort and not try to meddle with + * things it doesn't understand... + */ + __le32 s_first_ino; /* First non-reserved inode */ + __le16 s_inode_size; /* size of inode structure */ + __le16 s_block_group_nr; /* block group # of this superblock */ + __le32 s_feature_compat; /* compatible feature set */ +/*60*/ __le32 s_feature_incompat; /* incompatible feature set */ + __le32 s_feature_ro_compat; /* readonly-compatible feature set */ +/*68*/ __u8 s_uuid[16]; /* 128-bit uuid for volume */ +/*78*/ char s_volume_name[16]; /* volume name */ +/*88*/ char s_last_mounted[64]; /* directory where last mounted */ +/*C8*/ __le32 s_algorithm_usage_bitmap; /* For compression */ + /* + * Performance hints. Directory preallocation should only + * happen if the EXT3_FEATURE_COMPAT_DIR_PREALLOC flag is on. + */ + __u8 s_prealloc_blocks; /* Nr of blocks to try to preallocate*/ + __u8 s_prealloc_dir_blocks; /* Nr to preallocate for dirs */ + __le16 s_reserved_gdt_blocks; /* Per group desc for online growth */ + /* + * Journaling support valid if EXT3_FEATURE_COMPAT_HAS_JOURNAL set. + */ +/*D0*/ __u8 s_journal_uuid[16]; /* uuid of journal superblock */ +/*E0*/ __le32 s_journal_inum; /* inode number of journal file */ + __le32 s_journal_dev; /* device number of journal file */ + __le32 s_last_orphan; /* start of list of inodes to delete */ + __le32 s_hash_seed[4]; /* HTREE hash seed */ + __u8 s_def_hash_version; /* Default hash version to use */ + __u8 s_reserved_char_pad; + __u16 s_reserved_word_pad; + __le32 s_default_mount_opts; + __le32 s_first_meta_bg; /* First metablock block group */ + __le32 s_mkfs_time; /* When the filesystem was created */ + __le32 s_jnl_blocks[17]; /* Backup of the journal inode */ + /* 64bit support valid if EXT4_FEATURE_COMPAT_64BIT */ +/*150*/ __le32 s_blocks_count_hi; /* Blocks count */ + __le32 s_r_blocks_count_hi; /* Reserved blocks count */ + __le32 s_free_blocks_count_hi; /* Free blocks count */ + __le16 s_min_extra_isize; /* All inodes have at least # bytes */ + __le16 s_want_extra_isize; /* New inodes should reserve # bytes */ + __le32 s_flags; /* Miscellaneous flags */ + __le16 s_raid_stride; /* RAID stride */ + __le16 s_mmp_interval; /* # seconds to wait in MMP checking */ + __le64 s_mmp_block; /* Block for multi-mount protection */ + __le32 s_raid_stripe_width; /* blocks on all data disks (N*stride)*/ + __u8 s_log_groups_per_flex; /* FLEX_BG group size */ + __u8 s_reserved_char_pad2; + __le16 s_reserved_pad; + __u32 s_reserved[162]; /* Padding to the end of the block */ +}; + +/* data type for block offset of block group */ +typedef int ext3_grpblk_t; + +/* data type for filesystem-wide blocks number */ +typedef unsigned long ext3_fsblk_t; + +#define E3FSBLK "%lu" + +struct ext3_reserve_window { + ext3_fsblk_t _rsv_start; /* First byte reserved */ + ext3_fsblk_t _rsv_end; /* Last byte reserved or 0 */ +}; + +struct ext3_reserve_window_node { + struct rb_node rsv_node; + __u32 rsv_goal_size; + __u32 rsv_alloc_hit; + struct ext3_reserve_window rsv_window; +}; + +struct ext3_block_alloc_info { + /* information about reservation window */ + struct ext3_reserve_window_node rsv_window_node; + /* + * was i_next_alloc_block in ext3_inode_info + * is the logical (file-relative) number of the + * most-recently-allocated block in this file. + * We use this for detecting linearly ascending allocation requests. + */ + __u32 last_alloc_logical_block; + /* + * Was i_next_alloc_goal in ext3_inode_info + * is the *physical* companion to i_next_alloc_block. + * it the physical block number of the block which was most-recentl + * allocated to this file. This give us the goal (target) for the next + * allocation when we detect linearly ascending requests. + */ + ext3_fsblk_t last_alloc_physical_block; +}; + +#define rsv_start rsv_window._rsv_start +#define rsv_end rsv_window._rsv_end + +/* + * third extended file system inode data in memory + */ +struct ext3_inode_info { + __le32 i_data[15]; /* unconverted */ + __u32 i_flags; +#ifdef EXT3_FRAGMENTS + __u32 i_faddr; + __u8 i_frag_no; + __u8 i_frag_size; +#endif + ext3_fsblk_t i_file_acl; + __u32 i_dir_acl; + __u32 i_dtime; + + /* + * i_block_group is the number of the block group which contains + * this file's inode. Constant across the lifetime of the inode, + * it is ued for making block allocation decisions - we try to + * place a file's data blocks near its inode block, and new inodes + * near to their parent directory's inode. + */ + __u32 i_block_group; + unsigned long i_state_flags; /* Dynamic state flags for ext3 */ + + /* block reservation info */ + struct ext3_block_alloc_info *i_block_alloc_info; + + __u32 i_dir_start_lookup; +#ifdef CONFIG_EXT3_FS_XATTR + /* + * Extended attributes can be read independently of the main file + * data. Taking i_mutex even when reading would cause contention + * between readers of EAs and writers of regular file data, so + * instead we synchronize on xattr_sem when reading or changing + * EAs. + */ + struct rw_semaphore xattr_sem; +#endif + + struct list_head i_orphan; /* unlinked but open inodes */ + + /* + * i_disksize keeps track of what the inode size is ON DISK, not + * in memory. During truncate, i_size is set to the new size by + * the VFS prior to calling ext3_truncate(), but the filesystem won't + * set i_disksize to 0 until the truncate is actually under way. + * + * The intent is that i_disksize always represents the blocks which + * are used by this file. This allows recovery to restart truncate + * on orphans if we crash during truncate. We actually write i_disksize + * into the on-disk inode when writing inodes out, instead of i_size. + * + * The only time when i_disksize and i_size may be different is when + * a truncate is in progress. The only things which change i_disksize + * are ext3_get_block (growth) and ext3_truncate (shrinkth). + */ + loff_t i_disksize; + + /* on-disk additional length */ + __u16 i_extra_isize; + + /* + * truncate_mutex is for serialising ext3_truncate() against + * ext3_getblock(). In the 2.4 ext2 design, great chunks of inode's + * data tree are chopped off during truncate. We can't do that in + * ext3 because whenever we perform intermediate commits during + * truncate, the inode and all the metadata blocks *must* be in a + * consistent state which allows truncation of the orphans to restart + * during recovery. Hence we must fix the get_block-vs-truncate race + * by other means, so we have truncate_mutex. + */ + struct mutex truncate_mutex; + + /* + * Transactions that contain inode's metadata needed to complete + * fsync and fdatasync, respectively. + */ + atomic_t i_sync_tid; + atomic_t i_datasync_tid; + + struct inode vfs_inode; +}; + +/* + * third extended-fs super-block data in memory + */ +struct ext3_sb_info { + unsigned long s_frag_size; /* Size of a fragment in bytes */ + unsigned long s_frags_per_block;/* Number of fragments per block */ + unsigned long s_inodes_per_block;/* Number of inodes per block */ + unsigned long s_frags_per_group;/* Number of fragments in a group */ + unsigned long s_blocks_per_group;/* Number of blocks in a group */ + unsigned long s_inodes_per_group;/* Number of inodes in a group */ + unsigned long s_itb_per_group; /* Number of inode table blocks per group */ + unsigned long s_gdb_count; /* Number of group descriptor blocks */ + unsigned long s_desc_per_block; /* Number of group descriptors per block */ + unsigned long s_groups_count; /* Number of groups in the fs */ + unsigned long s_overhead_last; /* Last calculated overhead */ + unsigned long s_blocks_last; /* Last seen block count */ + struct buffer_head * s_sbh; /* Buffer containing the super block */ + struct ext3_super_block * s_es; /* Pointer to the super block in the buffer */ + struct buffer_head ** s_group_desc; + unsigned long s_mount_opt; + ext3_fsblk_t s_sb_block; + kuid_t s_resuid; + kgid_t s_resgid; + unsigned short s_mount_state; + unsigned short s_pad; + int s_addr_per_block_bits; + int s_desc_per_block_bits; + int s_inode_size; + int s_first_ino; + spinlock_t s_next_gen_lock; + u32 s_next_generation; + u32 s_hash_seed[4]; + int s_def_hash_version; + int s_hash_unsigned; /* 3 if hash should be signed, 0 if not */ + struct percpu_counter s_freeblocks_counter; + struct percpu_counter s_freeinodes_counter; + struct percpu_counter s_dirs_counter; + struct blockgroup_lock *s_blockgroup_lock; + + /* root of the per fs reservation window tree */ + spinlock_t s_rsv_window_lock; + struct rb_root s_rsv_window_root; + struct ext3_reserve_window_node s_rsv_window_head; + + /* Journaling */ + struct inode * s_journal_inode; + struct journal_s * s_journal; + struct list_head s_orphan; + struct mutex s_orphan_lock; + struct mutex s_resize_lock; + unsigned long s_commit_interval; + struct block_device *journal_bdev; +#ifdef CONFIG_QUOTA + char *s_qf_names[MAXQUOTAS]; /* Names of quota files with journalled quota */ + int s_jquota_fmt; /* Format of quota to use */ +#endif +}; + +static inline spinlock_t * +sb_bgl_lock(struct ext3_sb_info *sbi, unsigned int block_group) +{ + return bgl_lock_ptr(sbi->s_blockgroup_lock, block_group); +} + +static inline struct ext3_sb_info * EXT3_SB(struct super_block *sb) +{ + return sb->s_fs_info; +} +static inline struct ext3_inode_info *EXT3_I(struct inode *inode) +{ + return container_of(inode, struct ext3_inode_info, vfs_inode); +} + +static inline int ext3_valid_inum(struct super_block *sb, unsigned long ino) +{ + return ino == EXT3_ROOT_INO || + ino == EXT3_JOURNAL_INO || + ino == EXT3_RESIZE_INO || + (ino >= EXT3_FIRST_INO(sb) && + ino <= le32_to_cpu(EXT3_SB(sb)->s_es->s_inodes_count)); +} + +/* + * Inode dynamic state flags + */ +enum { + EXT3_STATE_JDATA, /* journaled data exists */ + EXT3_STATE_NEW, /* inode is newly created */ + EXT3_STATE_XATTR, /* has in-inode xattrs */ + EXT3_STATE_FLUSH_ON_CLOSE, /* flush dirty pages on close */ +}; + +static inline int ext3_test_inode_state(struct inode *inode, int bit) +{ + return test_bit(bit, &EXT3_I(inode)->i_state_flags); +} + +static inline void ext3_set_inode_state(struct inode *inode, int bit) +{ + set_bit(bit, &EXT3_I(inode)->i_state_flags); +} + +static inline void ext3_clear_inode_state(struct inode *inode, int bit) +{ + clear_bit(bit, &EXT3_I(inode)->i_state_flags); +} + +#define NEXT_ORPHAN(inode) EXT3_I(inode)->i_dtime + +/* + * Codes for operating systems + */ +#define EXT3_OS_LINUX 0 +#define EXT3_OS_HURD 1 +#define EXT3_OS_MASIX 2 +#define EXT3_OS_FREEBSD 3 +#define EXT3_OS_LITES 4 + +/* + * Revision levels + */ +#define EXT3_GOOD_OLD_REV 0 /* The good old (original) format */ +#define EXT3_DYNAMIC_REV 1 /* V2 format w/ dynamic inode sizes */ + +#define EXT3_CURRENT_REV EXT3_GOOD_OLD_REV +#define EXT3_MAX_SUPP_REV EXT3_DYNAMIC_REV + +#define EXT3_GOOD_OLD_INODE_SIZE 128 + +/* + * Feature set definitions + */ + +#define EXT3_HAS_COMPAT_FEATURE(sb,mask) \ + ( EXT3_SB(sb)->s_es->s_feature_compat & cpu_to_le32(mask) ) +#define EXT3_HAS_RO_COMPAT_FEATURE(sb,mask) \ + ( EXT3_SB(sb)->s_es->s_feature_ro_compat & cpu_to_le32(mask) ) +#define EXT3_HAS_INCOMPAT_FEATURE(sb,mask) \ + ( EXT3_SB(sb)->s_es->s_feature_incompat & cpu_to_le32(mask) ) +#define EXT3_SET_COMPAT_FEATURE(sb,mask) \ + EXT3_SB(sb)->s_es->s_feature_compat |= cpu_to_le32(mask) +#define EXT3_SET_RO_COMPAT_FEATURE(sb,mask) \ + EXT3_SB(sb)->s_es->s_feature_ro_compat |= cpu_to_le32(mask) +#define EXT3_SET_INCOMPAT_FEATURE(sb,mask) \ + EXT3_SB(sb)->s_es->s_feature_incompat |= cpu_to_le32(mask) +#define EXT3_CLEAR_COMPAT_FEATURE(sb,mask) \ + EXT3_SB(sb)->s_es->s_feature_compat &= ~cpu_to_le32(mask) +#define EXT3_CLEAR_RO_COMPAT_FEATURE(sb,mask) \ + EXT3_SB(sb)->s_es->s_feature_ro_compat &= ~cpu_to_le32(mask) +#define EXT3_CLEAR_INCOMPAT_FEATURE(sb,mask) \ + EXT3_SB(sb)->s_es->s_feature_incompat &= ~cpu_to_le32(mask) + +#define EXT3_FEATURE_COMPAT_DIR_PREALLOC 0x0001 +#define EXT3_FEATURE_COMPAT_IMAGIC_INODES 0x0002 +#define EXT3_FEATURE_COMPAT_HAS_JOURNAL 0x0004 +#define EXT3_FEATURE_COMPAT_EXT_ATTR 0x0008 +#define EXT3_FEATURE_COMPAT_RESIZE_INODE 0x0010 +#define EXT3_FEATURE_COMPAT_DIR_INDEX 0x0020 + +#define EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER 0x0001 +#define EXT3_FEATURE_RO_COMPAT_LARGE_FILE 0x0002 +#define EXT3_FEATURE_RO_COMPAT_BTREE_DIR 0x0004 + +#define EXT3_FEATURE_INCOMPAT_COMPRESSION 0x0001 +#define EXT3_FEATURE_INCOMPAT_FILETYPE 0x0002 +#define EXT3_FEATURE_INCOMPAT_RECOVER 0x0004 /* Needs recovery */ +#define EXT3_FEATURE_INCOMPAT_JOURNAL_DEV 0x0008 /* Journal device */ +#define EXT3_FEATURE_INCOMPAT_META_BG 0x0010 + +#define EXT3_FEATURE_COMPAT_SUPP EXT2_FEATURE_COMPAT_EXT_ATTR +#define EXT3_FEATURE_INCOMPAT_SUPP (EXT3_FEATURE_INCOMPAT_FILETYPE| \ + EXT3_FEATURE_INCOMPAT_RECOVER| \ + EXT3_FEATURE_INCOMPAT_META_BG) +#define EXT3_FEATURE_RO_COMPAT_SUPP (EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER| \ + EXT3_FEATURE_RO_COMPAT_LARGE_FILE| \ + EXT3_FEATURE_RO_COMPAT_BTREE_DIR) + +/* + * Default values for user and/or group using reserved blocks + */ +#define EXT3_DEF_RESUID 0 +#define EXT3_DEF_RESGID 0 + +/* + * Default mount options + */ +#define EXT3_DEFM_DEBUG 0x0001 +#define EXT3_DEFM_BSDGROUPS 0x0002 +#define EXT3_DEFM_XATTR_USER 0x0004 +#define EXT3_DEFM_ACL 0x0008 +#define EXT3_DEFM_UID16 0x0010 +#define EXT3_DEFM_JMODE 0x0060 +#define EXT3_DEFM_JMODE_DATA 0x0020 +#define EXT3_DEFM_JMODE_ORDERED 0x0040 +#define EXT3_DEFM_JMODE_WBACK 0x0060 + +/* + * Structure of a directory entry + */ +#define EXT3_NAME_LEN 255 + +struct ext3_dir_entry { + __le32 inode; /* Inode number */ + __le16 rec_len; /* Directory entry length */ + __le16 name_len; /* Name length */ + char name[EXT3_NAME_LEN]; /* File name */ +}; + +/* + * The new version of the directory entry. Since EXT3 structures are + * stored in intel byte order, and the name_len field could never be + * bigger than 255 chars, it's safe to reclaim the extra byte for the + * file_type field. + */ +struct ext3_dir_entry_2 { + __le32 inode; /* Inode number */ + __le16 rec_len; /* Directory entry length */ + __u8 name_len; /* Name length */ + __u8 file_type; + char name[EXT3_NAME_LEN]; /* File name */ +}; + +/* + * Ext3 directory file types. Only the low 3 bits are used. The + * other bits are reserved for now. + */ +#define EXT3_FT_UNKNOWN 0 +#define EXT3_FT_REG_FILE 1 +#define EXT3_FT_DIR 2 +#define EXT3_FT_CHRDEV 3 +#define EXT3_FT_BLKDEV 4 +#define EXT3_FT_FIFO 5 +#define EXT3_FT_SOCK 6 +#define EXT3_FT_SYMLINK 7 + +#define EXT3_FT_MAX 8 + +/* + * EXT3_DIR_PAD defines the directory entries boundaries + * + * NOTE: It must be a multiple of 4 + */ +#define EXT3_DIR_PAD 4 +#define EXT3_DIR_ROUND (EXT3_DIR_PAD - 1) +#define EXT3_DIR_REC_LEN(name_len) (((name_len) + 8 + EXT3_DIR_ROUND) & \ + ~EXT3_DIR_ROUND) +#define EXT3_MAX_REC_LEN ((1<<16)-1) + +/* + * Tests against MAX_REC_LEN etc were put in place for 64k block + * sizes; if that is not possible on this arch, we can skip + * those tests and speed things up. + */ +static inline unsigned ext3_rec_len_from_disk(__le16 dlen) +{ + unsigned len = le16_to_cpu(dlen); + +#if (PAGE_CACHE_SIZE >= 65536) + if (len == EXT3_MAX_REC_LEN) + return 1 << 16; +#endif + return len; +} + +static inline __le16 ext3_rec_len_to_disk(unsigned len) +{ +#if (PAGE_CACHE_SIZE >= 65536) + if (len == (1 << 16)) + return cpu_to_le16(EXT3_MAX_REC_LEN); + else if (len > (1 << 16)) + BUG(); +#endif + return cpu_to_le16(len); +} + +/* + * Hash Tree Directory indexing + * (c) Daniel Phillips, 2001 + */ + +#define is_dx(dir) (EXT3_HAS_COMPAT_FEATURE(dir->i_sb, \ + EXT3_FEATURE_COMPAT_DIR_INDEX) && \ + (EXT3_I(dir)->i_flags & EXT3_INDEX_FL)) +#define EXT3_DIR_LINK_MAX(dir) (!is_dx(dir) && (dir)->i_nlink >= EXT3_LINK_MAX) +#define EXT3_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || (dir)->i_nlink == 1) + +/* Legal values for the dx_root hash_version field: */ + +#define DX_HASH_LEGACY 0 +#define DX_HASH_HALF_MD4 1 +#define DX_HASH_TEA 2 +#define DX_HASH_LEGACY_UNSIGNED 3 +#define DX_HASH_HALF_MD4_UNSIGNED 4 +#define DX_HASH_TEA_UNSIGNED 5 + +/* hash info structure used by the directory hash */ +struct dx_hash_info +{ + u32 hash; + u32 minor_hash; + int hash_version; + u32 *seed; +}; + + +/* 32 and 64 bit signed EOF for dx directories */ +#define EXT3_HTREE_EOF_32BIT ((1UL << (32 - 1)) - 1) +#define EXT3_HTREE_EOF_64BIT ((1ULL << (64 - 1)) - 1) + + +/* + * Control parameters used by ext3_htree_next_block + */ +#define HASH_NB_ALWAYS 1 + + +/* + * Describe an inode's exact location on disk and in memory + */ +struct ext3_iloc +{ + struct buffer_head *bh; + unsigned long offset; + unsigned long block_group; +}; + +static inline struct ext3_inode *ext3_raw_inode(struct ext3_iloc *iloc) +{ + return (struct ext3_inode *) (iloc->bh->b_data + iloc->offset); +} + +/* + * This structure is stuffed into the struct file's private_data field + * for directories. It is where we put information so that we can do + * readdir operations in hash tree order. + */ +struct dir_private_info { + struct rb_root root; + struct rb_node *curr_node; + struct fname *extra_fname; + loff_t last_pos; + __u32 curr_hash; + __u32 curr_minor_hash; + __u32 next_hash; +}; + +/* calculate the first block number of the group */ +static inline ext3_fsblk_t +ext3_group_first_block_no(struct super_block *sb, unsigned long group_no) +{ + return group_no * (ext3_fsblk_t)EXT3_BLOCKS_PER_GROUP(sb) + + le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block); +} + +/* + * Special error return code only used by dx_probe() and its callers. + */ +#define ERR_BAD_DX_DIR -75000 + +/* + * Function prototypes + */ + +/* + * Ok, these declarations are also in <linux/kernel.h> but none of the + * ext3 source programs needs to include it so they are duplicated here. + */ +# define NORET_TYPE /**/ +# define ATTRIB_NORET __attribute__((noreturn)) +# define NORET_AND noreturn, + +/* balloc.c */ +extern int ext3_bg_has_super(struct super_block *sb, int group); +extern unsigned long ext3_bg_num_gdb(struct super_block *sb, int group); +extern ext3_fsblk_t ext3_new_block (handle_t *handle, struct inode *inode, + ext3_fsblk_t goal, int *errp); +extern ext3_fsblk_t ext3_new_blocks (handle_t *handle, struct inode *inode, + ext3_fsblk_t goal, unsigned long *count, int *errp); +extern void ext3_free_blocks (handle_t *handle, struct inode *inode, + ext3_fsblk_t block, unsigned long count); +extern void ext3_free_blocks_sb (handle_t *handle, struct super_block *sb, + ext3_fsblk_t block, unsigned long count, + unsigned long *pdquot_freed_blocks); +extern ext3_fsblk_t ext3_count_free_blocks (struct super_block *); +extern void ext3_check_blocks_bitmap (struct super_block *); +extern struct ext3_group_desc * ext3_get_group_desc(struct super_block * sb, + unsigned int block_group, + struct buffer_head ** bh); +extern int ext3_should_retry_alloc(struct super_block *sb, int *retries); +extern void ext3_init_block_alloc_info(struct inode *); +extern void ext3_rsv_window_add(struct super_block *sb, struct ext3_reserve_window_node *rsv); +extern int ext3_trim_fs(struct super_block *sb, struct fstrim_range *range); + +/* dir.c */ +extern int ext3_check_dir_entry(const char *, struct inode *, + struct ext3_dir_entry_2 *, + struct buffer_head *, unsigned long); +extern int ext3_htree_store_dirent(struct file *dir_file, __u32 hash, + __u32 minor_hash, + struct ext3_dir_entry_2 *dirent); +extern void ext3_htree_free_dir_info(struct dir_private_info *p); + +/* fsync.c */ +extern int ext3_sync_file(struct file *, loff_t, loff_t, int); + +/* hash.c */ +extern int ext3fs_dirhash(const char *name, int len, struct + dx_hash_info *hinfo); + +/* ialloc.c */ +extern struct inode * ext3_new_inode (handle_t *, struct inode *, + const struct qstr *, umode_t); +extern void ext3_free_inode (handle_t *, struct inode *); +extern struct inode * ext3_orphan_get (struct super_block *, unsigned long); +extern unsigned long ext3_count_free_inodes (struct super_block *); +extern unsigned long ext3_count_dirs (struct super_block *); +extern void ext3_check_inodes_bitmap (struct super_block *); +extern unsigned long ext3_count_free (struct buffer_head *, unsigned); + + +/* inode.c */ +int ext3_forget(handle_t *handle, int is_metadata, struct inode *inode, + struct buffer_head *bh, ext3_fsblk_t blocknr); +struct buffer_head * ext3_getblk (handle_t *, struct inode *, long, int, int *); +struct buffer_head * ext3_bread (handle_t *, struct inode *, int, int, int *); +int ext3_get_blocks_handle(handle_t *handle, struct inode *inode, + sector_t iblock, unsigned long maxblocks, struct buffer_head *bh_result, + int create); + +extern struct inode *ext3_iget(struct super_block *, unsigned long); +extern int ext3_write_inode (struct inode *, struct writeback_control *); +extern int ext3_setattr (struct dentry *, struct iattr *); +extern void ext3_evict_inode (struct inode *); +extern int ext3_sync_inode (handle_t *, struct inode *); +extern void ext3_discard_reservation (struct inode *); +extern void ext3_dirty_inode(struct inode *, int); +extern int ext3_change_inode_journal_flag(struct inode *, int); +extern int ext3_get_inode_loc(struct inode *, struct ext3_iloc *); +extern int ext3_can_truncate(struct inode *inode); +extern void ext3_truncate(struct inode *inode); +extern void ext3_set_inode_flags(struct inode *); +extern void ext3_get_inode_flags(struct ext3_inode_info *); +extern void ext3_set_aops(struct inode *inode); +extern int ext3_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, + u64 start, u64 len); + +/* ioctl.c */ +extern long ext3_ioctl(struct file *, unsigned int, unsigned long); +extern long ext3_compat_ioctl(struct file *, unsigned int, unsigned long); + +/* namei.c */ +extern int ext3_orphan_add(handle_t *, struct inode *); +extern int ext3_orphan_del(handle_t *, struct inode *); +extern int ext3_htree_fill_tree(struct file *dir_file, __u32 start_hash, + __u32 start_minor_hash, __u32 *next_hash); + +/* resize.c */ +extern int ext3_group_add(struct super_block *sb, + struct ext3_new_group_data *input); +extern int ext3_group_extend(struct super_block *sb, + struct ext3_super_block *es, + ext3_fsblk_t n_blocks_count); + +/* super.c */ +extern __printf(3, 4) +void ext3_error(struct super_block *, const char *, const char *, ...); +extern void __ext3_std_error (struct super_block *, const char *, int); +extern __printf(3, 4) +void ext3_abort(struct super_block *, const char *, const char *, ...); +extern __printf(3, 4) +void ext3_warning(struct super_block *, const char *, const char *, ...); +extern __printf(3, 4) +void ext3_msg(struct super_block *, const char *, const char *, ...); +extern void ext3_update_dynamic_rev (struct super_block *sb); + +#define ext3_std_error(sb, errno) \ +do { \ + if ((errno)) \ + __ext3_std_error((sb), __func__, (errno)); \ +} while (0) + +/* + * Inodes and files operations + */ + +/* dir.c */ +extern const struct file_operations ext3_dir_operations; + +/* file.c */ +extern const struct inode_operations ext3_file_inode_operations; +extern const struct file_operations ext3_file_operations; + +/* namei.c */ +extern const struct inode_operations ext3_dir_inode_operations; +extern const struct inode_operations ext3_special_inode_operations; + +/* symlink.c */ +extern const struct inode_operations ext3_symlink_inode_operations; +extern const struct inode_operations ext3_fast_symlink_inode_operations; + +#define EXT3_JOURNAL(inode) (EXT3_SB((inode)->i_sb)->s_journal) + +/* Define the number of blocks we need to account to a transaction to + * modify one block of data. + * + * We may have to touch one inode, one bitmap buffer, up to three + * indirection blocks, the group and superblock summaries, and the data + * block to complete the transaction. */ + +#define EXT3_SINGLEDATA_TRANS_BLOCKS 8U + +/* Extended attribute operations touch at most two data buffers, + * two bitmap buffers, and two group summaries, in addition to the inode + * and the superblock, which are already accounted for. */ + +#define EXT3_XATTR_TRANS_BLOCKS 6U + +/* Define the minimum size for a transaction which modifies data. This + * needs to take into account the fact that we may end up modifying two + * quota files too (one for the group, one for the user quota). The + * superblock only gets updated once, of course, so don't bother + * counting that again for the quota updates. */ + +#define EXT3_DATA_TRANS_BLOCKS(sb) (EXT3_SINGLEDATA_TRANS_BLOCKS + \ + EXT3_XATTR_TRANS_BLOCKS - 2 + \ + EXT3_MAXQUOTAS_TRANS_BLOCKS(sb)) + +/* Delete operations potentially hit one directory's namespace plus an + * entire inode, plus arbitrary amounts of bitmap/indirection data. Be + * generous. We can grow the delete transaction later if necessary. */ + +#define EXT3_DELETE_TRANS_BLOCKS(sb) (EXT3_MAXQUOTAS_TRANS_BLOCKS(sb) + 64) + +/* Define an arbitrary limit for the amount of data we will anticipate + * writing to any given transaction. For unbounded transactions such as + * write(2) and truncate(2) we can write more than this, but we always + * start off at the maximum transaction size and grow the transaction + * optimistically as we go. */ + +#define EXT3_MAX_TRANS_DATA 64U + +/* We break up a large truncate or write transaction once the handle's + * buffer credits gets this low, we need either to extend the + * transaction or to start a new one. Reserve enough space here for + * inode, bitmap, superblock, group and indirection updates for at least + * one block, plus two quota updates. Quota allocations are not + * needed. */ + +#define EXT3_RESERVE_TRANS_BLOCKS 12U + +#define EXT3_INDEX_EXTRA_TRANS_BLOCKS 8 + +#ifdef CONFIG_QUOTA +/* Amount of blocks needed for quota update - we know that the structure was + * allocated so we need to update only inode+data */ +#define EXT3_QUOTA_TRANS_BLOCKS(sb) (test_opt(sb, QUOTA) ? 2 : 0) +/* Amount of blocks needed for quota insert/delete - we do some block writes + * but inode, sb and group updates are done only once */ +#define EXT3_QUOTA_INIT_BLOCKS(sb) (test_opt(sb, QUOTA) ? (DQUOT_INIT_ALLOC*\ + (EXT3_SINGLEDATA_TRANS_BLOCKS-3)+3+DQUOT_INIT_REWRITE) : 0) +#define EXT3_QUOTA_DEL_BLOCKS(sb) (test_opt(sb, QUOTA) ? (DQUOT_DEL_ALLOC*\ + (EXT3_SINGLEDATA_TRANS_BLOCKS-3)+3+DQUOT_DEL_REWRITE) : 0) +#else +#define EXT3_QUOTA_TRANS_BLOCKS(sb) 0 +#define EXT3_QUOTA_INIT_BLOCKS(sb) 0 +#define EXT3_QUOTA_DEL_BLOCKS(sb) 0 +#endif +#define EXT3_MAXQUOTAS_TRANS_BLOCKS(sb) (MAXQUOTAS*EXT3_QUOTA_TRANS_BLOCKS(sb)) +#define EXT3_MAXQUOTAS_INIT_BLOCKS(sb) (MAXQUOTAS*EXT3_QUOTA_INIT_BLOCKS(sb)) +#define EXT3_MAXQUOTAS_DEL_BLOCKS(sb) (MAXQUOTAS*EXT3_QUOTA_DEL_BLOCKS(sb)) + +int +ext3_mark_iloc_dirty(handle_t *handle, + struct inode *inode, + struct ext3_iloc *iloc); + +/* + * On success, We end up with an outstanding reference count against + * iloc->bh. This _must_ be cleaned up later. + */ + +int ext3_reserve_inode_write(handle_t *handle, struct inode *inode, + struct ext3_iloc *iloc); + +int ext3_mark_inode_dirty(handle_t *handle, struct inode *inode); + +/* + * Wrapper functions with which ext3 calls into JBD. The intent here is + * to allow these to be turned into appropriate stubs so ext3 can control + * ext2 filesystems, so ext2+ext3 systems only nee one fs. This work hasn't + * been done yet. + */ + +static inline void ext3_journal_release_buffer(handle_t *handle, + struct buffer_head *bh) +{ + journal_release_buffer(handle, bh); +} + +void ext3_journal_abort_handle(const char *caller, const char *err_fn, + struct buffer_head *bh, handle_t *handle, int err); + +int __ext3_journal_get_undo_access(const char *where, handle_t *handle, + struct buffer_head *bh); + +int __ext3_journal_get_write_access(const char *where, handle_t *handle, + struct buffer_head *bh); + +int __ext3_journal_forget(const char *where, handle_t *handle, + struct buffer_head *bh); + +int __ext3_journal_revoke(const char *where, handle_t *handle, + unsigned long blocknr, struct buffer_head *bh); + +int __ext3_journal_get_create_access(const char *where, + handle_t *handle, struct buffer_head *bh); + +int __ext3_journal_dirty_metadata(const char *where, + handle_t *handle, struct buffer_head *bh); + +#define ext3_journal_get_undo_access(handle, bh) \ + __ext3_journal_get_undo_access(__func__, (handle), (bh)) +#define ext3_journal_get_write_access(handle, bh) \ + __ext3_journal_get_write_access(__func__, (handle), (bh)) +#define ext3_journal_revoke(handle, blocknr, bh) \ + __ext3_journal_revoke(__func__, (handle), (blocknr), (bh)) +#define ext3_journal_get_create_access(handle, bh) \ + __ext3_journal_get_create_access(__func__, (handle), (bh)) +#define ext3_journal_dirty_metadata(handle, bh) \ + __ext3_journal_dirty_metadata(__func__, (handle), (bh)) +#define ext3_journal_forget(handle, bh) \ + __ext3_journal_forget(__func__, (handle), (bh)) + +int ext3_journal_dirty_data(handle_t *handle, struct buffer_head *bh); + +handle_t *ext3_journal_start_sb(struct super_block *sb, int nblocks); +int __ext3_journal_stop(const char *where, handle_t *handle); + +static inline handle_t *ext3_journal_start(struct inode *inode, int nblocks) +{ + return ext3_journal_start_sb(inode->i_sb, nblocks); +} + +#define ext3_journal_stop(handle) \ + __ext3_journal_stop(__func__, (handle)) + +static inline handle_t *ext3_journal_current_handle(void) +{ + return journal_current_handle(); +} + +static inline int ext3_journal_extend(handle_t *handle, int nblocks) +{ + return journal_extend(handle, nblocks); +} + +static inline int ext3_journal_restart(handle_t *handle, int nblocks) +{ + return journal_restart(handle, nblocks); +} + +static inline int ext3_journal_blocks_per_page(struct inode *inode) +{ + return journal_blocks_per_page(inode); +} + +static inline int ext3_journal_force_commit(journal_t *journal) +{ + return journal_force_commit(journal); +} + +/* super.c */ +int ext3_force_commit(struct super_block *sb); + +static inline int ext3_should_journal_data(struct inode *inode) +{ + if (!S_ISREG(inode->i_mode)) + return 1; + if (test_opt(inode->i_sb, DATA_FLAGS) == EXT3_MOUNT_JOURNAL_DATA) + return 1; + if (EXT3_I(inode)->i_flags & EXT3_JOURNAL_DATA_FL) + return 1; + return 0; +} + +static inline int ext3_should_order_data(struct inode *inode) +{ + if (!S_ISREG(inode->i_mode)) + return 0; + if (EXT3_I(inode)->i_flags & EXT3_JOURNAL_DATA_FL) + return 0; + if (test_opt(inode->i_sb, DATA_FLAGS) == EXT3_MOUNT_ORDERED_DATA) + return 1; + return 0; +} + +static inline int ext3_should_writeback_data(struct inode *inode) +{ + if (!S_ISREG(inode->i_mode)) + return 0; + if (EXT3_I(inode)->i_flags & EXT3_JOURNAL_DATA_FL) + return 0; + if (test_opt(inode->i_sb, DATA_FLAGS) == EXT3_MOUNT_WRITEBACK_DATA) + return 1; + return 0; +} + +#include <trace/events/ext3.h> diff --git a/fs/ext3/ext3_jbd.c b/fs/ext3/ext3_jbd.c index d401f148d74..785a3261a26 100644 --- a/fs/ext3/ext3_jbd.c +++ b/fs/ext3/ext3_jbd.c @@ -2,7 +2,7 @@ * Interface between ext3 and JBD */ -#include <linux/ext3_jbd.h> +#include "ext3.h" int __ext3_journal_get_undo_access(const char *where, handle_t *handle, struct buffer_head *bh) diff --git a/fs/ext3/file.c b/fs/ext3/file.c index f55df0e61cb..a062fa1e1b1 100644 --- a/fs/ext3/file.c +++ b/fs/ext3/file.c @@ -18,12 +18,8 @@ * (jj@sunsite.ms.mff.cuni.cz) */ -#include <linux/time.h> -#include <linux/fs.h> -#include <linux/jbd.h> #include <linux/quotaops.h> -#include <linux/ext3_fs.h> -#include <linux/ext3_jbd.h> +#include "ext3.h" #include "xattr.h" #include "acl.h" @@ -54,10 +50,10 @@ static int ext3_release_file (struct inode * inode, struct file * filp) const struct file_operations ext3_file_operations = { .llseek = generic_file_llseek, - .read = do_sync_read, - .write = do_sync_write, - .aio_read = generic_file_aio_read, - .aio_write = generic_file_aio_write, + .read = new_sync_read, + .write = new_sync_write, + .read_iter = generic_file_read_iter, + .write_iter = generic_file_write_iter, .unlocked_ioctl = ext3_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = ext3_compat_ioctl, @@ -67,11 +63,10 @@ const struct file_operations ext3_file_operations = { .release = ext3_release_file, .fsync = ext3_sync_file, .splice_read = generic_file_splice_read, - .splice_write = generic_file_splice_write, + .splice_write = iter_file_splice_write, }; const struct inode_operations ext3_file_inode_operations = { - .truncate = ext3_truncate, .setattr = ext3_setattr, #ifdef CONFIG_EXT3_FS_XATTR .setxattr = generic_setxattr, @@ -79,7 +74,8 @@ const struct inode_operations ext3_file_inode_operations = { .listxattr = ext3_listxattr, .removexattr = generic_removexattr, #endif - .check_acl = ext3_check_acl, + .get_acl = ext3_get_acl, + .set_acl = ext3_set_acl, .fiemap = ext3_fiemap, }; diff --git a/fs/ext3/fsync.c b/fs/ext3/fsync.c index 09b13bb34c9..1cb9c7e10c6 100644 --- a/fs/ext3/fsync.c +++ b/fs/ext3/fsync.c @@ -22,14 +22,9 @@ * we can depend on generic_block_fdatasync() to sync the data blocks. */ -#include <linux/time.h> #include <linux/blkdev.h> -#include <linux/fs.h> -#include <linux/sched.h> #include <linux/writeback.h> -#include <linux/jbd.h> -#include <linux/ext3_fs.h> -#include <linux/ext3_jbd.h> +#include "ext3.h" /* * akpm: A new design for ext3_sync_file(). @@ -43,7 +38,7 @@ * inode to disk. */ -int ext3_sync_file(struct file *file, int datasync) +int ext3_sync_file(struct file *file, loff_t start, loff_t end, int datasync) { struct inode *inode = file->f_mapping->host; struct ext3_inode_info *ei = EXT3_I(inode); @@ -51,8 +46,18 @@ int ext3_sync_file(struct file *file, int datasync) int ret, needs_barrier = 0; tid_t commit_tid; - if (inode->i_sb->s_flags & MS_RDONLY) + trace_ext3_sync_file_enter(file, datasync); + + if (inode->i_sb->s_flags & MS_RDONLY) { + /* Make sure that we read updated state */ + smp_rmb(); + if (EXT3_SB(inode->i_sb)->s_mount_state & EXT3_ERROR_FS) + return -EROFS; return 0; + } + ret = filemap_write_and_wait_range(inode->i_mapping, start, end); + if (ret) + goto out; J_ASSERT(ext3_journal_current_handle() == NULL); @@ -70,8 +75,10 @@ int ext3_sync_file(struct file *file, int datasync) * (they were dirtied by commit). But that's OK - the blocks are * safe in-journal, which is all fsync() needs to ensure. */ - if (ext3_should_journal_data(inode)) - return ext3_force_commit(inode->i_sb); + if (ext3_should_journal_data(inode)) { + ret = ext3_force_commit(inode->i_sb); + goto out; + } if (datasync) commit_tid = atomic_read(&ei->i_datasync_tid); @@ -89,7 +96,14 @@ int ext3_sync_file(struct file *file, int datasync) * disk caches manually so that data really is on persistent * storage */ - if (needs_barrier) - blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); + if (needs_barrier) { + int err; + + err = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); + if (!ret) + ret = err; + } +out: + trace_ext3_sync_file_exit(inode, ret); return ret; } diff --git a/fs/ext3/hash.c b/fs/ext3/hash.c index 7d215b4d4f2..ede315cdf12 100644 --- a/fs/ext3/hash.c +++ b/fs/ext3/hash.c @@ -9,9 +9,7 @@ * License. */ -#include <linux/fs.h> -#include <linux/jbd.h> -#include <linux/ext3_fs.h> +#include "ext3.h" #include <linux/cryptohash.h> #define DELTA 0x9E3779B9 @@ -200,8 +198,8 @@ int ext3fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo) return -1; } hash = hash & ~1; - if (hash == (EXT3_HTREE_EOF << 1)) - hash = (EXT3_HTREE_EOF-1) << 1; + if (hash == (EXT3_HTREE_EOF_32BIT << 1)) + hash = (EXT3_HTREE_EOF_32BIT - 1) << 1; hinfo->hash = hash; hinfo->minor_hash = minor_hash; return 0; diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c index 9724aef2246..a1b810230cc 100644 --- a/fs/ext3/ialloc.c +++ b/fs/ext3/ialloc.c @@ -12,20 +12,10 @@ * David S. Miller (davem@caip.rutgers.edu), 1995 */ -#include <linux/time.h> -#include <linux/fs.h> -#include <linux/jbd.h> -#include <linux/ext3_fs.h> -#include <linux/ext3_jbd.h> -#include <linux/stat.h> -#include <linux/string.h> #include <linux/quotaops.h> -#include <linux/buffer_head.h> #include <linux/random.h> -#include <linux/bitops.h> - -#include <asm/byteorder.h> +#include "ext3.h" #include "xattr.h" #include "acl.h" @@ -118,6 +108,7 @@ void ext3_free_inode (handle_t *handle, struct inode * inode) ino = inode->i_ino; ext3_debug ("freeing inode %lu\n", ino); + trace_ext3_free_inode(inode); is_directory = S_ISDIR(inode->i_mode); @@ -176,42 +167,6 @@ error_return: } /* - * There are two policies for allocating an inode. If the new inode is - * a directory, then a forward search is made for a block group with both - * free space and a low directory-to-inode ratio; if that fails, then of - * the groups with above-average free space, that group with the fewest - * directories already is chosen. - * - * For other inodes, search forward from the parent directory\'s block - * group to find a free inode. - */ -static int find_group_dir(struct super_block *sb, struct inode *parent) -{ - int ngroups = EXT3_SB(sb)->s_groups_count; - unsigned int freei, avefreei; - struct ext3_group_desc *desc, *best_desc = NULL; - int group, best_group = -1; - - freei = percpu_counter_read_positive(&EXT3_SB(sb)->s_freeinodes_counter); - avefreei = freei / ngroups; - - for (group = 0; group < ngroups; group++) { - desc = ext3_get_group_desc (sb, group, NULL); - if (!desc || !desc->bg_free_inodes_count) - continue; - if (le16_to_cpu(desc->bg_free_inodes_count) < avefreei) - continue; - if (!best_desc || - (le16_to_cpu(desc->bg_free_blocks_count) > - le16_to_cpu(best_desc->bg_free_blocks_count))) { - best_group = group; - best_desc = desc; - } - } - return best_group; -} - -/* * Orlov's allocator for directories. * * We always try to spread first-level directories. @@ -225,8 +180,7 @@ static int find_group_dir(struct super_block *sb, struct inode *parent) * It's OK to put directory into a group unless * it has too many directories already (max_dirs) or * it has too few free inodes left (min_inodes) or - * it has too few free blocks left (min_blocks) or - * it's already running too large debt (max_debt). + * it has too few free blocks left (min_blocks). * Parent's group is preferred, if it doesn't satisfy these * conditions we search cyclically through the rest. If none * of the groups look good we just look for a group with more @@ -236,21 +190,16 @@ static int find_group_dir(struct super_block *sb, struct inode *parent) * when we allocate an inode, within 0--255. */ -#define INODE_COST 64 -#define BLOCK_COST 256 - static int find_group_orlov(struct super_block *sb, struct inode *parent) { int parent_group = EXT3_I(parent)->i_block_group; struct ext3_sb_info *sbi = EXT3_SB(sb); - struct ext3_super_block *es = sbi->s_es; int ngroups = sbi->s_groups_count; int inodes_per_group = EXT3_INODES_PER_GROUP(sb); unsigned int freei, avefreei; ext3_fsblk_t freeb, avefreeb; - ext3_fsblk_t blocks_per_dir; unsigned int ndirs; - int max_debt, max_dirs, min_inodes; + int max_dirs, min_inodes; ext3_grpblk_t min_blocks; int group = -1, i; struct ext3_group_desc *desc; @@ -266,7 +215,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent) int best_ndir = inodes_per_group; int best_group = -1; - get_random_bytes(&group, sizeof(group)); + group = prandom_u32(); parent_group = (unsigned)group % ngroups; for (i = 0; i < ngroups; i++) { group = (parent_group + i) % ngroups; @@ -287,20 +236,10 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent) goto fallback; } - blocks_per_dir = (le32_to_cpu(es->s_blocks_count) - freeb) / ndirs; - max_dirs = ndirs / ngroups + inodes_per_group / 16; min_inodes = avefreei - inodes_per_group / 4; min_blocks = avefreeb - EXT3_BLOCKS_PER_GROUP(sb) / 4; - max_debt = EXT3_BLOCKS_PER_GROUP(sb) / max(blocks_per_dir, (ext3_fsblk_t)BLOCK_COST); - if (max_debt * INODE_COST > inodes_per_group) - max_debt = inodes_per_group / INODE_COST; - if (max_debt > 255) - max_debt = 255; - if (max_debt == 0) - max_debt = 1; - for (i = 0; i < ngroups; i++) { group = (parent_group + i) % ngroups; desc = ext3_get_group_desc (sb, group, NULL); @@ -404,7 +343,8 @@ static int find_group_other(struct super_block *sb, struct inode *parent) * For other inodes, search forward from the parent directory's block * group to find a free inode. */ -struct inode *ext3_new_inode(handle_t *handle, struct inode * dir, int mode) +struct inode *ext3_new_inode(handle_t *handle, struct inode * dir, + const struct qstr *qstr, umode_t mode) { struct super_block *sb; struct buffer_head *bitmap_bh = NULL; @@ -425,6 +365,7 @@ struct inode *ext3_new_inode(handle_t *handle, struct inode * dir, int mode) return ERR_PTR(-EPERM); sb = dir->i_sb; + trace_ext3_request_inode(dir, mode); inode = new_inode(sb); if (!inode) return ERR_PTR(-ENOMEM); @@ -432,12 +373,9 @@ struct inode *ext3_new_inode(handle_t *handle, struct inode * dir, int mode) sbi = EXT3_SB(sb); es = sbi->s_es; - if (S_ISDIR(mode)) { - if (test_opt (sb, OLDALLOC)) - group = find_group_dir(sb, dir); - else - group = find_group_orlov(sb, dir); - } else + if (S_ISDIR(mode)) + group = find_group_orlov(sb, dir); + else group = find_group_other(sb, dir); err = -ENOSPC; @@ -560,8 +498,12 @@ got: if (IS_DIRSYNC(inode)) handle->h_sync = 1; if (insert_inode_locked(inode) < 0) { - err = -EINVAL; - goto fail_drop; + /* + * Likely a bitmap corruption causing inode to be allocated + * twice. + */ + err = -EIO; + goto fail; } spin_lock(&sbi->s_next_gen_lock); inode->i_generation = sbi->s_next_generation++; @@ -589,7 +531,7 @@ got: if (err) goto fail_free_drop; - err = ext3_init_security(handle,inode, dir); + err = ext3_init_security(handle, inode, dir, qstr); if (err) goto fail_free_drop; @@ -600,6 +542,7 @@ got: } ext3_debug("allocating inode %lu\n", inode->i_ino); + trace_ext3_allocate_inode(inode, dir, mode); goto really_out; fail: ext3_std_error(sb, err); @@ -616,7 +559,7 @@ fail_free_drop: fail_drop: dquot_drop(inode); inode->i_flags |= S_NOQUOTA; - inode->i_nlink = 0; + clear_nlink(inode); unlock_new_inode(inode); iput(inode); brelse(bitmap_bh); diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index a9580617edd..2c6ccc49ba2 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -22,26 +22,18 @@ * Assorted race fixes, rewrite of ext3_get_block() by Al Viro, 2000 */ -#include <linux/module.h> -#include <linux/fs.h> -#include <linux/time.h> -#include <linux/ext3_jbd.h> -#include <linux/jbd.h> #include <linux/highuid.h> -#include <linux/pagemap.h> #include <linux/quotaops.h> -#include <linux/string.h> -#include <linux/buffer_head.h> #include <linux/writeback.h> #include <linux/mpage.h> -#include <linux/uio.h> -#include <linux/bio.h> -#include <linux/fiemap.h> #include <linux/namei.h> +#include <linux/aio.h> +#include "ext3.h" #include "xattr.h" #include "acl.h" static int ext3_writepage_trans_blocks(struct inode *inode); +static int ext3_block_truncate_page(struct inode *inode, loff_t from); /* * Test whether an inode is a fast symlink. @@ -70,6 +62,7 @@ int ext3_forget(handle_t *handle, int is_metadata, struct inode *inode, might_sleep(); + trace_ext3_forget(inode, is_metadata, blocknr); BUFFER_TRACE(bh, "enter"); jbd_debug(4, "forgetting bh %p: is_metadata = %d, mode %o, " @@ -194,20 +187,52 @@ static int truncate_restart_transaction(handle_t *handle, struct inode *inode) */ void ext3_evict_inode (struct inode *inode) { + struct ext3_inode_info *ei = EXT3_I(inode); struct ext3_block_alloc_info *rsv; handle_t *handle; int want_delete = 0; + trace_ext3_evict_inode(inode); if (!inode->i_nlink && !is_bad_inode(inode)) { dquot_initialize(inode); want_delete = 1; } - truncate_inode_pages(&inode->i_data, 0); + /* + * When journalling data dirty buffers are tracked only in the journal. + * So although mm thinks everything is clean and ready for reaping the + * inode might still have some pages to write in the running + * transaction or waiting to be checkpointed. Thus calling + * journal_invalidatepage() (via truncate_inode_pages()) to discard + * these buffers can cause data loss. Also even if we did not discard + * these buffers, we would have no way to find them after the inode + * is reaped and thus user could see stale data if he tries to read + * them before the transaction is checkpointed. So be careful and + * force everything to disk here... We use ei->i_datasync_tid to + * store the newest transaction containing inode's data. + * + * Note that directories do not have this problem because they don't + * use page cache. + * + * The s_journal check handles the case when ext3_get_journal() fails + * and puts the journal inode. + */ + if (inode->i_nlink && ext3_should_journal_data(inode) && + EXT3_SB(inode->i_sb)->s_journal && + (S_ISLNK(inode->i_mode) || S_ISREG(inode->i_mode)) && + inode->i_ino != EXT3_JOURNAL_INO) { + tid_t commit_tid = atomic_read(&ei->i_datasync_tid); + journal_t *journal = EXT3_SB(inode->i_sb)->s_journal; + + log_start_commit(journal, commit_tid); + log_wait_commit(journal, commit_tid); + filemap_write_and_wait(&inode->i_data); + } + truncate_inode_pages_final(&inode->i_data); ext3_discard_reservation(inode); - rsv = EXT3_I(inode)->i_block_alloc_info; - EXT3_I(inode)->i_block_alloc_info = NULL; + rsv = ei->i_block_alloc_info; + ei->i_block_alloc_info = NULL; if (unlikely(rsv)) kfree(rsv); @@ -231,15 +256,13 @@ void ext3_evict_inode (struct inode *inode) if (inode->i_blocks) ext3_truncate(inode); /* - * Kill off the orphan record which ext3_truncate created. - * AKPM: I think this can be inside the above `if'. - * Note that ext3_orphan_del() has to be able to cope with the - * deletion of a non-existent orphan - this is because we don't - * know if ext3_truncate() actually created an orphan record. - * (Well, we could do this if we need to, but heck - it works) + * Kill off the orphan record created when the inode lost the last + * link. Note that ext3_orphan_del() has to be able to cope with the + * deletion of a non-existent orphan - ext3_truncate() could + * have removed the record. */ ext3_orphan_del(handle, inode); - EXT3_I(inode)->i_dtime = get_seconds(); + ei->i_dtime = get_seconds(); /* * One subtle ordering requirement: if anything has gone wrong @@ -251,18 +274,18 @@ void ext3_evict_inode (struct inode *inode) if (ext3_mark_inode_dirty(handle, inode)) { /* If that failed, just dquot_drop() and be done with that */ dquot_drop(inode); - end_writeback(inode); + clear_inode(inode); } else { ext3_xattr_delete_inode(handle, inode); dquot_free_inode(inode); dquot_drop(inode); - end_writeback(inode); + clear_inode(inode); ext3_free_inode(handle, inode); } ext3_journal_stop(handle); return; no_delete: - end_writeback(inode); + clear_inode(inode); dquot_drop(inode); } @@ -655,6 +678,10 @@ static int ext3_alloc_branch(handle_t *handle, struct inode *inode, * parent to disk. */ bh = sb_getblk(inode->i_sb, new_blocks[n-1]); + if (unlikely(!bh)) { + err = -ENOMEM; + goto failed; + } branch[n].bh = bh; lock_buffer(bh); BUFFER_TRACE(bh, "call get_create_access"); @@ -696,7 +723,7 @@ failed: BUFFER_TRACE(branch[i].bh, "call journal_forget"); ext3_journal_forget(handle, branch[i].bh); } - for (i = 0; i <indirect_blks; i++) + for (i = 0; i < indirect_blks; i++) ext3_free_blocks(handle, inode, new_blocks[i], 1); ext3_free_blocks(handle, inode, new_blocks[i], num); @@ -725,6 +752,7 @@ static int ext3_splice_branch(handle_t *handle, struct inode *inode, struct ext3_block_alloc_info *block_i; ext3_fsblk_t current_block; struct ext3_inode_info *ei = EXT3_I(inode); + struct timespec now; block_i = ei->i_block_alloc_info; /* @@ -764,9 +792,11 @@ static int ext3_splice_branch(handle_t *handle, struct inode *inode, } /* We are done with atomic stuff, now do the rest of housekeeping */ - - inode->i_ctime = CURRENT_TIME_SEC; - ext3_mark_inode_dirty(handle, inode); + now = CURRENT_TIME_SEC; + if (!timespec_equal(&inode->i_ctime, &now) || !where->bh) { + inode->i_ctime = now; + ext3_mark_inode_dirty(handle, inode); + } /* ext3_mark_inode_dirty already updated i_sync_tid */ atomic_set(&ei->i_datasync_tid, handle->h_transaction->t_tid); @@ -842,6 +872,7 @@ int ext3_get_blocks_handle(handle_t *handle, struct inode *inode, ext3_fsblk_t first_block = 0; + trace_ext3_get_blocks_enter(inode, iblock, maxblocks, create); J_ASSERT(handle != NULL || create == 0); depth = ext3_block_to_path(inode,iblock,offsets,&blocks_to_boundary); @@ -886,6 +917,9 @@ int ext3_get_blocks_handle(handle_t *handle, struct inode *inode, if (!create || err == -EIO) goto cleanup; + /* + * Block out ext3_truncate while we alter the tree + */ mutex_lock(&ei->truncate_mutex); /* @@ -934,9 +968,6 @@ int ext3_get_blocks_handle(handle_t *handle, struct inode *inode, */ count = ext3_blks_to_allocate(partial, indirect_blks, maxblocks, blocks_to_boundary); - /* - * Block out ext3_truncate while we alter the tree - */ err = ext3_alloc_branch(handle, inode, indirect_blks, &count, goal, offsets + (partial - chain), partial); @@ -970,6 +1001,9 @@ cleanup: } BUFFER_TRACE(bh_result, "returned"); out: + trace_ext3_get_blocks_exit(inode, iblock, + depth ? le32_to_cpu(chain[depth-1].key) : 0, + count, err); return err; } @@ -1043,16 +1077,15 @@ struct buffer_head *ext3_getblk(handle_t *handle, struct inode *inode, * mapped. 0 in case of a HOLE. */ if (err > 0) { - if (err > 1) - WARN_ON(1); + WARN_ON(err > 1); err = 0; } *errp = err; if (!err && buffer_mapped(&dummy)) { struct buffer_head *bh; bh = sb_getblk(inode->i_sb, dummy.b_blocknr); - if (!bh) { - *errp = -EIO; + if (unlikely(!bh)) { + *errp = -ENOMEM; goto err; } if (buffer_new(&dummy)) { @@ -1100,9 +1133,11 @@ struct buffer_head *ext3_bread(handle_t *handle, struct inode *inode, bh = ext3_getblk(handle, inode, block, create, err); if (!bh) return bh; - if (buffer_uptodate(bh)) + if (bh_uptodate_or_lock(bh)) return bh; - ll_rw_block(READ_META, 1, &bh); + get_bh(bh); + bh->b_end_io = end_buffer_read_sync; + submit_bh(READ | REQ_META | REQ_PRIO, bh); wait_on_buffer(bh); if (buffer_uptodate(bh)) return bh; @@ -1202,6 +1237,16 @@ static void ext3_truncate_failed_write(struct inode *inode) ext3_truncate(inode); } +/* + * Truncate blocks that were not used by direct IO write. We have to zero out + * the last file block as well because direct IO might have written to it. + */ +static void ext3_truncate_failed_direct_write(struct inode *inode) +{ + ext3_block_truncate_page(inode, inode->i_size); + ext3_truncate(inode); +} + static int ext3_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata) @@ -1217,6 +1262,8 @@ static int ext3_write_begin(struct file *file, struct address_space *mapping, * we allocate blocks but write fails for some reason */ int needed_blocks = ext3_writepage_trans_blocks(inode) + 1; + trace_ext3_write_begin(inode, pos, len, flags); + index = pos >> PAGE_CACHE_SHIFT; from = pos & (PAGE_CACHE_SIZE - 1); to = from + len; @@ -1332,6 +1379,7 @@ static int ext3_ordered_write_end(struct file *file, unsigned from, to; int ret = 0, ret2; + trace_ext3_ordered_write_end(inode, pos, len, copied); copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); from = pos & (PAGE_CACHE_SIZE - 1); @@ -1367,6 +1415,7 @@ static int ext3_writeback_write_end(struct file *file, struct inode *inode = file->f_mapping->host; int ret; + trace_ext3_writeback_write_end(inode, pos, len, copied); copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); update_file_sizes(inode, pos, copied); /* @@ -1391,10 +1440,12 @@ static int ext3_journalled_write_end(struct file *file, { handle_t *handle = ext3_journal_current_handle(); struct inode *inode = mapping->host; + struct ext3_inode_info *ei = EXT3_I(inode); int ret = 0, ret2; int partial = 0; unsigned from, to; + trace_ext3_journalled_write_end(inode, pos, len, copied); from = pos & (PAGE_CACHE_SIZE - 1); to = from + len; @@ -1419,8 +1470,9 @@ static int ext3_journalled_write_end(struct file *file, if (pos + len > inode->i_size && ext3_can_truncate(inode)) ext3_orphan_add(handle, inode); ext3_set_inode_state(inode, EXT3_STATE_JDATA); - if (inode->i_size > EXT3_I(inode)->i_disksize) { - EXT3_I(inode)->i_disksize = inode->i_size; + atomic_set(&ei->i_datasync_tid, handle->h_transaction->t_tid); + if (inode->i_size > ei->i_disksize) { + ei->i_disksize = inode->i_size; ret2 = ext3_mark_inode_dirty(handle, inode); if (!ret) ret = ret2; @@ -1507,56 +1559,17 @@ static int buffer_unmapped(handle_t *handle, struct buffer_head *bh) } /* - * Note that we always start a transaction even if we're not journalling - * data. This is to preserve ordering: any hole instantiation within - * __block_write_full_page -> ext3_get_block() should be journalled - * along with the data so we don't crash and then get metadata which + * Note that whenever we need to map blocks we start a transaction even if + * we're not journalling data. This is to preserve ordering: any hole + * instantiation within __block_write_full_page -> ext3_get_block() should be + * journalled along with the data so we don't crash and then get metadata which * refers to old data. * * In all journalling modes block_write_full_page() will start the I/O. * - * Problem: - * - * ext3_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() -> - * ext3_writepage() - * - * Similar for: - * - * ext3_file_write() -> generic_file_write() -> __alloc_pages() -> ... - * - * Same applies to ext3_get_block(). We will deadlock on various things like - * lock_journal and i_truncate_mutex. - * - * Setting PF_MEMALLOC here doesn't work - too many internal memory - * allocations fail. - * - * 16May01: If we're reentered then journal_current_handle() will be - * non-zero. We simply *return*. - * - * 1 July 2001: @@@ FIXME: - * In journalled data mode, a data buffer may be metadata against the - * current transaction. But the same file is part of a shared mapping - * and someone does a writepage() on it. - * - * We will move the buffer onto the async_data list, but *after* it has - * been dirtied. So there's a small window where we have dirty data on - * BJ_Metadata. - * - * Note that this only applies to the last partial page in the file. The - * bit which block_write_full_page() uses prepare/commit for. (That's - * broken code anyway: it's wrong for msync()). - * - * It's a rare case: affects the final partial page, for journalled data - * where the file is subject to bith write() and writepage() in the same - * transction. To fix it we'll need a custom block_write_full_page(). - * We'll probably need that anyway for journalling writepage() output. - * * We don't honour synchronous mounts for writepage(). That would be * disastrous. Any write() or metadata operation will sync the fs for * us. - * - * AKPM2: if all the page's buffers are mapped to disk and !data=journal, - * we don't need to open a transaction here. */ static int ext3_ordered_writepage(struct page *page, struct writeback_control *wbc) @@ -1568,7 +1581,13 @@ static int ext3_ordered_writepage(struct page *page, int err; J_ASSERT(PageLocked(page)); - WARN_ON_ONCE(IS_RDONLY(inode)); + /* + * We don't want to warn for emergency remount. The condition is + * ordered to avoid dereferencing inode->i_sb in non-error case to + * avoid slow-downs. + */ + WARN_ON_ONCE(IS_RDONLY(inode) && + !(EXT3_SB(inode->i_sb)->s_mount_state & EXT3_ERROR_FS)); /* * We give up here if we're reentered, because it might be for a @@ -1577,6 +1596,7 @@ static int ext3_ordered_writepage(struct page *page, if (ext3_journal_current_handle()) goto out_fail; + trace_ext3_ordered_writepage(page); if (!page_has_buffers(page)) { create_empty_buffers(page, inode->i_sb->s_blocksize, (1 << BH_Dirty)|(1 << BH_Uptodate)); @@ -1614,12 +1634,9 @@ static int ext3_ordered_writepage(struct page *page, * block_write_full_page() succeeded. Otherwise they are unmapped, * and generally junk. */ - if (ret == 0) { - err = walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE, + if (ret == 0) + ret = walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE, NULL, journal_dirty_data_fn); - if (!ret) - ret = err; - } walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE, NULL, bput_one); err = ext3_journal_stop(handle); @@ -1642,11 +1659,18 @@ static int ext3_writeback_writepage(struct page *page, int err; J_ASSERT(PageLocked(page)); - WARN_ON_ONCE(IS_RDONLY(inode)); + /* + * We don't want to warn for emergency remount. The condition is + * ordered to avoid dereferencing inode->i_sb in non-error case to + * avoid slow-downs. + */ + WARN_ON_ONCE(IS_RDONLY(inode) && + !(EXT3_SB(inode->i_sb)->s_mount_state & EXT3_ERROR_FS)); if (ext3_journal_current_handle()) goto out_fail; + trace_ext3_writeback_writepage(page); if (page_has_buffers(page)) { if (!walk_page_buffers(NULL, page_buffers(page), 0, PAGE_CACHE_SIZE, NULL, buffer_unmapped)) { @@ -1684,18 +1708,25 @@ static int ext3_journalled_writepage(struct page *page, int err; J_ASSERT(PageLocked(page)); - WARN_ON_ONCE(IS_RDONLY(inode)); - - if (ext3_journal_current_handle()) - goto no_write; - - handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode)); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - goto no_write; - } + /* + * We don't want to warn for emergency remount. The condition is + * ordered to avoid dereferencing inode->i_sb in non-error case to + * avoid slow-downs. + */ + WARN_ON_ONCE(IS_RDONLY(inode) && + !(EXT3_SB(inode->i_sb)->s_mount_state & EXT3_ERROR_FS)); + trace_ext3_journalled_writepage(page); if (!page_has_buffers(page) || PageChecked(page)) { + if (ext3_journal_current_handle()) + goto no_write; + + handle = ext3_journal_start(inode, + ext3_writepage_trans_blocks(inode)); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + goto no_write; + } /* * It's mmapped pagecache. Add buffers and journal it. There * doesn't seem much point in redirtying the page here. @@ -1715,18 +1746,21 @@ static int ext3_journalled_writepage(struct page *page, if (ret == 0) ret = err; ext3_set_inode_state(inode, EXT3_STATE_JDATA); + atomic_set(&EXT3_I(inode)->i_datasync_tid, + handle->h_transaction->t_tid); unlock_page(page); + err = ext3_journal_stop(handle); + if (!ret) + ret = err; } else { /* - * It may be a page full of checkpoint-mode buffers. We don't - * really know unless we go poke around in the buffer_heads. - * But block_write_full_page will do the right thing. + * It is a page full of checkpoint-mode buffers. Go and write + * them. They should have been already mapped when they went + * to the journal so provide NULL get_block function to catch + * errors. */ - ret = block_write_full_page(page, ext3_get_block, wbc); + ret = block_write_full_page(page, NULL, wbc); } - err = ext3_journal_stop(handle); - if (!ret) - ret = err; out: return ret; @@ -1739,6 +1773,7 @@ out_unlock: static int ext3_readpage(struct file *file, struct page *page) { + trace_ext3_readpage(page); return mpage_readpage(page, ext3_get_block); } @@ -1749,23 +1784,27 @@ ext3_readpages(struct file *file, struct address_space *mapping, return mpage_readpages(mapping, pages, nr_pages, ext3_get_block); } -static void ext3_invalidatepage(struct page *page, unsigned long offset) +static void ext3_invalidatepage(struct page *page, unsigned int offset, + unsigned int length) { journal_t *journal = EXT3_JOURNAL(page->mapping->host); + trace_ext3_invalidatepage(page, offset, length); + /* * If it's a full truncate we just forget about the pending dirtying */ - if (offset == 0) + if (offset == 0 && length == PAGE_CACHE_SIZE) ClearPageChecked(page); - journal_invalidatepage(journal, page, offset); + journal_invalidatepage(journal, page, offset, length); } static int ext3_releasepage(struct page *page, gfp_t wait) { journal_t *journal = EXT3_JOURNAL(page->mapping->host); + trace_ext3_releasepage(page); WARN_ON(PageChecked(page)); if (!page_has_buffers(page)) return 0; @@ -1782,8 +1821,7 @@ static int ext3_releasepage(struct page *page, gfp_t wait) * VFS code falls back into buffered path in that case so we are safe. */ static ssize_t ext3_direct_IO(int rw, struct kiocb *iocb, - const struct iovec *iov, loff_t offset, - unsigned long nr_segs) + struct iov_iter *iter, loff_t offset) { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; @@ -1791,9 +1829,11 @@ static ssize_t ext3_direct_IO(int rw, struct kiocb *iocb, handle_t *handle; ssize_t ret; int orphan = 0; - size_t count = iov_length(iov, nr_segs); + size_t count = iov_iter_count(iter); int retries = 0; + trace_ext3_direct_IO_enter(inode, offset, count, rw); + if (rw == WRITE) { loff_t final_size = offset + count; @@ -1816,19 +1856,17 @@ static ssize_t ext3_direct_IO(int rw, struct kiocb *iocb, } retry: - ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, - offset, nr_segs, - ext3_get_block, NULL); + ret = blockdev_direct_IO(rw, iocb, inode, iter, offset, ext3_get_block); /* * In case of error extending write may have instantiated a few * blocks outside i_size. Trim these off again. */ if (unlikely((rw & WRITE) && ret < 0)) { loff_t isize = i_size_read(inode); - loff_t end = offset + iov_length(iov, nr_segs); + loff_t end = offset + count; if (end > isize) - vmtruncate(inode, isize); + ext3_truncate_failed_direct_write(inode); } if (ret == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries)) goto retry; @@ -1842,8 +1880,10 @@ retry: /* This is really bad luck. We've written the data * but cannot extend i_size. Truncate allocated blocks * and pretend the write failed... */ - ext3_truncate(inode); + ext3_truncate_failed_direct_write(inode); ret = PTR_ERR(handle); + if (inode->i_nlink) + ext3_orphan_del(NULL, inode); goto out; } if (inode->i_nlink) @@ -1868,6 +1908,7 @@ retry: ret = err; } out: + trace_ext3_direct_IO_exit(inode, offset, count, rw, ret); return ret; } @@ -1894,7 +1935,6 @@ static const struct address_space_operations ext3_ordered_aops = { .readpage = ext3_readpage, .readpages = ext3_readpages, .writepage = ext3_ordered_writepage, - .sync_page = block_sync_page, .write_begin = ext3_write_begin, .write_end = ext3_ordered_write_end, .bmap = ext3_bmap, @@ -1903,6 +1943,7 @@ static const struct address_space_operations ext3_ordered_aops = { .direct_IO = ext3_direct_IO, .migratepage = buffer_migrate_page, .is_partially_uptodate = block_is_partially_uptodate, + .is_dirty_writeback = buffer_check_dirty_writeback, .error_remove_page = generic_error_remove_page, }; @@ -1910,7 +1951,6 @@ static const struct address_space_operations ext3_writeback_aops = { .readpage = ext3_readpage, .readpages = ext3_readpages, .writepage = ext3_writeback_writepage, - .sync_page = block_sync_page, .write_begin = ext3_write_begin, .write_end = ext3_writeback_write_end, .bmap = ext3_bmap, @@ -1926,7 +1966,6 @@ static const struct address_space_operations ext3_journalled_aops = { .readpage = ext3_readpage, .readpages = ext3_readpages, .writepage = ext3_journalled_writepage, - .sync_page = block_sync_page, .write_begin = ext3_write_begin, .write_end = ext3_journalled_write_end, .set_page_dirty = ext3_journalled_set_page_dirty, @@ -1953,17 +1992,24 @@ void ext3_set_aops(struct inode *inode) * This required during truncate. We need to physically zero the tail end * of that block so it doesn't yield old data if the file is later grown. */ -static int ext3_block_truncate_page(handle_t *handle, struct page *page, - struct address_space *mapping, loff_t from) +static int ext3_block_truncate_page(struct inode *inode, loff_t from) { ext3_fsblk_t index = from >> PAGE_CACHE_SHIFT; - unsigned offset = from & (PAGE_CACHE_SIZE-1); + unsigned offset = from & (PAGE_CACHE_SIZE - 1); unsigned blocksize, iblock, length, pos; - struct inode *inode = mapping->host; + struct page *page; + handle_t *handle = NULL; struct buffer_head *bh; int err = 0; + /* Truncated on block boundary - nothing to do */ blocksize = inode->i_sb->s_blocksize; + if ((from & (blocksize - 1)) == 0) + return 0; + + page = grab_cache_page(inode->i_mapping, index); + if (!page) + return -ENOMEM; length = blocksize - (offset & (blocksize - 1)); iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); @@ -1999,20 +2045,30 @@ static int ext3_block_truncate_page(handle_t *handle, struct page *page, if (PageUptodate(page)) set_buffer_uptodate(bh); - if (!buffer_uptodate(bh)) { - err = -EIO; - ll_rw_block(READ, 1, &bh); - wait_on_buffer(bh); + if (!bh_uptodate_or_lock(bh)) { + err = bh_submit_read(bh); /* Uhhuh. Read error. Complain and punt. */ - if (!buffer_uptodate(bh)) + if (err) goto unlock; } + /* data=writeback mode doesn't need transaction to zero-out data */ + if (!ext3_should_writeback_data(inode)) { + /* We journal at most one block */ + handle = ext3_journal_start(inode, 1); + if (IS_ERR(handle)) { + clear_highpage(page); + flush_dcache_page(page); + err = PTR_ERR(handle); + goto unlock; + } + } + if (ext3_should_journal_data(inode)) { BUFFER_TRACE(bh, "get write access"); err = ext3_journal_get_write_access(handle, bh); if (err) - goto unlock; + goto stop; } zero_user(page, offset, length); @@ -2026,6 +2082,9 @@ static int ext3_block_truncate_page(handle_t *handle, struct page *page, err = ext3_journal_dirty_data(handle, bh); mark_buffer_dirty(bh); } +stop: + if (handle) + ext3_journal_stop(handle); unlock: unlock_page(page); @@ -2058,7 +2117,7 @@ static inline int all_zeroes(__le32 *p, __le32 *q) * * When we do truncate() we may have to clean the ends of several * indirect blocks but leave the blocks themselves alive. Block is - * partially truncated if some data below the new i_size is refered + * partially truncated if some data below the new i_size is referred * from it (and it is on the path to the first completely truncated * data block, indeed). We have to free the top of that path along * with everything to the right of the path. Since no allocation @@ -2145,13 +2204,15 @@ static void ext3_clear_blocks(handle_t *handle, struct inode *inode, if (try_to_extend_transaction(handle, inode)) { if (bh) { BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); - ext3_journal_dirty_metadata(handle, bh); + if (ext3_journal_dirty_metadata(handle, bh)) + return; } ext3_mark_inode_dirty(handle, inode); truncate_restart_transaction(handle, inode); if (bh) { BUFFER_TRACE(bh, "retaking write access"); - ext3_journal_get_write_access(handle, bh); + if (ext3_journal_get_write_access(handle, bh)) + return; } } @@ -2185,7 +2246,7 @@ static void ext3_clear_blocks(handle_t *handle, struct inode *inode, * @first: array of block numbers * @last: points immediately past the end of array * - * We are freeing all blocks refered from that array (numbers are stored as + * We are freeing all blocks referred from that array (numbers are stored as * little-endian 32-bit) and updating @inode->i_blocks appropriately. * * We accumulate contiguous runs of blocks to free. Conveniently, if these @@ -2273,7 +2334,7 @@ static void ext3_free_data(handle_t *handle, struct inode *inode, * @last: pointer immediately past the end of array * @depth: depth of the branches to free * - * We are freeing all blocks refered from these branches (numbers are + * We are freeing all blocks referred from these branches (numbers are * stored as little-endian 32-bit) and updating @inode->i_blocks * appropriately. */ @@ -2392,8 +2453,6 @@ static void ext3_free_branches(handle_t *handle, struct inode *inode, int ext3_can_truncate(struct inode *inode) { - if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) - return 0; if (S_ISREG(inode->i_mode)) return 1; if (S_ISDIR(inode->i_mode)) @@ -2410,7 +2469,7 @@ int ext3_can_truncate(struct inode *inode) * transaction, and VFS/VM ensures that ext3_truncate() cannot run * simultaneously on behalf of the same inode. * - * As we work through the truncate and commmit bits of it to the journal there + * As we work through the truncate and commit bits of it to the journal there * is one core, guiding principle: the file's tree must always be consistent on * disk. We must be able to restart the truncate after a crash. * @@ -2437,7 +2496,6 @@ void ext3_truncate(struct inode *inode) struct ext3_inode_info *ei = EXT3_I(inode); __le32 *i_data = ei->i_data; int addr_per_block = EXT3_ADDR_PER_BLOCK(inode->i_sb); - struct address_space *mapping = inode->i_mapping; int offsets[4]; Indirect chain[4]; Indirect *partial; @@ -2445,7 +2503,8 @@ void ext3_truncate(struct inode *inode) int n; long last_block; unsigned blocksize = inode->i_sb->s_blocksize; - struct page *page; + + trace_ext3_truncate_enter(inode); if (!ext3_can_truncate(inode)) goto out_notrans; @@ -2453,37 +2512,12 @@ void ext3_truncate(struct inode *inode) if (inode->i_size == 0 && ext3_should_writeback_data(inode)) ext3_set_inode_state(inode, EXT3_STATE_FLUSH_ON_CLOSE); - /* - * We have to lock the EOF page here, because lock_page() nests - * outside journal_start(). - */ - if ((inode->i_size & (blocksize - 1)) == 0) { - /* Block boundary? Nothing to do */ - page = NULL; - } else { - page = grab_cache_page(mapping, - inode->i_size >> PAGE_CACHE_SHIFT); - if (!page) - goto out_notrans; - } - handle = start_transaction(inode); - if (IS_ERR(handle)) { - if (page) { - clear_highpage(page); - flush_dcache_page(page); - unlock_page(page); - page_cache_release(page); - } + if (IS_ERR(handle)) goto out_notrans; - } last_block = (inode->i_size + blocksize-1) >> EXT3_BLOCK_SIZE_BITS(inode->i_sb); - - if (page) - ext3_block_truncate_page(handle, page, mapping, inode->i_size); - n = ext3_block_to_path(inode, last_block, offsets, NULL); if (n == 0) goto out_stop; /* error */ @@ -2598,6 +2632,7 @@ out_stop: ext3_orphan_del(handle, inode); ext3_journal_stop(handle); + trace_ext3_truncate_exit(inode); return; out_notrans: /* @@ -2606,6 +2641,7 @@ out_notrans: */ if (inode->i_nlink) ext3_orphan_del(NULL, inode); + trace_ext3_truncate_exit(inode); } static ext3_fsblk_t ext3_get_inode_block(struct super_block *sb, @@ -2659,12 +2695,12 @@ static int __ext3_get_inode_loc(struct inode *inode, return -EIO; bh = sb_getblk(inode->i_sb, block); - if (!bh) { + if (unlikely(!bh)) { ext3_error (inode->i_sb, "ext3_get_inode_loc", "unable to read inode block - " "inode=%lu, block="E3FSBLK, inode->i_ino, block); - return -EIO; + return -ENOMEM; } if (!buffer_uptodate(bh)) { lock_buffer(bh); @@ -2713,7 +2749,7 @@ static int __ext3_get_inode_loc(struct inode *inode, bitmap_bh = sb_getblk(inode->i_sb, le32_to_cpu(desc->bg_inode_bitmap)); - if (!bitmap_bh) + if (unlikely(!bitmap_bh)) goto make_io; /* @@ -2747,9 +2783,10 @@ make_io: * has in-inode xattrs, or we don't have this inode in memory. * Read the block from disk. */ + trace_ext3_load_inode(inode); get_bh(bh); bh->b_end_io = end_buffer_read_sync; - submit_bh(READ_META, bh); + submit_bh(READ | REQ_META | REQ_PRIO, bh); wait_on_buffer(bh); if (!buffer_uptodate(bh)) { ext3_error(inode->i_sb, "ext3_get_inode_loc", @@ -2819,6 +2856,8 @@ struct inode *ext3_iget(struct super_block *sb, unsigned long ino) transaction_t *transaction; long ret; int block; + uid_t i_uid; + gid_t i_gid; inode = iget_locked(sb, ino); if (!inode) @@ -2835,13 +2874,15 @@ struct inode *ext3_iget(struct super_block *sb, unsigned long ino) bh = iloc.bh; raw_inode = ext3_raw_inode(&iloc); inode->i_mode = le16_to_cpu(raw_inode->i_mode); - inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low); - inode->i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low); + i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low); + i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low); if(!(test_opt (inode->i_sb, NO_UID32))) { - inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16; - inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16; + i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16; + i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16; } - inode->i_nlink = le16_to_cpu(raw_inode->i_links_count); + i_uid_write(inode, i_uid); + i_gid_write(inode, i_gid); + set_nlink(inode, le16_to_cpu(raw_inode->i_links_count)); inode->i_size = le32_to_cpu(raw_inode->i_size); inode->i_atime.tv_sec = (signed)le32_to_cpu(raw_inode->i_atime); inode->i_ctime.tv_sec = (signed)le32_to_cpu(raw_inode->i_ctime); @@ -2996,6 +3037,10 @@ static int ext3_do_update_inode(handle_t *handle, struct ext3_inode_info *ei = EXT3_I(inode); struct buffer_head *bh = iloc->bh; int err = 0, rc, block; + int need_datasync = 0; + __le32 disksize; + uid_t i_uid; + gid_t i_gid; again: /* we can't allow multiple procs in here at once, its a bit racey */ @@ -3008,32 +3053,38 @@ again: ext3_get_inode_flags(ei); raw_inode->i_mode = cpu_to_le16(inode->i_mode); + i_uid = i_uid_read(inode); + i_gid = i_gid_read(inode); if(!(test_opt(inode->i_sb, NO_UID32))) { - raw_inode->i_uid_low = cpu_to_le16(low_16_bits(inode->i_uid)); - raw_inode->i_gid_low = cpu_to_le16(low_16_bits(inode->i_gid)); + raw_inode->i_uid_low = cpu_to_le16(low_16_bits(i_uid)); + raw_inode->i_gid_low = cpu_to_le16(low_16_bits(i_gid)); /* * Fix up interoperability with old kernels. Otherwise, old inodes get * re-used with the upper 16 bits of the uid/gid intact */ if(!ei->i_dtime) { raw_inode->i_uid_high = - cpu_to_le16(high_16_bits(inode->i_uid)); + cpu_to_le16(high_16_bits(i_uid)); raw_inode->i_gid_high = - cpu_to_le16(high_16_bits(inode->i_gid)); + cpu_to_le16(high_16_bits(i_gid)); } else { raw_inode->i_uid_high = 0; raw_inode->i_gid_high = 0; } } else { raw_inode->i_uid_low = - cpu_to_le16(fs_high2lowuid(inode->i_uid)); + cpu_to_le16(fs_high2lowuid(i_uid)); raw_inode->i_gid_low = - cpu_to_le16(fs_high2lowgid(inode->i_gid)); + cpu_to_le16(fs_high2lowgid(i_gid)); raw_inode->i_uid_high = 0; raw_inode->i_gid_high = 0; } raw_inode->i_links_count = cpu_to_le16(inode->i_nlink); - raw_inode->i_size = cpu_to_le32(ei->i_disksize); + disksize = cpu_to_le32(ei->i_disksize); + if (disksize != raw_inode->i_size) { + need_datasync = 1; + raw_inode->i_size = disksize; + } raw_inode->i_atime = cpu_to_le32(inode->i_atime.tv_sec); raw_inode->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec); raw_inode->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec); @@ -3049,8 +3100,11 @@ again: if (!S_ISREG(inode->i_mode)) { raw_inode->i_dir_acl = cpu_to_le32(ei->i_dir_acl); } else { - raw_inode->i_size_high = - cpu_to_le32(ei->i_disksize >> 32); + disksize = cpu_to_le32(ei->i_disksize >> 32); + if (disksize != raw_inode->i_size_high) { + raw_inode->i_size_high = disksize; + need_datasync = 1; + } if (ei->i_disksize > 0x7fffffffULL) { struct super_block *sb = inode->i_sb; if (!EXT3_HAS_RO_COMPAT_FEATURE(sb, @@ -3103,6 +3157,8 @@ again: ext3_clear_inode_state(inode, EXT3_STATE_NEW); atomic_set(&ei->i_sync_tid, handle->h_transaction->t_tid); + if (need_datasync) + atomic_set(&ei->i_datasync_tid, handle->h_transaction->t_tid); out_brelse: brelse (bh); ext3_std_error(inode->i_sb, err); @@ -3114,21 +3170,20 @@ out_brelse: * * We are called from a few places: * - * - Within generic_file_write() for O_SYNC files. + * - Within generic_file_aio_write() -> generic_write_sync() for O_SYNC files. * Here, there will be no transaction running. We wait for any running - * trasnaction to commit. + * transaction to commit. * - * - Within sys_sync(), kupdate and such. - * We wait on commit, if tol to. + * - Within flush work (for sys_sync(), kupdate and such). + * We wait on commit, if told to. * - * - Within prune_icache() (PF_MEMALLOC == true) - * Here we simply return. We can't afford to block kswapd on the - * journal commit. + * - Within iput_final() -> write_inode_now() + * We wait on commit, if told to. * * In all cases it is actually safe for us to return without doing anything, * because the inode has been copied into a raw inode buffer in - * ext3_mark_inode_dirty(). This is a correctness thing for O_SYNC and for - * knfsd. + * ext3_mark_inode_dirty(). This is a correctness thing for WB_SYNC_ALL + * writeback. * * Note that we are absolutely dependent upon all inode dirtiers doing the * right thing: they *must* call mark_inode_dirty() after dirtying info in @@ -3140,13 +3195,13 @@ out_brelse: * stuff(); * inode->i_size = expr; * - * is in error because a kswapd-driven write_inode() could occur while - * `stuff()' is running, and the new i_size will be lost. Plus the inode - * will no longer be on the superblock's dirty inode list. + * is in error because write_inode() could occur while `stuff()' is running, + * and the new i_size will be lost. Plus the inode will no longer be on the + * superblock's dirty inode list. */ int ext3_write_inode(struct inode *inode, struct writeback_control *wbc) { - if (current->flags & PF_MEMALLOC) + if (WARN_ON_ONCE(current->flags & PF_MEMALLOC)) return 0; if (ext3_journal_current_handle()) { @@ -3155,7 +3210,12 @@ int ext3_write_inode(struct inode *inode, struct writeback_control *wbc) return -EIO; } - if (wbc->sync_mode != WB_SYNC_ALL) + /* + * No need to force transaction in WB_SYNC_NONE mode. Also + * ext3_sync_fs() will force the commit after everything is + * written. + */ + if (wbc->sync_mode != WB_SYNC_ALL || wbc->for_sync) return 0; return ext3_force_commit(inode->i_sb); @@ -3190,8 +3250,8 @@ int ext3_setattr(struct dentry *dentry, struct iattr *attr) if (is_quota_modification(inode, attr)) dquot_initialize(inode); - if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) || - (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) { + if ((ia_valid & ATTR_UID && !uid_eq(attr->ia_uid, inode->i_uid)) || + (ia_valid & ATTR_GID && !gid_eq(attr->ia_gid, inode->i_gid))) { handle_t *handle; /* (user+group)*(old+new) structure, inode write (sb, @@ -3217,6 +3277,9 @@ int ext3_setattr(struct dentry *dentry, struct iattr *attr) ext3_journal_stop(handle); } + if (attr->ia_valid & ATTR_SIZE) + inode_dio_wait(inode); + if (S_ISREG(inode->i_mode) && attr->ia_valid & ATTR_SIZE && attr->ia_size < inode->i_size) { handle_t *handle; @@ -3228,25 +3291,43 @@ int ext3_setattr(struct dentry *dentry, struct iattr *attr) } error = ext3_orphan_add(handle, inode); + if (error) { + ext3_journal_stop(handle); + goto err_out; + } EXT3_I(inode)->i_disksize = attr->ia_size; - rc = ext3_mark_inode_dirty(handle, inode); - if (!error) - error = rc; + error = ext3_mark_inode_dirty(handle, inode); ext3_journal_stop(handle); + if (error) { + /* Some hard fs error must have happened. Bail out. */ + ext3_orphan_del(NULL, inode); + goto err_out; + } + rc = ext3_block_truncate_page(inode, attr->ia_size); + if (rc) { + /* Cleanup orphan list and exit */ + handle = ext3_journal_start(inode, 3); + if (IS_ERR(handle)) { + ext3_orphan_del(NULL, inode); + goto err_out; + } + ext3_orphan_del(handle, inode); + ext3_journal_stop(handle); + goto err_out; + } } if ((attr->ia_valid & ATTR_SIZE) && attr->ia_size != i_size_read(inode)) { - rc = vmtruncate(inode, attr->ia_size); - if (rc) - goto err_out; + truncate_setsize(inode, attr->ia_size); + ext3_truncate(inode); } setattr_copy(inode, attr); mark_inode_dirty(inode); if (ia_valid & ATTR_MODE) - rc = ext3_acl_chmod(inode); + rc = posix_acl_chmod(inode, inode->i_mode); err_out: ext3_std_error(inode->i_sb, error); @@ -3292,7 +3373,7 @@ static int ext3_writepage_trans_blocks(struct inode *inode) if (ext3_should_journal_data(inode)) ret = 3 * (bpp + indirects) + 2; else - ret = 2 * (bpp + indirects) + 2; + ret = 2 * (bpp + indirects) + indirects + 2; #ifdef CONFIG_QUOTA /* We know that structure was already allocated during dquot_initialize so @@ -3358,14 +3439,6 @@ ext3_reserve_inode_write(handle_t *handle, struct inode *inode, * inode out, but prune_icache isn't a user-visible syncing function. * Whenever the user wants stuff synced (sys_sync, sys_msync, sys_fsync) * we start and wait on commits. - * - * Is this efficient/effective? Well, we're being nice to the system - * by cleaning up our inodes proactively so they can be reaped - * without I/O. But we are potentially leaving up to five seconds' - * worth of inodes floating about which prune_icache wants us to - * write out. One way to fix that would be to get prune_icache() - * to do a write_super() to free up some memory. It has the desired - * effect. */ int ext3_mark_inode_dirty(handle_t *handle, struct inode *inode) { @@ -3373,6 +3446,7 @@ int ext3_mark_inode_dirty(handle_t *handle, struct inode *inode) int err; might_sleep(); + trace_ext3_mark_inode_dirty(inode, _RET_IP_); err = ext3_reserve_inode_write(handle, inode, &iloc); if (!err) err = ext3_mark_iloc_dirty(handle, inode, &iloc); @@ -3393,7 +3467,7 @@ int ext3_mark_inode_dirty(handle_t *handle, struct inode *inode) * so would cause a commit on atime updates, which we don't bother doing. * We handle synchronous inodes at the highest possible level. */ -void ext3_dirty_inode(struct inode *inode) +void ext3_dirty_inode(struct inode *inode, int flags) { handle_t *current_handle = ext3_journal_current_handle(); handle_t *handle; diff --git a/fs/ext3/ioctl.c b/fs/ext3/ioctl.c index 88974814783..4d96e9a6453 100644 --- a/fs/ext3/ioctl.c +++ b/fs/ext3/ioctl.c @@ -7,19 +7,14 @@ * Universite Pierre et Marie Curie (Paris VI) */ -#include <linux/fs.h> -#include <linux/jbd.h> -#include <linux/capability.h> -#include <linux/ext3_fs.h> -#include <linux/ext3_jbd.h> #include <linux/mount.h> -#include <linux/time.h> #include <linux/compat.h> #include <asm/uaccess.h> +#include "ext3.h" long ext3_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { - struct inode *inode = filp->f_dentry->d_inode; + struct inode *inode = file_inode(filp); struct ext3_inode_info *ei = EXT3_I(inode); unsigned int flags; unsigned short rsv_window_size; @@ -38,13 +33,13 @@ long ext3_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) unsigned int oldflags; unsigned int jflag; - if (!is_owner_or_cap(inode)) + if (!inode_owner_or_capable(inode)) return -EACCES; if (get_user(flags, (int __user *) arg)) return -EFAULT; - err = mnt_want_write(filp->f_path.mnt); + err = mnt_want_write_file(filp); if (err) return err; @@ -110,7 +105,7 @@ flags_err: err = ext3_change_inode_journal_flag(inode, jflag); flags_out: mutex_unlock(&inode->i_mutex); - mnt_drop_write(filp->f_path.mnt); + mnt_drop_write_file(filp); return err; } case EXT3_IOC_GETVERSION: @@ -123,10 +118,10 @@ flags_out: __u32 generation; int err; - if (!is_owner_or_cap(inode)) + if (!inode_owner_or_capable(inode)) return -EPERM; - err = mnt_want_write(filp->f_path.mnt); + err = mnt_want_write_file(filp); if (err) return err; if (get_user(generation, (int __user *) arg)) { @@ -134,10 +129,11 @@ flags_out: goto setversion_out; } + mutex_lock(&inode->i_mutex); handle = ext3_journal_start(inode, 1); if (IS_ERR(handle)) { err = PTR_ERR(handle); - goto setversion_out; + goto unlock_out; } err = ext3_reserve_inode_write(handle, inode, &iloc); if (err == 0) { @@ -146,34 +142,13 @@ flags_out: err = ext3_mark_iloc_dirty(handle, inode, &iloc); } ext3_journal_stop(handle); + +unlock_out: + mutex_unlock(&inode->i_mutex); setversion_out: - mnt_drop_write(filp->f_path.mnt); + mnt_drop_write_file(filp); return err; } -#ifdef CONFIG_JBD_DEBUG - case EXT3_IOC_WAIT_FOR_READONLY: - /* - * This is racy - by the time we're woken up and running, - * the superblock could be released. And the module could - * have been unloaded. So sue me. - * - * Returns 1 if it slept, else zero. - */ - { - struct super_block *sb = inode->i_sb; - DECLARE_WAITQUEUE(wait, current); - int ret = 0; - - set_current_state(TASK_INTERRUPTIBLE); - add_wait_queue(&EXT3_SB(sb)->ro_wait_queue, &wait); - if (timer_pending(&EXT3_SB(sb)->turn_ro_timer)) { - schedule(); - ret = 1; - } - remove_wait_queue(&EXT3_SB(sb)->ro_wait_queue, &wait); - return ret; - } -#endif case EXT3_IOC_GETRSVSZ: if (test_opt(inode->i_sb, RESERVATION) && S_ISREG(inode->i_mode) @@ -188,11 +163,11 @@ setversion_out: if (!test_opt(inode->i_sb, RESERVATION) ||!S_ISREG(inode->i_mode)) return -ENOTTY; - err = mnt_want_write(filp->f_path.mnt); + err = mnt_want_write_file(filp); if (err) return err; - if (!is_owner_or_cap(inode)) { + if (!inode_owner_or_capable(inode)) { err = -EACCES; goto setrsvsz_out; } @@ -219,7 +194,7 @@ setversion_out: } mutex_unlock(&ei->truncate_mutex); setrsvsz_out: - mnt_drop_write(filp->f_path.mnt); + mnt_drop_write_file(filp); return err; } case EXT3_IOC_GROUP_EXTEND: { @@ -230,7 +205,7 @@ setrsvsz_out: if (!capable(CAP_SYS_RESOURCE)) return -EPERM; - err = mnt_want_write(filp->f_path.mnt); + err = mnt_want_write_file(filp); if (err) return err; @@ -245,7 +220,7 @@ setrsvsz_out: if (err == 0) err = err2; group_extend_out: - mnt_drop_write(filp->f_path.mnt); + mnt_drop_write_file(filp); return err; } case EXT3_IOC_GROUP_ADD: { @@ -256,7 +231,7 @@ group_extend_out: if (!capable(CAP_SYS_RESOURCE)) return -EPERM; - err = mnt_want_write(filp->f_path.mnt); + err = mnt_want_write_file(filp); if (err) return err; @@ -273,10 +248,32 @@ group_extend_out: if (err == 0) err = err2; group_add_out: - mnt_drop_write(filp->f_path.mnt); + mnt_drop_write_file(filp); return err; } + case FITRIM: { + struct super_block *sb = inode->i_sb; + struct fstrim_range range; + int ret = 0; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (copy_from_user(&range, (struct fstrim_range __user *)arg, + sizeof(range))) + return -EFAULT; + + ret = ext3_trim_fs(sb, &range); + if (ret < 0) + return ret; + + if (copy_to_user((struct fstrim_range __user *)arg, &range, + sizeof(range))) + return -EFAULT; + + return 0; + } default: return -ENOTTY; diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c index bce9dce639b..f197736dccf 100644 --- a/fs/ext3/namei.c +++ b/fs/ext3/namei.c @@ -24,19 +24,8 @@ * Theodore Ts'o, 2002 */ -#include <linux/fs.h> -#include <linux/pagemap.h> -#include <linux/jbd.h> -#include <linux/time.h> -#include <linux/ext3_fs.h> -#include <linux/ext3_jbd.h> -#include <linux/fcntl.h> -#include <linux/stat.h> -#include <linux/string.h> #include <linux/quotaops.h> -#include <linux/buffer_head.h> -#include <linux/bio.h> - +#include "ext3.h" #include "namei.h" #include "xattr.h" #include "acl.h" @@ -47,7 +36,6 @@ #define NAMEI_RA_CHUNKS 2 #define NAMEI_RA_BLOCKS 4 #define NAMEI_RA_SIZE (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS) -#define NAMEI_RA_INDEX(c,b) (((c) * NAMEI_RA_BLOCKS) + (b)) static struct buffer_head *ext3_append(handle_t *handle, struct inode *inode, @@ -57,8 +45,7 @@ static struct buffer_head *ext3_append(handle_t *handle, *block = inode->i_size >> inode->i_sb->s_blocksize_bits; - bh = ext3_bread(handle, inode, *block, 1, err); - if (bh) { + if ((bh = ext3_dir_bread(handle, inode, *block, 1, err))) { inode->i_size += inode->i_sb->s_blocksize; EXT3_I(inode)->i_disksize = inode->i_size; *err = ext3_journal_get_write_access(handle, bh); @@ -287,7 +274,7 @@ static struct stats dx_show_leaf(struct dx_hash_info *hinfo, struct ext3_dir_ent while (len--) printk("%c", *name++); ext3fs_dirhash(de->name, de->name_len, &h); printk(":%x.%u ", h.hash, - ((char *) de - base)); + (unsigned) ((char *) de - base)); } space += EXT3_DIR_REC_LEN(de->name_len); names++; @@ -350,8 +337,10 @@ dx_probe(struct qstr *entry, struct inode *dir, u32 hash; frame->bh = NULL; - if (!(bh = ext3_bread (NULL,dir, 0, 0, err))) + if (!(bh = ext3_dir_bread(NULL, dir, 0, 0, err))) { + *err = ERR_BAD_DX_DIR; goto fail; + } root = (struct dx_root *) bh->b_data; if (root->info.hash_version != DX_HASH_TEA && root->info.hash_version != DX_HASH_HALF_MD4 && @@ -447,8 +436,10 @@ dx_probe(struct qstr *entry, struct inode *dir, frame->entries = entries; frame->at = at; if (!indirect--) return frame; - if (!(bh = ext3_bread (NULL,dir, dx_get_block(at), 0, err))) + if (!(bh = ext3_dir_bread(NULL, dir, dx_get_block(at), 0, err))) { + *err = ERR_BAD_DX_DIR; goto fail2; + } at = entries = ((struct dx_node *) bh->b_data)->entries; if (dx_get_limit(entries) != dx_node_limit (dir)) { ext3_warning(dir->i_sb, __func__, @@ -546,8 +537,8 @@ static int ext3_htree_next_block(struct inode *dir, __u32 hash, * block so no check is necessary */ while (num_frames--) { - if (!(bh = ext3_bread(NULL, dir, dx_get_block(p->at), - 0, &err))) + if (!(bh = ext3_dir_bread(NULL, dir, dx_get_block(p->at), + 0, &err))) return err; /* Failure */ p++; brelse (p->bh); @@ -570,10 +561,11 @@ static int htree_dirblock_to_tree(struct file *dir_file, { struct buffer_head *bh; struct ext3_dir_entry_2 *de, *top; - int err, count = 0; + int err = 0, count = 0; dxtrace(printk("In htree dirblock_to_tree: block %d\n", block)); - if (!(bh = ext3_bread (NULL, dir, block, 0, &err))) + + if (!(bh = ext3_dir_bread(NULL, dir, block, 0, &err))) return err; de = (struct ext3_dir_entry_2 *) bh->b_data; @@ -584,11 +576,8 @@ static int htree_dirblock_to_tree(struct file *dir_file, if (!ext3_check_dir_entry("htree_dirblock_to_tree", dir, de, bh, (block<<EXT3_BLOCK_SIZE_BITS(dir->i_sb)) +((char *)de - bh->b_data))) { - /* On error, skip the f_pos to the next block. */ - dir_file->f_pos = (dir_file->f_pos | - (dir->i_sb->s_blocksize - 1)) + 1; - brelse (bh); - return count; + /* silently ignore the rest of the block */ + break; } ext3fs_dirhash(de->name, de->name_len, hinfo); if ((hinfo->hash < start_hash) || @@ -631,7 +620,7 @@ int ext3_htree_fill_tree(struct file *dir_file, __u32 start_hash, dxtrace(printk("In htree_fill_tree, start hash: %x:%x\n", start_hash, start_minor_hash)); - dir = dir_file->f_path.dentry->d_inode; + dir = file_inode(dir_file); if (!(EXT3_I(dir)->i_flags & EXT3_INDEX_FL)) { hinfo.hash_version = EXT3_SB(dir->i_sb)->s_def_hash_version; if (hinfo.hash_version <= DX_HASH_TEA) @@ -645,7 +634,7 @@ int ext3_htree_fill_tree(struct file *dir_file, __u32 start_hash, } hinfo.hash = start_hash; hinfo.minor_hash = 0; - frame = dx_probe(NULL, dir_file->f_path.dentry->d_inode, &hinfo, frames, &err); + frame = dx_probe(NULL, file_inode(dir_file), &hinfo, frames, &err); if (!frame) return err; @@ -858,6 +847,7 @@ static struct buffer_head *ext3_find_entry(struct inode *dir, struct buffer_head * bh_use[NAMEI_RA_SIZE]; struct buffer_head * bh, *ret = NULL; unsigned long start, block, b; + const u8 *name = entry->name; int ra_max = 0; /* Number of bh's in the readahead buffer, bh_use[] */ int ra_ptr = 0; /* Current index into readahead @@ -871,6 +861,16 @@ static struct buffer_head *ext3_find_entry(struct inode *dir, namelen = entry->len; if (namelen > EXT3_NAME_LEN) return NULL; + if ((namelen <= 2) && (name[0] == '.') && + (name[1] == '.' || name[1] == 0)) { + /* + * "." or ".." will only be in the first block + * NFS may look up ".."; "." should be handled by the VFS + */ + block = start = 0; + nblocks = 1; + goto restart; + } if (is_dx(dir)) { bh = ext3_dx_find_entry(dir, entry, res_dir, &err); /* @@ -909,8 +909,12 @@ restart: num++; bh = ext3_getblk(NULL, dir, b++, 0, &err); bh_use[ra_max] = bh; - if (bh) - ll_rw_block(READ_META, 1, &bh); + if (bh && !bh_uptodate_or_lock(bh)) { + get_bh(bh); + bh->b_end_io = end_buffer_read_sync; + submit_bh(READ | REQ_META | REQ_PRIO, + bh); + } } } if ((bh = bh_use[ra_ptr++]) == NULL) @@ -961,55 +965,35 @@ static struct buffer_head * ext3_dx_find_entry(struct inode *dir, struct qstr *entry, struct ext3_dir_entry_2 **res_dir, int *err) { - struct super_block * sb; + struct super_block *sb = dir->i_sb; struct dx_hash_info hinfo; - u32 hash; struct dx_frame frames[2], *frame; - struct ext3_dir_entry_2 *de, *top; struct buffer_head *bh; unsigned long block; int retval; - int namelen = entry->len; - const u8 *name = entry->name; - sb = dir->i_sb; - /* NFS may look up ".." - look at dx_root directory block */ - if (namelen > 2 || name[0] != '.'|| (namelen == 2 && name[1] != '.')) { - if (!(frame = dx_probe(entry, dir, &hinfo, frames, err))) - return NULL; - } else { - frame = frames; - frame->bh = NULL; /* for dx_release() */ - frame->at = (struct dx_entry *)frames; /* hack for zero entry*/ - dx_set_block(frame->at, 0); /* dx_root block is 0 */ - } - hash = hinfo.hash; + if (!(frame = dx_probe(entry, dir, &hinfo, frames, err))) + return NULL; do { block = dx_get_block(frame->at); - if (!(bh = ext3_bread (NULL,dir, block, 0, err))) + if (!(bh = ext3_dir_bread (NULL, dir, block, 0, err))) goto errout; - de = (struct ext3_dir_entry_2 *) bh->b_data; - top = (struct ext3_dir_entry_2 *) ((char *) de + sb->s_blocksize - - EXT3_DIR_REC_LEN(0)); - for (; de < top; de = ext3_next_entry(de)) { - int off = (block << EXT3_BLOCK_SIZE_BITS(sb)) - + ((char *) de - bh->b_data); - - if (!ext3_check_dir_entry(__func__, dir, de, bh, off)) { - brelse(bh); - *err = ERR_BAD_DX_DIR; - goto errout; - } - if (ext3_match(namelen, name, de)) { - *res_dir = de; - dx_release(frames); - return bh; - } + retval = search_dirblock(bh, dir, entry, + block << EXT3_BLOCK_SIZE_BITS(sb), + res_dir); + if (retval == 1) { + dx_release(frames); + return bh; } - brelse (bh); + brelse(bh); + if (retval == -1) { + *err = ERR_BAD_DX_DIR; + goto errout; + } + /* Check to see if we should continue to search */ - retval = ext3_htree_next_block(dir, hash, frame, + retval = ext3_htree_next_block(dir, hinfo.hash, frame, frames, NULL); if (retval < 0) { ext3_warning(sb, __func__, @@ -1022,12 +1006,12 @@ static struct buffer_head * ext3_dx_find_entry(struct inode *dir, *err = -ENOENT; errout: - dxtrace(printk("%s not found\n", name)); + dxtrace(printk("%s not found\n", entry->name)); dx_release (frames); return NULL; } -static struct dentry *ext3_lookup(struct inode * dir, struct dentry *dentry, struct nameidata *nd) +static struct dentry *ext3_lookup(struct inode * dir, struct dentry *dentry, unsigned int flags) { struct inode * inode; struct ext3_dir_entry_2 * de; @@ -1047,15 +1031,11 @@ static struct dentry *ext3_lookup(struct inode * dir, struct dentry *dentry, str return ERR_PTR(-EIO); } inode = ext3_iget(dir->i_sb, ino); - if (unlikely(IS_ERR(inode))) { - if (PTR_ERR(inode) == -ESTALE) { - ext3_error(dir->i_sb, __func__, - "deleted inode referenced: %lu", - ino); - return ERR_PTR(-EIO); - } else { - return ERR_CAST(inode); - } + if (inode == ERR_PTR(-ESTALE)) { + ext3_error(dir->i_sb, __func__, + "deleted inode referenced: %lu", + ino); + return ERR_PTR(-EIO); } } return d_splice_alias(inode, dentry); @@ -1065,7 +1045,7 @@ static struct dentry *ext3_lookup(struct inode * dir, struct dentry *dentry, str struct dentry *ext3_get_parent(struct dentry *child) { unsigned long ino; - struct qstr dotdot = {.name = "..", .len = 2}; + struct qstr dotdot = QSTR_INIT("..", 2); struct ext3_dir_entry_2 * de; struct buffer_head *bh; @@ -1425,10 +1405,19 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry, frame->at = entries; frame->bh = bh; bh = bh2; + /* + * Mark buffers dirty here so that if do_split() fails we write a + * consistent set of buffers to disk. + */ + ext3_journal_dirty_metadata(handle, frame->bh); + ext3_journal_dirty_metadata(handle, bh); de = do_split(handle,dir, &bh, frame, &hinfo, &retval); - dx_release (frames); - if (!(de)) + if (!de) { + ext3_mark_inode_dirty(handle, dir); + dx_release(frames); return retval; + } + dx_release(frames); return add_dirent_to_buf(handle, dentry, inode, de, bh); } @@ -1469,9 +1458,9 @@ static int ext3_add_entry (handle_t *handle, struct dentry *dentry, } blocks = dir->i_size >> sb->s_blocksize_bits; for (block = 0; block < blocks; block++) { - bh = ext3_bread(handle, dir, block, 0, &retval); - if(!bh) + if (!(bh = ext3_dir_bread(handle, dir, block, 0, &retval))) return retval; + retval = add_dirent_to_buf(handle, dentry, inode, NULL, bh); if (retval != -ENOSPC) return retval; @@ -1511,7 +1500,7 @@ static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry, entries = frame->entries; at = frame->at; - if (!(bh = ext3_bread(handle,dir, dx_get_block(frame->at), 0, &err))) + if (!(bh = ext3_dir_bread(handle, dir, dx_get_block(frame->at), 0, &err))) goto cleanup; BUFFER_TRACE(bh, "get_write_access"); @@ -1549,8 +1538,8 @@ static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry, goto cleanup; node2 = (struct dx_node *)(bh2->b_data); entries2 = node2->entries; + memset(&node2->fake, 0, sizeof(struct fake_dirent)); node2->fake.rec_len = ext3_rec_len_to_disk(sb->s_blocksize); - node2->fake.inode = 0; BUFFER_TRACE(frame->bh, "get_write_access"); err = ext3_journal_get_write_access(handle, frame->bh); if (err) @@ -1607,7 +1596,9 @@ static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry, if (err) goto journal_error; } - ext3_journal_dirty_metadata(handle, frames[0].bh); + err = ext3_journal_dirty_metadata(handle, frames[0].bh); + if (err) + goto journal_error; } de = do_split(handle, dir, &bh, frame, &hinfo, &err); if (!de) @@ -1644,8 +1635,13 @@ static int ext3_delete_entry (handle_t *handle, if (!ext3_check_dir_entry("ext3_delete_entry", dir, de, bh, i)) return -EIO; if (de == de_del) { + int err; + BUFFER_TRACE(bh, "get_write_access"); - ext3_journal_get_write_access(handle, bh); + err = ext3_journal_get_write_access(handle, bh); + if (err) + goto journal_error; + if (pde) pde->rec_len = ext3_rec_len_to_disk( ext3_rec_len_from_disk(pde->rec_len) + @@ -1654,7 +1650,12 @@ static int ext3_delete_entry (handle_t *handle, de->inode = 0; dir->i_version++; BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); - ext3_journal_dirty_metadata(handle, bh); + err = ext3_journal_dirty_metadata(handle, bh); + if (err) { +journal_error: + ext3_std_error(dir->i_sb, err); + return err; + } return 0; } i += ext3_rec_len_from_disk(de->rec_len); @@ -1670,8 +1671,8 @@ static int ext3_add_nondir(handle_t *handle, int err = ext3_add_entry(handle, dentry, inode); if (!err) { ext3_mark_inode_dirty(handle, inode); - d_instantiate(dentry, inode); unlock_new_inode(inode); + d_instantiate(dentry, inode); return 0; } drop_nlink(inode); @@ -1688,8 +1689,8 @@ static int ext3_add_nondir(handle_t *handle, * If the create succeeds, we fill in the inode information * with d_instantiate(). */ -static int ext3_create (struct inode * dir, struct dentry * dentry, int mode, - struct nameidata *nd) +static int ext3_create (struct inode * dir, struct dentry * dentry, umode_t mode, + bool excl) { handle_t *handle; struct inode * inode; @@ -1707,7 +1708,7 @@ retry: if (IS_DIRSYNC(dir)) handle->h_sync = 1; - inode = ext3_new_inode (handle, dir, mode); + inode = ext3_new_inode (handle, dir, &dentry->d_name, mode); err = PTR_ERR(inode); if (!IS_ERR(inode)) { inode->i_op = &ext3_file_inode_operations; @@ -1722,7 +1723,7 @@ retry: } static int ext3_mknod (struct inode * dir, struct dentry *dentry, - int mode, dev_t rdev) + umode_t mode, dev_t rdev) { handle_t *handle; struct inode *inode; @@ -1743,7 +1744,7 @@ retry: if (IS_DIRSYNC(dir)) handle->h_sync = 1; - inode = ext3_new_inode (handle, dir, mode); + inode = ext3_new_inode (handle, dir, &dentry->d_name, mode); err = PTR_ERR(inode); if (!IS_ERR(inode)) { init_special_inode(inode, inode->i_mode, rdev); @@ -1758,11 +1759,49 @@ retry: return err; } -static int ext3_mkdir(struct inode * dir, struct dentry * dentry, int mode) +static int ext3_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) +{ + handle_t *handle; + struct inode *inode; + int err, retries = 0; + + dquot_initialize(dir); + +retry: + handle = ext3_journal_start(dir, EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb) + + 4 + EXT3_XATTR_TRANS_BLOCKS); + + if (IS_ERR(handle)) + return PTR_ERR(handle); + + inode = ext3_new_inode (handle, dir, NULL, mode); + err = PTR_ERR(inode); + if (!IS_ERR(inode)) { + inode->i_op = &ext3_file_inode_operations; + inode->i_fop = &ext3_file_operations; + ext3_set_aops(inode); + d_tmpfile(dentry, inode); + err = ext3_orphan_add(handle, inode); + if (err) + goto err_unlock_inode; + mark_inode_dirty(inode); + unlock_new_inode(inode); + } + ext3_journal_stop(handle); + if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries)) + goto retry; + return err; +err_unlock_inode: + ext3_journal_stop(handle); + unlock_new_inode(inode); + return err; +} + +static int ext3_mkdir(struct inode * dir, struct dentry * dentry, umode_t mode) { handle_t *handle; struct inode * inode; - struct buffer_head * dir_block; + struct buffer_head * dir_block = NULL; struct ext3_dir_entry_2 * de; int err, retries = 0; @@ -1781,7 +1820,7 @@ retry: if (IS_DIRSYNC(dir)) handle->h_sync = 1; - inode = ext3_new_inode (handle, dir, S_IFDIR | mode); + inode = ext3_new_inode (handle, dir, &dentry->d_name, S_IFDIR | mode); err = PTR_ERR(inode); if (IS_ERR(inode)) goto out_stop; @@ -1789,16 +1828,14 @@ retry: inode->i_op = &ext3_dir_inode_operations; inode->i_fop = &ext3_dir_operations; inode->i_size = EXT3_I(inode)->i_disksize = inode->i_sb->s_blocksize; - dir_block = ext3_bread (handle, inode, 0, 1, &err); - if (!dir_block) { - drop_nlink(inode); /* is this nlink == 0? */ - unlock_new_inode(inode); - ext3_mark_inode_dirty(handle, inode); - iput (inode); - goto out_stop; - } + if (!(dir_block = ext3_dir_bread(handle, inode, 0, 1, &err))) + goto out_clear_inode; + BUFFER_TRACE(dir_block, "get_write_access"); - ext3_journal_get_write_access(handle, dir_block); + err = ext3_journal_get_write_access(handle, dir_block); + if (err) + goto out_clear_inode; + de = (struct ext3_dir_entry_2 *) dir_block->b_data; de->inode = cpu_to_le32(inode->i_ino); de->name_len = 1; @@ -1812,14 +1849,19 @@ retry: de->name_len = 2; strcpy (de->name, ".."); ext3_set_de_type(dir->i_sb, de, S_IFDIR); - inode->i_nlink = 2; + set_nlink(inode, 2); BUFFER_TRACE(dir_block, "call ext3_journal_dirty_metadata"); - ext3_journal_dirty_metadata(handle, dir_block); - brelse (dir_block); - ext3_mark_inode_dirty(handle, inode); - err = ext3_add_entry (handle, dentry, inode); + err = ext3_journal_dirty_metadata(handle, dir_block); + if (err) + goto out_clear_inode; + + err = ext3_mark_inode_dirty(handle, inode); + if (!err) + err = ext3_add_entry (handle, dentry, inode); + if (err) { - inode->i_nlink = 0; +out_clear_inode: + clear_nlink(inode); unlock_new_inode(inode); ext3_mark_inode_dirty(handle, inode); iput (inode); @@ -1827,10 +1869,14 @@ retry: } inc_nlink(dir); ext3_update_dx_flag(dir); - ext3_mark_inode_dirty(handle, dir); - d_instantiate(dentry, inode); + err = ext3_mark_inode_dirty(handle, dir); + if (err) + goto out_clear_inode; + unlock_new_inode(inode); + d_instantiate(dentry, inode); out_stop: + brelse(dir_block); ext3_journal_stop(handle); if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries)) goto retry; @@ -1850,7 +1896,7 @@ static int empty_dir (struct inode * inode) sb = inode->i_sb; if (inode->i_size < EXT3_DIR_REC_LEN(1) + EXT3_DIR_REC_LEN(2) || - !(bh = ext3_bread (NULL, inode, 0, 0, &err))) { + !(bh = ext3_dir_bread(NULL, inode, 0, 0, &err))) { if (err) ext3_error(inode->i_sb, __func__, "error %d reading directory #%lu offset 0", @@ -1881,9 +1927,8 @@ static int empty_dir (struct inode * inode) (void *) de >= (void *) (bh->b_data+sb->s_blocksize)) { err = 0; brelse (bh); - bh = ext3_bread (NULL, inode, - offset >> EXT3_BLOCK_SIZE_BITS(sb), 0, &err); - if (!bh) { + if (!(bh = ext3_dir_bread (NULL, inode, + offset >> EXT3_BLOCK_SIZE_BITS(sb), 0, &err))) { if (err) ext3_error(sb, __func__, "error %d reading directory" @@ -2124,6 +2169,7 @@ static int ext3_unlink(struct inode * dir, struct dentry *dentry) struct ext3_dir_entry_2 * de; handle_t *handle; + trace_ext3_unlink_enter(dir, dentry); /* Initialize quotas before so that eventual writes go * in separate transaction */ dquot_initialize(dir); @@ -2151,7 +2197,7 @@ static int ext3_unlink(struct inode * dir, struct dentry *dentry) ext3_warning (inode->i_sb, "ext3_unlink", "Deleting nonexistent file (%lu), %d", inode->i_ino, inode->i_nlink); - inode->i_nlink = 1; + set_nlink(inode, 1); } retval = ext3_delete_entry(handle, dir, de, bh); if (retval) @@ -2169,6 +2215,7 @@ static int ext3_unlink(struct inode * dir, struct dentry *dentry) end_unlink: ext3_journal_stop(handle); brelse (bh); + trace_ext3_unlink_exit(dentry, retval); return retval; } @@ -2178,6 +2225,7 @@ static int ext3_symlink (struct inode * dir, handle_t *handle; struct inode * inode; int l, err, retries = 0; + int credits; l = strlen(symname)+1; if (l > dir->i_sb->s_blocksize) @@ -2185,36 +2233,78 @@ static int ext3_symlink (struct inode * dir, dquot_initialize(dir); + if (l > EXT3_N_BLOCKS * 4) { + /* + * For non-fast symlinks, we just allocate inode and put it on + * orphan list in the first transaction => we need bitmap, + * group descriptor, sb, inode block, quota blocks, and + * possibly selinux xattr blocks. + */ + credits = 4 + EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb) + + EXT3_XATTR_TRANS_BLOCKS; + } else { + /* + * Fast symlink. We have to add entry to directory + * (EXT3_DATA_TRANS_BLOCKS + EXT3_INDEX_EXTRA_TRANS_BLOCKS), + * allocate new inode (bitmap, group descriptor, inode block, + * quota blocks, sb is already counted in previous macros). + */ + credits = EXT3_DATA_TRANS_BLOCKS(dir->i_sb) + + EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3 + + EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb); + } retry: - handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) + - EXT3_INDEX_EXTRA_TRANS_BLOCKS + 5 + - EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb)); + handle = ext3_journal_start(dir, credits); if (IS_ERR(handle)) return PTR_ERR(handle); if (IS_DIRSYNC(dir)) handle->h_sync = 1; - inode = ext3_new_inode (handle, dir, S_IFLNK|S_IRWXUGO); + inode = ext3_new_inode (handle, dir, &dentry->d_name, S_IFLNK|S_IRWXUGO); err = PTR_ERR(inode); if (IS_ERR(inode)) goto out_stop; - if (l > sizeof (EXT3_I(inode)->i_data)) { + if (l > EXT3_N_BLOCKS * 4) { inode->i_op = &ext3_symlink_inode_operations; ext3_set_aops(inode); /* - * page_symlink() calls into ext3_prepare/commit_write. - * We have a transaction open. All is sweetness. It also sets - * i_size in generic_commit_write(). + * We cannot call page_symlink() with transaction started + * because it calls into ext3_write_begin() which acquires page + * lock which ranks below transaction start (and it can also + * wait for journal commit if we are running out of space). So + * we have to stop transaction now and restart it when symlink + * contents is written. + * + * To keep fs consistent in case of crash, we have to put inode + * to orphan list in the mean time. */ + drop_nlink(inode); + err = ext3_orphan_add(handle, inode); + ext3_journal_stop(handle); + if (err) + goto err_drop_inode; err = __page_symlink(inode, symname, l, 1); + if (err) + goto err_drop_inode; + /* + * Now inode is being linked into dir (EXT3_DATA_TRANS_BLOCKS + * + EXT3_INDEX_EXTRA_TRANS_BLOCKS), inode is also modified + */ + handle = ext3_journal_start(dir, + EXT3_DATA_TRANS_BLOCKS(dir->i_sb) + + EXT3_INDEX_EXTRA_TRANS_BLOCKS + 1); + if (IS_ERR(handle)) { + err = PTR_ERR(handle); + goto err_drop_inode; + } + set_nlink(inode, 1); + err = ext3_orphan_del(handle, inode); if (err) { + ext3_journal_stop(handle); drop_nlink(inode); - unlock_new_inode(inode); - ext3_mark_inode_dirty(handle, inode); - iput (inode); - goto out_stop; + goto err_drop_inode; } } else { inode->i_op = &ext3_fast_symlink_inode_operations; @@ -2228,6 +2318,10 @@ out_stop: if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries)) goto retry; return err; +err_drop_inode: + unlock_new_inode(inode); + iput(inode); + return err; } static int ext3_link (struct dentry * old_dentry, @@ -2242,16 +2336,9 @@ static int ext3_link (struct dentry * old_dentry, dquot_initialize(dir); - /* - * Return -ENOENT if we've raced with unlink and i_nlink is 0. Doing - * otherwise has the potential to corrupt the orphan inode list. - */ - if (inode->i_nlink == 0) - return -ENOENT; - retry: handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) + - EXT3_INDEX_EXTRA_TRANS_BLOCKS); + EXT3_INDEX_EXTRA_TRANS_BLOCKS + 1); if (IS_ERR(handle)) return PTR_ERR(handle); @@ -2265,6 +2352,11 @@ retry: err = ext3_add_entry(handle, dentry, inode); if (!err) { ext3_mark_inode_dirty(handle, inode); + /* this can happen only for tmpfile being + * linked the first time + */ + if (inode->i_nlink == 1) + ext3_orphan_del(handle, inode); d_instantiate(dentry, inode); } else { drop_nlink(inode); @@ -2337,7 +2429,7 @@ static int ext3_rename (struct inode * old_dir, struct dentry *old_dentry, goto end_rename; } retval = -EIO; - dir_bh = ext3_bread (handle, old_inode, 0, 0, &retval); + dir_bh = ext3_dir_bread(handle, old_inode, 0, 0, &retval); if (!dir_bh) goto end_rename; if (le32_to_cpu(PARENT_INO(dir_bh->b_data)) != old_dir->i_ino) @@ -2353,7 +2445,9 @@ static int ext3_rename (struct inode * old_dir, struct dentry *old_dentry, goto end_rename; } else { BUFFER_TRACE(new_bh, "get write access"); - ext3_journal_get_write_access(handle, new_bh); + retval = ext3_journal_get_write_access(handle, new_bh); + if (retval) + goto journal_error; new_de->inode = cpu_to_le32(old_inode->i_ino); if (EXT3_HAS_INCOMPAT_FEATURE(new_dir->i_sb, EXT3_FEATURE_INCOMPAT_FILETYPE)) @@ -2362,7 +2456,9 @@ static int ext3_rename (struct inode * old_dir, struct dentry *old_dentry, new_dir->i_ctime = new_dir->i_mtime = CURRENT_TIME_SEC; ext3_mark_inode_dirty(handle, new_dir); BUFFER_TRACE(new_bh, "call ext3_journal_dirty_metadata"); - ext3_journal_dirty_metadata(handle, new_bh); + retval = ext3_journal_dirty_metadata(handle, new_bh); + if (retval) + goto journal_error; brelse(new_bh); new_bh = NULL; } @@ -2411,10 +2507,17 @@ static int ext3_rename (struct inode * old_dir, struct dentry *old_dentry, ext3_update_dx_flag(old_dir); if (dir_bh) { BUFFER_TRACE(dir_bh, "get_write_access"); - ext3_journal_get_write_access(handle, dir_bh); + retval = ext3_journal_get_write_access(handle, dir_bh); + if (retval) + goto journal_error; PARENT_INO(dir_bh->b_data) = cpu_to_le32(new_dir->i_ino); BUFFER_TRACE(dir_bh, "call ext3_journal_dirty_metadata"); - ext3_journal_dirty_metadata(handle, dir_bh); + retval = ext3_journal_dirty_metadata(handle, dir_bh); + if (retval) { +journal_error: + ext3_std_error(new_dir->i_sb, retval); + goto end_rename; + } drop_nlink(old_dir); if (new_inode) { drop_nlink(new_inode); @@ -2456,6 +2559,7 @@ const struct inode_operations ext3_dir_inode_operations = { .mkdir = ext3_mkdir, .rmdir = ext3_rmdir, .mknod = ext3_mknod, + .tmpfile = ext3_tmpfile, .rename = ext3_rename, .setattr = ext3_setattr, #ifdef CONFIG_EXT3_FS_XATTR @@ -2464,7 +2568,8 @@ const struct inode_operations ext3_dir_inode_operations = { .listxattr = ext3_listxattr, .removexattr = generic_removexattr, #endif - .check_acl = ext3_check_acl, + .get_acl = ext3_get_acl, + .set_acl = ext3_set_acl, }; const struct inode_operations ext3_special_inode_operations = { @@ -2475,5 +2580,6 @@ const struct inode_operations ext3_special_inode_operations = { .listxattr = ext3_listxattr, .removexattr = generic_removexattr, #endif - .check_acl = ext3_check_acl, + .get_acl = ext3_get_acl, + .set_acl = ext3_set_acl, }; diff --git a/fs/ext3/namei.h b/fs/ext3/namei.h index f2ce2b0065c..46304d8c9f0 100644 --- a/fs/ext3/namei.h +++ b/fs/ext3/namei.h @@ -6,3 +6,22 @@ */ extern struct dentry *ext3_get_parent(struct dentry *child); + +static inline struct buffer_head *ext3_dir_bread(handle_t *handle, + struct inode *inode, + int block, int create, + int *err) +{ + struct buffer_head *bh; + + bh = ext3_bread(handle, inode, block, create, err); + + if (!bh && !(*err)) { + *err = -EIO; + ext3_error(inode->i_sb, __func__, + "Directory hole detected on inode %lu\n", + inode->i_ino); + return NULL; + } + return bh; +} diff --git a/fs/ext3/resize.c b/fs/ext3/resize.c index e746d30b123..27105655502 100644 --- a/fs/ext3/resize.c +++ b/fs/ext3/resize.c @@ -11,10 +11,7 @@ #define EXT3FS_DEBUG -#include <linux/ext3_jbd.h> - -#include <linux/errno.h> -#include <linux/slab.h> +#include "ext3.h" #define outside(b, first, last) ((b) < (first) || (b) >= (last)) @@ -119,8 +116,8 @@ static struct buffer_head *bclean(handle_t *handle, struct super_block *sb, int err; bh = sb_getblk(sb, blk); - if (!bh) - return ERR_PTR(-EIO); + if (unlikely(!bh)) + return ERR_PTR(-ENOMEM); if ((err = ext3_journal_get_write_access(handle, bh))) { brelse(bh); bh = ERR_PTR(err); @@ -237,8 +234,8 @@ static int setup_new_group_blocks(struct super_block *sb, goto exit_bh; gdb = sb_getblk(sb, block); - if (!gdb) { - err = -EIO; + if (unlikely(!gdb)) { + err = -ENOMEM; goto exit_bh; } if ((err = ext3_journal_get_write_access(handle, gdb))) { @@ -249,7 +246,11 @@ static int setup_new_group_blocks(struct super_block *sb, memcpy(gdb->b_data, sbi->s_group_desc[i]->b_data, gdb->b_size); set_buffer_uptodate(gdb); unlock_buffer(gdb); - ext3_journal_dirty_metadata(handle, gdb); + err = ext3_journal_dirty_metadata(handle, gdb); + if (err) { + brelse(gdb); + goto exit_bh; + } ext3_set_bit(bit, bh->b_data); brelse(gdb); } @@ -269,7 +270,11 @@ static int setup_new_group_blocks(struct super_block *sb, err = PTR_ERR(gdb); goto exit_bh; } - ext3_journal_dirty_metadata(handle, gdb); + err = ext3_journal_dirty_metadata(handle, gdb); + if (err) { + brelse(gdb); + goto exit_bh; + } ext3_set_bit(bit, bh->b_data); brelse(gdb); } @@ -295,7 +300,11 @@ static int setup_new_group_blocks(struct super_block *sb, err = PTR_ERR(it); goto exit_bh; } - ext3_journal_dirty_metadata(handle, it); + err = ext3_journal_dirty_metadata(handle, it); + if (err) { + brelse(it); + goto exit_bh; + } brelse(it); ext3_set_bit(bit, bh->b_data); } @@ -306,7 +315,9 @@ static int setup_new_group_blocks(struct super_block *sb, mark_bitmap_end(input->blocks_count, EXT3_BLOCKS_PER_GROUP(sb), bh->b_data); - ext3_journal_dirty_metadata(handle, bh); + err = ext3_journal_dirty_metadata(handle, bh); + if (err) + goto exit_bh; brelse(bh); /* Mark unused entries in inode bitmap used */ @@ -319,7 +330,7 @@ static int setup_new_group_blocks(struct super_block *sb, mark_bitmap_end(EXT3_INODES_PER_GROUP(sb), EXT3_BLOCKS_PER_GROUP(sb), bh->b_data); - ext3_journal_dirty_metadata(handle, bh); + err = ext3_journal_dirty_metadata(handle, bh); exit_bh: brelse(bh); @@ -503,12 +514,19 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, * reserved inode, and will become GDT blocks (primary and backup). */ data[gdb_num % EXT3_ADDR_PER_BLOCK(sb)] = 0; - ext3_journal_dirty_metadata(handle, dind); + err = ext3_journal_dirty_metadata(handle, dind); + if (err) + goto exit_group_desc; brelse(dind); + dind = NULL; inode->i_blocks -= (gdbackups + 1) * sb->s_blocksize >> 9; - ext3_mark_iloc_dirty(handle, inode, &iloc); + err = ext3_mark_iloc_dirty(handle, inode, &iloc); + if (err) + goto exit_group_desc; memset((*primary)->b_data, 0, sb->s_blocksize); - ext3_journal_dirty_metadata(handle, *primary); + err = ext3_journal_dirty_metadata(handle, *primary); + if (err) + goto exit_group_desc; o_group_desc = EXT3_SB(sb)->s_group_desc; memcpy(n_group_desc, o_group_desc, @@ -519,10 +537,14 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, kfree(o_group_desc); le16_add_cpu(&es->s_reserved_gdt_blocks, -1); - ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh); + err = ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh); + if (err) + goto exit_inode; return 0; +exit_group_desc: + kfree(n_group_desc); exit_inode: //ext3_journal_release_buffer(handle, iloc.bh); brelse(iloc.bh); @@ -700,22 +722,26 @@ static void update_backups(struct super_block *sb, break; bh = sb_getblk(sb, group * bpg + blk_off); - if (!bh) { - err = -EIO; + if (unlikely(!bh)) { + err = -ENOMEM; break; } ext3_debug("update metadata backup %#04lx\n", (unsigned long)bh->b_blocknr); - if ((err = ext3_journal_get_write_access(handle, bh))) + if ((err = ext3_journal_get_write_access(handle, bh))) { + brelse(bh); break; + } lock_buffer(bh); memcpy(bh->b_data, data, size); if (rest) memset(bh->b_data + size, 0, rest); set_buffer_uptodate(bh); unlock_buffer(bh); - ext3_journal_dirty_metadata(handle, bh); + err = ext3_journal_dirty_metadata(handle, bh); brelse(bh); + if (err) + break; } if ((err2 = ext3_journal_stop(handle)) && !err) err = err2; @@ -922,7 +948,9 @@ int ext3_group_add(struct super_block *sb, struct ext3_new_group_data *input) /* Update the global fs size fields */ sbi->s_groups_count++; - ext3_journal_dirty_metadata(handle, primary); + err = ext3_journal_dirty_metadata(handle, primary); + if (err) + goto exit_journal; /* Update the reserved block counts only once the new group is * active. */ @@ -934,7 +962,7 @@ int ext3_group_add(struct super_block *sb, struct ext3_new_group_data *input) percpu_counter_add(&sbi->s_freeinodes_counter, EXT3_INODES_PER_GROUP(sb)); - ext3_journal_dirty_metadata(handle, sbi->s_sbh); + err = ext3_journal_dirty_metadata(handle, sbi->s_sbh); exit_journal: mutex_unlock(&sbi->s_resize_lock); @@ -978,7 +1006,7 @@ int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es, if (test_opt(sb, DEBUG)) printk(KERN_DEBUG "EXT3-fs: extending last group from "E3FSBLK - " upto "E3FSBLK" blocks\n", + " up to "E3FSBLK" blocks\n", o_blocks_count, n_blocks_count); if (n_blocks_count == 0 || n_blocks_count == o_blocks_count) @@ -1064,8 +1092,14 @@ int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es, goto exit_put; } es->s_blocks_count = cpu_to_le32(o_blocks_count + add); - ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh); + err = ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh); mutex_unlock(&EXT3_SB(sb)->s_resize_lock); + if (err) { + ext3_warning(sb, __func__, + "error %d on journal dirty metadata", err); + ext3_journal_stop(handle); + goto exit_put; + } ext3_debug("freeing blocks "E3FSBLK" through "E3FSBLK"\n", o_blocks_count, o_blocks_count + add); ext3_free_blocks_sb(handle, sb, o_blocks_count, add, &freed_blocks); diff --git a/fs/ext3/super.c b/fs/ext3/super.c index 2fedaf8b501..08cdfe5461e 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -17,29 +17,23 @@ */ #include <linux/module.h> -#include <linux/string.h> -#include <linux/fs.h> -#include <linux/time.h> -#include <linux/jbd.h> -#include <linux/ext3_fs.h> -#include <linux/ext3_jbd.h> -#include <linux/slab.h> -#include <linux/init.h> #include <linux/blkdev.h> #include <linux/parser.h> -#include <linux/smp_lock.h> -#include <linux/buffer_head.h> #include <linux/exportfs.h> -#include <linux/vfs.h> +#include <linux/statfs.h> #include <linux/random.h> #include <linux/mount.h> -#include <linux/namei.h> #include <linux/quotaops.h> #include <linux/seq_file.h> #include <linux/log2.h> +#include <linux/cleancache.h> +#include <linux/namei.h> #include <asm/uaccess.h> +#define CREATE_TRACE_POINTS + +#include "ext3.h" #include "xattr.h" #include "acl.h" #include "namei.h" @@ -71,11 +65,6 @@ static int ext3_freeze(struct super_block *sb); /* * Wrappers for journal_start/end. - * - * The only special thing we need to do here is to make sure that all - * journal_end calls result in the superblock being marked dirty, so - * that sync() will call the filesystem's write_super callback if - * appropriate. */ handle_t *ext3_journal_start_sb(struct super_block *sb, int nblocks) { @@ -97,12 +86,6 @@ handle_t *ext3_journal_start_sb(struct super_block *sb, int nblocks) return journal_start(journal, nblocks); } -/* - * The only special thing we need to do here is to make sure that all - * journal_stop calls result in the superblock being marked dirty, so - * that sync() will call the filesystem's write_super callback if - * appropriate. - */ int __ext3_journal_stop(const char *where, handle_t *handle) { struct super_block *sb; @@ -144,12 +127,16 @@ void ext3_journal_abort_handle(const char *caller, const char *err_fn, void ext3_msg(struct super_block *sb, const char *prefix, const char *fmt, ...) { + struct va_format vaf; va_list args; va_start(args, fmt); - printk("%sEXT3-fs (%s): ", prefix, sb->s_id); - vprintk(fmt, args); - printk("\n"); + + vaf.fmt = fmt; + vaf.va = &args; + + printk("%sEXT3-fs (%s): %pV\n", prefix, sb->s_id, &vaf); + va_end(args); } @@ -188,6 +175,11 @@ static void ext3_handle_error(struct super_block *sb) if (test_opt (sb, ERRORS_RO)) { ext3_msg(sb, KERN_CRIT, "error: remounting filesystem read-only"); + /* + * Make sure updated value of ->s_mount_state will be visible + * before ->s_flags update. + */ + smp_wmb(); sb->s_flags |= MS_RDONLY; } ext3_commit_super(sb, es, 1); @@ -196,15 +188,20 @@ static void ext3_handle_error(struct super_block *sb) sb->s_id); } -void ext3_error (struct super_block * sb, const char * function, - const char * fmt, ...) +void ext3_error(struct super_block *sb, const char *function, + const char *fmt, ...) { + struct va_format vaf; va_list args; va_start(args, fmt); - printk(KERN_CRIT "EXT3-fs error (device %s): %s: ",sb->s_id, function); - vprintk(fmt, args); - printk("\n"); + + vaf.fmt = fmt; + vaf.va = &args; + + printk(KERN_CRIT "EXT3-fs error (device %s): %s: %pV\n", + sb->s_id, function, &vaf); + va_end(args); ext3_handle_error(sb); @@ -275,15 +272,20 @@ void __ext3_std_error (struct super_block * sb, const char * function, * case we take the easy way out and panic immediately. */ -void ext3_abort (struct super_block * sb, const char * function, - const char * fmt, ...) +void ext3_abort(struct super_block *sb, const char *function, + const char *fmt, ...) { + struct va_format vaf; va_list args; va_start(args, fmt); - printk(KERN_CRIT "EXT3-fs (%s): error: %s: ", sb->s_id, function); - vprintk(fmt, args); - printk("\n"); + + vaf.fmt = fmt; + vaf.va = &args; + + printk(KERN_CRIT "EXT3-fs (%s): error: %s: %pV\n", + sb->s_id, function, &vaf); + va_end(args); if (test_opt(sb, ERRORS_PANIC)) @@ -295,22 +297,32 @@ void ext3_abort (struct super_block * sb, const char * function, ext3_msg(sb, KERN_CRIT, "error: remounting filesystem read-only"); EXT3_SB(sb)->s_mount_state |= EXT3_ERROR_FS; - sb->s_flags |= MS_RDONLY; set_opt(EXT3_SB(sb)->s_mount_opt, ABORT); + /* + * Make sure updated value of ->s_mount_state will be visible + * before ->s_flags update. + */ + smp_wmb(); + sb->s_flags |= MS_RDONLY; + if (EXT3_SB(sb)->s_journal) journal_abort(EXT3_SB(sb)->s_journal, -EIO); } -void ext3_warning (struct super_block * sb, const char * function, - const char * fmt, ...) +void ext3_warning(struct super_block *sb, const char *function, + const char *fmt, ...) { + struct va_format vaf; va_list args; va_start(args, fmt); - printk(KERN_WARNING "EXT3-fs (%s): warning: %s: ", - sb->s_id, function); - vprintk(fmt, args); - printk("\n"); + + vaf.fmt = fmt; + vaf.va = &args; + + printk(KERN_WARNING "EXT3-fs (%s): warning: %s: %pV\n", + sb->s_id, function, &vaf); + va_end(args); } @@ -347,13 +359,13 @@ static struct block_device *ext3_blkdev_get(dev_t dev, struct super_block *sb) struct block_device *bdev; char b[BDEVNAME_SIZE]; - bdev = open_by_devnum(dev, FMODE_READ|FMODE_WRITE); + bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL, sb); if (IS_ERR(bdev)) goto fail; return bdev; fail: - ext3_msg(sb, "error: failed to open journal device %s: %ld", + ext3_msg(sb, KERN_ERR, "error: failed to open journal device %s: %ld", __bdevname(dev, b), PTR_ERR(bdev)); return NULL; @@ -362,23 +374,19 @@ fail: /* * Release the journal device */ -static int ext3_blkdev_put(struct block_device *bdev) +static void ext3_blkdev_put(struct block_device *bdev) { - bd_release(bdev); - return blkdev_put(bdev, FMODE_READ|FMODE_WRITE); + blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); } -static int ext3_blkdev_remove(struct ext3_sb_info *sbi) +static void ext3_blkdev_remove(struct ext3_sb_info *sbi) { struct block_device *bdev; - int ret = -ENODEV; - bdev = sbi->journal_bdev; if (bdev) { - ret = ext3_blkdev_put(bdev); + ext3_blkdev_put(bdev); sbi->journal_bdev = NULL; } - return ret; } static inline struct inode *orphan_list_entry(struct list_head *l) @@ -480,6 +488,20 @@ static struct inode *ext3_alloc_inode(struct super_block *sb) return &ei->vfs_inode; } +static int ext3_drop_inode(struct inode *inode) +{ + int drop = generic_drop_inode(inode); + + trace_ext3_drop_inode(inode, drop); + return drop; +} + +static void ext3_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + kmem_cache_free(ext3_inode_cachep, EXT3_I(inode)); +} + static void ext3_destroy_inode(struct inode *inode) { if (!list_empty(&(EXT3_I(inode)->i_orphan))) { @@ -490,7 +512,7 @@ static void ext3_destroy_inode(struct inode *inode) false); dump_stack(); } - kmem_cache_free(ext3_inode_cachep, EXT3_I(inode)); + call_rcu(&inode->i_rcu, ext3_i_callback); } static void init_once(void *foo) @@ -505,7 +527,7 @@ static void init_once(void *foo) inode_init_once(&ei->vfs_inode); } -static int init_inodecache(void) +static int __init init_inodecache(void) { ext3_inode_cachep = kmem_cache_create("ext3_inode_cache", sizeof(struct ext3_inode_info), @@ -519,6 +541,11 @@ static int init_inodecache(void) static void destroy_inodecache(void) { + /* + * Make sure all delayed rcu free inodes are flushed before we + * destroy cache. + */ + rcu_barrier(); kmem_cache_destroy(ext3_inode_cachep); } @@ -576,9 +603,9 @@ static char *data_mode_string(unsigned long mode) * - it's set to a non-default value OR * - if the per-sb default is different from the global default */ -static int ext3_show_options(struct seq_file *seq, struct vfsmount *vfs) +static int ext3_show_options(struct seq_file *seq, struct dentry *root) { - struct super_block *sb = vfs->mnt_sb; + struct super_block *sb = root->d_sb; struct ext3_sb_info *sbi = EXT3_SB(sb); struct ext3_super_block *es = sbi->s_es; unsigned long def_mount_opts; @@ -593,13 +620,15 @@ static int ext3_show_options(struct seq_file *seq, struct vfsmount *vfs) seq_puts(seq, ",grpid"); if (!test_opt(sb, GRPID) && (def_mount_opts & EXT3_DEFM_BSDGROUPS)) seq_puts(seq, ",nogrpid"); - if (sbi->s_resuid != EXT3_DEF_RESUID || + if (!uid_eq(sbi->s_resuid, make_kuid(&init_user_ns, EXT3_DEF_RESUID)) || le16_to_cpu(es->s_def_resuid) != EXT3_DEF_RESUID) { - seq_printf(seq, ",resuid=%u", sbi->s_resuid); + seq_printf(seq, ",resuid=%u", + from_kuid_munged(&init_user_ns, sbi->s_resuid)); } - if (sbi->s_resgid != EXT3_DEF_RESGID || + if (!gid_eq(sbi->s_resgid, make_kgid(&init_user_ns, EXT3_DEF_RESGID)) || le16_to_cpu(es->s_def_resgid) != EXT3_DEF_RESGID) { - seq_printf(seq, ",resgid=%u", sbi->s_resgid); + seq_printf(seq, ",resgid=%u", + from_kgid_munged(&init_user_ns, sbi->s_resgid)); } if (test_opt(sb, ERRORS_RO)) { int def_errors = le16_to_cpu(es->s_errors); @@ -617,8 +646,6 @@ static int ext3_show_options(struct seq_file *seq, struct vfsmount *vfs) seq_puts(seq, ",nouid32"); if (test_opt(sb, DEBUG)) seq_puts(seq, ",debug"); - if (test_opt(sb, OLDALLOC)) - seq_puts(seq, ",oldalloc"); #ifdef CONFIG_EXT3_FS_XATTR if (test_opt(sb, XATTR_USER)) seq_puts(seq, ",user_xattr"); @@ -731,7 +758,7 @@ static int ext3_release_dquot(struct dquot *dquot); static int ext3_mark_dquot_dirty(struct dquot *dquot); static int ext3_write_info(struct super_block *sb, int type); static int ext3_quota_on(struct super_block *sb, int type, int format_id, - char *path); + struct path *path); static int ext3_quota_on_mount(struct super_block *sb, int type); static ssize_t ext3_quota_read(struct super_block *sb, int type, char *data, size_t len, loff_t off); @@ -764,6 +791,7 @@ static const struct super_operations ext3_sops = { .destroy_inode = ext3_destroy_inode, .write_inode = ext3_write_inode, .dirty_inode = ext3_dirty_inode, + .drop_inode = ext3_drop_inode, .evict_inode = ext3_evict_inode, .put_super = ext3_put_super, .sync_fs = ext3_sync_fs, @@ -792,6 +820,7 @@ enum { Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl, Opt_reservation, Opt_noreservation, Opt_noload, Opt_nobh, Opt_bh, Opt_commit, Opt_journal_update, Opt_journal_inum, Opt_journal_dev, + Opt_journal_path, Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, Opt_data_err_abort, Opt_data_err_ignore, Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, @@ -833,6 +862,7 @@ static const match_table_t tokens = { {Opt_journal_update, "journal=update"}, {Opt_journal_inum, "journal=%u"}, {Opt_journal_dev, "journal_dev=%u"}, + {Opt_journal_path, "journal_path=%s"}, {Opt_abort, "abort"}, {Opt_data_journal, "data=journal"}, {Opt_data_ordered, "data=ordered"}, @@ -868,7 +898,7 @@ static ext3_fsblk_t get_sb_block(void **data, struct super_block *sb) /*todo: use simple_strtoll with >32bit ext3 */ sb_block = simple_strtoul(options, &options, 0); if (*options && *options != ',') { - ext3_msg(sb, "error: invalid sb specification: %s", + ext3_msg(sb, KERN_ERR, "error: invalid sb specification: %s", (char *) *data); return 1; } @@ -897,21 +927,24 @@ static int set_qf_name(struct super_block *sb, int qtype, substring_t *args) "Not enough memory for storing quotafile name"); return 0; } - if (sbi->s_qf_names[qtype] && - strcmp(sbi->s_qf_names[qtype], qname)) { - ext3_msg(sb, KERN_ERR, - "%s quota file already specified", QTYPE2NAME(qtype)); + if (sbi->s_qf_names[qtype]) { + int same = !strcmp(sbi->s_qf_names[qtype], qname); + kfree(qname); - return 0; + if (!same) { + ext3_msg(sb, KERN_ERR, + "%s quota file already specified", + QTYPE2NAME(qtype)); + } + return same; } - sbi->s_qf_names[qtype] = qname; - if (strchr(sbi->s_qf_names[qtype], '/')) { + if (strchr(qname, '/')) { ext3_msg(sb, KERN_ERR, "quotafile must be on filesystem root"); - kfree(sbi->s_qf_names[qtype]); - sbi->s_qf_names[qtype] = NULL; + kfree(qname); return 0; } + sbi->s_qf_names[qtype] = qname; set_opt(sbi->s_mount_opt, QUOTA); return 1; } @@ -926,11 +959,10 @@ static int clear_qf_name(struct super_block *sb, int qtype) { " when quota turned on"); return 0; } - /* - * The space will be released later when all options are confirmed - * to be correct - */ - sbi->s_qf_names[qtype] = NULL; + if (sbi->s_qf_names[qtype]) { + kfree(sbi->s_qf_names[qtype]); + sbi->s_qf_names[qtype] = NULL; + } return 1; } #endif @@ -944,6 +976,13 @@ static int parse_options (char *options, struct super_block *sb, substring_t args[MAX_OPT_ARGS]; int data_opt = 0; int option; + kuid_t uid; + kgid_t gid; + char *journal_path; + struct inode *journal_inode; + struct path path; + int error; + #ifdef CONFIG_QUOTA int qfmt; #endif @@ -959,7 +998,7 @@ static int parse_options (char *options, struct super_block *sb, * Initialize args struct so we know whether arg was * found; some options take optional arguments. */ - args[0].to = args[0].from = 0; + args[0].to = args[0].from = NULL; token = match_token(p, tokens, args); switch (token) { case Opt_bsd_df: @@ -977,12 +1016,23 @@ static int parse_options (char *options, struct super_block *sb, case Opt_resuid: if (match_int(&args[0], &option)) return 0; - sbi->s_resuid = option; + uid = make_kuid(current_user_ns(), option); + if (!uid_valid(uid)) { + ext3_msg(sb, KERN_ERR, "Invalid uid value %d", option); + return 0; + + } + sbi->s_resuid = uid; break; case Opt_resgid: if (match_int(&args[0], &option)) return 0; - sbi->s_resgid = option; + gid = make_kgid(current_user_ns(), option); + if (!gid_valid(gid)) { + ext3_msg(sb, KERN_ERR, "Invalid gid value %d", option); + return 0; + } + sbi->s_resgid = gid; break; case Opt_sb: /* handled by get_sb_block() instead of here */ @@ -1013,10 +1063,12 @@ static int parse_options (char *options, struct super_block *sb, set_opt (sbi->s_mount_opt, DEBUG); break; case Opt_oldalloc: - set_opt (sbi->s_mount_opt, OLDALLOC); + ext3_msg(sb, KERN_WARNING, + "Ignoring deprecated oldalloc option"); break; case Opt_orlov: - clear_opt (sbi->s_mount_opt, OLDALLOC); + ext3_msg(sb, KERN_WARNING, + "Ignoring deprecated orlov option"); break; #ifdef CONFIG_EXT3_FS_XATTR case Opt_user_xattr: @@ -1085,6 +1137,41 @@ static int parse_options (char *options, struct super_block *sb, return 0; *journal_devnum = option; break; + case Opt_journal_path: + if (is_remount) { + ext3_msg(sb, KERN_ERR, "error: cannot specify " + "journal on remount"); + return 0; + } + + journal_path = match_strdup(&args[0]); + if (!journal_path) { + ext3_msg(sb, KERN_ERR, "error: could not dup " + "journal device string"); + return 0; + } + + error = kern_path(journal_path, LOOKUP_FOLLOW, &path); + if (error) { + ext3_msg(sb, KERN_ERR, "error: could not find " + "journal device path: error %d", error); + kfree(journal_path); + return 0; + } + + journal_inode = path.dentry->d_inode; + if (!S_ISBLK(journal_inode->i_mode)) { + ext3_msg(sb, KERN_ERR, "error: journal path %s " + "is not a block device", journal_path); + path_put(&path); + kfree(journal_path); + return 0; + } + + *journal_devnum = new_encode_dev(journal_inode->i_rdev); + path_put(&path); + kfree(journal_path); + break; case Opt_noload: set_opt (sbi->s_mount_opt, NOLOAD); break; @@ -1344,6 +1431,7 @@ static int ext3_setup_super(struct super_block *sb, struct ext3_super_block *es, } else { ext3_msg(sb, KERN_INFO, "using internal journal"); } + cleancache_init_fs(sb); return res; } @@ -1441,11 +1529,20 @@ static void ext3_orphan_cleanup (struct super_block * sb, return; } + /* Check if feature set allows readwrite operations */ + if (EXT3_HAS_RO_COMPAT_FEATURE(sb, ~EXT3_FEATURE_RO_COMPAT_SUPP)) { + ext3_msg(sb, KERN_INFO, "Skipping orphan cleanup due to " + "unknown ROCOMPAT features"); + return; + } + if (EXT3_SB(sb)->s_mount_state & EXT3_ERROR_FS) { - if (es->s_last_orphan) + /* don't clear list on RO mount w/ errors */ + if (es->s_last_orphan && !(s_flags & MS_RDONLY)) { jbd_debug(1, "Errors on filesystem, " "clearing orphan list.\n"); - es->s_last_orphan = 0; + es->s_last_orphan = 0; + } jbd_debug(1, "Skipping orphan recovery on fs with errors.\n"); return; } @@ -1617,9 +1714,6 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) return -ENOMEM; } sb->s_fs_info = sbi; - sbi->s_mount_opt = 0; - sbi->s_resuid = EXT3_DEF_RESUID; - sbi->s_resgid = EXT3_DEF_RESGID; sbi->s_sb_block = sb_block; blocksize = sb_min_blocksize(sb, EXT3_MIN_BLOCK_SIZE); @@ -1683,9 +1777,11 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) else set_opt(sbi->s_mount_opt, ERRORS_RO); - sbi->s_resuid = le16_to_cpu(es->s_def_resuid); - sbi->s_resgid = le16_to_cpu(es->s_def_resgid); + sbi->s_resuid = make_kuid(&init_user_ns, le16_to_cpu(es->s_def_resuid)); + sbi->s_resgid = make_kgid(&init_user_ns, le16_to_cpu(es->s_def_resgid)); + /* enable barriers by default */ + set_opt(sbi->s_mount_opt, BARRIER); set_opt(sbi->s_mount_opt, RESERVATION); if (!parse_options ((char *) data, sb, &journal_inum, &journal_devnum, @@ -1842,13 +1938,15 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) goto failed_mount; } - if (generic_check_addressable(sb->s_blocksize_bits, - le32_to_cpu(es->s_blocks_count))) { + err = generic_check_addressable(sb->s_blocksize_bits, + le32_to_cpu(es->s_blocks_count)); + if (err) { ext3_msg(sb, KERN_ERR, "error: filesystem is too large to mount safely"); if (sizeof(sector_t) < 8) ext3_msg(sb, KERN_ERR, "error: CONFIG_LBDAF not enabled"); + ret = err; goto failed_mount; } @@ -1911,6 +2009,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) sb->s_qcop = &ext3_qctl_operations; sb->dq_op = &ext3_quota_operations; #endif + memcpy(sb->s_uuid, es->s_uuid, sizeof(es->s_uuid)); INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */ mutex_init(&sbi->s_orphan_lock); mutex_init(&sbi->s_resize_lock); @@ -1998,22 +2097,23 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) ext3_msg(sb, KERN_ERR, "error: corrupt root inode, run e2fsck"); goto failed_mount3; } - sb->s_root = d_alloc_root(root); + sb->s_root = d_make_root(root); if (!sb->s_root) { ext3_msg(sb, KERN_ERR, "error: get root dentry failed"); - iput(root); ret = -ENOMEM; goto failed_mount3; } - ext3_setup_super (sb, es, sb->s_flags & MS_RDONLY); + if (ext3_setup_super(sb, es, sb->s_flags & MS_RDONLY)) + sb->s_flags |= MS_RDONLY; EXT3_SB(sb)->s_mount_state |= EXT3_ORPHAN_FS; ext3_orphan_cleanup(sb, es); EXT3_SB(sb)->s_mount_state &= ~EXT3_ORPHAN_FS; - if (needs_recovery) + if (needs_recovery) { + ext3_mark_recovery_complete(sb, es); ext3_msg(sb, KERN_INFO, "recovery complete"); - ext3_mark_recovery_complete(sb, es); + } ext3_msg(sb, KERN_INFO, "mounted filesystem with %s data mode", test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_JOURNAL_DATA ? "journal": test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_ORDERED_DATA ? "ordered": @@ -2136,13 +2236,6 @@ static journal_t *ext3_get_dev_journal(struct super_block *sb, if (bdev == NULL) return NULL; - if (bd_claim(bdev, sb)) { - ext3_msg(sb, KERN_ERR, - "error: failed to claim external journal device"); - blkdev_put(bdev, FMODE_READ|FMODE_WRITE); - return NULL; - } - blocksize = sb->s_blocksize; hblock = bdev_logical_block_size(bdev); if (blocksize < hblock) { @@ -2188,11 +2281,11 @@ static journal_t *ext3_get_dev_journal(struct super_block *sb, goto out_bdev; } journal->j_private = sb; - ll_rw_block(READ, 1, &journal->j_sb_buffer); - wait_on_buffer(journal->j_sb_buffer); - if (!buffer_uptodate(journal->j_sb_buffer)) { - ext3_msg(sb, KERN_ERR, "I/O error on journal device"); - goto out_journal; + if (!bh_uptodate_or_lock(journal->j_sb_buffer)) { + if (bh_submit_read(journal->j_sb_buffer)) { + ext3_msg(sb, KERN_ERR, "I/O error on journal device"); + goto out_journal; + } } if (be32_to_cpu(journal->j_superblock->s_nr_users) != 1) { ext3_msg(sb, KERN_ERR, @@ -2291,7 +2384,7 @@ static int ext3_load_journal(struct super_block *sb, EXT3_SB(sb)->s_journal = journal; ext3_clear_journal_err(sb, es); - if (journal_devnum && + if (!really_read_only && journal_devnum && journal_devnum != le32_to_cpu(es->s_journal_dev)) { es->s_journal_dev = cpu_to_le32(journal_devnum); @@ -2479,6 +2572,12 @@ static int ext3_sync_fs(struct super_block *sb, int wait) { tid_t target; + trace_ext3_sync_fs(sb, wait); + /* + * Writeback quota in non-journalled quota case - journalled quota has + * no dirty dquots + */ + dquot_writeback_dquots(sb, -1); if (journal_start_commit(EXT3_SB(sb)->s_journal, &target)) { if (wait) log_wait_commit(EXT3_SB(sb)->s_journal, target); @@ -2529,11 +2628,9 @@ out: static int ext3_unfreeze(struct super_block *sb) { if (!(sb->s_flags & MS_RDONLY)) { - lock_super(sb); /* Reser the needs_recovery flag before the fs is unlocked. */ EXT3_SET_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER); ext3_commit_super(sb, EXT3_SB(sb)->s_es, 1); - unlock_super(sb); journal_unlock_updates(EXT3_SB(sb)->s_journal); } return 0; @@ -2552,8 +2649,9 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data) int i; #endif + sync_filesystem(sb); + /* Store the original options */ - lock_super(sb); old_sb_flags = sb->s_flags; old_opts.s_mount_opt = sbi->s_mount_opt; old_opts.s_resuid = sbi->s_resuid; @@ -2562,7 +2660,18 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data) #ifdef CONFIG_QUOTA old_opts.s_jquota_fmt = sbi->s_jquota_fmt; for (i = 0; i < MAXQUOTAS; i++) - old_opts.s_qf_names[i] = sbi->s_qf_names[i]; + if (sbi->s_qf_names[i]) { + old_opts.s_qf_names[i] = kstrdup(sbi->s_qf_names[i], + GFP_KERNEL); + if (!old_opts.s_qf_names[i]) { + int j; + + for (j = 0; j < i; j++) + kfree(old_opts.s_qf_names[j]); + return -ENOMEM; + } + } else + old_opts.s_qf_names[i] = NULL; #endif /* @@ -2626,13 +2735,13 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data) /* * If we have an unprocessed orphan list hanging * around from a previously readonly bdev mount, - * require a full umount/remount for now. + * require a full umount & mount for now. */ if (es->s_last_orphan) { ext3_msg(sb, KERN_WARNING, "warning: couldn't " "remount RDWR because of unprocessed " "orphan inode list. Please " - "umount/remount instead."); + "umount & mount instead."); err = -EINVAL; goto restore_opts; } @@ -2655,12 +2764,8 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data) #ifdef CONFIG_QUOTA /* Release old quota file names */ for (i = 0; i < MAXQUOTAS; i++) - if (old_opts.s_qf_names[i] && - old_opts.s_qf_names[i] != sbi->s_qf_names[i]) - kfree(old_opts.s_qf_names[i]); + kfree(old_opts.s_qf_names[i]); #endif - unlock_super(sb); - if (enable_quota) dquot_resume(sb, -1); return 0; @@ -2673,13 +2778,10 @@ restore_opts: #ifdef CONFIG_QUOTA sbi->s_jquota_fmt = old_opts.s_jquota_fmt; for (i = 0; i < MAXQUOTAS; i++) { - if (sbi->s_qf_names[i] && - old_opts.s_qf_names[i] != sbi->s_qf_names[i]) - kfree(sbi->s_qf_names[i]); + kfree(sbi->s_qf_names[i]); sbi->s_qf_names[i] = old_opts.s_qf_names[i]; } #endif - unlock_super(sb); return err; } @@ -2725,6 +2827,10 @@ static int ext3_statfs (struct dentry * dentry, struct kstatfs * buf) * bitmap, and an inode table. */ overhead += ngroups * (2 + sbi->s_itb_per_group); + + /* Add the journal blocks as well */ + overhead += sbi->s_journal->j_maxlen; + sbi->s_overhead_last = overhead; smp_wmb(); sbi->s_blocks_last = le32_to_cpu(es->s_blocks_count); @@ -2761,7 +2867,7 @@ static int ext3_statfs (struct dentry * dentry, struct kstatfs * buf) static inline struct inode *dquot_to_inode(struct dquot *dquot) { - return sb_dqopt(dquot->dq_sb)->files[dquot->dq_type]; + return sb_dqopt(dquot->dq_sb)->files[dquot->dq_id.type]; } static int ext3_write_dquot(struct dquot *dquot) @@ -2859,27 +2965,20 @@ static int ext3_quota_on_mount(struct super_block *sb, int type) * Standard function to be called on quota_on */ static int ext3_quota_on(struct super_block *sb, int type, int format_id, - char *name) + struct path *path) { int err; - struct path path; if (!test_opt(sb, QUOTA)) return -EINVAL; - err = kern_path(name, LOOKUP_FOLLOW, &path); - if (err) - return err; - /* Quotafile not on the same filesystem? */ - if (path.mnt->mnt_sb != sb) { - path_put(&path); + if (path->dentry->d_sb != sb) return -EXDEV; - } /* Journaling quota? */ if (EXT3_SB(sb)->s_qf_names[type]) { /* Quotafile not of fs root? */ - if (path.dentry->d_parent != sb->s_root) + if (path->dentry->d_parent != sb->s_root) ext3_msg(sb, KERN_WARNING, "warning: Quota file not on filesystem root. " "Journaled quota will not work."); @@ -2889,7 +2988,7 @@ static int ext3_quota_on(struct super_block *sb, int type, int format_id, * When we journal data on quota file, we have to flush journal to see * all updates to the file when we bypass pagecache... */ - if (ext3_should_journal_data(path.dentry->d_inode)) { + if (ext3_should_journal_data(path->dentry->d_inode)) { /* * We don't need to lock updates but journal_flush() could * otherwise be livelocked... @@ -2897,20 +2996,16 @@ static int ext3_quota_on(struct super_block *sb, int type, int format_id, journal_lock_updates(EXT3_SB(sb)->s_journal); err = journal_flush(EXT3_SB(sb)->s_journal); journal_unlock_updates(EXT3_SB(sb)->s_journal); - if (err) { - path_put(&path); + if (err) return err; - } } - err = dquot_quota_on_path(sb, type, format_id, &path); - path_put(&path); - return err; + return dquot_quota_on(sb, type, format_id, path); } /* Read data from quotafile - avoid pagecache and such because we cannot afford * acquiring the locks... As quota files are never truncated and quota code - * itself serializes the operations (and noone else should touch the files) + * itself serializes the operations (and no one else should touch the files) * we don't have to be afraid of races */ static ssize_t ext3_quota_read(struct super_block *sb, int type, char *data, size_t len, loff_t off) @@ -2979,7 +3074,6 @@ static ssize_t ext3_quota_write(struct super_block *sb, int type, (unsigned long long)off, (unsigned long long)len); return -EIO; } - mutex_lock_nested(&inode->i_mutex, I_MUTEX_QUOTA); bh = ext3_bread(handle, inode, blk, 1, &err); if (!bh) goto out; @@ -3003,10 +3097,8 @@ static ssize_t ext3_quota_write(struct super_block *sb, int type, } brelse(bh); out: - if (err) { - mutex_unlock(&inode->i_mutex); + if (err) return err; - } if (inode->i_size < off + len) { i_size_write(inode, off + len); EXT3_I(inode)->i_disksize = inode->i_size; @@ -3014,7 +3106,6 @@ out: inode->i_version++; inode->i_mtime = inode->i_ctime = CURRENT_TIME; ext3_mark_inode_dirty(handle, inode); - mutex_unlock(&inode->i_mutex); return len; } @@ -3033,6 +3124,7 @@ static struct file_system_type ext3_fs_type = { .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; +MODULE_ALIAS_FS("ext3"); static int __init init_ext3_fs(void) { diff --git a/fs/ext3/symlink.c b/fs/ext3/symlink.c index 7c489820777..6b01c3eab1f 100644 --- a/fs/ext3/symlink.c +++ b/fs/ext3/symlink.c @@ -17,10 +17,8 @@ * ext3 symlink handling code */ -#include <linux/fs.h> -#include <linux/jbd.h> -#include <linux/ext3_fs.h> #include <linux/namei.h> +#include "ext3.h" #include "xattr.h" static void * ext3_follow_link(struct dentry *dentry, struct nameidata *nd) diff --git a/fs/ext3/xattr.c b/fs/ext3/xattr.c index e69dc6dfaa8..c6874be6d58 100644 --- a/fs/ext3/xattr.c +++ b/fs/ext3/xattr.c @@ -50,14 +50,9 @@ * by the buffer lock. */ -#include <linux/init.h> -#include <linux/fs.h> -#include <linux/slab.h> -#include <linux/ext3_jbd.h> -#include <linux/ext3_fs.h> +#include "ext3.h" #include <linux/mbcache.h> #include <linux/quotaops.h> -#include <linux/rwsem.h> #include "xattr.h" #include "acl.h" @@ -107,8 +102,8 @@ static struct mb_cache *ext3_xattr_cache; static const struct xattr_handler *ext3_xattr_handler_map[] = { [EXT3_XATTR_INDEX_USER] = &ext3_xattr_user_handler, #ifdef CONFIG_EXT3_FS_POSIX_ACL - [EXT3_XATTR_INDEX_POSIX_ACL_ACCESS] = &ext3_xattr_acl_access_handler, - [EXT3_XATTR_INDEX_POSIX_ACL_DEFAULT] = &ext3_xattr_acl_default_handler, + [EXT3_XATTR_INDEX_POSIX_ACL_ACCESS] = &posix_acl_access_xattr_handler, + [EXT3_XATTR_INDEX_POSIX_ACL_DEFAULT] = &posix_acl_default_xattr_handler, #endif [EXT3_XATTR_INDEX_TRUSTED] = &ext3_xattr_trusted_handler, #ifdef CONFIG_EXT3_FS_SECURITY @@ -120,8 +115,8 @@ const struct xattr_handler *ext3_xattr_handlers[] = { &ext3_xattr_user_handler, &ext3_xattr_trusted_handler, #ifdef CONFIG_EXT3_FS_POSIX_ACL - &ext3_xattr_acl_access_handler, - &ext3_xattr_acl_default_handler, + &posix_acl_access_xattr_handler, + &posix_acl_default_xattr_handler, #endif #ifdef CONFIG_EXT3_FS_SECURITY &ext3_xattr_security_handler, @@ -803,17 +798,25 @@ inserted: /* We need to allocate a new block */ ext3_fsblk_t goal = ext3_group_first_block_no(sb, EXT3_I(inode)->i_block_group); - ext3_fsblk_t block = ext3_new_block(handle, inode, - goal, &error); + ext3_fsblk_t block; + + /* + * Protect us agaist concurrent allocations to the + * same inode from ext3_..._writepage(). Reservation + * code does not expect racing allocations. + */ + mutex_lock(&EXT3_I(inode)->truncate_mutex); + block = ext3_new_block(handle, inode, goal, &error); + mutex_unlock(&EXT3_I(inode)->truncate_mutex); if (error) goto cleanup; ea_idebug(inode, "creating block %d", block); new_bh = sb_getblk(sb, block); - if (!new_bh) { + if (unlikely(!new_bh)) { getblk_failed: ext3_free_blocks(handle, inode, block, 1); - error = -EIO; + error = -ENOMEM; goto cleanup; } lock_buffer(new_bh); @@ -925,7 +928,7 @@ ext3_xattr_ibody_set(handle_t *handle, struct inode *inode, /* * ext3_xattr_set_handle() * - * Create, replace or remove an extended attribute for this inode. Buffer + * Create, replace or remove an extended attribute for this inode. Value * is NULL to remove an existing extended attribute, and non-NULL to * either replace an existing extended attribute, or create a new extended * attribute. The flags XATTR_REPLACE and XATTR_CREATE diff --git a/fs/ext3/xattr.h b/fs/ext3/xattr.h index 377fe720116..32e93ebf803 100644 --- a/fs/ext3/xattr.h +++ b/fs/ext3/xattr.h @@ -60,8 +60,6 @@ struct ext3_xattr_entry { extern const struct xattr_handler ext3_xattr_user_handler; extern const struct xattr_handler ext3_xattr_trusted_handler; -extern const struct xattr_handler ext3_xattr_acl_access_handler; -extern const struct xattr_handler ext3_xattr_acl_default_handler; extern const struct xattr_handler ext3_xattr_security_handler; extern ssize_t ext3_listxattr(struct dentry *, char *, size_t); @@ -128,10 +126,10 @@ exit_ext3_xattr(void) #ifdef CONFIG_EXT3_FS_SECURITY extern int ext3_init_security(handle_t *handle, struct inode *inode, - struct inode *dir); + struct inode *dir, const struct qstr *qstr); #else static inline int ext3_init_security(handle_t *handle, struct inode *inode, - struct inode *dir) + struct inode *dir, const struct qstr *qstr) { return 0; } diff --git a/fs/ext3/xattr_security.c b/fs/ext3/xattr_security.c index 03a99bfc59f..722c2bf9645 100644 --- a/fs/ext3/xattr_security.c +++ b/fs/ext3/xattr_security.c @@ -3,13 +3,8 @@ * Handler for storing security labels as extended attributes. */ -#include <linux/module.h> -#include <linux/slab.h> -#include <linux/string.h> -#include <linux/fs.h> -#include <linux/ext3_jbd.h> -#include <linux/ext3_fs.h> #include <linux/security.h> +#include "ext3.h" #include "xattr.h" static size_t @@ -48,27 +43,33 @@ ext3_xattr_security_set(struct dentry *dentry, const char *name, name, value, size, flags); } -int -ext3_init_security(handle_t *handle, struct inode *inode, struct inode *dir) +static int ext3_initxattrs(struct inode *inode, + const struct xattr *xattr_array, + void *fs_info) { - int err; - size_t len; - void *value; - char *name; + const struct xattr *xattr; + handle_t *handle = fs_info; + int err = 0; - err = security_inode_init_security(inode, dir, &name, &value, &len); - if (err) { - if (err == -EOPNOTSUPP) - return 0; - return err; + for (xattr = xattr_array; xattr->name != NULL; xattr++) { + err = ext3_xattr_set_handle(handle, inode, + EXT3_XATTR_INDEX_SECURITY, + xattr->name, xattr->value, + xattr->value_len, 0); + if (err < 0) + break; } - err = ext3_xattr_set_handle(handle, inode, EXT3_XATTR_INDEX_SECURITY, - name, value, len, 0); - kfree(name); - kfree(value); return err; } +int +ext3_init_security(handle_t *handle, struct inode *inode, struct inode *dir, + const struct qstr *qstr) +{ + return security_inode_init_security(inode, dir, qstr, + &ext3_initxattrs, handle); +} + const struct xattr_handler ext3_xattr_security_handler = { .prefix = XATTR_SECURITY_PREFIX, .list = ext3_xattr_security_list, diff --git a/fs/ext3/xattr_trusted.c b/fs/ext3/xattr_trusted.c index dc8edda9ffe..d75727cc67f 100644 --- a/fs/ext3/xattr_trusted.c +++ b/fs/ext3/xattr_trusted.c @@ -5,12 +5,7 @@ * Copyright (C) 2003 by Andreas Gruenbacher, <a.gruenbacher@computer.org> */ -#include <linux/module.h> -#include <linux/string.h> -#include <linux/capability.h> -#include <linux/fs.h> -#include <linux/ext3_jbd.h> -#include <linux/ext3_fs.h> +#include "ext3.h" #include "xattr.h" static size_t diff --git a/fs/ext3/xattr_user.c b/fs/ext3/xattr_user.c index 7a321974d58..5612af3567e 100644 --- a/fs/ext3/xattr_user.c +++ b/fs/ext3/xattr_user.c @@ -5,11 +5,7 @@ * Copyright (C) 2001 by Andreas Gruenbacher, <a.gruenbacher@computer.org> */ -#include <linux/module.h> -#include <linux/string.h> -#include <linux/fs.h> -#include <linux/ext3_jbd.h> -#include <linux/ext3_fs.h> +#include "ext3.h" #include "xattr.h" static size_t |
