aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Documentation/ABI/testing/sysfs-fs-ext481
-rw-r--r--Documentation/filesystems/ext4.txt30
-rw-r--r--Documentation/filesystems/proc.txt21
-rw-r--r--fs/ext4/balloc.c14
-rw-r--r--fs/ext4/dir.c16
-rw-r--r--fs/ext4/ext4.h91
-rw-r--r--fs/ext4/ext4_extents.h1
-rw-r--r--fs/ext4/ext4_i.h6
-rw-r--r--fs/ext4/ext4_sb.h14
-rw-r--r--fs/ext4/extents.c127
-rw-r--r--fs/ext4/file.c7
-rw-r--r--fs/ext4/ialloc.c273
-rw-r--r--fs/ext4/inode.c424
-rw-r--r--fs/ext4/ioctl.c17
-rw-r--r--fs/ext4/mballoc.c158
-rw-r--r--fs/ext4/mballoc.h8
-rw-r--r--fs/ext4/namei.c164
-rw-r--r--fs/ext4/resize.c8
-rw-r--r--fs/ext4/super.c327
-rw-r--r--fs/jbd2/commit.c5
-rw-r--r--fs/jbd2/revoke.c24
-rw-r--r--fs/jbd2/transaction.c2
-rw-r--r--include/linux/jbd2.h6
23 files changed, 1224 insertions, 600 deletions
diff --git a/Documentation/ABI/testing/sysfs-fs-ext4 b/Documentation/ABI/testing/sysfs-fs-ext4
new file mode 100644
index 00000000000..4e79074de28
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-fs-ext4
@@ -0,0 +1,81 @@
+What: /sys/fs/ext4/<disk>/mb_stats
+Date: March 2008
+Contact: "Theodore Ts'o" <tytso@mit.edu>
+Description:
+ Controls whether the multiblock allocator should
+ collect statistics, which are shown during the unmount.
+ 1 means to collect statistics, 0 means not to collect
+ statistics
+
+What: /sys/fs/ext4/<disk>/mb_group_prealloc
+Date: March 2008
+Contact: "Theodore Ts'o" <tytso@mit.edu>
+Description:
+ The multiblock allocator will round up allocation
+ requests to a multiple of this tuning parameter if the
+ stripe size is not set in the ext4 superblock
+
+What: /sys/fs/ext4/<disk>/mb_max_to_scan
+Date: March 2008
+Contact: "Theodore Ts'o" <tytso@mit.edu>
+Description:
+ The maximum number of extents the multiblock allocator
+ will search to find the best extent
+
+What: /sys/fs/ext4/<disk>/mb_min_to_scan
+Date: March 2008
+Contact: "Theodore Ts'o" <tytso@mit.edu>
+Description:
+ The minimum number of extents the multiblock allocator
+ will search to find the best extent
+
+What: /sys/fs/ext4/<disk>/mb_order2_req
+Date: March 2008
+Contact: "Theodore Ts'o" <tytso@mit.edu>
+Description:
+ Tuning parameter which controls the minimum size for
+ requests (as a power of 2) where the buddy cache is
+ used
+
+What: /sys/fs/ext4/<disk>/mb_stream_req
+Date: March 2008
+Contact: "Theodore Ts'o" <tytso@mit.edu>
+Description:
+ Files which have fewer blocks than this tunable
+ parameter will have their blocks allocated out of a
+ block group specific preallocation pool, so that small
+ files are packed closely together. Each large file
+ will have its blocks allocated out of its own unique
+ preallocation pool.
+
+What: /sys/fs/ext4/<disk>/inode_readahead
+Date: March 2008
+Contact: "Theodore Ts'o" <tytso@mit.edu>
+Description:
+ Tuning parameter which controls the maximum number of
+ inode table blocks that ext4's inode table readahead
+ algorithm will pre-read into the buffer cache
+
+What: /sys/fs/ext4/<disk>/delayed_allocation_blocks
+Date: March 2008
+Contact: "Theodore Ts'o" <tytso@mit.edu>
+Description:
+ This file is read-only and shows the number of blocks
+ that are dirty in the page cache, but which do not
+ have their location in the filesystem allocated yet.
+
+What: /sys/fs/ext4/<disk>/lifetime_write_kbytes
+Date: March 2008
+Contact: "Theodore Ts'o" <tytso@mit.edu>
+Description:
+ This file is read-only and shows the number of kilobytes
+ of data that have been written to this filesystem since it was
+ created.
+
+What: /sys/fs/ext4/<disk>/session_write_kbytes
+Date: March 2008
+Contact: "Theodore Ts'o" <tytso@mit.edu>
+Description:
+ This file is read-only and shows the number of
+ kilobytes of data that have been written to this
+ filesystem since it was mounted.
diff --git a/Documentation/filesystems/ext4.txt b/Documentation/filesystems/ext4.txt
index cec829bc729..97882df0486 100644
--- a/Documentation/filesystems/ext4.txt
+++ b/Documentation/filesystems/ext4.txt
@@ -85,7 +85,7 @@ Note: More extensive information for getting started with ext4 can be
* extent format more robust in face of on-disk corruption due to magics,
* internal redundancy in tree
* improved file allocation (multi-block alloc)
-* fix 32000 subdirectory limit
+* lift 32000 subdirectory limit imposed by i_links_count[1]
* nsec timestamps for mtime, atime, ctime, create time
* inode version field on disk (NFSv4, Lustre)
* reduced e2fsck time via uninit_bg feature
@@ -100,6 +100,9 @@ Note: More extensive information for getting started with ext4 can be
* efficent new ordered mode in JBD2 and ext4(avoid using buffer head to force
the ordering)
+[1] Filesystems with a block size of 1k may see a limit imposed by the
+directory hash tree having a maximum depth of two.
+
2.2 Candidate features for future inclusion
* Online defrag (patches available but not well tested)
@@ -180,8 +183,8 @@ commit=nrsec (*) Ext4 can be told to sync all its data and metadata
performance.
barrier=<0|1(*)> This enables/disables the use of write barriers in
- the jbd code. barrier=0 disables, barrier=1 enables.
- This also requires an IO stack which can support
+barrier(*) the jbd code. barrier=0 disables, barrier=1 enables.
+nobarrier This also requires an IO stack which can support
barriers, and if jbd gets an error on a barrier
write, it will disable again with a warning.
Write barriers enforce proper on-disk ordering
@@ -189,6 +192,9 @@ barrier=<0|1(*)> This enables/disables the use of write barriers in
safe to use, at some performance penalty. If
your disks are battery-backed in one way or another,
disabling barriers may safely improve performance.
+ The mount options "barrier" and "nobarrier" can
+ also be used to enable or disable barriers, for
+ consistency with other ext4 mount options.
inode_readahead=n This tuning parameter controls the maximum
number of inode table blocks that ext4's inode
@@ -310,6 +316,24 @@ journal_ioprio=prio The I/O priority (from 0 to 7, where 0 is the
a slightly higher priority than the default I/O
priority.
+auto_da_alloc(*) Many broken applications don't use fsync() when
+noauto_da_alloc replacing existing files via patterns such as
+ fd = open("foo.new")/write(fd,..)/close(fd)/
+ rename("foo.new", "foo"), or worse yet,
+ fd = open("foo", O_TRUNC)/write(fd,..)/close(fd).
+ If auto_da_alloc is enabled, ext4 will detect
+ the replace-via-rename and replace-via-truncate
+ patterns and force that any delayed allocation
+ blocks are allocated such that at the next
+ journal commit, in the default data=ordered
+ mode, the data blocks of the new file are forced
+ to disk before the rename() operation is
+ commited. This provides roughly the same level
+ of guarantees as ext3, and avoids the
+ "zero-length" problem that can happen when a
+ system crashes before the delayed allocation
+ blocks are forced to disk.
+
Data Mode
=========
There are 3 different data modes:
diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index 830bad7cce0..efc4fd9f40c 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -940,27 +940,6 @@ Table 1-10: Files in /proc/fs/ext4/<devname>
File Content
mb_groups details of multiblock allocator buddy cache of free blocks
mb_history multiblock allocation history
- stats controls whether the multiblock allocator should start
- collecting statistics, which are shown during the unmount
- group_prealloc the multiblock allocator will round up allocation
- requests to a multiple of this tuning parameter if the
- stripe size is not set in the ext4 superblock
- max_to_scan The maximum number of extents the multiblock allocator
- will search to find the best extent
- min_to_scan The minimum number of extents the multiblock allocator
- will search to find the best extent
- order2_req Tuning parameter which controls the minimum size for
- requests (as a power of 2) where the buddy cache is
- used
- stream_req Files which have fewer blocks than this tunable
- parameter will have their blocks allocated out of a
- block group specific preallocation pool, so that small
- files are packed closely together. Each large file
- will have its blocks allocated out of its own unique
- preallocation pool.
-inode_readahead Tuning parameter which controls the maximum number of
- inode table blocks that ext4's inode table readahead
- algorithm will pre-read into the buffer cache
..............................................................................
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 38f40d55899..53c72ad8587 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -55,7 +55,8 @@ static int ext4_block_in_group(struct super_block *sb, ext4_fsblk_t block,
}
static int ext4_group_used_meta_blocks(struct super_block *sb,
- ext4_group_t block_group)
+ ext4_group_t block_group,
+ struct ext4_group_desc *gdp)
{
ext4_fsblk_t tmp;
struct ext4_sb_info *sbi = EXT4_SB(sb);
@@ -63,10 +64,6 @@ static int ext4_group_used_meta_blocks(struct super_block *sb,
int used_blocks = sbi->s_itb_per_group + 2;
if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) {
- struct ext4_group_desc *gdp;
- struct buffer_head *bh;
-
- gdp = ext4_get_group_desc(sb, block_group, &bh);
if (!ext4_block_in_group(sb, ext4_block_bitmap(sb, gdp),
block_group))
used_blocks--;
@@ -177,7 +174,7 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
*/
mark_bitmap_end(group_blocks, sb->s_blocksize * 8, bh->b_data);
}
- return free_blocks - ext4_group_used_meta_blocks(sb, block_group);
+ return free_blocks - ext4_group_used_meta_blocks(sb, block_group, gdp);
}
@@ -473,9 +470,8 @@ void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
if (sbi->s_log_groups_per_flex) {
ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
- spin_lock(sb_bgl_lock(sbi, flex_group));
- sbi->s_flex_groups[flex_group].free_blocks += blocks_freed;
- spin_unlock(sb_bgl_lock(sbi, flex_group));
+ atomic_add(blocks_freed,
+ &sbi->s_flex_groups[flex_group].free_blocks);
}
/*
* request to reload the buddy with the
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index 2df2e40b01a..b64789929a6 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -67,7 +67,8 @@ int ext4_check_dir_entry(const char *function, struct inode *dir,
unsigned int offset)
{
const char *error_msg = NULL;
- const int rlen = ext4_rec_len_from_disk(de->rec_len);
+ const int rlen = ext4_rec_len_from_disk(de->rec_len,
+ dir->i_sb->s_blocksize);
if (rlen < EXT4_DIR_REC_LEN(1))
error_msg = "rec_len is smaller than minimal";
@@ -178,10 +179,11 @@ revalidate:
* least that it is non-zero. A
* failure will be detected in the
* dirent test below. */
- if (ext4_rec_len_from_disk(de->rec_len)
- < EXT4_DIR_REC_LEN(1))
+ if (ext4_rec_len_from_disk(de->rec_len,
+ sb->s_blocksize) < EXT4_DIR_REC_LEN(1))
break;
- i += ext4_rec_len_from_disk(de->rec_len);
+ i += ext4_rec_len_from_disk(de->rec_len,
+ sb->s_blocksize);
}
offset = i;
filp->f_pos = (filp->f_pos & ~(sb->s_blocksize - 1))
@@ -203,7 +205,8 @@ revalidate:
ret = stored;
goto out;
}
- offset += ext4_rec_len_from_disk(de->rec_len);
+ offset += ext4_rec_len_from_disk(de->rec_len,
+ sb->s_blocksize);
if (le32_to_cpu(de->inode)) {
/* We might block in the next section
* if the data destination is
@@ -225,7 +228,8 @@ revalidate:
goto revalidate;
stored++;
}
- filp->f_pos += ext4_rec_len_from_disk(de->rec_len);
+ filp->f_pos += ext4_rec_len_from_disk(de->rec_len,
+ sb->s_blocksize);
}
offset = 0;
brelse(bh);
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 990c9400092..d0f15ef56de 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -33,14 +33,6 @@
#undef EXT4FS_DEBUG
/*
- * Define EXT4_RESERVATION to reserve data blocks for expanding files
- */
-#define EXT4_DEFAULT_RESERVE_BLOCKS 8
-/*max window size: 1024(direct blocks) + 3([t,d]indirect blocks) */
-#define EXT4_MAX_RESERVE_BLOCKS 1027
-#define EXT4_RESERVE_WINDOW_NOT_ALLOCATED 0
-
-/*
* Debug code
*/
#ifdef EXT4FS_DEBUG
@@ -54,8 +46,6 @@
#define ext4_debug(f, a...) do {} while (0)
#endif
-#define EXT4_MULTIBLOCK_ALLOCATOR 1
-
/* prefer goal again. length */
#define EXT4_MB_HINT_MERGE 1
/* blocks already reserved */
@@ -180,8 +170,9 @@ struct ext4_group_desc
*/
struct flex_groups {
- __u32 free_inodes;
- __u32 free_blocks;
+ atomic_t free_inodes;
+ atomic_t free_blocks;
+ atomic_t used_dirs;
};
#define EXT4_BG_INODE_UNINIT 0x0001 /* Inode table/bitmap not in use */
@@ -249,6 +240,30 @@ struct flex_groups {
#define EXT4_FL_USER_VISIBLE 0x000BDFFF /* User visible flags */
#define EXT4_FL_USER_MODIFIABLE 0x000B80FF /* User modifiable flags */
+/* Flags that should be inherited by new inodes from their parent. */
+#define EXT4_FL_INHERITED (EXT4_SECRM_FL | EXT4_UNRM_FL | EXT4_COMPR_FL |\
+ EXT4_SYNC_FL | EXT4_IMMUTABLE_FL | EXT4_APPEND_FL |\
+ EXT4_NODUMP_FL | EXT4_NOATIME_FL |\
+ EXT4_NOCOMPR_FL | EXT4_JOURNAL_DATA_FL |\
+ EXT4_NOTAIL_FL | EXT4_DIRSYNC_FL)
+
+/* Flags that are appropriate for regular files (all but dir-specific ones). */
+#define EXT4_REG_FLMASK (~(EXT4_DIRSYNC_FL | EXT4_TOPDIR_FL))
+
+/* Flags that are appropriate for non-directories/regular files. */
+#define EXT4_OTHER_FLMASK (EXT4_NODUMP_FL | EXT4_NOATIME_FL)
+
+/* Mask out flags that are inappropriate for the given type of inode. */
+static inline __u32 ext4_mask_flags(umode_t mode, __u32 flags)
+{
+ if (S_ISDIR(mode))
+ return flags;
+ else if (S_ISREG(mode))
+ return flags & EXT4_REG_FLMASK;
+ else
+ return flags & EXT4_OTHER_FLMASK;
+}
+
/*
* Inode dynamic state flags
*/
@@ -256,6 +271,7 @@ struct flex_groups {
#define EXT4_STATE_NEW 0x00000002 /* inode is newly created */
#define EXT4_STATE_XATTR 0x00000004 /* has in-inode xattrs */
#define EXT4_STATE_NO_EXPAND 0x00000008 /* No space for expansion */
+#define EXT4_STATE_DA_ALLOC_CLOSE 0x00000010 /* Alloc DA blks on close */
/* Used to pass group descriptor data when online resize is done */
struct ext4_new_group_input {
@@ -303,7 +319,9 @@ struct ext4_new_group_data {
#define EXT4_IOC_GROUP_EXTEND _IOW('f', 7, unsigned long)
#define EXT4_IOC_GROUP_ADD _IOW('f', 8, struct ext4_new_group_input)
#define EXT4_IOC_MIGRATE _IO('f', 9)
+ /* note ioctl 10 reserved for an early version of the FIEMAP ioctl */
/* note ioctl 11 reserved for filesystem-independent FIEMAP ioctl */
+#define EXT4_IOC_ALLOC_DA_BLKS _IO('f', 12)
/*
* ioctl commands in 32 bit emulation
@@ -531,7 +549,7 @@ do { \
#define EXT4_MOUNT_NO_UID32 0x02000 /* Disable 32-bit UIDs */
#define EXT4_MOUNT_XATTR_USER 0x04000 /* Extended user attributes */
#define EXT4_MOUNT_POSIX_ACL 0x08000 /* POSIX Access Control Lists */
-#define EXT4_MOUNT_RESERVATION 0x10000 /* Preallocation */
+#define EXT4_MOUNT_NO_AUTO_DA_ALLOC 0x10000 /* No auto delalloc mapping */
#define EXT4_MOUNT_BARRIER 0x20000 /* Use block barriers */
#define EXT4_MOUNT_NOBH 0x40000 /* No bufferheads */
#define EXT4_MOUNT_QUOTA 0x80000 /* Some quota option set */
@@ -666,7 +684,8 @@ struct ext4_super_block {
__u8 s_log_groups_per_flex; /* FLEX_BG group size */
__u8 s_reserved_char_pad2;
__le16 s_reserved_pad;
- __u32 s_reserved[162]; /* Padding to the end of the block */
+ __le64 s_kbytes_written; /* nr of lifetime kilobytes written */
+ __u32 s_reserved[160]; /* Padding to the end of the block */
};
#ifdef __KERNEL__
@@ -814,6 +833,12 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
#define EXT4_DEF_MAX_BATCH_TIME 15000 /* 15ms */
/*
+ * Minimum number of groups in a flexgroup before we separate out
+ * directories into the first block group of a flexgroup
+ */
+#define EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME 4
+
+/*
* Structure of a directory entry
*/
#define EXT4_NAME_LEN 255
@@ -865,24 +890,6 @@ struct ext4_dir_entry_2 {
~EXT4_DIR_ROUND)
#define EXT4_MAX_REC_LEN ((1<<16)-1)
-static inline unsigned ext4_rec_len_from_disk(__le16 dlen)
-{
- unsigned len = le16_to_cpu(dlen);
-
- if (len == EXT4_MAX_REC_LEN || len == 0)
- return 1 << 16;
- return len;
-}
-
-static inline __le16 ext4_rec_len_to_disk(unsigned len)
-{
- if (len == (1 << 16))
- return cpu_to_le16(EXT4_MAX_REC_LEN);
- else if (len > (1 << 16))
- BUG();
- return cpu_to_le16(len);
-}
-
/*
* Hash Tree Directory indexing
* (c) Daniel Phillips, 2001
@@ -970,22 +977,6 @@ void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr,
extern struct proc_dir_entry *ext4_proc_root;
-#ifdef CONFIG_PROC_FS
-extern const struct file_operations ext4_ui_proc_fops;
-
-#define EXT4_PROC_HANDLER(name, var) \
-do { \
- proc = proc_create_data(name, mode, sbi->s_proc, \
- &ext4_ui_proc_fops, &sbi->s_##var); \
- if (proc == NULL) { \
- printk(KERN_ERR "EXT4-fs: can't create %s\n", name); \
- goto err_out; \
- } \
-} while (0)
-#else
-#define EXT4_PROC_HANDLER(name, var)
-#endif
-
/*
* Function prototypes
*/
@@ -1092,6 +1083,7 @@ extern int ext4_can_truncate(struct inode *inode);
extern void ext4_truncate(struct inode *);
extern void ext4_set_inode_flags(struct inode *);
extern void ext4_get_inode_flags(struct ext4_inode_info *);
+extern int ext4_alloc_da_blocks(struct inode *inode);
extern void ext4_set_aops(struct inode *inode);
extern int ext4_writepage_trans_blocks(struct inode *);
extern int ext4_meta_trans_blocks(struct inode *, int nrblocks, int idxblocks);
@@ -1107,7 +1099,10 @@ extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long);
/* migrate.c */
extern int ext4_ext_migrate(struct inode *);
+
/* namei.c */
+extern unsigned int ext4_rec_len_from_disk(__le16 dlen, unsigned blocksize);
+extern __le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize);
extern int ext4_orphan_add(handle_t *, struct inode *);
extern int ext4_orphan_del(handle_t *, struct inode *);
extern int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
index 18cb67b2cbb..f0c3ec85bd4 100644
--- a/fs/ext4/ext4_extents.h
+++ b/fs/ext4/ext4_extents.h
@@ -241,5 +241,6 @@ extern int ext4_ext_search_left(struct inode *, struct ext4_ext_path *,
extern int ext4_ext_search_right(struct inode *, struct ext4_ext_path *,
ext4_lblk_t *, ext4_fsblk_t *);
extern void ext4_ext_drop_refs(struct ext4_ext_path *);
+extern int ext4_ext_check_inode(struct inode *inode);
#endif /* _EXT4_EXTENTS */
diff --git a/fs/ext4/ext4_i.h b/fs/ext4/ext4_i.h
index e69acc16f5c..4ce2187123a 100644
--- a/fs/ext4/ext4_i.h
+++ b/fs/ext4/ext4_i.h
@@ -33,9 +33,6 @@ typedef __u32 ext4_lblk_t;
/* data type for block group number */
typedef unsigned int ext4_group_t;
-#define rsv_start rsv_window._rsv_start
-#define rsv_end rsv_window._rsv_end
-
/*
* storage for cached extent
*/
@@ -125,6 +122,9 @@ struct ext4_inode_info {
struct list_head i_prealloc_list;
spinlock_t i_prealloc_lock;
+ /* ialloc */
+ ext4_group_t i_last_alloc_group;
+
/* allocation reservation info for delalloc */
unsigned int i_reserved_data_blocks;
unsigned int i_reserved_meta_blocks;
diff --git a/fs/ext4/ext4_sb.h b/fs/ext4/ext4_sb.h
index 039b6ea1a04..57b71fefbcc 100644
--- a/fs/ext4/ext4_sb.h
+++ b/fs/ext4/ext4_sb.h
@@ -62,12 +62,10 @@ struct ext4_sb_info {
struct percpu_counter s_freeinodes_counter;
struct percpu_counter s_dirs_counter;
struct percpu_counter s_dirtyblocks_counter;
- struct blockgroup_lock s_blockgroup_lock;
+ struct blockgroup_lock *s_blockgroup_lock;
struct proc_dir_entry *s_proc;
-
- /* root of the per fs reservation window tree */
- spinlock_t s_rsv_window_lock;
- struct rb_root s_rsv_window_root;
+ struct kobject s_kobj;
+ struct completion s_kobj_unregister;
/* Journaling */
struct inode *s_journal_inode;
@@ -146,6 +144,10 @@ struct ext4_sb_info {
/* locality groups */
struct ext4_locality_group *s_locality_groups;
+ /* for write statistics */
+ unsigned long s_sectors_written_start;
+ u64 s_kbytes_written;
+
unsigned int s_log_groups_per_flex;
struct flex_groups *s_flex_groups;
};
@@ -153,7 +155,7 @@ struct ext4_sb_info {
static inline spinlock_t *
sb_bgl_lock(struct ext4_sb_info *sbi, unsigned int block_group)
{
- return bgl_lock_ptr(&sbi->s_blockgroup_lock, block_group);
+ return bgl_lock_ptr(sbi->s_blockgroup_lock, block_group);
}
#endif /* _EXT4_SB */
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index e0aa4fe4f59..ac77d8b8251 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -152,6 +152,8 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
ext4_fsblk_t bg_start;
ext4_fsblk_t last_block;
ext4_grpblk_t colour;
+ ext4_group_t block_group;
+ int flex_size = ext4_flex_bg_size(EXT4_SB(inode->i_sb));
int depth;
if (path) {
@@ -170,10 +172,31 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
}
/* OK. use inode's group */
- bg_start = (ei->i_block_group * EXT4_BLOCKS_PER_GROUP(inode->i_sb)) +
+ block_group = ei->i_block_group;
+ if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) {
+ /*
+ * If there are at least EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME
+ * block groups per flexgroup, reserve the first block
+ * group for directories and special files. Regular
+ * files will start at the second block group. This
+ * tends to speed up directory access and improves
+ * fsck times.
+ */
+ block_group &= ~(flex_size-1);
+ if (S_ISREG(inode->i_mode))
+ block_group++;
+ }
+ bg_start = (block_group * EXT4_BLOCKS_PER_GROUP(inode->i_sb)) +
le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_first_data_block);
last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1;
+ /*
+ * If we are doing delayed allocation, we don't need take
+ * colour into account.
+ */
+ if (test_opt(inode->i_sb, DELALLOC))
+ return bg_start;
+
if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block)
colour = (current->pid % 16) *
(EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16);
@@ -301,7 +324,64 @@ ext4_ext_max_entries(struct inode *inode, int depth)
return max;
}
-static int __ext4_ext_check_header(const char *function, struct inode *inode,
+static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext)
+{
+ ext4_fsblk_t block = ext_pblock(ext);
+ int len = ext4_ext_get_actual_len(ext);
+ struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
+ if (unlikely(block < le32_to_cpu(es->s_first_data_block) ||
+ ((block + len) > ext4_blocks_count(es))))
+ return 0;
+ else
+ return 1;
+}
+
+static int ext4_valid_extent_idx(struct inode *inode,
+ struct ext4_extent_idx *ext_idx)
+{
+ ext4_fsblk_t block = idx_pblock(ext_idx);
+ struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
+ if (unlikely(block < le32_to_cpu(es->s_first_data_block) ||
+ (block > ext4_blocks_count(es))))
+ return 0;
+ else
+ return 1;
+}
+
+static int ext4_valid_extent_entries(struct inode *inode,
+ struct ext4_extent_header *eh,
+ int depth)
+{
+ struct ext4_extent *ext;
+ struct ext4_extent_idx *ext_idx;
+ unsigned short entries;
+ if (eh->eh_entries == 0)
+ return 1;
+
+ entries = le16_to_cpu(eh->eh_entries);
+
+ if (depth == 0) {
+ /* leaf entries */
+ ext = EXT_FIRST_EXTENT(eh);
+ while (entries) {
+ if (!ext4_valid_extent(inode, ext))
+ return 0;
+ ext++;
+ entries--;
+ }
+ } else {
+ ext_idx = EXT_FIRST_INDEX(eh);
+ while (entries) {
+ if (!ext4_valid_extent_idx(inode, ext_idx))
+ return 0;
+ ext_idx++;
+ entries--;
+ }
+ }
+ return 1;
+}
+
+static int __ext4_ext_check(const char *function, struct inode *inode,
struct ext4_extent_header *eh,
int depth)
{
@@ -329,11 +409,15 @@ static int __ext4_ext_check_header(const char *function, struct inode *inode,
error_msg = "invalid eh_entries";
goto corrupted;
}
+ if (!ext4_valid_extent_entries(inode, eh, depth)) {
+ error_msg = "invalid extent entries";
+ goto corrupted;
+ }
return 0;
corrupted:
ext4_error(inode->i_sb, function,
- "bad header in inode #%lu: %s - magic %x, "
+ "bad header/extent in inode #%lu: %s - magic %x, "
"entries %u, max %u(%u), depth %u(%u)",
inode->i_ino, error_msg, le16_to_cpu(eh->eh_magic),
le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max),
@@ -342,8 +426,13 @@ corrupted:
return -EIO;
}
-#define ext4_ext_check_header(inode, eh, depth) \
- __ext4_ext_check_header(__func__, inode, eh, depth)
+#define ext4_ext_check(inode, eh, depth) \
+ __ext4_ext_check(__func__, inode, eh, depth)
+
+int ext4_ext_check_inode(struct inode *inode)
+{
+ return ext4_ext_check(inode, ext_inode_hdr(inode), ext_depth(inode));
+}
#ifdef EXT_DEBUG
static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path)
@@ -547,9 +636,6 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
eh = ext_inode_hdr(inode);
depth = ext_depth(inode);
- if (ext4_ext_check_header(inode, eh, depth))
- return ERR_PTR(-EIO);
-
/* account possible depth increase */
if (!path) {
@@ -565,6 +651,8 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
i = depth;
/* walk through the tree */
while (i) {
+ int need_to_validate = 0;
+
ext_debug("depth %d: num %d, max %d\n",
ppos, le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max));
@@ -573,10 +661,17 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
path[ppos].p_depth = i;
path[ppos].p_ext = NULL;
- bh = sb_bread(inode->i_sb, path[ppos].p_block);
- if (!bh)
+ bh = sb_getblk(inode->i_sb, path[ppos].p_block);
+ if (unlikely(!bh))
goto err;
-
+ if (!bh_uptodate_or_lock(bh)) {
+ if (bh_submit_read(bh) < 0) {
+ put_bh(bh);
+ goto err;
+ }
+ /* validate the extent entries */
+ need_to_validate = 1;
+ }
eh = ext_block_hdr(bh);
ppos++;
BUG_ON(ppos > depth);
@@ -584,7 +679,7 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
path[ppos].p_hdr = eh;
i--;
- if (ext4_ext_check_header(inode, eh, i))
+ if (need_to_validate && ext4_ext_check(inode, eh, i))
goto err;
}
@@ -1181,7 +1276,7 @@ got_index:
return -EIO;
eh = ext_block_hdr(bh);
/* subtract from p_depth to get proper eh_depth */
- if (ext4_ext_check_header(inode, eh, path->p_depth - depth)) {
+ if (ext4_ext_check(inode, eh, path->p_depth - depth)) {
put_bh(bh);
return -EIO;
}
@@ -1194,7 +1289,7 @@ got_index:
if (bh == NULL)
return -EIO;
eh = ext_block_hdr(bh);
- if (ext4_ext_check_header(inode, eh, path->p_depth - depth)) {
+ if (ext4_ext_check(inode, eh, path->p_depth - depth)) {
put_bh(bh);
return -EIO;
}
@@ -2137,7 +2232,7 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
return -ENOMEM;
}
path[0].p_hdr = ext_inode_hdr(inode);
- if (ext4_ext_check_header(inode, path[0].p_hdr, depth)) {
+ if (ext4_ext_check(inode, path[0].p_hdr, depth)) {
err = -EIO;
goto out;
}
@@ -2191,7 +2286,7 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
err = -EIO;
break;
}
- if (ext4_ext_check_header(inode, ext_block_hdr(bh),
+ if (ext4_ext_check(inode, ext_block_hdr(bh),
depth - i - 1)) {
err = -EIO;
break;
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index f731cb545a0..588af8c7724 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -33,9 +33,14 @@
*/
static int ext4_release_file(struct inode *inode, struct file *filp)
{
+ if (EXT4_I(inode)->i_state & EXT4_STATE_DA_ALLOC_CLOSE) {
+ ext4_alloc_da_blocks(inode);
+ EXT4_I(inode)->i_state &= ~EXT4_STATE_DA_ALLOC_CLOSE;
+ }
/* if we are the last writer on the inode, drop the block reservation */
if ((filp->f_mode & FMODE_WRITE) &&
- (atomic_read(&inode->i_writecount) == 1))
+ (atomic_read(&inode->i_writecount) == 1) &&
+ !EXT4_I(inode)->i_reserved_data_blocks)
{
down_write(&EXT4_I(inode)->i_data_sem);
ext4_discard_preallocations(inode);
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index fb51b40e3e8..47b84e8df56 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -189,7 +189,6 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
struct ext4_super_block *es;
struct ext4_sb_info *sbi;
int fatal = 0, err, count, cleared;
- ext4_group_t flex_group;
if (atomic_read(&inode->i_count) > 1) {
printk(KERN_ERR "ext4_free_inode: inode has count=%d\n",
@@ -268,6 +267,13 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
if (is_directory) {
count = ext4_used_dirs_count(sb, gdp) - 1;
ext4_used_dirs_set(sb, gdp, count);
+ if (sbi->s_log_groups_per_flex) {
+ ext4_group_t f;
+
+ f = ext4_flex_group(sbi, block_group);
+ atomic_dec(&sbi->s_flex_groups[f].free_inodes);
+ }
+
}
gdp->bg_checksum = ext4_group_desc_csum(sbi,
block_group, gdp);
@@ -277,10 +283,10 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
percpu_counter_dec(&sbi->s_dirs_counter);
if (sbi->s_log_groups_per_flex) {
- flex_group = ext4_flex_group(sbi, block_group);
- spin_lock(sb_bgl_lock(sbi, flex_group));
- sbi->s_flex_groups[flex_group].free_inodes++;
- spin_unlock(sb_bgl_lock(sbi, flex_group));
+ ext4_group_t f;
+
+ f = ext4_flex_group(sbi, block_group);
+ atomic_inc(&sbi->s_flex_groups[f].free_inodes);
}
}
BUFFER_TRACE(bh2, "call ext4_handle_dirty_metadata");
@@ -360,9 +366,9 @@ static int find_group_flex(struct super_block *sb, struct inode *parent,
sbi->s_log_groups_per_flex;
find_close_to_parent:
- flexbg_free_blocks = flex_group[best_flex].free_blocks;
+ flexbg_free_blocks = atomic_read(&flex_group[best_flex].free_blocks);
flex_freeb_ratio = flexbg_free_blocks * 100 / blocks_per_flex;
- if (flex_group[best_flex].free_inodes &&
+ if (atomic_read(&flex_group[best_flex].free_inodes) &&
flex_freeb_ratio > free_block_ratio)
goto found_flexbg;
@@ -375,24 +381,24 @@ find_close_to_parent:
if (i == parent_fbg_group || i == parent_fbg_group - 1)
continue;
- flexbg_free_blocks = flex_group[i].free_blocks;
+ flexbg_free_blocks = atomic_read(&flex_group[i].free_blocks);
flex_freeb_ratio = flexbg_free_blocks * 100 / blocks_per_flex;
if (flex_freeb_ratio > free_block_ratio &&
- flex_group[i].free_inodes) {
+ (atomic_read(&flex_group[i].free_inodes))) {
best_flex = i;
goto found_flexbg;
}
- if (flex_group[best_flex].free_inodes == 0 ||
- (flex_group[i].free_blocks >
- flex_group[best_flex].free_blocks &&
- flex_group[i].free_inodes))
+ if ((atomic_re