23 files changed, 1224 insertions, 600 deletions
diff --git a/Documentation/ABI/testing/sysfs-fs-ext4 b/Documentation/ABI/testing/sysfs-fs-ext4
new file mode 100644
index 00000000000..4e79074de28
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-fs-ext4
@@ -0,0 +1,81 @@
+What:		/sys/fs/ext4/<disk>/mb_stats
+Date:		March 2008
+Contact:	"Theodore Ts'o" <tytso@mit.edu>
+Description:
+		Controls whether the multiblock allocator should
+		collect statistics, which are shown during the unmount.
+		1 means to collect statistics, 0 means not to collect
+		statistics.
+
+What:		/sys/fs/ext4/<disk>/mb_group_prealloc
+Date:		March 2008
+Contact:	"Theodore Ts'o" <tytso@mit.edu>
+Description:
+		The multiblock allocator will round up allocation
+		requests to a multiple of this tuning parameter if the
+		stripe size is not set in the ext4 superblock
+
+What:		/sys/fs/ext4/<disk>/mb_max_to_scan
+Date:		March 2008
+Contact:	"Theodore Ts'o" <tytso@mit.edu>
+Description:
+		The maximum number of extents the multiblock allocator
+		will search to find the best extent
+
+What:		/sys/fs/ext4/<disk>/mb_min_to_scan
+Date:		March 2008
+Contact:	"Theodore Ts'o" <tytso@mit.edu>
+Description:
+		The minimum number of extents the multiblock allocator
+		will search to find the best extent
+
+What:		/sys/fs/ext4/<disk>/mb_order2_req
+Date:		March 2008
+Contact:	"Theodore Ts'o" <tytso@mit.edu>
+Description:
+		Tuning parameter which controls the minimum size for
+		requests (as a power of 2) where the buddy cache is
+		used
+
+What:		/sys/fs/ext4/<disk>/mb_stream_req
+Date:		March 2008
+Contact:	"Theodore Ts'o" <tytso@mit.edu>
+Description:
+		Files which have fewer blocks than this tunable
+		parameter will have their blocks allocated out of a
+		block group specific preallocation pool, so that small
+		files are packed closely together.  Each large file
+		will have its blocks allocated out of its own unique
+		preallocation pool.
+
+What:		/sys/fs/ext4/<disk>/inode_readahead
+Date:		March 2008
+Contact:	"Theodore Ts'o" <tytso@mit.edu>
+Description:
+		Tuning parameter which controls the maximum number of
+		inode table blocks that ext4's inode table readahead
+		algorithm will pre-read into the buffer cache
+
+What:		/sys/fs/ext4/<disk>/delayed_allocation_blocks
+Date:		March 2008
+Contact:	"Theodore Ts'o" <tytso@mit.edu>
+Description:
+		This file is read-only and shows the number of blocks
+		that are dirty in the page cache, but which do not
+		have their location in the filesystem allocated yet.
+
+What:		/sys/fs/ext4/<disk>/lifetime_write_kbytes
+Date:		March 2008
+Contact:	"Theodore Ts'o" <tytso@mit.edu>
+Description:
+		This file is read-only and shows the number of kilobytes
+		of data that have been written to this filesystem since it was
+		created.
+
+What:		/sys/fs/ext4/<disk>/session_write_kbytes
+Date:		March 2008
+Contact:	"Theodore Ts'o" <tytso@mit.edu>
+Description:
+		This file is read-only and shows the number of
+		kilobytes of data that have been written to this
+		filesystem since it was mounted.
diff --git a/Documentation/filesystems/ext4.txt b/Documentation/filesystems/ext4.txt
index cec829bc729..97882df0486 100644
--- a/Documentation/filesystems/ext4.txt
+++ b/Documentation/filesystems/ext4.txt
@@ -85,7 +85,7 @@ Note: More extensive information for getting started with ext4 can be
 * extent format more robust in face of on-disk corruption due to magics,
 * internal redundancy in tree
 * improved file allocation (multi-block alloc)
-* fix 32000 subdirectory limit
+* lift 32000 subdirectory limit imposed by i_links_count[1]
 * nsec timestamps for mtime, atime, ctime, create time
 * inode version field on disk (NFSv4, Lustre)
 * reduced e2fsck time via uninit_bg feature
@@ -100,6 +100,9 @@ Note: More extensive information for getting started with ext4 can be
 * efficent new ordered mode in JBD2 and ext4(avoid using buffer head to force
   the ordering)
 
+[1] Filesystems with a block size of 1k may see a limit imposed by the
+directory hash tree having a maximum depth of two.
+
 2.2 Candidate features for future inclusion
 
 * Online defrag (patches available but not well tested)
@@ -180,8 +183,8 @@ commit=nrsec	(*)	Ext4 can be told to sync all its data and metadata
 			performance.
 
 barrier=<0|1(*)>	This enables/disables the use of write barriers in
-			the jbd code.  barrier=0 disables, barrier=1 enables.
-			This also requires an IO stack which can support
+barrier(*)		the jbd code.  barrier=0 disables, barrier=1 enables.
+nobarrier		This also requires an IO stack which can support
 			barriers, and if jbd gets an error on a barrier
 			write, it will disable again with a warning.
 			Write barriers enforce proper on-disk ordering
@@ -189,6 +192,9 @@ barrier=<0|1(*)>	This enables/disables the use of write barriers in
 			safe to use, at some performance penalty.  If
 			your disks are battery-backed in one way or another,
 			disabling barriers may safely improve performance.
+			The mount options "barrier" and "nobarrier" can
+			also be used to enable or disable barriers, for
+			consistency with other ext4 mount options.
 
 inode_readahead=n	This tuning parameter controls the maximum
 			number of inode table blocks that ext4's inode
@@ -310,6 +316,24 @@ journal_ioprio=prio	The I/O priority (from 0 to 7, where 0 is the
 			a slightly higher priority than the default I/O
 			priority.
 
+auto_da_alloc(*)	Many broken applications don't use fsync() when
+noauto_da_alloc		replacing existing files via patterns such as
+			fd = open("foo.new")/write(fd,..)/close(fd)/
+			rename("foo.new", "foo"), or worse yet,
+			fd = open("foo", O_TRUNC)/write(fd,..)/close(fd).
+			If auto_da_alloc is enabled, ext4 will detect
+			the replace-via-rename and replace-via-truncate
+			patterns and force any delayed allocation
+			blocks to be allocated, so that at the next
+			journal commit, in the default data=ordered
+			mode, the data blocks of the new file are forced
+			to disk before the rename() operation is
+			committed.  This provides roughly the same level
+			of guarantees as ext3, and avoids the
+			"zero-length" problem that can happen when a
+			system crashes before the delayed allocation
+			blocks are forced to disk.
+
 Data Mode
 =========
 There are 3 different data modes:
diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index 830bad7cce0..efc4fd9f40c 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -940,27 +940,6 @@ Table 1-10: Files in /proc/fs/ext4/<devname>
  File            Content                                        
  mb_groups       details of multiblock allocator buddy cache of free blocks
  mb_history      multiblock allocation history
- stats           controls whether the multiblock allocator should start
-                 collecting statistics, which are shown during the unmount
- group_prealloc  the multiblock allocator will round up allocation
-                 requests to a multiple of this tuning parameter if the
-                 stripe size is not set in the ext4 superblock
- max_to_scan     The maximum number of extents the multiblock allocator
-                 will search to find the best extent
- min_to_scan     The minimum number of extents the multiblock allocator
-                 will search to find the best extent
- order2_req      Tuning parameter which controls the minimum size for 
-                 requests (as a power of 2) where the buddy cache is
-                 used
- stream_req      Files which have fewer blocks than this tunable
-                 parameter will have their blocks allocated out of a
-                 block group specific preallocation pool, so that small
-                 files are packed closely together.  Each large file
-                 will have its blocks allocated out of its own unique
-                 preallocation pool.
-inode_readahead  Tuning parameter which controls the maximum number of
-                 inode table blocks that ext4's inode table readahead
-                 algorithm will pre-read into the buffer cache
 ..............................................................................
 
 
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 38f40d55899..53c72ad8587 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -55,7 +55,8 @@ static int ext4_block_in_group(struct super_block *sb, ext4_fsblk_t block,
 }
 
 static int ext4_group_used_meta_blocks(struct super_block *sb,
-				ext4_group_t block_group)
+				       ext4_group_t block_group,
+				       struct ext4_group_desc *gdp)
 {
 	ext4_fsblk_t tmp;
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
@@ -63,10 +64,6 @@ static int ext4_group_used_meta_blocks(struct super_block *sb,
 	int used_blocks = sbi->s_itb_per_group + 2;
 
 	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) {
-		struct ext4_group_desc *gdp;
-		struct buffer_head *bh;
-
-		gdp = ext4_get_group_desc(sb, block_group, &bh);
 		if (!ext4_block_in_group(sb, ext4_block_bitmap(sb, gdp),
 					block_group))
 			used_blocks--;
@@ -177,7 +174,7 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
 		 */
 		mark_bitmap_end(group_blocks, sb->s_blocksize * 8, bh->b_data);
 	}
-	return free_blocks - ext4_group_used_meta_blocks(sb, block_group);
+	return free_blocks - ext4_group_used_meta_blocks(sb, block_group, gdp);
 }
 
 
@@ -473,9 +470,8 @@ void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
 
 	if (sbi->s_log_groups_per_flex) {
 		ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
-		spin_lock(sb_bgl_lock(sbi, flex_group));
-		sbi->s_flex_groups[flex_group].free_blocks += blocks_freed;
-		spin_unlock(sb_bgl_lock(sbi, flex_group));
+		atomic_add(blocks_freed,
+			   &sbi->s_flex_groups[flex_group].free_blocks);
 	}
 	/*
 	 * request to reload the buddy with the
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index 2df2e40b01a..b64789929a6 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -67,7 +67,8 @@ int ext4_check_dir_entry(const char *function, struct inode *dir,
 			 unsigned int offset)
 {
 	const char *error_msg = NULL;
-	const int rlen = ext4_rec_len_from_disk(de->rec_len);
+	const int rlen = ext4_rec_len_from_disk(de->rec_len,
+						dir->i_sb->s_blocksize);
 
 	if (rlen < EXT4_DIR_REC_LEN(1))
 		error_msg = "rec_len is smaller than minimal";
@@ -178,10 +179,11 @@ revalidate:
 				 * least that it is non-zero.  A
 				 * failure will be detected in the
 				 * dirent test below. */
-				if (ext4_rec_len_from_disk(de->rec_len)
-						< EXT4_DIR_REC_LEN(1))
+				if (ext4_rec_len_from_disk(de->rec_len,
+					sb->s_blocksize) < EXT4_DIR_REC_LEN(1))
 					break;
-				i += ext4_rec_len_from_disk(de->rec_len);
+				i += ext4_rec_len_from_disk(de->rec_len,
+							    sb->s_blocksize);
 			}
 			offset = i;
 			filp->f_pos = (filp->f_pos & ~(sb->s_blocksize - 1))
@@ -203,7 +205,8 @@ revalidate:
 				ret = stored;
 				goto out;
 			}
-			offset += ext4_rec_len_from_disk(de->rec_len);
+			offset += ext4_rec_len_from_disk(de->rec_len,
+					sb->s_blocksize);
 			if (le32_to_cpu(de->inode)) {
 				/* We might block in the next section
 				 * if the data destination is
@@ -225,7 +228,8 @@ revalidate:
 					goto revalidate;
 				stored++;
 			}
-			filp->f_pos += ext4_rec_len_from_disk(de->rec_len);
+			filp->f_pos += ext4_rec_len_from_disk(de->rec_len,
+						sb->s_blocksize);
 		}
 		offset = 0;
 		brelse(bh);
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 990c9400092..d0f15ef56de 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -33,14 +33,6 @@
 #undef EXT4FS_DEBUG
 
 /*
- * Define EXT4_RESERVATION to reserve data blocks for expanding files
- */
-#define EXT4_DEFAULT_RESERVE_BLOCKS	8
-/*max window size: 1024(direct blocks) + 3([t,d]indirect blocks) */
-#define EXT4_MAX_RESERVE_BLOCKS		1027
-#define EXT4_RESERVE_WINDOW_NOT_ALLOCATED 0
-
-/*
  * Debug code
  */
 #ifdef EXT4FS_DEBUG
@@ -54,8 +46,6 @@
 #define ext4_debug(f, a...)	do {} while (0)
 #endif
 
-#define EXT4_MULTIBLOCK_ALLOCATOR	1
-
 /* prefer goal again. length */
 #define EXT4_MB_HINT_MERGE		1
 /* blocks already reserved */
@@ -180,8 +170,9 @@ struct ext4_group_desc
  */
 
 struct flex_groups {
-	__u32 free_inodes;
-	__u32 free_blocks;
+	atomic_t free_inodes;
+	atomic_t free_blocks;
+	atomic_t used_dirs;
 };
 
 #define EXT4_BG_INODE_UNINIT	0x0001 /* Inode table/bitmap not in use */
@@ -249,6 +240,30 @@ struct flex_groups {
 #define EXT4_FL_USER_VISIBLE		0x000BDFFF /* User visible flags */
 #define EXT4_FL_USER_MODIFIABLE		0x000B80FF /* User modifiable flags */
 
+/* Flags that should be inherited by new inodes from their parent. */
+#define EXT4_FL_INHERITED (EXT4_SECRM_FL | EXT4_UNRM_FL | EXT4_COMPR_FL |\
+			   EXT4_SYNC_FL | EXT4_IMMUTABLE_FL | EXT4_APPEND_FL |\
+			   EXT4_NODUMP_FL | EXT4_NOATIME_FL |\
+			   EXT4_NOCOMPR_FL | EXT4_JOURNAL_DATA_FL |\
+			   EXT4_NOTAIL_FL | EXT4_DIRSYNC_FL)
+
+/* Flags that are appropriate for regular files (all but dir-specific ones). */
+#define EXT4_REG_FLMASK (~(EXT4_DIRSYNC_FL | EXT4_TOPDIR_FL))
+
+/* Flags that are appropriate for non-directories/regular files. */
+#define EXT4_OTHER_FLMASK (EXT4_NODUMP_FL | EXT4_NOATIME_FL)
+
+/* Mask out flags that are inappropriate for the given type of inode. */
+static inline __u32 ext4_mask_flags(umode_t mode, __u32 flags)
+{
+	/* Directories keep every flag; other inode types get a subset. */
+	if (S_ISDIR(mode))
+		return flags;
+	if (S_ISREG(mode))
+		return flags & EXT4_REG_FLMASK;
+	return flags & EXT4_OTHER_FLMASK;
+}
+
 /*
  * Inode dynamic state flags
  */
@@ -256,6 +271,7 @@ struct flex_groups {
 #define EXT4_STATE_NEW			0x00000002 /* inode is newly created */
 #define EXT4_STATE_XATTR		0x00000004 /* has in-inode xattrs */
 #define EXT4_STATE_NO_EXPAND		0x00000008 /* No space for expansion */
+#define EXT4_STATE_DA_ALLOC_CLOSE	0x00000010 /* Alloc DA blks on close */
 
 /* Used to pass group descriptor data when online resize is done */
 struct ext4_new_group_input {
@@ -303,7 +319,9 @@ struct ext4_new_group_data {
 #define EXT4_IOC_GROUP_EXTEND		_IOW('f', 7, unsigned long)
 #define EXT4_IOC_GROUP_ADD		_IOW('f', 8, struct ext4_new_group_input)
 #define EXT4_IOC_MIGRATE		_IO('f', 9)
+ /* note ioctl 10 reserved for an early version of the FIEMAP ioctl */
  /* note ioctl 11 reserved for filesystem-independent FIEMAP ioctl */
+#define EXT4_IOC_ALLOC_DA_BLKS		_IO('f', 12)
 
 /*
  * ioctl commands in 32 bit emulation
@@ -531,7 +549,7 @@ do {									       \
 #define EXT4_MOUNT_NO_UID32		0x02000  /* Disable 32-bit UIDs */
 #define EXT4_MOUNT_XATTR_USER		0x04000	/* Extended user attributes */
 #define EXT4_MOUNT_POSIX_ACL		0x08000	/* POSIX Access Control Lists */
-#define EXT4_MOUNT_RESERVATION		0x10000	/* Preallocation */
+#define EXT4_MOUNT_NO_AUTO_DA_ALLOC	0x10000	/* No auto delalloc mapping */
 #define EXT4_MOUNT_BARRIER		0x20000 /* Use block barriers */
 #define EXT4_MOUNT_NOBH			0x40000 /* No bufferheads */
 #define EXT4_MOUNT_QUOTA		0x80000 /* Some quota option set */
@@ -666,7 +684,8 @@ struct ext4_super_block {
 	__u8	s_log_groups_per_flex;  /* FLEX_BG group size */
 	__u8	s_reserved_char_pad2;
 	__le16  s_reserved_pad;
-	__u32   s_reserved[162];        /* Padding to the end of the block */
+	__le64	s_kbytes_written;	/* nr of lifetime kilobytes written */
+	__u32   s_reserved[160];        /* Padding to the end of the block */
 };
 
 #ifdef __KERNEL__
@@ -814,6 +833,12 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
 #define EXT4_DEF_MAX_BATCH_TIME	15000 /* 15ms */
 
 /*
+ * Minimum number of groups in a flexgroup before we separate out
+ * directories into the first block group of a flexgroup
+ */
+#define EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME	4
+
+/*
  * Structure of a directory entry
  */
 #define EXT4_NAME_LEN 255
@@ -865,24 +890,6 @@ struct ext4_dir_entry_2 {
 					 ~EXT4_DIR_ROUND)
 #define EXT4_MAX_REC_LEN		((1<<16)-1)
 
-static inline unsigned ext4_rec_len_from_disk(__le16 dlen)
-{
-	unsigned len = le16_to_cpu(dlen);
-
-	if (len == EXT4_MAX_REC_LEN || len == 0)
-		return 1 << 16;
-	return len;
-}
-
-static inline __le16 ext4_rec_len_to_disk(unsigned len)
-{
-	if (len == (1 << 16))
-		return cpu_to_le16(EXT4_MAX_REC_LEN);
-	else if (len > (1 << 16))
-		BUG();
-	return cpu_to_le16(len);
-}
-
 /*
  * Hash Tree Directory indexing
  * (c) Daniel Phillips, 2001
@@ -970,22 +977,6 @@ void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr,
 
 extern struct proc_dir_entry *ext4_proc_root;
 
-#ifdef CONFIG_PROC_FS
-extern const struct file_operations ext4_ui_proc_fops;
-
-#define	EXT4_PROC_HANDLER(name, var)					\
-do {									\
-	proc = proc_create_data(name, mode, sbi->s_proc,		\
-				&ext4_ui_proc_fops, &sbi->s_##var);	\
-	if (proc == NULL) {						\
-		printk(KERN_ERR "EXT4-fs: can't create %s\n", name);	\
-		goto err_out;						\
-	}								\
-} while (0)
-#else
-#define EXT4_PROC_HANDLER(name, var)
-#endif
-
 /*
  * Function prototypes
  */
@@ -1092,6 +1083,7 @@ extern int ext4_can_truncate(struct inode *inode);
 extern void ext4_truncate(struct inode *);
 extern void ext4_set_inode_flags(struct inode *);
 extern void ext4_get_inode_flags(struct ext4_inode_info *);
+extern int ext4_alloc_da_blocks(struct inode *inode);
 extern void ext4_set_aops(struct inode *inode);
 extern int ext4_writepage_trans_blocks(struct inode *);
 extern int ext4_meta_trans_blocks(struct inode *, int nrblocks, int idxblocks);
@@ -1107,7 +1099,10 @@ extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long);
 
 /* migrate.c */
 extern int ext4_ext_migrate(struct inode *);
+
 /* namei.c */
+extern unsigned int ext4_rec_len_from_disk(__le16 dlen, unsigned blocksize);
+extern __le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize);
 extern int ext4_orphan_add(handle_t *, struct inode *);
 extern int ext4_orphan_del(handle_t *, struct inode *);
 extern int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
index 18cb67b2cbb..f0c3ec85bd4 100644
--- a/fs/ext4/ext4_extents.h
+++ b/fs/ext4/ext4_extents.h
@@ -241,5 +241,6 @@ extern int ext4_ext_search_left(struct inode *, struct ext4_ext_path *,
 extern int ext4_ext_search_right(struct inode *, struct ext4_ext_path *,
 						ext4_lblk_t *, ext4_fsblk_t *);
 extern void ext4_ext_drop_refs(struct ext4_ext_path *);
+extern int ext4_ext_check_inode(struct inode *inode);
 #endif /* _EXT4_EXTENTS */
 
diff --git a/fs/ext4/ext4_i.h b/fs/ext4/ext4_i.h
index e69acc16f5c..4ce2187123a 100644
--- a/fs/ext4/ext4_i.h
+++ b/fs/ext4/ext4_i.h
@@ -33,9 +33,6 @@ typedef __u32 ext4_lblk_t;
 /* data type for block group number */
 typedef unsigned int ext4_group_t;
 
-#define rsv_start rsv_window._rsv_start
-#define rsv_end rsv_window._rsv_end
-
 /*
  * storage for cached extent
  */
@@ -125,6 +122,9 @@ struct ext4_inode_info {
 	struct list_head i_prealloc_list;
 	spinlock_t i_prealloc_lock;
 
+	/* ialloc */
+	ext4_group_t	i_last_alloc_group;
+
 	/* allocation reservation info for delalloc */
 	unsigned int i_reserved_data_blocks;
 	unsigned int i_reserved_meta_blocks;
diff --git a/fs/ext4/ext4_sb.h b/fs/ext4/ext4_sb.h
index 039b6ea1a04..57b71fefbcc 100644
--- a/fs/ext4/ext4_sb.h
+++ b/fs/ext4/ext4_sb.h
@@ -62,12 +62,10 @@ struct ext4_sb_info {
 	struct percpu_counter s_freeinodes_counter;
 	struct percpu_counter s_dirs_counter;
 	struct percpu_counter s_dirtyblocks_counter;
-	struct blockgroup_lock s_blockgroup_lock;
+	struct blockgroup_lock *s_blockgroup_lock;
 	struct proc_dir_entry *s_proc;
-
-	/* root of the per fs reservation window tree */
-	spinlock_t s_rsv_window_lock;
-	struct rb_root s_rsv_window_root;
+	struct kobject s_kobj;
+	struct completion s_kobj_unregister;
 
 	/* Journaling */
 	struct inode *s_journal_inode;
@@ -146,6 +144,10 @@ struct ext4_sb_info {
 	/* locality groups */
 	struct ext4_locality_group *s_locality_groups;
 
+	/* for write statistics */
+	unsigned long s_sectors_written_start;
+	u64 s_kbytes_written;
+
 	unsigned int s_log_groups_per_flex;
 	struct flex_groups *s_flex_groups;
 };
@@ -153,7 +155,7 @@ struct ext4_sb_info {
 static inline spinlock_t *
 sb_bgl_lock(struct ext4_sb_info *sbi, unsigned int block_group)
 {
-	return bgl_lock_ptr(&sbi->s_blockgroup_lock, block_group);
+	return bgl_lock_ptr(sbi->s_blockgroup_lock, block_group);
 }
 
 #endif	/* _EXT4_SB */
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index e0aa4fe4f59..ac77d8b8251 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -152,6 +152,8 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
 	ext4_fsblk_t bg_start;
 	ext4_fsblk_t last_block;
 	ext4_grpblk_t colour;
+	ext4_group_t block_group;
+	int flex_size = ext4_flex_bg_size(EXT4_SB(inode->i_sb));
 	int depth;
 
 	if (path) {
@@ -170,10 +172,31 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
 	}
 
 	/* OK. use inode's group */
-	bg_start = (ei->i_block_group * EXT4_BLOCKS_PER_GROUP(inode->i_sb)) +
+	block_group = ei->i_block_group;
+	if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) {
+		/*
+		 * If there are at least EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME
+		 * block groups per flexgroup, reserve the first block
+		 * group for directories and special files.  Regular
+		 * files will start at the second block group.  This
+		 * tends to speed up directory access and improves
+		 * fsck times.
+		 */
+		block_group &= ~(flex_size-1);
+		if (S_ISREG(inode->i_mode))
+			block_group++;
+	}
+	bg_start = (block_group * EXT4_BLOCKS_PER_GROUP(inode->i_sb)) +
 		le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_first_data_block);
 	last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1;
 
+	/*
+	 * If we are doing delayed allocation, we don't need to take
+	 * colour into account.
+	 */
+	if (test_opt(inode->i_sb, DELALLOC))
+		return bg_start;
+
 	if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block)
 		colour = (current->pid % 16) *
 			(EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16);
@@ -301,7 +324,64 @@ ext4_ext_max_entries(struct inode *inode, int depth)
 	return max;
 }
 
-static int __ext4_ext_check_header(const char *function, struct inode *inode,
+static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext)
+{
+	struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
+	ext4_fsblk_t first = ext_pblock(ext);
+	int count = ext4_ext_get_actual_len(ext);
+	/* Reject extents that start below the first data block ... */
+	if (unlikely(first < le32_to_cpu(es->s_first_data_block)))
+		return 0;
+	/* ... or that run past the filesystem's block count. */
+	return (first + count) <= ext4_blocks_count(es);
+}
+
+static int ext4_valid_extent_idx(struct inode *inode,
+				struct ext4_extent_idx *ext_idx)
+{
+	ext4_fsblk_t block = idx_pblock(ext_idx);
+	struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
+	/* The last valid block is blocks_count - 1, so compare with >=. */
+	if (unlikely(block < le32_to_cpu(es->s_first_data_block) ||
+			(block >= ext4_blocks_count(es))))
+		return 0;
+	return 1;
+}
+
+/*
+ * Check every entry of one extent tree node: extents when depth is 0,
+ * index entries otherwise.  Returns 1 if all entries pass, 0 if any fails.
+ */
+static int ext4_valid_extent_entries(struct inode *inode,
+				struct ext4_extent_header *eh,
+				int depth)
+{
+	unsigned short remaining;
+
+	if (eh->eh_entries == 0)
+		return 1;
+	remaining = le16_to_cpu(eh->eh_entries);
+
+	if (depth == 0) {
+		struct ext4_extent *leaf = EXT_FIRST_EXTENT(eh);
+
+		/* leaf entries */
+		for (; remaining; remaining--, leaf++) {
+			if (!ext4_valid_extent(inode, leaf))
+				return 0;
+		}
+	} else {
+		struct ext4_extent_idx *index = EXT_FIRST_INDEX(eh);
+
+		for (; remaining; remaining--, index++) {
+			if (!ext4_valid_extent_idx(inode, index))
+				return 0;
+		}
+	}
+	return 1;
+}
+
+static int __ext4_ext_check(const char *function, struct inode *inode,
 					struct ext4_extent_header *eh,
 					int depth)
 {
@@ -329,11 +409,15 @@ static int __ext4_ext_check_header(const char *function, struct inode *inode,
 		error_msg = "invalid eh_entries";
 		goto corrupted;
 	}
+	if (!ext4_valid_extent_entries(inode, eh, depth)) {
+		error_msg = "invalid extent entries";
+		goto corrupted;
+	}
 	return 0;
 
 corrupted:
 	ext4_error(inode->i_sb, function,
-			"bad header in inode #%lu: %s - magic %x, "
+			"bad header/extent in inode #%lu: %s - magic %x, "
 			"entries %u, max %u(%u), depth %u(%u)",
 			inode->i_ino, error_msg, le16_to_cpu(eh->eh_magic),
 			le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max),
@@ -342,8 +426,13 @@ corrupted:
 	return -EIO;
 }
 
-#define ext4_ext_check_header(inode, eh, depth)	\
-	__ext4_ext_check_header(__func__, inode, eh, depth)
+#define ext4_ext_check(inode, eh, depth)	\
+	__ext4_ext_check(__func__, inode, eh, depth)
+
+int ext4_ext_check_inode(struct inode *inode)
+{
+	return ext4_ext_check(inode, ext_inode_hdr(inode), ext_depth(inode));
+}
 
 #ifdef EXT_DEBUG
 static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path)
@@ -547,9 +636,6 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
 
 	eh = ext_inode_hdr(inode);
 	depth = ext_depth(inode);
-	if (ext4_ext_check_header(inode, eh, depth))
-		return ERR_PTR(-EIO);
-
 
 	/* account possible depth increase */
 	if (!path) {
@@ -565,6 +651,8 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
 	i = depth;
 	/* walk through the tree */
 	while (i) {
+		int need_to_validate = 0;
+
 		ext_debug("depth %d: num %d, max %d\n",
 			  ppos, le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max));
 
@@ -573,10 +661,17 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
 		path[ppos].p_depth = i;
 		path[ppos].p_ext = NULL;
 
-		bh = sb_bread(inode->i_sb, path[ppos].p_block);
-		if (!bh)
+		bh = sb_getblk(inode->i_sb, path[ppos].p_block);
+		if (unlikely(!bh))
 			goto err;
-
+		if (!bh_uptodate_or_lock(bh)) {
+			if (bh_submit_read(bh) < 0) {
+				put_bh(bh);
+				goto err;
+			}
+			/* validate the extent entries */
+			need_to_validate = 1;
+		}
 		eh = ext_block_hdr(bh);
 		ppos++;
 		BUG_ON(ppos > depth);
@@ -584,7 +679,7 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
 		path[ppos].p_hdr = eh;
 		i--;
 
-		if (ext4_ext_check_header(inode, eh, i))
+		if (need_to_validate && ext4_ext_check(inode, eh, i))
 			goto err;
 	}
 
@@ -1181,7 +1276,7 @@ got_index:
 			return -EIO;
 		eh = ext_block_hdr(bh);
 		/* subtract from p_depth to get proper eh_depth */
-		if (ext4_ext_check_header(inode, eh, path->p_depth - depth)) {
+		if (ext4_ext_check(inode, eh, path->p_depth - depth)) {
 			put_bh(bh);
 			return -EIO;
 		}
@@ -1194,7 +1289,7 @@ got_index:
 	if (bh == NULL)
 		return -EIO;
 	eh = ext_block_hdr(bh);
-	if (ext4_ext_check_header(inode, eh, path->p_depth - depth)) {
+	if (ext4_ext_check(inode, eh, path->p_depth - depth)) {
 		put_bh(bh);
 		return -EIO;
 	}
@@ -2137,7 +2232,7 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
 		return -ENOMEM;
 	}
 	path[0].p_hdr = ext_inode_hdr(inode);
-	if (ext4_ext_check_header(inode, path[0].p_hdr, depth)) {
+	if (ext4_ext_check(inode, path[0].p_hdr, depth)) {
 		err = -EIO;
 		goto out;
 	}
@@ -2191,7 +2286,7 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
 				err = -EIO;
 				break;
 			}
-			if (ext4_ext_check_header(inode, ext_block_hdr(bh),
+			if (ext4_ext_check(inode, ext_block_hdr(bh),
 							depth - i - 1)) {
 				err = -EIO;
 				break;
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index f731cb545a0..588af8c7724 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -33,9 +33,14 @@
  */
 static int ext4_release_file(struct inode *inode, struct file *filp)
 {
+	if (EXT4_I(inode)->i_state & EXT4_STATE_DA_ALLOC_CLOSE) {
+		ext4_alloc_da_blocks(inode);
+		EXT4_I(inode)->i_state &= ~EXT4_STATE_DA_ALLOC_CLOSE;
+	}
 	/* if we are the last writer on the inode, drop the block reservation */
 	if ((filp->f_mode & FMODE_WRITE) &&
-			(atomic_read(&inode->i_writecount) == 1))
+			(atomic_read(&inode->i_writecount) == 1) &&
+		        !EXT4_I(inode)->i_reserved_data_blocks)
 	{
 		down_write(&EXT4_I(inode)->i_data_sem);
 		ext4_discard_preallocations(inode);
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index fb51b40e3e8..47b84e8df56 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -189,7 +189,6 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
 	struct ext4_super_block *es;
 	struct ext4_sb_info *sbi;
 	int fatal = 0, err, count, cleared;
-	ext4_group_t flex_group;
 
 	if (atomic_read(&inode->i_count) > 1) {
 		printk(KERN_ERR "ext4_free_inode: inode has count=%d\n",
@@ -268,6 +267,13 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
 			if (is_directory) {
 				count = ext4_used_dirs_count(sb, gdp) - 1;
 				ext4_used_dirs_set(sb, gdp, count);
+				if (sbi->s_log_groups_per_flex) {
+					ext4_group_t f;
+
+					/* a directory is gone: drop the flex group's dir count */
+					f = ext4_flex_group(sbi, block_group);
+					atomic_dec(&sbi->s_flex_groups[f].used_dirs);
+				}
 			}
 			gdp->bg_checksum = ext4_group_desc_csum(sbi,
 							block_group, gdp);
@@ -277,10 +283,10 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
 				percpu_counter_dec(&sbi->s_dirs_counter);
 
 			if (sbi->s_log_groups_per_flex) {
-				flex_group = ext4_flex_group(sbi, block_group);
-				spin_lock(sb_bgl_lock(sbi, flex_group));
-				sbi->s_flex_groups[flex_group].free_inodes++;
-				spin_unlock(sb_bgl_lock(sbi, flex_group));
+				ext4_group_t f;
+
+				f = ext4_flex_group(sbi, block_group);
+				atomic_inc(&sbi->s_flex_groups[f].free_inodes);
 			}
 		}
 		BUFFER_TRACE(bh2, "call ext4_handle_dirty_metadata");
@@ -360,9 +366,9 @@ static int find_group_flex(struct super_block *sb, struct inode *parent,
 		sbi->s_log_groups_per_flex;
 
 find_close_to_parent:
-	flexbg_free_blocks = flex_group[best_flex].free_blocks;
+	flexbg_free_blocks = atomic_read(&flex_group[best_flex].free_blocks);
 	flex_freeb_ratio = flexbg_free_blocks * 100 / blocks_per_flex;
-	if (flex_group[best_flex].free_inodes &&
+	if (atomic_read(&flex_group[best_flex].free_inodes) &&
 	    flex_freeb_ratio > free_block_ratio)
 		goto found_flexbg;
 
@@ -375,24 +381,24 @@ find_close_to_parent:
 		if (i == parent_fbg_group || i == parent_fbg_group - 1)
 			continue;
 
-		flexbg_free_blocks = flex_group[i].free_blocks;
+		flexbg_free_blocks = atomic_read(&flex_group[i].free_blocks);
 		flex_freeb_ratio = flexbg_free_blocks * 100 / blocks_per_flex;
 
 		if (flex_freeb_ratio > free_block_ratio &&
-		    flex_group[i].free_inodes) {
+		    (atomic_read(&flex_group[i].free_inodes))) {
 			best_flex = i;
 			goto found_flexbg;
 		}
 
-		if (flex_group[best_flex].free_inodes == 0 ||
-		    (flex_group[i].free_blocks >
-		     flex_group[best_flex].free_blocks &&
-		     flex_group[i].free_inodes))
+		if ((atomic_re